Commit 3420c949 authored by: M minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_python35_CI_random_fail

...@@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh ...@@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one. # and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
tar -xz -C /usr/local && \ tar -xz -C /usr/local && \
cp -rf /usr/local/TensorRT/include /usr && \ cp -rf /usr/local/TensorRT/include /usr && \
cp -rf /usr/local/TensorRT/lib /usr cp -rf /usr/local/TensorRT/lib /usr
......
...@@ -76,33 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85 ...@@ -76,33 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85
## Installation ## Installation
It is recommended to check out the [Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/docker_install_en.html) before looking into the [build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html). It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website.
## Documentation ## Documentation
We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and
[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation. [Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation.
- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html) - [Deep Learning 101](https://github.com/PaddlePaddle/book)
You might want to start from this online interactive book that can run in a Jupyter Notebook. You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html) - [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html)
You can run distributed training jobs on MPI clusters. You can run distributed training jobs on MPI clusters.
- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html) - [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html)
You can also run distributed training jobs on Kubernetes clusters.
- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html)
Our new API enables much shorter programs. Our new API enables much shorter programs.
- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html) - [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
We appreciate your contributions! We appreciate your contributions!
......
...@@ -169,14 +169,19 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF) ...@@ -169,14 +169,19 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here. # So, don't set these flags here.
if (NOT WIN32) # windows msvc2015 supports c++11 natively.
# -std=c++11 and -fPIC are not recognized by msvc; -Xcompiler will be added by cmake.
list(APPEND CUDA_NVCC_FLAGS "-std=c++11") list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
endif(NOT WIN32)
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
# in cuda9, suppress cuda warning on eigen # in cuda9, suppress cuda warning on eigen
list(APPEND CUDA_NVCC_FLAGS "-w") list(APPEND CUDA_NVCC_FLAGS "-w")
# Set :expt-relaxed-constexpr to suppress Eigen warnings # Set :expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
if (NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Debug") if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release") elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
...@@ -187,6 +192,13 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") ...@@ -187,6 +192,13 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
# nvcc 9 does not support -Os. Use Release flags instead # nvcc 9 does not support -Os. Use Release flags instead
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif() endif()
else(NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
else()
message(FATAL_ERROR "Windows only supports the Release build for now. Please set the Visual Studio build type to Release, x64.")
endif()
endif(NOT WIN32)
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
...@@ -44,7 +44,7 @@ ExternalProject_Add( ...@@ -44,7 +44,7 @@ ExternalProject_Add(
# 3. keep only zlib, cares, protobuf, boringssl under "third_party", # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
# checkout and clean other dirs under third_party # checkout and clean other dirs under third_party
# 4. remove .git, and package the directory. # 4. remove .git, and package the directory.
URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz" URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
URL_MD5 "1f268a2aff6759839dccd256adcc91cf" URL_MD5 "1f268a2aff6759839dccd256adcc91cf"
PREFIX ${GRPC_SOURCES_DIR} PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
......
...@@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") ...@@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "framework") set(module "framework")
if (NOT WIN32) if (NOT WIN32)
copy(framework_lib DEPS framework_py_proto set(framework_lib_deps framework_py_proto)
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h endif(NOT WIN32)
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} copy(framework_lib DEPS ${framework_lib_deps}
)
else()
copy(framework_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${src_dir}/${module}/ir/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir
) )
endif(NOT WIN32)
set(module "memory") set(module "memory")
copy(memory_lib copy(memory_lib
...@@ -161,7 +158,8 @@ set(module "inference") ...@@ -161,7 +158,8 @@ set(module "inference")
copy(inference_lib DEPS ${inference_deps} copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
) )
set(module "platform") set(module "platform")
......
服务器端部署 - Anakin Anakin - 服务器端加速引擎
##################### #######################
使用文档 使用文档
......
服务器端部署 - 原生引擎
#######################
.. toctree::
:maxdepth: 2
build_and_install_lib_cn.rst
native_infer.rst
...@@ -10,7 +10,6 @@ ...@@ -10,7 +10,6 @@
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
deploy/index_native.rst
deploy/index_anakin.rst deploy/index_anakin.rst
deploy/index_mobile.rst deploy/index_mobile.rst
development/contribute_to_paddle.md development/contribute_to_paddle.md
......
*.pyc
train.log
output
data/cifar-10-batches-py/
data/cifar-10-python.tar.gz
data/*.txt
data/*.list
data/mean.meta
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果,即模型可以正确识别图像上的主要物体。 图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果,即模型可以正确识别图像上的主要物体。
<p align="center"> <p align="center">
<img src="image/dog_cat.png " width="350" ><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/dog_cat.png?raw=true" width="350" ><br/>
图1. 通用图像分类展示 图1. 通用图像分类展示
</p> </p>
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
<p align="center"> <p align="center">
<img src="image/flowers.png" width="400" ><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/flowers.png?raw=true" width="400" ><br/>
图2. 细粒度图像分类展示 图2. 细粒度图像分类展示
</p> </p>
...@@ -38,7 +38,7 @@ ...@@ -38,7 +38,7 @@
一个好的模型既要对不同类别识别正确,同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动,较好的模型会像聪明的人类一样能够正确识别。 一个好的模型既要对不同类别识别正确,同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动,较好的模型会像聪明的人类一样能够正确识别。
<p align="center"> <p align="center">
<img src="image/variations.png" width="550" ><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/variations.png?raw=true" width="550" ><br/>
图3. 扰动图片展示[22] 图3. 扰动图片展示[22]
</p> </p>
...@@ -61,7 +61,7 @@ ...@@ -61,7 +61,7 @@
Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。 Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。
<p align="center"> <p align="center">
<img src="image/ilsvrc.png" width="500" ><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/ilsvrc.png?raw=true" width="500" ><br/>
图4. ILSVRC图像分类Top-5错误率 图4. ILSVRC图像分类Top-5错误率
</p> </p>
...@@ -70,7 +70,7 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得 ...@@ -70,7 +70,7 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得
传统CNN包含卷积层、全连接层等组件,并采用softmax多类别分类器和多类交叉熵损失函数,一个典型的卷积神经网络如图5所示,我们先介绍用来构造CNN的常见组件。 传统CNN包含卷积层、全连接层等组件,并采用softmax多类别分类器和多类交叉熵损失函数,一个典型的卷积神经网络如图5所示,我们先介绍用来构造CNN的常见组件。
<p align="center"> <p align="center">
<img src="image/lenet.png"><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/lenet.png?raw=true"><br/>
图5. CNN网络示例[20] 图5. CNN网络示例[20]
</p> </p>
...@@ -89,7 +89,7 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得 ...@@ -89,7 +89,7 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得
牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构,它的核心是五组卷积操作,每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积,卷积核的数目由较浅组的64增多到最深组的512,同一组内的卷积核数目是一样的。卷积之后接两层全连接层,之后是分类层。由于每组内卷积层的不同,有11、13、16、19层这几种模型,下图展示一个16层的网络结构。VGG模型结构相对简洁,提出之后也有很多文章基于此模型进行研究,如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。 牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构,它的核心是五组卷积操作,每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积,卷积核的数目由较浅组的64增多到最深组的512,同一组内的卷积核数目是一样的。卷积之后接两层全连接层,之后是分类层。由于每组内卷积层的不同,有11、13、16、19层这几种模型,下图展示一个16层的网络结构。VGG模型结构相对简洁,提出之后也有很多文章基于此模型进行研究,如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。
<p align="center"> <p align="center">
<img src="image/vgg16.png" width="750" ><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/vgg16.png?raw=true" width="750" ><br/>
图6. 基于ImageNet的VGG16模型 图6. 基于ImageNet的VGG16模型
</p> </p>
...@@ -106,7 +106,7 @@ NIN模型主要有两个特点: ...@@ -106,7 +106,7 @@ NIN模型主要有两个特点:
Inception模块如下图7所示,图(a)是最简单的设计,输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数,拼接后会导致特征的通道数较大,经过几层这样的模块堆积后,通道数会越来越大,导致参数和计算量也随之增大。为了改善这个缺点,图(b)引入3个1x1卷积层进行降维,所谓的降维就是减少通道数,同时如NIN模型中提到的1x1卷积也可以修正线性特征。 Inception模块如下图7所示,图(a)是最简单的设计,输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数,拼接后会导致特征的通道数较大,经过几层这样的模块堆积后,通道数会越来越大,导致参数和计算量也随之增大。为了改善这个缺点,图(b)引入3个1x1卷积层进行降维,所谓的降维就是减少通道数,同时如NIN模型中提到的1x1卷积也可以修正线性特征。
<p align="center"> <p align="center">
<img src="image/inception.png" width="800" ><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/inception.png?raw=ture" width="800" ><br/>
图7. Inception模块 图7. Inception模块
</p> </p>
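To make the dimension-reduction idea concrete, below is a minimal sketch of an Inception-style module written with `paddle.fluid` layers. The helper name and the per-branch filter counts are illustrative assumptions, not the exact configuration used in GoogleNet.

```python
import paddle.fluid as fluid

def inception_module(x, c1, c3r, c3, c5r, c5, proj):
    # Branch 1: plain 1x1 convolution.
    b1 = fluid.layers.conv2d(x, num_filters=c1, filter_size=1, act='relu')
    # Branch 2: 1x1 reduction (fewer channels) followed by a 3x3 convolution.
    b2 = fluid.layers.conv2d(x, num_filters=c3r, filter_size=1, act='relu')
    b2 = fluid.layers.conv2d(b2, num_filters=c3, filter_size=3, padding=1, act='relu')
    # Branch 3: 1x1 reduction followed by a 5x5 convolution.
    b3 = fluid.layers.conv2d(x, num_filters=c5r, filter_size=1, act='relu')
    b3 = fluid.layers.conv2d(b3, num_filters=c5, filter_size=5, padding=2, act='relu')
    # Branch 4: 3x3 max pooling followed by a 1x1 projection.
    b4 = fluid.layers.pool2d(x, pool_size=3, pool_stride=1, pool_padding=1, pool_type='max')
    b4 = fluid.layers.conv2d(b4, num_filters=proj, filter_size=1, act='relu')
    # The four branch outputs are concatenated along the channel axis (NCHW layout).
    return fluid.layers.concat(input=[b1, b2, b3, b4], axis=1)
```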
...@@ -115,7 +115,7 @@ GoogleNet由多组Inception模块堆积而成。另外,在网络最后也没 ...@@ -115,7 +115,7 @@ GoogleNet由多组Inception模块堆积而成。另外,在网络最后也没
GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普通的卷积组成;接下来由三组子网络组成,第一组子网络包含2个Inception模块,第二组包含5个Inception模块,第三组包含2个Inception模块;然后接均值池化层、全连接层。 GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普通的卷积组成;接下来由三组子网络组成,第一组子网络包含2个Inception模块,第二组包含5个Inception模块,第三组包含2个Inception模块;然后接均值池化层、全连接层。
<p align="center"> <p align="center">
<img src="image/googlenet.jpeg" ><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/googlenet.jpeg?raw=true" ><br/>
图8. GoogleNet[12] 图8. GoogleNet[12]
</p> </p>
...@@ -130,14 +130,14 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类 ...@@ -130,14 +130,14 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类
残差模块如图9所示,左边是基本模块连接方式,由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式,之所以称为瓶颈,是因为上面的1x1卷积用来降维(图示例即256->64),下面的1x1卷积用来升维(图示例即64->256),这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。 残差模块如图9所示,左边是基本模块连接方式,由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式,之所以称为瓶颈,是因为上面的1x1卷积用来降维(图示例即256->64),下面的1x1卷积用来升维(图示例即64->256),这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。
<p align="center"> <p align="center">
<img src="image/resnet_block.jpg" width="400"><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/resnet_block.jpg?raw=true" width="400"><br/>
图9. 残差模块 图9. 残差模块
</p> </p>
图10展示了50、101、152层网络连接示意图,使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快,成功的训练了上百乃至近千层的卷积神经网络。 图10展示了50、101、152层网络连接示意图,使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快,成功的训练了上百乃至近千层的卷积神经网络。
<p align="center"> <p align="center">
<img src="image/resnet.png"><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/resnet.png?raw=true"><br/>
图10. 基于ImageNet的ResNet模型 图10. 基于ImageNet的ResNet模型
</p> </p>
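For reference, here is a minimal sketch of the bottleneck residual block described above, written with `paddle.fluid` layers. The helper names and the 4x channel expansion follow the common ResNet convention; they are assumptions for illustration, not this chapter's exact training code.

```python
import paddle.fluid as fluid

def conv_bn(x, num_filters, filter_size, stride=1, act=None):
    # Convolution followed by batch normalization.
    conv = fluid.layers.conv2d(x, num_filters=num_filters, filter_size=filter_size,
                               stride=stride, padding=(filter_size - 1) // 2,
                               bias_attr=False)
    return fluid.layers.batch_norm(conv, act=act)

def bottleneck_block(x, num_filters, stride=1):
    # 1x1 conv reduces channels, 3x3 conv transforms, 1x1 conv restores 4x channels.
    conv0 = conv_bn(x, num_filters, filter_size=1, act='relu')
    conv1 = conv_bn(conv0, num_filters, filter_size=3, stride=stride, act='relu')
    conv2 = conv_bn(conv1, num_filters * 4, filter_size=1)
    # Shortcut branch: identity when the shapes match, otherwise a 1x1 projection.
    if stride != 1 or x.shape[1] != num_filters * 4:
        shortcut = conv_bn(x, num_filters * 4, filter_size=1, stride=stride)
    else:
        shortcut = x
    # Element-wise addition plus ReLU forms the residual connection.
    return fluid.layers.elementwise_add(x=shortcut, y=conv2, act='relu')
```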
...@@ -149,7 +149,7 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类 ...@@ -149,7 +149,7 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类
由于ImageNet数据集较大,下载和训练较慢,为了方便大家学习,我们使用[CIFAR10](<https://www.cs.toronto.edu/~kriz/cifar.html>)数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。 由于ImageNet数据集较大,下载和训练较慢,为了方便大家学习,我们使用[CIFAR10](<https://www.cs.toronto.edu/~kriz/cifar.html>)数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。
<p align="center"> <p align="center">
<img src="image/cifar.png" width="350"><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/cifar.png?raw=true" width="350"><br/>
图11. CIFAR10数据集[21] 图11. CIFAR10数据集[21]
</p> </p>
...@@ -377,7 +377,7 @@ test_reader = paddle.batch( ...@@ -377,7 +377,7 @@ test_reader = paddle.batch(
`event_handler_plot`可以用来利用回调数据来打点画图: `event_handler_plot`可以用来利用回调数据来打点画图:
<p align="center"> <p align="center">
<img src="image/train_and_test.png" width="350"><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/train_and_test.png?raw=true" width="350"><br/>
图12. 训练结果 图12. 训练结果
</p> </p>
...@@ -469,7 +469,7 @@ Test with Pass 0, Loss 1.1, Acc 0.6 ...@@ -469,7 +469,7 @@ Test with Pass 0, Loss 1.1, Acc 0.6
图13是训练的分类错误率曲线图,运行到第200个pass后基本收敛,最终得到测试集上分类错误率为8.54%。 图13是训练的分类错误率曲线图,运行到第200个pass后基本收敛,最终得到测试集上分类错误率为8.54%。
<p align="center"> <p align="center">
<img src="image/plot.png" width="400" ><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/plot.png?raw=true" width="400" ><br/>
图13. CIFAR10数据集上VGG模型的分类错误率 图13. CIFAR10数据集上VGG模型的分类错误率
</p> </p>
......
data/train.list
data/test.*
data/conll05st-release.tar.gz
data/conll05st-release
data/predicate_dict
data/label_dict
data/word_dict
data/emb
data/feature
output
predict.res
train.log
...@@ -21,7 +21,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb ...@@ -21,7 +21,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
5. 对第4步的结果,通过多分类得到论元的语义角色标签。可以看到,句法分析是基础,并且后续步骤常常会构造的一些人工特征,这些特征往往也来自句法分析。 5. 对第4步的结果,通过多分类得到论元的语义角色标签。可以看到,句法分析是基础,并且后续步骤常常会构造的一些人工特征,这些特征往往也来自句法分析。
<div align="center"> <div align="center">
<img src="image/dependency_parsing.png" width = "80%" align=center /><br> <img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/dependency_parsing.png?raw=true" width = "80%" align=center /><br>
图1. 依存句法分析句法树示例 图1. 依存句法分析句法树示例
</div> </div>
...@@ -30,7 +30,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb ...@@ -30,7 +30,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
我们继续以上面的这句话为例,图1展示了BIO表示方法。 我们继续以上面的这句话为例,图1展示了BIO表示方法。
<div align="center"> <div align="center">
<img src="image/bio_example.png" width = "90%" align=center /><br> <img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/bio_example.png?raw=true" width = "90%" align=center /><br>
图2. BIO标注方法示例 图2. BIO标注方法示例
</div> </div>
...@@ -53,7 +53,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb ...@@ -53,7 +53,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
图3是最终得到的栈式循环神经网络结构示意图。 图3是最终得到的栈式循环神经网络结构示意图。
<p align="center"> <p align="center">
<img src="./image/stacked_lstm.png" width = "40%" align=center><br> <img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/stacked_lstm.png?raw=true" width = "40%" align=center><br>
图3. 基于LSTM的栈式循环神经网络结构示意图 图3. 基于LSTM的栈式循环神经网络结构示意图
</p> </p>
...@@ -64,7 +64,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb ...@@ -64,7 +64,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
为了克服这一缺陷,我们可以设计一种双向循环网络单元,它的思想简单且直接:对上一节的栈式循环神经网络进行一个小小的修改,堆叠多个LSTM单元,让每一层LSTM单元分别以:正向、反向、正向 …… 的顺序学习上一层的输出序列。于是,从第2层开始,$t$时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。 为了克服这一缺陷,我们可以设计一种双向循环网络单元,它的思想简单且直接:对上一节的栈式循环神经网络进行一个小小的修改,堆叠多个LSTM单元,让每一层LSTM单元分别以:正向、反向、正向 …… 的顺序学习上一层的输出序列。于是,从第2层开始,$t$时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。
<p align="center"> <p align="center">
<img src="./image/bidirectional_stacked_lstm.png" width = "60%" align=center><br> <img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/bidirectional_stacked_lstm.png?raw=true" width = "60%" align=center><br>
图4. 基于LSTM的双向循环神经网络结构示意图 图4. 基于LSTM的双向循环神经网络结构示意图
</p> </p>
...@@ -79,7 +79,7 @@ CRF是一种概率化结构模型,可以看作是一个概率无向图模型 ...@@ -79,7 +79,7 @@ CRF是一种概率化结构模型,可以看作是一个概率无向图模型
序列标注任务只需要考虑输入和输出都是一个线性序列,并且由于我们只是将输入序列作为条件,不做任何条件独立假设,因此输入序列的元素之间并不存在图结构。综上,在序列标注任务中使用的是如图5所示的定义在链式图上的CRF,称之为线性链条件随机场(Linear Chain Conditional Random Field)。 序列标注任务只需要考虑输入和输出都是一个线性序列,并且由于我们只是将输入序列作为条件,不做任何条件独立假设,因此输入序列的元素之间并不存在图结构。综上,在序列标注任务中使用的是如图5所示的定义在链式图上的CRF,称之为线性链条件随机场(Linear Chain Conditional Random Field)。
<p align="center"> <p align="center">
<img src="./image/linear_chain_crf.png" width = "35%" align=center><br> <img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/linear_chain_crf.png?raw=true" width = "35%" align=center><br>
图5. 序列标注任务中使用的线性链条件随机场 图5. 序列标注任务中使用的线性链条件随机场
</p> </p>
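For completeness, the conditional probability that a linear-chain CRF assigns to a label sequence $Y$ given the input sequence $X$ can be written in the standard textbook form (added here for reference, using the same feature/weight notation $f_k$, $\lambda_k$ as the loss below; it is not quoted from this chapter):

$$P(Y|X) = \frac{1}{Z(X)}\exp\left(\sum_{i=1}^{L}\sum_{k}\lambda_{k}f_{k}(Y_{i-1}, Y_{i}, X, i)\right)$$

where $Z(X)$ sums the same exponential over all possible label sequences, so that the probabilities normalize to 1.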
...@@ -123,7 +123,7 @@ $$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\pr ...@@ -123,7 +123,7 @@ $$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\pr
4. CRF以第3步中LSTM学习到的特征为输入,以标记序列为监督信号,完成序列标注; 4. CRF以第3步中LSTM学习到的特征为输入,以标记序列为监督信号,完成序列标注;
<div align="center"> <div align="center">
<img src="image/db_lstm_network.png" width = "60%" align=center /><br> <img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/db_lstm_network.png?raw=true" width = "60%" align=center /><br>
图6. SRL任务上的深层双向LSTM模型 图6. SRL任务上的深层双向LSTM模型
</div> </div>
......
data/wmt14
data/pre-wmt14
pretrained/wmt14_model
gen.log
gen_result
train.log
dataprovider_copy_1.py
*.pyc
multi-bleu.perl
...@@ -11,10 +11,10 @@ ...@@ -11,10 +11,10 @@
为解决以上问题,统计机器翻译(Statistical Machine Translation, SMT)技术应运而生。在统计机器翻译技术中,转化规则是由机器自动从大规模的语料中学习得到的,而非我们人主动提供规则。因此,它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题,但仍然存在许多挑战:1)人为设计许多特征(feature),但永远无法覆盖所有的语言现象;2)难以利用全局的特征;3)依赖于许多预处理环节,如词语对齐、分词或符号化(tokenization)、规则抽取、句法分析等,而每个环节的错误会逐步累积,对翻译的影响也越来越大。 为解决以上问题,统计机器翻译(Statistical Machine Translation, SMT)技术应运而生。在统计机器翻译技术中,转化规则是由机器自动从大规模的语料中学习得到的,而非我们人主动提供规则。因此,它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题,但仍然存在许多挑战:1)人为设计许多特征(feature),但永远无法覆盖所有的语言现象;2)难以利用全局的特征;3)依赖于许多预处理环节,如词语对齐、分词或符号化(tokenization)、规则抽取、句法分析等,而每个环节的错误会逐步累积,对翻译的影响也越来越大。
近年来,深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类:1)仍以统计机器翻译系统为框架,只是利用神经网络来改进其中的关键模块,如语言模型、调序模型等(见图1的左半部分);2)不再以统计机器翻译系统为框架,而是直接用神经网络将源语言映射到目标语言,即端到端的神经网络机器翻译(End-to-End Neural Machine Translation, End-to-End NMT)(见图1的右半部分),简称为NMT模型。 近年来,深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类:1)仍以统计机器翻译系统为框架,只是利用神经网络来改进其中的关键模块,如语言模型、调序模型等(见图1的左半部分);2)不再以统计机器翻译系统为框架,而是直接用神经网络将源语言映射到目标语言,即端到端的神经网络机器翻译(End-to-End Neural Machine Translation, End-to-End NMT)(见图1的右半部分),简称为NMT模型。
![nmt](./image/nmt.png) <div align="center">
<p align="center"> <img src="https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/nmt.png?raw=true" width = "400" align=center/><br/>
图1. 基于神经网络的机器翻译系统 图1. 基于神经网络的机器翻译系统
</p> </div>
本教程主要介绍NMT模型,以及如何用PaddlePaddle来训练一个NMT模型。 本教程主要介绍NMT模型,以及如何用PaddlePaddle来训练一个NMT模型。
...@@ -45,19 +45,22 @@ ...@@ -45,19 +45,22 @@
具体来说,该双向循环神经网络分别在时间维以顺序和逆序——即前向(forward)和后向(backward)——依次处理输入序列,并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点,都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN,其中有六个权重矩阵:输入到前向隐层和后向隐层的权重矩阵(`$W_1, W_3$`),隐层到隐层自己的权重矩阵(`$W_2,W_5$`),前向隐层和后向隐层到输出层的权重矩阵(`$W_4, W_6$`)。注意,该网络的前向隐层和后向隐层之间没有连接。 具体来说,该双向循环神经网络分别在时间维以顺序和逆序——即前向(forward)和后向(backward)——依次处理输入序列,并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点,都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN,其中有六个权重矩阵:输入到前向隐层和后向隐层的权重矩阵(`$W_1, W_3$`),隐层到隐层自己的权重矩阵(`$W_2,W_5$`),前向隐层和后向隐层到输出层的权重矩阵(`$W_4, W_6$`)。注意,该网络的前向隐层和后向隐层之间没有连接。
![bi_rnn](./image/bi_rnn.png)
<p align="center"> <div align="center">
图3. 按时间步展开的双向循环神经网络 <img src = "https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/bi_rnn.png?raw=true" width="400"><br/>
</p> 图2. 按时间步展开的双向循环神经网络
</div>
### 编码器-解码器框架 ### 编码器-解码器框架
编码器-解码器(Encoder-Decoder)\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量,解码阶段通过最大化预测序列概率,从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。 编码器-解码器(Encoder-Decoder)\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量,解码阶段通过最大化预测序列概率,从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。
![encoder_decoder](./image/encoder_decoder.png) ![encoder_decoder](./image/encoder_decoder.png)
<p align="center"> <div align="center">
图4. 编码器-解码器框架 <img src ="https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/encoder_decoder.png?raw=true" width="400"><br/>
</p> 图3. 编码器-解码器框架
</div>
<a name="编码器"></a>
#### 编码器 #### 编码器
编码阶段分为三步: 编码阶段分为三步:
...@@ -69,19 +72,17 @@ ...@@ -69,19 +72,17 @@
3. 用RNN编码源语言词序列:这一过程的计算公式为`$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$`,其中`$h_0$`是一个全零的向量,`$\varnothing _\theta$`是一个非线性激活函数,最后得到的`$\mathbf{h}=\left \{ h_1,..., h_T \right \}$`就是RNN依次读入源语言`$T$`个词的状态编码序列。整句话的向量表示可以采用`$\mathbf{h}$`在最后一个时间步`$T$`的状态编码,或使用时间维上的池化(pooling)结果。 3. 用RNN编码源语言词序列:这一过程的计算公式为`$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$`,其中`$h_0$`是一个全零的向量,`$\varnothing _\theta$`是一个非线性激活函数,最后得到的`$\mathbf{h}=\left \{ h_1,..., h_T \right \}$`就是RNN依次读入源语言`$T$`个词的状态编码序列。整句话的向量表示可以采用`$\mathbf{h}$`在最后一个时间步`$T$`的状态编码,或使用时间维上的池化(pooling)结果。
第3步也可以使用双向循环神经网络实现更复杂的句编码表示,具体可以用双向GRU实现。前向GRU按照词序列`$(x_1,x_2,...,x_T)$`的顺序依次编码源语言端词,并得到一系列隐层状态`$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$`。类似的,后向GRU按照`$(x_T,x_{T-1},...,x_1)$`的顺序依次编码源语言端词,得到`$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$`。最后对于词`$x_i$`,通过拼接两个GRU的结果得到它的隐层状态,即`$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$` 第3步也可以使用双向循环神经网络实现更复杂的句编码表示,具体可以用双向GRU实现。前向GRU按照词序列`$(x_1,x_2,...,x_T)$`的顺序依次编码源语言端词,并得到一系列隐层状态`$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$`。类似的,后向GRU按照`$(x_T,x_{T-1},...,x_1)$`的顺序依次编码源语言端词,得到`$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$`。最后对于词`$x_i$`,通过拼接两个GRU的结果得到它的隐层状态,即`$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$`
<div align="center">
![encoder_attention](./image/encoder_attention.png) <img src="https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/encoder_attention.png?raw=true" width="400"><br/>
<p align="center"> 图4. 使用双向GRU的编码器
图5. 使用双向GRU的编码器 </div>
</p>
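A minimal sketch of such a bidirectional GRU encoder in `paddle.fluid` is shown below. It assumes `src_word_ids` is an int64 sequence data layer (`lod_level=1`); the dimensions are illustrative, and `fluid.layers.dynamic_gru` expects its input already projected to three times the hidden size, hence the extra `fc` layers.

```python
import paddle.fluid as fluid

def bi_gru_encoder(src_word_ids, dict_size, emb_dim=512, hidden_dim=512):
    # Steps 1-2: map word ids to dense embedding vectors.
    emb = fluid.layers.embedding(input=src_word_ids, size=[dict_size, emb_dim])
    # Forward GRU over (x_1, ..., x_T).
    fwd_proj = fluid.layers.fc(input=emb, size=hidden_dim * 3)
    fwd = fluid.layers.dynamic_gru(input=fwd_proj, size=hidden_dim)
    # Backward GRU over (x_T, ..., x_1).
    bwd_proj = fluid.layers.fc(input=emb, size=hidden_dim * 3)
    bwd = fluid.layers.dynamic_gru(input=bwd_proj, size=hidden_dim, is_reverse=True)
    # h_i = [forward state; backward state], as described above.
    return fluid.layers.concat(input=[fwd, bwd], axis=1)
```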
#### 解码器 #### 解码器
机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是: 机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是:
1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)`$c$`、真实目标语言序列的第`$i$`个词`$u_i$``$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。计算公式如下: 1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)`$c$`、真实目标语言序列的第`$i$`个词`$u_i$``$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。计算公式如下:
$$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$ $$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$
其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用[注意力机制](#注意力机制)时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$``$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记`<s>`,表示解码开始;`$z_i$``$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。 其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用注意力机制时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$``$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记`<s>`,表示解码开始;`$z_i$``$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。
2.`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下: 2.`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下:
$$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
...@@ -93,6 +94,7 @@ $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ ...@@ -93,6 +94,7 @@ $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
机器翻译任务的生成过程,通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异,具体介绍请见[柱搜索算法](#柱搜索算法) 机器翻译任务的生成过程,通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异,具体介绍请见[柱搜索算法](#柱搜索算法)
<a name="柱搜索算法"></a>
### 柱搜索算法 ### 柱搜索算法
柱搜索([beam search](http://en.wikipedia.org/wiki/Beam_search))是一种启发式图搜索算法,用于在图或树中搜索有限集合中的最优扩展节点,通常用在解空间非常大的系统(如机器翻译、语音识别)中,原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`<s>你好<e>`”,就算目标语言字典中只有3个词(`<s>`, `<e>`, `hello`),也可能生成无限句话(`hello`循环出现的次数不定),为了找到其中较好的翻译结果,我们可采用柱搜索算法。 柱搜索([beam search](http://en.wikipedia.org/wiki/Beam_search))是一种启发式图搜索算法,用于在图或树中搜索有限集合中的最优扩展节点,通常用在解空间非常大的系统(如机器翻译、语音识别)中,原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`<s>你好<e>`”,就算目标语言字典中只有3个词(`<s>`, `<e>`, `hello`),也可能生成无限句话(`hello`循环出现的次数不定),为了找到其中较好的翻译结果,我们可采用柱搜索算法。
...@@ -100,7 +102,6 @@ $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ ...@@ -100,7 +102,6 @@ $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
柱搜索算法使用广度优先策略建立搜索树,在树的每一层,按照启发代价(heuristic cost)(本教程中,为生成词的log概率之和)对节点进行排序,然后仅留下预先确定的个数(文献中通常称为beam width、beam size、柱宽度等)的节点。只有这些节点会在下一层继续扩展,其他节点就被剪掉了,也就是说保留了质量较高的节点,剪枝了质量较差的节点。因此,搜索所占用的空间和时间大幅减少,但缺点是无法保证一定获得最优解。 柱搜索算法使用广度优先策略建立搜索树,在树的每一层,按照启发代价(heuristic cost)(本教程中,为生成词的log概率之和)对节点进行排序,然后仅留下预先确定的个数(文献中通常称为beam width、beam size、柱宽度等)的节点。只有这些节点会在下一层继续扩展,其他节点就被剪掉了,也就是说保留了质量较高的节点,剪枝了质量较差的节点。因此,搜索所占用的空间和时间大幅减少,但缺点是无法保证一定获得最优解。
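The pruning procedure can be summarized in a few lines of framework-independent Python. In this sketch, `step_probs` is a hypothetical callback that returns the next-word probability distribution produced by a trained decoder; it stands in for the model and is not part of this chapter's code.

```python
import math

def beam_search(start_token, end_token, step_probs, beam_size=3, max_len=20):
    """Minimal beam search. `step_probs(prefix)` is assumed to return a dict
    mapping each candidate next word to its probability given the prefix."""
    beams = [([start_token], 0.0)]          # (prefix, sum of log probabilities)
    finished = []
    for _ in range(max_len):
        candidates = []
        for prefix, score in beams:
            for word, prob in step_probs(prefix).items():
                candidates.append((prefix + [word], score + math.log(prob)))
        # Keep only the `beam_size` highest-scoring prefixes (the pruning step).
        candidates.sort(key=lambda c: c[1], reverse=True)
        beams = []
        for prefix, score in candidates[:beam_size]:
            if prefix[-1] == end_token:
                finished.append((prefix, score))   # hypothesis is complete
            else:
                beams.append((prefix, score))      # keep expanding next layer
        if not beams:
            break
    return max(finished + beams, key=lambda c: c[1])[0]
```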
使用柱搜索算法的解码阶段,目标是最大化生成序列的概率。思路是: 使用柱搜索算法的解码阶段,目标是最大化生成序列的概率。思路是:
1. 每一个时刻,根据源语言句子的编码信息`$c$`、生成的第`$i$`个目标语言序列单词`$u_i$``$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$` 1. 每一个时刻,根据源语言句子的编码信息`$c$`、生成的第`$i$`个目标语言序列单词`$u_i$``$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`
2.`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$` 2.`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`
......
...@@ -37,7 +37,7 @@ Prediction Score is 4.25 ...@@ -37,7 +37,7 @@ Prediction Score is 4.25
YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成:候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选,排序网络对候选进行打分排序,输出排名最高的数十个结果。系统结构如图1所示: YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成:候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选,排序网络对候选进行打分排序,输出排名最高的数十个结果。系统结构如图1所示:
<p align="center"> <p align="center">
<img src="image/YouTube_Overview.png" width="70%" ><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/YouTube_Overview.png?raw=true" width="70%" ><br/>
图1. YouTube 推荐系统结构 图1. YouTube 推荐系统结构
</p> </p>
...@@ -48,7 +48,7 @@ YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐 ...@@ -48,7 +48,7 @@ YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐
首先,将观看历史及搜索词记录这类历史信息,映射为向量后取平均值得到定长表示;同时,输入人口学特征以优化新用户的推荐效果,并将二值特征和连续特征归一化处理到[0, 1]范围。接下来,将所有特征表示拼接为一个向量,并输入给非线形多层感知器(MLP,详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md)教程)处理。最后,训练时将MLP的输出给softmax做分类,预测时计算用户的综合特征(MLP的输出)与所有视频的相似度,取得分最高的$k$个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。 首先,将观看历史及搜索词记录这类历史信息,映射为向量后取平均值得到定长表示;同时,输入人口学特征以优化新用户的推荐效果,并将二值特征和连续特征归一化处理到[0, 1]范围。接下来,将所有特征表示拼接为一个向量,并输入给非线形多层感知器(MLP,详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md)教程)处理。最后,训练时将MLP的输出给softmax做分类,预测时计算用户的综合特征(MLP的输出)与所有视频的相似度,取得分最高的$k$个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。
<p align="center"> <p align="center">
<img src="image/Deep_candidate_generation_model_architecture.png" width="70%" ><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/Deep_candidate_generation_model_architecture.png?raw=true" width="70%" ><br/>
图2. 候选生成网络结构 图2. 候选生成网络结构
</p> </p>
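The following NumPy sketch mirrors the description above: variable-length histories are averaged into fixed-length vectors, all features are concatenated, a small ReLU MLP produces the user vector, and candidates are the top-k videos by dot-product similarity. All sizes and the random data are illustrative assumptions, not values from the YouTube paper.

```python
import numpy as np

rng = np.random.RandomState(0)

# Toy inputs: embeddings of watched videos / search tokens, plus demographic features.
watch_emb = rng.rand(30, 64)      # 30 watched videos, 64-dim embeddings
search_emb = rng.rand(10, 64)     # 10 recent search tokens
demo_feat = rng.rand(8)           # demographic features already scaled to [0, 1]

# Average the variable-length histories, then concatenate everything into one vector.
user_input = np.concatenate([watch_emb.mean(axis=0), search_emb.mean(axis=0), demo_feat])

# A small MLP with ReLU layers produces the user representation u.
W1, b1 = rng.randn(136, 256) * 0.01, np.zeros(256)
W2, b2 = rng.randn(256, 64) * 0.01, np.zeros(64)
h = np.maximum(user_input @ W1 + b1, 0)
u = np.maximum(h @ W2 + b2, 0)

# At serving time, score every video embedding against u and keep the top-k candidates.
video_emb = rng.rand(100000, 64)
scores = video_emb @ u
top_k = np.argsort(-scores)[:100]  # passed on to the ranking network
```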
...@@ -73,7 +73,7 @@ $$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$ ...@@ -73,7 +73,7 @@ $$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$
卷积神经网络主要由卷积(convolution)和池化(pooling)操作构成,其应用及组合方式灵活多变,种类繁多。本小结我们以如图3所示的网络进行讲解: 卷积神经网络主要由卷积(convolution)和池化(pooling)操作构成,其应用及组合方式灵活多变,种类繁多。本小结我们以如图3所示的网络进行讲解:
<p align="center"> <p align="center">
<img src="image/text_cnn.png" width = "80%" align="center"/><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/text_cnn.png?raw=true" width = "80%" align="center"/><br/>
图3. 卷积神经网络文本分类模型 图3. 卷积神经网络文本分类模型
</p> </p>
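In `paddle.fluid`, a text convolution of this kind can be expressed with `fluid.nets.sequence_conv_pool`, which applies a convolution over a sliding window of word vectors and then max-pools over time. The sketch below assumes `title_word_ids` is an int64 sequence data layer (`lod_level=1`); the sizes are illustrative.

```python
import paddle.fluid as fluid

def title_cnn(title_word_ids, dict_size, emb_dim=32, hidden_dim=32, window=3):
    # Word ids -> embeddings, one vector per word of the movie title.
    emb = fluid.layers.embedding(input=title_word_ids, size=[dict_size, emb_dim])
    # Convolution over a window of words followed by max pooling over time,
    # producing one fixed-length vector per title.
    return fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hidden_dim,
        filter_size=window,
        act="tanh",
        pool_type="max")
```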
...@@ -107,7 +107,7 @@ $$\hat c=max(c)$$ ...@@ -107,7 +107,7 @@ $$\hat c=max(c)$$
<p align="center"> <p align="center">
<img src="image/rec_regression_network.png" width="90%" ><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/rec_regression_network.png?raw=true" width="90%" ><br/>
图4. 融合推荐模型 图4. 融合推荐模型
</p> </p>
......
data/aclImdb
data/imdb
data/pre-imdb
data/mosesdecoder-master
*.log
model_output
dataprovider_copy_1.py
model.list
*.pyc
.DS_Store
...@@ -37,7 +37,7 @@ ...@@ -37,7 +37,7 @@
循环神经网络是一种能对序列数据进行精确建模的有力工具。实际上,循环神经网络的理论计算能力是图灵完备的\[[4](#参考文献)\]。自然语言是一种典型的序列数据(词序列),近年来,循环神经网络及其变体(如long short term memory\[[5](#参考文献)\]等)在自然语言处理的多个领域,如语言模型、句法解析、语义角色标注(或一般的序列标注)、语义表示、图文生成、对话、机器翻译等任务上均表现优异甚至成为目前效果最好的方法。 循环神经网络是一种能对序列数据进行精确建模的有力工具。实际上,循环神经网络的理论计算能力是图灵完备的\[[4](#参考文献)\]。自然语言是一种典型的序列数据(词序列),近年来,循环神经网络及其变体(如long short term memory\[[5](#参考文献)\]等)在自然语言处理的多个领域,如语言模型、句法解析、语义角色标注(或一般的序列标注)、语义表示、图文生成、对话、机器翻译等任务上均表现优异甚至成为目前效果最好的方法。
<p align="center"> <p align="center">
<img src="image/rnn.png" width = "60%" align="center"/><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/image/rnn.png?raw=true" width = "60%" align="center"/><br/>
图1. 循环神经网络按时间展开的示意图 图1. 循环神经网络按时间展开的示意图
</p> </p>
...@@ -66,7 +66,7 @@ $$ h_t = o_t\odot tanh(c_t) $$ ...@@ -66,7 +66,7 @@ $$ h_t = o_t\odot tanh(c_t) $$
其中,$i_t, f_t, c_t, o_t$分别表示输入门,遗忘门,记忆单元及输出门的向量值,带角标的$W$及$b$为模型参数,$tanh$为双曲正切函数,$\odot$表示逐元素(elementwise)的乘法操作。输入门控制着新输入进入记忆单元$c$的强度,遗忘门控制着记忆单元维持上一时刻值的强度,输出门控制着输出记忆单元的强度。三种门的计算方式类似,但有着完全不同的参数,它们各自以不同的方式控制着记忆单元$c$,如图2所示: 其中,$i_t, f_t, c_t, o_t$分别表示输入门,遗忘门,记忆单元及输出门的向量值,带角标的$W$及$b$为模型参数,$tanh$为双曲正切函数,$\odot$表示逐元素(elementwise)的乘法操作。输入门控制着新输入进入记忆单元$c$的强度,遗忘门控制着记忆单元维持上一时刻值的强度,输出门控制着输出记忆单元的强度。三种门的计算方式类似,但有着完全不同的参数,它们各自以不同的方式控制着记忆单元$c$,如图2所示:
<p align="center"> <p align="center">
<img src="image/lstm.png" width = "65%" align="center"/><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/image/lstm.png?raw=true" width = "65%" align="center"/><br/>
图2. 时刻$t$的LSTM [7] 图2. 时刻$t$的LSTM [7]
</p> </p>
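A single LSTM time step implementing these gate equations can be written directly in NumPy, as sketched below. This is the standard LSTM formulation without peephole connections; the dictionary-of-matrices parameter layout is an illustrative choice, not how PaddlePaddle stores its weights.

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x_t, h_prev, c_prev, W, U, b):
    """One LSTM time step. W, U, b are dicts holding the parameters of the
    three gates ('i', 'f', 'o') and the candidate cell ('c')."""
    i_t = sigmoid(W['i'] @ x_t + U['i'] @ h_prev + b['i'])   # input gate
    f_t = sigmoid(W['f'] @ x_t + U['f'] @ h_prev + b['f'])   # forget gate
    o_t = sigmoid(W['o'] @ x_t + U['o'] @ h_prev + b['o'])   # output gate
    c_tilde = np.tanh(W['c'] @ x_t + U['c'] @ h_prev + b['c'])
    c_t = f_t * c_prev + i_t * c_tilde                        # memory cell update
    h_t = o_t * np.tanh(c_t)                                  # h_t = o_t ⊙ tanh(c_t)
    return h_t, c_t
```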
...@@ -83,7 +83,7 @@ $$ h_t=Recrurent(x_t,h_{t-1})$$ ...@@ -83,7 +83,7 @@ $$ h_t=Recrurent(x_t,h_{t-1})$$
如图3所示(以三层为例),奇数层LSTM正向,偶数层LSTM反向,高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入,对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示(这一表示充分融合了文本的上下文信息,并且对文本进行了深层次抽象),最后我们将文本表示连接至softmax构建分类模型。 如图3所示(以三层为例),奇数层LSTM正向,偶数层LSTM反向,高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入,对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示(这一表示充分融合了文本的上下文信息,并且对文本进行了深层次抽象),最后我们将文本表示连接至softmax构建分类模型。
<p align="center"> <p align="center">
<img src="image/stacked_lstm.jpg" width=450><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/image/stacked_lstm.jpg?raw=true" width=450><br/>
图3. 栈式双向LSTM用于文本分类 图3. 栈式双向LSTM用于文本分类
</p> </p>
...@@ -149,6 +149,8 @@ def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim): ...@@ -149,6 +149,8 @@ def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
网络的输入`input_dim`表示的是词典的大小,`class_dim`表示类别数。这里,我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。 网络的输入`input_dim`表示的是词典的大小,`class_dim`表示类别数。这里,我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。
<a name="栈值双向LSTM"></a>
### 栈式双向LSTM ### 栈式双向LSTM
栈式双向神经网络`stacked_lstm_net`的代码片段如下: 栈式双向神经网络`stacked_lstm_net`的代码片段如下:
......
data/train.list
data/test.list
data/simple-examples*
...@@ -34,7 +34,7 @@ $$X = USV^T$$ ...@@ -34,7 +34,7 @@ $$X = USV^T$$
本章中,当词向量训练好后,我们可以用数据可视化算法t-SNE\[[4](#参考文献)\]画出词语特征在二维上的投影(如下图所示)。从图中可以看出,语义相关的词语(如a, the, these; big, huge)在投影上距离很近,语意无关的词(如say, business; decision, japan)在投影上的距离很远。 本章中,当词向量训练好后,我们可以用数据可视化算法t-SNE\[[4](#参考文献)\]画出词语特征在二维上的投影(如下图所示)。从图中可以看出,语义相关的词语(如a, the, these; big, huge)在投影上距离很近,语意无关的词(如say, business; decision, japan)在投影上的距离很远。
<p align="center"> <p align="center">
<img src = "image/2d_similarity.png" width=400><br/> <img src = "https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/2d_similarity.png?raw=true" width=400><br/>
图1. 词向量的二维投影 图1. 词向量的二维投影
</p> </p>
...@@ -50,7 +50,7 @@ similarity: -0.0997506977351 ...@@ -50,7 +50,7 @@ similarity: -0.0997506977351
``` ```
以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到,我们将在[应用模型](#应用模型)中详细描述用法。 以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到,我们将在[模型应用](#模型应用)中详细描述用法。
## 模型概览 ## 模型概览
...@@ -90,7 +90,7 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$ ...@@ -90,7 +90,7 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
其中$f(w_t, w_{t-1}, ..., w_{t-n+1})$表示根据历史n-1个词得到当前词$w_t$的条件概率,$R(\theta)$表示参数正则项。 其中$f(w_t, w_{t-1}, ..., w_{t-n+1})$表示根据历史n-1个词得到当前词$w_t$的条件概率,$R(\theta)$表示参数正则项。
<p align="center"> <p align="center">
<img src="image/nnlm.png" width=500><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/nnlm.png?raw=true" width=500><br/>
图2. N-gram神经网络模型 图2. N-gram神经网络模型
</p> </p>
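A minimal `paddle.fluid` sketch of this N-gram network is given below: the N-1 context words share one embedding table, their vectors are concatenated, passed through a hidden layer, and a softmax over the vocabulary produces $P(w_t|w_{t-n+1},...,w_{t-1})$. The layer sizes and the shared-parameter name are illustrative assumptions.

```python
import paddle.fluid as fluid

EMB_DIM = 32
HIDDEN_DIM = 256

def ngram_net(context_words, dict_size):
    # `context_words` is a list of N-1 int64 data layers, one per context word.
    # All context words share the same embedding table.
    embs = [
        fluid.layers.embedding(
            input=w, size=[dict_size, EMB_DIM],
            param_attr=fluid.ParamAttr(name='shared_w'), is_sparse=True)
        for w in context_words
    ]
    concat = fluid.layers.concat(input=embs, axis=1)
    hidden = fluid.layers.fc(input=concat, size=HIDDEN_DIM, act='sigmoid')
    # Softmax over the whole vocabulary gives the conditional word probability.
    return fluid.layers.fc(input=hidden, size=dict_size, act='softmax')
```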
...@@ -122,7 +122,7 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$ ...@@ -122,7 +122,7 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
CBOW模型通过一个词的上下文(各N个词)预测当前词。当N=2时,模型如下图所示: CBOW模型通过一个词的上下文(各N个词)预测当前词。当N=2时,模型如下图所示:
<p align="center"> <p align="center">
<img src="image/cbow.png" width=250><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/cbow.png?raw=true" width=250><br/>
图3. CBOW模型 图3. CBOW模型
</p> </p>
...@@ -137,7 +137,7 @@ $$context = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$ ...@@ -137,7 +137,7 @@ $$context = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$
CBOW的好处是对上下文词语的分布在词向量上进行了平滑,去掉了噪声,因此在小数据集上很有效。而Skip-gram的方法中,用一个词预测其上下文,得到了当前词上下文的很多样本,因此可用于更大的数据集。 CBOW的好处是对上下文词语的分布在词向量上进行了平滑,去掉了噪声,因此在小数据集上很有效。而Skip-gram的方法中,用一个词预测其上下文,得到了当前词上下文的很多样本,因此可用于更大的数据集。
<p align="center"> <p align="center">
<img src="image/skipgram.png" width=250><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/skipgram.png?raw=true" width=250><br/>
图4. Skip-gram模型 图4. Skip-gram模型
</p> </p>
...@@ -189,12 +189,13 @@ dream that one day <e> ...@@ -189,12 +189,13 @@ dream that one day <e>
最后,每个输入会按其单词次在字典里的位置,转化成整数的索引序列,作为PaddlePaddle的输入。 最后,每个输入会按其单词次在字典里的位置,转化成整数的索引序列,作为PaddlePaddle的输入。
<a name="训练模型"></a>
## 编程实现 ## 编程实现
本配置的模型结构如下图所示: 本配置的模型结构如下图所示:
<p align="center"> <p align="center">
<img src="image/ngram.png" width=400><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/ngram.png?raw=true" width=400><br/>
图5. 模型配置中的N-gram神经网络模型 图5. 模型配置中的N-gram神经网络模型
</p> </p>
...@@ -349,6 +350,7 @@ Step 20: Average Cost 5.766995 ...@@ -349,6 +350,7 @@ Step 20: Average Cost 5.766995
... ...
``` ```
<a name="模型应用"></a>
## 模型应用 ## 模型应用
在模型训练后,我们可以用它做一些预测。 在模型训练后,我们可以用它做一些预测。
......
...@@ -15,7 +15,7 @@ $$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b, i=1,\ldo ...@@ -15,7 +15,7 @@ $$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b, i=1,\ldo
## 效果展示 ## 效果展示
我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中,每个点的横坐标表示同一类房屋真实价格的中位数,纵坐标表示线性回归模型根据特征预测的结果,当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确,则点离虚线越近。 我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中,每个点的横坐标表示同一类房屋真实价格的中位数,纵坐标表示线性回归模型根据特征预测的结果,当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确,则点离虚线越近。
<p align="center"> <p align="center">
<img src = "image/predictions.png" width=400><br/> <img src = "https://github.com/PaddlePaddle/book/blob/develop/01.fit_a_line/image/predictions.png?raw=true" width=400><br/>
图1. 预测值 V.S. 真实值 图1. 预测值 V.S. 真实值
</p> </p>
...@@ -40,13 +40,9 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$ ...@@ -40,13 +40,9 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
### 训练过程 ### 训练过程
定义好模型结构之后,我们要通过以下几个步骤进行模型训练 定义好模型结构之后,我们要通过以下几个步骤进行模型训练
1. 初始化参数,其中包括权重$\omega_i$和偏置$b$,对其进行初始化(如0均值,1方差)。 1. 初始化参数,其中包括权重$\omega_i$和偏置$b$,对其进行初始化(如0均值,1方差)。
2. 网络正向传播计算网络输出和损失函数。 2. 网络正向传播计算网络输出和损失函数。
3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。 3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。
4. 重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。 4. 重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。
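To make the four steps above concrete, here is a self-contained NumPy sketch of the same loop on synthetic data; the data, learning rate, and stopping threshold are illustrative and are not the chapter's actual training code.

```python
import numpy as np

rng = np.random.RandomState(0)

# Toy data standing in for the housing features: 100 samples, 13 attributes.
X = rng.rand(100, 13)
true_w, true_b = rng.rand(13), 0.5
y = X @ true_w + true_b + rng.randn(100) * 0.01

# Step 1: initialize the weights and bias.
w, b = np.zeros(13), 0.0
lr = 0.1

for epoch in range(200):
    # Step 2: forward pass - predictions and MSE loss.
    y_hat = X @ w + b
    loss = np.mean((y_hat - y) ** 2)
    # Step 3: backpropagation - gradients of the MSE loss, then parameter update.
    grad = 2.0 * (y_hat - y) / len(y)
    w -= lr * (X.T @ grad)
    b -= lr * grad.sum()
    # Step 4: repeat until the loss (or the epoch count) reaches the stopping condition.
    if loss < 1e-4:
        break
```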
## 数据集 ## 数据集
...@@ -84,7 +80,7 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$ ...@@ -84,7 +80,7 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
- 很多的机器学习技巧/模型(例如L1,L2正则项,向量空间模型-Vector Space Model)都基于这样的假设:所有的属性取值都差不多是以0为均值且取值范围相近的。 - 很多的机器学习技巧/模型(例如L1,L2正则项,向量空间模型-Vector Space Model)都基于这样的假设:所有的属性取值都差不多是以0为均值且取值范围相近的。
<p align="center"> <p align="center">
<img src = "image/ranges.png" width=550><br/> <img src = "https://github.com/PaddlePaddle/book/blob/develop/01.fit_a_line/image/ranges.png?raw=true" width=550><br/>
图2. 各维属性的取值范围 图2. 各维属性的取值范围
</p> </p>
...@@ -199,10 +195,12 @@ step = 0 ...@@ -199,10 +195,12 @@ step = 0
def event_handler_plot(event): def event_handler_plot(event):
global step global step
if isinstance(event, fluid.EndStepEvent): if isinstance(event, fluid.EndStepEvent):
if event.step % 10 == 0: # record the test cost every 10 seconds if step % 10 == 0: # record a train cost every 10 batches
plot_cost.append(train_title, step, event.metrics[0])
if step % 100 == 0: # record a test cost every 100 batches
test_metrics = trainer.test( test_metrics = trainer.test(
reader=test_reader, feed_order=feed_order) reader=test_reader, feed_order=feed_order)
plot_cost.append(test_title, step, test_metrics[0]) plot_cost.append(test_title, step, test_metrics[0])
plot_cost.plot() plot_cost.plot()
...@@ -210,12 +208,13 @@ def event_handler_plot(event): ...@@ -210,12 +208,13 @@ def event_handler_plot(event):
# If the accuracy is good enough, we can stop the training. # If the accuracy is good enough, we can stop the training.
print('loss is less than 10.0, stop') print('loss is less than 10.0, stop')
trainer.stop() trainer.stop()
step += 1
if isinstance(event, fluid.EndEpochEvent):
if event.epoch % 10 == 0:
# We can save the trained parameters for the inferences later # We can save the trained parameters for the inferences later
if params_dirname is not None: if params_dirname is not None:
trainer.save_params(params_dirname) trainer.save_params(params_dirname)
step += 1
``` ```
### 开始训练 ### 开始训练
...@@ -231,11 +230,10 @@ trainer.train( ...@@ -231,11 +230,10 @@ trainer.train(
event_handler=event_handler_plot, event_handler=event_handler_plot,
feed_order=feed_order) feed_order=feed_order)
``` ```
<div align="center">
<p align="center"> <img src="https://github.com/PaddlePaddle/book/blob/develop/01.fit_a_line/image/train_and_test.png?raw=true" width="400"><br/>
<img src = "image/train_and_test1.png" width=400><br/> 图3 训练结果
图3. 训练结果 </div>
</p>
## 预测 ## 预测
...@@ -262,18 +260,18 @@ inferencer = fluid.Inferencer( ...@@ -262,18 +260,18 @@ inferencer = fluid.Inferencer(
batch_size = 10 batch_size = 10
test_reader = paddle.batch(paddle.dataset.uci_housing.test(),batch_size=batch_size) test_reader = paddle.batch(paddle.dataset.uci_housing.test(),batch_size=batch_size)
test_data = test_reader().next() test_data = test_reader().next()
test_feat = numpy.array([data[0] for data in test_data]).astype("float32") test_x = numpy.array([data[0] for data in test_data]).astype("float32")
test_label = numpy.array([data[1] for data in test_data]).astype("float32") test_y = numpy.array([data[1] for data in test_data]).astype("float32")
results = inferencer.infer({'x': test_feat}) results = inferencer.infer({'x': test_x})
print("infer results: (House Price)") print("infer results: (House Price)")
for k in range(0, batch_size-1): for idx, val in enumerate(results[0]):
print("%d. %f" % (k, results[0][k])) print("%d: %.2f" % (idx, val))
print("\nground truth:") print("\nground truth:")
for k in range(0, batch_size-1): for idx, val in enumerate(test_y):
print("%d. %f" % (k, test_label[k])) print("%d: %.2f" % (idx, val))
``` ```
## 总结 ## 总结
......
...@@ -6,8 +6,8 @@ ...@@ -6,8 +6,8 @@
当我们学习编程的时候,编写的第一个程序一般是实现打印"Hello World"。而机器学习(或深度学习)的入门教程,一般都是 [MNIST](http://yann.lecun.com/exdb/mnist/) 数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题,比较简单,同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集,包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵,标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。 当我们学习编程的时候,编写的第一个程序一般是实现打印"Hello World"。而机器学习(或深度学习)的入门教程,一般都是 [MNIST](http://yann.lecun.com/exdb/mnist/) 数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题,比较简单,同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集,包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵,标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。
<p align="center"> <p align="center">
<img src="image/mnist_example_image.png" width="400"><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/mnist_example_image.png?raw=true" width="400"><br/>
图1. MNIST图片示例 图1. MNIST图片示例
</p> </p>
MNIST数据集是从 [NIST](https://www.nist.gov/srd/nist-special-database-19) 的Special Database 3(SD-3)和Special Database 1(SD-1)构建而来。由于SD-3是由美国人口调查局的员工进行标注,SD-1是由美国高中生进行标注,因此SD-3比SD-1更干净也更容易识别。Yann LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集(60000条数据)和测试集(10000条数据),其中训练集来自250位不同的标注员,此外还保证了训练集和测试集的标注员是不完全相同的。 MNIST数据集是从 [NIST](https://www.nist.gov/srd/nist-special-database-19) 的Special Database 3(SD-3)和Special Database 1(SD-1)构建而来。由于SD-3是由美国人口调查局的员工进行标注,SD-1是由美国高中生进行标注,因此SD-3比SD-1更干净也更容易识别。Yann LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集(60000条数据)和测试集(10000条数据),其中训练集来自250位不同的标注员,此外还保证了训练集和测试集的标注员是不完全相同的。
...@@ -40,12 +40,12 @@ $$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$ ...@@ -40,12 +40,12 @@ $$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$
在分类问题中,我们一般采用交叉熵代价损失函数(cross entropy loss),公式如下: 在分类问题中,我们一般采用交叉熵代价损失函数(cross entropy loss),公式如下:
$$ L_{cross-entropy}(label, y) = -\sum_i label_i \log(y_i) $$
图2为softmax回归的网络图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。 图2为softmax回归的网络图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
<p align="center"> <p align="center">
<img src="image/softmax_regression.png" width=400><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/softmax_regression.png?raw=true" width=400><br/>
图2. softmax回归网络结构图<br/> 图2. softmax回归网络结构图<br/>
</p> </p>
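In `paddle.fluid`, this softmax regression amounts to a single fully connected layer with a softmax activation plus the cross-entropy cost, as in the minimal sketch below. The layer names and shapes follow the MNIST setup described above; this is a sketch, not the chapter's full program.

```python
import paddle.fluid as fluid

def softmax_regression():
    # 28x28 grey-scale image (flattened by the fc layer); the label is one int64 digit.
    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # One fully connected layer with softmax: y_i = softmax(sum_j W_ij x_j + b_i)
    predict = fluid.layers.fc(input=img, size=10, act='softmax')
    # Cross-entropy cost, averaged over the mini-batch.
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(cost)
    return predict, avg_cost
```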
...@@ -54,16 +54,14 @@ $$ L_{cross-entropy} (label, y) = -\sum_i label_ilog(y_i) $$ ...@@ -54,16 +54,14 @@ $$ L_{cross-entropy} (label, y) = -\sum_i label_ilog(y_i) $$
Softmax回归模型采用了最简单的两层神经网络,即只有输入层和输出层,因此其拟合能力有限。为了达到更好的识别效果,我们考虑在输入层和输出层中间加上若干个隐藏层\[[10](#参考文献)\] Softmax回归模型采用了最简单的两层神经网络,即只有输入层和输出层,因此其拟合能力有限。为了达到更好的识别效果,我们考虑在输入层和输出层中间加上若干个隐藏层\[[10](#参考文献)\]
1. 经过第一个隐藏层,可以得到 $ H_1 = \phi(W_1X + b_1) $,其中$\phi$代表激活函数,常见的有sigmoid、tanh或ReLU等函数。 1. 经过第一个隐藏层,可以得到 $ H_1 = \phi(W_1X + b_1) $,其中$\phi$代表激活函数,常见的有sigmoid、tanh或ReLU等函数。
2. 经过第二个隐藏层,可以得到 $ H_2 = \phi(W_2H_1 + b_2) $。 2. 经过第二个隐藏层,可以得到 $ H_2 = \phi(W_2H_1 + b_2) $。
3. 最后,再经过输出层,得到的$Y=\text{softmax}(W_3H_2 + b_3)$,即为最后的分类结果向量。 3. 最后,再经过输出层,得到的$Y=\text{softmax}(W_3H_2 + b_3)$,即为最后的分类结果向量。
图3为多层感知器的网络结构图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。 图3为多层感知器的网络结构图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
<p align="center"> <p align="center">
<img src="image/mlp.png" width=500><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/mlp.png?raw=true" width=500><br/>
图3. 多层感知器网络结构图<br/> 图3. 多层感知器网络结构图<br/>
</p> </p>
...@@ -72,7 +70,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层 ...@@ -72,7 +70,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
在多层感知器模型中,将图像展开成一维向量输入到网络中,忽略了图像的位置和结构信息,而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图4显示了其结构:输入的二维图像,先经过两次卷积层到池化层,再经过全连接层,最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。 在多层感知器模型中,将图像展开成一维向量输入到网络中,忽略了图像的位置和结构信息,而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图4显示了其结构:输入的二维图像,先经过两次卷积层到池化层,再经过全连接层,最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。
<p align="center"> <p align="center">
<img src="image/cnn.png"><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/cnn.png?raw=true" width="400"><br/>
图4. LeNet-5卷积神经网络结构<br/> 图4. LeNet-5卷积神经网络结构<br/>
</p> </p>
...@@ -81,7 +79,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层 ...@@ -81,7 +79,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积,即离散二维滤波器(也称作卷积核)与二维图像做卷积操作,简单的讲是二维滤波器滑动到二维图像上所有位置,并在每个位置上与该像素点及其领域像素点做内积。卷积操作被广泛应用与图像处理领域,不同卷积核可以提取不同的特征,例如边沿、线性、角等特征。在深层卷积神经网络中,通过卷积操作可以提取出图像低级到复杂的特征。 卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积,即离散二维滤波器(也称作卷积核)与二维图像做卷积操作,简单的讲是二维滤波器滑动到二维图像上所有位置,并在每个位置上与该像素点及其领域像素点做内积。卷积操作被广泛应用与图像处理领域,不同卷积核可以提取不同的特征,例如边沿、线性、角等特征。在深层卷积神经网络中,通过卷积操作可以提取出图像低级到复杂的特征。
<p align="center"> <p align="center">
<img src="image/conv_layer.png" width='750'><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/conv_layer.png?raw=true" width='750'><br/>
图5. 卷积层图片<br/> 图5. 卷积层图片<br/>
</p> </p>
...@@ -98,16 +96,15 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层 ...@@ -98,16 +96,15 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
#### 池化层 #### 池化层
<p align="center"> <p align="center">
<img src="image/max_pooling.png" width="400px"><br/> <img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/max_pooling.png?raw=true" width="400px"><br/>
图6. 池化层图片<br/> 图6. 池化层图片<br/>
</p> </p>
池化是非线性下采样的一种形式,主要作用是通过减少网络的参数来减小计算量,并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域,对于每个矩形框的数取最大值作为输出层,如图6所示。 池化是非线性下采样的一种形式,主要作用是通过减少网络的参数来减小计算量,并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域,对于每个矩形框的数取最大值作为输出层,如图6所示。
更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )[图像分类](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md)教程。 更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )[图像分类]( https://github.com/PaddlePaddle/book/tree/develop/03.image_classification )教程。
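Putting the convolution and pooling pieces together, a LeNet-5-style network can be sketched with `fluid.nets.simple_img_conv_pool`, which bundles a convolution with a following max-pooling step. The filter counts below are illustrative, and the sketch omits details such as batch normalization.

```python
import paddle.fluid as fluid

def convolutional_neural_network(img):
    # First conv + max-pool stage: 20 feature maps from 5x5 kernels,
    # then 2x2 non-overlapping max pooling.
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=img, filter_size=5, num_filters=20,
        pool_size=2, pool_stride=2, act='relu')
    # Second conv + max-pool stage increases the number of feature maps.
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1, filter_size=5, num_filters=50,
        pool_size=2, pool_stride=2, act='relu')
    # Final fully connected layer with softmax gives the 10-way classification.
    return fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
```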
### 常见激活函数介绍 ### 常见激活函数介绍
- sigmoid激活函数: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $ - sigmoid激活函数: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $
- tanh激活函数: $ f(x) = tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $ - tanh激活函数: $ f(x) = tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $
...@@ -136,20 +133,18 @@ PaddlePaddle在API中提供了自动加载[MNIST](http://yann.lecun.com/exdb/mni ...@@ -136,20 +133,18 @@ PaddlePaddle在API中提供了自动加载[MNIST](http://yann.lecun.com/exdb/mni
我们建议使用 Fluid API,因为它更容易学起来。 我们建议使用 Fluid API,因为它更容易学起来。
下面是快速的 Fluid API 概述。 下面是快速的 Fluid API 概述。
1. `inference_program`:指定如何从数据输入中获得预测的函数。 1. `inference_program`:指定如何从数据输入中获得预测的函数。
这是指定网络流的地方。 这是指定网络流的地方。
2. `train_program`:指定如何从 `inference_program``标签值`中获取 `loss` 的函数。 1. `train_program`:指定如何从 `inference_program``标签值`中获取 `loss` 的函数。
这是指定损失计算的地方。 这是指定损失计算的地方。
3. `optimizer_func`: 指定优化器配置的函数。优化器负责减少损失并驱动训练。Paddle 支持多种不同的优化器。 1. `optimizer_func`: 指定优化器配置的函数。优化器负责减少损失并驱动训练。Paddle 支持多种不同的优化器。
4. `Trainer`:PaddlePaddle Trainer 管理由 `train_program``optimizer` 指定的训练过程。 1. `Trainer`:PaddlePaddle Trainer 管理由 `train_program``optimizer` 指定的训练过程。
通过 `event_handler` 回调函数,用户可以监控训练的进展。 通过 `event_handler` 回调函数,用户可以监控训练的进展。
5. `Inferencer`:Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。 1. `Inferencer`:Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。
然后,它可以推断数据和返回预测。 然后,它可以推断数据和返回预测。
在这个演示中,我们将深入了解它们。 在这个演示中,我们将深入了解它们。
...@@ -240,6 +235,7 @@ def train_program(): ...@@ -240,6 +235,7 @@ def train_program():
acc = fluid.layers.accuracy(input=predict, label=label) acc = fluid.layers.accuracy(input=predict, label=label)
return [avg_cost, acc] return [avg_cost, acc]
``` ```
#### Optimizer Function 配置 #### Optimizer Function 配置
...@@ -255,9 +251,9 @@ def optimizer_program(): ...@@ -255,9 +251,9 @@ def optimizer_program():
下一步,我们开始训练过程。`paddle.dataset.movielens.train()``paddle.dataset.movielens.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数,每次调用的时候返回一个Python yield generator。 下一步,我们开始训练过程。`paddle.dataset.movielens.train()``paddle.dataset.movielens.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数,每次调用的时候返回一个Python yield generator。
下面`shuffle`是一个reader decorator,它接受一个reader A,返回另一个reader B 。reader B 每次读入`buffer_size`条训练数据到一个buffer里,然后随机打乱其顺序,并且逐条输出。 下面`shuffle`是一个reader decorator,它接受一个reader A,返回另一个reader B。reader B 每次读入`buffer_size`条训练数据到一个buffer里,然后随机打乱其顺序,并且逐条输出。
`batch`是一个特殊的decorator,它的输入是一个reader,输出是一个batched reader 。在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minibatch。 `batch`是一个特殊的decorator,它的输入是一个reader,输出是一个batched reader。在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minibatch。
```python ```python
train_reader = paddle.batch( train_reader = paddle.batch(
...@@ -280,7 +276,6 @@ place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() ...@@ -280,7 +276,6 @@ place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer( trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer_func=optimizer_program) train_func=train_program, place=place, optimizer_func=optimizer_program)
``` ```
#### Event Handler 配置 #### Event Handler 配置
...@@ -315,11 +310,10 @@ def event_handler(event): ...@@ -315,11 +310,10 @@ def event_handler(event):
`event_handler_plot` 可以用来在训练过程中画图如下: `event_handler_plot` 可以用来在训练过程中画图如下:
<div align="center">
<p align="center"> <img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/train_and_test.png?raw=true" width="400"><br/>
<img src="image/train_and_test2.png" width="400"><br/> 图7 训练结果
图7. 训练结果 </div>
</p>
```python ```python
......
...@@ -149,7 +149,7 @@ python setup.py bdist_wheel ...@@ -149,7 +149,7 @@ python setup.py bdist_wheel
pip install --upgrade dist/visualdl-*.whl pip install --upgrade dist/visualdl-*.whl
``` ```
如果打包和安装遇到其他问题,不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/how_to_dev_frontend_en.md) 如果打包和安装遇到其他问题,不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/develop/how_to_dev_frontend_cn.md)
## SDK ## SDK
......
############
模型预测部署
############
PaddlePaddle Fluid 提供了 C++ API 来支持模型的部署上线
.. toctree::
:maxdepth: 2
build_and_install_lib_cn.rst
native_infer.rst
...@@ -4,12 +4,13 @@ Paddle 预测 API ...@@ -4,12 +4,13 @@ Paddle 预测 API
为了更简单方便的预测部署,Fluid 提供了一套高层 API 为了更简单方便的预测部署,Fluid 提供了一套高层 API
用来隐藏底层不同的优化实现。 用来隐藏底层不同的优化实现。
`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`__ `预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`_
包括 包括
- 头文件 ``paddle_inference_api.h`` 定义了所有的接口 - 头文件 ``paddle_inference_api.h`` 定义了所有的接口
- 库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a`` - 库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a``
编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。 编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。
下面是一些 API 概念的介绍 下面是一些 API 概念的介绍
...@@ -95,7 +96,7 @@ engine ...@@ -95,7 +96,7 @@ engine
CHECK(predictor->Run(slots, &outputs)); CHECK(predictor->Run(slots, &outputs));
// 获取 outputs ... // 获取 outputs ...
编译时,联编 ``libpaddle_fluid.a/.so`` 可。 编译时,联编 ``libpaddle_fluid.a/.so`` 便可。
详细代码参考 详细代码参考
------------ ------------
......
...@@ -38,7 +38,6 @@ PaddlePaddle Fluid支持两种传入数据的方式: ...@@ -38,7 +38,6 @@ PaddlePaddle Fluid支持两种传入数据的方式:
:maxdepth: 2 :maxdepth: 2
feeding_data feeding_data
use_recordio_reader
Python Reader Python Reader
############# #############
......
.. _user_guide_use_recordio_as_train_data:
############################
使用RecordIO文件作为训练数据
############################
相比于 :ref:`user_guide_use_numpy_array_as_train_data`,
:ref:`user_guide_use_recordio_as_train_data` 的性能更好;
但是用户需要先将训练数据集转换成RecordIO文件格式,再使用
:code:`fluid.layers.open_files()` 层在神经网络配置中导入 RecordIO 文件。
用户还可以使用 :code:`fluid.layers.double_buffer()` 加速数据从内存到显存的拷贝,
使用 :code:`fluid.layers.Preprocessor` 工具进行数据增强。
将训练数据转换成RecordIO文件格式
################################
:code:`fluid.recordio_writer` 中,每个记录都是一个
:code:`vector<LoDTensor>`, 即一个支持序列信息的Tensor数组。这个数组包括训练所需
的所有特征。例如对于图像分类来说,这个数组可以包含图片和分类标签。
用户可以使用 :code:`fluid.recordio_writer.convert_reader_to_recordio_file()` 将
:ref:`user_guide_reader` 转换成一个RecordIO文件。或者可以使用
:code:`fluid.recordio_writer.convert_reader_to_recordio_files()` 将一个
:ref:`user_guide_reader` 转换成多个RecordIO文件。
具体使用方法为:
.. code-block:: python
import paddle
import paddle.fluid as fluid
import numpy
def reader_creator():
def __impl__():
for i in range(1000):
yield [
numpy.random.random(size=[3, 224, 224]).astype("float32"),
numpy.random.randint(0, 10, size=[1]).astype("int64")
]
return __impl__
img = fluid.layers.data(name="image", shape=[3, 224, 224])
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
BATCH_SIZE = 32
reader = paddle.batch(reader_creator(), batch_size=BATCH_SIZE)
fluid.recordio_writer.convert_reader_to_recordio_file(
"train.recordio", feeder=feeder, reader_creator=reader)
Here :code:`reader_creator` creates a :code:`Reader`, and
:ref:`_api_fluid_data_feeder_DataFeeder`
is the utility that converts a :code:`Reader` into :code:`LoDTensor`. See
:ref:`user_guide_reader` for details.

The program above converts the data produced by :code:`reader_creator` into the file
:code:`train.recordio`, in which each record holds 32 samples. If the batch size will be adjusted
during training, the user can set the number of samples per record to 1 and refer to
:ref:`user_guide_use_recordio_as_train_data_use_op_create_batch`.
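For instance, a minimal sketch of that one-sample-per-record conversion, reusing the
:code:`reader_creator` and :code:`feeder` from the example above (the output file name is
illustrative only):

.. code-block:: python

    # Write one sample per record so the batch size can be chosen at training time.
    single_sample_reader = paddle.batch(reader_creator(), batch_size=1)
    fluid.recordio_writer.convert_reader_to_recordio_file(
        "train_single.recordio", feeder=feeder, reader_creator=single_sample_reader)
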
Configure the Network and Open the RecordIO Files
###################################################

Once the RecordIO files have been converted, the user can open them with
:code:`fluid.layers.open_files()` and read their contents with :code:`fluid.layers.read_file`.

Basic usage is as follows:

.. code-block:: python

    import paddle.fluid as fluid

    file_obj = fluid.layers.open_files(
        filenames=["train.recordio"],
        shapes=[[3, 224, 224], [1]],
        lod_levels=[0, 0],
        dtypes=["float32", "int64"],
        pass_num=100
    )

    image, label = fluid.layers.read_file(file_obj)

If :code:`pass_num` is set, then once all the data has been read, reading starts over from the
beginning until the data has been read :code:`pass_num` times.

Advanced Usage
##############

Use :code:`fluid.layers.double_buffer()`
------------------------------------------

:code:`Double buffer` uses the double-buffering technique to copy training data from host memory
to GPU memory. To configure double buffering, wrap the file object with
:code:`fluid.layers.double_buffer()`. For example:

.. code-block:: python

    import paddle.fluid as fluid

    file_obj = fluid.layers.open_files(...)
    file_obj = fluid.layers.double_buffer(file_obj)

    image, label = fluid.layers.read_file(file_obj)

For the double-buffering technique itself, see
`Multiple buffering <https://en.wikipedia.org/wiki/Multiple_buffering>`_.

Configure Data Augmentation
---------------------------

:code:`fluid.layers.Preprocessor` can be used to configure data augmentation for the file. For example:

.. code-block:: python

    import paddle.fluid as fluid

    file_obj = fluid.layers.open_files(...)
    # Wrap the opened file object with a Preprocessor to apply augmentation.
    preprocessor = fluid.layers.Preprocessor(reader=file_obj)
    with preprocessor.block():
        image, label = preprocessor.inputs()
        image = image / 2
        label = label + 1
        preprocessor.outputs(image, label)

As the code above shows, :code:`Preprocessor` defines a data augmentation module, and the concrete
augmentation operations are defined inside :code:`with preprocessor.block()`. The user obtains the
individual fields of the data file via :code:`preprocessor.inputs()` and marks the preprocessed
results with :code:`preprocessor.outputs()`.

.. _user_guide_use_recordio_as_train_data_use_op_create_batch:

Use an Op to Create Batches
---------------------------

:code:`fluid.layers.batch()` can be used to form batches dynamically during training. For example:

.. code-block:: python

    import paddle.fluid as fluid

    file_obj = fluid.layers.open_files(...)
    file_obj = fluid.layers.batch(file_obj, batch_size=32)

    img, label = fluid.layers.read_file(file_obj)

Note that if the last few samples in the dataset cannot fill a batch of size :code:`batch_size`,
they form a smaller batch on their own and are used for training as-is.

Shuffle the Input Data
----------------------

:code:`fluid.layers.shuffle()` can be used to reshuffle the training data dynamically during training. For example:

.. code-block:: python

    import paddle.fluid as fluid

    file_obj = fluid.layers.open_files(...)
    file_obj = fluid.layers.shuffle(file_obj, buffer_size=8192)

    img, label = fluid.layers.read_file(file_obj)

Note that:

1. :code:`shuffle` works by first reading :code:`buffer_size` samples into a buffer and then
   randomly drawing samples from it for training.
2. The :code:`buffer_size` of :code:`shuffle` consumes training-time memory; make sure there is
   enough memory to cache :code:`buffer_size` samples during training.
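
Putting the pieces together, the following is a minimal sketch, under the same assumptions as the
examples above (the file name, shapes, buffer size, and batch size are illustrative), of a reader
pipeline that chains shuffling, batching, and double buffering:

.. code-block:: python

    import paddle.fluid as fluid

    # Open the RecordIO file, then shuffle, batch, and double-buffer the sample stream.
    file_obj = fluid.layers.open_files(
        filenames=["train.recordio"],
        shapes=[[3, 224, 224], [1]],
        lod_levels=[0, 0],
        dtypes=["float32", "int64"])
    file_obj = fluid.layers.shuffle(file_obj, buffer_size=8192)
    file_obj = fluid.layers.batch(file_obj, batch_size=32)
    file_obj = fluid.layers.double_buffer(file_obj)

    img, label = fluid.layers.read_file(file_obj)
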
...@@ -15,4 +15,5 @@
   howto/training/index
   howto/debug/index
   howto/evaluation/index
   howto/inference/index
   models/index.rst
...@@ -172,6 +172,7 @@ paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], ...@@ -172,6 +172,7 @@ paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'],
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None)) paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None))
paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True)) paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
...@@ -311,7 +312,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw ...@@ -311,7 +312,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
...@@ -375,7 +376,7 @@ paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'l ...@@ -375,7 +376,7 @@ paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'l
paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5)) paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0)) paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0, False))
paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95)) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
......
.tensor_util.cu
.data_type_transform.cu
\ No newline at end of file
# windows treat symbolic file as a real file, which is different with unix
# We create a hidden file and compile it instead of origin source file.
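# Illustrative usage, inferred from the call sites below (e.g. windows_symbolic(tensor_util SRCS tensor_util.cu)):
# the function expects <src>.cc to exist and <src>.cu to be a symbolic link to it, and compiles a hidden copy instead.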
function(windows_symbolic TARGET)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
foreach(src ${windows_symbolic_SRCS})
get_filename_component(src ${src} NAME_WE)
if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
message(FATAL " ${src}.cc and ${src}.cu must exist, and ${src}.cu must be a symbolic file.")
endif()
add_custom_command(OUTPUT .${src}.cu
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
COMMENT "create hidden file of ${src}.cu")
add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
endforeach()
endfunction()
add_subdirectory(ir) add_subdirectory(ir)
if (NOT WIN32) if (NOT WIN32)
add_subdirectory(details) add_subdirectory(details)
...@@ -11,7 +30,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim) ...@@ -11,7 +30,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
if(WITH_GPU) if(WITH_GPU)
if (WIN32)
windows_symbolic(tensor_util SRCS tensor_util.cu)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util)
else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
endif(WIN32)
else() else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context) cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
endif() endif()
...@@ -55,7 +80,13 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu ...@@ -55,7 +80,13 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function) DEPS operator op_registry device_context math_function)
if(WITH_GPU) if(WITH_GPU)
if (WIN32)
windows_symbolic(hidden_file SRCS data_type_transform.cu)
nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
add_dependencies(data_type_transform hidden_file)
else()
nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
endif(WIN32)
nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform) nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
else() else()
cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
......
...@@ -326,7 +326,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -326,7 +326,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
ir::Graph &result = *graph; ir::Graph &result = *graph;
for (auto &node : nodes) { for (auto &node : nodes) {
if (node->NodeType() == ir::Node::Type::kVariable && node->Var()) { if (node->IsVar() && node->Var()) {
all_vars_.emplace(node->Name(), node->Var()); all_vars_.emplace(node->Name(), node->Var());
} }
} }
...@@ -583,18 +583,6 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp( ...@@ -583,18 +583,6 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
} }
} }
bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const {
bool is_pg_once =
grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
if (is_pg_once) {
// Insert NCCL AllReduce Op
og_has_been_broadcast->insert(og);
}
return is_pg_once;
}
int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph, int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
ir::Node *node) const { ir::Node *node) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
...@@ -688,20 +676,6 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, ...@@ -688,20 +676,6 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
return var; return var;
} }
// Find the first occurence of `prev_op_name` and make current `op` depend
// on it.
void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op,
const std::string &prev_op_name) const {
for (auto &prev_op : result->Get<GraphOps>(kGraphOps)) {
if (prev_op->Name() == prev_op_name) {
auto *dep_var = new DummyVarHandle(result->CreateControlDepVar());
prev_op->AddOutput(dep_var);
result->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
op->AddInput(dep_var);
}
}
}
void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
ir::Node *node) const { ir::Node *node) const {
int op_dev_id = -1; int op_dev_id = -1;
......
...@@ -69,9 +69,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -69,9 +69,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
std::vector<std::string> FindDistTrainRecvVars( std::vector<std::string> FindDistTrainRecvVars(
const std::vector<ir::Node *> &nodes) const; const std::vector<ir::Node *> &nodes) const;
void ConnectOp(ir::Graph *result, OpHandleBase *op,
const std::string &prev_op_name) const;
void CreateComputationalOps(ir::Graph *result, ir::Node *node, void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const; size_t num_places) const;
...@@ -83,10 +80,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -83,10 +80,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
void CreateComputationalOp(ir::Graph *result, ir::Node *node, void CreateComputationalOp(ir::Graph *result, ir::Node *node,
int dev_id) const; int dev_id) const;
bool IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const;
int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const; int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const;
void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
......
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n")
file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
function(pass_library TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass)
file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
endfunction()
cc_library(node SRCS node.cc DEPS proto_desc) cc_library(node SRCS node.cc DEPS proto_desc)
cc_library(graph SRCS graph.cc DEPS node) cc_library(graph SRCS graph.cc DEPS node)
cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
cc_library(pass SRCS pass.cc DEPS graph node graph_helper) cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
cc_library(graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph) cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector)
cc_library(attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector) pass_library(graph_to_program_pass)
cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass) pass_library(graph_viz_pass)
cc_library(fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector) pass_library(fc_fuse_pass)
cc_library(seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector) pass_library(attention_lstm_fuse_pass)
pass_library(infer_clean_graph_pass)
pass_library(fc_lstm_fuse_pass)
pass_library(seq_concat_fc_fuse_pass)
set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detector graph pass graph_traits framework_proto) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
...@@ -13,10 +13,10 @@ ...@@ -13,10 +13,10 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h" #include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/helper.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -96,17 +96,13 @@ void FindWhileOp(Graph* graph) { ...@@ -96,17 +96,13 @@ void FindWhileOp(Graph* graph) {
auto* cell_init = graph->RetriveNode(6); auto* cell_init = graph->RetriveNode(6);
auto* hidden_init = graph->RetriveNode(8); auto* hidden_init = graph->RetriveNode(8);
#define LINK_TO(node0, node1) \
node0->outputs.push_back(node1); \
node1->inputs.push_back(node0);
auto* lstm_op = graph->CreateOpNode(&op_desc); auto* lstm_op = graph->CreateOpNode(&op_desc);
PrepareParameters(graph, param); PrepareParameters(graph, param);
LINK_TO(X, lstm_op); IR_NODE_LINK_TO(X, lstm_op);
LINK_TO(cell_init, lstm_op); IR_NODE_LINK_TO(cell_init, lstm_op);
LINK_TO(hidden_init, lstm_op); IR_NODE_LINK_TO(hidden_init, lstm_op);
LINK_TO(lstm_op, LSTMOUT); IR_NODE_LINK_TO(lstm_op, LSTMOUT);
GraphSafeRemoveNodes(graph, marked_nodes); GraphSafeRemoveNodes(graph, marked_nodes);
} }
...@@ -216,11 +212,11 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, ...@@ -216,11 +212,11 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
float* out_data = out->mutable_data<float>(platform::CPUPlace()); float* out_data = out->mutable_data<float>(platform::CPUPlace());
std::array<const float*, 4> tensors( std::array<const float*, 4> tensors(
{W_forget_w0.data<float>(), W_input_w0.data<float>(), {{W_forget_w0.data<float>(), W_input_w0.data<float>(),
W_output_w0.data<float>(), W_cell_w0.data<float>()}); W_output_w0.data<float>(), W_cell_w0.data<float>()}});
std::array<const float*, 4> tensors1( std::array<const float*, 4> tensors1(
{W_forget_w1.data<float>(), W_input_w1.data<float>(), {{W_forget_w1.data<float>(), W_input_w1.data<float>(),
W_output_w1.data<float>(), W_cell_w1.data<float>()}); W_output_w1.data<float>(), W_cell_w1.data<float>()}});
for (int row = 0; row < D; row++) { for (int row = 0; row < D; row++) {
for (int col = 0; col < 4; col++) { for (int col = 0; col < 4; col++) {
...@@ -243,8 +239,8 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, ...@@ -243,8 +239,8 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
const LoDTensor& B_output, const LoDTensor& B_cell, const LoDTensor& B_output, const LoDTensor& B_cell,
LoDTensor* out) { LoDTensor* out) {
std::array<const float*, 4> tensors( std::array<const float*, 4> tensors(
{B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(), {{B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
B_cell.data<float>()}); B_cell.data<float>()}});
PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
int D = B_forget.dims()[0]; int D = B_forget.dims()[0];
......
...@@ -21,74 +21,26 @@ namespace paddle { ...@@ -21,74 +21,26 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
bool VarOutLinksToOp(Node* node, const std::string& op_type) {
for (auto* out : node->outputs) {
if (out->IsOp() && out->Op()->Type() == op_type) {
return true;
}
}
return false;
}
void BuildFCPattern(PDPattern* pattern) {
// Create Operators
auto* mul_op = pattern->NewNode("mul")->assert_is_op("mul");
auto* elementwise_add_op =
pattern->NewNode("elementwise_add")->assert_is_op("elementwise_add");
// Create variables
// w
auto* mul_weight_var = pattern->NewNode("mul_weight")
->AsInput()
->assert_is_op_nth_input("mul", "Y", 0);
// x
auto* mul_tmp_var = pattern->NewNode("mul_tmp_var")
->AsInput()
->assert_is_op_nth_input("mul", "X", 0);
// intermediate variable, will be removed in the IR after fuse.
auto* mul_out_var = pattern->NewNode("mul_out")
->AsIntermediate()
->assert_is_only_output_of_op("mul")
->assert_is_op_input("elementwise_add");
// bias
auto* elementwise_add_tmp_var = pattern->NewNode("elementwise_add_tmpvar")
->assert_is_op_input("elementwise_add")
->AsInput();
// output
auto* elementwise_add_out_var = pattern->NewNode("elementwise_add_out")
->AsOutput()
->assert_is_op_output("elementwise_add");
mul_op->LinksFrom({mul_weight_var, mul_tmp_var}).LinksTo({mul_out_var});
elementwise_add_op->LinksFrom({mul_out_var, elementwise_add_tmp_var})
.LinksTo({elementwise_add_out_var});
}
// Replace the node `from` in the links to `to`
bool LinksReplace(std::vector<Node*>* links, Node* from, Node* to) {
for (auto*& n : *links) {
if (n == from) {
n = to;
return true;
}
}
return false;
}
std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl( std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get()); PADDLE_ENFORCE(graph.get());
FusePassBase::Init("fc", graph.get()); FusePassBase::Init("fc_fuse", graph.get());
std::unordered_set<Node*> nodes2delete; std::unordered_set<Node*> nodes2delete;
GraphPatternDetector gpd; GraphPatternDetector gpd;
BuildFCPattern(gpd.mutable_pattern()); // BuildFCPattern(gpd.mutable_pattern());
auto* x = gpd.mutable_pattern()
->NewNode("fc_fuse/x")
->AsInput()
->assert_is_op_input("mul", "X");
patterns::FC(gpd.mutable_pattern(), "fc_fuse", x, true /*with bias*/);
#define GET_NODE(id) \ #define GET_NODE(id) \
PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode(#id)), \ PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode("fc_fuse/" #id)), \
"pattern has no Node called %s", #id); \ "pattern has no Node called %s", #id); \
auto* id = subgraph.at(gpd.pattern().RetrieveNode(#id)); \ auto* id = subgraph.at(gpd.pattern().RetrieveNode("fc_fuse/" #id)); \
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", "fc_fuse/" #id);
int found_fc_count = 0; int found_fc_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
...@@ -98,10 +50,10 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl( ...@@ -98,10 +50,10 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
// scenerio. // scenerio.
// FC's fusion is simple, just op fuse, no need to process the // FC's fusion is simple, just op fuse, no need to process the
// parameters. // parameters.
GET_NODE(mul_tmp_var); // x GET_NODE(x); // x
GET_NODE(mul_weight); // Y GET_NODE(w); // Y
GET_NODE(elementwise_add_tmpvar); // bias GET_NODE(fc_bias); // bias
GET_NODE(elementwise_add_out); // Out GET_NODE(fc_out); // Out
GET_NODE(mul); // MUL op GET_NODE(mul); // MUL op
GET_NODE(elementwise_add); // ELEMENT_ADD op GET_NODE(elementwise_add); // ELEMENT_ADD op
GET_NODE(mul_out); // tmp GET_NODE(mul_out); // tmp
...@@ -109,32 +61,22 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl( ...@@ -109,32 +61,22 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
// Create an FC Node. // Create an FC Node.
OpDesc desc; OpDesc desc;
std::string fc_x_in = mul_tmp_var->Name(); std::string fc_x_in = x->Name();
std::string fc_Y_in = mul_weight->Name(); std::string fc_Y_in = w->Name();
std::string fc_bias_in = elementwise_add_tmpvar->Name(); std::string fc_bias_in = fc_bias->Name();
std::string fc_out = elementwise_add_out->Name(); std::string fc_out_out = fc_out->Name();
desc.SetInput("Input", std::vector<std::string>({fc_x_in})); desc.SetInput("Input", std::vector<std::string>({fc_x_in}));
desc.SetInput("W", std::vector<std::string>({fc_Y_in})); desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
desc.SetInput("Bias", std::vector<std::string>({fc_bias_in})); desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
desc.SetOutput("Out", std::vector<std::string>({fc_out})); desc.SetOutput("Out", std::vector<std::string>({fc_out_out}));
desc.SetType("fc"); desc.SetType("fc");
auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied.
fc_node->inputs = GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});
std::vector<Node*>({mul_tmp_var, mul_weight, elementwise_add_tmpvar});
fc_node->outputs.push_back(elementwise_add_out);
// Update link relatons
PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node));
PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node));
PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs,
elementwise_add, fc_node));
PADDLE_ENFORCE(
LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node));
// Drop old nodes IR_NODE_LINK_TO(x, fc_node);
graph->RemoveNode(mul); IR_NODE_LINK_TO(w, fc_node);
graph->RemoveNode(elementwise_add); IR_NODE_LINK_TO(fc_bias, fc_node);
graph->RemoveNode(mul_out); // tmp variable IR_NODE_LINK_TO(fc_node, fc_out);
found_fc_count++; found_fc_count++;
}; };
......
...@@ -11,39 +11,39 @@ ...@@ -11,39 +11,39 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl( std::string GenNodeName(const std::string& prefix, const std::string& name) {
std::unique_ptr<ir::Graph> graph) const { return prefix + "/" + name;
GraphPatternDetector gpd; }
auto* pattern = gpd.mutable_pattern();
std::unordered_set<int> fused_ops({// first lstm
13, 15, 16,
// second lstm
23, 25, 26});
pattern->NewNode([&](Node* x) { return fused_ops.count(x->id()); },
"any_node");
std::unordered_set<Node*> marked_nodes; void BuildPattern(PDPattern* pattern, const std::string& name_scope,
bool with_fc_bias) {
PDNode* x = pattern->NewNode(name_scope, "x")
->assert_is_op_input("mul")
->assert_var_not_persistable();
auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias);
fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse.
patterns::LSTM(pattern, name_scope, fc_out);
// LOG(INFO) << "\n" << pattern->DotString();
}
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
Graph* g) { bool with_fc_bias) {
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
auto* id = subgraph.at(gpd.pattern().RetrieveNode("any_node")); BuildPattern(pattern, name_scope, with_fc_bias);
marked_nodes.insert(id);
};
gpd(graph.get(), handler);
// Create New OpDesc // Create New OpDesc
auto lstm_creator = [&](int lstm, int input, int weight_x, int weight_h, auto lstm_creator = [&](int lstm, int input, int weight_x, int weight_h,
int bias, int hidden, int cell, int xx) { int bias, int hidden, int cell, int xx, int fc_bias) {
#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x); #define GET_NODE(x) auto* x##_n = graph->RetriveNode(x);
GET_NODE(input); GET_NODE(input);
GET_NODE(weight_x); GET_NODE(weight_x);
...@@ -61,61 +61,147 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl( ...@@ -61,61 +61,147 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
SET_IN(WeightX, weight_x); SET_IN(WeightX, weight_x);
SET_IN(WeightH, weight_h); SET_IN(WeightH, weight_h);
SET_IN(Bias, bias); SET_IN(Bias, bias);
#undef GET_NODE
#undef SET_IN #undef SET_IN
if (with_fc_bias) {
// Add FC-bias with LSTM-bias and create a new weight
PADDLE_ENFORCE(scope);
const std::string& new_bias_var = name_scope + "_bias.new";
auto* bias_var = scope->Var(new_bias_var);
PADDLE_ENFORCE(bias_var);
auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>();
auto* lstm_bias_var = scope->FindVar(bias_n->Name());
PADDLE_ENFORCE(lstm_bias_var);
const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>();
bias_tensor->Resize(lstm_bias_tensor.dims());
GET_NODE(fc_bias);
auto* fc_bias_var = scope->FindVar(fc_bias_n->Name());
const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
auto* data = bias_tensor->mutable_data<float>(platform::CPUPlace());
for (int i = 0; i < bias_tensor->numel(); i++) {
data[i] =
fc_bias_tensor.data<float>()[i] + lstm_bias_tensor.data<float>()[i];
}
op_desc.SetInput("Bias", {new_bias_var});
}
#undef GET_NODE
VLOG(4) << "hidden_n: " << hidden_n->Name(); // Create temp variables.
VLOG(4) << "cell: " << cell_n->Name(); scope->Var(name_scope + "/BatchedInput.new")
VLOG(4) << "xx: " << xx_n->Name(); ->GetMutable<framework::LoDTensor>();
scope->Var(name_scope + "/BatchCellPreAct.new")
->GetMutable<framework::LoDTensor>();
scope->Var(name_scope + "/BatchedGate.new")
->GetMutable<framework::LoDTensor>();
op_desc.SetInput("H0", {}); op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {}); op_desc.SetInput("C0", {});
op_desc.SetOutput("Hidden", {hidden_n->Name()}); op_desc.SetOutput("Hidden", {hidden_n->Name()});
op_desc.SetOutput("Cell", {cell_n->Name()}); op_desc.SetOutput("Cell", {cell_n->Name()});
op_desc.SetOutput("XX", {xx_n->Name()}); op_desc.SetOutput("XX", {xx_n->Name()});
op_desc.SetOutput("BatchedGate", {"blstm_0.tmp_2"}); op_desc.SetOutput("BatchedGate", {name_scope + "/BatchedGate.new"});
op_desc.SetOutput("BatchCellPreAct", {"blstm_1.tmp_2"}); op_desc.SetOutput("BatchCellPreAct", {name_scope + "/BatchCellPreAct.new"});
op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"});
op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse")); op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", false); op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes"));
auto* op = graph->CreateOpNode(&op_desc); // TODO(TJ): get from attr
op_desc.SetAttr("use_seq", true);
#define TMP_NAME(x) "at.new.tmp." #x
#define OP_SET_OUT(x) op_desc.SetOutput(#x, {TMP_NAME(x)})
OP_SET_OUT(BatchedCell);
OP_SET_OUT(BatchedHidden);
OP_SET_OUT(ReorderedH0);
OP_SET_OUT(ReorderedC0);
#undef OP_SET_OUT
#define LINK_TO(a, b) \ auto* op = graph->CreateOpNode(&op_desc);
a->outputs.push_back(b); \ PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
b->inputs.push_back(a); auto* scope = graph->Get<Scope*>(kParamScopeAttr);
LINK_TO(input_n, op);
LINK_TO(weight_x_n, op); #define TMP_NEW(x) scope->Var(TMP_NAME(x))->GetMutable<LoDTensor>()
LINK_TO(weight_h_n, op); TMP_NEW(BatchedCell);
LINK_TO(bias_n, op); TMP_NEW(BatchedHidden);
LINK_TO(op, hidden_n); TMP_NEW(ReorderedH0);
#undef LINK_TO TMP_NEW(ReorderedC0);
#undef TMP_NEW
#undef TMP_NAME
IR_NODE_LINK_TO(input_n, op);
IR_NODE_LINK_TO(weight_x_n, op);
IR_NODE_LINK_TO(weight_h_n, op);
IR_NODE_LINK_TO(bias_n, op);
IR_NODE_LINK_TO(op, hidden_n);
return op; return op;
};
int fusion_count{0};
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
#define GET_NODE(name__) \
std::string name__##key = name_scope + "/" + #name__; \
auto* name__##n = pattern->RetrieveNode(name__##key); \
PADDLE_ENFORCE(name__##n); \
PADDLE_ENFORCE(subgraph.count(name__##n)); \
Node* name__##_n = subgraph.at(name__##n); \
int name__ __attribute__((unused)) = name__##_n->id();
GET_NODE(x);
GET_NODE(w);
GET_NODE(mul);
GET_NODE(fc_out);
GET_NODE(Weight);
GET_NODE(lstm);
GET_NODE(Bias);
GET_NODE(Hidden);
GET_NODE(Cell);
if (with_fc_bias) {
GET_NODE(fc_bias);
GET_NODE(elementwise_add);
lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes(
{mul_n, lstm_n, elementwise_add_n});
GraphSafeRemoveNodes(graph, marked_nodes);
} else {
lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes({mul_n, lstm_n});
GraphSafeRemoveNodes(graph, marked_nodes);
}
#undef GET_NODE
++fusion_count;
}; };
lstm_creator(16, 12, 14, 18, 17, 22, 21, 19); gpd(graph, handler);
lstm_creator(26, 12, 24, 28, 27, 32, 31, 29);
// remove all the nodes return fusion_count;
}
for (auto* node : marked_nodes) { std::unique_ptr<ir::Graph> MulLstmFusePass::ApplyImpl(
graph->RemoveNode(const_cast<Node*>(node)); std::unique_ptr<ir::Graph> graph) const {
} FusePassBase::Init(name_scope_, graph.get());
for (auto* node : graph->Nodes()) { int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
for (auto it = node->inputs.begin(); it != node->inputs.end();) { false /*with_fc_bias*/);
if (marked_nodes.count(*it)) {
it = const_cast<Node*>(node)->inputs.erase(it); AddStatis(fusion_count);
} else return graph;
it++; }
}
for (auto it = node->outputs.begin(); it != node->outputs.end();) { std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
if (marked_nodes.count(*it)) { std::unique_ptr<ir::Graph> graph) const {
it = const_cast<Node*>(node)->outputs.erase(it); FusePassBase::Init(name_scope_, graph.get());
} else
it++; int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
} true /*with_fc_bias*/);
}
AddStatis(fusion_count);
return graph; return graph;
} }
...@@ -123,4 +209,5 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl( ...@@ -123,4 +209,5 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass);
REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass); REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass);
...@@ -12,20 +12,36 @@ ...@@ -12,20 +12,36 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
class FCLstmFusePass : public Pass { // The MulLstmFusePass and MulLstmFusePass will fuse to the same FusionLstm op.
// Just FC without bias
class FCLstmFusePass : public FusePassBase {
public: public:
virtual ~FCLstmFusePass() {} virtual ~FCLstmFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"fc_lstm_fuse"};
};
class MulLstmFusePass : public FusePassBase {
public:
virtual ~MulLstmFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"fc_nobias_lstm_fuse"};
}; };
} // namespace ir } // namespace ir
......
...@@ -167,7 +167,6 @@ class Graph { ...@@ -167,7 +167,6 @@ class Graph {
std::map<std::string, std::function<void(void)>> attr_dels_; std::map<std::string, std::function<void(void)>> attr_dels_;
std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_; std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_;
std::unordered_set<ir::Node *> node_set_; std::unordered_set<ir::Node *> node_set_;
int node_count_{0};
}; };
bool IsControlDepVar(const ir::Node &var); bool IsControlDepVar(const ir::Node &var);
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
...@@ -71,7 +72,10 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) { ...@@ -71,7 +72,10 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) {
void GraphPatternDetector::operator()(Graph* graph, void GraphPatternDetector::operator()(Graph* graph,
GraphPatternDetector::handle_t handler) { GraphPatternDetector::handle_t handler) {
if (!MarkPDNodesInGraph(*graph)) return; if (!MarkPDNodesInGraph(*graph)) {
return;
}
auto subgraphs = DetectPatterns(); auto subgraphs = DetectPatterns();
UniquePatterns(&subgraphs); UniquePatterns(&subgraphs);
RemoveOverlappedMatch(&subgraphs); RemoveOverlappedMatch(&subgraphs);
...@@ -81,13 +85,13 @@ void GraphPatternDetector::operator()(Graph* graph, ...@@ -81,13 +85,13 @@ void GraphPatternDetector::operator()(Graph* graph,
LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern"; LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
int id = 0; int id = 0;
for (auto& g : subgraphs) { for (auto& g : subgraphs) {
LOG(INFO) << "optimizing #" << id++ << " subgraph"; VLOG(3) << "optimizing #" << id++ << " subgraph";
handler(g, graph); handler(g, graph);
} }
} }
bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) { bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
VLOG(4) << "mark pdnodes in graph"; VLOG(3) << "mark pdnodes in graph";
if (graph.Nodes().empty()) return false; if (graph.Nodes().empty()) return false;
for (auto& node : GraphTraits::DFS(graph)) { for (auto& node : GraphTraits::DFS(graph)) {
...@@ -106,7 +110,13 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) { ...@@ -106,7 +110,13 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
return false; return false;
} }
} }
for (auto& item : pdnodes2nodes_) {
for (auto& n : item.second) {
GetMarkedNodes(const_cast<Graph*>(&graph)).insert(n);
}
}
VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
return !pdnodes2nodes_.empty(); return !pdnodes2nodes_.empty();
} }
...@@ -272,7 +282,7 @@ void GraphPatternDetector::RemoveOverlappedMatch( ...@@ -272,7 +282,7 @@ void GraphPatternDetector::RemoveOverlappedMatch(
for (const auto& subgraph : *subgraphs) { for (const auto& subgraph : *subgraphs) {
bool valid = true; bool valid = true;
for (auto& item : subgraph) { for (auto& item : subgraph) {
if (node_set.count(item.second)) { if (item.first->IsIntermediate() && node_set.count(item.second)) {
valid = false; valid = false;
break; break;
} }
...@@ -328,22 +338,22 @@ PDNode& PDNode::LinksFrom(const std::vector<PDNode*>& others) { ...@@ -328,22 +338,22 @@ PDNode& PDNode::LinksFrom(const std::vector<PDNode*>& others) {
} }
PDNode* PDNode::assert_is_op() { PDNode* PDNode::assert_is_op() {
asserts_.emplace_back([this](Node* x) { return x && x->IsOp(); }); asserts_.emplace_back([](Node* x) { return x && x->IsOp(); });
return this; return this;
} }
PDNode* PDNode::assert_is_op(const std::string& op_type) { PDNode* PDNode::assert_is_op(const std::string& op_type) {
asserts_.emplace_back([this, op_type](Node* x) { asserts_.emplace_back([op_type](Node* x) {
return x && x->IsOp() && x->Op()->Type() == op_type; return x && x->IsOp() && x->Op()->Type() == op_type;
}); });
return this; return this;
} }
PDNode* PDNode::assert_is_var() { PDNode* PDNode::assert_is_var() {
asserts_.emplace_back([this](Node* x) { return x && x->IsVar(); }); asserts_.emplace_back([](Node* x) { return x && x->IsVar(); });
return this; return this;
} }
PDNode* PDNode::assert_var_not_persistable() { PDNode* PDNode::assert_var_not_persistable() {
assert_is_var(); assert_is_var();
asserts_.emplace_back([this](Node* x) { return !x->Var()->Persistable(); }); asserts_.emplace_back([](Node* x) { return !x->Var()->Persistable(); });
return this; return this;
} }
PDNode* PDNode::assert_is_persistable_var() { PDNode* PDNode::assert_is_persistable_var() {
...@@ -357,7 +367,9 @@ PDNode* PDNode::assert_is_op_nth_input(const std::string& op_type, ...@@ -357,7 +367,9 @@ PDNode* PDNode::assert_is_op_nth_input(const std::string& op_type,
assert_is_op_input(op_type); assert_is_op_input(op_type);
asserts_.emplace_back([=](Node* x) { asserts_.emplace_back([=](Node* x) {
for (auto* op : x->outputs) { for (auto* op : x->outputs) {
if (IsNthInput(x, op, argument, nth)) return true; if (op->IsOp() && op->Op()->Type() == op_type &&
IsNthInput(x, op, argument, nth))
return true;
} }
return false; return false;
}); });
...@@ -368,7 +380,9 @@ PDNode* PDNode::assert_is_op_nth_output(const std::string& op_type, ...@@ -368,7 +380,9 @@ PDNode* PDNode::assert_is_op_nth_output(const std::string& op_type,
assert_is_var(); assert_is_var();
asserts_.emplace_back([=](Node* x) { asserts_.emplace_back([=](Node* x) {
for (auto* op : x->inputs) { for (auto* op : x->inputs) {
if (IsNthOutput(x, op, argument, nth)) return true; if (op->IsOp() && op->Op()->Type() == op_type &&
IsNthOutput(x, op, argument, nth))
return true;
} }
return false; return false;
}); });
...@@ -412,6 +426,12 @@ PDNode* PDNode::assert_is_op_output(const std::string& op_type) { ...@@ -412,6 +426,12 @@ PDNode* PDNode::assert_is_op_output(const std::string& op_type) {
}); });
return this; return this;
} }
PDNode* PDNode::assert_is_op_output(const std::string& op_type,
const std::string& argument) {
assert_is_var();
assert_is_op_nth_output(op_type, argument, 0);
return this;
}
PDNode* PDNode::assert_is_op_input(const std::string& op_type) { PDNode* PDNode::assert_is_op_input(const std::string& op_type) {
assert_is_var(); assert_is_var();
asserts_.emplace_back([=](Node* x) { asserts_.emplace_back([=](Node* x) {
...@@ -424,6 +444,12 @@ PDNode* PDNode::assert_is_op_input(const std::string& op_type) { ...@@ -424,6 +444,12 @@ PDNode* PDNode::assert_is_op_input(const std::string& op_type) {
}); });
return this; return this;
} }
PDNode* PDNode::assert_is_op_input(const std::string& op_type,
const std::string& argument) {
assert_is_var();
assert_is_op_nth_input(op_type, argument, 0);
return this;
}
PDNode* PDNode::assert_op_has_n_inputs(const std::string& op_type, size_t n) { PDNode* PDNode::assert_op_has_n_inputs(const std::string& op_type, size_t n) {
assert_is_op(op_type); assert_is_op(op_type);
asserts_.emplace_back([=](Node* x) { return x->inputs.size() == n; }); asserts_.emplace_back([=](Node* x) { return x->inputs.size() == n; });
...@@ -439,6 +465,130 @@ PDNode* PDNode::assert_more(PDNode::teller_t&& teller) { ...@@ -439,6 +465,130 @@ PDNode* PDNode::assert_more(PDNode::teller_t&& teller) {
return this; return this;
} }
bool VarLinksToOp(Node* node, const std::string& op_type) {
for (auto* out : node->outputs) {
if (out->IsOp() && out->Op()->Type() == op_type) {
return true;
}
}
return false;
}
bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth) {
PADDLE_ENFORCE(var->IsVar());
PADDLE_ENFORCE(op->IsOp());
if (op->Op()->Input(argument).size() <= nth) return false;
return var->Name() == op->Op()->Input(argument)[nth];
}
bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth) {
PADDLE_ENFORCE(var->IsVar());
PADDLE_ENFORCE(op->IsOp());
if (op->Op()->Output(argument).size() <= nth) return false;
return var->Name() == op->Op()->Output(argument)[nth];
}
void GraphSafeRemoveNodes(Graph* graph,
const std::unordered_set<const Node*>& nodes) {
for (auto* node : nodes) {
graph->RemoveNode(const_cast<Node*>(node));
}
for (auto* node : graph->Nodes()) {
for (auto it = node->inputs.begin(); it != node->inputs.end();) {
if (nodes.count(*it)) {
it = const_cast<Node*>(node)->inputs.erase(it);
} else {
it++;
}
}
for (auto it = node->outputs.begin(); it != node->outputs.end();) {
if (nodes.count(*it)) {
it = const_cast<Node*>(node)->outputs.erase(it);
} else {
it++;
}
}
}
}
bool VarLinksFromOp(Node* node, const std::string& op_type) {
for (auto* out : node->inputs) {
if (out->IsOp() && out->Op()->Type() == op_type) {
return true;
}
}
return false;
}
PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope,
PDNode* x, bool with_bias) {
// Create Operators
PDNode* elementwise_add_op{nullptr};
auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul");
if (with_bias) {
elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add")
->assert_is_op("elementwise_add");
}
// Create variables
// w
auto* mul_weight_var = pattern->NewNode(name_scope, "w")
->AsInput()
->assert_is_persistable_var()
->assert_is_op_nth_input("mul", "Y", 0);
PDNode* mul_out_var{nullptr};
if (with_bias) {
// intermediate variable, will be removed in the IR after fuse.
mul_out_var = pattern->NewNode(name_scope, "mul_out")
->AsIntermediate()
->assert_is_only_output_of_op("mul")
->assert_is_op_input("elementwise_add");
}
PDNode *bias{nullptr}, *fc_out{nullptr};
if (with_bias) {
// bias
bias = pattern->NewNode(name_scope, "fc_bias")
->assert_is_op_input("elementwise_add")
->AsInput();
// output
fc_out = pattern->NewNode(name_scope, "fc_out")
->AsOutput()
->assert_is_op_output("elementwise_add");
} else {
fc_out = pattern->NewNode(name_scope, "fc_out")
->AsOutput()
->assert_is_op_output("mul");
}
if (with_bias) {
mul_op->LinksFrom({mul_weight_var, x}).LinksTo({mul_out_var});
elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
} else {
mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out});
}
return fc_out;
}
PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
PDNode* x) {
x->assert_is_op_input("lstm", "Input");
auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm");
#define NEW_NODE(arg__, io__) \
auto* arg__ = pattern->NewNode(name_scope, #arg__) \
->assert_is_op_##io__("lstm", #arg__);
// Currently, the H0 and C0 are optional
// TODO(Superjomn) upgrade the fuse framework to support optional.
// NEW_NODE(H0, input);
// NEW_NODE(C0, input);
NEW_NODE(Weight, input);
NEW_NODE(Bias, input);
NEW_NODE(Hidden, output);
NEW_NODE(Cell, output);
NEW_NODE(BatchGate, output);
NEW_NODE(BatchCellPreAct, output);
lstm_op->LinksFrom({x, Weight, Bias});
lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
return Hidden;
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
#endif #endif
#include <numeric> #include <numeric>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/dot.h"
...@@ -95,7 +98,11 @@ struct PDNode { ...@@ -95,7 +98,11 @@ struct PDNode {
PDNode* assert_var_not_persistable(); PDNode* assert_var_not_persistable();
PDNode* assert_is_persistable_var(); PDNode* assert_is_persistable_var();
PDNode* assert_is_op_output(const std::string& op_type); PDNode* assert_is_op_output(const std::string& op_type);
PDNode* assert_is_op_output(const std::string& op_type,
const std::string& argument);
PDNode* assert_is_op_input(const std::string& op_type); PDNode* assert_is_op_input(const std::string& op_type);
PDNode* assert_is_op_input(const std::string& op_type,
const std::string& argument);
PDNode* assert_is_op_nth_input(const std::string& op_type, PDNode* assert_is_op_nth_input(const std::string& op_type,
const std::string& argument, int nth); const std::string& argument, int nth);
PDNode* assert_is_op_nth_output(const std::string& op_type, PDNode* assert_is_op_nth_output(const std::string& op_type,
...@@ -167,6 +174,9 @@ class PDPattern { ...@@ -167,6 +174,9 @@ class PDPattern {
PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID()); PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID());
PDNode* NewNode(const std::string& name = NewID()); PDNode* NewNode(const std::string& name = NewID());
PDNode* NewNode(const std::string& prefix, const std::string& name) {
return NewNode(prefix + "/" + name);
}
PDNode* RetrieveNode(const std::string& id) const; PDNode* RetrieveNode(const std::string& id) const;
const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; } const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
...@@ -238,6 +248,8 @@ class GraphPatternDetector { ...@@ -238,6 +248,8 @@ class GraphPatternDetector {
void UniquePatterns(std::vector<subgraph_t>* subgraphs); void UniquePatterns(std::vector<subgraph_t>* subgraphs);
// Remove overlapped match subgraphs, when overlapped, keep the previous one. // Remove overlapped match subgraphs, when overlapped, keep the previous one.
// The intermediate PDNodes will be removed, so can't shared by multiple
// patterns.
void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs); void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
// Validate whether the intermediate nodes are linked by external nodes. // Validate whether the intermediate nodes are linked by external nodes.
...@@ -257,64 +269,40 @@ class GraphPatternDetector { ...@@ -257,64 +269,40 @@ class GraphPatternDetector {
// some helper methods. // some helper methods.
// Op's input. // Tell if a var links to an Op
static bool VarLinksToOp(Node* node, const std::string& op_type) { bool VarLinksToOp(Node* node, const std::string& op_type);
for (auto* out : node->outputs) {
if (out->IsOp() && out->Op()->Type() == op_type) {
return true;
}
}
return false;
}
// Op's output. // Tell if an op links to a var
static bool VarLinksFromOp(Node* node, const std::string& op_type) { bool VarLinksFromOp(Node* node, const std::string& op_type);
for (auto* out : node->inputs) {
if (out->IsOp() && out->Op()->Type() == op_type) {
return true;
}
}
return false;
}
// Check whether a var node is a op node's nth input. // Check whether a var node is a op node's nth input.
static bool IsNthInput(Node* var, Node* op, const std::string& argument, bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth);
size_t nth) {
PADDLE_ENFORCE(var->IsVar());
PADDLE_ENFORCE(op->IsOp());
if (op->inputs.size() <= nth) return false;
return var->Name() == op->Op()->Input(argument)[nth];
}
static bool IsNthOutput(Node* var, Node* op, const std::string& argument,
size_t nth) {
PADDLE_ENFORCE(var->IsVar());
PADDLE_ENFORCE(op->IsOp());
if (op->inputs.size() <= nth) return false;
return var->Name() == op->Op()->Output(argument)[nth];
}
static void GraphSafeRemoveNodes(Graph* graph,
const std::unordered_set<const Node*>& nodes) {
for (auto* node : nodes) {
graph->RemoveNode(const_cast<Node*>(node));
}
for (auto* node : graph->Nodes()) { // Tell whether a var node is a op node's nth output.
for (auto it = node->inputs.begin(); it != node->inputs.end();) { bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth);
if (nodes.count(*it)) {
it = const_cast<Node*>(node)->inputs.erase(it); // Graph safely remove some nodes, will automatically clean up the edges.
} else void GraphSafeRemoveNodes(Graph* graph,
it++; const std::unordered_set<const Node*>& nodes);
}
for (auto it = node->outputs.begin(); it != node->outputs.end();) { // Some pre-defined patterns those can be reused in multiple passes.
if (nodes.count(*it)) { namespace patterns {
it = const_cast<Node*>(node)->outputs.erase(it);
} else // FC with bias
it++; // op: mul + elementwise_add
} // named nodes:
} // mul, elementwise_add
} // w, mul_out, bias, fc_out
PDNode* FC(PDPattern* pattern, const std::string& name_scope, PDNode* x,
bool with_bias);
PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);
} // namespace patterns
#define IR_NODE_LINK_TO(a, b) \
a->outputs.push_back(b); \
b->inputs.push_back(a);
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
......
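The header hunk above promotes the former static member helpers to free functions and adds the `IR_NODE_LINK_TO` macro. A minimal hedged sketch of how a fuse pass might combine them is below; the function name `ReplaceWithFusedOp` and the parameters `fused_desc`, `w`, `bias`, `out` and `dead` are illustrative and not part of this patch.

```cpp
// Hypothetical sketch only: wire a freshly created fused op with
// IR_NODE_LINK_TO and drop the replaced nodes with GraphSafeRemoveNodes.
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

namespace paddle {
namespace framework {
namespace ir {

void ReplaceWithFusedOp(Graph* graph, OpDesc* fused_desc, Node* w, Node* bias,
                        Node* out,
                        const std::unordered_set<const Node*>& dead) {
  // Create the fused operator node and connect its inputs/outputs.
  Node* fused_op = graph->CreateOpNode(fused_desc);
  IR_NODE_LINK_TO(w, fused_op);
  IR_NODE_LINK_TO(bias, fused_op);
  IR_NODE_LINK_TO(fused_op, out);
  // Remove the matched-but-now-redundant nodes; edges are cleaned up for us.
  GraphSafeRemoveNodes(graph, dead);
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle
```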
...@@ -140,8 +140,9 @@ TEST(GraphPatternDetecter, MultiSubgraph) { ...@@ -140,8 +140,9 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3"); return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3");
}, },
"OP0"); "OP0");
auto* any_var = x.mutable_pattern()->NewNode( auto* any_var = x.mutable_pattern()
[](Node* node) { return node->IsVar(); }, "VAR"); ->NewNode([](Node* node) { return node->IsVar(); }, "VAR")
->AsIntermediate();
auto* any_op1 = x.mutable_pattern()->NewNode( auto* any_op1 = x.mutable_pattern()->NewNode(
[](Node* node) { return node->IsOp(); }, "OP1"); [](Node* node) { return node->IsOp(); }, "OP1");
......
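The test change above marks the shared variable node as intermediate. A rough, hedged sketch of the same idea in isolation follows; it assumes `PDPattern::AddEdge` is available to the pattern builder (as the existing fuse passes use it), and the node names are illustrative.

```cpp
// Hedged sketch: a var PDNode marked AsIntermediate() is owned by the match
// and may be removed together with the matched subgraph, so it must not be
// shared by another pattern (see RemoveOverlappedMatch above).
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

namespace paddle {
namespace framework {
namespace ir {

void BuildOpVarOpPattern(GraphPatternDetector* detector) {
  PDPattern* pattern = detector->mutable_pattern();
  auto* producer =
      pattern->NewNode([](Node* n) { return n->IsOp(); }, "producer");
  auto* mid_var = pattern->NewNode([](Node* n) { return n->IsVar(); }, "mid_var")
                      ->AsIntermediate();
  auto* consumer =
      pattern->NewNode([](Node* n) { return n->IsOp(); }, "consumer");
  pattern->AddEdge(producer, mid_var);  // assumed API, mirroring existing passes
  pattern->AddEdge(mid_var, consumer);
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle
```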
...@@ -50,20 +50,37 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl( ...@@ -50,20 +50,37 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
Dot dot; Dot dot;
std::vector<Dot::Attr> op_attrs({Dot::Attr("style", "filled"), const std::vector<Dot::Attr> op_attrs({
Dot::Attr("shape", "box"), Dot::Attr("style", "rounded,filled,bold"), //
Dot::Attr("fillcolor", "red")}); Dot::Attr("shape", "box"), //
std::vector<Dot::Attr> var_attrs({Dot::Attr("style", "filled,rounded"), Dot::Attr("color", "#303A3A"), //
// Dot::Attr("shape", "diamond"), Dot::Attr("fontcolor", "#ffffff"), //
Dot::Attr("width", "1.3"), //
Dot::Attr("height", "0.84"), //
Dot::Attr("fontname", "Arial"), //
});
const std::vector<Dot::Attr> arg_attrs({
Dot::Attr("shape", "box"), //
Dot::Attr("style", "rounded,filled,bold"), //
Dot::Attr("fontname", "Arial"), //
Dot::Attr("fillcolor", "#999999"), //
Dot::Attr("color", "#dddddd"), //
});
const std::vector<Dot::Attr> param_attrs({
Dot::Attr("shape", "box"), //
Dot::Attr("style", "rounded,filled,bold"), //
Dot::Attr("fontname", "Arial"), //
Dot::Attr("color", "#148b97"), //
Dot::Attr("fontcolor", "#ffffff"), //
});
const std::vector<Dot::Attr> marked_op_attrs(
{Dot::Attr("style", "rounded,filled,bold"), Dot::Attr("shape", "box"),
Dot::Attr("fillcolor", "yellow")});
const std::vector<Dot::Attr> marked_var_attrs(
{Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"),
Dot::Attr("fillcolor", "yellow")}); Dot::Attr("fillcolor", "yellow")});
std::vector<Dot::Attr> marked_op_attrs({Dot::Attr("style", "filled"),
Dot::Attr("shape", "box"),
Dot::Attr("fillcolor", "lightgray")});
std::vector<Dot::Attr> marked_var_attrs(
{Dot::Attr("style", "filled,rounded"),
// Dot::Attr("shape", "diamond"),
Dot::Attr("fillcolor", "lightgray")});
auto marked_nodes = ConsumeMarkedNodes(graph.get()); auto marked_nodes = ConsumeMarkedNodes(graph.get());
// Create nodes // Create nodes
...@@ -74,9 +91,17 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl( ...@@ -74,9 +91,17 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
marked_nodes.count(n) ? marked_op_attrs : op_attrs; marked_nodes.count(n) ? marked_op_attrs : op_attrs;
dot.AddNode(node_id, attr, node_id); dot.AddNode(node_id, attr, node_id);
} else if (n->IsVar()) { } else if (n->IsVar()) {
decltype(op_attrs) attr = decltype(op_attrs)* attr;
marked_nodes.count(n) ? marked_var_attrs : var_attrs; if (marked_nodes.count(n)) {
dot.AddNode(node_id, attr, node_id); attr = &marked_var_attrs;
} else if (const_cast<Node*>(n)->Var() &&
const_cast<Node*>(n)->Var()->Persistable()) {
attr = &param_attrs;
} else {
attr = &arg_attrs;
}
dot.AddNode(node_id, *attr, node_id);
} }
node2dot[n] = node_id; node2dot[n] = node_id;
} }
......
...@@ -42,6 +42,13 @@ class GraphVizPass : public Pass { ...@@ -42,6 +42,13 @@ class GraphVizPass : public Pass {
marked_nodes_t ConsumeMarkedNodes(Graph* graph) const; marked_nodes_t ConsumeMarkedNodes(Graph* graph) const;
}; };
static GraphVizPass::marked_nodes_t& GetMarkedNodes(Graph* graph) {
if (!graph->Has(kGraphvizMarkedNodeAttr)) {
graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t);
}
return graph->Get<GraphVizPass::marked_nodes_t>(kGraphvizMarkedNodeAttr);
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
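The new `GetMarkedNodes` helper lazily creates the `kGraphvizMarkedNodeAttr` attribute on the graph. A hedged sketch of how a pass could use it so that its touched nodes are rendered with the "marked" attributes by a later `graph_viz_pass` run; the function name below is illustrative.

```cpp
// Hypothetical sketch: record the nodes a pass has touched so the graphviz
// dump highlights them with the marked_op/marked_var attributes.
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"

namespace paddle {
namespace framework {
namespace ir {

void MarkForVisualization(Graph* graph,
                          const std::unordered_set<Node*>& touched) {
  auto& marked = GetMarkedNodes(graph);  // creates the attribute on first use
  for (Node* n : touched) {
    marked.insert(n);
  }
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle
```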
...@@ -13,42 +13,41 @@ ...@@ -13,42 +13,41 @@
// limitations under the License. // limitations under the License.
#include <algorithm> #include <algorithm>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
class InferCleanGraphPass : public Pass { class InferCleanGraphPass : public FusePassBase {
public: public:
virtual ~InferCleanGraphPass() {} virtual ~InferCleanGraphPass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init("original_graph", graph.get());
PADDLE_ENFORCE(graph.get()); PADDLE_ENFORCE(graph.get());
auto is_valid_node = [](Node* x) { auto is_valid_node = [](Node* x) {
return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
}; };
std::unordered_set<Node*> invalid_nodes; std::unordered_set<const Node*> invalid_nodes;
int valid_op = 0;
for (auto* node : graph->Nodes()) { for (auto* node : graph->Nodes()) {
if (is_valid_node(node)) { if (is_valid_node(node)) {
invalid_nodes.insert(node); invalid_nodes.insert(node);
} else if (node->IsOp()) {
// Collect all the operators to help track the number of operators.
++valid_op;
} }
} }
// remove nodes from the graph. GraphSafeRemoveNodes(graph.get(), invalid_nodes);
for (auto* node : invalid_nodes) {
graph->RemoveNode(node);
}
// clean edges. AddStatis(valid_op);
for (auto* node : graph->Nodes()) {
CleanEdges(&node->inputs, invalid_nodes);
CleanEdges(&node->outputs, invalid_nodes);
}
return graph; return graph;
} }
......
...@@ -219,16 +219,13 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl( ...@@ -219,16 +219,13 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
op_desc.SetAttr("fc_activation", act->Op()->Type()); op_desc.SetAttr("fc_activation", act->Op()->Type());
auto* op_node = graph->CreateOpNode(&op_desc); auto* op_node = graph->CreateOpNode(&op_desc);
// Add links // Add links
#define NODE_LINKS(a, b) \ IR_NODE_LINK_TO(fc_w, op_node);
a->outputs.push_back(b); \ IR_NODE_LINK_TO(fc_bias, op_node);
b->inputs.push_back(a); IR_NODE_LINK_TO(concat_in0, op_node);
NODE_LINKS(fc_w, op_node); IR_NODE_LINK_TO(sequence_expand0_in, op_node);
NODE_LINKS(fc_bias, op_node); IR_NODE_LINK_TO(sequence_expand1_in, op_node);
NODE_LINKS(concat_in0, op_node); IR_NODE_LINK_TO(op_node, fc_out);
NODE_LINKS(sequence_expand0_in, op_node);
NODE_LINKS(sequence_expand1_in, op_node);
NODE_LINKS(op_node, fc_out);
// Clean nodes. // Clean nodes.
std::unordered_set<const Node*> marked_nodes; std::unordered_set<const Node*> marked_nodes;
...@@ -241,7 +238,6 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl( ...@@ -241,7 +238,6 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
marked_nodes.erase(sequence_expand0_in); marked_nodes.erase(sequence_expand0_in);
marked_nodes.erase(sequence_expand1_in); marked_nodes.erase(sequence_expand1_in);
marked_nodes.erase(fc_out); marked_nodes.erase(fc_out);
GraphSafeRemoveNodes(graph, marked_nodes); GraphSafeRemoveNodes(graph, marked_nodes);
}); });
......
...@@ -10,19 +10,19 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor) ...@@ -10,19 +10,19 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal? # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api cc_library(paddle_fluid_api
SRCS io.cc SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} graph_to_program_pass) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
# paddle_fluid_origin exclude inference api interface # paddle_fluid_origin exclude inference api interface
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
if(NOT APPLE) #if(APPLE)
add_subdirectory(api) add_subdirectory(api)
endif() #endif()
# Create static library # Create static library
cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api) cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor)
if(NOT APPLE) if(NOT APPLE)
# TODO(liuyiqu): Temporarily disable the link flag because it is not supported on Mac. # TODO(liuyiqu): Temporarily disable the link flag because it is not supported on Mac.
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
...@@ -32,6 +32,7 @@ endif() ...@@ -32,6 +32,7 @@ endif()
# Create shared library # Create shared library
cc_library(paddle_fluid_shared SHARED cc_library(paddle_fluid_shared SHARED
SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
DEPS ${fluid_modules} paddle_fluid_api) DEPS ${fluid_modules} paddle_fluid_api)
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
......
...@@ -25,17 +25,16 @@ function (inference_analysis_test TARGET) ...@@ -25,17 +25,16 @@ function (inference_analysis_test TARGET)
if(WITH_TESTING) if(WITH_TESTING)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS EXTRA_DEPS) set(multiValueArgs SRCS ARGS EXTRA_DEPS)
cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(mem_opt "") set(mem_opt "")
if(WITH_GPU) if(WITH_GPU)
set(mem_opt "--fraction_of_gpu_memory_to_use=0.5") set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
endif() endif()
cc_test(${TARGET} cc_test(${TARGET}
SRCS "${analysis_test_SRCS}" SRCS "${analysis_test_SRCS}"
DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS} DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt}) ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
endif(WITH_TESTING) endif(WITH_TESTING)
endfunction(inference_analysis_test) endfunction(inference_analysis_test)
...@@ -51,32 +50,19 @@ endfunction(inference_download_and_uncompress) ...@@ -51,32 +50,19 @@ endfunction(inference_download_and_uncompress)
set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz") set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz")
set(DITU_RNN_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz") set(DITU_RNN_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz")
set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ditu_rnn" CACHE PATH "Ditu RNN model and data root." FORCE) set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ditu_rnn" CACHE PATH "Ditu RNN model and data root." FORCE)
if (NOT EXISTS ${DITU_INSTALL_DIR}) if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING)
inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz") inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz")
inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz") inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz")
endif() endif()
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
analysis_predictor ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
# ir
fc_fuse_pass
fc_lstm_fuse_pass
seq_concat_fc_fuse_pass
graph_viz_pass
infer_clean_graph_pass
graph_pattern_detector
infer_clean_graph_pass
attention_lstm_fuse_pass
paddle_inference_api
pass
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model
--infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
--infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt) --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc EXTRA_DEPS paddle_inference_api) inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc EXTRA_DEPS paddle_fluid) inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
...@@ -88,13 +74,37 @@ inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) ...@@ -88,13 +74,37 @@ inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE) set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE)
if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR}) if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz") inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz")
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz") inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
endif() endif()
inference_analysis_test(test_chinese_ner SRCS chinese_ner_tester.cc inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api EXTRA_DEPS paddle_inference_api paddle_fluid_api
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
--infer_model=${CHINESE_NER_INSTALL_DIR}/model
--infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE)
if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz")
inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz")
endif()
inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api
ARGS --infer_model=${LAC_INSTALL_DIR}/model
--infer_data=${LAC_INSTALL_DIR}/data.txt)
set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE)
if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz")
endif()
inference_analysis_test(test_text_classification SRCS test_text_classification.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta)
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include <string> #include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
...@@ -41,20 +42,16 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -41,20 +42,16 @@ class DfgPassManagerImpl final : public DfgPassManager {
public: public:
DfgPassManagerImpl() { DfgPassManagerImpl() {
// TODO(Superjomn) set the key with pass reprs. // TODO(Superjomn) set the key with pass reprs.
LOG(INFO) if (!FLAGS_IA_enable_ir) {
<< "-----------------------------------------------------------------";
if (FLAGS_IA_enable_ir) {
AddPass("fluid-to-ir-pass", new FluidToIrPass);
} else {
AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
} else {
AddPass("fluid-to-ir-pass", new FluidToIrPass);
} }
TryAddTensorRtPass(); TryAddTensorRtPass();
AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
if (!FLAGS_IA_output_storage_path.empty()) { if (!FLAGS_IA_output_storage_path.empty()) {
AddPass("model-store-pass", new ModelStorePass); AddPass("model-store-pass", new ModelStorePass);
} }
LOG(INFO)
<< "-----------------------------------------------------------------";
} }
std::string repr() const override { return "dfg-pass-manager"; } std::string repr() const override { return "dfg-pass-manager"; }
...@@ -101,18 +98,15 @@ class DfgPassManagerImpl final : public DfgPassManager { ...@@ -101,18 +98,15 @@ class DfgPassManagerImpl final : public DfgPassManager {
Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
void Analyzer::Run(Argument* argument) { void Analyzer::Run(Argument* argument) {
// Ugly support fluid-to-ir-pass std::vector<std::string> passes;
argument->Set(kFluidToIrPassesAttr, for (auto& pass : all_ir_passes_) {
new std::vector<std::string>({ if (!disabled_ir_passes_.count(pass)) {
// Manual update the passes here. passes.push_back(pass);
"graph_viz_pass", // passes.push_back("graph_viz_pass"); // add graphviz for debug.
"infer_clean_graph_pass", "graph_viz_pass", // }
"attention_lstm_fuse_pass", "graph_viz_pass", // }
"fc_lstm_fuse_pass", "graph_viz_pass", // passes.push_back("graph_viz_pass");
"seq_concat_fc_fuse_pass", "graph_viz_pass", // argument->Set(kFluidToIrPassesAttr, new std::vector<std::string>(passes));
"fc_fuse_pass", "graph_viz_pass" //
}));
for (auto& x : data_) { for (auto& x : data_) {
PADDLE_ENFORCE(x->Initialize(argument)); PADDLE_ENFORCE(x->Initialize(argument));
...@@ -121,6 +115,11 @@ void Analyzer::Run(Argument* argument) { ...@@ -121,6 +115,11 @@ void Analyzer::Run(Argument* argument) {
} }
} }
Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) {
disabled_ir_passes_.insert(passes.begin(), passes.end());
return *this;
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -36,16 +36,10 @@ limitations under the License. */ ...@@ -36,16 +36,10 @@ limitations under the License. */
*/ */
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include "paddle/fluid/inference/analysis/flags.h"
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/pass_manager.h"
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
DECLARE_string(IA_graphviz_log_root);
DECLARE_string(IA_output_storage_path);
DECLARE_bool(IA_enable_ir);
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
...@@ -57,7 +51,26 @@ class Analyzer : public OrderedRegistry<PassManager> { ...@@ -57,7 +51,26 @@ class Analyzer : public OrderedRegistry<PassManager> {
void Run(Argument* argument); void Run(Argument* argument);
Analyzer& DisableIrPasses(const std::vector<std::string>& passes);
DISABLE_COPY_AND_ASSIGN(Analyzer); DISABLE_COPY_AND_ASSIGN(Analyzer);
private:
// All available IR passes.
// The larger fusions come first, so that the smaller operators are merged
// into a larger fused op whenever possible; the smaller fusions will not
// break the pattern of the larger ones.
const std::vector<std::string> all_ir_passes_{{
// Manually update the passes here.
"infer_clean_graph_pass", //
"attention_lstm_fuse_pass", //
"fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
}};
std::unordered_set<std::string> disabled_ir_passes_;
}; };
} // namespace analysis } // namespace analysis
......
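Under the new interface the default pipeline is the `all_ir_passes_` list above, and callers opt passes out rather than in. A minimal hedged usage sketch (the surrounding setup of the `Argument` is elided, and the function name is illustrative):

```cpp
// Hedged sketch: run analysis with the default IR pipeline, but skip the
// attention-LSTM fusion.
#include "paddle/fluid/inference/analysis/analyzer.h"

void AnalyzeWithoutAttentionFuse(
    paddle::inference::analysis::Argument* argument) {
  paddle::inference::analysis::Analyzer analyzer;
  analyzer.DisableIrPasses({"attention_lstm_fuse_pass"}).Run(argument);
}
```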
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path for LAC");
DEFINE_string(infer_data, "", "data file for LAC");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(burning, 0, "Burning before repeat.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
namespace paddle {
namespace inference {
namespace analysis {
struct DataRecord {
std::vector<int64_t> data;
std::vector<size_t> lod;
// for dataset and nextbatch
size_t batch_iter{0};
std::vector<std::vector<size_t>> batched_lods;
std::vector<std::vector<int64_t>> batched_datas;
std::vector<std::vector<int64_t>> datasets;
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1) {
Load(path);
Prepare(batch_size);
batch_iter = 0;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
datasets.resize(0);
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, ';', &data);
std::vector<int64_t> words_ids;
split_to_int64(data[1], ' ', &words_ids);
datasets.emplace_back(words_ids);
}
}
void Prepare(int bs) {
if (bs == 1) {
batched_datas = datasets;
for (auto one_sentence : datasets) {
batched_lods.push_back({0, one_sentence.size()});
}
} else {
std::vector<int64_t> one_batch;
std::vector<size_t> lod{0};
int bs_id = 0;
for (auto one_sentence : datasets) {
bs_id++;
one_batch.insert(one_batch.end(), one_sentence.begin(),
one_sentence.end());
lod.push_back(lod.back() + one_sentence.size());
if (bs_id == bs) {
bs_id = 0;
batched_datas.push_back(one_batch);
batched_lods.push_back(lod);
one_batch.clear();
one_batch.resize(0);
lod.clear();
lod.resize(0);
lod.push_back(0);
}
}
if (one_batch.size() != 0) {
batched_datas.push_back(one_batch);
batched_lods.push_back(lod);
}
}
}
DataRecord NextBatch() {
DataRecord data;
data.data = batched_datas[batch_iter];
data.lod = batched_lods[batch_iter];
batch_iter++;
if (batch_iter >= batched_datas.size()) {
batch_iter = 0;
}
return data;
}
};
void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
auto one_batch = data->NextBatch();
PaddleTensor input_tensor;
input_tensor.name = "word";
input_tensor.shape.assign({static_cast<int>(one_batch.data.size()), 1});
input_tensor.lod.assign({one_batch.lod});
input_tensor.dtype = PaddleDType::INT64;
TensorAssignData<int64_t>(&input_tensor, {one_batch.data});
PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1));
input_slots->assign({input_tensor});
}
static void PrintTime(const double latency, const int bs, const int repeat) {
LOG(INFO) << "===========profile result===========";
LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat
<< ", avg latency: " << latency / repeat << "ms";
LOG(INFO) << "=====================================";
}
void BenchAllData(const std::string &model_path, const std::string &data_file,
const int batch_size, const int repeat) {
NativeConfig config;
config.model_dir = model_path;
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size);
auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
GetOneBatch(&input_slots, &data, batch_size);
for (int i = 0; i < FLAGS_burning; i++) {
predictor->Run(input_slots, &outputs_slots);
}
Timer timer;
double sum = 0;
for (int i = 0; i < repeat; i++) {
for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
GetOneBatch(&input_slots, &data, batch_size);
timer.tic();
predictor->Run(input_slots, &outputs_slots);
sum += timer.toc();
}
}
PrintTime(sum, batch_size, repeat);
}
const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
void TestLACPrediction(const std::string &model_path,
const std::string &data_file, const int batch_size,
const int repeat, bool test_all_data) {
if (test_all_data) {
BenchAllData(model_path, data_file, batch_size, repeat);
return;
}
NativeConfig config;
config.model_dir = model_path;
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size);
GetOneBatch(&input_slots, &data, batch_size);
auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
for (int i = 0; i < FLAGS_burning; i++) {
predictor->Run(input_slots, &outputs_slots);
}
Timer timer;
timer.tic();
for (int i = 0; i < repeat; i++) {
predictor->Run(input_slots, &outputs_slots);
}
PrintTime(timer.toc(), batch_size, repeat);
EXPECT_EQ(outputs_slots.size(), 1UL);
auto &out = outputs_slots[0];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
PADDLE_ENFORCE_GT(size, 0);
EXPECT_GE(size, batch1_size);
int64_t *pdata = static_cast<int64_t *>(out.data.data());
for (size_t i = 0; i < batch1_size; ++i) {
EXPECT_EQ(pdata[i], lac_ref_data[i]);
}
}
TEST(Analyzer_LAC, native) {
LOG(INFO) << "LAC with native";
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
FLAGS_repeat, FLAGS_test_all_data);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
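The `Prepare` routine above concatenates up to `batch_size` sentences into one flat buffer and keeps their boundaries as cumulative LoD offsets. A small standalone sketch of that bookkeeping, with no Paddle dependencies and purely illustrative values, to make the offsets concrete:

```cpp
// Standalone illustration of the LoD bookkeeping in DataRecord::Prepare:
// sentences of lengths 3, 2 and 4 flatten into one buffer of 9 ids, and the
// LoD records the cumulative offsets {0, 3, 5, 9}.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<std::vector<int64_t>> sentences = {
      {11, 12, 13}, {21, 22}, {31, 32, 33, 34}};
  std::vector<int64_t> flat;
  std::vector<size_t> lod{0};
  for (const auto& s : sentences) {
    flat.insert(flat.end(), s.begin(), s.end());
    lod.push_back(lod.back() + s.size());
  }
  std::printf("flattened ids: %zu\n", flat.size());        // prints 9
  for (size_t offset : lod) std::printf("%zu ", offset);   // prints 0 3 5 9
  std::printf("\n");
  return 0;
}
```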
...@@ -12,10 +12,10 @@ ...@@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <google/protobuf/text_format.h> #include <google/protobuf/text_format.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
......
...@@ -16,25 +16,27 @@ ...@@ -16,25 +16,27 @@
#include <google/protobuf/text_format.h> #include <google/protobuf/text_format.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN"); DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN"); DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
DEFINE_int32(batch_size, 10, "batch size."); DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times."); DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
using namespace framework; using namespace framework; // NOLINT
TEST(Analyzer, analysis_without_tensorrt) { TEST(Analyzer, analysis_without_tensorrt) {
FLAGS_IA_enable_tensorrt_subgraph_engine = false; FLAGS_IA_enable_tensorrt_subgraph_engine = false;
...@@ -219,39 +221,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -219,39 +221,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
os << " - type: ";
switch (tensor.dtype) {
case PaddleDType::FLOAT32:
os << "float32";
break;
case PaddleDType::INT64:
os << "int64";
break;
default:
os << "unset";
}
os << '\n';
os << " - shape: " << to_string(tensor.shape) << '\n';
os << " - lod: ";
for (auto &l : tensor.lod) {
os << to_string(l) << "; ";
}
os << "\n";
os << " - data: ";
int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
[](int a, int b) { return a * b; });
for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " ";
}
os << '\n';
return os.str();
}
} // namespace } // namespace
const float ditu_rnn_target_data[] = { const float ditu_rnn_target_data[] = {
...@@ -265,57 +234,97 @@ const float ditu_rnn_target_data[] = { ...@@ -265,57 +234,97 @@ const float ditu_rnn_target_data[] = {
10.7286, 12.0595, 10.6672, 0, 0, 0, 0, 0, 10.7286, 12.0595, 10.6672, 0, 0, 0, 0, 0,
93.5771, 3.84641, 0, 0, 0, 0, 0, 0, 93.5771, 3.84641, 0, 0, 0, 0, 0, 0,
169.426, 0, 0, 0, 0, 0, 0, 0}; 169.426, 0, 0, 0, 0, 0, 0, 0};
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &base_outputs) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &base_out = base_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
1, [](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_data[i], 1e-3);
}
}
}
// Test with a really complicated model. // Test with a really complicated model.
void TestDituRNNPrediction(const std::string &model_path, void TestDituRNNPrediction(bool use_analysis, bool activate_ir,
const std::string &data_path, int batch_size, int num_threads) {
bool use_analysis, bool activate_ir, AnalysisConfig config;
int num_times = 1) {
NativeConfig config;
config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__"; config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
config.param_file = FLAGS_infer_ditu_rnn_model + "/param"; config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
config.use_gpu = false; config.use_gpu = false;
config.device = 0; config.device = 0;
config.specify_input_name = true; config.specify_input_name = true;
config.enable_ir_optim = activate_ir;
PADDLE_ENFORCE(config.ir_mode ==
AnalysisConfig::IrPassMode::kExclude); // default
config.ir_passes.clear(); // Do not exclude any pass.
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto base_predictor = auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
auto predictor = auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(config); CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots; std::vector<PaddleTensor> input_slots;
DataRecord data(data_path, batch_size); DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
// Prepare inputs. // Prepare inputs.
PrepareInputs(&input_slots, &data, batch_size); PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs, base_outputs; std::vector<PaddleTensor> outputs, base_outputs;
base_predictor->Run(input_slots, &base_outputs); base_predictor->Run(input_slots, &base_outputs);
LOG(INFO) << "===========profile result===========";
if (num_threads == 1) {
// Prepare inputs.
Timer timer; Timer timer;
timer.tic(); timer.tic();
for (int i = 0; i < num_times; i++) { for (int i = 0; i < num_times; i++) {
predictor->Run(input_slots, &outputs); predictor->Run(input_slots, &outputs);
} }
LOG(INFO) << "===========profile result==========="; PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times CompareResult(outputs, base_outputs);
<< ", latency: " << timer.toc() / num_times << "ms"; } else {
LOG(INFO) << "====================================="; std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
PADDLE_ENFORCE_GT(outputs.size(), 0); // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelized
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); // because AttentionLSTM's hard-coded node ids would be damaged.
for (size_t i = 0; i < outputs.size(); i++) { for (int tid = 0; tid < num_threads; ++tid) {
auto &out = outputs[i]; predictors.emplace_back(
auto &base_out = base_outputs[i]; CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, config));
[](int a, int b) { return a * b; });
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
1, [](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_data[i], 1e-3);
} }
for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() {
// Each thread should have local input_slots and outputs.
std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs;
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictors[tid]->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, num_threads, tid,
timer.toc() / num_times);
CompareResult(outputs, base_outputs);
});
}
for (int i = 0; i < num_threads; ++i) {
threads[i].join();
} }
}
LOG(INFO) << "=====================================";
if (use_analysis && activate_ir) { if (use_analysis && activate_ir) {
AnalysisPredictor *analysis_predictor = AnalysisPredictor *analysis_predictor =
...@@ -327,39 +336,45 @@ void TestDituRNNPrediction(const std::string &model_path, ...@@ -327,39 +336,45 @@ void TestDituRNNPrediction(const std::string &model_path,
LOG(INFO) << "fused " << item.first << " " << item.second; LOG(INFO) << "fused " << item.first << " " << item.second;
} }
ASSERT_TRUE(fuse_statis.count("fc")); int num_ops = 0;
EXPECT_EQ(fuse_statis.at("fc"), 1); for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
} }
} }
LOG(INFO) << "has num ops: " << num_ops;
// Directly infer with the original model. ASSERT_TRUE(fuse_statis.count("fc_fuse"));
TEST(Analyzer, DituRNN_without_analysis) { EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
FLAGS_batch_size, false, false, FLAGS_repeat); EXPECT_EQ(num_ops,
13); // After graph optimization, only 13 operators exists.
}
} }
// Inference with the original model with the analysis turned on, the analysis // Inference with analysis and IR, easy for profiling independently.
// module will transform the program to a data flow graph. TEST(Analyzer, DituRNN) {
TEST(Analyzer, DituRNN_with_analysis) { TestDituRNNPrediction(true, true, FLAGS_num_threads);
LOG(INFO) << "ditu rnn with analysis";
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
FLAGS_batch_size, true, false, FLAGS_repeat);
} }
// Inference with analysis and IR. The IR module will fuse some large kernels. // Other DituRNN unit tests, covering different combinations of use_analysis,
TEST(Analyzer, DituRNN_with_analysis_with_IR) { // activate_ir and multi-threading.
LOG(INFO) << "ditu rnn with analysis and IR fuse"; TEST(Analyzer, DituRNN_tests) {
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data, int num_threads[2] = {1, 4};
FLAGS_batch_size, true, true, FLAGS_repeat); for (auto i : num_threads) {
// Directly infer with the original model.
TestDituRNNPrediction(false, false, i);
// Inference with the original model with analysis turned on; the analysis
// module will transform the program to a data flow graph.
TestDituRNNPrediction(true, false, i);
// Inference with analysis and IR. The IR module will fuse some large
// kernels.
TestDituRNNPrediction(true, true, i);
}
} }
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_PASS(fc_fuse_pass);
USE_PASS(seq_concat_fc_fuse_pass);
USE_PASS(fc_lstm_fuse_pass);
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
USE_PASS(attention_lstm_fuse_pass);
...@@ -67,7 +67,7 @@ struct Argument { ...@@ -67,7 +67,7 @@ struct Argument {
PADDLE_ENFORCE(!attrs_.count(key), "Duplicate set Argument's attr [%s]", PADDLE_ENFORCE(!attrs_.count(key), "Duplicate set Argument's attr [%s]",
key); key);
attrs_[key] = data; attrs_[key] = data;
attr_deleters_[key] = [data, key, this]() { attr_deleters_[key] = [data, key]() {
VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
VLOG(3) << "argument delete attr: " << key; VLOG(3) << "argument delete attr: " << key;
delete data; delete data;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
DECLARE_string(IA_graphviz_log_root);
DECLARE_string(IA_output_storage_path);
DECLARE_bool(IA_enable_ir);
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/flags.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/pass.h"
...@@ -85,9 +86,11 @@ class FluidToIrPass final : public DataFlowGraphPass { ...@@ -85,9 +86,11 @@ class FluidToIrPass final : public DataFlowGraphPass {
new Scope *(&argument_->Get<Scope>(ir::kParamScopeAttr))); new Scope *(&argument_->Get<Scope>(ir::kParamScopeAttr)));
} }
if (FLAGS_IA_enable_ir) {
const auto &ir_passes_to_apply = const auto &ir_passes_to_apply =
argument_->Get<std::vector<std::string>>(kFluidToIrPassesAttr); argument_->Get<std::vector<std::string>>(kFluidToIrPassesAttr);
ir_passes.Apply(ir_passes_to_apply); ir_passes.Apply(ir_passes_to_apply);
}
PADDLE_ENFORCE(argument_->main_dfg.get()); PADDLE_ENFORCE(argument_->main_dfg.get());
argument_->main_dfg->Build(ir_passes.graph()); argument_->main_dfg->Build(ir_passes.graph());
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -33,10 +34,3 @@ TEST(FluidToIrPass, Test) { ...@@ -33,10 +34,3 @@ TEST(FluidToIrPass, Test) {
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
USE_PASS(attention_lstm_fuse_pass);
USE_PASS(fc_lstm_fuse_pass);
USE_PASS(seq_concat_fc_fuse_pass);
USE_PASS(fc_fuse_pass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h"
DEFINE_string(infer_model, "", "Directory of the inference model.");
DEFINE_string(infer_data, "", "Path of the dataset.");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "How many times to repeat run.");
namespace paddle {
template <typename T>
std::string to_string(const std::vector<T> &vec) {
std::stringstream ss;
for (const auto &c : vec) {
ss << c << " ";
}
return ss.str();
}
void PrintTime(const double latency, const int bs, const int repeat) {
LOG(INFO) << "===========profile result===========";
LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat
<< ", avg latency: " << latency / repeat << "ms";
LOG(INFO) << "=====================================";
}
void Main(int batch_size) {
// Three sequence inputs.
std::vector<PaddleTensor> input_slots(1);
// one batch starts
// data --
int64_t data0[] = {0, 1, 2};
for (auto &input : input_slots) {
input.data.Reset(data0, sizeof(data0));
input.shape = std::vector<int>({3, 1});
// dtype --
input.dtype = PaddleDType::INT64;
// LoD --
input.lod = std::vector<std::vector<size_t>>({{0, 3}});
}
// shape --
// Create Predictor --
AnalysisConfig config;
config.model_dir = FLAGS_infer_model;
config.use_gpu = false;
config.enable_ir_optim = true;
config.ir_passes.push_back("fc_lstm_fuse_pass");
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
inference::Timer timer;
double sum = 0;
std::vector<PaddleTensor> output_slots;
for (int i = 0; i < FLAGS_repeat; i++) {
timer.tic();
CHECK(predictor->Run(input_slots, &output_slots));
sum += timer.toc();
}
PrintTime(sum, batch_size, FLAGS_repeat);
// Get output
LOG(INFO) << "get outputs " << output_slots.size();
for (auto &output : output_slots) {
LOG(INFO) << "output.shape: " << to_string(output.shape);
// no lod ?
CHECK_EQ(output.lod.size(), 0UL);
LOG(INFO) << "output.dtype: " << output.dtype;
std::stringstream ss;
for (int i = 0; i < 5; i++) {
ss << static_cast<float *>(output.data.data())[i] << " ";
}
LOG(INFO) << "output.data summary: " << ss.str();
// one batch ends
}
}
TEST(text_classification, basic) { Main(FLAGS_batch_size); }
} // namespace paddle
USE_PASS(fc_fuse_pass);
USE_PASS(seq_concat_fc_fuse_pass);
USE_PASS(fc_lstm_fuse_pass);
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
USE_PASS(attention_lstm_fuse_pass);
...@@ -18,10 +18,7 @@ if(APPLE) ...@@ -18,10 +18,7 @@ if(APPLE)
endif(APPLE) endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB})
graph_viz_pass fc_fuse_pass
infer_clean_graph_pass
)
if(WITH_GPU AND TENSORRT_FOUND) if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
...@@ -47,7 +44,19 @@ function(inference_api_test TARGET_NAME) ...@@ -47,7 +44,19 @@ function(inference_api_test TARGET_NAME)
endfunction(inference_api_test) endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api
analysis
ir_pass_manager
pass
fc_fuse_pass
fc_lstm_fuse_pass
seq_concat_fc_fuse_pass
graph_viz_pass
infer_clean_graph_pass
graph_pattern_detector
infer_clean_graph_pass
attention_lstm_fuse_pass
)
cc_test(test_paddle_inference_api cc_test(test_paddle_inference_api
SRCS api_tester.cc SRCS api_tester.cc
......
...@@ -14,10 +14,13 @@ ...@@ -14,10 +14,13 @@
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include <memory> #include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
namespace paddle { namespace paddle {
...@@ -27,10 +30,11 @@ bool AnalysisPredictor::Init( ...@@ -27,10 +30,11 @@ bool AnalysisPredictor::Init(
VLOG(3) << "Predictor::init()"; VLOG(3) << "Predictor::init()";
if (config_.use_gpu) { if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device);
LOG(WARNING) << "ir optimize only supports CPU currently";
config_.enable_ir_optim = false;
} else { } else {
place_ = paddle::platform::CPUPlace(); place_ = paddle::platform::CPUPlace();
} }
PADDLE_ENFORCE(!parent_scope);
if (parent_scope) { if (parent_scope) {
scope_ = parent_scope; scope_ = parent_scope;
sub_scope_ = &(parent_scope->NewScope()); sub_scope_ = &(parent_scope->NewScope());
...@@ -72,7 +76,7 @@ bool AnalysisPredictor::Init( ...@@ -72,7 +76,7 @@ bool AnalysisPredictor::Init(
void AnalysisPredictor::OptimizeInferenceProgram() { void AnalysisPredictor::OptimizeInferenceProgram() {
LOG(INFO) << "optimize begin"; LOG(INFO) << "optimize begin";
FLAGS_IA_enable_ir = true; FLAGS_IA_enable_ir = config_.enable_ir_optim;
FLAGS_IA_enable_tensorrt_subgraph_engine = false; FLAGS_IA_enable_tensorrt_subgraph_engine = false;
FLAGS_IA_output_storage_path = ""; // Don't output the model. FLAGS_IA_output_storage_path = ""; // Don't output the model.
// Analyze inference_program // Analyze inference_program
...@@ -89,24 +93,26 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -89,24 +93,26 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
} }
argument_.origin_program_desc.reset( argument_.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto())); new ProgramDesc(*inference_program_->Proto()));
Analyzer().Run(&argument_); PADDLE_ENFORCE(config_.ir_mode == AnalysisConfig::IrPassMode::kExclude,
"Only kExclude is supported yet.");
Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_);
CHECK(argument_.transformed_program_desc); CHECK(argument_.transformed_program_desc);
VLOG(5) << "to prepare executor"; VLOG(5) << "to prepare executor";
// LOG(INFO) << "transformed_parogram_desc " <<
// argument.transformed_program_desc->DebugString();
inference_program_.reset( inference_program_.reset(
new framework::ProgramDesc(*argument_.transformed_program_desc)); new framework::ProgramDesc(*argument_.transformed_program_desc));
PADDLE_ENFORCE(argument_.Has(framework::ir::kParamScopeAttr)); if (argument_.Has(framework::ir::kParamScopeAttr)) {
// Update scope. // Update scope.
scope_.reset( scope_.reset(
argument_.Release<framework::Scope>(framework::ir::kParamScopeAttr)); argument_.Release<framework::Scope>(framework::ir::kParamScopeAttr));
LOG(INFO) << "optimize end =="; }
LOG(INFO) << "== optimize end ==";
} }
template <> template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) { AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config) {
VLOG(3) << "create NativePredictor"; VLOG(3) << "create AnalysisConfig";
if (config.use_gpu) { if (config.use_gpu) {
// 1. GPU memory // 1. GPU memory
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
...@@ -133,7 +139,3 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -133,7 +139,3 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
} }
} // namespace paddle } // namespace paddle
USE_PASS(fc_fuse_pass);
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
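With `ir_mode` fixed to `AnalysisConfig::IrPassMode::kExclude` here, the names stored in `AnalysisConfig::ir_passes` are the passes to drop from the pipeline (they are forwarded to `Analyzer::DisableIrPasses`). A hedged configuration sketch; the model path is a placeholder and the helper function name is illustrative.

```cpp
// Hedged sketch: build an analysis predictor that runs the default IR passes
// except fc_lstm_fuse_pass. Only kExclude mode is supported by this patch.
#include <memory>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

std::unique_ptr<paddle::PaddlePredictor> MakePredictor() {
  paddle::AnalysisConfig config;
  config.model_dir = "/path/to/model";  // placeholder path
  config.use_gpu = false;
  config.enable_ir_optim = true;
  config.ir_passes.push_back("fc_lstm_fuse_pass");  // excluded, not added
  return paddle::CreatePaddlePredictor<paddle::AnalysisConfig,
                                       paddle::PaddleEngineKind::kAnalysis>(
      config);
}
```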
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
...@@ -28,7 +30,7 @@ using framework::proto::ProgramDesc; ...@@ -28,7 +30,7 @@ using framework::proto::ProgramDesc;
*/ */
class AnalysisPredictor : public NativePaddlePredictor { class AnalysisPredictor : public NativePaddlePredictor {
public: public:
explicit AnalysisPredictor(const NativeConfig& config) explicit AnalysisPredictor(const AnalysisConfig& config)
: NativePaddlePredictor(config), config_(config) {} : NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope); bool Init(const std::shared_ptr<framework::Scope>& parent_scope);
...@@ -44,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor { ...@@ -44,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor {
Argument& analysis_argument() { return argument_; } Argument& analysis_argument() { return argument_; }
private: private:
NativeConfig config_; AnalysisConfig config_;
Argument argument_; Argument argument_;
}; };
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <sys/time.h>
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include <set> #include <set>
...@@ -23,32 +22,14 @@ limitations under the License. */ ...@@ -23,32 +22,14 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DEFINE_bool(profile, false, "Turn on profiler for fluid"); DEFINE_bool(profile, false, "Turn on profiler for fluid");
namespace paddle { namespace paddle {
namespace { namespace {
using paddle::inference::Timer;
// Timer for timer
class Timer {
public:
double start;
double startu;
void tic() {
struct timeval tp;
gettimeofday(&tp, NULL);
start = tp.tv_sec;
startu = tp.tv_usec;
}
double toc() {
struct timeval tp;
gettimeofday(&tp, NULL);
double used_time_ms =
(tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
return used_time_ms;
}
};
template <class T> template <class T>
std::string num2str(T a) { std::string num2str(T a) {
...@@ -62,14 +43,14 @@ void NativePaddlePredictor::PrepareFeedFetch() { ...@@ -62,14 +43,14 @@ void NativePaddlePredictor::PrepareFeedFetch() {
for (auto *op : inference_program_->Block(0).AllOps()) { for (auto *op : inference_program_->Block(0).AllOps()) {
if (op->Type() == "feed") { if (op->Type() == "feed") {
int idx = boost::get<int>(op->GetAttr("col")); int idx = boost::get<int>(op->GetAttr("col"));
if (feeds_.size() <= (size_t)idx) { if (feeds_.size() <= static_cast<size_t>(idx)) {
feeds_.resize(idx + 1); feeds_.resize(idx + 1);
} }
feeds_[idx] = op; feeds_[idx] = op;
feed_names_[op->Output("Out")[0]] = idx; feed_names_[op->Output("Out")[0]] = idx;
} else if (op->Type() == "fetch") { } else if (op->Type() == "fetch") {
int idx = boost::get<int>(op->GetAttr("col")); int idx = boost::get<int>(op->GetAttr("col"));
if (fetchs_.size() <= (size_t)idx) { if (fetchs_.size() <= static_cast<size_t>(idx)) {
fetchs_.resize(idx + 1); fetchs_.resize(idx + 1);
} }
fetchs_[idx] = op; fetchs_[idx] = op;
...@@ -80,7 +61,7 @@ void NativePaddlePredictor::PrepareFeedFetch() { ...@@ -80,7 +61,7 @@ void NativePaddlePredictor::PrepareFeedFetch() {
bool NativePaddlePredictor::Init( bool NativePaddlePredictor::Init(
std::shared_ptr<framework::Scope> parent_scope) { std::shared_ptr<framework::Scope> parent_scope) {
VLOG(3) << "Predictor::init()"; VLOG(3) << "Predictor::init()";
#if !defined(_WIN32)
if (FLAGS_profile) { if (FLAGS_profile) {
LOG(WARNING) << "Profiler is actived, might affect the performance"; LOG(WARNING) << "Profiler is actived, might affect the performance";
LOG(INFO) << "You can turn off by set gflags '-profile false'"; LOG(INFO) << "You can turn off by set gflags '-profile false'";
...@@ -89,6 +70,7 @@ bool NativePaddlePredictor::Init( ...@@ -89,6 +70,7 @@ bool NativePaddlePredictor::Init(
: platform::ProfilerState::kCPU; : platform::ProfilerState::kCPU;
platform::EnableProfiler(tracking_device); platform::EnableProfiler(tracking_device);
} }
#endif
if (config_.use_gpu) { if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device);
...@@ -133,10 +115,12 @@ bool NativePaddlePredictor::Init( ...@@ -133,10 +115,12 @@ bool NativePaddlePredictor::Init(
} }
NativePaddlePredictor::~NativePaddlePredictor() { NativePaddlePredictor::~NativePaddlePredictor() {
#if !defined(_WIN32)
if (FLAGS_profile) { if (FLAGS_profile) {
platform::DisableProfiler(platform::EventSortingKey::kTotal, platform::DisableProfiler(platform::EventSortingKey::kTotal,
"./profile.log"); "./profile.log");
} }
#endif
if (sub_scope_) { if (sub_scope_) {
scope_->DeleteScope(sub_scope_); scope_->DeleteScope(sub_scope_);
} }
...@@ -179,15 +163,21 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() { ...@@ -179,15 +163,21 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
LOG(ERROR) << "fail to call Init"; LOG(ERROR) << "fail to call Init";
return nullptr; return nullptr;
} }
#ifdef __clang__
// fix clang compile error
return cls;
#else
// fix manylinux compile error. // fix manylinux compile error.
return std::move(cls); return std::move(cls);
#endif
} }
bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework::Scope *scope) { framework::Scope *scope) {
VLOG(3) << "Predictor::set_feed"; VLOG(3) << "Predictor::set_feed";
if (inputs.size() != feeds_.size()) { if (inputs.size() != feeds_.size()) {
LOG(ERROR) << "wrong feed input size."; LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
<< inputs.size();
return false; return false;
} }
for (size_t i = 0; i < inputs.size(); ++i) { for (size_t i = 0; i < inputs.size(); ++i) {
...@@ -329,7 +319,12 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -329,7 +319,12 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) { if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
return nullptr; return nullptr;
} }
#ifdef __clang__
// fix clang compile error
return predictor;
#else
return std::move(predictor); return std::move(predictor);
#endif
} }
} // namespace paddle } // namespace paddle
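The paired #ifdef __clang__ returns above exist because clang (building with warnings as errors) rejects a redundant std::move on a locally returned value, while the older gcc used for manylinux wheels still needs the explicit move. A self-contained illustration of the same pattern, with a made-up type name:

#include <memory>
#include <utility>

struct Predictor {};

std::unique_ptr<Predictor> Make() {
  std::unique_ptr<Predictor> p(new Predictor);
#ifdef __clang__
  // clang: the explicit move is pessimizing (it blocks copy elision /
  // the implicit move on a same-type return), so return the local directly.
  return p;
#else
  // older gcc (manylinux toolchain): keep the explicit std::move.
  return std::move(p);
#endif
}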
...@@ -3,6 +3,11 @@ cmake_minimum_required(VERSION 3.0) ...@@ -3,6 +3,11 @@ cmake_minimum_required(VERSION 3.0)
project(cpp_inference_demo CXX C) project(cpp_inference_demo CXX C)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if (WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX "lib")
else()
set(CMAKE_STATIC_LIBRARY_PREFIX "")
endif()
if(NOT DEFINED PADDLE_LIB) if(NOT DEFINED PADDLE_LIB)
message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
...@@ -32,44 +37,56 @@ endif(NOT WIN32) ...@@ -32,44 +37,56 @@ endif(NOT WIN32)
include_directories("${PADDLE_LIB}/third_party/boost") include_directories("${PADDLE_LIB}/third_party/boost")
include_directories("${PADDLE_LIB}/third_party/eigen3") include_directories("${PADDLE_LIB}/third_party/eigen3")
if (NOT WIN32)
link_directories("${PADDLE_LIB}/third_party/install/snappy/lib") link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib") link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
endif(NOT WIN32)
link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
link_directories("${PADDLE_LIB}/third_party/install/glog/lib") link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") link_directories("${PADDLE_LIB}/paddle/fluid/inference")
add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
if(WITH_MKL) if(WITH_MKL)
include_directories("${PADDLE_LIB}/third_party/install/mklml/include") include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5.so) ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn") set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
if(EXISTS ${MKLDNN_PATH}) if(EXISTS ${MKLDNN_PATH})
include_directories("${MKLDNN_PATH}/include") include_directories("${MKLDNN_PATH}/include")
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
endif() endif()
else() else()
set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a) set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
endif() endif()
# Note: libpaddle_inference_api.so/a must be put before libpaddle_fluid.so/a # Note: libpaddle_inference_api.so/a must be put before libpaddle_fluid.so/a
if(WITH_STATIC_LIB) if(WITH_STATIC_LIB)
set(DEPS set(DEPS
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a) ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
else() else()
set(DEPS set(DEPS
${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so) ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
endif() endif()
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
if (NOT WIN32)
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS} set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB} ${MATH_LIB} ${MKLDNN_LIB}
glog gflags protobuf snappystream snappy z glog gflags protobuf snappystream snappy z
${EXTERNAL_LIB}) ${EXTERNAL_LIB})
else()
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
${EXTERNAL_LIB})
endif(NOT WIN32)
if(WITH_GPU) if(WITH_GPU)
set(DEPS ${DEPS} ${CUDA_LIB}/libcudart.so) set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
endif() endif()
target_link_libraries(${DEMO_NAME} ${DEPS}) target_link_libraries(${DEMO_NAME} ${DEPS})
...@@ -14,7 +14,7 @@ else ...@@ -14,7 +14,7 @@ else
fi fi
PREFIX=inference-vis-demos%2F PREFIX=inference-vis-demos%2F
URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX} URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX}
# download vis_demo data # download vis_demo data
function download() { function download() {
......
...@@ -14,36 +14,19 @@ ...@@ -14,36 +14,19 @@
#pragma once #pragma once
#include <glog/logging.h>
#include <sys/time.h> #include <sys/time.h>
#include <algorithm> #include <algorithm>
#include <numeric>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
// Timer for timer
class Timer {
public:
double start;
double startu;
void tic() {
struct timeval tp;
gettimeofday(&tp, NULL);
start = tp.tv_sec;
startu = tp.tv_usec;
}
double toc() {
struct timeval tp;
gettimeofday(&tp, NULL);
double used_time_ms =
(tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
return used_time_ms;
}
};
static void split(const std::string &str, char sep, static void split(const std::string &str, char sep,
std::vector<std::string> *pieces) { std::vector<std::string> *pieces) {
pieces->clear(); pieces->clear();
...@@ -106,5 +89,45 @@ static void TensorAssignData(PaddleTensor *tensor, ...@@ -106,5 +89,45 @@ static void TensorAssignData(PaddleTensor *tensor,
} }
} }
std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
os << " - type: ";
switch (tensor.dtype) {
case PaddleDType::FLOAT32:
os << "float32";
break;
case PaddleDType::INT64:
os << "int64";
break;
default:
os << "unset";
}
os << '\n';
os << " - shape: " << to_string(tensor.shape) << '\n';
os << " - lod: ";
for (auto &l : tensor.lod) {
os << to_string(l) << "; ";
}
os << "\n";
os << " - data: ";
int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
[](int a, int b) { return a * b; });
for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " ";
}
os << '\n';
return os.str();
}
void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double latency) {
LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat
<< ", threads: " << num_threads << ", thread id: " << tid
<< ", latency: " << latency << "ms";
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
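A short sketch of how the helpers above (Timer from timer.h, PrintTime, DescribeTensor) are typically combined in a benchmark loop. The function, its caller and the include paths are assumptions for illustration, not part of this change:

#include <vector>
#include <glog/logging.h>
#include "paddle/fluid/inference/api/helper.h"               // assumed path
#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Hypothetical benchmark helper: runs `repeat` inferences and reports timing.
void RunAndReport(paddle::PaddlePredictor *predictor,
                  const std::vector<paddle::PaddleTensor> &inputs,
                  int batch_size, int repeat) {
  std::vector<paddle::PaddleTensor> outputs;
  paddle::inference::Timer timer;
  timer.tic();
  for (int i = 0; i < repeat; ++i) {
    predictor->Run(inputs, &outputs);
  }
  paddle::inference::PrintTime(batch_size, repeat, /*num_threads=*/1,
                               /*tid=*/0, timer.toc() / repeat);
  for (auto &t : outputs) {
    LOG(INFO) << paddle::inference::DescribeTensor(t);
  }
}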
...@@ -150,6 +150,21 @@ struct TensorRTConfig : public NativeConfig { ...@@ -150,6 +150,21 @@ struct TensorRTConfig : public NativeConfig {
int workspace_size{1 << 30}; int workspace_size{1 << 30};
}; };
// NOTE WIP, not stable yet.
struct AnalysisConfig : public NativeConfig {
//
enum class IrPassMode {
kSystem, // Use system default passes, not customize.
kInclude, // Specify the passes in `ir_passes`.
kExclude // Specify the disabled passes in `ir_passes`.
};
bool enable_ir_optim = true;
IrPassMode ir_mode{IrPassMode::kExclude};
// attention lstm fuse works only on some specific models, disabled by default.
std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
};
// A factory to help create different predictors. // A factory to help create different predictors.
// //
// FOR EXTENSION DEVELOPER: // FOR EXTENSION DEVELOPER:
......
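A brief sketch of how the IrPassMode knobs above might be set. The pass names are ones registered elsewhere in this change (attention_lstm_fuse_pass, fc_fuse_pass, infer_clean_graph_pass), not a recommended configuration:

#include "paddle/fluid/inference/api/paddle_inference_api.h"

paddle::AnalysisConfig cfg;
cfg.enable_ir_optim = true;
// Default behaviour: run every registered pass except the listed ones.
cfg.ir_mode = paddle::AnalysisConfig::IrPassMode::kExclude;
cfg.ir_passes = {"attention_lstm_fuse_pass"};
// Alternative: whitelist an explicit pass set instead.
// cfg.ir_mode = paddle::AnalysisConfig::IrPassMode::kInclude;
// cfg.ir_passes = {"infer_clean_graph_pass", "fc_fuse_pass"};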
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono> // NOLINT
namespace paddle {
namespace inference {
// Timer for timer
class Timer {
public:
std::chrono::high_resolution_clock::time_point start;
std::chrono::high_resolution_clock::time_point startu;
void tic() { start = std::chrono::high_resolution_clock::now(); }
double toc() {
startu = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> time_span =
std::chrono::duration_cast<std::chrono::duration<double>>(startu -
start);
double used_time_ms = static_cast<double>(time_span.count()) * 1000.0;
return used_time_ms;
}
};
} // namespace inference
} // namespace paddle
{ {
global: global:
*paddle*; *paddle*;
*Pass*;
local: local:
*; *;
}; };
...@@ -178,6 +178,8 @@ function(op_library TARGET) ...@@ -178,6 +178,8 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(relu);\n") file(APPEND ${pybind_file} "USE_OP(relu);\n")
elseif(${TARGET} STREQUAL "fake_dequantize") elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
elseif(${TARGET} STREQUAL "fake_quantize")
file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op") elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference") message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
elseif(${TARGET} STREQUAL "fc") elseif(${TARGET} STREQUAL "fc")
...@@ -293,6 +295,7 @@ op_library(extract_rows_op DEPS memory) ...@@ -293,6 +295,7 @@ op_library(extract_rows_op DEPS memory)
op_library(flatten_op DEPS reshape_op) op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding) op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op) op_library(unstack_op DEPS stack_op)
op_library(fake_quantize_op DEPS memory)
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(conv_op DEPS vol2col depthwise_conv im2col)
......
...@@ -865,8 +865,8 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> { ...@@ -865,8 +865,8 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
void operator()(Device d, X x, Out out, dOut dout, dX dx) const { void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = static_cast<T>(1) / auto temp1 = static_cast<T>(1) /
(static_cast<T>(1) + (static_cast<T>(-beta) * x).exp()); (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
auto temp2 = temp1 * (static_cast<T>(1) - (beta * out)); auto temp2 = temp1 * (static_cast<T>(1) - (static_cast<T>(beta) * out));
dx.device(d) = dout * ((beta * out) + temp2); dx.device(d) = dout * ((static_cast<T>(beta) * out) + temp2);
} }
}; };
......
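The Swish change above only wraps beta in static_cast<T>() so the expression type-checks for every element type T; numerically the gradient is unchanged. For reference, with out = x * sigma(beta * x) the backward rule the functor implements is

\frac{\partial\,\mathrm{out}}{\partial x}
  = \sigma(\beta x) + \beta\,\mathrm{out}\,\bigl(1 - \sigma(\beta x)\bigr)
  = \beta\,\mathrm{out} + \sigma(\beta x)\bigl(1 - \beta\,\mathrm{out}\bigr),
\qquad \sigma(\beta x) = \frac{1}{1 + e^{-\beta x}}

which matches the code: temp1 = sigma(beta x), temp2 = temp1 * (1 - beta*out), dx = dout * (beta*out + temp2).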
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/attention_lstm_op.h" #include "paddle/fluid/operators/attention_lstm_op.h"
#include <sys/time.h>
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/auc_op.h" #include "paddle/fluid/operators/auc_op.h"
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -36,15 +35,12 @@ class AucOp : public framework::OperatorWithKernel { ...@@ -36,15 +35,12 @@ class AucOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(predict_height, label_height, PADDLE_ENFORCE_EQ(predict_height, label_height,
"Out and Label should have same height."); "Out and Label should have same height.");
int num_thres = ctx->Attrs().Get<int>("num_thresholds"); int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
ctx->SetOutputDim("AUC", {1}); ctx->SetOutputDim("AUC", {1});
ctx->SetOutputDim("TPOut", {num_thres}); ctx->SetOutputDim("BatchAUC", {1});
ctx->SetOutputDim("TNOut", {num_thres}); ctx->SetOutputDim("StatPosOut", {num_pred_buckets});
ctx->SetOutputDim("FPOut", {num_thres}); ctx->SetOutputDim("StatNegOut", {num_pred_buckets});
ctx->SetOutputDim("FNOut", {num_thres});
ctx->ShareLoD("Predict", /*->*/ "AUC");
} }
protected: protected:
...@@ -66,25 +62,24 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -66,25 +62,24 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label", AddInput("Label",
"A 2D int tensor indicating the label of the training data. " "A 2D int tensor indicating the label of the training data. "
"shape: [batch_size, 1]"); "shape: [batch_size, 1]");
AddInput("TP", "True-Positive value.");
AddInput("FP", "False-Positive value.");
AddInput("TN", "True-Negative value.");
AddInput("FN", "False-Negative value.");
// TODO(typhoonzero): support weight input // TODO(typhoonzero): support weight input
AddInput("StatPos", "Statistic value when label = 1");
AddInput("StatNeg", "Statistic value when label = 0");
AddOutput("AUC", AddOutput("AUC",
"A scalar representing the " "A scalar representing the "
"current area-under-the-curve."); "current area-under-the-curve.");
AddOutput("TPOut", "True-Positive value."); AddOutput("BatchAUC", "The AUC for current batch");
AddOutput("FPOut", "False-Positive value."); AddOutput("StatPosOut", "Statistic value when label = 1");
AddOutput("TNOut", "True-Negative value."); AddOutput("StatNegOut", "Statistic value when label = 0");
AddOutput("FNOut", "False-Negative value.");
AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.") AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
.SetDefault("ROC"); .SetDefault("ROC");
AddAttr<int>("num_thresholds", AddAttr<int>("num_thresholds",
"The number of thresholds to use when discretizing the" "The number of thresholds to use when discretizing the"
" roc curve.") " roc curve.")
.SetDefault(200); .SetDefault((2 << 12) - 1);
AddComment(R"DOC( AddComment(R"DOC(
Area Under The Curve (AUC) Operator. Area Under The Curve (AUC) Operator.
......
...@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
...@@ -23,106 +23,85 @@ namespace operators { ...@@ -23,106 +23,85 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class AucKernel : public framework::OpKernel<T> { class AucKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
auto* predict = ctx.Input<Tensor>("Predict"); auto *predict = ctx.Input<Tensor>("Predict");
auto* label = ctx.Input<Tensor>("Label"); auto *label = ctx.Input<Tensor>("Label");
auto* auc = ctx.Output<Tensor>("AUC");
std::string curve = ctx.Attr<std::string>("curve");
int num_thresholds = ctx.Attr<int>("num_thresholds");
int num_pred_buckets = num_thresholds + 1;
// Only use output var for now, make sure it's persistable and // Only use output var for now, make sure it's persistable and
// not cleaned up for each batch. // not cleaned up for each batch.
auto* true_positive = ctx.Output<Tensor>("TPOut"); auto *auc = ctx.Output<Tensor>("AUC");
auto* false_positive = ctx.Output<Tensor>("FPOut"); auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
auto* true_negative = ctx.Output<Tensor>("TNOut"); auto *stat_neg = ctx.Output<Tensor>("StatNegOut");
auto* false_negative = ctx.Output<Tensor>("FNOut");
auto* auc_data = auc->mutable_data<double>(ctx.GetPlace()); auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
auc);
std::string curve = ctx.Attr<std::string>("curve"); auto *batch_auc = ctx.Output<Tensor>("BatchAUC");
int num_thresholds = ctx.Attr<int>("num_thresholds"); std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0);
std::vector<double> thresholds_list; std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
thresholds_list.reserve(num_thresholds); calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(),
for (int i = 1; i < num_thresholds - 1; i++) { num_thresholds, batch_auc);
thresholds_list[i] = static_cast<double>(i) / (num_thresholds - 1); }
private:
inline static double trapezoidArea(double X1, double X2, double Y1,
double Y2) {
return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
} }
const double kEpsilon = 1e-7;
thresholds_list[0] = 0.0f - kEpsilon;
thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon;
inline static void calcAuc(const framework::ExecutionContext &ctx,
const framework::Tensor *label,
const framework::Tensor *predict,
int64_t *stat_pos, int64_t *stat_neg,
int num_thresholds,
framework::Tensor *auc_tensor) {
size_t batch_size = predict->dims()[0]; size_t batch_size = predict->dims()[0];
size_t inference_width = predict->dims()[1]; size_t inference_width = predict->dims()[1];
const T *inference_data = predict->data<T>();
const auto *label_data = label->data<int64_t>();
const T* inference_data = predict->data<T>(); auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
const auto* label_data = label->data<int64_t>();
auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace());
auto* fp_data = false_positive->mutable_data<int64_t>(ctx.GetPlace());
for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) {
// calculate TP, FN, TN, FP for current thresh
int64_t tp = 0, fn = 0, tn = 0, fp = 0;
for (size_t i = 0; i < batch_size; i++) { for (size_t i = 0; i < batch_size; i++) {
// NOTE: label_data used as bool, labels > 0 will be treated as true. uint32_t binIdx = static_cast<uint32_t>(
inference_data[i * inference_width + 1] * num_thresholds);
if (label_data[i]) { if (label_data[i]) {
if (inference_data[i * inference_width + 1] >= stat_pos[binIdx] += 1.0;
(thresholds_list[idx_thresh])) {
tp++;
} else {
fn++;
}
} else {
if (inference_data[i * inference_width + 1] >=
(thresholds_list[idx_thresh])) {
fp++;
} else { } else {
tn++; stat_neg[binIdx] += 1.0;
}
}
}
// store rates
tp_data[idx_thresh] += tp;
fn_data[idx_thresh] += fn;
tn_data[idx_thresh] += tn;
fp_data[idx_thresh] += fp;
}
// epsilon to avoid divide by zero.
double epsilon = 1e-6;
// Riemann sum to caculate auc.
Tensor tp_rate, fp_rate, rec_rate;
tp_rate.Resize({num_thresholds});
fp_rate.Resize({num_thresholds});
rec_rate.Resize({num_thresholds});
auto* tp_rate_data = tp_rate.mutable_data<double>(ctx.GetPlace());
auto* fp_rate_data = fp_rate.mutable_data<double>(ctx.GetPlace());
auto* rec_rate_data = rec_rate.mutable_data<double>(ctx.GetPlace());
for (int i = 0; i < num_thresholds; i++) {
tp_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
(tp_data[i] + fn_data[i] + epsilon);
fp_rate_data[i] =
static_cast<double>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
rec_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
(tp_data[i] + fp_data[i] + epsilon);
} }
*auc_data = 0.0f;
if (curve == "ROC") {
for (int i = 0; i < num_thresholds - 1; i++) {
auto dx = fp_rate_data[i] - fp_rate_data[i + 1];
auto y = (tp_rate_data[i] + tp_rate_data[i + 1]) / 2.0f;
*auc_data = *auc_data + dx * y;
} }
} else if (curve == "PR") {
for (int i = 1; i < num_thresholds; i++) { *auc = 0.0f;
auto dx = tp_rate_data[i] - tp_rate_data[i - 1];
auto y = (rec_rate_data[i] + rec_rate_data[i - 1]) / 2.0f; double totPos = 0.0;
*auc_data = *auc_data + dx * y; double totNeg = 0.0;
double totPosPrev = 0.0;
double totNegPrev = 0.0;
int idx = num_thresholds;
while (idx >= 0) {
totPosPrev = totPos;
totNegPrev = totNeg;
totPos += stat_pos[idx];
totNeg += stat_neg[idx];
*auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
--idx;
} }
if (totPos > 0.0 && totNeg > 0.0) {
*auc = *auc / totPos / totNeg;
} }
} }
}; };
......
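The rewritten kernel above replaces per-threshold TP/FP/TN/FN counting with two running histograms (positive and negative sample counts per prediction bucket) and integrates the ROC curve by trapezoids. A framework-free sketch of the same accumulation, for orientation:

#include <cstdint>
#include <vector>

// stat_pos[b] / stat_neg[b]: number of positive / negative samples whose
// predicted probability fell into bucket b (b = floor(p * num_thresholds)).
double BucketAuc(const std::vector<int64_t> &stat_pos,
                 const std::vector<int64_t> &stat_neg) {
  double tot_pos = 0.0, tot_neg = 0.0, auc = 0.0;
  // Walk buckets from the highest score down, exactly like the kernel.
  for (int idx = static_cast<int>(stat_pos.size()) - 1; idx >= 0; --idx) {
    double pos_prev = tot_pos, neg_prev = tot_neg;
    tot_pos += stat_pos[idx];
    tot_neg += stat_neg[idx];
    // Trapezoid between consecutive (FP, TP) points on the ROC curve.
    auc += (tot_neg - neg_prev) * (tot_pos + pos_prev) / 2.0;
  }
  return (tot_pos > 0.0 && tot_neg > 0.0) ? auc / tot_pos / tot_neg : 0.0;
}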
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
/*
* transform that computes target bounding-box regression deltas
* given proposal boxes and ground-truth boxes.
*/
template <typename T>
inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes,
const framework::Tensor& gt_boxes, const T* weights,
const bool normalized, framework::Tensor* box_delta) {
auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes);
auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes);
auto trg = framework::EigenTensor<T, 2>::From(*box_delta);
T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y;
for (int64_t i = 0; i < box_num; ++i) {
ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + (normalized == false);
ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + (normalized == false);
ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w;
ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h;
gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + (normalized == false);
gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + (normalized == false);
gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w;
gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h;
trg(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w;
trg(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h;
trg(i, 2) = std::log(gt_w / ex_w);
trg(i, 3) = std::log(gt_h / ex_h);
if (weights) {
trg(i, 0) = trg(i, 0) / weights[0];
trg(i, 1) = trg(i, 1) / weights[1];
trg(i, 2) = trg(i, 2) / weights[2];
trg(i, 3) = trg(i, 3) / weights[3];
}
}
}
template <typename T>
void Gather(const T* in, const int in_stride, const int* index, const int num,
T* out) {
const int stride_bytes = in_stride * sizeof(T);
for (int i = 0; i < num; ++i) {
int id = index[i];
memcpy(out + i * in_stride, in + id * in_stride, stride_bytes);
}
}
} // namespace operators
} // namespace paddle
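In formulas, with normalized = false and no weights, BoxToDelta above computes per box

t_x = \frac{g_{cx} - e_{cx}}{e_w},\quad
t_y = \frac{g_{cy} - e_{cy}}{e_h},\quad
t_w = \log\frac{g_w}{e_w},\quad
t_h = \log\frac{g_h}{e_h},
\qquad e_w = x_2 - x_1 + 1,\ \ e_{cx} = x_1 + 0.5\,e_w

(and likewise for heights and for the ground-truth box g). A quick check: proposal (0, 0, 9, 9) and ground truth (5, 5, 14, 14) are both 10x10 with centres (5, 5) and (10, 10), so the targets are (0.5, 0.5, 0, 0), a pure translation. When weights are supplied, each component is further divided by the corresponding weight.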
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
...@@ -133,31 +134,6 @@ void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes, ...@@ -133,31 +134,6 @@ void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes,
} }
} }
template <typename T>
void BoxToDelta(int box_num, const Tensor& ex_boxes, const Tensor& gt_boxes,
const std::vector<float>& weights, Tensor* box_delta) {
auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes);
auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes);
auto box_delta_et = framework::EigenTensor<T, 2>::From(*box_delta);
T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y;
for (int64_t i = 0; i < box_num; ++i) {
ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + 1;
ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + 1;
ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w;
ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h;
gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + 1;
gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + 1;
gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w;
gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h;
box_delta_et(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0];
box_delta_et(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1];
box_delta_et(i, 2) = log(gt_w / ex_w) / ex_w / weights[2];
box_delta_et(i, 3) = log(gt_h / ex_h) / ex_h / weights[3];
}
}
template <typename T> template <typename T>
std::vector<std::vector<int>> SampleFgBgGt( std::vector<std::vector<int>> SampleFgBgGt(
const platform::CPUDeviceContext& context, Tensor* iou, const platform::CPUDeviceContext& context, Tensor* iou,
...@@ -243,12 +219,11 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, ...@@ -243,12 +219,11 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
Tensor* sampled_labels, Tensor* sampled_gts) { Tensor* sampled_labels, Tensor* sampled_gts) {
int fg_num = fg_inds.size(); int fg_num = fg_inds.size();
int bg_num = bg_inds.size(); int bg_num = bg_inds.size();
int gt_num = fg_num + bg_num;
Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t; Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t;
int* fg_inds_data = fg_inds_t.mutable_data<int>({fg_num}, context.GetPlace()); int* fg_inds_data = fg_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
int* bg_inds_data = bg_inds_t.mutable_data<int>({bg_num}, context.GetPlace()); int* bg_inds_data = bg_inds_t.mutable_data<int>({bg_num}, context.GetPlace());
int* gt_box_inds_data = int* gt_box_inds_data =
gt_box_inds_t.mutable_data<int>({gt_num}, context.GetPlace()); gt_box_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
int* gt_label_inds_data = int* gt_label_inds_data =
gt_label_inds_t.mutable_data<int>({fg_num}, context.GetPlace()); gt_label_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
std::copy(fg_inds.begin(), fg_inds.end(), fg_inds_data); std::copy(fg_inds.begin(), fg_inds.end(), fg_inds_data);
...@@ -303,18 +278,20 @@ std::vector<Tensor> SampleRoisForOneImage( ...@@ -303,18 +278,20 @@ std::vector<Tensor> SampleRoisForOneImage(
// Gather boxes and labels // Gather boxes and labels
Tensor sampled_boxes, sampled_labels, sampled_gts; Tensor sampled_boxes, sampled_labels, sampled_gts;
int boxes_num = fg_inds.size() + bg_inds.size(); int fg_num = fg_inds.size();
int bg_num = bg_inds.size();
int boxes_num = fg_num + bg_num;
framework::DDim bbox_dim({boxes_num, kBoxDim}); framework::DDim bbox_dim({boxes_num, kBoxDim});
sampled_boxes.mutable_data<T>(bbox_dim, context.GetPlace()); sampled_boxes.mutable_data<T>(bbox_dim, context.GetPlace());
sampled_labels.mutable_data<int>({boxes_num}, context.GetPlace()); sampled_labels.mutable_data<int>({boxes_num}, context.GetPlace());
sampled_gts.mutable_data<T>(bbox_dim, context.GetPlace()); sampled_gts.mutable_data<T>({fg_num, kBoxDim}, context.GetPlace());
GatherBoxesLabels<T>(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds, GatherBoxesLabels<T>(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds,
gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts); gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts);
// Compute targets // Compute targets
Tensor bbox_targets_single; Tensor bbox_targets_single;
bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace()); bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace());
BoxToDelta<T>(boxes_num, sampled_boxes, sampled_gts, bbox_reg_weights, BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, nullptr, false,
&bbox_targets_single); &bbox_targets_single);
// Scale rois // Scale rois
...@@ -427,7 +404,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> { ...@@ -427,7 +404,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
auto rpn_rois_lod = rpn_rois->lod().back(); auto rpn_rois_lod = rpn_rois->lod().back();
auto gt_classes_lod = gt_classes->lod().back(); auto gt_classes_lod = gt_classes->lod().back();
auto gt_boxes_lod = gt_boxes->lod().back(); auto gt_boxes_lod = gt_boxes->lod().back();
for (size_t i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
Tensor rpn_rois_slice = Tensor rpn_rois_slice =
rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]);
Tensor gt_classes_slice = Tensor gt_classes_slice =
......
...@@ -311,8 +311,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -311,8 +311,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4}, rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
context.GetPlace()); context.GetPlace());
rpn_roi_probs->mutable_data<T>({scores->numel() / 4, 1}, rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());
context.GetPlace());
Tensor bbox_deltas_swap, scores_swap; Tensor bbox_deltas_swap, scores_swap;
bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox}, bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
...@@ -421,7 +420,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> { ...@@ -421,7 +420,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
CPUGather<T>(ctx, proposals, keep, &bbox_sel); CPUGather<T>(ctx, proposals, keep, &bbox_sel);
CPUGather<T>(ctx, scores_sel, keep, &scores_filter); CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
if (nms_thresh <= 0) { if (nms_thresh <= 0) {
return std::make_pair(bbox_sel, scores_sel); return std::make_pair(bbox_sel, scores_filter);
} }
Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta); Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta);
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <random> #include <random>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
...@@ -46,156 +47,219 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { ...@@ -46,156 +47,219 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
auto in_dims = ctx->GetInputDim("DistMat"); auto in_dims = ctx->GetInputDim("DistMat");
PADDLE_ENFORCE_EQ(in_dims.size(), 2, PADDLE_ENFORCE_EQ(in_dims.size(), 2,
"The rank of Input(DistMat) must be 2."); "The rank of Input(DistMat) must be 2.");
ctx->SetOutputDim("LocationIndex", {-1});
ctx->SetOutputDim("ScoreIndex", {-1});
ctx->SetOutputDim("TargetLabel", {-1, 1});
ctx->SetOutputDim("TargetBBox", {-1, 4});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>("DistMat")->type()),
platform::CPUPlace());
} }
}; };
template <typename T> template <typename T>
class RpnTargetAssignKernel : public framework::OpKernel<T> { class RpnTargetAssignKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override {
auto* anchor_t = context.Input<Tensor>("Anchor"); // (H*W*A) * 4
auto* gt_bbox_t = context.Input<Tensor>("GtBox");
auto* dist_t = context.Input<LoDTensor>("DistMat");
auto* loc_index_t = context.Output<Tensor>("LocationIndex");
auto* score_index_t = context.Output<Tensor>("ScoreIndex");
auto* tgt_bbox_t = context.Output<Tensor>("TargetBBox");
auto* tgt_lbl_t = context.Output<Tensor>("TargetLabel");
auto lod = dist_t->lod().back();
int64_t batch_num = static_cast<int64_t>(lod.size() - 1);
int64_t anchor_num = dist_t->dims()[1];
PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]);
int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im");
float pos_threshold = context.Attr<float>("rpn_positive_overlap");
float neg_threshold = context.Attr<float>("rpn_negative_overlap");
float fg_fraction = context.Attr<float>("fg_fraction");
int fg_num_per_batch = static_cast<int>(rpn_batch_size * fg_fraction);
int64_t max_num = batch_num * anchor_num;
auto place = context.GetPlace();
tgt_bbox_t->mutable_data<T>({max_num, 4}, place);
auto* loc_index = loc_index_t->mutable_data<int>({max_num}, place);
auto* score_index = score_index_t->mutable_data<int>({max_num}, place);
Tensor tmp_tgt_lbl;
auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data<int64_t>({max_num}, place);
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
iset(dev_ctx, &tmp_tgt_lbl, static_cast<int64_t>(-1));
std::random_device rnd;
std::minstd_rand engine;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
engine.seed(seed);
int fg_num = 0;
int bg_num = 0;
for (int i = 0; i < batch_num; ++i) {
Tensor dist = dist_t->Slice(lod[i], lod[i + 1]);
Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]);
auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold,
rpn_batch_size, fg_num_per_batch, engine,
tmp_lbl_data + i * anchor_num);
int cur_fg_num = fg_bg_gt[0].size();
int cur_bg_num = fg_bg_gt[1].size();
std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index,
[i, anchor_num](int d) { return d + i * anchor_num; });
memcpy(score_index, loc_index, cur_fg_num * sizeof(int));
std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(),
score_index + cur_fg_num,
[i, anchor_num](int d) { return d + i * anchor_num; });
// get target bbox deltas
if (cur_fg_num) {
Tensor fg_gt;
T* gt_data = fg_gt.mutable_data<T>({cur_fg_num, 4}, place);
Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num);
T* tgt_data = tgt_bbox.data<T>();
Gather<T>(anchor_t->data<T>(), 4,
reinterpret_cast<int*>(&fg_bg_gt[0][0]), cur_fg_num,
tgt_data);
Gather<T>(gt_bbox.data<T>(), 4, reinterpret_cast<int*>(&fg_bg_gt[2][0]),
cur_fg_num, gt_data);
BoxToDelta<T>(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox);
}
loc_index += cur_fg_num;
score_index += cur_fg_num + cur_bg_num;
fg_num += cur_fg_num;
bg_num += cur_bg_num;
}
int lbl_num = fg_num + bg_num;
PADDLE_ENFORCE_LE(fg_num, max_num);
PADDLE_ENFORCE_LE(lbl_num, max_num);
tgt_bbox_t->Resize({fg_num, 4});
loc_index_t->Resize({fg_num});
score_index_t->Resize({lbl_num});
auto* lbl_data = tgt_lbl_t->mutable_data<int64_t>({lbl_num, 1}, place);
Gather<int64_t>(tmp_lbl_data, 1, score_index_t->data<int>(), lbl_num,
lbl_data);
}
private:
void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max, void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max,
const int row, const int col, const float pos_threshold, const int row, const int col, const float pos_threshold,
const float neg_threshold, int64_t* target_label_data, const float neg_threshold, int64_t* target_label,
std::vector<int>* fg_inds, std::vector<int>* bg_inds) const { std::vector<int>* fg_inds, std::vector<int>* bg_inds) const {
int fg_offset = fg_inds->size(); float epsilon = 0.0001;
int bg_offset = bg_inds->size();
for (int64_t i = 0; i < row; ++i) { for (int64_t i = 0; i < row; ++i) {
const T* v = dist_data + i * col; const T* v = dist_data + i * col;
T max_dist = *std::max_element(v, v + col); T max = *std::max_element(v, v + col);
for (int64_t j = 0; j < col; ++j) { for (int64_t j = 0; j < col; ++j) {
T val = dist_data[i * col + j]; if (std::abs(max - v[j]) < epsilon) {
if (val == max_dist) target_label_data[j] = 1; target_label[j] = 1;
}
} }
} }
// Pick the fg/bg and count the number // Pick the fg/bg
const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
for (int64_t j = 0; j < col; ++j) { for (int64_t j = 0; j < col; ++j) {
if (anchor_to_gt_max.data<T>()[j] > pos_threshold) { if (anchor_to_gt_max_data[j] >= pos_threshold) {
target_label_data[j] = 1; target_label[j] = 1;
} else if (anchor_to_gt_max.data<T>()[j] < neg_threshold) { } else if (anchor_to_gt_max_data[j] < neg_threshold) {
target_label_data[j] = 0; target_label[j] = 0;
} }
if (target_label_data[j] == 1) { if (target_label[j] == 1) {
fg_inds->push_back(fg_offset + j); fg_inds->push_back(j);
} else if (target_label_data[j] == 0) { } else if (target_label[j] == 0) {
bg_inds->push_back(bg_offset + j); bg_inds->push_back(j);
} }
} }
} }
void ReservoirSampling(const int num, const int offset, void ReservoirSampling(const int num, std::minstd_rand engine,
std::minstd_rand engine,
std::vector<int>* inds) const { std::vector<int>* inds) const {
std::uniform_real_distribution<float> uniform(0, 1); std::uniform_real_distribution<float> uniform(0, 1);
const int64_t size = static_cast<int64_t>(inds->size() - offset); size_t len = inds->size();
if (size > num) { if (len > static_cast<size_t>(num)) {
for (int64_t i = num; i < size; ++i) { for (size_t i = num; i < len; ++i) {
int rng_ind = std::floor(uniform(engine) * i); int rng_ind = std::floor(uniform(engine) * i);
if (rng_ind < num) if (rng_ind < num)
std::iter_swap(inds->begin() + rng_ind + offset, std::iter_swap(inds->begin() + rng_ind, inds->begin() + i);
inds->begin() + i + offset);
} }
inds->resize(num);
} }
} }
void RpnTargetAssign(const framework::ExecutionContext& ctx, // std::vector<std::vector<int>> RpnTargetAssign(
const Tensor& dist, const float pos_threshold, std::vector<std::vector<int>> SampleFgBgGt(
const float neg_threshold, const int rpn_batch_size, const platform::CPUDeviceContext& ctx, const Tensor& dist,
const int fg_num, std::minstd_rand engine, const float pos_threshold, const float neg_threshold,
std::vector<int>* fg_inds, std::vector<int>* bg_inds, const int rpn_batch_size, const int fg_num, std::minstd_rand engine,
int64_t* target_label_data) const { int64_t* target_label) const {
auto* dist_data = dist.data<T>(); auto* dist_data = dist.data<T>();
int64_t row = dist.dims()[0]; int row = dist.dims()[0];
int64_t col = dist.dims()[1]; int col = dist.dims()[1];
int fg_offset = fg_inds->size();
int bg_offset = bg_inds->size(); std::vector<int> fg_inds;
std::vector<int> bg_inds;
std::vector<int> gt_inds;
// Calculate the max IoU between anchors and gt boxes // Calculate the max IoU between anchors and gt boxes
Tensor anchor_to_gt_max; // Map from anchor to gt box that has highest overlap
anchor_to_gt_max.mutable_data<T>( auto place = ctx.GetPlace();
framework::make_ddim({static_cast<int64_t>(col), 1}), Tensor anchor_to_gt_max, anchor_to_gt_argmax;
platform::CPUPlace()); anchor_to_gt_max.mutable_data<T>({col}, place);
auto& place = *ctx.template device_context<platform::CPUDeviceContext>() int* argmax = anchor_to_gt_argmax.mutable_data<int>({col}, place);
.eigen_device();
auto x = EigenMatrix<T>::From(dist); auto x = framework::EigenMatrix<T>::From(dist);
auto x_col_max = EigenMatrix<T>::From(anchor_to_gt_max); auto x_col_max = framework::EigenVector<T>::Flatten(anchor_to_gt_max);
x_col_max.device(place) = auto x_col_argmax =
x.maximum(Eigen::DSizes<int, 1>(0)) framework::EigenVector<int>::Flatten(anchor_to_gt_argmax);
.reshape(Eigen::DSizes<int, 2>(static_cast<int64_t>(col), 1)); x_col_max = x.maximum(Eigen::DSizes<int, 1>(0));
x_col_argmax = x.argmax(0).template cast<int>();
// Follow the Faster RCNN's implementation // Follow the Faster RCNN's implementation
ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold, ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold,
neg_threshold, target_label_data, fg_inds, bg_inds); neg_threshold, target_label, &fg_inds, &bg_inds);
// Reservoir Sampling // Reservoir Sampling
ReservoirSampling(fg_num, fg_offset, engine, fg_inds); ReservoirSampling(fg_num, engine, &fg_inds);
int bg_num = rpn_batch_size - (fg_inds->size() - fg_offset); int fg_num2 = static_cast<int>(fg_inds.size());
ReservoirSampling(bg_num, bg_offset, engine, bg_inds); int bg_num = rpn_batch_size - fg_num2;
} ReservoirSampling(bg_num, engine, &bg_inds);
void Compute(const framework::ExecutionContext& context) const override { gt_inds.reserve(fg_num2);
auto* dist = context.Input<LoDTensor>("DistMat"); for (int i = 0; i < fg_num2; ++i) {
auto* loc_index = context.Output<Tensor>("LocationIndex"); gt_inds.emplace_back(argmax[fg_inds[i]]);
auto* score_index = context.Output<Tensor>("ScoreIndex");
auto* tgt_lbl = context.Output<Tensor>("TargetLabel");
auto col = dist->dims()[1];
int64_t n = dist->lod().size() == 0UL
? 1
: static_cast<int64_t>(dist->lod().back().size() - 1);
if (dist->lod().size()) {
PADDLE_ENFORCE_EQ(dist->lod().size(), 1UL,
"Only support 1 level of LoD.");
} }
int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im"); std::vector<std::vector<int>> fg_bg_gt;
float pos_threshold = context.Attr<float>("rpn_positive_overlap"); fg_bg_gt.emplace_back(fg_inds);
float neg_threshold = context.Attr<float>("rpn_negative_overlap"); fg_bg_gt.emplace_back(bg_inds);
float fg_fraction = context.Attr<float>("fg_fraction"); fg_bg_gt.emplace_back(gt_inds);
int fg_num = static_cast<int>(rpn_batch_size * fg_fraction);
int64_t* target_label_data = return fg_bg_gt;
tgt_lbl->mutable_data<int64_t>({n * col, 1}, context.GetPlace());
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
iset(dev_ctx, tgt_lbl, static_cast<int>(-1));
std::vector<int> fg_inds;
std::vector<int> bg_inds;
std::random_device rnd;
std::minstd_rand engine;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
engine.seed(seed);
if (n == 1) {
RpnTargetAssign(context, *dist, pos_threshold, neg_threshold,
rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
target_label_data);
} else {
auto lod = dist->lod().back();
for (size_t i = 0; i < lod.size() - 1; ++i) {
Tensor one_ins = dist->Slice(lod[i], lod[i + 1]);
RpnTargetAssign(context, one_ins, pos_threshold, neg_threshold,
rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
target_label_data + i * col);
}
}
int* loc_index_data = loc_index->mutable_data<int>(
{static_cast<int>(fg_inds.size())}, context.GetPlace());
int* score_index_data = score_index->mutable_data<int>(
{static_cast<int>(fg_inds.size() + bg_inds.size())},
context.GetPlace());
memcpy(loc_index_data, reinterpret_cast<int*>(&fg_inds[0]),
fg_inds.size() * sizeof(int));
memcpy(score_index_data, reinterpret_cast<int*>(&fg_inds[0]),
fg_inds.size() * sizeof(int));
memcpy(score_index_data + fg_inds.size(),
reinterpret_cast<int*>(&bg_inds[0]), bg_inds.size() * sizeof(int));
} }
}; };
class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("Anchor",
"(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4].");
AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4].");
AddInput( AddInput(
"DistMat", "DistMat",
"(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
...@@ -241,12 +305,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -241,12 +305,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
"ScoreIndex", "ScoreIndex",
"(Tensor), The indexes of foreground and background anchors in all " "(Tensor), The indexes of foreground and background anchors in all "
"RPN anchors(The rest anchors are ignored). The shape of the " "RPN anchors(The rest anchors are ignored). The shape of the "
"ScoreIndex is [F + B], F and B depend on the value of input " "ScoreIndex is [F + B], F and B are sampled foreground and backgroud "
"tensor and attributes."); " number.");
AddOutput("TargetLabel", AddOutput("TargetBBox",
"(Tensor<int64_t>), The target bbox deltas with shape "
"[F, 4], F is the sampled foreground number.");
AddOutput(
"TargetLabel",
"(Tensor<int64_t>), The target labels of each anchor with shape " "(Tensor<int64_t>), The target labels of each anchor with shape "
"[K * M, 1], " "[F + B, 1], F and B are sampled foreground and backgroud number.");
"K and M is the same as they are in DistMat.");
AddComment(R"DOC( AddComment(R"DOC(
This operator can be, for given the IoU between the ground truth bboxes and the This operator can be, for given the IoU between the ground truth bboxes and the
anchors, to assign classification and regression targets to each prediction. anchors, to assign classification and regression targets to each prediction.
......
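ReservoirSampling in the kernel above keeps a uniform random subset of at most num indices in place and now also trims the vector to that size. A framework-free sketch of the same algorithm (names are illustrative):

#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

// Keep at most `num` entries of `inds`, each original entry surviving with
// equal probability, then drop the tail, mirroring the kernel above.
void ReservoirSample(int num, std::minstd_rand *engine,
                     std::vector<int> *inds) {
  std::uniform_real_distribution<float> uniform(0, 1);
  size_t len = inds->size();
  if (len <= static_cast<size_t>(num)) return;
  for (size_t i = num; i < len; ++i) {
    int j = static_cast<int>(std::floor(uniform(*engine) * i));
    if (j < num) std::iter_swap(inds->begin() + j, inds->begin() + i);
  }
  inds->resize(num);
}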
...@@ -39,8 +39,17 @@ bool RequestSendHandler::Handle(const std::string& varname, ...@@ -39,8 +39,17 @@ bool RequestSendHandler::Handle(const std::string& varname,
const std::string& out_var_name) { const std::string& out_var_name) {
VLOG(4) << "RequestSendHandler:" << varname; VLOG(4) << "RequestSendHandler:" << varname;
// Sync
if (varname == BATCH_BARRIER_MESSAGE) {
VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
rpc_server_->IncreaseBatchBarrier(kRequestSend);
} else if (varname == COMPLETE_MESSAGE) {
VLOG(3) << "sync: recv complete message";
rpc_server_->Complete();
} else {
// Async // Async
if (!sync_mode_) { if (!sync_mode_) {
VLOG(3) << "async process var: " << varname;
rpc_server_->Profiler().OneStep(); rpc_server_->Profiler().OneStep();
try { try {
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
...@@ -50,17 +59,7 @@ bool RequestSendHandler::Handle(const std::string& varname, ...@@ -50,17 +59,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
return false; return false;
} }
return true; return true;
} } else { // sync
// Sync
if (varname == BATCH_BARRIER_MESSAGE) {
VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
rpc_server_->IncreaseBatchBarrier(kRequestSend);
} else if (varname == COMPLETE_MESSAGE) {
VLOG(3) << "sync: recv complete message";
rpc_server_->Complete();
} else {
VLOG(3) << "sync: received var_name: " << varname;
rpc_server_->WaitCond(kRequestSend); rpc_server_->WaitCond(kRequestSend);
VLOG(3) << "sync: processing received var: " << varname; VLOG(3) << "sync: processing received var: " << varname;
...@@ -68,11 +67,13 @@ bool RequestSendHandler::Handle(const std::string& varname, ...@@ -68,11 +67,13 @@ bool RequestSendHandler::Handle(const std::string& varname,
LOG(FATAL) << "sync: Can not find server side var: " << varname; LOG(FATAL) << "sync: Can not find server side var: " << varname;
return false; return false;
} }
if (invar->IsType<framework::SelectedRows>()) { if (invar->IsType<framework::SelectedRows>()) {
std::unique_lock<std::mutex> lock(mutex_sparse_vars_); std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
sparse_vars_.push_back(invar); sparse_vars_.push_back(invar);
} }
} }
}
return true; return true;
} }
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <algorithm> #include <algorithm>
#include <iterator>
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -94,8 +95,11 @@ class RowwiseTransformIterator; ...@@ -94,8 +95,11 @@ class RowwiseTransformIterator;
template <typename T, typename DeviceContext> template <typename T, typename DeviceContext>
class MidWiseTransformIterator; class MidWiseTransformIterator;
// NOTE(dzhwinter): ptrdiff_t in iterator is deprecated in c++17
template <typename T> template <typename T>
class RowwiseTransformIterator<T, platform::CPUDeviceContext> { class RowwiseTransformIterator<T, platform::CPUDeviceContext>
: public std::iterator<std::random_access_iterator_tag, T, std::ptrdiff_t,
T *, T &> {
public: public:
RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {} RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {}
...@@ -126,7 +130,9 @@ class RowwiseTransformIterator<T, platform::CPUDeviceContext> { ...@@ -126,7 +130,9 @@ class RowwiseTransformIterator<T, platform::CPUDeviceContext> {
}; };
template <typename T> template <typename T>
class MidWiseTransformIterator<T, platform::CPUDeviceContext> { class MidWiseTransformIterator<T, platform::CPUDeviceContext>
: public std::iterator<std::random_access_iterator_tag, T, std::ptrdiff_t,
T *, T &> {
public: public:
MidWiseTransformIterator(const T *ptr, int n, int post) MidWiseTransformIterator(const T *ptr, int n, int post)
: ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
...@@ -479,8 +485,13 @@ void ElemwiseGradComputeNoBroadcast( ...@@ -479,8 +485,13 @@ void ElemwiseGradComputeNoBroadcast(
const framework::Tensor &dout, int axis, framework::Tensor *dx, const framework::Tensor &dout, int axis, framework::Tensor *dx,
framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) {
size_t N = static_cast<size_t>(framework::product(x_dim)); size_t N = static_cast<size_t>(framework::product(x_dim));
#if !defined(_WIN32)
platform::ForRange<DeviceContext> for_range( platform::ForRange<DeviceContext> for_range(
ctx.template device_context<DeviceContext>(), N); ctx.template device_context<DeviceContext>(), N);
#else
platform::ForRange<DeviceContext> for_range(
ctx.device_context<DeviceContext>(), N);
#endif // !_WIN32
for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{ for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op, x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()), dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
...@@ -633,13 +644,13 @@ void ElementwiseGradCompute(const framework::ExecutionContext &ctx, ...@@ -633,13 +644,13 @@ void ElementwiseGradCompute(const framework::ExecutionContext &ctx,
template <typename Functor, typename DeviceContext, typename T, template <typename Functor, typename DeviceContext, typename T,
typename OutType = T> typename OutType = T>
void ElementwiseComputeEx(const framework::ExecutionContext &ctx, void ElementwiseComputeEx(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *x,
const framework::Tensor *y, int axis, Functor func, const framework::Tensor *y, int axis, Functor func,
framework::Tensor *z) { framework::Tensor *z) {
TransformFunctor<Functor, T, DeviceContext, OutType> functor( TransformFunctor<Functor, T, DeviceContext, OutType> functor(
x, y, z, ctx.template device_context<DeviceContext>(), func); x, y, z, ctx.template device_context<DeviceContext>(), func);
auto x_dims = x->dims(); auto x_dims = x->dims();
auto y_dims_untrimed = y->dims(); auto y_dims_untrimed = y->dims();
PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(), PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
......
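The #if !defined(_WIN32) branch in ElemwiseGradComputeNoBroadcast above calls device_context without the template disambiguator on Windows, presumably to accommodate that toolchain. A small self-contained sketch (all names hypothetical) of what the keyword does on a dependent member-template call:

#include <iostream>

struct Context {
  template <typename T>
  T value() const {
    return T(42);
  }
};

template <typename Ctx>
void Caller(const Ctx& ctx) {
  // `value` is a dependent member template here, so standard C++ requires the
  // `template` keyword to parse the `<` as a template argument list.
  int v = ctx.template value<int>();
  std::cout << v << std::endl;
}

int main() {
  Caller(Context{});
  return 0;
}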
...@@ -14,86 +14,198 @@ limitations under the License. */ ...@@ -14,86 +14,198 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_quantize_op.h" #include "paddle/fluid/operators/fake_quantize_op.h"
#include <string> #include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/clip_op.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class FakeQuantizeOp : public framework::OperatorWithKernel { template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVectorArrayMap =
Eigen::TensorMap<Eigen::Tensor<T, 1, MajorType, IndexType>>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using ConstEigenVectorArrayMap =
Eigen::TensorMap<const Eigen::Tensor<T, 1, MajorType, IndexType>>;
template <typename T>
struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx, const T* in,
const int num, T* out) {
Eigen::DSizes<Eigen::DenseIndex, 1> idim(num);
Eigen::DSizes<Eigen::DenseIndex, 1> odim(1);
Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>> in_e(in, idim);
Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>> out_e(out, odim);
out_e = in_e.abs().maximum();
}
};
template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>;
template <typename T>
struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, framework::Tensor* out) {
T s = scale.data<T>()[0];
platform::Transform<platform::CPUDeviceContext> trans;
trans(ctx, in.data<T>(), in.data<T>() + in.numel(),
out->mutable_data<T>(ctx.GetPlace()), ClipFunctor<T>(-s, s));
auto in_e = framework::EigenVector<T>::Flatten(in);
auto out_e = framework::EigenVector<T>::Flatten(*out);
out_e.device(*ctx.eigen_device()) = (bin_cnt / s * in_e).round();
}
};
template struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, float>;
template <typename T>
struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& cur_scale,
const framework::Tensor& last_scale,
const framework::Tensor& iter, const int window_size,
framework::Tensor* scales_arr, framework::Tensor* out_scale) {
T* scale_arr = scales_arr->mutable_data<T>(ctx.GetPlace());
int64_t it = iter.data<int64_t>()[0];
int idx = it % window_size;
T removed = scale_arr[idx];
T cur = cur_scale.data<T>()[0];
scale_arr[idx] = cur;
T max = last_scale.data<T>()[0];
if (max < cur) {
max = cur;
} else if (fabs(removed - max) < 1e-6) {
int size = (it > window_size) ? window_size : it;
FindAbsMaxFunctor<platform::CPUDeviceContext, T>()(ctx, scale_arr, size,
&max);
}
out_scale->mutable_data<T>(ctx.GetPlace())[0] = max;
}
};
template struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, float>;
class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel {
public: public:
FakeQuantizeOp(const std::string &type, FakeQuantizeAbsMaxOp(const std::string& type,
const framework::VariableNameMap &inputs, const framework::VariableNameMap& inputs,
const framework::VariableNameMap &outputs, const framework::VariableNameMap& outputs,
const framework::AttributeMap &attrs) const framework::AttributeMap& attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {} : OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of FakeQuantizeOp should not be null."); "Input(X) of FakeQuantizeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FakeQuantizeOp should not be null."); "Output(Out) of FakeQuantizeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("OutMovingScale"), PADDLE_ENFORCE(ctx->HasOutput("OutScale"),
"OutMovingScale(Out) of FakeQuantizeOp should not be null"); "Output(Scale) of FakeQuantizeOp should not be null.");
// if (ctx->HasInput("InMovingScale")) {
ctx->SetOutputDim("OutMovingScale", ctx->GetInputDim("InMovingScale"));
//}
// if (ctx->HasInput("InScales")) {
PADDLE_ENFORCE(ctx->HasOutput("OutScales"),
"OutScales(Out) of FakeQuantizeOp should not be null");
ctx->SetOutputDim("OutScales", ctx->GetInputDim("InScales"));
// PADDLE_ENFORCE_EQ(ctx->Inputs("InScales")[0],
// ctx->Outputs("OutScales")[0],
// "Mean and MeanOut should share the same memory");
//}
ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->SetOutputDim("OutScale", {1});
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
}; };
class FakeQuantizeOpMaker : public framework::OpProtoAndCheckerMaker { class FakeQuantizeAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("X", "(Tensor) Input tensor of scale operator."); AddInput("X", "(Tensor) Input is float data type.");
AddInput("InScales", "(Tensor) scale buffer, used in static quantization.") AddOutput("Out",
.AsDispensable(); "(Tensor) Output of quantized low level tensor, "
AddInput("InMovingScale", "Last scale, used in static quantization.") "but also saved as float data type.");
.AsDispensable(); AddOutput("OutScale", "(Tensor) Current scale");
AddInput("InCurrentIter",
"Last iteration number, used in static quantization.")
.AsDispensable();
AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
AddOutput("OutScales",
"(Tensor) scale buffer, used in static quantization.")
.AsDispensable();
AddOutput("OutMovingScale", " Current scale");
AddOutput("OutCurrentIter", "Current iteration number.").AsDispensable();
AddAttr<std::string>("quantize_type",
"(string, default abs_max)"
"The scaling tpe of the quantize operator.")
.SetDefault("abs_max");
AddAttr<int>("window_size", "(int, default 10000)").SetDefault(10000);
AddAttr<int>("bit_length", "(int, default 8)") AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8) .SetDefault(8)
.AddCustomChecker([](const int &bit_length) { .AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
"'bit_length' should be between 1 and 16."); "'bit_length' should be between 1 and 16.");
}); });
AddAttr<bool>("is_test", "").SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
FakeQuantize operator FakeQuantize operator
quantize_type = abs_max: $$scale = max(abs(X))$$
$$range = 2^{bit_length - 1} - 1$$
$$Out = round(X/scale * range)$$
$$scale = max(abs(x))$$ )DOC");
}
};
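For reference, a standalone sketch (not part of the operator code) of the abs_max fake quantization described in the comment block above, assuming bit_length = 8 so that range = 2^7 - 1 = 127:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<float> x = {0.5f, -1.25f, 2.0f, -0.1f};
  const int bit_length = 8;
  const float range = static_cast<float>((1 << (bit_length - 1)) - 1);  // 127
  float scale = 0.f;
  for (float v : x) scale = std::max(scale, std::fabs(v));  // scale = max(abs(X))
  for (float v : x) {
    const float q = std::round(v / scale * range);  // Out = round(X/scale * range)
    std::printf("%8.3f -> %6.1f\n", v, q);
  }
  return 0;
}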
quantize_type = range_abs_max: class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
public:
FakeQuantizeRangeAbsMaxOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
$$scale = max(max(abs(x)), history_abs_max)$$ void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of FakeQuantizeRangeAbsMaxOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of FakeQuantizeRangeAbsMaxOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("OutScale"),
"Output(OutScale) of FakeQuantizeRangeAbsMaxOp should not be null");
if (ctx->HasOutput("OutScales")) {
int window_size = ctx->Attrs().Get<int>("window_size");
ctx->SetOutputDim("OutScales", {window_size});
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->SetOutputDim("OutScale", {1});
ctx->ShareLoD("X", /*->*/ "Out");
}
quantize_type = moving_average_abs_max: protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
};
$$scale = 0.1*scale+0.9*new_abs_max)$$ class FakeQuantizeRangeAbsMaxOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) Input is float data type.");
AddInput("InScale", "Last scale.");
AddInput("Iter", "Global step iteration.").AsDispensable();
AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
AddOutput("OutScale", " Current scale");
AddOutput("OutScales", "(Tensor) scale buffer.").AsDispensable();
AddAttr<int>("window_size", "(int, default 10000) window range size.")
.SetDefault(10000);
AddAttr<int>("bit_length", "(int, default 8), quantization bit number.")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
"'bit_length' should be between 1 and 16.");
});
AddAttr<bool>("is_test", "").SetDefault(false);
AddComment(R"DOC(
FakeQuantize operator is used in static quantization.
$$Out = scale*X$$ $$scale = max(max(abs(x)), history_abs_max)$$
$$range = 2^{bit_length - 1} - 1$$
$$Out = round(X/scale * range)$$
)DOC"); )DOC");
} }
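A rough standalone sketch of the range_abs_max bookkeeping stated in the comment above and implemented by FindRangeAbsMaxFunctor earlier in this file; the std::vector window and the function name are illustrative, the operator keeps the window in a tensor:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Keep a circular window of recent scales; only rescan the window when the
// evicted entry happened to be the current maximum.
float UpdateRangeAbsMax(std::vector<float>* window, int64_t iter, float cur,
                        float last_max) {
  const int64_t window_size = static_cast<int64_t>(window->size());
  const int idx = static_cast<int>(iter % window_size);
  const float removed = (*window)[idx];
  (*window)[idx] = cur;
  float max = last_max;
  if (max < cur) {
    max = cur;
  } else if (std::fabs(removed - max) < 1e-6f) {
    int size = static_cast<int>(iter > window_size ? window_size : iter);
    size = std::max(size, 1);  // guard the very first step
    max = *std::max_element(window->begin(), window->begin() + size);
  }
  return max;
}

int main() {
  std::vector<float> window(4, 0.f);
  float running_max = 0.f;
  for (int64_t it = 0; it < 6; ++it) {
    const float cur = 1.0f + 0.1f * static_cast<float>(it);  // this step's abs-max
    running_max = UpdateRangeAbsMax(&window, it, cur, running_max);
  }
  return running_max > 1.0f ? 0 : 1;
}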
...@@ -103,10 +215,16 @@ $$Out = scale*X$$ ...@@ -103,10 +215,16 @@ $$Out = scale*X$$
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxOp,
ops::FakeQuantizeAbsMaxOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fake_quantize_abs_max,
ops::FakeQuantizeAbsMaxKernel<CPU, float>);
REGISTER_OPERATOR(fake_quantize, ops::FakeQuantizeOp, ops::FakeQuantizeOpMaker, REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp,
ops::FakeQuantizeRangeAbsMaxOpMaker,
paddle::framework::EmptyGradOpMaker); paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max,
fake_quantize, ops::FakeQuantizeRangeAbsMaxKernel<CPU, float>);
ops::FakeQuantizeKernel<paddle::platform::CPUDeviceContext, float>,
ops::FakeQuantizeKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/fake_quantize_op.h" #include "paddle/fluid/operators/fake_quantize_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
...@@ -20,7 +21,7 @@ namespace paddle { ...@@ -20,7 +21,7 @@ namespace paddle {
namespace operators { namespace operators {
template <typename T> template <typename T>
__global__ void FindAbsMaxKernel(const int n, const T* in, T* out) { __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x; int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x; int tid = threadIdx.x;
...@@ -43,7 +44,7 @@ __global__ void FindAbsMaxKernel(const int n, const T* in, T* out) { ...@@ -43,7 +44,7 @@ __global__ void FindAbsMaxKernel(const int n, const T* in, T* out) {
__syncthreads(); __syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) { for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i && shared_max_data[tid] < shared_max_data[tid + i]) { if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
shared_max_data[tid] = shared_max_data[tid + i]; shared_max_data[tid] = shared_max_data[tid + i];
} }
__syncthreads(); __syncthreads();
...@@ -53,220 +54,125 @@ __global__ void FindAbsMaxKernel(const int n, const T* in, T* out) { ...@@ -53,220 +54,125 @@ __global__ void FindAbsMaxKernel(const int n, const T* in, T* out) {
} }
} }
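The kernel above reduces each block's candidates to a single maximum with a shared-memory tree reduction. A host-side sketch (illustrative only) of the same folding pattern:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> buf = {3.f, 7.f, 1.f, 5.f, 2.f, 8.f, 4.f, 6.f};
  // Repeatedly fold the upper half into the lower half, keeping the larger
  // value, until index 0 holds the maximum.
  for (size_t stride = buf.size() / 2; stride > 0; stride /= 2) {
    for (size_t tid = 0; tid < stride; ++tid) {
      buf[tid] = std::max(buf[tid], buf[tid + stride]);
    }
  }
  std::printf("max = %f\n", buf[0]);  // 8.0
  return 0;
}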
float FindAbsMaxGpu(const platform::CUDADeviceContext& ctx, const float* array, template <typename T>
int length) { struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
float host_max; void operator()(const platform::CUDADeviceContext& ctx, const T* in,
int kNumTheads = 1024; const int num, T* out) {
int gridDimx = (kNumTheads - 1 + length) / kNumTheads; int block = 1024;
gridDimx = (gridDimx > kNumTheads) ? kNumTheads : gridDimx; int grid = (block - 1 + num) / block;
framework::Tensor t; grid = (grid > block) ? block : grid;
float* device_max = t.mutable_data<float>(framework::make_ddim({gridDimx}),
platform::CUDAPlace()); framework::Tensor max;
FindAbsMaxKernel<float><<<gridDimx, kNumTheads, kNumTheads * sizeof(float), T* max_data =
ctx.stream()>>>(length, array, device_max); max.mutable_data<T>(framework::make_ddim({grid}), ctx.GetPlace());
FindAbsMaxKernel< FindAbsMaxKernel<T><<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(
float><<<1, kNumTheads, kNumTheads * sizeof(float), ctx.stream()>>>( in, num, max_data);
gridDimx, device_max, device_max); FindAbsMaxKernel<T><<<1, block, 1024 * sizeof(T), ctx.stream()>>>(
PADDLE_ENFORCE_EQ( max_data, grid, out);
cudaMemcpy(&host_max, device_max, sizeof(float), cudaMemcpyDeviceToHost), }
cudaSuccess, "cudaMemcpy failed"); };
return host_max;
} template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T> template <typename T>
__global__ void ApplySaturateKernel(const int n, const T* in, T* out, __global__ void ClipAndQuantKernel(const T* in, const T* scale,
int* num_saturate, const T min, const int bin_cnt, const int n, T* out) {
const T max) {
int bid = threadIdx.x + blockIdx.x * blockDim.x; int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x; int tid = threadIdx.x;
extern __shared__ int shared_count[]; T s = scale[0];
shared_count[tid] = 0;
for (int i = bid; i < n; i += blockDim.x * gridDim.x) { for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
if (in[i] > max) { T x = in[i];
out[i] = max; T v = x > s ? s : x;
shared_count[tid] += 1; v = v < -s ? -s : v;
} else if (in[i] < min) { v = bin_cnt / s * v;
out[i] = min; out[i] = round(v);
shared_count[tid] += 1;
} else {
out[i] = in[i];
}
}
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i) {
shared_count[tid] += shared_count[tid + i];
}
__syncthreads();
}
if (tid == 0) {
num_saturate[blockIdx.x] = shared_count[0];
} }
} }
template <typename T> template <typename T>
__global__ void ReduceKernel(const int n, const T* in, T* out) { __global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
int tid = threadIdx.x; const T* last_scale,
extern __shared__ T shared_sum[]; const int64_t* iter,
if (tid < n) { const int window_size, T* scale_arr,
shared_sum[tid] = in[tid]; T* out_scale, int* need_find_max,
int* out_size) {
int it = iter[0];
int idx = it % window_size;
T removed = scale_arr[idx];
T cur = cur_scale[0];
scale_arr[idx] = cur;
T max = last_scale[0];
out_scale[0] = max < cur ? cur : max;
if (fabs(removed - max) < 1e-6) {
need_find_max[0] = 1;
out_size[0] = it > window_size ? window_size : it;
} else { } else {
shared_sum[tid] = T(0); need_find_max[0] = 0;
}
__syncthreads();
// blockDim.x must >= n
for (int i = (n + 1) / 2; i > 0; i >>= 1) {
if (tid < i) {
shared_sum[tid] += shared_sum[tid + i];
}
__syncthreads();
}
if (tid == 0) {
out[0] = shared_sum[0];
} }
} }
template <typename T> template <typename T>
int ApplySaturateGpu(const platform::CUDADeviceContext& ctx, const int n, struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
const T* in, T* out, const T min, const T max) { void operator()(const platform::CUDADeviceContext& ctx,
int host_num_saturate; const framework::Tensor& cur_scale,
int kNumTheads = 1024; const framework::Tensor& last_scale,
int gridDimx = (n + kNumTheads - 1) / kNumTheads; const framework::Tensor& iter, const int window_size,
gridDimx = (gridDimx > kNumTheads) ? kNumTheads : gridDimx; framework::Tensor* scales_arr, framework::Tensor* out_scale) {
framework::Tensor t; const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
int* device_num_saturate = t.mutable_data<int>(
framework::make_ddim({gridDimx}), platform::CUDAPlace()); T* scale_arr = scales_arr->mutable_data<T>(gpu_place);
ApplySaturateKernel< T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
T><<<gridDimx, kNumTheads, kNumTheads * sizeof(T), ctx.stream()>>>(
n, in, out, device_num_saturate, min, max); framework::Tensor need_find_max, out_size;
ReduceKernel<int><<<1, kNumTheads, kNumTheads * sizeof(T), ctx.stream()>>>( int* find_max = need_find_max.mutable_data<int>(gpu_place);
gridDimx, device_num_saturate, device_num_saturate); int* out_size_data = out_size.mutable_data<int>(gpu_place);
PADDLE_ENFORCE_EQ(cudaSuccess,
cudaMemcpy(&host_num_saturate, device_num_saturate, FindRangeAbsMaxAndFillArray<T><<<1, 1, 0, ctx.stream()>>>(
sizeof(int), cudaMemcpyDeviceToHost), cur_scale.data<T>(), last_scale.data<T>(), iter.data<int64_t>(),
"cudaMemcpy failed"); window_size, scale_arr, out_scale_data, find_max, out_size_data);
return host_num_saturate;
} int g_find_max;
memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max,
template <typename DeviceContext, typename T> sizeof(int), 0);
class FakeQuantizeCUDAKernel : public framework::OpKernel<T> { if (g_find_max) {
public: int len;
T FindRangeAbsMax(const platform::CUDADeviceContext& ctx, memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data,
framework::Tensor* scale_list, framework::Tensor* out_scale, sizeof(int), 0);
const T& cur_scale, int window_size, FindAbsMaxFunctor<platform::CUDADeviceContext, T>()(ctx, scale_arr, len,
int current_iter) const { out_scale_data);
T* sl = scale_list->mutable_data<T>(platform::CPUPlace());
T remove_tmp = sl[current_iter];
sl[current_iter] = cur_scale;
T& max_scale = out_scale->mutable_data<T>(platform::CPUPlace())[0];
if (max_scale < cur_scale) {
max_scale = cur_scale;
} else if (fabs(remove_tmp - max_scale) < 1e-6) {
int size = (current_iter > window_size) ? window_size : current_iter;
max_scale = T(FindAbsMaxGpu(ctx, scale_list->data<float>(), size));
}
return max_scale;
}
T FindMovingAverageAbsMmax(framework::Tensor* in_scale,
framework::Tensor* out_scale,
const T& cur_scale) const {
T* ins = in_scale->mutable_data<T>(platform::CPUPlace());
T* outs = out_scale->mutable_data<T>(platform::CPUPlace());
outs[0] = 0.9 * cur_scale + 0.1 * ins[0];
return T(outs[0]);
} }
virtual void Compute(const framework::ExecutionContext& context) const {
PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
"This kernel only runs on GPU device.");
auto& device_ctx = context.cuda_device_context();
auto* tensor = context.Output<framework::Tensor>("Out");
auto* in = context.Input<framework::Tensor>("X");
const bool is_test = context.Attr<bool>("is_test");
tensor->mutable_data<T>(in->place());
context.Output<framework::Tensor>("OutMovingScale")
->mutable_data<T>(
context.Input<framework::Tensor>("InMovingScale")->place());
auto quantize_type =
static_cast<std::string>(context.Attr<std::string>("quantize_type"));
if (quantize_type == std::string("range_abs_max")) {
context.Output<framework::Tensor>("OutScales")
->mutable_data<T>(
context.Input<framework::Tensor>("InScales")->place());
context.Output<framework::Tensor>("OutCurrentIter")
->mutable_data<T>(
context.Input<framework::Tensor>("InCurrentIter")->place());
} }
};
T scale = T(1); template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
int window_size = context.Attr<int>("window_size");
T bin_cnt = (T)((1 << (context.Attr<int>("bit_length") - 1)) - 1);
if (quantize_type == std::string("abs_max")) {
auto* saving_scale = context.Output<framework::Tensor>("OutMovingScale");
scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
auto& device_ctx = context.template device_context<DeviceContext>(); template <typename T>
auto* scale_list = context.Output<framework::Tensor>("OutScales"); struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
math::SetConstant<DeviceContext, T> scalar; void operator()(const platform::CUDADeviceContext& ctx,
scale_list->mutable_data<T>(context.GetPlace()); const framework::Tensor& in, const framework::Tensor& scale,
scalar(device_ctx, scale_list, static_cast<T>(0)); const int bin_cnt, framework::Tensor* out) {
auto* iter = context.Output<framework::Tensor>("OutCurrentIter"); int num = in.numel();
iter->mutable_data<T>(context.GetPlace()); int block = 1024;
scalar(device_ctx, iter, static_cast<T>(0)); int grid = (block - 1 + num) / block;
} else if (quantize_type == std::string("range_abs_max")) {
auto* moving_scale = const_cast<framework::Tensor*>(
context.Input<framework::Tensor>("InMovingScale"));
if (is_test) {
scale = moving_scale->mutable_data<T>(platform::CPUPlace())[0];
} else {
auto* it = const_cast<framework::Tensor*>(
context.Input<framework::Tensor>("InCurrentIter"));
auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
int* last_iter = it->mutable_data<int>(platform::CPUPlace());
int* current_iter = iter->mutable_data<int>(platform::CPUPlace());
auto* scale_list = context.Output<framework::Tensor>("OutScales");
auto* saving_scale =
context.Output<framework::Tensor>("OutMovingScale");
scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
scale = FindRangeAbsMax(device_ctx, scale_list, saving_scale, scale,
window_size, current_iter[0]);
(*current_iter) = (*last_iter) + 1;
}
} else if (quantize_type == std::string("moving_average_abs_max")) {
auto* moving_scale = const_cast<framework::Tensor*>(
context.Input<framework::Tensor>("InMovingScale"));
if (is_test) {
scale = moving_scale->mutable_data<T>(platform::CPUPlace())[0];
} else {
scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
auto* saving_scale =
context.Output<framework::Tensor>("OutMovingScale");
scale = FindMovingAverageAbsMmax(
const_cast<framework::Tensor*>(moving_scale), saving_scale, scale);
}
}
ApplySaturateGpu<T>(device_ctx, in->numel(), in->data<T>(), const T* in_data = in.data<T>();
tensor->mutable_data<T>(in->place()), -scale, scale); const T* scale_data = scale.data<T>();
scale = bin_cnt / scale; T* out_data = out->mutable_data<T>(ctx.GetPlace());
auto& dev = ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
*context.template device_context<DeviceContext>().eigen_device(); in_data, scale_data, bin_cnt, num, out_data);
auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
auto eigen_in = framework::EigenVector<T>::Flatten(*tensor);
eigen_out.device(dev) = (scale * eigen_in).round();
} }
}; };
template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_CUDA_KERNEL(fake_quantize, namespace ops = paddle::operators;
paddle::operators::FakeQuantizeCUDAKernel< using CUDA = paddle::platform::CUDADeviceContext;
paddle::platform::CUDADeviceContext, float>, REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max,
paddle::operators::FakeQuantizeCUDAKernel< ops::FakeQuantizeAbsMaxKernel<CUDA, float>);
paddle::platform::CUDADeviceContext, double>); REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max,
ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float>);
...@@ -17,137 +17,91 @@ limitations under the License. */ ...@@ -17,137 +17,91 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/clip_op.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using platform::Transform; template <typename DeviceContext, typename T>
struct FindAbsMaxFunctor {
void operator()(const DeviceContext& ctx, const T* in, const int num, T* out);
};
template <typename DeviceContext, typename T>
struct ClipAndFakeQuantFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in,
const framework::Tensor& scale, const int bin_cnt,
framework::Tensor* out);
};
template <typename DeviceContext, typename T>
struct FindRangeAbsMaxFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& cur_scale,
const framework::Tensor& last_scale,
const framework::Tensor& iter, const int window_size,
framework::Tensor* scales_arr, framework::Tensor* out_scale);
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class FakeQuantizeKernel : public framework::OpKernel<T> { class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
public: public:
T FindAbsMax(framework::Tensor* in, int n) const { void Compute(const framework::ExecutionContext& context) const override {
T* p = in->mutable_data<T>(platform::CPUPlace()); auto* in = context.Input<framework::Tensor>("X");
T abs_max = (T)0.00000001;
for (int i = 0; i < n; i++) { auto* out = context.Output<framework::Tensor>("Out");
T tmp = fabs(p[i]); auto* out_scale = context.Output<framework::Tensor>("OutScale");
if (tmp > abs_max) abs_max = tmp; T* out_s = out_scale->mutable_data<T>(context.GetPlace());
}
return T(abs_max);
}
T FindRangeAbsMax(framework::Tensor* scale_list, framework::Tensor* out_scale,
const T& cur_scale, int window_size,
int current_iter) const {
T* sl = scale_list->mutable_data<T>(platform::CPUPlace());
T remove_tmp = sl[current_iter];
sl[current_iter] = cur_scale;
T& max_scale = out_scale->mutable_data<T>(platform::CPUPlace())[0];
if (max_scale < cur_scale) {
max_scale = cur_scale;
} else if (fabs(remove_tmp - max_scale) < 1e-6) {
int size = (current_iter > window_size) ? window_size : current_iter;
max_scale = T(FindAbsMax(scale_list, size));
}
return max_scale;
}
T FindMovingAverageAbsMmax(framework::Tensor* in_scale, int bit_length = context.Attr<int>("bit_length");
framework::Tensor* out_scale, int bin_cnt = std::pow(2, bit_length - 1) - 1;
const T& cur_scale) const {
T* ins = in_scale->mutable_data<T>(platform::CPUPlace()); auto& dev_ctx = context.template device_context<DeviceContext>();
T* outs = out_scale->mutable_data<T>(platform::CPUPlace()); const T* in_data = in->data<T>();
outs[0] = 0.9 * cur_scale + 0.1 * ins[0]; FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in_data, in->numel(), out_s);
return T(outs[0]); ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
bin_cnt, out);
} }
};
virtual void Compute(const framework::ExecutionContext& context) const { template <typename DeviceContext, typename T>
auto* tensor = context.Output<framework::Tensor>("Out"); class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X"); auto* in = context.Input<framework::Tensor>("X");
const bool is_test = context.Attr<bool>("is_test"); auto* in_scale = context.Input<framework::Tensor>("InScale");
tensor->mutable_data<T>(in->place());
auto* oms_tensor = context.Output<framework::Tensor>("OutMovingScale");
oms_tensor->mutable_data<T>(in->place());
auto quantize_type =
static_cast<std::string>(context.Attr<std::string>("quantize_type"));
if (quantize_type == std::string("range_abs_max")) {
auto* oss_tensor = context.Output<framework::Tensor>("OutScales");
oss_tensor->mutable_data<T>(
context.Input<framework::Tensor>("InScales")->place());
auto* oci_tensor = context.Output<framework::Tensor>("OutCurrentIter");
oci_tensor->mutable_data<T>(
context.Input<framework::Tensor>("InCurrentIter")->place());
}
T scale = static_cast<T>(1); auto* out = context.Output<framework::Tensor>("Out");
int window_size = context.Attr<int>("window_size"); out->mutable_data<T>(context.GetPlace());
bool is_test = context.Attr<bool>("is_test");
int bit_length = context.Attr<int>("bit_length"); int bit_length = context.Attr<int>("bit_length");
int bin_cnt = std::pow(2, bit_length - 1) - 1; int bin_cnt = std::pow(2, bit_length - 1) - 1;
auto& dev_ctx = context.template device_context<DeviceContext>();
auto& dev = // testing
*context.template device_context<DeviceContext>().eigen_device();
auto raw_in = framework::EigenVector<T>::Flatten(*in);
if (quantize_type == std::string("abs_max")) {
auto* saving_scale = context.Output<framework::Tensor>("OutMovingScale");
auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
scale_out.device(dev) = raw_in.abs().maximum();
scale = scale_out(0);
auto& device_ctx = context.template device_context<DeviceContext>();
auto* scale_list = context.Output<framework::Tensor>("OutScales");
math::SetConstant<DeviceContext, T> scalar;
scale_list->mutable_data<T>(context.GetPlace());
scalar(device_ctx, scale_list, static_cast<T>(0));
auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
iter->mutable_data<T>(context.GetPlace());
scalar(device_ctx, iter, static_cast<T>(0));
} else if (quantize_type == std::string("range_abs_max")) {
auto* moving_scale = context.Input<framework::Tensor>("InMovingScale");
if (is_test) {
scale = moving_scale->data<T>()[0];
} else {
auto* it = context.Input<framework::Tensor>("InCurrentIter");
auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
const int* last_iter = it->data<int>();
int* current_iter = iter->mutable_data<int>(platform::CPUPlace());
auto* scale_list = context.Output<framework::Tensor>("OutScales");
auto* saving_scale =
context.Output<framework::Tensor>("OutMovingScale");
auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
scale_out.device(dev) = raw_in.abs().maximum();
scale = saving_scale->mutable_data<T>(platform::CPUPlace())[0];
scale = FindRangeAbsMax(scale_list, saving_scale, scale, window_size,
current_iter[0]);
saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
(*current_iter) = (*last_iter) + 1;
}
} else if (quantize_type == std::string("moving_average_abs_max")) {
auto* moving_scale = context.Input<framework::Tensor>("InMovingScale");
if (is_test) { if (is_test) {
scale = moving_scale->data<T>()[0]; ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *in_scale,
} else { bin_cnt, out);
auto* saving_scale = return;
context.Output<framework::Tensor>("OutMovingScale");
auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
scale_out.device(dev) = raw_in.abs().maximum();
scale = saving_scale->mutable_data<T>(platform::CPUPlace())[0];
scale = FindMovingAverageAbsMmax(
const_cast<framework::Tensor*>(moving_scale), saving_scale, scale);
saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
}
} }
Transform<DeviceContext> trans; // training
trans(context.template device_context<DeviceContext>(), in->data<T>(), auto* out_scale = context.Output<framework::Tensor>("OutScale");
in->data<T>() + in->numel(), tensor->mutable_data<T>(in->place()), auto* out_scales = context.Output<framework::Tensor>("OutScales");
ClipFunctor<T>(-scale, scale)); auto* iter = context.Input<framework::Tensor>("Iter");
auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
auto eigen_in = framework::EigenVector<T>::Flatten(*tensor); int window_size = context.Attr<int>("window_size");
eigen_out.device(dev) = (bin_cnt / scale * eigen_in).round(); out_scale->mutable_data<T>(context.GetPlace());
framework::Tensor cur_scale;
T* cur_scale_data = cur_scale.mutable_data<T>({1}, context.GetPlace());
FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(), in->numel(),
cur_scale_data);
FindRangeAbsMaxFunctor<DeviceContext, T>()(dev_ctx, cur_scale, *in_scale,
*iter, window_size, out_scales,
out_scale);
ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
bin_cnt, out);
} }
}; };
......
...@@ -157,6 +157,116 @@ class FlattenGradOp : public framework::OperatorBase { ...@@ -157,6 +157,116 @@ class FlattenGradOp : public framework::OperatorBase {
} }
}; };
// FIXME(zcd): flatten2 adds an intermediate output (XShape) based on flatten.
// XShape is used to carry the shape and lod of X, which will be used in
// flatten_grad; in this way, the framework can reuse the memory of X
// immediately after flatten2_op is finished.
// Considering compatibility issues, we could not fix flatten2_op
class Flatten2OpInferShape : public FlattenOpInferShape {
public:
void operator()(framework::InferShapeContext *ctx) const override {
FlattenOpInferShape::operator()(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output (XShape) of Flatten op should not be null.");
const auto &in_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(in_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < in_dims.size(); ++i) {
xshape_dims[i + 1] = in_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", "XShape");
}
};
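A small standalone illustration of the XShape convention used by Flatten2OpInferShape above and Flatten2GradInferShape below: the leading entry is a placeholder 0, and the gradient path slices it off to recover the dims of X:

#include <cstdio>
#include <vector>

int main() {
  std::vector<long> x_dims = {2, 3, 4};
  // Forward: xshape_dims[0] = 0, xshape_dims[i + 1] = x_dims[i]
  std::vector<long> xshape_dims(x_dims.size() + 1, 0);
  for (size_t i = 0; i < x_dims.size(); ++i) xshape_dims[i + 1] = x_dims[i];
  // Backward: drop the leading 0 to get the original dims back.
  std::vector<long> recovered(xshape_dims.begin() + 1, xshape_dims.end());
  for (long d : recovered) std::printf("%ld ", d);  // prints: 2 3 4
  std::printf("\n");
  return 0;
}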
class Flatten2Op : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axis = Attr<int>("axis");
auto in_dims =
scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims);
framework::AttributeMap attrs;
attrs["shape"] = out_dims;
attrs["inplace"] = false;
// Invoke Reshape Op
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class Flatten2OpMaker : public FlattenOpMaker {
public:
void Make() override {
FlattenOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in FlattenGradOp.")
.AsIntermediate();
}
};
class Flatten2GradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("flatten2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Flatten2GradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("XShape"),
"Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = context->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
context->SetOutputDim(framework::GradVarName("X"), x_dims);
context->ShareLoD("XShape", framework::GradVarName("X"));
}
};
class Flatten2GradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto xshape_name = Input("XShape");
auto xshape_dims =
scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims);
attrs["inplace"] = false;
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {dout_name}}, {"Shape", {}}},
{{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
reshape_op->Run(scope, place);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -167,3 +277,8 @@ REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker, ...@@ -167,3 +277,8 @@ REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
ops::FlattenOpInferShape, ops::FlattenOpInferShape,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape); REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape);
REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker,
ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker);
REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp,
ops::Flatten2GradInferShape);
...@@ -13,16 +13,13 @@ See the License for the specific language governing permissions and ...@@ -13,16 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/fusion_gru_op.h" #include "paddle/fluid/operators/fusion_gru_op.h"
#include <cstring> // for memcpy
#include <string> #include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/fluid/operators/math/fc_compute.h" #include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/gru_compute.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -35,12 +32,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -35,12 +32,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"Input(WeightH) of GRU should not be null."); "Input(WeightH) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null."); PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"), PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(BatchedGate) of GRU should not be null."); "Output(ReorderedH0) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"), PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchResetHiddenPrev) of GRU should not be null."); "Output(BatchedInput) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
"Output(BatchedHidden) of GRU should not be null."); "Output(BatchedOut) of GRU should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"), PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of GRU should not be null."); "Output(Hidden) of GRU should not be null.");
...@@ -83,12 +80,16 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -83,12 +80,16 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
} }
framework::DDim out_dims({x_dims[0], frame_size}); framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims); ctx->SetOutputDim("Hidden", out_dims);
ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedHidden", out_dims); ctx->SetOutputDim("BatchedOut", out_dims);
ctx->SetOutputDim("BatchResetHiddenPrev", out_dims);
ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Hidden");
int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; int xx_width;
if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wx_dims[1];
} else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
}
ctx->SetOutputDim("XX", {x_dims[0], xx_width}); ctx->SetOutputDim("XX", {x_dims[0], xx_width});
ctx->ShareLoD("X", "XX"); ctx->ShareLoD("X", "XX");
} }
...@@ -115,22 +116,29 @@ void FusionGRUOpMaker::Make() { ...@@ -115,22 +116,29 @@ void FusionGRUOpMaker::Make() {
"(Tensor) The FC weight with shape (M x 3D)," "(Tensor) The FC weight with shape (M x 3D),"
"where M is the dim size of x, D is the hidden size. "); "where M is the dim size of x, D is the hidden size. ");
AddInput("WeightH", AddInput("WeightH",
"(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. "); "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. "
"This weight is not exactly D x 3D as: {W_update, W_reset, W_state}"
"Acutally they are D x 2D and D x D two part weights."
"{W_update, W_reset; W_state}"
"{D x (D + D); D x D}");
AddInput("Bias", AddInput("Bias",
"(Tensor, optional) (1 x 3D)." "(Tensor, optional) (1 x 3D)."
"Almost same as GRUOp." "Almost same as GRUOp."
"Note: if have FC bias it should be added on this bias.") "Note: if have FC bias it should be added on this bias.")
.AsDispensable(); .AsDispensable();
AddOutput("ReorderedH0", "(Tensor) (N x D), which N is the min-batch size.")
.AsIntermediate();
AddOutput("XX", AddOutput("XX",
"(LoDTensor) the result after X * WeightX (size is T x 4D)" "(LoDTensor) the result after X * WeightX (size is T x 3D)"
" or batched_X (size is T x M), this will be automatically chosen," " or batched_X (size is T x M), this will be automatically chosen,"
" where T is the total time steps in this mini-batch," " where T is the total time steps in this mini-batch,"
" D is the hidden size, M is the dim size of x input.") " D is the hidden size, M is the dim size of x input.")
.AsIntermediate(); .AsIntermediate();
AddOutput("BatchedGate", "(LoDTensor) Same as GRUOp").AsIntermediate(); AddOutput("BatchedInput",
AddOutput("BatchResetHiddenPrev", "(LoDTensor) (T x 3D) Same as GRUOp.") "(LoDTensor) This is the batched result of input X"
"or the batched result after fc, shape (T x 3D)")
.AsIntermediate(); .AsIntermediate();
AddOutput("BatchedHidden", "(LoDTensor) (T X D) Same as GRUOp.") AddOutput("BatchedOut", "(LoDTensor) (T X D) save batched hidden.")
.AsIntermediate(); .AsIntermediate();
AddOutput("Hidden", "(LoDTensor) (T x D) Same as GRUOp"); AddOutput("Hidden", "(LoDTensor) (T x D) Same as GRUOp");
AddAttr<std::string>("activation", AddAttr<std::string>("activation",
...@@ -146,6 +154,10 @@ void FusionGRUOpMaker::Make() { ...@@ -146,6 +154,10 @@ void FusionGRUOpMaker::Make() {
"(bool, defalut: False) " "(bool, defalut: False) "
"whether to compute reversed GRU.") "whether to compute reversed GRU.")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("use_seq",
"(bool, defalut: True) "
"whether to use seq mode to compute GRU.")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
The Fusion complete GRU Operator. The Fusion complete GRU Operator.
This operator fuses the fully-connected operator into GRU, This operator fuses the fully-connected operator into GRU,
...@@ -153,172 +165,261 @@ more details can refer to GRU op. ...@@ -153,172 +165,261 @@ more details can refer to GRU op.
)DOC"); )DOC");
} }
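A tiny sketch (sizes made up for illustration) of the WeightH layout documented above and the pointer offsets the kernel below relies on, where the W_state block starts D * 2D elements after the start of the weight:

#include <vector>

int main() {
  const int D = 4;                        // hidden size
  std::vector<float> wh(D * 3 * D, 0.f);  // stored as D x 3D
  const float* wh_data = wh.data();                   // {W_update, W_reset}: D x 2D
  const float* wh_state_data = wh_data + D * 2 * D;   // W_state: D x D
  (void)wh_state_data;
  return 0;
}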
template <typename DeviceContext, typename T> template <typename T>
inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src,
framework::Vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace());
row_shuffle(ctx, src, index_lod, dst, indexed_src);
}
template <typename DeviceContext, typename T>
class FusionGRUKernel : public framework::OpKernel<T> { class FusionGRUKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<LoDTensor>("X"); if (ctx.Attr<bool>("use_seq")) {
auto* wx = ctx.Input<Tensor>("WeightX"); SeqCompute(ctx);
auto* wh = ctx.Input<Tensor>("WeightH"); } else {
auto* bias = ctx.Input<Tensor>("Bias"); BatchCompute(ctx);
auto* h0 = ctx.Input<Tensor>("H0"); }
}
auto* xx = ctx.Output<LoDTensor>("XX");
auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate"); #define INIT_VEC_FUNC \
auto* batch_reset_hidden_prev = std::function<void(const int, const T *, T *)> act_gate, act_state; \
ctx.Output<LoDTensor>("BatchResetHiddenPrev"); std::function<void(const int, const T*, const T*, const T*, T*)> cross; \
auto* batch_hidden = ctx.Output<LoDTensor>("BatchedHidden"); auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); auto& act_state_str = ctx.Attr<std::string>("activation"); \
if (platform::jit::MayIUse(platform::jit::avx)) { \
math::VecActivations<T, platform::jit::avx> act_functor; \
act_gate = act_functor(act_gate_str); \
act_state = act_functor(act_state_str); \
cross = math::vec_cross<T, platform::jit::avx>; \
} else { \
math::VecActivations<T, platform::jit::isa_any> act_functor; \
act_gate = act_functor(act_gate_str); \
act_state = act_functor(act_state_str); \
cross = math::vec_cross<T, platform::jit::isa_any>; \
}
#define INIT_BASE_INPUT_OUTPUT \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* wh = ctx.Input<Tensor>("WeightH"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* xx = ctx.Output<LoDTensor>("XX"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); bool is_reverse = ctx.Attr<bool>("is_reverse");
#define INIT_BASE_SIZES \
auto x_dims = x->dims(); /* T x M*/ \
auto wh_dims = wh->dims(); /* D x 3D*/ \
const int total_T = x_dims[0]; \
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D3 = wh_dims[1]; \
const int D2 = D * 2;
void SeqCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X");
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
INIT_VEC_FUNC
auto x_lod = x->lod();
const int N = x_lod[0].size() - 1;
const T* x_data = x->data<T>();
const T* h0_data = h0 ? h0->data<T>() : nullptr;
const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>();
const T* wh_state_data = wh_data + D * D2;
T* xx_data = xx->mutable_data<T>(ctx.GetPlace()); T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace()); T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
batch_reset_hidden_prev->mutable_data<T>(ctx.GetPlace());
batch_hidden->mutable_data<T>(ctx.GetPlace()); auto blas = math::GetBlas<DeviceContext, T>(ctx);
hidden_out->mutable_data<T>(ctx.GetPlace()); math::FCCompute<DeviceContext, T>(blas, total_T, D3, M, x_data, wx_data,
xx_data,
bias ? bias->data<T>() : nullptr);
int xx_offset = D3;
int gate_offset = D;
if (is_reverse) {
const int offset = (total_T - 1) * D;
xx_data = xx_data + offset * 3;
hidden_out_data = hidden_out_data + offset;
xx_offset = -D3;
gate_offset = -D;
}
auto move_step = [&]() {
xx_data = xx_data + xx_offset;
hidden_out_data = hidden_out_data + gate_offset;
};
for (int i = 0; i < N; ++i) {
int bid = is_reverse ? N - 1 - i : i;
int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
const T* prev_hidden_data = nullptr;
int tstart = 0;
if (h0_data) {
prev_hidden_data = h0_data + bid * D;
} else {
// W: {W_update, W_reset; W_state}
// update gate
act_gate(D, xx_data, xx_data);
// state gate
act_state(D, xx_data + D2, xx_data + D2);
// out = a*b
blas.VMUL(D, xx_data, xx_data + D2, hidden_out_data);
// save prev
prev_hidden_data = hidden_out_data;
tstart = 1;
move_step();
}
for (int step = tstart; step < seq_len; ++step) {
// gemm prev * (Wu + Wr)
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast<T>(1),
prev_hidden_data, D, wh_data, D2, static_cast<T>(1), xx_data,
D3);
act_gate(D2, xx_data, xx_data);
// rt = rt*ht_1 inplace result
blas.VMUL(D, prev_hidden_data, xx_data + D, hidden_out_data);
// gemm rt * Ws
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast<T>(1),
hidden_out_data, D, wh_state_data, D, static_cast<T>(1),
xx_data + D2, D3);
act_state(D, xx_data + D2, xx_data + D2);
// out = zt*ht~ + (1-zt)*ht_1
cross(D, xx_data, xx_data + D2, prev_hidden_data, hidden_out_data);
// save prev
prev_hidden_data = hidden_out_data;
move_step();
}
}
}
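Each step above ends with cross(...), which, per the inline comments, blends the candidate state with the previous hidden state using the update gate. A hypothetical host-side version of that blend, assuming exactly the semantics the comments state:

#include <cstdio>

// out[i] = zt[i] * ht_tilde[i] + (1 - zt[i]) * ht_prev[i]
void CrossSketch(int n, const float* zt, const float* ht_tilde,
                 const float* ht_prev, float* out) {
  for (int i = 0; i < n; ++i) {
    out[i] = zt[i] * ht_tilde[i] + (1.f - zt[i]) * ht_prev[i];
  }
}

int main() {
  const float zt[2] = {0.2f, 0.8f};
  const float ht_tilde[2] = {1.f, 1.f};
  const float ht_prev[2] = {0.f, 0.f};
  float out[2];
  CrossSketch(2, zt, ht_tilde, ht_prev, out);
  std::printf("%f %f\n", out[0], out[1]);  // 0.200000 0.800000
  return 0;
}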
void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X");
if (x->lod()[0].size() == 2) {
SeqCompute(ctx);
return;
}
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
INIT_VEC_FUNC
auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
auto* batched_out = ctx.Output<LoDTensor>("BatchedOut");
const T* x_data = x->data<T>(); const T* x_data = x->data<T>();
const T* wx_data = wx->data<T>(); const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>(); const T* wh_data = wh->data<T>();
auto x_dims = x->dims(); T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
auto wx_dims = wx->dims(); T* batched_input_data = batched_input->mutable_data<T>(ctx.GetPlace());
T* batched_out_data = batched_out->mutable_data<T>(ctx.GetPlace());
hidden_out->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch; math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
if (x_dims[1] > wx_dims[1]) { if (M > D3) {
math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1], math::FCCompute<DeviceContext, T>(blas, total_T, D3, M, x_data, wx_data,
x_data, wx_data, xx_data, xx_data,
bias ? bias->data<T>() : NULL); bias ? bias->data<T>() : nullptr);
to_batch(dev_ctx, *xx, batched_gate, true, is_reverse); to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
} else { } else {
to_batch(dev_ctx, *x, xx, true, is_reverse); to_batch(dev_ctx, *x, xx, true, is_reverse);
batched_gate->set_lod(xx->lod()); batched_input->set_lod(xx->lod());
math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1], math::FCCompute<DeviceContext, T>(blas, total_T, D3, M, xx_data, wx_data,
xx_data, wx_data, batched_gate_data, batched_input_data,
bias ? bias->data<T>() : NULL); bias ? bias->data<T>() : nullptr);
} }
int frame_size = static_cast<int>(wx_dims[1] / 3); auto batched_lod = batched_input->lod();
math::GRUMetaValue<T> gru_value; const auto& seq_order = batched_lod[2];
gru_value.gate_weight = const_cast<T*>(wh_data); const int max_bs = seq_order.size();
gru_value.state_weight = reordered_h0->Resize({max_bs, D});
const_cast<T*>(wh_data + 2 * frame_size * frame_size);
Tensor ordered_h0;
framework::Vector<size_t> order(batched_gate->lod()[2]);
int tstart = 0;
T* prev_hidden_data = nullptr;
if (h0) { if (h0) {
ReorderInitState<DeviceContext, T>( // reorder h0
ctx.template device_context<DeviceContext>(), *h0, order, &ordered_h0, T* reordered_h0_data = reordered_h0->mutable_data<T>(ctx.GetPlace());
true); const T* h0_data = h0->data<T>();
gru_value.prev_out_value = ordered_h0.data<T>(); prev_hidden_data = reordered_h0_data;
} else { size_t sz = sizeof(T) * D;
gru_value.prev_out_value = nullptr; for (int i = 0; i < max_bs; ++i) {
std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
reordered_h0_data += D;
} }
auto batch_starts = batched_gate->lod()[0]; } else {
size_t seq_len = batch_starts.size() - 1; // compute without h0
auto active_node = T* cur_in_data = batched_input_data;
math::detail::GetActivationType(ctx.Attr<std::string>("activation")); T* cur_out_data = batched_out_data;
auto active_gate = math::detail::GetActivationType( // W: {W_update, W_reset; W_state}
ctx.Attr<std::string>("gate_activation")); for (int i = 0; i < max_bs; ++i) {
// update gate
#ifdef PADDLE_WITH_MKLML act_gate(D, cur_in_data, cur_in_data);
// use MKL packed to speedup GEMM // state gate
if (FLAGS_paddle_num_threads >= 4) { act_state(D, cur_in_data + D2, cur_in_data + D2);
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); // out = a*b
T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, blas.VMUL(D, cur_in_data, cur_in_data + D2, cur_out_data);
frame_size * 2 /*width of weight*/, // add offset
frame_size /*height of height*/); cur_in_data += D3;
PADDLE_ENFORCE(packed_gate); cur_out_data += D;
blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2,
frame_size, T(1.0), gru_value.gate_weight, frame_size * 2,
packed_gate);
T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
frame_size /*width of weight*/,
frame_size /*height of height*/);
PADDLE_ENFORCE(packed_state);
blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size,
frame_size, T(1.0), gru_value.state_weight, frame_size,
packed_state);
for (size_t n = 0; n < seq_len; n++) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
int cur_batch_size = bend - bstart;
Tensor gate_t = batched_gate->Slice(bstart, bend);
Tensor reset_hidden_prev_t =
batch_reset_hidden_prev->Slice(bstart, bend);
Tensor hidden_t = batch_hidden->Slice(bstart, bend);
gru_value.output_value = hidden_t.data<T>();
gru_value.gate_value = gate_t.data<T>();
gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
if (gru_value.prev_out_value) {
blas.GEMM_COMPUTE(
CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2,
frame_size, gru_value.prev_out_value, frame_size, packed_gate,
frame_size * 2, T(1), gru_value.gate_value, frame_size * 3);
} }
tstart = 1;
math::detail::forward_reset_output( prev_hidden_data = batched_out_data;
math::detail::forward::gru_resetOutput<T>(), gru_value, frame_size,
cur_batch_size, active_gate);
if (gru_value.prev_out_value) {
blas.GEMM_COMPUTE(
CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size,
gru_value.reset_output_value, frame_size, packed_state,
frame_size, T(1), gru_value.gate_value + frame_size * 2,
frame_size * 3);
} }
// Then start from next
math::detail::forward_final_output( const T* wh_state_data = wh_data + D * D2;
math::detail::forward::gru_finalOutput<T>(), gru_value, frame_size, const auto& batch_starts = batched_lod[0];
cur_batch_size, active_node); const int max_seq_len = batch_starts.size() - 1;
batched_input_data = batched_input_data + tstart * max_bs * D3;
gru_value.prev_out_value = gru_value.output_value; batched_out_data = batched_out_data + tstart * max_bs * D;
for (int step = tstart; step < max_seq_len; ++step) {
const int cur_bs = batch_starts[step + 1] - batch_starts[step];
// gemm prev * (Wu + Wr)
blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D2, D, static_cast<T>(1),
prev_hidden_data, D, wh_data, D2, static_cast<T>(1),
batched_input_data, D3);
T* cur_batched_data = batched_input_data;
T* cur_out_data = batched_out_data;
T* cur_prev_hidden_data = prev_hidden_data;
for (int i = 0; i < cur_bs; ++i) {
act_gate(D2, cur_batched_data, cur_batched_data);
// rt = rt*ht_1 inplace result
blas.VMUL(D, cur_prev_hidden_data, cur_batched_data + D, cur_out_data);
cur_batched_data += D3;
cur_prev_hidden_data += D;
cur_out_data += D;
} }
blas.GEMM_FREE(packed_gate); cur_batched_data = batched_input_data;
blas.GEMM_FREE(packed_state); cur_out_data = batched_out_data;
} else { blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D, D, static_cast<T>(1),
#endif cur_out_data, D, wh_state_data, D, static_cast<T>(1),
for (size_t n = 0; n < seq_len; n++) { cur_batched_data + D2, D3);
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); cur_prev_hidden_data = prev_hidden_data;
int cur_batch_size = bend - bstart; for (int i = 0; i < cur_bs; ++i) {
// ht~ = act_state(...)
Tensor gate_t = batched_gate->Slice(bstart, bend); act_state(D, cur_batched_data + D2, cur_batched_data + D2);
Tensor reset_hidden_prev_t = // out = zt*ht~ + (1-zt)*ht_1
batch_reset_hidden_prev->Slice(bstart, bend); cross(D, cur_batched_data, cur_batched_data + D2, cur_prev_hidden_data,
Tensor hidden_t = batch_hidden->Slice(bstart, bend); cur_out_data);
gru_value.output_value = hidden_t.data<T>();
gru_value.gate_value = gate_t.data<T>(); cur_batched_data += D3;
gru_value.reset_output_value = reset_hidden_prev_t.data<T>(); cur_prev_hidden_data += D;
cur_out_data += D;
math::GRUUnitFunctor<DeviceContext, T>::compute(
dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
active_gate);
gru_value.prev_out_value = gru_value.output_value;
} }
#ifdef PADDLE_WITH_MKLML prev_hidden_data = batched_out_data;
batched_out_data = cur_out_data;
batched_input_data = cur_batched_data;
} }
#endif
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq; math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
batch_hidden->set_lod(batched_gate->lod()); batched_out->set_lod(batched_lod);
to_seq(dev_ctx, *batch_hidden, hidden_out); to_seq(dev_ctx, *batched_out, hidden_out);
} }
#undef INIT_VEC_FUNC
#undef INIT_BASE_SIZES
#undef INIT_BASE_INPUT_OUTPUT
}; };
} // namespace operators } // namespace operators
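For reference, a minimal scalar sketch (illustrative only, not part of this change) of the fused GRU step that the batched loop above performs. It assumes the gate buffer holds [update, reset, candidate], each of size D, after the input and hidden GEMMs, with the candidate already containing the (r_t .* h_{t-1}) * W_state term; sigmoid/tanh stand in for the configurable act_gate/act_state.

#include <cmath>

inline float SigmoidRef(float x) { return 1.f / (1.f + std::exp(-x)); }

// gates = [z_t | r_t | h~_t], each of size D; prev_h and out_h are of size D.
void GruStepReference(int D, float* gates, const float* prev_h, float* out_h) {
  float* z = gates;          // update gate
  float* r = gates + D;      // reset gate
  float* c = gates + 2 * D;  // candidate (pre-activation)
  for (int i = 0; i < D; ++i) {
    z[i] = SigmoidRef(z[i]);
    // r_t is shown activated for completeness; in the fused kernel it has
    // already been folded into the candidate via (r_t .* h_{t-1}) * W_state
    r[i] = SigmoidRef(r[i]);
    c[i] = std::tanh(c[i]);
    // out = z_t * h~_t + (1 - z_t) * h_{t-1}, the same formula as vec_cross
    out_h[i] = z[i] * c[i] + (1.f - z[i]) * prev_h[i];
  }
}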
...@@ -327,6 +428,5 @@ class FusionGRUKernel : public framework::OpKernel<T> { ...@@ -327,6 +428,5 @@ class FusionGRUKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker, REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(fusion_gru, ops::FusionGRUKernel<float>,
fusion_gru, ops::FusionGRUKernel<paddle::platform::CPUDeviceContext, float>, ops::FusionGRUKernel<double>);
ops::FusionGRUKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -16,14 +16,10 @@ limitations under the License. */ ...@@ -16,14 +16,10 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/fc_compute.h" #include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
DEFINE_bool(seq_mode, true, "Use sequence mode");
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -42,10 +38,16 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -42,10 +38,16 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
"Output(Hidden) of LSTM should not be null."); "Output(Hidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Cell"), PADDLE_ENFORCE(ctx->HasOutput("Cell"),
"Output(Cell) of LSTM should not be null."); "Output(Cell) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"), PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedGate) of LSTM should not be null."); "Output(BatchedInput) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
"Output(BatchedGate) of LSTM should not be null."); "Output(BatchedHidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
"Output(BatchedCell) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
"Output(ReorderedC0) of LSTM should not be null.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
...@@ -87,23 +89,24 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -87,23 +89,24 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(b_dims[0], 1, PADDLE_ENFORCE_EQ(b_dims[0], 1,
"The first dimension of Input(Bias) should be 1."); "The first dimension of Input(Bias) should be 1.");
PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_peepholes"), auto use_peepholes = ctx->Attrs().Get<bool>("use_peepholes");
"Do not support peephole yet."); PADDLE_ENFORCE_EQ(b_dims[1], (use_peepholes ? 7 : 4) * frame_size,
PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
"The second dimension of Input(Bias) should be " "The second dimension of Input(Bias) should be "
"4 * %d if disable peepholes connection", "7 * %d if enable peepholes connection or"
frame_size); "4 * %d if disable peepholes",
frame_size, frame_size);
framework::DDim out_dims({x_dims[0], frame_size}); framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims); ctx->SetOutputDim("Hidden", out_dims);
ctx->SetOutputDim("Cell", out_dims); ctx->SetOutputDim("Cell", out_dims);
ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchCellPreAct", out_dims); ctx->SetOutputDim("BatchedHidden", out_dims);
ctx->SetOutputDim("BatchedCell", out_dims);
ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Hidden");
ctx->ShareLoD("X", "Cell"); ctx->ShareLoD("X", "Cell");
int xx_width; int xx_width;
if (FLAGS_seq_mode) { if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wx_dims[1]; xx_width = wx_dims[1];
} else { } else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
...@@ -169,9 +172,11 @@ void FusionLSTMOpMaker::Make() { ...@@ -169,9 +172,11 @@ void FusionLSTMOpMaker::Make() {
" where T is the total time steps in this mini-batch," " where T is the total time steps in this mini-batch,"
" D is the hidden size, M is the dim size of x input.") " D is the hidden size, M is the dim size of x input.")
.AsIntermediate(); .AsIntermediate();
AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate(); AddOutput("BatchedInput", "(LoDTensor) (T x 4D).").AsIntermediate();
AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).") AddOutput("BatchedHidden", "(LoDTensor) (T x D).").AsIntermediate();
.AsIntermediate(); AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate();
AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate();
AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate();
AddAttr<bool>("use_peepholes", AddAttr<bool>("use_peepholes",
"(bool, defalut: True) " "(bool, defalut: True) "
"whether to enable diagonal/peephole connections.") "whether to enable diagonal/peephole connections.")
...@@ -180,6 +185,10 @@ void FusionLSTMOpMaker::Make() { ...@@ -180,6 +185,10 @@ void FusionLSTMOpMaker::Make() {
"(bool, defalut: False) " "(bool, defalut: False) "
"whether to compute reversed LSTM.") "whether to compute reversed LSTM.")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("use_seq",
"(bool, defalut: True) "
"whether to use seq mode to compute.")
.SetDefault(true);
AddAttr<std::string>("gate_activation", AddAttr<std::string>("gate_activation",
"(string, default: sigmoid)" "(string, default: sigmoid)"
"The activation for input gate, forget gate and output " "The activation for input gate, forget gate and output "
...@@ -203,70 +212,76 @@ This operator fuse the X into LSTM, more details can refer to LSTM op. ...@@ -203,70 +212,76 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
)DOC"); )DOC");
} }
template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src,
framework::Vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace());
// TODO(TJ): check mem copy perf
row_shuffle(ctx, src, index_lod, dst, indexed_src);
}
template <typename T> template <typename T>
class FuisonLSTMKernel : public framework::OpKernel<T> { class FuisonLSTMKernel : public framework::OpKernel<T> {
public: public:
void SeqCompute(const framework::ExecutionContext& ctx) const { #define INIT_VEC_FUNC \
using DeviceContext = paddle::platform::CPUDeviceContext; std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; \
auto* x = ctx.Input<LoDTensor>("X"); auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto* h0 = ctx.Input<Tensor>("H0"); auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); \
auto* c0 = ctx.Input<Tensor>("C0"); auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); \
auto* wx = ctx.Input<Tensor>("WeightX"); if (platform::jit::MayIUse(platform::jit::avx)) { \
auto* wh = ctx.Input<Tensor>("WeightH"); math::VecActivations<T, platform::jit::avx> act_functor; \
auto* bias = ctx.Input<Tensor>("Bias"); act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
auto* xx = ctx.Output<LoDTensor>("XX"); act_cand = act_functor(act_cand_str); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); } else { \
auto* cell_out = ctx.Output<LoDTensor>("Cell"); math::VecActivations<T, platform::jit::isa_any> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
}
#define INIT_BASE_INPUT_OUTPUT \
auto* x = ctx.Input<LoDTensor>("X"); \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* c0 = ctx.Input<Tensor>("C0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* wh = ctx.Input<Tensor>("WeightH"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* xx = ctx.Output<LoDTensor>("XX"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
auto* cell_out = ctx.Output<LoDTensor>("Cell"); \
bool use_peepholes = ctx.Attr<bool>("use_peepholes"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); bool is_reverse = ctx.Attr<bool>("is_reverse");
std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; #define INIT_BASE_SIZES \
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); auto x_dims = x->dims(); /* T x M*/ \
auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); auto wh_dims = wh->dims(); /* D x 4D*/ \
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); const int M = x_dims[1]; \
if (platform::jit::MayIUse(platform::jit::avx)) { const int D = wh_dims[0]; \
math::VecActivations<T, platform::jit::avx> act_functor; const int D2 = D * 2; \
act_gate = act_functor(act_gate_str); const int D3 = D * 3; \
act_cell = act_functor(act_cell_str); const int D4 = wh_dims[1];
act_cand = act_functor(act_cand_str);
} else { void SeqCompute(const framework::ExecutionContext& ctx) const {
math::VecActivations<T, platform::jit::isa_any> act_functor; using DeviceContext = paddle::platform::CPUDeviceContext;
act_gate = act_functor(act_gate_str); INIT_BASE_INPUT_OUTPUT
act_cell = act_functor(act_cell_str); INIT_BASE_SIZES
act_cand = act_functor(act_cand_str); INIT_VEC_FUNC
}
auto x_lod = x->lod(); auto x_lod = x->lod();
auto x_dims = x->dims(); // T x M
auto wh_dims = wh->dims(); // D x 4D
const int total_T = x_dims[0]; const int total_T = x_dims[0];
const int N = x_lod[0].size() - 1; // batch size const int N = x_lod[0].size() - 1; // batch size
const int M = x_dims[1]; // x frame size
const int D = wh_dims[0];
const int D2 = D * 2;
const int D3 = D * 3;
const int D4 = wh_dims[1];
const T* x_data = x->data<T>(); const T* x_data = x->data<T>();
const T* h0_data = h0 ? h0->data<T>() : NULL; const T* h0_data = h0 ? h0->data<T>() : nullptr;
const T* c0_data = c0 ? c0->data<T>() : NULL; const T* c0_data = c0 ? c0->data<T>() : nullptr;
const T* bias_data = bias->data<T>();
const T* wc_data = bias_data + D4; // w_ic, w_fc, w_oc
const T* wx_data = wx->data<T>(); const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>(); const T* wh_data = wh->data<T>();
T* xx_data = xx->mutable_data<T>(ctx.GetPlace()); T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace()); T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace()); T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());
// use local variable
framework::DDim check_dims({3, D});
Tensor checked_cell; // w_ic * Ct-1, w_fc * Ct-1, w_oc * Ct
auto checked_cell_data =
checked_cell.mutable_data<T>(check_dims, ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data, math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data,
xx_data, bias->data<T>()); xx_data, bias->data<T>());
...@@ -290,199 +305,319 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -290,199 +305,319 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
int bid = is_reverse ? N - 1 - i : i; int bid = is_reverse ? N - 1 - i : i;
int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
const T* prev_cell_data = NULL; const T* prev_c_data = nullptr;
const T* prev_hidden_data = NULL; const T* prev_h_data = nullptr;
int tstart = 0; int tstart = 0;
if (h0_data) { if (h0_data) {
prev_hidden_data = h0_data + bid * D; prev_h_data = h0_data + bid * D;
prev_cell_data = c0_data + bid * D; prev_c_data = c0_data + bid * D;
} else { } else {
// W_ch, W_ih, W_fh, W_oh // If step == 0 and there is no initialized hidden state, that is to say
act_gate(D3, xx_data + D, xx_data + D); // the H0 is zeros. Then W_h * H_t-1 can be skipped
// ~C_t
act_cand(D, xx_data, xx_data); act_cand(D, xx_data, xx_data);
// cell out= input*tilde if (use_peepholes) {
// I_t, F_t
act_gate(D2, xx_data + D, xx_data + D);
} else {
// I_t, F_t, O_t
act_gate(D3, xx_data + D, xx_data + D);
}
// C_t = I_t * ~C_t
blas.VMUL(D, xx_data, xx_data + D, cell_out_data); blas.VMUL(D, xx_data, xx_data + D, cell_out_data);
if (use_peepholes) {
// + W_oc * C_t for peephole connection
blas.VMUL(D, wc_data + D2, cell_out_data, checked_cell_data + D2);
blas.VADD(D, xx_data + D3, checked_cell_data + D2, xx_data + D3);
// O_t
act_gate(D, xx_data + D3, xx_data + D3);
}
// hidden out= act_state(cellout) * outgate // hidden out= act_state(cellout) * outgate
act_cell(D, cell_out_data, xx_data + D2); act_cell(D, cell_out_data, xx_data + D2);
// H_t = O_t * act_state(C_t)
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data); blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev // prev
prev_hidden_data = hidden_out_data; prev_h_data = hidden_out_data;
prev_cell_data = cell_out_data; prev_c_data = cell_out_data;
tstart = 1;
tstart = 1;
move_step(); move_step();
} }
for (int step = tstart; step < seq_len; ++step) { for (int step = tstart; step < seq_len; ++step) {
// + W_h * H_t-1
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1), blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
prev_hidden_data, D, wh_data, D4, static_cast<T>(1), xx_data, prev_h_data, D, wh_data, D4, static_cast<T>(1), xx_data, D4);
D4);
// W_ch, W_ih, W_fh, W_oh // ~C_t
act_gate(D3, xx_data + D, xx_data + D);
act_cand(D, xx_data, xx_data); act_cand(D, xx_data, xx_data);
// a = forget * prev_cell if (use_peepholes) {
blas.VMUL(D, xx_data + D2, prev_cell_data, xx_data + D2); // + W_ic|W_fc * C_t-1 for peephole connection
blas.VMUL(D, wc_data, prev_c_data, checked_cell_data);
blas.VMUL(D, wc_data + D, prev_c_data, checked_cell_data + D);
blas.VADD(D2, xx_data + D, checked_cell_data, xx_data + D);
// I_t, F_t
act_gate(D2, xx_data + D, xx_data + D);
} else {
// I_t, F_t, O_t
act_gate(D3, xx_data + D, xx_data + D);
}
// b = input * tilde // F_t * C_t-1
blas.VMUL(D, xx_data + D2, prev_c_data, xx_data + D2);
// I_t * ~C_t
blas.VMUL(D, xx_data, xx_data + D, xx_data + D); blas.VMUL(D, xx_data, xx_data + D, xx_data + D);
// C_t = F_t * C_t-1 + I_t * ~C_t
// cell out= a+b
blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data); blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data);
if (use_peepholes) {
// + W_oc * C_t for peephole connection
blas.VMUL(D, wc_data + D2, cell_out_data, checked_cell_data + D2);
blas.VADD(D, xx_data + D3, checked_cell_data + D2, xx_data + D3);
// O_t
act_gate(D, xx_data + D3, xx_data + D3);
}
// hidden out= act_state(cellout) * outgate // hidden out= act_state(cellout) * outgate
act_cell(D, cell_out_data, xx_data + D2); act_cell(D, cell_out_data, xx_data + D2);
// H_t = O_t * act_state(C_t)
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data); blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev // prev
prev_hidden_data = hidden_out_data; prev_h_data = hidden_out_data;
prev_cell_data = cell_out_data; prev_c_data = cell_out_data;
move_step(); move_step();
} } // for each step in batch
} } // for each batch
} }
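A minimal per-element sketch (illustrative only, not from this diff) of one fused LSTM step with optional peepholes, following the gate layout used in SeqCompute above: gates = [~C_t, I_t, F_t, O_t], each of size D, already holding x_t*Wx + h_{t-1}*Wh + b; sigmoid/tanh stand in for act_gate/act_cand/act_cell, and wc is [w_ic, w_fc, w_oc] or null when peepholes are disabled.

#include <cmath>

inline float SigmoidRef(float x) { return 1.f / (1.f + std::exp(-x)); }

void LstmStepReference(int D, float* gates, const float* prev_c,
                       const float* wc /* [w_ic | w_fc | w_oc], 3*D or null */,
                       float* c_out, float* h_out) {
  float* cand = gates;        // ~C_t
  float* ig = gates + D;      // I_t
  float* fg = gates + 2 * D;  // F_t
  float* og = gates + 3 * D;  // O_t
  for (int i = 0; i < D; ++i) {
    cand[i] = std::tanh(cand[i]);
    float in = ig[i], fo = fg[i];
    if (wc) {  // peephole terms for I_t and F_t use C_{t-1}
      in += wc[i] * prev_c[i];
      fo += wc[D + i] * prev_c[i];
    }
    in = SigmoidRef(in);
    fo = SigmoidRef(fo);
    // C_t = F_t * C_{t-1} + I_t * ~C_t
    c_out[i] = fo * prev_c[i] + in * cand[i];
    float o = og[i];
    if (wc) o += wc[2 * D + i] * c_out[i];  // peephole for O_t uses C_t
    o = SigmoidRef(o);
    // H_t = O_t * act_state(C_t)
    h_out[i] = o * std::tanh(c_out[i]);
  }
}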
void BatchCompute(const framework::ExecutionContext& ctx) const { void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = platform::CPUDeviceContext; using DeviceContext = platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X"); INIT_BASE_INPUT_OUTPUT
auto* wx = ctx.Input<Tensor>("WeightX"); if (x->lod()[0].size() == 2) { // batch size == 1
auto* wh = ctx.Input<Tensor>("WeightH"); SeqCompute(ctx);
auto* bias = ctx.Input<Tensor>("Bias"); return;
auto* hidden_t0 = ctx.Input<Tensor>("H0"); }
auto* cell_t0 = ctx.Input<Tensor>("C0"); INIT_BASE_SIZES
INIT_VEC_FUNC
auto* xx = ctx.Output<LoDTensor>("XX");
auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate");
auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
auto* cell_out = ctx.Output<LoDTensor>("Cell");
bool is_reverse = ctx.Attr<bool>("is_reverse");
T* xx_data = xx->mutable_data<T>(ctx.GetPlace()); auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace()); auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
hidden_out->mutable_data<T>(ctx.GetPlace()); auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
cell_out->mutable_data<T>(ctx.GetPlace()); auto* batched_c_out = ctx.Output<LoDTensor>("BatchedCell");
auto* batched_h_out = ctx.Output<LoDTensor>("BatchedHidden");
const T* x_data = x->data<T>(); const T* x_data = x->data<T>();
const T* wx_data = wx->data<T>(); const T* wx_data = wx->data<T>();
auto x_dims = x->dims(); const T* wh_data = wh->data<T>();
auto wx_dims = wx->dims(); const T* bias_data = bias->data<T>();
const T* wc_data = bias_data + D4; // w_ic, w_fc, w_oc
auto place = ctx.GetPlace();
T* xx_data = xx->mutable_data<T>(place);
T* batched_input_data = batched_input->mutable_data<T>(place);
T* batched_c_out_data = batched_c_out->mutable_data<T>(place);
T* batched_h_out_data = batched_h_out->mutable_data<T>(place);
hidden_out->mutable_data<T>(place);
cell_out->mutable_data<T>(place);
// use local variable
framework::DDim check_dims({3, D});
Tensor checked_cell; // w_ic * Ct-1, w_fc * Ct-1, w_oc * Ct
auto checked_cell_data =
checked_cell.mutable_data<T>(check_dims, ctx.GetPlace());
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch; math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
if (x_dims[1] > wx_dims[1]) { if (M > D4) {
math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1], math::FCCompute<DeviceContext, T>(blas, x_dims[0], D4, M, x_data, wx_data,
x_data, wx_data, xx_data, xx_data, bias->data<T>());
bias->data<T>()); to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
} else { } else {
to_batch(dev_ctx, *x, xx, true, is_reverse); to_batch(dev_ctx, *x, xx, true, is_reverse);
batched_gate->set_lod(xx->lod()); batched_input->set_lod(xx->lod());
math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1], math::FCCompute<DeviceContext, T>(blas, x_dims[0], D4, M, xx_data,
xx_data, wx_data, batched_gate_data, wx_data, batched_input_data,
bias->data<T>()); bias->data<T>());
} }
int frame_size = static_cast<int>(wx_dims[1] / 4); auto batched_lod = batched_input->lod();
framework::DDim out_dims({x_dims[0], frame_size}); const auto& seq_order = batched_lod[2];
math::LstmMetaValue<T> lstm_value; const int max_bs = seq_order.size();
// no peephole reordered_h0->Resize({max_bs, D});
lstm_value.check_ig = nullptr; reordered_c0->Resize({max_bs, D});
lstm_value.check_fg = nullptr;
lstm_value.check_og = nullptr; T* prev_batch_h_data = nullptr;
lstm_value.prev_state_value = nullptr; T* prev_batch_c_data = nullptr;
Tensor ordered_c0; T* cur_batch_in_data = batched_input_data;
T* cur_batch_h_out_data = batched_h_out_data;
framework::Vector<size_t> order(batched_gate->lod()[2]); T* cur_batch_c_out_data = batched_c_out_data;
if (cell_t0) { auto move_step = [&](int bs) {
// Since the batch computing for LSTM reorders the input sequence cur_batch_in_data += bs * D4;
// according to their length. The initialized cell state also needs cur_batch_c_out_data += bs * D;
// to reorder. cur_batch_h_out_data += bs * D;
ReorderInitState<DeviceContext, T>(dev_ctx, *cell_t0, order, &ordered_c0, };
true);
lstm_value.prev_state_value = ordered_c0.data<T>(); int tstart = 0;
if (h0) {
// reorder h0, c0
T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
T* reordered_c0_data = reordered_c0->mutable_data<T>(place);
const T* h0_data = h0->data<T>();
const T* c0_data = c0->data<T>();
prev_batch_h_data = reordered_h0_data;
prev_batch_c_data = reordered_c0_data;
size_t sz = sizeof(T) * D;
for (int i = 0; i < max_bs; ++i) {
std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz);
reordered_h0_data += D;
reordered_c0_data += D;
} }
} else {
// Compute with no H0/C0
T* cur_in_data = cur_batch_in_data;
T* cur_c_out_data = cur_batch_c_out_data;
T* cur_h_out_data = cur_batch_h_out_data;
// If step == 0 and there is no initialized hidden state, that is to say
// the H0 is zeros. Then W_h * H_t-1 can be skipped.
// Use the local variable as here. for (int i = 0; i < max_bs; ++i) { // iterate each data in 1st batch
LoDTensor batch_hidden, batch_cell; // ~C_t
auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct"); act_cand(D, cur_in_data, cur_in_data);
batch_hidden.mutable_data<T>(out_dims, ctx.GetPlace());
batch_cell.mutable_data<T>(out_dims, ctx.GetPlace()); if (use_peepholes) {
batch_cell_pre_act->mutable_data<T>(out_dims, ctx.GetPlace()); // I_t, F_t
act_gate(D2, cur_in_data + D, cur_in_data + D);
auto batch_starts = batched_gate->lod()[0]; } else {
size_t max_seq_len = batch_starts.size() - 1; // I_t, F_t, O_t
auto gate_act = math::detail::GetActivationType( act_gate(D3, cur_in_data + D, cur_in_data + D);
ctx.Attr<std::string>("gate_activation"));
auto cell_act = math::detail::GetActivationType(
ctx.Attr<std::string>("cell_activation"));
auto cand_act = math::detail::GetActivationType(
ctx.Attr<std::string>("candidate_activation"));
for (size_t n = 0; n < max_seq_len; n++) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
Tensor gate_t = batched_gate->Slice(bstart, bend);
Tensor out_t = batch_hidden.Slice(bstart, bend);
Tensor cell_t = batch_cell.Slice(bstart, bend);
Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
int cur_batch_size = bend - bstart;
if (n > 0) {
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
int pre_h_end = pre_h_start + cur_batch_size;
auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
// TODO(TJ): use gemm directly
blas.MatMul(pre_hidden_t, false, *wh, false, static_cast<T>(1.0),
&gate_t, static_cast<T>(1.0));
} else if (hidden_t0) {
// TODO(TJ): move h0 outside for
// If n == 0 and there is no initialized hidden state, that is to say
// the H0 is zeros, the calculation W_h * H0 will be skipped.
// If n == 0 and there is initialized hidden state, calculate W_h * H0.
// Since the batch computing for LSTM reorders the input sequence
// according to their length. The initialized hidden state also needs
// to reorder.
Tensor ordered_h0;
ReorderInitState<DeviceContext, T>(dev_ctx, *hidden_t0, order,
&ordered_h0, true);
// TODO(TJ): use gemm directly
blas.MatMul(ordered_h0, false, *wh, false, static_cast<T>(1.0), &gate_t,
static_cast<T>(1.0));
} }
lstm_value.gate_value = gate_t.data<T>(); // C_t = I_t * ~C_t
lstm_value.output_value = out_t.data<T>(); blas.VMUL(D, cur_in_data, cur_in_data + D, cur_c_out_data);
lstm_value.state_value = cell_t.data<T>();
lstm_value.state_active_value = cell_pre_act_t.data<T>(); if (use_peepholes) {
math::LstmUnitFunctor<DeviceContext, T>::compute( // + W_oc * C_t for peephole connection
dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act, blas.VMUL(D, wc_data + D2, cur_c_out_data, checked_cell_data + D2);
cand_act); blas.VADD(D, cur_in_data + D3, checked_cell_data + D2,
lstm_value.prev_state_value = lstm_value.state_value; cur_in_data + D3);
// O_t
act_gate(D, cur_in_data + D3, cur_in_data + D3);
} }
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq; // hidden out= act_state(cellout) * outgate
batch_hidden.set_lod(batched_gate->lod()); act_cell(D, cur_c_out_data, cur_in_data + D2);
// restore the output hidden in LoDTensor from the batch hidden // H_t = O_t * act_state(C_t)
to_seq(dev_ctx, batch_hidden, hidden_out); blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
// move to next data in the same batch
cur_in_data += D4;
cur_c_out_data += D;
cur_h_out_data += D;
}
// move to data for next timestep
prev_batch_h_data = cur_batch_h_out_data;
prev_batch_c_data = cur_batch_c_out_data;
move_step(max_bs);
tstart = 1;
}
const auto& batch_starts = batched_lod[0];
const int max_seq_len = batch_starts.size() - 1;
for (int step = tstart; step < max_seq_len; ++step) {
const int cur_bs = batch_starts[step + 1] - batch_starts[step];
// + W_h * H_t-1
blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D4, D, static_cast<T>(1),
prev_batch_h_data, D, wh_data, D4, static_cast<T>(1),
cur_batch_in_data, D4);
T* cur_in_data = cur_batch_in_data;
T* cur_c_out_data = cur_batch_c_out_data;
T* cur_h_out_data = cur_batch_h_out_data;
T* prev_c_data = prev_batch_c_data; // NULL if no C0 in step0
T* prev_h_data = prev_batch_h_data; // NULL if no H0 in step0
auto next_data_in_batch = [&]() {
cur_in_data += D4;
cur_c_out_data += D;
cur_h_out_data += D;
prev_c_data = prev_c_data ? prev_c_data + D : nullptr;
prev_h_data = prev_h_data ? prev_h_data + D : nullptr;
};
for (int i = 0; i < cur_bs; ++i) { // iterate each data in same batch
// ~C_t
act_cand(D, cur_in_data, cur_in_data);
if (use_peepholes) {
// + W_ic|W_fc * C_t-1 for peephole connection
blas.VMUL(D, wc_data, prev_c_data, checked_cell_data);
blas.VMUL(D, wc_data + D, prev_c_data, checked_cell_data + D);
blas.VADD(D2, cur_in_data + D, checked_cell_data, cur_in_data + D);
// I_t, F_t
act_gate(D2, cur_in_data + D, cur_in_data + D);
} else {
// I_t, F_t, O_t
act_gate(D3, cur_in_data + D, cur_in_data + D);
}
batch_cell.set_lod(batched_gate->lod()); // F_t * C_t-1
// restore the output cell state in LoDTensor from the batch cell blas.VMUL(D, cur_in_data + D2, prev_c_data, cur_in_data + D2);
to_seq(dev_ctx, batch_cell, cell_out); // I_t * ~C_t
blas.VMUL(D, cur_in_data, cur_in_data + D, cur_in_data + D);
// C_t = F_t * C_t-1 + I_t * ~C_t
blas.VADD(D, cur_in_data + D, cur_in_data + D2, cur_c_out_data);
if (use_peepholes) {
// + W_oc * C_t for peephole connection
blas.VMUL(D, wc_data + D2, cur_c_out_data, checked_cell_data + D2);
blas.VADD(D, cur_in_data + D3, checked_cell_data + D2,
cur_in_data + D3);
// O_t
act_gate(D, cur_in_data + D3, cur_in_data + D3);
} }
// hidden out= act_state(cellout) * outgate
act_cell(D, cur_c_out_data, cur_in_data + D2);
// H_t = O_t * act_state(C_t)
blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
// move to next data in same batch
next_data_in_batch();
}
// move to data for next timestep
prev_batch_h_data = cur_batch_h_out_data;
prev_batch_c_data = cur_batch_c_out_data;
move_step(cur_bs);
}
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
batched_h_out->set_lod(batched_lod);
to_seq(dev_ctx, *batched_h_out, hidden_out);
batched_c_out->set_lod(batched_lod);
to_seq(dev_ctx, *batched_c_out, cell_out);
}
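The H0/C0 gather above can be read as a plain row reorder by seq_order (the third level of the batch LoD). A hedged standalone sketch, assuming row-major [num_seqs, D] storage:

#include <cstring>
#include <vector>

// dst row i receives the row of src belonging to the i-th longest sequence.
void ReorderInitRows(const float* src, const std::vector<size_t>& seq_order,
                     int D, float* dst) {
  for (size_t i = 0; i < seq_order.size(); ++i) {
    std::memcpy(dst + i * D, src + seq_order[i] * D, sizeof(float) * D);
  }
}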
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
if (FLAGS_seq_mode) { if (ctx.Attr<bool>("use_seq")) {
SeqCompute(ctx); SeqCompute(ctx);
} else { } else {
BatchCompute(ctx); BatchCompute(ctx);
} }
} }
#undef INIT_BASE_SIZES
#undef INIT_BASE_INPUT_OUTPUT
#undef INIT_VEC_FUNC
}; };
} // namespace operators } // namespace operators
......
...@@ -92,12 +92,12 @@ class GRUUnitKernel : public framework::OpKernel<T> { ...@@ -92,12 +92,12 @@ class GRUUnitKernel : public framework::OpKernel<T> {
gate_data, frame_size * 3); gate_data, frame_size * 3);
// calculate activated gate // calculate activated gate
Eigen::array<int, 2> extents({{batch_size, frame_size}}); Eigen::array<int, 2> extents{{batch_size, frame_size}};
Eigen::array<int, 2> u_offsets({{0, 0}}); Eigen::array<int, 2> u_offsets{{0, 0}};
ActCompute(context.Attr<int>("gate_activation"), place, ActCompute(context.Attr<int>("gate_activation"), place,
g.slice(u_offsets, extents), g.slice(u_offsets, extents)); g.slice(u_offsets, extents), g.slice(u_offsets, extents));
auto u = g.slice(u_offsets, extents); // update gate auto u = g.slice(u_offsets, extents); // update gate
Eigen::array<int, 2> r_offsets({{0, frame_size}}); Eigen::array<int, 2> r_offsets{{0, frame_size}};
ActCompute(context.Attr<int>("gate_activation"), place, ActCompute(context.Attr<int>("gate_activation"), place,
g.slice(r_offsets, extents), g.slice(r_offsets, extents)); g.slice(r_offsets, extents), g.slice(r_offsets, extents));
auto r = g.slice(r_offsets, extents); // reset gate auto r = g.slice(r_offsets, extents); // reset gate
...@@ -107,7 +107,7 @@ class GRUUnitKernel : public framework::OpKernel<T> { ...@@ -107,7 +107,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
weight_data + frame_size * frame_size * 2, frame_size, 1, weight_data + frame_size * frame_size * 2, frame_size, 1,
gate_data + frame_size * 2, frame_size * 3); gate_data + frame_size * 2, frame_size * 3);
Eigen::array<int, 2> c_offsets({{0, frame_size * 2}}); Eigen::array<int, 2> c_offsets{{0, frame_size * 2}};
ActCompute(context.Attr<int>("activation"), place, ActCompute(context.Attr<int>("activation"), place,
g.slice(c_offsets, extents), g.slice(c_offsets, extents)); g.slice(c_offsets, extents), g.slice(c_offsets, extents));
auto c = g.slice(c_offsets, extents); // output candidate auto c = g.slice(c_offsets, extents); // output candidate
...@@ -171,12 +171,12 @@ class GRUUnitGradKernel : public framework::OpKernel<T> { ...@@ -171,12 +171,12 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
int batch_size = input->dims()[0]; int batch_size = input->dims()[0];
int frame_size = hidden_prev->dims()[1]; int frame_size = hidden_prev->dims()[1];
Eigen::array<int, 2> extents({{batch_size, frame_size}}); Eigen::array<int, 2> extents{{batch_size, frame_size}};
Eigen::array<int, 2> u_offsets({{0, 0}}); Eigen::array<int, 2> u_offsets{{0, 0}};
auto u = g.slice(u_offsets, extents); // update gate auto u = g.slice(u_offsets, extents); // update gate
Eigen::array<int, 2> r_offsets({{0, frame_size}}); Eigen::array<int, 2> r_offsets{{0, frame_size}};
auto r = g.slice(r_offsets, extents); // reset gate auto r = g.slice(r_offsets, extents); // reset gate
Eigen::array<int, 2> c_offsets({{0, frame_size * 2}}); Eigen::array<int, 2> c_offsets{{0, frame_size * 2}};
auto c = g.slice(c_offsets, extents); // output candidate auto c = g.slice(c_offsets, extents); // output candidate
// backward for unactivated update gate // backward for unactivated update gate
......
...@@ -38,7 +38,8 @@ class LabelSmoothKernel : public framework::OpKernel<T> { ...@@ -38,7 +38,8 @@ class LabelSmoothKernel : public framework::OpKernel<T> {
auto dist = framework::EigenVector<T>::Flatten(*dist_t); auto dist = framework::EigenVector<T>::Flatten(*dist_t);
out.device(dev) = out.device(dev) =
static_cast<T>(1 - epsilon) * in + static_cast<T>(1 - epsilon) * in +
epsilon * dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel())); static_cast<T>(epsilon) *
dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
} else { } else {
out.device(dev) = static_cast<T>(1 - epsilon) * in + out.device(dev) = static_cast<T>(1 - epsilon) * in +
static_cast<T>(epsilon / label_dim); static_cast<T>(epsilon / label_dim);
......
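As a small illustrative sketch (not from this patch) of the smoothing rule in this kernel: with a user-supplied prior it is out = (1 - epsilon) * in + epsilon * dist, otherwise the prior defaults to the uniform value epsilon / label_dim; the layout is assumed here to be row-major [N, label_dim].

#include <vector>

std::vector<float> LabelSmoothReference(const std::vector<float>& in,
                                        const std::vector<float>* dist,
                                        float epsilon, int label_dim) {
  std::vector<float> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    // prior probability of the class this flattened element belongs to
    float prior = dist ? (*dist)[i % label_dim] : 1.f / label_dim;
    out[i] = (1.f - epsilon) * in[i] + epsilon * prior;
  }
  return out;
}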
...@@ -67,27 +67,27 @@ template <typename T, int BlockDim> ...@@ -67,27 +67,27 @@ template <typename T, int BlockDim>
__global__ void LayerNormForward(const T *x, const T *scale, const T *bias, __global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
T *y, T *mean, T *var, float epsilon, T *y, T *mean, T *var, float epsilon,
int feature_size) { int feature_size) {
using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>; using BlockReduce = cub::BlockReduce<PairForLayerNorm<double>, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage; __shared__ typename BlockReduce::TempStorage temp_storage;
int beg_idx = blockIdx.x * feature_size + threadIdx.x; int beg_idx = blockIdx.x * feature_size + threadIdx.x;
int end_idx = (blockIdx.x + 1) * feature_size; int end_idx = (blockIdx.x + 1) * feature_size;
// Step 1: Reduce to calculate mean and var // Step 1: Reduce to calculate mean and var
T mean_val = static_cast<T>(0); double mean_val = 0;
T var_val = static_cast<T>(0); double var_val = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) { for (int i = beg_idx; i < end_idx; i += BlockDim) {
T tmp = x[i]; T tmp = x[i];
mean_val += tmp; mean_val += tmp;
var_val += (tmp * tmp); var_val += (tmp * tmp);
} }
auto pair = BlockReduce(temp_storage) auto pair = BlockReduce(temp_storage)
.Reduce(PairForLayerNorm<T>(mean_val, var_val), .Reduce(PairForLayerNorm<double>(mean_val, var_val),
PairForLayerNormAddFunctor<T>()); PairForLayerNormAddFunctor<double>());
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
auto tmp = pair.first_ / feature_size; auto tmp = pair.first_ / feature_size;
mean[blockIdx.x] = tmp; mean[blockIdx.x] = static_cast<T>(tmp);
var[blockIdx.x] = pair.second_ / feature_size - tmp * tmp; var[blockIdx.x] = static_cast<T>(pair.second_ / feature_size - tmp * tmp);
} }
__syncthreads(); __syncthreads();
mean_val = mean[blockIdx.x]; mean_val = mean[blockIdx.x];
......
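The per-row statistics above come from a single pass over x: the kernel sums x and x*x (now accumulated in double) and uses var = E[x^2] - (E[x])^2. A minimal CPU sketch of the same computation, assuming nothing beyond the standard library:

void RowMeanVarReference(const float* x, int feature_size,
                         float* mean, float* var) {
  // accumulate in double, as the CUDA kernel now does, to limit cancellation
  double sum = 0.0, sq_sum = 0.0;
  for (int i = 0; i < feature_size; ++i) {
    double v = static_cast<double>(x[i]);
    sum += v;
    sq_sum += v * v;
  }
  double m = sum / feature_size;
  *mean = static_cast<float>(m);
  *var = static_cast<float>(sq_sum / feature_size - m * m);
}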
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -57,7 +57,7 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -57,7 +57,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
memset(output + i * row_width, 0, row_width * sizeof(T)); memset(output + i * row_width, 0, row_width * sizeof(T));
} else { } else {
PADDLE_ENFORCE_LT(ids[i], row_number); PADDLE_ENFORCE_LT(ids[i], row_number);
PADDLE_ENFORCE_GE(ids[i], 0); PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
memcpy(output + i * row_width, table + ids[i] * row_width, memcpy(output + i * row_width, table + ids[i] * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} }
......
if (NOT WIN32)
add_subdirectory(detail) add_subdirectory(detail)
endif(NOT WIN32)
function(math_library TARGET) function(math_library TARGET)
# math_library is a function to create math library. # math_library is a function to create math library.
...@@ -38,9 +40,13 @@ math_library(context_project DEPS im2col math_function) ...@@ -38,9 +40,13 @@ math_library(context_project DEPS im2col math_function)
math_library(cross_entropy) math_library(cross_entropy)
math_library(cos_sim_functor) math_library(cos_sim_functor)
math_library(depthwise_conv) math_library(depthwise_conv)
math_library(gru_compute DEPS activation_functions math_function)
math_library(im2col) math_library(im2col)
if (NOT WIN32) # Windows does not support AVX functions yet.
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions) math_library(lstm_compute DEPS activation_functions)
endif (NOT WIN32)
cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
math_library(math_function DEPS blas) math_library(math_function DEPS blas)
math_library(maxouting) math_library(maxouting)
...@@ -51,7 +57,9 @@ math_library(sequence_padding) ...@@ -51,7 +57,9 @@ math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function) math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale) math_library(sequence_scale)
math_library(softmax DEPS math_function) math_library(softmax DEPS math_function)
if (NOT WIN32)
math_library(matrix_bit_code) math_library(matrix_bit_code)
endif (NOT WIN32)
math_library(unpooling) math_library(unpooling)
math_library(vol2col) math_library(vol2col)
......
...@@ -132,6 +132,121 @@ inline void vec_scal<float, platform::jit::avx512_common>(const int n, ...@@ -132,6 +132,121 @@ inline void vec_scal<float, platform::jit::avx512_common>(const int n,
vec_scal<float, platform::jit::avx2>(n, a, x, y); vec_scal<float, platform::jit::avx2>(n, a, x, y);
} }
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = a - x[i];
}
}
template <>
inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a,
const float* x, float* y) {
#ifdef __AVX__
constexpr int block = AVX_FLOAT_BLOCK;
if (n < block) {
vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(a);
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_sub_ps(bias, tmp); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// cannot take one more overlapping vector step, since src and dst may be in-place; handle the tail scalar-wise
for (i = n - rest; i < n; ++i) {
y[i] = a - x[i];
}
#else
vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
#endif
}
template <>
inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a,
const float* x, float* y) {
vec_bias_sub<float, platform::jit::avx>(n, a, x, y);
}
template <>
inline void vec_bias_sub<float, platform::jit::avx512_common>(const int n,
const float a,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_bias_sub<float, platform::jit::avx2>(n, a, x, y);
}
// out = x*y + (1-x)*z
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
for (int i = 0; i < n; ++i) {
out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
}
}
template <>
inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
const float* y, const float* z,
float* out) {
#ifdef __AVX__
constexpr int block = AVX_FLOAT_BLOCK;
if (n < block) {
vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(1.f);
__m256 tmpx, tmpy, tmpz;
for (i = 0; i < end; i += block) {
tmpx = _mm256_loadu_ps(x + i);
tmpy = _mm256_loadu_ps(y + i);
tmpz = _mm256_loadu_ps(z + i);
tmpy = _mm256_mul_ps(tmpx, tmpy);
tmpx = _mm256_sub_ps(bias, tmpx);
tmpz = _mm256_mul_ps(tmpx, tmpz);
tmpz = _mm256_add_ps(tmpy, tmpz);
_mm256_storeu_ps(out + i, tmpz);
}
if (rest == 0) {
return;
}
// cannot take one more overlapping vector step, since src and dst may be in-place; handle the tail scalar-wise
for (i = n - rest; i < n; ++i) {
out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
}
#else
vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
#endif
}
template <>
inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
const float* y,
const float* z, float* out) {
vec_cross<float, platform::jit::avx>(n, x, y, z, out);
}
template <>
inline void vec_cross<float, platform::jit::avx512_common>(
const int n, const float* x, const float* y, const float* z, float* out) {
// TODO(TJ): enable me
vec_cross<float, platform::jit::avx>(n, x, y, z, out);
}
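A hedged usage sketch for the new vec_cross (the include path and namespaces are assumed from this file's location): pick a length that is not a multiple of the AVX block so both the vectorized body and the scalar tail are exercised, then compare against out = x*y + (1-x)*z.

#include <cassert>
#include <cmath>
#include <vector>
#include "paddle/fluid/operators/math/cpu_vec.h"

void CheckVecCross() {
  const int n = 19;  // two AVX blocks of 8 floats plus a tail of 3
  std::vector<float> x(n), y(n), z(n), out(n);
  for (int i = 0; i < n; ++i) {
    x[i] = 1.f / (i + 2);
    y[i] = 0.5f * i;
    z[i] = -0.25f * i;
  }
  paddle::operators::math::vec_cross<float, paddle::platform::jit::avx>(
      n, x.data(), y.data(), z.data(), out.data());
  for (int i = 0; i < n; ++i) {
    float ref = x[i] * y[i] + (1.f - x[i]) * z[i];
    assert(std::fabs(out[i] - ref) < 1e-5f);
  }
}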
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_add_bias(const int n, const T a, const T* x, T* y) { inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
......
...@@ -17,6 +17,11 @@ limitations under the License. */ ...@@ -17,6 +17,11 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#if defined(_WIN32)
#include <intrin.h>
#include <windows.h>
#endif // _WIN32
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -55,12 +60,38 @@ namespace math { ...@@ -55,12 +60,38 @@ namespace math {
* FindLastSet(x) = 1 + \floor*{\log_{2}x} * FindLastSet(x) = 1 + \floor*{\log_{2}x}
* \f] * \f]
*/ */
#if !defined(_WIN32)
inline constexpr size_t FindLastSet(size_t x) { inline constexpr size_t FindLastSet(size_t x) {
return std::is_same<size_t, unsigned int>::value return std::is_same<size_t, unsigned int>::value
? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0) ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0)
: (std::is_same<size_t, unsigned long>::value // NOLINT : (std::is_same<size_t, unsigned long>::value // NOLINT
? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0) ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
: (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0)); : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
#else
// Windows does not provide the built-in clz/ctz intrinsics, so emulate them
template <typename T>
inline int ctz(const T& value) {
DWORD trailing_zero = 0;
if (_BitScanForward(&trailing_zero, value)) {
return static_cast<int>(trailing_zero);
} else {
return static_cast<int>(0);
}
}
template <typename T>
inline int clz(const T& value) {
DWORD leading_zero = 0;
if (_BitScanReverse(&leading_zero, value)) {
// _BitScanReverse yields the 0-based index of the highest set bit,
// so the number of leading zeros is (bit width - 1 - index).
return static_cast<int>(sizeof(T) * 8 - 1 - leading_zero);
} else {
// no bit is set: every bit counts as a leading zero
return static_cast<int>(sizeof(T) * 8);
}
}
inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); }
#endif // !_WIN32
} }
struct SimpleCode { struct SimpleCode {
......
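A tiny worked check of the documented identity FindLastSet(x) = 1 + floor(log2 x); the include path and the paddle::operators::math namespace are assumptions based on where this header lives.

#include <cassert>
#include "paddle/fluid/operators/math/matrix_bit_code.h"

void CheckFindLastSet() {
  using paddle::operators::math::FindLastSet;
  assert(FindLastSet(1) == 1);  // 1 + floor(log2 1)
  assert(FindLastSet(3) == 2);  // highest set bit of 0b11 is bit 1
  assert(FindLastSet(8) == 4);  // highest set bit of 0b1000 is bit 3
  assert(FindLastSet(0) == 0);  // by convention, no bit set
}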
...@@ -16,13 +16,12 @@ limitations under the License. */ ...@@ -16,13 +16,12 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
#define FLT_MAX __FLT_MAX__
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class MaxOutFunctor { class MaxOutFunctor {
public: public:
......
...@@ -18,15 +18,12 @@ limitations under the License. */ ...@@ -18,15 +18,12 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
#define FLT_MAX \
__FLT_MAX__ // TODO(zcd) :It might need to be placed in another file, but I'm
// still wondering where to put it.
/* /*
* \brief Extracting simple operations from pooling. * \brief Extracting simple operations from pooling.
* Both MaxPool and AvgPool need "initial", "compute" and "finalize" * Both MaxPool and AvgPool need "initial", "compute" and "finalize"
......
...@@ -92,7 +92,7 @@ class LoDTensor2BatchFunctor { ...@@ -92,7 +92,7 @@ class LoDTensor2BatchFunctor {
// Calculate the start position of each batch. // Calculate the start position of each batch.
// example: sequences = {s0, s1, s2} // example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// num_batch = 5, // max_seqlen = 5,
// batchIndex = {b0, b1, b2, b3, b4} // batchIndex = {b0, b1, b2, b3, b4}
// b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
// batch_start_positions[6] = {0, 3, 6, 9, 11, 12} // batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
...@@ -109,7 +109,7 @@ class LoDTensor2BatchFunctor { ...@@ -109,7 +109,7 @@ class LoDTensor2BatchFunctor {
// where 1 is the second sequence, // where 1 is the second sequence,
// 0 is the first sequence, // 0 is the first sequence,
// 2 is the third sequence. // 2 is the third sequence.
// The num_batch represents batch size after rearranging the // The max_seqlen represents batch size after rearranging the
// input LodTensor. It is also the maximum length of input sequence. // input LodTensor. It is also the maximum length of input sequence.
paddle::framework::LoD batch_lods; paddle::framework::LoD batch_lods;
...@@ -118,8 +118,8 @@ class LoDTensor2BatchFunctor { ...@@ -118,8 +118,8 @@ class LoDTensor2BatchFunctor {
batch_lods.emplace_back(std::vector<size_t>{0}); batch_lods.emplace_back(std::vector<size_t>{0});
// batch_lods[0] is the start positions for batch LoDTensor // batch_lods[0] is the start positions for batch LoDTensor
int num_batch = seq_info[0].length; int max_seqlen = seq_info[0].length;
batch_lods[0].resize(static_cast<size_t>(num_batch + 1)); batch_lods[0].resize(static_cast<size_t>(max_seqlen + 1));
// batch_lods[1] is the raw index in the input LoDTensor // batch_lods[1] is the raw index in the input LoDTensor
batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0])); batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
// batch_lods[2] is the sort order for the input LoDTensor. // batch_lods[2] is the sort order for the input LoDTensor.
...@@ -128,7 +128,7 @@ class LoDTensor2BatchFunctor { ...@@ -128,7 +128,7 @@ class LoDTensor2BatchFunctor {
size_t* batch_starts = batch_lods[0].data(); size_t* batch_starts = batch_lods[0].data();
size_t* seq2batch_idx = batch_lods[1].data(); size_t* seq2batch_idx = batch_lods[1].data();
batch_starts[0] = 0; batch_starts[0] = 0;
for (int n = 0; n < num_batch; n++) { for (int n = 0; n < max_seqlen; n++) {
auto batch_id = static_cast<int>(batch_starts[n]); auto batch_id = static_cast<int>(batch_starts[n]);
for (size_t i = 0; i < seq_info.size(); ++i) { for (size_t i = 0; i < seq_info.size(); ++i) {
int seq_len = seq_info[i].length; int seq_len = seq_info[i].length;
......
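An illustrative standalone sketch (not part of the functor) of how the batch LoD described in these comments is derived from per-sequence lengths: batch n collects the n-th element of every sequence that is still alive at step n, longest sequences first. For lengths {4, 5, 3} it reproduces seq_order = {1, 0, 2}, max_seqlen = 5, and batch_starts = {0, 3, 6, 9, 11, 12}.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

void BuildBatchStarts(const std::vector<size_t>& lengths,
                      std::vector<size_t>* seq_order,
                      std::vector<size_t>* batch_starts) {
  seq_order->resize(lengths.size());
  std::iota(seq_order->begin(), seq_order->end(), 0);
  // longest sequence first, matching how the functor sorts SeqInfo
  std::stable_sort(seq_order->begin(), seq_order->end(),
                   [&](size_t a, size_t b) { return lengths[a] > lengths[b]; });
  size_t max_seqlen = lengths[(*seq_order)[0]];
  batch_starts->assign(1, 0);
  for (size_t step = 0; step < max_seqlen; ++step) {
    size_t cur = batch_starts->back();
    for (size_t idx : *seq_order) {
      if (lengths[idx] > step) ++cur;  // this sequence contributes to batch `step`
    }
    batch_starts->push_back(cur);
  }
}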
...@@ -38,10 +38,9 @@ class PReluKernel : public framework::OpKernel<T> { ...@@ -38,10 +38,9 @@ class PReluKernel : public framework::OpKernel<T> {
auto dim = x->dims(); auto dim = x->dims();
int index = 0; int index = 0;
int i = 0; int i = 0;
int temp = 0;
if (mode == "channel") { if (mode == "channel") {
int temp = numel / (dim[0] * dim[1]);
for (i = 0; i < numel; i++) { for (i = 0; i < numel; i++) {
temp = numel / (dim[0] * dim[1]);
index = (i / temp) % dim[1]; index = (i / temp) % dim[1];
o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
} }
......
...@@ -246,6 +246,88 @@ class ReshapeGradKernel { ...@@ -246,6 +246,88 @@ class ReshapeGradKernel {
} }
}; };
// FIXME(zcd): reshape2 adds an intermediate output (XShape) based on reshape.
// XShape is used to carry the shape and LoD of X, which will be used in
// reshape_grad; in this way, the framework can reuse the memory of X
// immediately after the reshape_op is finished.
// Considering compatibility issues, we could not fix reshape_op itself.
class Reshape2Op : public ReshapeOp {
public:
Reshape2Op(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: ReshapeOp(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
ReshapeOp::InferShape(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) of ReshapeOp should not be null.");
const auto &x_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
};
class Reshape2OpMaker : public ReshapeOpMaker {
public:
void Make() override {
ReshapeOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in FlattenGradOp.")
.AsIntermediate();
}
};
class Reshape2GradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("reshape2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Reshape2GradOp : public framework::OperatorWithKernel {
public:
Reshape2GradOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = ctx->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("XShape", framework::GradVarName("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
->type()),
ctx.device_context());
}
};
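A worked sketch of the XShape round trip described in the FIXME above: for X with dims {6, 4}, Reshape2Op records XShape dims {0, 6, 4}, and the grad op recovers the original dims by dropping the leading 0. The header path is an assumption; slice_ddim is the same helper used in InferShape above.

#include "paddle/fluid/framework/ddim.h"

paddle::framework::DDim RecoverXDims(
    const paddle::framework::DDim& xshape_dims) {
  // identical to what Reshape2GradOp::InferShape does with GetInputDim("XShape")
  return paddle::framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
}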
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
...@@ -261,6 +343,17 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, ...@@ -261,6 +343,17 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel); ops::ReshapeGradKernel);
REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker,
ops::Reshape2GradMaker);
REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp);
REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
int64_t, ops::ReshapeKernel);
REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
double, ops::ReshapeGradKernel, int,
ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel, ops::ReshapeKernel, int, ops::ReshapeKernel,
...@@ -269,4 +362,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, ...@@ -269,4 +362,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
double, ops::ReshapeGradKernel, int, double, ops::ReshapeGradKernel, int,
ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel); ops::ReshapeGradKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
int64_t, ops::ReshapeKernel);
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
double, ops::ReshapeGradKernel, int,
ops::ReshapeGradKernel, int64_t,
ops::ReshapeGradKernel);
#endif #endif
...@@ -36,9 +36,13 @@ class RmspropOp : public framework::OperatorWithKernel { ...@@ -36,9 +36,13 @@ class RmspropOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(param_out) of RmspropOp should not be null."); "Output(param_out) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
"Output(Momentum_out) of RmspropOp should not be null."); "Output(MomentOut) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"), PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
"Output(MeanSquareOut) of RmspropOp should not be null."); "Output(MeanSquareOut) of RmspropOp should not be null.");
if (ctx->Attrs().Get<bool>("centered")) {
PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"),
"Output(MeanGradOut) of RmspropOp should not be null.");
}
auto param_dim = ctx->GetInputDim("Param"); auto param_dim = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
...@@ -58,6 +62,9 @@ class RmspropOp : public framework::OperatorWithKernel { ...@@ -58,6 +62,9 @@ class RmspropOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("MomentOut", param_dim); ctx->SetOutputDim("MomentOut", param_dim);
ctx->SetOutputDim("MeanSquareOut", param_dim); ctx->SetOutputDim("MeanSquareOut", param_dim);
if (ctx->Attrs().Get<bool>("centered")) {
ctx->SetOutputDim("MeanGradOut", param_dim);
}
} }
}; };
...@@ -70,6 +77,10 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -70,6 +77,10 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("MeanSquare", AddInput("MeanSquare",
"(Tensor, default Tensor<float>)" "(Tensor, default Tensor<float>)"
" The mean square value that gets updated."); " The mean square value that gets updated.");
AddInput("MeanGrad",
"(Tensor, default Tensor<float>)"
" The moving average of gradient")
.AsDispensable();
AddInput("LearningRate", AddInput("LearningRate",
"(Tensor, default Tensor<float>) " "(Tensor, default Tensor<float>) "
"The learning rate should be a tensor of size 1."); "The learning rate should be a tensor of size 1.");
...@@ -82,6 +93,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -82,6 +93,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("ParamOut", "(Tensor) Output updated parameter value."); AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
AddOutput("MomentOut", "(Tensor) Output updated moment."); AddOutput("MomentOut", "(Tensor) Output updated moment.");
AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value."); AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value.");
AddOutput("MeanGradOut",
"(Tensor) Output moving average of gradient updated value.");
AddAttr<float>("epsilon", AddAttr<float>("epsilon",
"(float, default 1e-10) Constant " "(float, default 1e-10) Constant "
...@@ -93,6 +106,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -93,6 +106,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(0.9f); .SetDefault(0.9f);
AddAttr<float>("momentum", "(float, default 0.0) Constant value.") AddAttr<float>("momentum", "(float, default 0.0) Constant value.")
.SetDefault(0.0f); .SetDefault(0.0f);
AddAttr<bool>("centered", "(bool, default false) use centered rmsprop.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Rmsprop Optimizer. Rmsprop Optimizer.
...@@ -103,6 +118,14 @@ MomentOut = momentum * Moment + ...@@ -103,6 +118,14 @@ MomentOut = momentum * Moment +
ParamOut = Param - MomentOut ParamOut = Param - MomentOut
$$ $$
if centered is true:
mean_grad = decay * mean_square{t-1} + (1-decay) * gradient
mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
mom = momentum * mom{t-1} + learning_rate * g_t /
sqrt(mean_square - mean_grad**2 + epsilon)
param -= mom
The original slides that proposed Rmsprop: Slide 29 of The original slides that proposed Rmsprop: Slide 29 of
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
......
...@@ -41,6 +41,7 @@ class RmspropOpKernel : public framework::OpKernel<T> { ...@@ -41,6 +41,7 @@ class RmspropOpKernel : public framework::OpKernel<T> {
float epsilon = ctx.Attr<float>("epsilon"); float epsilon = ctx.Attr<float>("epsilon");
float rho = ctx.Attr<float>("decay"); float rho = ctx.Attr<float>("decay");
float momentum = ctx.Attr<float>("momentum"); float momentum = ctx.Attr<float>("momentum");
bool centered = ctx.Attr<bool>("centered");
auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param")); auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare")); auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare"));
...@@ -53,12 +54,24 @@ class RmspropOpKernel : public framework::OpKernel<T> { ...@@ -53,12 +54,24 @@ class RmspropOpKernel : public framework::OpKernel<T> {
auto ms_out = EigenVector<T>::Flatten(*mean_square_out); auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
Eigen::DSizes<int, 1> grad_dsize(grad->numel()); Eigen::DSizes<int, 1> grad_dsize(static_cast<int>(grad->numel()));
ms_out.device(place) = rho * ms + (1 - rho) * g * g; ms_out.device(place) = rho * ms + (1 - rho) * g * g;
if (centered) {
auto mg = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanGrad"));
auto* mean_grad_out = ctx.Output<Tensor>("MeanGradOut");
mean_grad_out->mutable_data<T>(ctx.GetPlace());
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g;
mom_out.device(place) = momentum * mom +
lr.broadcast(grad_dsize) * g /
(ms_out - mg_out.square() + epsilon).sqrt();
} else {
mom_out.device(place) = mom_out.device(place) =
momentum * mom + momentum * mom +
lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
}
p_out.device(place) = p - mom_out; p_out.device(place) = p - mom_out;
} }
}; };
......
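A minimal NumPy sketch of the update that the rmsprop kernel above computes (the helper name `rmsprop_step` and its argument layout are illustrative only, not part of the operator API):

```python
import numpy as np

def rmsprop_step(param, grad, mean_square, mean_grad, moment,
                 lr, rho=0.95, momentum=0.0, epsilon=1e-10, centered=False):
    # Second-moment estimate, as in ms_out = rho * ms + (1 - rho) * g * g.
    mean_square = rho * mean_square + (1.0 - rho) * grad * grad
    if centered:
        # Centered variant: also track the moving average of the gradient and
        # subtract its square, giving an estimate of the gradient variance.
        mean_grad = rho * mean_grad + (1.0 - rho) * grad
        denom = np.sqrt(mean_square - mean_grad ** 2 + epsilon)
    else:
        denom = np.sqrt(mean_square + epsilon)
    moment = momentum * moment + lr * grad / denom
    param = param - moment
    return param, mean_square, mean_grad, moment
```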
...@@ -31,7 +31,7 @@ static inline int NumBlocks(const int N) { ...@@ -31,7 +31,7 @@ static inline int NumBlocks(const int N) {
template <typename T> template <typename T>
__global__ void GPUROIPoolForward( __global__ void GPUROIPoolForward(
const int nthreads, const T* input_data, const int64_t* input_rois, const int nthreads, const T* input_data, const T* input_rois,
const float spatial_scale, const int channels, const int height, const float spatial_scale, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width, const int width, const int pooled_height, const int pooled_width,
int* roi_batch_id_data, T* output_data, int64_t* argmax_data) { int* roi_batch_id_data, T* output_data, int64_t* argmax_data) {
...@@ -43,7 +43,7 @@ __global__ void GPUROIPoolForward( ...@@ -43,7 +43,7 @@ __global__ void GPUROIPoolForward(
int c = (i / pooled_width / pooled_height) % channels; int c = (i / pooled_width / pooled_height) % channels;
int n = i / pooled_width / pooled_height / channels; int n = i / pooled_width / pooled_height / channels;
const int64_t* offset_input_rois = input_rois + n * kROISize; const T* offset_input_rois = input_rois + n * kROISize;
int roi_batch_ind = roi_batch_id_data[n]; int roi_batch_ind = roi_batch_id_data[n];
int roi_start_w = round(offset_input_rois[0] * spatial_scale); int roi_start_w = round(offset_input_rois[0] * spatial_scale);
int roi_start_h = round(offset_input_rois[1] * spatial_scale); int roi_start_h = round(offset_input_rois[1] * spatial_scale);
...@@ -93,7 +93,7 @@ __global__ void GPUROIPoolForward( ...@@ -93,7 +93,7 @@ __global__ void GPUROIPoolForward(
template <typename T> template <typename T>
__global__ void GPUROIPoolBackward( __global__ void GPUROIPoolBackward(
const int nthreads, const int64_t* input_rois, const T* output_grad, const int nthreads, const T* input_rois, const T* output_grad,
const int64_t* argmax_data, const int num_rois, const float spatial_scale, const int64_t* argmax_data, const int num_rois, const float spatial_scale,
const int channels, const int height, const int width, const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, int* roi_batch_id_data, const int pooled_height, const int pooled_width, int* roi_batch_id_data,
...@@ -174,8 +174,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> { ...@@ -174,8 +174,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
GPUROIPoolForward< GPUROIPoolForward<
T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>( T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
output_size, in->data<T>(), rois->data<int64_t>(), spatial_scale, output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
channels, height, width, pooled_height, pooled_width, height, width, pooled_height, pooled_width,
roi_batch_id_list_gpu.data<int>(), out->mutable_data<T>(ctx.GetPlace()), roi_batch_id_list_gpu.data<int>(), out->mutable_data<T>(ctx.GetPlace()),
argmax->mutable_data<int64_t>(ctx.GetPlace())); argmax->mutable_data<int64_t>(ctx.GetPlace()));
} }
...@@ -228,7 +228,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> { ...@@ -228,7 +228,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
if (output_grad_size > 0) { if (output_grad_size > 0) {
GPUROIPoolBackward< GPUROIPoolBackward<
T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>( T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
output_grad_size, rois->data<int64_t>(), out_grad->data<T>(), output_grad_size, rois->data<T>(), out_grad->data<T>(),
argmax->data<int64_t>(), rois_num, spatial_scale, channels, height, argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
width, pooled_height, pooled_width, width, pooled_height, pooled_width,
roi_batch_id_list_gpu.data<int>(), roi_batch_id_list_gpu.data<int>(),
......
...@@ -72,7 +72,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> { ...@@ -72,7 +72,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> {
T* output_data = out->mutable_data<T>(ctx.GetPlace()); T* output_data = out->mutable_data<T>(ctx.GetPlace());
int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace()); int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
const int64_t* rois_data = rois->data<int64_t>(); const T* rois_data = rois->data<T>();
for (int n = 0; n < rois_num; ++n) { for (int n = 0; n < rois_num; ++n) {
int roi_batch_id = roi_batch_id_data[n]; int roi_batch_id = roi_batch_id_data[n];
int roi_start_w = round(rois_data[0] * spatial_scale); int roi_start_w = round(rois_data[0] * spatial_scale);
...@@ -171,7 +171,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> { ...@@ -171,7 +171,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
} }
} }
const int64_t* rois_data = rois->data<int64_t>(); const T* rois_data = rois->data<T>();
const T* out_grad_data = out_grad->data<T>(); const T* out_grad_data = out_grad->data<T>();
const int64_t* argmax_data = argmax->data<int64_t>(); const int64_t* argmax_data = argmax->data<int64_t>();
T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace()); T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
......
...@@ -53,7 +53,7 @@ class SamplingIdKernel : public framework::OpKernel<T> { ...@@ -53,7 +53,7 @@ class SamplingIdKernel : public framework::OpKernel<T> {
static_cast<T>(context.Attr<float>("min")), static_cast<T>(context.Attr<float>("min")),
static_cast<T>(context.Attr<float>("max"))); static_cast<T>(context.Attr<float>("max")));
std::vector<T> ids(batch_size); std::vector<int64_t> ids(batch_size);
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
T r = dist(engine); T r = dist(engine);
int idx = width - 1; int idx = width - 1;
...@@ -63,7 +63,7 @@ class SamplingIdKernel : public framework::OpKernel<T> { ...@@ -63,7 +63,7 @@ class SamplingIdKernel : public framework::OpKernel<T> {
break; break;
} }
} }
ids[i] = ins_vector[idx]; ids[i] = int64_t(idx);
} }
std::vector<int64_t> out_dim; std::vector<int64_t> out_dim;
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <stdint.h> #include <stdint.h>
#include <sys/stat.h>
#include <fstream> #include <fstream>
#include <numeric> #include <numeric>
#include <sstream> #include <sstream>
...@@ -23,40 +22,11 @@ limitations under the License. */ ...@@ -23,40 +22,11 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
// TODO(sidgoyal78): These function are needed by other files (save_op), move
// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
constexpr char kSEP = '/';
static bool FileExists(const std::string &filepath) {
struct stat buffer;
return (stat(filepath.c_str(), &buffer) == 0);
}
static std::string DirName(const std::string &filepath) {
auto pos = filepath.rfind(kSEP);
if (pos == std::string::npos) {
return "";
}
return filepath.substr(0, pos);
}
static void MkDir(const char *path) {
if (mkdir(path, 0755)) {
PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
}
}
static void MkDirRecursively(const char *fullpath) {
if (*fullpath == '\0') return; // empty string
if (FileExists(fullpath)) return;
MkDirRecursively(DirName(fullpath).c_str());
MkDir(fullpath);
}
class SaveCombineOp : public framework::OperatorBase { class SaveCombineOp : public framework::OperatorBase {
public: public:
SaveCombineOp(const std::string &type, SaveCombineOp(const std::string &type,
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <stdint.h> #include <stdint.h>
#include <sys/stat.h>
#include <fstream> #include <fstream>
#include <numeric> #include <numeric>
...@@ -25,6 +24,7 @@ limitations under the License. */ ...@@ -25,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -33,36 +33,6 @@ namespace operators { ...@@ -33,36 +33,6 @@ namespace operators {
// to directory specified. // to directory specified.
constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
// TODO(yuyang18): If the functions below are needed by other files, move them
// to paddle::filesystem namespace.
constexpr char kSEP = '/';
static bool FileExists(const std::string &filepath) {
struct stat buffer;
return (stat(filepath.c_str(), &buffer) == 0);
}
static std::string DirName(const std::string &filepath) {
auto pos = filepath.rfind(kSEP);
if (pos == std::string::npos) {
return "";
}
return filepath.substr(0, pos);
}
static void MkDir(const char *path) {
if (mkdir(path, 0755)) {
PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
}
}
static void MkDirRecursively(const char *fullpath) {
if (*fullpath == '\0') return; // empty string
if (FileExists(fullpath)) return;
MkDirRecursively(DirName(fullpath).c_str());
MkDir(fullpath);
}
class SaveOp : public framework::OperatorBase { class SaveOp : public framework::OperatorBase {
public: public:
SaveOp(const std::string &type, const framework::VariableNameMap &inputs, SaveOp(const std::string &type, const framework::VariableNameMap &inputs,
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/sequence_enumerate_op.h"
namespace paddle {
namespace operators {
class SequenceEnumerateOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("X"),
"Input(X) of SequecceEnumerate operator should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(X) of SequenceEnumerate operator should not be null.");
const auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(
x_dims.size(), 2UL,
"Input(X) of SequenceEnumerate operator's rank should be 2.");
PADDLE_ENFORCE_EQ(
x_dims[1], 1UL,
"Input(X) of SequenceEnumerate operator's 2nd dimension should be 1.");
const auto win_size = ctx->Attrs().Get<int>("win_size");
ctx->SetOutputDim("Out", {x_dims[0], win_size});
ctx->ShareLoD("X", "Out");
}
};
class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(2-D LoDTensor with the 2nd dimension equal to 1) "
"Input LoDTensor of SequenceEnumerate operator.");
AddOutput("Out",
"(2-D LoDTensor with the 2nd dimension equal to win_size) "
"Output LoDTensor of SequenceEnumerate operator.");
AddAttr<int>("win_size", "(int) The enumerate sequence window size.")
.AddCustomChecker([](const int& win_size) {
PADDLE_ENFORCE(win_size >= 2,
"The window size should be not less than 2.");
});
AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
.SetDefault(0);
AddComment(R"DOC(
Sequence Enumerate Operator.
Generate a new sequence for the input index sequence, which enumerates all the
sub-sequences with length `win_size` of the input.
The enumerated sequence has the same 1st dimension as variable `input`, and
its 2nd dimension is `win_size`, padded with `pad_value` where necessary during generation.
Examples:
Case 1:
Input:
X.lod = [[0, 3, 5]]
X.data = [[1], [2], [3], [4], [5]]
X.dims = [5, 1]
Attrs:
win_size = 2
pad_value = 0
Output:
Out.lod = [[0, 3, 5]]
Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
Out.dims = [5, 2]
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(sequence_enumerate, ops::SequenceEnumerateOp,
ops::SequenceEnumerateOpMaker);
REGISTER_OP_CPU_KERNEL(
sequence_enumerate,
ops::SequenceEnumerateKernel<paddle::platform::CPUDeviceContext, int32_t>,
ops::SequenceEnumerateKernel<paddle::platform::CPUDeviceContext, int64_t>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include "paddle/fluid/operators/sequence_enumerate_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using LoDTensor = framework::LoDTensor;
template <typename T>
__global__ void CalcOutPut(const T* in_data, const size_t* in_lod,
const size_t lod_len, const int64_t win_size,
const int64_t pad_value, T* out_data) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < in_lod[lod_len - 1]) {
int end_idx = 0;
// Get LoD interval of index
for (int i = 1; i < lod_len; ++i) {
if (index < in_lod[i]) {
end_idx = in_lod[i];
break;
}
}
for (size_t i = 0; i < win_size; ++i) {
int word_pos = index + i;
out_data[index * win_size + i] =
word_pos < end_idx ? in_data[word_pos] : pad_value;
}
}
}
template <typename T>
class SequenceEnumerateOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<LoDTensor>("Out");
int win_size = context.Attr<int>("win_size");
int pad_value = context.Attr<int>("pad_value");
auto in_dims = in->dims();
auto in_lod = in->lod();
PADDLE_ENFORCE_EQ(
static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
"The actual input data's size mismatched with LoD information.");
/* Generate enumerate sequence set */
auto stream = context.cuda_device_context().stream();
auto lod0 = in_lod[0];
auto in_len = in->numel();
auto in_data = in->data<T>();
auto out_data = out->mutable_data<T>(context.GetPlace());
// Copy LoD to GPU
const size_t* dev_in_lod_ptr = lod0.CUDAData(context.GetPlace());
// Calc output tensor
CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
in_data, dev_in_lod_ptr, lod0.size(), win_size, pad_value, out_data);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
sequence_enumerate,
paddle::operators::SequenceEnumerateOpCUDAKernel<int32_t>,
paddle::operators::SequenceEnumerateOpCUDAKernel<int64_t>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
template <typename DeviceContext, typename T>
class SequenceEnumerateKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<LoDTensor>("Out");
int win_size = context.Attr<int>("win_size");
int pad_value = context.Attr<int>("pad_value");
auto in_dims = in->dims();
auto in_lod = in->lod();
PADDLE_ENFORCE_EQ(
static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
"The actual input data's size mismatched with LoD information.");
// Generate enumerate sequence set
auto lod0 = in_lod[0];
auto in_data = in->data<T>();
auto out_data = out->mutable_data<T>(context.GetPlace());
for (size_t i = 0; i < lod0.size() - 1; ++i) {
for (size_t idx = lod0[i]; idx < lod0[i + 1]; ++idx) {
for (int word_idx = 0; word_idx < win_size; ++word_idx) {
size_t word_pos = idx + word_idx;
out_data[win_size * idx + word_idx] =
word_pos < lod0[i + 1] ? in_data[word_pos] : pad_value;
}
}
}
}
};
} // namespace operators
} // namespace paddle
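A small NumPy reference for the enumeration loop above (a sketch only; `sequence_enumerate_ref` is a hypothetical helper, not part of the codebase), reproducing the Case 1 example from the op's DOC comment:

```python
import numpy as np

def sequence_enumerate_ref(x, lod0, win_size, pad_value=0):
    # Enumerate win_size-grams per LoD segment, padding past each
    # segment's end with pad_value.
    out = np.full((len(x), win_size), pad_value, dtype=x.dtype)
    for i in range(len(lod0) - 1):               # each sequence
        for idx in range(lod0[i], lod0[i + 1]):  # each position in it
            for w in range(win_size):
                pos = idx + w
                if pos < lod0[i + 1]:
                    out[idx, w] = x[pos]
    return out

x = np.array([1, 2, 3, 4, 5])
print(sequence_enumerate_ref(x, lod0=[0, 3, 5], win_size=2))
# -> [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
```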
...@@ -181,6 +181,113 @@ class SqueezeGradOp : public framework::OperatorBase { ...@@ -181,6 +181,113 @@ class SqueezeGradOp : public framework::OperatorBase {
} }
}; };
// FIXME(zcd): squeeze2 adds an intermediate output (XShape) on top of squeeze.
// XShape carries the shape and LoD of X, which squeeze_grad needs, so the
// framework can reuse the memory of X as soon as the squeeze2 op finishes.
// For compatibility reasons, the original squeeze op could not be changed
// in place, hence the separate squeeze2 op.
class Squeeze2OpMaker : public SqueezeOpMaker {
public:
void Make() override {
SqueezeOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in SqueezeGradOp.")
.AsIntermediate();
}
};
class Squeeze2OpInferShape : public SqueezeOpInferShape {
public:
void operator()(framework::InferShapeContext *ctx) const override {
SqueezeOpInferShape::operator()(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) of Squeeze operator should not be null.");
const auto &x_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
};
class Squeeze2Op : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axes = Attr<std::vector<int>>("axes");
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims);
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(out_dims);
// Invoke Reshape Op
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class Squeeze2GradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("squeeze2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Squeeze2GradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("XShape"),
"Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = context->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
context->SetOutputDim(framework::GradVarName("X"), x_dims);
context->ShareLoD("XShape", framework::GradVarName("X"));
}
};
class Squeeze2GradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto xshape_name = Input("XShape");
auto xshape_dims =
scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims);
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {dout_name}}, {"Shape", {}}},
{{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
reshape_op->Run(scope, place);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -192,3 +299,8 @@ REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker, ...@@ -192,3 +299,8 @@ REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
ops::SqueezeOpInferShape, ops::SqueezeOpInferShape,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape); REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape);
REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker,
ops::Squeeze2OpInferShape, ops::Squeeze2GradOpMaker);
REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
ops::Squeeze2GradInferShape);
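A rough NumPy sketch of the XShape convention used by squeeze2 here (and by transpose2 and unsqueeze2 below); the arrays and shapes are made up for illustration:

```python
import numpy as np

x = np.random.rand(3, 1, 4)

# Forward: squeeze2 behaves like squeeze but also emits XShape, a dummy
# output whose dims are [0] + x.shape; only the shape/LoD metadata matters.
out = np.squeeze(x, axis=1)
xshape_dims = [0] + list(x.shape)      # -> [0, 3, 1, 4]

# Backward: the grad op reads only XShape (not X), so the framework can
# release or reuse X's memory as soon as the forward op finishes.
dout = np.ones_like(out)
dx = dout.reshape(xshape_dims[1:])     # dX recovered with X's original shape
assert dx.shape == x.shape
```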
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/transpose_op.h"
#include <string>
#include <vector> #include <vector>
namespace paddle { namespace paddle {
...@@ -24,7 +25,7 @@ class TransposeOp : public framework::OperatorWithKernel { ...@@ -24,7 +25,7 @@ class TransposeOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
...@@ -101,7 +102,7 @@ class TransposeOpGrad : public framework::OperatorWithKernel { ...@@ -101,7 +102,7 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null"); "Input(Out@GRAD) should not be null");
...@@ -113,6 +114,93 @@ class TransposeOpGrad : public framework::OperatorWithKernel { ...@@ -113,6 +114,93 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
} }
}; };
// FIXME(zcd): transpose2 adds an intermediate output (XShape) on top of
// transpose. XShape carries the shape and LoD of X, which transpose_grad
// needs, so the framework can reuse the memory of X as soon as the
// transpose2 op finishes. For compatibility reasons, the original transpose
// op could not be changed in place, hence the separate transpose2 op.
class Transpose2Op : public TransposeOp {
public:
Transpose2Op(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: TransposeOp(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
TransposeOp::InferShape(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) should not be null");
const auto &in_dims = ctx->GetInputDim("X");
std::vector<int64_t> x_shape_dim(in_dims.size() + 1);
x_shape_dim[0] = 0;
for (int i = 0; i < in_dims.size(); ++i) {
x_shape_dim[i + 1] = in_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(x_shape_dim));
ctx->ShareLoD("X", /*->*/ "XShape");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
};
class Transpose2OpMaker : public TransposeOpMaker {
public:
void Make() override {
TransposeOpMaker::Make();
AddOutput("XShape", "(Tensor)The output tensor.").AsIntermediate();
}
};
class Transpose2GradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("transpose2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Transpose2OpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
if (ctx->HasOutput(framework::GradVarName("X"))) {
auto xshape_dim = ctx->GetInputDim("XShape");
auto x_shape_dim =
framework::slice_ddim(xshape_dim, 1, xshape_dim.size());
ctx->SetOutputDim(framework::GradVarName("X"), x_shape_dim);
ctx->ShareLoD("XShape", framework::GradVarName("X"));
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
->type()),
ctx.device_context());
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -120,8 +208,20 @@ namespace ops = paddle::operators; ...@@ -120,8 +208,20 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker, REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad); REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>); transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
transpose_grad, transpose_grad,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>); ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker,
ops::Transpose2GradMaker);
REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad);
REGISTER_OP_CPU_KERNEL(
transpose2,
ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
transpose2_grad,
ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
...@@ -21,3 +21,10 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -21,3 +21,10 @@ REGISTER_OP_CUDA_KERNEL(
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
transpose_grad, transpose_grad,
ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>); ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
transpose2,
ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
transpose2_grad,
ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
...@@ -168,6 +168,112 @@ class UnsqueezeGradOp : public framework::OperatorBase { ...@@ -168,6 +168,112 @@ class UnsqueezeGradOp : public framework::OperatorBase {
} }
}; };
// FIXME(zcd): unsqueeze2 adds an intermediate output (XShape) on top of
// unsqueeze. XShape carries the shape and LoD of X, which unsqueeze_grad
// needs, so the framework can reuse the memory of X as soon as the
// unsqueeze2 op finishes. For compatibility reasons, the original unsqueeze
// op could not be changed in place, hence the separate unsqueeze2 op.
class Unsqueeze2OpInferShape : public UnsqueezeOpInferShape {
public:
void operator()(framework::InferShapeContext *ctx) const override {
UnsqueezeOpInferShape::operator()(ctx);
PADDLE_ENFORCE(ctx->HasOutput("XShape"),
"Output(XShape) of Unsqueeze operator should not be null.");
const auto &x_dims = ctx->GetInputDim("X");
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
};
class Unsqueeze2OpMaker : public UnsqueezeOpMaker {
public:
void Make() override {
UnsqueezeOpMaker::Make();
AddOutput("XShape",
"XShape is just used to store the shape and lod of X, which will "
"be used in UnsqueezeGradOp.")
.AsIntermediate();
}
};
class Unsqueeze2Op : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto &axes = Attr<std::vector<int>>("axes");
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
auto out_dims = Unsqueeze2OpInferShape::GetOutputShape(axes, x_dims);
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(out_dims);
// Invoke Reshape op.
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
{{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs);
reshape_op->Run(scope, place);
}
};
class Unsqueeze2GradOpMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDesc();
grad_op->SetType("unsqueeze2_grad");
grad_op->SetInput("XShape", Output("XShape"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class Unsqueeze2GradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("XShape"),
"Input(XShape) shouldn't be null.");
PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
auto xshape_dims = context->GetInputDim("XShape");
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
context->SetOutputDim(framework::GradVarName("X"), x_dims);
context->ShareLoD("XShape", framework::GradVarName("X"));
}
};
class Unsqueeze2GradOp : public framework::OperatorBase {
public:
using OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
auto dx_name = Output(framework::GradVarName("X"));
auto dout_name = Input(framework::GradVarName("Out"));
auto xshape_name = Input("XShape");
auto xshape_dims =
scope.FindVar(xshape_name)->Get<framework::LoDTensor>().dims();
auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims);
auto reshape_op = framework::OpRegistry::CreateOp(
"reshape2", {{"X", {dout_name}}, {"Shape", {}}},
{{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs);
reshape_op->Run(scope, place);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -180,3 +286,8 @@ REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker, ...@@ -180,3 +286,8 @@ REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp, REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
ops::UnsqueezeGradInferShape); ops::UnsqueezeGradInferShape);
REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker,
ops::Unsqueeze2OpInferShape, ops::Unsqueeze2GradOpMaker);
REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
ops::Unsqueeze2GradInferShape);
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <cfloat>
// Disable the copy and assignment operator for a class. // Disable the copy and assignment operator for a class.
#ifndef DISABLE_COPY_AND_ASSIGN #ifndef DISABLE_COPY_AND_ASSIGN
...@@ -23,3 +24,7 @@ limitations under the License. */ ...@@ -23,3 +24,7 @@ limitations under the License. */
classname& operator=(const classname&) = delete; \ classname& operator=(const classname&) = delete; \
classname& operator=(classname&&) = delete classname& operator=(classname&&) = delete
#endif #endif
#if defined(__FLT_MAX__)
#define FLT_MAX __FLT_MAX__
#endif // __FLT_MAX__
...@@ -14,24 +14,141 @@ ...@@ -14,24 +14,141 @@
#pragma once #pragma once
#include <cstdio>
#include <stdexcept> #include <stdexcept>
#include <memory>
#include <string> #include <string>
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "glog/logging.h"
#if !defined(_WIN32) #if !defined(_WIN32)
#include <dlfcn.h> // for dladdr #define UNUSED __attribute__((unused))
#include <execinfo.h> // for backtrace #include <dlfcn.h> // dladdr
#include <execinfo.h> // backtrace
#include <sys/stat.h>
#include <algorithm> // std::accumulate
#else #else
#include <Shlwapi.h> #include <io.h> // _popen, _pclose
#include <Windows.h> #include <windows.h>
#if defined(_WIN32)
#include <numeric> // std::accumulate in msvc
#endif
// windows version of __attribute__((unused))
#define UNUSED __pragma(warning(suppress : 4100))
static void* dlsym(void* handle, const char* symbol_name) { #ifndef S_ISDIR // windows port for sys/stat.h
#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
#endif // S_ISDIR
static void *dlsym(void *handle, const char *symbol_name) {
FARPROC found_symbol; FARPROC found_symbol;
found_symbol = GetProcAddress((HMODULE)handle, symbol_name); found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
if (found_symbol == NULL) { if (found_symbol == NULL) {
throw std::runtime_error(std::string(symbol_name) + " not found."); throw std::runtime_error(std::string(symbol_name) + " not found.");
} }
return reinterpret_cast<void*>(found_symbol); return reinterpret_cast<void *>(found_symbol);
} }
#endif static void *dlopen(const char *filename, int flag) {
std::string file_name(filename);
// Convert forward slashes to backslashes before calling LoadLibrary.
for (auto &ch : file_name) {
  if (ch == '/') ch = '\\';
}
HMODULE hModule = LoadLibrary(file_name.c_str());
if (!hModule) {
throw std::runtime_error(file_name + " not found.");
}
return reinterpret_cast<void *>(hModule);
}
#endif // !_WIN32
static void ExecShellCommand(const std::string &cmd, std::string *message) {
char buffer[128];
#if !defined(_WIN32)
std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
#else
std::shared_ptr<FILE> pipe(_popen(cmd.c_str(), "r"), _pclose);
#endif // _WIN32
if (!pipe) {
LOG(ERROR) << "error running command: " << cmd;
return;
}
while (!feof(pipe.get())) {
if (fgets(buffer, 128, pipe.get()) != nullptr) {
*message += buffer;
}
}
}
static bool PathExists(const std::string &path) {
#if !defined(_WIN32)
struct stat statbuf;
if (stat(path.c_str(), &statbuf) != -1) {
if (S_ISDIR(statbuf.st_mode)) {
return true;
}
}
#else
struct _stat statbuf;
if (_stat(path.c_str(), &statbuf) != -1) {
if (S_ISDIR(statbuf.st_mode)) {
return true;
}
}
#endif // !_WIN32
return false;
}
// TODO(yuyang18): If the functions below are needed by other files, move them
// to paddle::filesystem namespace.
#if !defined(_WIN32)
constexpr char kSEP = '/';
#else
constexpr char kSEP = '\\';
#endif // _WIN32
static bool FileExists(const std::string &filepath) {
#if !defined(_WIN32)
struct stat buffer;
return (stat(filepath.c_str(), &buffer) == 0);
#else
struct _stat buffer;
return (_stat(filepath.c_str(), &buffer) == 0);
#endif // !_WIN32
}
static std::string DirName(const std::string &filepath) {
auto pos = filepath.rfind(kSEP);
if (pos == std::string::npos) {
return "";
}
return filepath.substr(0, pos);
}
static void MkDir(const char *path) {
std::string path_error(path);
path_error += " mkdir failed!";
#if !defined(_WIN32)
if (mkdir(path, 0755)) {
if (errno != EEXIST) {
throw std::runtime_error(path_error);
}
}
#else
CreateDirectory(path, NULL);
auto errorno = GetLastError();
if (errorno != ERROR_ALREADY_EXISTS) {
throw std::runtime_error(path_error);
}
#endif // !_WIN32
}
static void MkDirRecursively(const char *fullpath) {
if (*fullpath == '\0') return; // empty string
if (FileExists(fullpath)) return;
MkDirRecursively(DirName(fullpath).c_str());
MkDir(fullpath);
}
...@@ -115,6 +115,7 @@ function cmake_gen() { ...@@ -115,6 +115,7 @@ function cmake_gen() {
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_INFERENCE=${WITH_INFERENCE:-ON}
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
-DPY_VERSION=${PY_VERSION:-2.7} -DPY_VERSION=${PY_VERSION:-2.7}
======================================== ========================================
...@@ -144,6 +145,7 @@ EOF ...@@ -144,6 +145,7 @@ EOF
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
-DPY_VERSION=${PY_VERSION:-2.7} -DPY_VERSION=${PY_VERSION:-2.7}
} }
......
...@@ -98,10 +98,9 @@ class Inferencer(object): ...@@ -98,10 +98,9 @@ class Inferencer(object):
raise ValueError( raise ValueError(
"inputs should be a map of {'input_name': input_var}") "inputs should be a map of {'input_name': input_var}")
with executor.scope_guard(self.scope): with self._prog_and_scope_guard():
results = self.exe.run(self.inference_program, results = self.exe.run(feed=inputs,
feed=inputs, fetch_list=[self.predict_var.name],
fetch_list=[self.predict_var],
return_numpy=return_numpy) return_numpy=return_numpy)
return results return results
......
...@@ -145,26 +145,23 @@ def rpn_target_assign(loc, ...@@ -145,26 +145,23 @@ def rpn_target_assign(loc,
""" """
helper = LayerHelper('rpn_target_assign', **locals()) helper = LayerHelper('rpn_target_assign', **locals())
# 1. Compute the regression target bboxes # Compute overlaps between the prior boxes and the gt boxes overlaps
target_bbox = box_coder(
prior_box=anchor_box,
prior_box_var=anchor_var,
target_box=gt_box,
code_type='encode_center_size',
box_normalized=False)
# 2. Compute overlaps between the prior boxes and the gt boxes overlaps
iou = iou_similarity(x=gt_box, y=anchor_box) iou = iou_similarity(x=gt_box, y=anchor_box)
# 3. Assign target label to anchors # Assign target label to anchors
loc_index = helper.create_tmp_variable(dtype=anchor_box.dtype) loc_index = helper.create_tmp_variable(dtype='int32')
score_index = helper.create_tmp_variable(dtype=anchor_box.dtype) score_index = helper.create_tmp_variable(dtype='int32')
target_label = helper.create_tmp_variable(dtype=anchor_box.dtype) target_label = helper.create_tmp_variable(dtype='int64')
target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype)
helper.append_op( helper.append_op(
type="rpn_target_assign", type="rpn_target_assign",
inputs={'DistMat': iou}, inputs={'Anchor': anchor_box,
'GtBox': gt_box,
'DistMat': iou},
outputs={ outputs={
'LocationIndex': loc_index, 'LocationIndex': loc_index,
'ScoreIndex': score_index, 'ScoreIndex': score_index,
'TargetLabel': target_label 'TargetLabel': target_label,
'TargetBBox': target_bbox,
}, },
attrs={ attrs={
'rpn_batch_size_per_im': rpn_batch_size_per_im, 'rpn_batch_size_per_im': rpn_batch_size_per_im,
...@@ -173,16 +170,16 @@ def rpn_target_assign(loc, ...@@ -173,16 +170,16 @@ def rpn_target_assign(loc,
'fg_fraction': fg_fraction 'fg_fraction': fg_fraction
}) })
# 4. Reshape and gather the target entry loc_index.stop_gradient = True
scores = nn.reshape(x=scores, shape=(-1, 2)) score_index.stop_gradient = True
loc = nn.reshape(x=loc, shape=(-1, 4)) target_label.stop_gradient = True
target_label = nn.reshape(x=target_label, shape=(-1, 1)) target_bbox.stop_gradient = True
target_bbox = nn.reshape(x=target_bbox, shape=(-1, 4))
scores = nn.reshape(x=scores, shape=(-1, 1))
loc = nn.reshape(x=loc, shape=(-1, 4))
predicted_scores = nn.gather(scores, score_index) predicted_scores = nn.gather(scores, score_index)
predicted_location = nn.gather(loc, loc_index) predicted_location = nn.gather(loc, loc_index)
target_label = nn.gather(target_label, score_index)
target_bbox = nn.gather(target_bbox, loc_index)
return predicted_scores, predicted_location, target_label, target_bbox return predicted_scores, predicted_location, target_label, target_bbox
......
...@@ -78,7 +78,7 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -78,7 +78,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
return acc_out return acc_out
def auc(input, label, curve='ROC', num_thresholds=200, topk=1): def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
""" """
**Area Under the Curve (AUC) Layer** **Area Under the Curve (AUC) Layer**
...@@ -118,16 +118,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1): ...@@ -118,16 +118,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
""" """
helper = LayerHelper("auc", **locals()) helper = LayerHelper("auc", **locals())
auc_out = helper.create_tmp_variable(dtype="float64") auc_out = helper.create_tmp_variable(dtype="float64")
batch_auc_out = helper.create_tmp_variable(dtype="float64")
# make tp, tn, fp, fn persistable, so that can accumulate all batches. # make tp, tn, fp, fn persistable, so that can accumulate all batches.
tp = helper.create_global_variable( stat_pos = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds]) persistable=True, dtype='int64', shape=[num_thresholds + 1])
tn = helper.create_global_variable( stat_neg = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds]) persistable=True, dtype='int64', shape=[num_thresholds + 1])
fp = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds]) for var in [stat_pos, stat_neg]:
fn = helper.create_global_variable(
persistable=True, dtype='int64', shape=[num_thresholds])
for var in [tp, tn, fp, fn]:
helper.set_variable_initializer( helper.set_variable_initializer(
var, Constant( var, Constant(
value=0.0, force_cpu=True)) value=0.0, force_cpu=True))
...@@ -137,18 +135,15 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1): ...@@ -137,18 +135,15 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
inputs={ inputs={
"Predict": [input], "Predict": [input],
"Label": [label], "Label": [label],
"TP": [tp], "StatPos": [stat_pos],
"TN": [tn], "StatNeg": [stat_neg]
"FP": [fp],
"FN": [fn]
}, },
attrs={"curve": curve, attrs={"curve": curve,
"num_thresholds": num_thresholds}, "num_thresholds": num_thresholds},
outputs={ outputs={
"AUC": [auc_out], "AUC": [auc_out],
"TPOut": [tp], "BatchAUC": [batch_auc_out],
"TNOut": [tn], "StatPosOut": [stat_pos],
"FPOut": [fp], "StatNegOut": [stat_neg]
"FNOut": [fn]
}) })
return auc_out, [tp, tn, fp, fn] return auc_out, batch_auc_out, [stat_pos, stat_neg]
...@@ -111,6 +111,7 @@ __all__ = [ ...@@ -111,6 +111,7 @@ __all__ = [
'stack', 'stack',
'pad2d', 'pad2d',
'unstack', 'unstack',
'sequence_enumerate',
] ]
...@@ -3545,11 +3546,6 @@ def topk(input, k, name=None): ...@@ -3545,11 +3546,6 @@ def topk(input, k, name=None):
top5_values, top5_indices = layers.topk(input, k=5) top5_values, top5_indices = layers.topk(input, k=5)
""" """
shape = input.shape
if k < 1 or k >= shape[-1]:
raise ValueError("k must be greater than 0 and less than %d." %
(shape[-1]))
helper = LayerHelper("top_k", **locals()) helper = LayerHelper("top_k", **locals())
values = helper.create_tmp_variable(dtype=input.dtype) values = helper.create_tmp_variable(dtype=input.dtype)
indices = helper.create_tmp_variable(dtype="int64") indices = helper.create_tmp_variable(dtype="int64")
...@@ -4029,10 +4025,12 @@ def transpose(x, perm, name=None): ...@@ -4029,10 +4025,12 @@ def transpose(x, perm, name=None):
helper = LayerHelper('transpose', **locals()) helper = LayerHelper('transpose', **locals())
out = helper.create_tmp_variable(x.dtype) out = helper.create_tmp_variable(x.dtype)
x_shape = helper.create_tmp_variable(x.dtype)
helper.append_op( helper.append_op(
type='transpose', type='transpose2',
inputs={'X': [x]}, inputs={'X': [x]},
outputs={'Out': [out]}, outputs={'Out': [out],
'XShape': [x_shape]},
attrs={'axis': perm}) attrs={'axis': perm})
return out return out
...@@ -4524,13 +4522,15 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4524,13 +4522,15 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
"Each dimension size given in shape must not be negtive " "Each dimension size given in shape must not be negtive "
"except one unknown dimension.") "except one unknown dimension.")
helper = LayerHelper("reshape", **locals()) helper = LayerHelper("reshape2", **locals())
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_tmp_variable(dtype=x.dtype)
x_shape = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op( helper.append_op(
type="reshape", type="reshape2",
inputs=inputs, inputs=inputs,
attrs={"shape": shape}, attrs={"shape": shape},
outputs={"Out": out}) outputs={"Out": out,
"XShape": x_shape})
return helper.append_activation(out) return helper.append_activation(out)
...@@ -4574,11 +4574,13 @@ def squeeze(input, axes, name=None): ...@@ -4574,11 +4574,13 @@ def squeeze(input, axes, name=None):
""" """
helper = LayerHelper("squeeze", **locals()) helper = LayerHelper("squeeze", **locals())
out = helper.create_tmp_variable(dtype=input.dtype) out = helper.create_tmp_variable(dtype=input.dtype)
x_shape = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op( helper.append_op(
type="squeeze", type="squeeze2",
inputs={"X": input}, inputs={"X": input},
attrs={"axes": axes}, attrs={"axes": axes},
outputs={"Out": out}) outputs={"Out": out,
"XShape": x_shape})
return out return out
...@@ -4609,11 +4611,13 @@ def unsqueeze(input, axes, name=None): ...@@ -4609,11 +4611,13 @@ def unsqueeze(input, axes, name=None):
""" """
helper = LayerHelper("unsqueeze", **locals()) helper = LayerHelper("unsqueeze", **locals())
out = helper.create_tmp_variable(dtype=input.dtype) out = helper.create_tmp_variable(dtype=input.dtype)
x_shape = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op( helper.append_op(
type="unsqueeze", type="unsqueeze2",
inputs={"X": input}, inputs={"X": input},
attrs={"axes": axes}, attrs={"axes": axes},
outputs={"Out": out}) outputs={"Out": out,
"XShape": x_shape})
return out return out
...@@ -5815,14 +5819,61 @@ def flatten(x, axis=1, name=None): ...@@ -5815,14 +5819,61 @@ def flatten(x, axis=1, name=None):
raise ValueError("The axis should be a int, and in range [0, rank(x)]") raise ValueError("The axis should be a int, and in range [0, rank(x)]")
out = helper.create_tmp_variable(x.dtype) out = helper.create_tmp_variable(x.dtype)
x_shape = helper.create_tmp_variable(x.dtype)
helper.append_op( helper.append_op(
type='flatten', type='flatten2',
inputs={"X": x}, inputs={"X": x},
outputs={'Out': out}, outputs={'Out': out,
'XShape': x_shape},
attrs={"axis": axis}) attrs={"axis": axis})
return out return out
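For context, a hedged usage sketch showing that the Python-level API of these layers is unchanged by the switch to the *2 ops; the XShape output they now emit stays internal (the variable names below are made up):

```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[3, 1, 4], dtype='float32')  # [-1, 3, 1, 4]
y = fluid.layers.transpose(x, perm=[0, 3, 1, 2])  # now appends a transpose2 op
z = fluid.layers.squeeze(input=x, axes=[2])       # now appends a squeeze2 op
```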
def sequence_enumerate(input, win_size, pad_value=0, name=None):
"""
Generate a new sequence for the input index sequence, which enumerates all the
sub-sequences with length `win_size` of the input.
The enumerated sequence has the same 1st dimension as variable `input`, and
its 2nd dimension is `win_size`, padded with `pad_value` where necessary during generation.
Examples:
Case 1:
Input:
X.lod = [[0, 3, 5]]
X.data = [[1], [2], [3], [4], [5]]
X.dims = [5, 1]
Attrs:
win_size = 2
pad_value = 0
Output:
Out.lod = [[0, 3, 5]]
Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
Out.dims = [5, 2]
Args:
input (Variable): The input variable, which is an index sequence.
win_size (int): The window size for enumerating all sub-sequences.
pad_value (int): The padding value, default 0.
Returns:
Variable: The enumerate sequence variable which is a LoDTensor.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[30, 1], dtype='int32', lod_level=1)
out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
"""
helper = LayerHelper('sequence_enumerate', **locals())
out = helper.create_tmp_variable(helper.input_dtype(), stop_gradient=True)
helper.append_op(
type='sequence_enumerate',
inputs={'X': input},
outputs={'Out': out},
attrs={'win_size': win_size,
'pad_value': pad_value})
return out
def sequence_mask(x, maxlen=None, dtype='int64', name=None): def sequence_mask(x, maxlen=None, dtype='int64', name=None):
""" """
**SequenceMask Layer** **SequenceMask Layer**
...@@ -5902,6 +5953,7 @@ def stack(x, axis=0): ...@@ -5902,6 +5953,7 @@ def stack(x, axis=0):
helper.append_op( helper.append_op(
type='stack', inputs={'X': x}, outputs={'Y': out}, type='stack', inputs={'X': x}, outputs={'Y': out},
attrs={'axis': axis}) attrs={'axis': axis})
return out return out
......
...@@ -558,8 +558,6 @@ class Auc(MetricBase): ...@@ -558,8 +558,6 @@ class Auc(MetricBase):
name: metric name name: metric name
curve: Specifies the name of the curve to be computed, 'ROC' [default] or curve: Specifies the name of the curve to be computed, 'ROC' [default] or
'PR' for the Precision-Recall-curve. 'PR' for the Precision-Recall-curve.
num_thresholds: The number of thresholds to use when discretizing the roc
curve.
"NOTE: only implement the ROC curve type via Python now." "NOTE: only implement the ROC curve type via Python now."
...@@ -574,15 +572,14 @@ class Auc(MetricBase): ...@@ -574,15 +572,14 @@ class Auc(MetricBase):
numpy_auc = metric.eval() numpy_auc = metric.eval()
""" """
def __init__(self, name, curve='ROC', num_thresholds=200): def __init__(self, name, curve='ROC', num_thresholds=4095):
super(Auc, self).__init__(name=name) super(Auc, self).__init__(name=name)
self._curve = curve self._curve = curve
self._num_thresholds = num_thresholds self._num_thresholds = num_thresholds
self._epsilon = 1e-6
self.tp_list = np.zeros((num_thresholds, )) _num_pred_buckets = num_thresholds + 1
self.fn_list = np.zeros((num_thresholds, )) self._stat_pos = [0] * _num_pred_buckets
self.tn_list = np.zeros((num_thresholds, )) self._stat_neg = [0] * _num_pred_buckets
self.fp_list = np.zeros((num_thresholds, ))
def update(self, preds, labels): def update(self, preds, labels):
if not _is_numpy_(labels): if not _is_numpy_(labels):
...@@ -590,41 +587,32 @@ class Auc(MetricBase): ...@@ -590,41 +587,32 @@ class Auc(MetricBase):
if not _is_numpy_(preds): if not _is_numpy_(preds):
raise ValueError("The 'predictions' must be a numpy ndarray.") raise ValueError("The 'predictions' must be a numpy ndarray.")
kepsilon = 1e-7 # to account for floating point imprecisions
thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1)
for i in range(self._num_thresholds - 2)]
thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
# calculate TP, FN, TN, FP count
for idx_thresh, thresh in enumerate(thresholds):
tp, fn, tn, fp = 0, 0, 0, 0
for i, lbl in enumerate(labels): for i, lbl in enumerate(labels):
value = preds[i, 1]
bin_idx = int(value * self._num_thresholds)
assert bin_idx <= self._num_thresholds
if lbl: if lbl:
if preds[i, 1] >= thresh: self._stat_pos[bin_idx] += 1.0
tp += 1
else:
fn += 1
else: else:
if preds[i, 1] >= thresh: self._stat_neg[bin_idx] += 1.0
fp += 1
else: @staticmethod
tn += 1 def trapezoid_area(x1, x2, y1, y2):
self.tp_list[idx_thresh] += tp return abs(x1 - x2) * (y1 + y2) / 2.0
self.fn_list[idx_thresh] += fn
self.tn_list[idx_thresh] += tn
self.fp_list[idx_thresh] += fp
def eval(self): def eval(self):
epsilon = self._epsilon tot_pos = 0.0
num_thresholds = self._num_thresholds tot_neg = 0.0
tpr = (self.tp_list.astype("float32") + epsilon) / ( auc = 0.0
self.tp_list + self.fn_list + epsilon)
fpr = self.fp_list.astype("float32") / ( idx = self._num_thresholds
self.fp_list + self.tn_list + epsilon) while idx >= 0:
rec = (self.tp_list.astype("float32") + epsilon) / ( tot_pos_prev = tot_pos
self.tp_list + self.fp_list + epsilon) tot_neg_prev = tot_neg
tot_pos += self._stat_pos[idx]
x = fpr[:num_thresholds - 1] - fpr[1:] tot_neg += self._stat_neg[idx]
y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0 auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos,
auc_value = np.sum(x * y) tot_pos_prev)
return auc_value idx -= 1
return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0
...@@ -897,7 +897,20 @@ class RMSPropOptimizer(Optimizer): ...@@ -897,7 +897,20 @@ class RMSPropOptimizer(Optimizer):
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) + v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
\\epsilon}} \\nabla Q_{i}(w)
w & = w - v(w, t)
if centered is True:
.. math::
r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
\\epsilon}} \\nabla Q_{i}(w) \\epsilon}} \\nabla Q_{i}(w)
w & = w - v(w, t) w & = w - v(w, t)
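The equations above translate directly into a single parameter update. Below is a small NumPy sketch of one (optionally centered) RMSProp step that follows those formulas; it is not the fluid kernel, and the function and variable names are purely illustrative.

```python
import numpy as np

def rmsprop_step(param, grad, mean_square, mean_grad, moment,
                 lr=0.01, rho=0.95, epsilon=1e-6, momentum=0.0, centered=True):
    mean_square = rho * mean_square + (1 - rho) * grad * grad       # r(w, t)
    if centered:
        mean_grad = rho * mean_grad + (1 - rho) * grad              # g(w, t)
        denom = np.sqrt(mean_square - np.square(mean_grad) + epsilon)
    else:
        denom = np.sqrt(mean_square + epsilon)
    moment = momentum * moment + lr * grad / denom                  # v(w, t)
    param = param - moment                                          # w update
    return param, mean_square, mean_grad, moment

p = np.ones(3)
g = np.array([0.1, -0.2, 0.3])
state = (np.zeros(3), np.zeros(3), np.zeros(3))   # mean_square, mean_grad, moment
p, ms, mg, mo = rmsprop_step(p, g, *state)
```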
...@@ -915,6 +928,10 @@ class RMSPropOptimizer(Optimizer): ...@@ -915,6 +928,10 @@ class RMSPropOptimizer(Optimizer):
avoid division by zero, set 1e-6 by default. avoid division by zero, set 1e-6 by default.
momentum(float): :math:`\\beta` in equation is the momentum term, momentum(float): :math:`\\beta` in equation is the momentum term,
set 0.0 by default. set 0.0 by default.
centered(bool): If True, gradients are normalized by the estimated variance of
the gradient; if False, by the uncentered second moment. Setting this to
True may help with training, but is slightly more expensive in terms of
computation and memory. Defaults to False.
Raises: Raises:
ValueError: If learning_rate, rho, epsilon, momentum are None. ValueError: If learning_rate, rho, epsilon, momentum are None.
...@@ -928,12 +945,14 @@ class RMSPropOptimizer(Optimizer): ...@@ -928,12 +945,14 @@ class RMSPropOptimizer(Optimizer):
_momentum_acc_str = "momentum" _momentum_acc_str = "momentum"
_mean_square_acc_str = "mean_square" _mean_square_acc_str = "mean_square"
_mean_grad_acc_str = "mean_grad"
def __init__(self, def __init__(self,
learning_rate, learning_rate,
rho=0.95, rho=0.95,
epsilon=1.0e-6, epsilon=1.0e-6,
momentum=0.0, momentum=0.0,
centered=False,
**kwargs): **kwargs):
super(RMSPropOptimizer, self).__init__( super(RMSPropOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs) learning_rate=learning_rate, **kwargs)
...@@ -950,6 +969,7 @@ class RMSPropOptimizer(Optimizer): ...@@ -950,6 +969,7 @@ class RMSPropOptimizer(Optimizer):
self._rho = rho self._rho = rho
self._epsilon = epsilon self._epsilon = epsilon
self._momentum = momentum self._momentum = momentum
self._centered = centered
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block): if not isinstance(block, framework.Block):
...@@ -958,6 +978,7 @@ class RMSPropOptimizer(Optimizer): ...@@ -958,6 +978,7 @@ class RMSPropOptimizer(Optimizer):
for p in parameters: for p in parameters:
self._add_accumulator(self._momentum_acc_str, p) self._add_accumulator(self._momentum_acc_str, p)
self._add_accumulator(self._mean_square_acc_str, p) self._add_accumulator(self._mean_square_acc_str, p)
self._add_accumulator(self._mean_grad_acc_str, p)
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
if not isinstance(block, framework.Block): if not isinstance(block, framework.Block):
...@@ -967,6 +988,8 @@ class RMSPropOptimizer(Optimizer): ...@@ -967,6 +988,8 @@ class RMSPropOptimizer(Optimizer):
param_and_grad[0]) param_and_grad[0])
mean_square_acc = self._get_accumulator(self._mean_square_acc_str, mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
param_and_grad[0]) param_and_grad[0])
mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
param_and_grad[0])
rmsprop_op = block.append_op( rmsprop_op = block.append_op(
type=self.type, type=self.type,
inputs={ inputs={
...@@ -974,17 +997,20 @@ class RMSPropOptimizer(Optimizer): ...@@ -974,17 +997,20 @@ class RMSPropOptimizer(Optimizer):
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"Moment": momentum_acc, "Moment": momentum_acc,
"MeanSquare": mean_square_acc, "MeanSquare": mean_square_acc,
"MeanGrad": mean_grad_acc,
"LearningRate": self._create_param_lr(param_and_grad), "LearningRate": self._create_param_lr(param_and_grad),
}, },
outputs={ outputs={
"ParamOut": param_and_grad[0], "ParamOut": param_and_grad[0],
"MomentOut": momentum_acc, "MomentOut": momentum_acc,
"MeanSquareOut": mean_square_acc "MeanSquareOut": mean_square_acc,
"MeanGradOut": mean_grad_acc
}, },
attrs={ attrs={
"epsilon": self._epsilon, "epsilon": self._epsilon,
"decay": self._rho, "decay": self._rho,
"momentum": self._momentum "momentum": self._momentum,
"centered": self._centered
}) })
return rmsprop_op return rmsprop_op
......
...@@ -16,7 +16,9 @@ from __future__ import print_function ...@@ -16,7 +16,9 @@ from __future__ import print_function
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy import numpy
import os
import cifar10_small_test_set import cifar10_small_test_set
...@@ -89,7 +91,7 @@ def optimizer_func(): ...@@ -89,7 +91,7 @@ def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001) return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname): def train(use_cuda, train_program, parallel, params_dirname):
BATCH_SIZE = 128 BATCH_SIZE = 128
EPOCH_NUM = 1 EPOCH_NUM = 1
...@@ -116,7 +118,10 @@ def train(use_cuda, train_program, params_dirname): ...@@ -116,7 +118,10 @@ def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer( trainer = fluid.Trainer(
train_func=train_program, optimizer_func=optimizer_func, place=place) train_func=train_program,
optimizer_func=optimizer_func,
place=place,
parallel=parallel)
trainer.train( trainer.train(
reader=train_reader, reader=train_reader,
...@@ -125,10 +130,13 @@ def train(use_cuda, train_program, params_dirname): ...@@ -125,10 +130,13 @@ def train(use_cuda, train_program, params_dirname):
feed_order=['pixel', 'label']) feed_order=['pixel', 'label'])
def infer(use_cuda, inference_program, params_dirname=None): def infer(use_cuda, inference_program, parallel, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer( inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=params_dirname, place=place) infer_func=inference_program,
param_path=params_dirname,
place=place,
parallel=parallel)
# The input's dimension of conv should be 4-D or 5-D. # The input's dimension of conv should be 4-D or 5-D.
# Use normalized image pixels as input data, which should be in the range # Use normalized image pixels as input data, which should be in the range
...@@ -139,22 +147,34 @@ def infer(use_cuda, inference_program, params_dirname=None): ...@@ -139,22 +147,34 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results) print("infer results: ", results)
def main(use_cuda): def main(use_cuda, parallel):
if use_cuda and not fluid.core.is_compiled_with_cuda(): if use_cuda and not fluid.core.is_compiled_with_cuda():
return return
save_path = "image_classification_resnet.inference.model" save_path = "image_classification_resnet.inference.model"
os.environ['CPU_NUM'] = str(4)
train( train(
use_cuda=use_cuda, use_cuda=use_cuda,
train_program=train_network, train_program=train_network,
params_dirname=save_path) params_dirname=save_path,
parallel=parallel)
# FIXME(zcd): in the inference stage, there is only one
# input sample, so it is not appropriate to use parallel execution.
if parallel and use_cuda:
return
os.environ['CPU_NUM'] = str(1)
infer( infer(
use_cuda=use_cuda, use_cuda=use_cuda,
inference_program=inference_network, inference_program=inference_network,
params_dirname=save_path) params_dirname=save_path,
parallel=parallel)
if __name__ == '__main__': if __name__ == '__main__':
for use_cuda in (False, True): for use_cuda in (False, True):
main(use_cuda=use_cuda) for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
main(use_cuda=use_cuda, parallel=parallel)
...@@ -17,7 +17,9 @@ from __future__ import print_function ...@@ -17,7 +17,9 @@ from __future__ import print_function
import six import six
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy import numpy
import os
import cifar10_small_test_set import cifar10_small_test_set
...@@ -69,7 +71,7 @@ def optimizer_func(): ...@@ -69,7 +71,7 @@ def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001) return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname): def train(use_cuda, train_program, parallel, params_dirname):
BATCH_SIZE = 128 BATCH_SIZE = 128
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
...@@ -94,7 +96,10 @@ def train(use_cuda, train_program, params_dirname): ...@@ -94,7 +96,10 @@ def train(use_cuda, train_program, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer( trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer_func=optimizer_func) train_func=train_program,
place=place,
optimizer_func=optimizer_func,
parallel=parallel)
if six.PY2: if six.PY2:
trainer.train( trainer.train(
...@@ -114,10 +119,13 @@ def train(use_cuda, train_program, params_dirname): ...@@ -114,10 +119,13 @@ def train(use_cuda, train_program, params_dirname):
assert ("kid scope" in cpt.get_exception_message(ex)) assert ("kid scope" in cpt.get_exception_message(ex))
def infer(use_cuda, inference_program, params_dirname=None): def infer(use_cuda, inference_program, parallel, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer( inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=params_dirname, place=place) infer_func=inference_program,
param_path=params_dirname,
place=place,
parallel=parallel)
# The input's dimension of conv should be 4-D or 5-D. # The input's dimension of conv should be 4-D or 5-D.
# Use normalized image pixels as input data, which should be in the range # Use normalized image pixels as input data, which should be in the range
...@@ -128,22 +136,31 @@ def infer(use_cuda, inference_program, params_dirname=None): ...@@ -128,22 +136,31 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results) print("infer results: ", results)
def main(use_cuda): def main(use_cuda, parallel):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
save_path = "image_classification_vgg.inference.model" save_path = "image_classification_vgg.inference.model"
os.environ['CPU_NUM'] = str(4)
train( train(
use_cuda=use_cuda, use_cuda=use_cuda,
train_program=train_network, train_program=train_network,
params_dirname=save_path) params_dirname=save_path,
parallel=parallel)
# FIXME(zcd): in the inference stage, there is only one
# input sample, so it is not appropriate to use parallel execution.
if parallel and use_cuda:
return
os.environ['CPU_NUM'] = str(1)
infer( infer(
use_cuda=use_cuda, use_cuda=use_cuda,
inference_program=inference_network, inference_program=inference_network,
params_dirname=save_path) params_dirname=save_path,
parallel=parallel)
if __name__ == '__main__': if __name__ == '__main__':
for use_cuda in (False, True): for use_cuda in (False, True):
main(use_cuda=use_cuda) for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
main(use_cuda=use_cuda, parallel=parallel)
...@@ -64,14 +64,14 @@ def optimizer_func(): ...@@ -64,14 +64,14 @@ def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001) return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname): def train(use_cuda, train_program, parallel, params_dirname):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer( trainer = fluid.Trainer(
train_func=train_program, train_func=train_program,
place=place, place=place,
optimizer_func=optimizer_func, optimizer_func=optimizer_func,
parallel=True) parallel=parallel)
def event_handler(event): def event_handler(event):
if isinstance(event, fluid.EndEpochEvent): if isinstance(event, fluid.EndEpochEvent):
...@@ -108,11 +108,14 @@ def train(use_cuda, train_program, params_dirname): ...@@ -108,11 +108,14 @@ def train(use_cuda, train_program, params_dirname):
feed_order=['img', 'label']) feed_order=['img', 'label'])
def infer(use_cuda, inference_program, params_dirname=None): def infer(use_cuda, inference_program, parallel, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer( inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=params_dirname, place=place) infer_func=inference_program,
param_path=params_dirname,
place=place,
parallel=parallel)
batch_size = 1 batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0, tensor_img = numpy.random.uniform(-1.0, 1.0,
...@@ -123,20 +126,32 @@ def infer(use_cuda, inference_program, params_dirname=None): ...@@ -123,20 +126,32 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results[0]) print("infer results: ", results[0])
def main(use_cuda): def main(use_cuda, parallel):
params_dirname = "recognize_digits_conv.inference.model" params_dirname = "recognize_digits_conv.inference.model"
# call train() with is_local argument to run distributed train # call train() with is_local argument to run distributed train
os.environ['CPU_NUM'] = str(4)
train( train(
use_cuda=use_cuda, use_cuda=use_cuda,
train_program=train_program, train_program=train_program,
params_dirname=params_dirname) params_dirname=params_dirname,
parallel=parallel)
# FIXME(zcd): in the inference stage, there is only one
# input sample, so it is not appropriate to use parallel execution.
if parallel and use_cuda:
return
os.environ['CPU_NUM'] = str(1)
infer( infer(
use_cuda=use_cuda, use_cuda=use_cuda,
inference_program=inference_program, inference_program=inference_program,
params_dirname=params_dirname) params_dirname=params_dirname,
parallel=parallel)
if __name__ == '__main__': if __name__ == '__main__':
# for use_cuda in (False, True): for use_cuda in (False, True):
main(use_cuda=core.is_compiled_with_cuda()) for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
main(use_cuda=use_cuda, parallel=parallel)
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import argparse import argparse
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle import paddle
import six import six
import sys import sys
...@@ -51,11 +52,14 @@ def optimizer_func(): ...@@ -51,11 +52,14 @@ def optimizer_func():
return fluid.optimizer.Adam(learning_rate=0.001) return fluid.optimizer.Adam(learning_rate=0.001)
def train(use_cuda, train_program, params_dirname): def train(use_cuda, train_program, params_dirname, parallel):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer( trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer_func=optimizer_func) train_func=train_program,
place=place,
optimizer_func=optimizer_func,
parallel=parallel)
def event_handler(event): def event_handler(event):
if isinstance(event, fluid.EndEpochEvent): if isinstance(event, fluid.EndEpochEvent):
...@@ -98,11 +102,14 @@ def train(use_cuda, train_program, params_dirname): ...@@ -98,11 +102,14 @@ def train(use_cuda, train_program, params_dirname):
assert ("kid scope" in cpt.get_exception_message(ex)) assert ("kid scope" in cpt.get_exception_message(ex))
def infer(use_cuda, inference_program, params_dirname=None): def infer(use_cuda, inference_program, parallel, params_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer( inferencer = fluid.Inferencer(
infer_func=inference_program, param_path=params_dirname, place=place) infer_func=inference_program,
param_path=params_dirname,
place=place,
parallel=parallel)
batch_size = 1 batch_size = 1
tensor_img = numpy.random.uniform(-1.0, 1.0, tensor_img = numpy.random.uniform(-1.0, 1.0,
...@@ -113,20 +120,32 @@ def infer(use_cuda, inference_program, params_dirname=None): ...@@ -113,20 +120,32 @@ def infer(use_cuda, inference_program, params_dirname=None):
print("infer results: ", results[0]) print("infer results: ", results[0])
def main(use_cuda): def main(use_cuda, parallel):
params_dirname = "recognize_digits_mlp.inference.model" params_dirname = "recognize_digits_mlp.inference.model"
# call train() with is_local argument to run distributed train # call train() with is_local argument to run distributed train
os.environ['CPU_NUM'] = str(4)
train( train(
use_cuda=use_cuda, use_cuda=use_cuda,
train_program=train_program, train_program=train_program,
params_dirname=params_dirname) params_dirname=params_dirname,
parallel=parallel)
# FIXME(zcd): in the inference stage, there is only one
# input sample, so it is not appropriate to use parallel execution.
if parallel and use_cuda:
return
os.environ['CPU_NUM'] = str(1)
infer( infer(
use_cuda=use_cuda, use_cuda=use_cuda,
inference_program=inference_program, inference_program=inference_program,
params_dirname=params_dirname) params_dirname=params_dirname,
parallel=parallel)
if __name__ == '__main__': if __name__ == '__main__':
# for use_cuda in (False, True): for use_cuda in (False, True):
main(use_cuda=False) for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
main(use_cuda=use_cuda, parallel=parallel)
...@@ -281,7 +281,7 @@ class TestRpnTargetAssign(unittest.TestCase): ...@@ -281,7 +281,7 @@ class TestRpnTargetAssign(unittest.TestCase):
gt_box = layers.data( gt_box = layers.data(
name='gt_box', shape=[4], lod_level=1, dtype='float32') name='gt_box', shape=[4], lod_level=1, dtype='float32')
predicted_scores, predicted_location, target_label, target_bbox = layers.rpn_target_assign( pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign(
loc=loc, loc=loc,
scores=scores, scores=scores,
anchor_box=anchor_box, anchor_box=anchor_box,
...@@ -292,15 +292,13 @@ class TestRpnTargetAssign(unittest.TestCase): ...@@ -292,15 +292,13 @@ class TestRpnTargetAssign(unittest.TestCase):
rpn_positive_overlap=0.7, rpn_positive_overlap=0.7,
rpn_negative_overlap=0.3) rpn_negative_overlap=0.3)
self.assertIsNotNone(predicted_scores) self.assertIsNotNone(pred_scores)
self.assertIsNotNone(predicted_location) self.assertIsNotNone(pred_loc)
self.assertIsNotNone(target_label) self.assertIsNotNone(tgt_lbl)
self.assertIsNotNone(target_bbox) self.assertIsNotNone(tgt_bbox)
assert predicted_scores.shape[1] == 2 assert pred_scores.shape[1] == 1
assert predicted_location.shape[1] == 4 assert pred_loc.shape[1] == 4
assert predicted_location.shape[1] == target_bbox.shape[1] assert pred_loc.shape[1] == tgt_bbox.shape[1]
print(str(program))
class TestGenerateProposals(unittest.TestCase): class TestGenerateProposals(unittest.TestCase):
......
...@@ -36,6 +36,7 @@ import paddle.fluid as fluid ...@@ -36,6 +36,7 @@ import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from paddle.fluid import core from paddle.fluid import core
from test_dist_base import TestDistRunnerBase, runtime_main from test_dist_base import TestDistRunnerBase, runtime_main
import paddle.compat as cpt
from paddle.compat import long_type from paddle.compat import long_type
import hashlib import hashlib
...@@ -315,7 +316,8 @@ def pad_batch_data(insts, ...@@ -315,7 +316,8 @@ def pad_batch_data(insts,
""" """
return_list = [] return_list = []
max_len = max(len(inst) for inst in insts) max_len = max(len(inst) for inst in insts)
num_token = reduce(lambda x, y: x + y, num_token = six.moves.reduce(
lambda x, y: x + y,
[len(inst) for inst in insts]) if return_num_token else 0 [len(inst) for inst in insts]) if return_num_token else 0
# Any token included in dict can be used to pad, since the paddings' loss # Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and make no effect on parameter gradients. # will be masked out by weights and make no effect on parameter gradients.
...@@ -328,7 +330,7 @@ def pad_batch_data(insts, ...@@ -328,7 +330,7 @@ def pad_batch_data(insts,
return_list += [inst_weight.astype("float32").reshape([-1, 1])] return_list += [inst_weight.astype("float32").reshape([-1, 1])]
else: # position data else: # position data
inst_pos = np.array([ inst_pos = np.array([
range(1, len(inst) + 1) + [0] * (max_len - len(inst)) list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst))
for inst in insts for inst in insts
]) ])
return_list += [inst_pos.astype("int64").reshape([-1, 1])] return_list += [inst_pos.astype("int64").reshape([-1, 1])]
...@@ -385,10 +387,11 @@ def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx, ...@@ -385,10 +387,11 @@ def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx,
return_num_token=True) return_num_token=True)
data_input_dict = dict( data_input_dict = dict(
list(
zip(data_input_names, [ zip(data_input_names, [
src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
])) ])))
return data_input_dict, np.asarray([num_token], dtype="float32") return data_input_dict, np.asarray([num_token], dtype="float32")
...@@ -561,7 +564,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, ...@@ -561,7 +564,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
np.log(TrainTaskConfig.label_smooth_eps / ( np.log(TrainTaskConfig.label_smooth_eps / (
ModelHyperParams.trg_vocab_size - 1) + 1e-20)) ModelHyperParams.trg_vocab_size - 1) + 1e-20))
init = False init = False
for pass_id in xrange(TrainTaskConfig.pass_num): for pass_id in six.moves.xrange(TrainTaskConfig.pass_num):
pass_start_time = time.time() pass_start_time = time.time()
for batch_id, data in enumerate(train_data()): for batch_id, data in enumerate(train_data()):
if batch_id >= 5: if batch_id >= 5:
...@@ -587,11 +590,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, ...@@ -587,11 +590,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
ModelHyperParams.eos_idx, ModelHyperParams.n_head, ModelHyperParams.eos_idx, ModelHyperParams.n_head,
ModelHyperParams.d_model) ModelHyperParams.d_model)
total_num_token += num_token total_num_token += num_token
feed_kv_pairs = data_input_dict.items() feed_kv_pairs = list(data_input_dict.items())
if TrainTaskConfig.local: if TrainTaskConfig.local:
feed_kv_pairs += { feed_kv_pairs += list({
lr_scheduler.learning_rate.name: lr_rate lr_scheduler.learning_rate.name: lr_rate
}.items() }.items())
feed_list.append(dict(feed_kv_pairs)) feed_list.append(dict(feed_kv_pairs))
if not init: if not init:
...@@ -873,6 +876,7 @@ class DataReader(object): ...@@ -873,6 +876,7 @@ class DataReader(object):
f = tarfile.open(fpaths[0], "r") f = tarfile.open(fpaths[0], "r")
for line in f.extractfile(tar_fname): for line in f.extractfile(tar_fname):
line = cpt.to_text(line)
fields = line.strip("\n").split(self._field_delimiter) fields = line.strip("\n").split(self._field_delimiter)
if (not self._only_src and len(fields) == 2) or ( if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1): self._only_src and len(fields) == 1):
...@@ -882,8 +886,9 @@ class DataReader(object): ...@@ -882,8 +886,9 @@ class DataReader(object):
if not os.path.isfile(fpath): if not os.path.isfile(fpath):
raise IOError("Invalid file: %s" % fpath) raise IOError("Invalid file: %s" % fpath)
with open(fpath, "r") as f: with open(fpath, "rb") as f:
for line in f: for line in f:
line = cpt.to_text(line)
fields = line.strip("\n").split(self._field_delimiter) fields = line.strip("\n").split(self._field_delimiter)
if (not self._only_src and len(fields) == 2) or ( if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1): self._only_src and len(fields) == 1):
...@@ -892,8 +897,9 @@ class DataReader(object): ...@@ -892,8 +897,9 @@ class DataReader(object):
@staticmethod @staticmethod
def load_dict(dict_path, reverse=False): def load_dict(dict_path, reverse=False):
word_dict = {} word_dict = {}
with open(dict_path, "r") as fdict: with open(dict_path, "rb") as fdict:
for idx, line in enumerate(fdict): for idx, line in enumerate(fdict):
line = cpt.to_text(line)
if reverse: if reverse:
word_dict[idx] = line.strip("\n") word_dict[idx] = line.strip("\n")
else: else:
...@@ -1034,7 +1040,7 @@ def multi_head_attention(queries, ...@@ -1034,7 +1040,7 @@ def multi_head_attention(queries,
# size of the input as the output dimension size. # size of the input as the output dimension size.
return layers.reshape( return layers.reshape(
x=trans_x, x=trans_x,
shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])) shape=list(map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])))
def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
""" """
......
...@@ -249,7 +249,7 @@ class OpTest(unittest.TestCase): ...@@ -249,7 +249,7 @@ class OpTest(unittest.TestCase):
outs, _ = self._calc_output(place) outs, _ = self._calc_output(place)
return outs return outs
def _calc_output(self, place, parallel=False): def _calc_output(self, place, parallel=False, no_check_set=None):
program = Program() program = Program()
block = program.global_block() block = program.global_block()
...@@ -273,6 +273,8 @@ class OpTest(unittest.TestCase): ...@@ -273,6 +273,8 @@ class OpTest(unittest.TestCase):
# if not, fill the fetch_list by the user configured outputs in test. # if not, fill the fetch_list by the user configured outputs in test.
if len(fetch_list) == 0: if len(fetch_list) == 0:
for var_name, var in six.iteritems(outputs): for var_name, var in six.iteritems(outputs):
if no_check_set is not None and var_name in no_check_set:
continue
if isinstance(var, list): if isinstance(var, list):
for v in var: for v in var:
fetch_list.append(v) fetch_list.append(v)
...@@ -291,11 +293,17 @@ class OpTest(unittest.TestCase): ...@@ -291,11 +293,17 @@ class OpTest(unittest.TestCase):
return_numpy=False) return_numpy=False)
return outs, fetch_list return outs, fetch_list
def check_output_with_place(self, place, atol): def check_output_with_place(self,
outs, fetch_list = self._calc_output(place) place,
atol,
no_check_set=None,
equal_nan=False):
outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
for out_name, out_dup in Operator.get_op_outputs(self.op_type): for out_name, out_dup in Operator.get_op_outputs(self.op_type):
if out_name not in self.outputs: if out_name not in self.outputs:
continue continue
if no_check_set is not None and out_name in no_check_set:
continue
def find_actual(target_name, fetch_list): def find_actual(target_name, fetch_list):
found = [ found = [
...@@ -321,7 +329,7 @@ class OpTest(unittest.TestCase): ...@@ -321,7 +329,7 @@ class OpTest(unittest.TestCase):
if isinstance(expect, tuple) else expect if isinstance(expect, tuple) else expect
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
actual_t, expect_t, atol=atol), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + sub_out_name + ") has diff at " + "Output (" + sub_out_name + ") has diff at " +
str(place)) str(place))
if isinstance(expect, tuple): if isinstance(expect, tuple):
...@@ -337,7 +345,7 @@ class OpTest(unittest.TestCase): ...@@ -337,7 +345,7 @@ class OpTest(unittest.TestCase):
expect_t = expect[0] if isinstance(expect, tuple) else expect expect_t = expect[0] if isinstance(expect, tuple) else expect
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
actual_t, expect_t, atol=atol), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) + "Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" + "\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t)) str(actual_t))
...@@ -360,10 +368,10 @@ class OpTest(unittest.TestCase): ...@@ -360,10 +368,10 @@ class OpTest(unittest.TestCase):
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
return places return places
def check_output(self, atol=1e-5): def check_output(self, atol=1e-5, no_check_set=None, equal_nan=False):
places = self._get_places() places = self._get_places()
for place in places: for place in places:
self.check_output_with_place(place, atol) self.check_output_with_place(place, atol, no_check_set, equal_nan)
def check_output_customized(self, checker): def check_output_customized(self, checker):
places = self._get_places() places = self._get_places()
......
...@@ -26,18 +26,15 @@ class TestAucOp(OpTest): ...@@ -26,18 +26,15 @@ class TestAucOp(OpTest):
pred = np.random.random((128, 2)).astype("float32") pred = np.random.random((128, 2)).astype("float32")
labels = np.random.randint(0, 2, (128, 1)) labels = np.random.randint(0, 2, (128, 1))
num_thresholds = 200 num_thresholds = 200
tp = np.zeros((num_thresholds, )).astype("int64")
tn = np.zeros((num_thresholds, )).astype("int64") stat_pos = np.zeros((num_thresholds + 1, )).astype("int64")
fp = np.zeros((num_thresholds, )).astype("int64") stat_neg = np.zeros((num_thresholds + 1, )).astype("int64")
fn = np.zeros((num_thresholds, )).astype("int64")
self.inputs = { self.inputs = {
'Predict': pred, 'Predict': pred,
'Label': labels, 'Label': labels,
'TP': tp, "StatPos": stat_pos,
'TN': tn, "StatNeg": stat_neg
'FP': fp,
'FN': fn
} }
self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}
...@@ -47,11 +44,10 @@ class TestAucOp(OpTest): ...@@ -47,11 +44,10 @@ class TestAucOp(OpTest):
python_auc.update(pred, labels) python_auc.update(pred, labels)
self.outputs = { self.outputs = {
'AUC': python_auc.eval(), 'AUC': np.array(python_auc.eval()),
'TPOut': python_auc.tp_list, 'BatchAUC': np.array(python_auc.eval()),
'FNOut': python_auc.fn_list, 'StatPosOut': np.array(python_auc._stat_pos),
'TNOut': python_auc.tn_list, 'StatNegOut': np.array(python_auc._stat_neg)
'FPOut': python_auc.fp_list
} }
def test_check_output(self): def test_check_output(self):
......
...@@ -438,7 +438,7 @@ class TestLocalLookupTable(TestDistLookupTableBase): ...@@ -438,7 +438,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
# 2 optimize for table adam # 2 optimize for table adam
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self.assertEqual([op.type for op in pserver1.blocks[2].ops], self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["sum", "adam", "scale", "scale"]) ["sum", "scale", "adam", "scale", "scale"])
trainer, _ = self.get_trainer() trainer, _ = self.get_trainer()
self.assertEqual(len(trainer.blocks), 1) self.assertEqual(len(trainer.blocks), 1)
......
...@@ -21,28 +21,41 @@ from op_test import OpTest ...@@ -21,28 +21,41 @@ from op_test import OpTest
class TestFakeQuantizeOp(OpTest): class TestFakeQuantizeOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "fake_quantize" self.op_type = "fake_quantize_abs_max"
self.attrs = {'bit_length': 8}
self.inputs = {'X': np.random.random((124, 240)).astype("float32"), }
scale = np.max(np.abs(self.inputs['X'])).astype("float32")
self.outputs = {
'Out': np.round(self.inputs['X'] / scale * (
(1 << (self.attrs['bit_length'] - 1)) - 1)),
'OutScale': np.array(scale).astype("float32"),
}
def test_check_output(self):
self.check_output()
class TestFakeQuantizeRangeAbsMaxOp(OpTest):
def setUp(self):
self.op_type = "fake_quantize_range_abs_max"
self.attrs = { self.attrs = {
'bit_length': 8, 'bit_length': int(5),
'quantize_type': 'abs_max', 'window_size': int(1),
'window_size': 10000 'is_test': False
} }
self.inputs = { self.inputs = {
'X': np.random.random((10, 10)).astype("float32"), 'X': np.random.random((8, 16, 7, 7)).astype("float32"),
'InScales': np.zeros(self.attrs['window_size']).astype("float32"), 'Iter': np.zeros(1).astype("int64"),
'InCurrentIter': np.zeros(1).astype("float32"), 'InScale': np.zeros(1).astype("float32")
'InMovingScale': np.zeros(1).astype("float32")
}
self.scale = {
'abs_max': np.max(np.abs(self.inputs['X'])).astype("float32")
} }
scale = np.max(np.abs(self.inputs['X'])).astype("float32")
out_scales = np.zeros(self.attrs['window_size']).astype("float32")
out_scales[0] = scale
self.outputs = { self.outputs = {
'Out': np.round(self.inputs['X'] / self.scale['abs_max'] * ( 'Out': np.round(self.inputs['X'] / scale * (
(1 << (self.attrs['bit_length'] - 1)) - 1)), (1 << (self.attrs['bit_length'] - 1)) - 1)),
'OutScales': np.zeros(self.attrs['window_size']).astype("float32"), 'OutScale': scale,
'OutMovingScale': 'OutScales': out_scales,
np.array([self.scale['abs_max']]).astype("float32"),
'OutCurrentIter': np.zeros(1).astype("float32")
} }
def test_check_output(self): def test_check_output(self):
......
...@@ -22,14 +22,17 @@ from op_test import OpTest ...@@ -22,14 +22,17 @@ from op_test import OpTest
class TestFlattenOp(OpTest): class TestFlattenOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "flatten" self.op_type = "flatten2"
self.init_test_case() self.init_test_case()
self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
self.init_attrs() self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.in_shape).astype("float32")
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=["XShape"])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out")
......
...@@ -37,7 +37,7 @@ def fusion_gru( ...@@ -37,7 +37,7 @@ def fusion_gru(
h0, h0,
wh, wh,
np.zeros( np.zeros(
(1, wh.shape[1]), dtype='float64'), (1, wh.shape[1]), dtype='float32'),
is_reverse, is_reverse,
act_state, act_state,
act_gate) act_gate)
...@@ -62,15 +62,15 @@ class TestFusionGRUOp(OpTest): ...@@ -62,15 +62,15 @@ class TestFusionGRUOp(OpTest):
T = sum(self.lod[0]) T = sum(self.lod[0])
N = len(self.lod[0]) N = len(self.lod[0])
x = np.random.rand(T, self.M).astype('float64') x = np.random.rand(T, self.M).astype('float32')
wx = np.random.rand(self.M, 3 * self.D).astype('float64') wx = np.random.rand(self.M, 3 * self.D).astype('float32')
wh = np.random.rand(self.D, 3 * self.D).astype('float64') wh = np.random.rand(self.D, 3 * self.D).astype('float32')
bias = np.random.rand( bias = np.random.rand(
1, 3 * self.D).astype('float64') if self.with_bias else np.zeros( 1, 3 * self.D).astype('float32') if self.with_bias else np.zeros(
(1, 3 * self.D), dtype='float64') (1, 3 * self.D), dtype='float32')
h0 = np.random.rand( h0 = np.random.rand(
N, self.D).astype('float64') if self.with_h0 else np.zeros( N, self.D).astype('float32') if self.with_h0 else np.zeros(
(N, self.D), dtype='float64') (N, self.D), dtype='float32')
_, _, _, hidden = fusion_gru( _, _, _, hidden = fusion_gru(
x, self.lod, h0, wx, wh, bias, self.is_reverse, x, self.lod, h0, wx, wh, bias, self.is_reverse,
...@@ -93,7 +93,9 @@ class TestFusionGRUOp(OpTest): ...@@ -93,7 +93,9 @@ class TestFusionGRUOp(OpTest):
} }
def test_check_output(self): def test_check_output(self):
self.check_output(atol=1e-8) for use_seq in {True, False}:
self.attrs['use_seq'] = use_seq
self.check_output()
class TestFusionGRUOpNoInitial(TestFusionGRUOp): class TestFusionGRUOpNoInitial(TestFusionGRUOp):
......
...@@ -58,6 +58,7 @@ class TestFusionLSTMOp(OpTest): ...@@ -58,6 +58,7 @@ class TestFusionLSTMOp(OpTest):
self.act_cell = 'tanh' self.act_cell = 'tanh'
self.act_cand = 'tanh' self.act_cand = 'tanh'
self.use_peepholes = False self.use_peepholes = False
self.use_seq = False
self.set_conf() self.set_conf()
T = sum(self.lod[0]) T = sum(self.lod[0])
...@@ -107,6 +108,7 @@ class TestFusionLSTMOp(OpTest): ...@@ -107,6 +108,7 @@ class TestFusionLSTMOp(OpTest):
} }
self.attrs = { self.attrs = {
'use_peepholes': self.use_peepholes, 'use_peepholes': self.use_peepholes,
'use_seq': self.use_seq,
'is_reverse': self.is_reverse, 'is_reverse': self.is_reverse,
'gate_activation': self.act_gate, 'gate_activation': self.act_gate,
'cell_activation': self.act_cell, 'cell_activation': self.act_cell,
...@@ -114,6 +116,8 @@ class TestFusionLSTMOp(OpTest): ...@@ -114,6 +116,8 @@ class TestFusionLSTMOp(OpTest):
} }
def test_check_output(self): def test_check_output(self):
for use_seq in {True, False}:
self.attrs['use_seq'] = use_seq
self.check_output() self.check_output()
...@@ -157,5 +161,68 @@ class TestFusionLSTMOpBS1(TestFusionLSTMOp): ...@@ -157,5 +161,68 @@ class TestFusionLSTMOpBS1(TestFusionLSTMOp):
self.D = 16 self.D = 16
class TestFusionLSTMOpPeepholes(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
class TestFusionLSTMOpPeepholesInit(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
self.has_initial_state = True
class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
self.is_reverse = True
class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):
def set_conf(self):
self.use_peepholes = True
self.lod = [[3]]
self.D = 16
class TestFusionLSTMOpSeqInit(TestFusionLSTMOp):
def set_conf(self):
self.use_seq = True
self.has_initial_state = True
class TestFusionLSTMOpSeqReverse(TestFusionLSTMOp):
def set_conf(self):
self.use_seq = True
self.is_reverse = True
class TestFusionLSTMOpSeqInitReverse(TestFusionLSTMOp):
def set_conf(self):
self.use_seq = True
self.has_initial_state = True
self.is_reverse = True
class TestFusionLSTMOpSeqPeepholes(TestFusionLSTMOp):
def set_conf(self):
self.use_seq = True
self.use_peepholes = True
class TestFusionLSTMOpSeqPeepholesInit(TestFusionLSTMOp):
def set_conf(self):
self.use_seq = True
self.use_peepholes = True
self.has_initial_state = True
class TestFusionLSTMOpSeqPeepholesReverse(TestFusionLSTMOp):
def set_conf(self):
self.use_seq = True
self.use_peepholes = True
self.is_reverse = True
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -177,8 +177,8 @@ def _box_to_delta(ex_boxes, gt_boxes, weights): ...@@ -177,8 +177,8 @@ def _box_to_delta(ex_boxes, gt_boxes, weights):
dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0] dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0]
dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1] dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1]
dw = (np.log(gt_w / ex_w)) / ex_w / weights[2] dw = (np.log(gt_w / ex_w)) / weights[2]
dh = (np.log(gt_h / ex_h)) / ex_h / weights[3] dh = (np.log(gt_h / ex_h)) / weights[3]
targets = np.vstack([dx, dy, dw, dh]).transpose() targets = np.vstack([dx, dy, dw, dh]).transpose()
return targets return targets
......
...@@ -549,6 +549,13 @@ class TestBook(unittest.TestCase): ...@@ -549,6 +549,13 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(out) self.assertIsNotNone(out)
print(str(program)) print(str(program))
def test_sequence_enumerate(self):
program = Program()
with program_guard(program):
x = layers.data(name="input", shape=[1], dtype='int32', lod_level=1)
out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0)
print(str(program))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import six
from op_test import OpTest from op_test import OpTest
...@@ -62,17 +63,20 @@ class PReluTest(OpTest): ...@@ -62,17 +63,20 @@ class PReluTest(OpTest):
# TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues # TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues
# class TestCase1(PReluTest): if six.PY2:
# def initTestCase(self):
# self.attrs = {'mode': "all"}
# class TestCase2(PReluTest): class TestCase1(PReluTest):
# def initTestCase(self): def initTestCase(self):
# self.attrs = {'mode': "channel"} self.attrs = {'mode': "all"}
class TestCase2(PReluTest):
def initTestCase(self):
self.attrs = {'mode': "channel"}
class TestCase3(PReluTest):
def initTestCase(self):
self.attrs = {'mode': "element"}
# class TestCase3(PReluTest):
# def initTestCase(self):
# self.attrs = {'mode': "element"}
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -22,106 +22,39 @@ from op_test import OpTest ...@@ -22,106 +22,39 @@ from op_test import OpTest
class TestReshapeOp(OpTest): class TestReshapeOp(OpTest):
def setUp(self): def setUp(self):
ori_shape = (2, 25) self.init_data()
new_shape = (5, 10) self.op_type = "reshape2"
self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
self.op_type = "reshape" self.attrs = {"shape": self.new_shape}
self.inputs = {"X": np.random.random(ori_shape).astype("float32")} self.outputs = {
self.attrs = {"shape": new_shape} "Out": self.inputs["X"].reshape(self.infered_shape),
self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} 'XShape': np.random.random(self.ori_shape).astype("float32")
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpDimInfer1(OpTest):
def setUp(self):
ori_shape = (5, 10)
new_shape = (5, -1, 5)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpDimInfer2(OpTest):
def setUp(self):
ori_shape = (2, 2, 6)
new_shape = (2, 0, 3, -1)
infered_shape = (2, 2, 3, -1)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpInplace(OpTest):
def setUp(self):
ori_shape = (2, 25)
new_shape = (5, 10)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Out")
class TestReshapeOpDimInferInplace1(OpTest):
def setUp(self):
ori_shape = (5, 10)
new_shape = (5, -1, 5)
self.op_type = "reshape" def init_data(self):
self.inputs = {"X": np.random.random(ori_shape).astype("float32")} self.ori_shape = (2, 25)
self.attrs = {"shape": new_shape} self.new_shape = (5, 10)
self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} self.infered_shape = (5, 10)
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=['XShape'])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out")
class TestReshapeOpDimInferInplace2(OpTest): class TestReshapeOpDimInfer1(TestReshapeOp):
def setUp(self): def init_data(self):
ori_shape = (2, 2, 6) self.ori_shape = (5, 10)
new_shape = (2, 0, 3, -1) self.new_shape = (5, -1, 5)
infered_shape = (2, 2, 3, -1) self.infered_shape = (5, -1, 5)
self.op_type = "reshape"
self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
def test_check_output(self):
self.check_output()
def test_check_grad(self): class TestReshapeOpDimInfer2(TestReshapeOp):
self.check_grad(["X"], "Out") def init_data(self):
self.ori_shape = (2, 2, 6)
self.new_shape = (2, 0, 3, -1)
self.infered_shape = (2, 2, 3, -1)
class TestReshapeOpWithInputShape(OpTest): class TestReshapeOpWithInputShape(OpTest):
...@@ -130,20 +63,23 @@ class TestReshapeOpWithInputShape(OpTest): ...@@ -130,20 +63,23 @@ class TestReshapeOpWithInputShape(OpTest):
new_shape = (0, -1, 5) new_shape = (0, -1, 5)
actual_shape = (2, 3, 5) actual_shape = (2, 3, 5)
self.op_type = "reshape" self.op_type = "reshape2"
self.inputs = { self.inputs = {
"X": np.random.random(ori_shape).astype("float32"), "X": np.random.random(ori_shape).astype("float32"),
"Shape": np.array( "Shape": np.array(
actual_shape, dtype="int32") actual_shape, dtype="int32")
} }
self.attrs = {"shape": new_shape} self.attrs = {"shape": new_shape}
self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)} self.outputs = {
"Out": self.inputs["X"].reshape(actual_shape),
'XShape': np.random.random(ori_shape).astype("float32")
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(no_check_set=['XShape'])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out") self.check_grad(["X"], "Out", sum_outputs=["Out"])
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -15,90 +15,164 @@ ...@@ -15,90 +15,164 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest import paddle.fluid.core as core
from paddle.fluid.op import Operator
class TestRmspropOp1(OpTest):
''' Test RMSProp with explicit inputs class TestBase(unittest.TestCase):
''' def setup(self, centered, epsilon=1e-6):
np.random.seed(5) # fix seed
def setUp(self):
self.op_type = "rmsprop" self.param_name = "param"
self.param = np.random.random((123, 321)).astype("float32")
param = np.random.random((123, 321)).astype("float32")
mean_square = np.random.random((123, 321)).astype("float32") self.mean_square_name = "mean_square"
learning_rate = np.array([0.01]).astype("float32") self.mean_square = np.random.random((123, 321)).astype("float32")
grad = np.random.random((123, 321)).astype("float32")
moment = np.zeros((123, 321)).astype("float32") self.mean_grad_name = "mean_grad"
self.mean_grad = np.random.random((123, 321)).astype("float32")
epsilon = 1e-6
decay = 0.9 self.lr_name = "lr"
momentum = 0.0 self.learning_rate = np.array([0.01]).astype("float32")
self.inputs = { self.grad_name = "grad"
'Param': param, self.grad = np.random.random((123, 321)).astype("float32")
'MeanSquare': mean_square,
'LearningRate': learning_rate, self.moment_name = "moment"
'Grad': grad, self.moment = np.zeros((123, 321)).astype("float32")
'Moment': moment,
} self.epsilon = epsilon
self.decay = 0.9
self.attrs = {'epsilon': epsilon, 'decay': decay, 'momentum': momentum} self.momentum = 0.0
self.centered = centered
ms_out = decay * mean_square + (1 - decay) * grad * grad
moment_out = momentum * moment + \ self.ms_out = self.decay * self.mean_square + (1 - self.decay
learning_rate * grad / np.sqrt(ms_out + epsilon) ) * self.grad * self.grad
param_out = param - moment_out if centered:
self.mg_out = self.decay * self.mean_grad + (1 - self.decay
self.outputs = { ) * self.grad
'ParamOut': param_out, self.moment_out = self.momentum * self.moment + \
'MomentOut': moment_out, self.learning_rate * self.grad / np.sqrt(self.ms_out - np.square(self.mg_out) + self.epsilon)
'MeanSquareOut': ms_out else:
} self.moment_out = self.momentum * self.moment + \
self.learning_rate * self.grad / np.sqrt(self.ms_out + self.epsilon)
def test_check_output(self):
self.check_output() self.param_out = self.param - self.moment_out
def check(self,
class TestRmspropOp2(OpTest): actual_t,
'''Test RMSProp with default values for attributes expect_t,
''' place,
out_name,
def setUp(self): atol=1e-5,
self.op_type = "rmsprop" equal_nan=False):
self.assertTrue(
param = np.random.random((123, 321)).astype("float32") np.allclose(
mean_square = np.random.random((123, 321)).astype("float32") actual_t, expect_t, atol=atol, equal_nan=equal_nan),
learning_rate = np.array([0.01]).astype("float32") "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
grad = np.random.random((123, 321)).astype("float32") + str(expect_t) + "\n" + "But Got" + str(actual_t))
moment = np.zeros((123, 321)).astype("float32")
epsilon = 1.0e-10 class TestRmspropOp(TestBase):
decay = 0.9 def check_with_place(self, place, centered, epsilon):
momentum = 0.0 self.setup(centered, epsilon)
scope = core.Scope()
self.inputs = {
'Param': param, # create and initialize Param Variable
'MeanSquare': mean_square, param = scope.var(self.param_name).get_tensor()
'LearningRate': learning_rate, param.set(self.param, place)
'Grad': grad,
'Moment': moment, mean_square = scope.var(self.mean_square_name).get_tensor()
} mean_square.set(self.mean_square, place)
ms_out = decay * mean_square + (1 - decay) * grad * grad lr = scope.var(self.lr_name).get_tensor()
moment_out = momentum * moment + \ lr.set(self.learning_rate, place)
learning_rate * grad / np.sqrt(ms_out + epsilon)
param_out = param - moment_out grad = scope.var(self.grad_name).get_tensor()
grad.set(self.grad, place)
self.outputs = {
'ParamOut': param_out, moment = scope.var(self.moment_name).get_tensor()
'MomentOut': moment_out, moment.set(self.moment, place)
'MeanSquareOut': ms_out
} # create and run sgd operator
def test_check_output(self): if self.centered:
self.check_output() mean_grad = scope.var(self.mean_grad_name).get_tensor()
mean_grad.set(self.mean_grad, place)
rmsprop_op = Operator(
"rmsprop",
Param=self.param_name,
Grad=self.grad_name,
MeanSquare=self.mean_square_name,
MeanGrad=self.mean_grad_name,
Moment=self.moment_name,
LearningRate=self.lr_name,
ParamOut=self.param_name,
MeanSquareOut=self.mean_square_name,
MomentOut=self.moment_name,
MeanGradOut=self.mean_grad_name,
epsilon=self.epsilon,
decay=self.decay,
momentum=self.momentum,
centered=True)
else:
rmsprop_op = Operator(
"rmsprop",
Param=self.param_name,
Grad=self.grad_name,
MeanSquare=self.mean_square_name,
Moment=self.moment_name,
LearningRate=self.lr_name,
ParamOut=self.param_name,
MeanSquareOut=self.mean_square_name,
MomentOut=self.moment_name,
epsilon=self.epsilon,
decay=self.decay,
momentum=self.momentum,
centered=False)
rmsprop_op.run(scope, place)
atol = 1e-5
equal_nan = False
if self.centered:
atol = 1e-3
equal_nan = True
self.check(
np.array(mean_square), self.ms_out, place, self.mean_square_name)
self.check(
np.array(moment),
self.moment_out,
place,
self.moment_name,
atol=atol,
equal_nan=equal_nan)
self.check(
np.array(param),
self.param_out,
place,
self.param_name,
atol=atol,
equal_nan=equal_nan)
if self.centered:
self.check(
np.array(mean_grad), self.mg_out, place, self.mean_grad_name)
def test_rmsprop(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place, False, 1e-6)
self.check_with_place(place, False, 1e-10)
self.check_with_place(place, True, 1e-6)
self.check_with_place(place, True, 1e-10)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -61,7 +61,7 @@ class TestROIPoolOp(OpTest): ...@@ -61,7 +61,7 @@ class TestROIPoolOp(OpTest):
for i in range(self.rois_num): for i in range(self.rois_num):
roi = self.rois[i] roi = self.rois[i]
roi_batch_id = roi[0] roi_batch_id = int(roi[0])
roi_start_w = int(cpt.round(roi[1] * self.spatial_scale)) roi_start_w = int(cpt.round(roi[1] * self.spatial_scale))
roi_start_h = int(cpt.round(roi[2] * self.spatial_scale)) roi_start_h = int(cpt.round(roi[2] * self.spatial_scale))
roi_end_w = int(cpt.round(roi[3] * self.spatial_scale)) roi_end_w = int(cpt.round(roi[3] * self.spatial_scale))
...@@ -125,7 +125,7 @@ class TestROIPoolOp(OpTest): ...@@ -125,7 +125,7 @@ class TestROIPoolOp(OpTest):
roi = [bno, x1, y1, x2, y2] roi = [bno, x1, y1, x2, y2]
rois.append(roi) rois.append(roi)
self.rois_num = len(rois) self.rois_num = len(rois)
self.rois = np.array(rois).astype("int64") self.rois = np.array(rois).astype("float32")
def setUp(self): def setUp(self):
self.op_type = "roi_pool" self.op_type = "roi_pool"
......
...@@ -18,12 +18,17 @@ import unittest ...@@ -18,12 +18,17 @@ import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core import paddle.fluid.core as core
from op_test import OpTest from op_test import OpTest
from test_anchor_generator_op import anchor_generator_in_python
from test_generate_proposal_labels import _generate_groundtruth
from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta
def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap, def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im,
rpn_negative_overlap, fg_fraction): rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
iou = np.transpose(iou) iou = np.transpose(gt_anchor_iou)
anchor_to_gt_max = iou.max(axis=1) anchor_to_gt_max = iou.max(axis=1)
anchor_to_gt_argmax = iou.argmax(axis=1)
gt_to_anchor_argmax = iou.argmax(axis=0) gt_to_anchor_argmax = iou.argmax(axis=0)
gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])] gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])]
anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0] anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0]
...@@ -42,59 +47,113 @@ def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap, ...@@ -42,59 +47,113 @@ def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,
num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1) num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1)
bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0] bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
tgt_lbl[bg_inds] = 0
if len(bg_inds) > num_bg: if len(bg_inds) > num_bg:
enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)] enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
tgt_lbl[enable_inds] = 0 tgt_lbl[enable_inds] = 0
bg_inds = np.where(tgt_lbl == 0)[0] bg_inds = np.where(tgt_lbl == 0)[0]
tgt_lbl[bg_inds] = 0
loc_index = fg_inds loc_index = fg_inds
score_index = np.hstack((fg_inds, bg_inds)) score_index = np.hstack((fg_inds, bg_inds))
tgt_lbl = np.expand_dims(tgt_lbl, axis=1) tgt_lbl = np.expand_dims(tgt_lbl, axis=1)
return loc_index, score_index, tgt_lbl
gt_inds = anchor_to_gt_argmax[fg_inds]
return loc_index, score_index, tgt_lbl, gt_inds
def get_anchor(n, c, h, w):
input_feat = np.random.random((n, c, h, w)).astype('float32')
anchors, _ = anchor_generator_in_python(
input_feat=input_feat,
anchor_sizes=[32., 64.],
aspect_ratios=[0.5, 1.0],
variances=[1.0, 1.0, 1.0, 1.0],
stride=[16.0, 16.0],
offset=0.5)
return anchors
def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im,
rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
loc_indexes = []
score_indexes = []
tmp_tgt_labels = []
tgt_bboxes = []
anchor_num = anchor.shape[0]
batch_size = len(lod) - 1
for i in range(batch_size):
b, e = lod[i], lod[i + 1]
iou_slice = iou[b:e, :]
bboxes_slice = gt_boxes[b:e, :]
loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign(
iou_slice, rpn_batch_size_per_im, rpn_positive_overlap,
rpn_negative_overlap, fg_fraction)
fg_bboxes = bboxes_slice[gt_inds]
fg_anchors = anchor[loc_idx]
box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.])
if i == 0:
loc_indexes = loc_idx
score_indexes = score_idx
tmp_tgt_labels = tgt_lbl
tgt_bboxes = box_deltas
else:
loc_indexes = np.concatenate(
[loc_indexes, loc_idx + i * anchor_num])
score_indexes = np.concatenate(
[score_indexes, score_idx + i * anchor_num])
tmp_tgt_labels = np.concatenate([tmp_tgt_labels, tgt_lbl])
tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
tgt_labels = tmp_tgt_labels[score_indexes]
return loc_indexes, score_indexes, tgt_bboxes, tgt_labels
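`_box_to_delta` is imported from test_generate_proposal_labels and is not shown in this diff; a rough sketch of the standard Faster R-CNN box-regression encoding it is assumed to implement follows (the helper name and the exact `+1` width convention are assumptions):

```python
import numpy as np

def box_to_delta_sketch(ex_boxes, gt_boxes, weights):
    # Encode gt boxes relative to example (anchor) boxes as (dx, dy, dw, dh)
    # regression targets; boxes are (x1, y1, x2, y2) rows.
    ex_w = ex_boxes[:, 2] - ex_boxes[:, 0] + 1
    ex_h = ex_boxes[:, 3] - ex_boxes[:, 1] + 1
    ex_ctr_x = ex_boxes[:, 0] + 0.5 * ex_w
    ex_ctr_y = ex_boxes[:, 1] + 0.5 * ex_h

    gt_w = gt_boxes[:, 2] - gt_boxes[:, 0] + 1
    gt_h = gt_boxes[:, 3] - gt_boxes[:, 1] + 1
    gt_ctr_x = gt_boxes[:, 0] + 0.5 * gt_w
    gt_ctr_y = gt_boxes[:, 1] + 0.5 * gt_h

    dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0]
    dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1]
    dw = np.log(gt_w / ex_w) / weights[2]
    dh = np.log(gt_h / ex_h) / weights[3]
    return np.vstack([dx, dy, dw, dh]).transpose()
```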
class TestRpnTargetAssignOp(OpTest):
def setUp(self):
iou = np.random.random((10, 8)).astype("float32")
n, c, h, w = 2, 4, 14, 14
self.op_type = "rpn_target_assign"
anchor = get_anchor(n, c, h, w)
self.inputs = {'DistMat': iou}
gt_num = 10
self.attrs = {
anchor = anchor.reshape(-1, 4)
'rpn_batch_size_per_im': 256,
anchor_num = anchor.shape[0]
'rpn_positive_overlap': 0.95,
'rpn_negative_overlap': 0.3,
'fg_fraction': 0.25,
'fix_seed': True
}
loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 256, 0.95, 0.3,
0.25)
self.outputs = {
'LocationIndex': loc_index,
'ScoreIndex': score_index,
'TargetLabel': tgt_lbl,
}
def test_check_output(self):
im_shapes = [[64, 64], [64, 64]]
self.check_output()
gt_box, lod = _generate_groundtruth(im_shapes, 3, 4)
bbox = np.vstack([v['boxes'] for v in gt_box])
iou = _bbox_overlaps(bbox, anchor)
anchor = anchor.astype('float32')
bbox = bbox.astype('float32')
iou = iou.astype('float32')
loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob(
anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25)
class TestRpnTargetAssignOp2(OpTest):
def setUp(self):
iou = np.random.random((10, 20)).astype("float32")
self.op_type = "rpn_target_assign"
self.inputs = {'DistMat': iou}
self.inputs = {
'Anchor': anchor,
'GtBox': (bbox, [[4, 4]]),
'DistMat': (iou, [[4, 4]]),
}
self.attrs = {
'rpn_batch_size_per_im': 128,
'rpn_batch_size_per_im': 25600,
'rpn_positive_overlap': 0.5,
'rpn_positive_overlap': 0.95,
'rpn_negative_overlap': 0.5,
'rpn_negative_overlap': 0.03,
'fg_fraction': 0.5,
'fg_fraction': 0.25,
'fix_seed': True
}
loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 128, 0.5, 0.5,
0.5)
self.outputs = {
'LocationIndex': loc_index,
'LocationIndex': loc_index.astype('int32'),
'ScoreIndex': score_index,
'ScoreIndex': score_index.astype('int32'),
'TargetLabel': tgt_lbl,
'TargetBBox': tgt_bbox.astype('float32'),
'TargetLabel': tgt_lbl.astype('int64'),
}
def test_check_output(self):
...
@@ -25,9 +25,9 @@ class TestSamplingIdOp(OpTest):
self.op_type = "sampling_id"
self.use_mkldnn = False
self.init_kernel_type()
self.X = np.random.random((8, 4)).astype('float32')
self.X = np.random.random((100, 10)).astype('float32')
self.inputs = {"X": self.X}
self.Y = np.random.random(8).astype('float32')
self.Y = np.random.random(100).astype('int64')
self.outputs = {'Out': self.Y}
self.attrs = {'max': 1.0, 'min': 0.0, 'seed': 1}
@@ -36,6 +36,16 @@ class TestSamplingIdOp(OpTest):
y1 = self.out
self.check_output_customized(self.verify_output)
y2 = self.out
# check dtype
assert y1.dtype == np.int64
assert y2.dtype == np.int64
# check output is index ids of inputs
inputs_ids = np.arange(self.X.shape[1])
assert np.isin(y1, inputs_ids).all()
assert np.isin(y2, inputs_ids).all()
self.assertTrue(np.array_equal(y1, y2))
self.assertEqual(len(y1), len(self.Y))
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
def sequence_enumerate(input_seq, in_lod, win_size, pad_value):
lod0 = [0]
for i in range(0, len(in_lod[0])):
lod0.append(lod0[i] + in_lod[0][i])
out_seq = []
for i in range(0, len(lod0) - 1):
for idx in range(lod0[i], lod0[i + 1]):
single_seq = []
for word_idx in range(win_size):
word_pos = idx + word_idx
dat = input_seq[word_pos] if word_pos < lod0[i+1] \
else pad_value
single_seq.append(dat)
out_seq.append(single_seq)
return out_seq
class TestSequenceEnumerateOp(OpTest):
def setUp(self):
self.op_type = "sequence_enumerate"
self.init_test_case()
self.inputs = {'X': (self.in_seq, self.lod)}
self.attrs = {'win_size': self.win_size, 'pad_value': self.pad_value}
self.outputs = {'Out': (self.out_seq, self.lod)}
def test_check_output(self):
self.check_output()
def init_test_case(self):
self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
self.lod = [[9, 4, 11, 6]]
self.win_size = 2
self.pad_value = 0
out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
self.pad_value)
self.out_seq = np.array(out_seq).astype("int32")
class TesSequenceEnumerateOpInt64(TestSequenceEnumerateOp):
def init_test_case(self):
self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
self.lod = [[9, 4, 11, 6]]
self.win_size = 2
self.pad_value = 0
out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
self.pad_value)
self.out_seq = np.array(out_seq).astype("int64")
class TestSequenceEnumerateOpLargeWinSize(TestSequenceEnumerateOp):
def init_test_case(self):
self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
self.lod = [[9, 4, 11, 6]]
self.win_size = 5
self.pad_value = 0
out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
self.pad_value)
self.out_seq = np.array(out_seq).astype("int32")
class TestSequenceEnumerateOpMaxWinSize(TestSequenceEnumerateOp):
def init_test_case(self):
self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
self.lod = [[9, 4, 11, 6]]
self.win_size = 30
self.pad_value = 0
out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
self.pad_value)
self.out_seq = np.array(out_seq).astype("int32")
class TestSequenceEnumerateOpLargePadValue(TestSequenceEnumerateOp):
def init_test_case(self):
self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
self.lod = [[9, 4, 11, 6]]
self.win_size = 5
self.pad_value = 5
out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
self.pad_value)
self.out_seq = np.array(out_seq).astype("int32")
if __name__ == "__main__":
unittest.main()
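As a quick illustration of what the reference `sequence_enumerate` helper above produces, here is a self-contained toy run (a flat list of ids is used for brevity instead of the test's (30, 1) arrays):

```python
def sequence_enumerate(input_seq, in_lod, win_size, pad_value):
    # Same logic as the reference helper above: slide a window of win_size
    # over each LoD sub-sequence, padding past the end of the sub-sequence.
    lod0 = [0]
    for i in range(len(in_lod[0])):
        lod0.append(lod0[i] + in_lod[0][i])
    out_seq = []
    for i in range(len(lod0) - 1):
        for idx in range(lod0[i], lod0[i + 1]):
            single_seq = []
            for word_idx in range(win_size):
                word_pos = idx + word_idx
                single_seq.append(input_seq[word_pos]
                                  if word_pos < lod0[i + 1] else pad_value)
            out_seq.append(single_seq)
    return out_seq

# Two sub-sequences of lengths 3 and 2 over a flat id sequence.
print(sequence_enumerate([1, 2, 3, 4, 5], [[3, 2]], 2, 0))
# [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
```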
@@ -23,14 +23,17 @@ from op_test import OpTest
# Correct: General.
class TestSqueezeOp(OpTest):
def setUp(self):
self.op_type = "squeeze"
self.op_type = "squeeze2"
self.init_test_case()
self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.ori_shape).astype("float32")
}
def test_check_output(self):
self.check_output()
self.check_output(no_check_set=['XShape'])
def test_check_grad(self):
self.check_grad(["X"], "Out")
...
@@ -22,16 +22,19 @@ from op_test import OpTest
class TestTransposeOp(OpTest):
def setUp(self):
self.initTestCase()
self.op_type = "transpose"
self.op_type = "transpose2"
self.inputs = {'X': np.random.random(self.shape).astype("float32")}
self.attrs = {'axis': list(self.axis)}
self.outputs = {'Out': self.inputs['X'].transpose(self.axis)}
self.outputs = {
'XShape': np.random.random(self.shape).astype("float32"),
'Out': self.inputs['X'].transpose(self.axis)
}
def test_check_output(self):
self.check_output()
self.check_output(no_check_set=['XShape'])
def test_check_grad(self):
self.check_grad(['X'], 'Out')
self.check_grad(['X'], 'Out', sum_outputs=['Out'])
def initTestCase(self):
self.shape = (3, 4)
...
@@ -24,13 +24,16 @@ from op_test import OpTest
class TestUnsqueezeOp(OpTest):
def setUp(self):
self.init_test_case()
self.op_type = "unsqueeze"
self.op_type = "unsqueeze2"
self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
self.init_attrs()
self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.ori_shape).astype("float32")
}
def test_check_output(self):
self.check_output()
self.check_output(no_check_set=["XShape"])
def test_check_grad(self):
self.check_grad(["X"], "Out")
...
@@ -153,7 +153,7 @@ def block_to_code(block, block_idx):
indent += 1
# sort all vars
all_vars = sorted(block.vars.iteritems(), key=lambda x: x[0])
all_vars = sorted(six.iteritems(block.vars), key=lambda x: x[0])
for var in all_vars:
print("{}{}".format(get_indent_space(indent), variable_to_code(var[1])))
...
@@ -300,7 +300,7 @@ class DistributeTranspiler(object):
input_deps = grad_name_to_send_dummy_out.values()
program.global_block().append_op(
type="send_barrier",
inputs={"X": input_deps},
inputs={"X": list(input_deps)},
outputs={"Out": send_barrier_out},
attrs={
"endpoints": pserver_endpoints,
@@ -455,7 +455,7 @@ class DistributeTranspiler(object):
if len(splited_var) <= 1:
continue
# NOTE: if enable memory optimization, origin vars maybe removed.
if startup_program.global_block().vars.has_key(varname):
if varname in startup_program.global_block().vars:
orig_param = startup_program.global_block().vars[varname]
else:
origin_param_var = self.origin_program.global_block().vars[
@@ -1096,7 +1096,8 @@ class DistributeTranspiler(object):
self.table_name]
zero_dim = int(
math.ceil(origin_param_var.shape[0] / len(self.pserver_endpoints)))
math.ceil(origin_param_var.shape[0] / float(
len(self.pserver_endpoints))))
table_shape = list(origin_param_var.shape)
table_shape[0] = zero_dim
@@ -1390,8 +1391,6 @@ class DistributeTranspiler(object):
inputs={"X": vars2merge},
outputs={"Out": merged_var},
attrs={"use_mkldnn": False})
# TODO(panyx0718): What if it's SELECTED_ROWS.
if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
optimize_block.append_op(
type="scale",
inputs={"X": merged_var},
...
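The transpiler edits above are Python 3 compatibility fixes: `dict.iteritems()` and `dict.has_key()` are gone in Python 3, `dict.values()` returns a view rather than a list, and integer division truncates on Python 2. A minimal sketch of the portable idioms used here (the dictionary contents are made up for illustration):

```python
import math
import six

params = {"fc_0.w_0": 128, "fc_0.b_0": 64}

# dict.iteritems() does not exist on Python 3; six.iteritems works on both.
all_vars = sorted(six.iteritems(params), key=lambda x: x[0])

# dict.has_key() was removed in Python 3; membership tests use `in`.
assert "fc_0.w_0" in params

# dict.values() is a view on Python 3; wrap it when a list is required.
dims = list(params.values())

# Integer division truncates on Python 2, so force float before math.ceil.
num_pservers = 3
zero_dim = int(math.ceil(params["fc_0.w_0"] / float(num_pservers)))
print(all_vars, dims, zero_dim)
```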