diff --git a/CMakeLists.txt b/CMakeLists.txt index ff9ac4f0176e53e6e8dcf8fca9dd3272c66e5bdf..bc020792a66e2c1a7e152a3ef3d1c2ea699a097c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -207,6 +207,10 @@ endif() include(external/threadpool) +include(flags) # set paddle compile flags +include(cudnn) # set cudnn libraries, must before configure +include(configure) # add paddle env configuration + if(WITH_GPU) include(cuda) include(tensorrt) diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 6b22f8f520e3d9c6c89d41a7455a6f9ebbad6d80..53d010434a8ebbe0184d84f588783f25186d606a 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -151,6 +151,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, if data == None: break if iters == args.iterations: + reader_generator.close() break if iters == args.skip_batch_num: start_time = time.time() @@ -252,6 +253,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, if data == None: break if iters == args.iterations: + reader_generator.close() break if args.profile and pass_id == 0 and batch_id == 5: profiler.start_profiler("All") diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index 78be0749091fb0a617f9fb172cc92b33560a3552..dc6730662f0b888f1981ac9c086320acc52d0a50 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -52,9 +52,8 @@ ExternalProject_Add( extern_anakin ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLML_PROJECT} - # Anakin codes error on Intel(R) Xeon(R) Gold 5117 CPU, temporary do not compile avx512 related code. 
- GIT_REPOSITORY "https://github.com/luotao1/Anakin" - GIT_TAG "211d1fc5d813d70c0c14072f9083cf25f40940ea" + GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin" + GIT_TAG "9424277cf9ae180a14aff09560d3cd60a49c76d2" PREFIX ${ANAKIN_SOURCE_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DUSE_GPU_PLACE=YES diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index 42881106c8778bfd1b23cb7333eb8a472fc276e8..497764e0ef2c670bab910f04ea95b53539478c51 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -46,8 +46,13 @@ ExternalProject_Add( ${BOOST_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} +<<<<<<< HEAD DOWNLOAD_COMMAND "wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz && tar zxf ${BOOST_TAR}.tar.gz" +======= + DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz + && tar zxf ${BOOST_TAR}.tar.gz +>>>>>>> origin/develop DOWNLOAD_NO_PROGRESS 1 PREFIX ${BOOST_SOURCES_DIR} CONFIGURE_COMMAND "" @@ -57,7 +62,7 @@ ExternalProject_Add( ) endif(NOT WIN32) -if (${CMAKE_VERSION} VERSION_LESS "3.3.0") +if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") add_library(boost STATIC ${dummyfile}) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 6a521125cc87b346943d6bb846a899cf00319057..c3fbe4dbdb28f1008bb274ee18293db348bfc6ed 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -17,6 +17,9 @@ IF(USE_EIGEN_FOR_BLAS) ENDIF(USE_EIGEN_FOR_BLAS) INCLUDE(cblas) +# IF(WIN32 AND NOT ${CBLAS_FOUND}) + + IF(NOT ${CBLAS_FOUND}) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index b15f2c1485f362055d8b0976b234a1d9d65c32e4..7d542114fb81dac9bef8d6d6eafdb68dc7eee31e 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -218,14 +218,18 @@ function(merge_static_libs TARGET_NAME) foreach(lib 
${libs}) # Get the file names of the libraries to be merged + #if(NOT $<TARGET_FILE:${lib}> MATCHES "lib.*\\.lib") + # message("library" ${lib}) + # set(libfiles ${libfiles} lib$<TARGET_FILE:${lib}>) + #else() set(libfiles ${libfiles} $<TARGET_FILE:${lib}>) + #endif() endforeach() - - # msvc will put libarary in directory of "/Release/xxxlib" by default - # COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib" + + # Windows cmd returns an error in a clean env. + # COMMAND del "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib" add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}" - COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib${TARGET_NAME}.lib ${libfiles} + COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.lib ${libfiles} ) endif(WIN32) endfunction(merge_static_libs) diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md index f989b964d6d1a329bbe31adc7ec10db017acaefa..2c1c30c1eddfde6d9a8e2637be86537c43cc1b00 100644 --- a/doc/fluid/dev/releasing_process_en.md +++ b/doc/fluid/dev/releasing_process_en.md @@ -50,6 +50,33 @@ pop-up box, choose the current release branch and click "Run Build" button. You * pypi does not allow overwrite the already uploaded version of wheel package, even if you delete the old version. you must change the version number before upload a new one. +### Publish wheel Packages for MacOS + +You need to build the binary wheel package for MacOS before publishing. To +make sure that the package can be used by many versions of MacOS +(10.11, 10.12, 10.13) and different python installs (python.org, homebrew, etc.), +you must build the package ***exactly*** following the steps below: + +Build steps: + +1. install python from python.org downloads, and make sure it's currently in use + in your system. +1. `export MACOSX_DEPLOYMENT_TARGET=10.11`, `10.11` is enough for recent versions. +1. 
`git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle && mkdir build && cd build` +1. `cmake -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_SYSTEM_BLAS=OFF ..`, make sure the output of the `cmake` command shows that it is using the correct python interpreter installed from python.org +1. `make -j` +1. `pip install delocate` +1. `mkdir fixed_wheel && delocate-wheel -w fixed_wheel python/dist/*.whl` + +Then the whl under `fixed_wheel` is ready to upload. + +Install steps: + +1. run `pip install paddlepaddle...whl` +1. find the `libpython.dylib` that is currently in use: + - for python.org package installs, do nothing. + - for other python installs, find the path of `libpython*.dylib` and `export LD_LIBRARY_PATH=<your path> && export DYLD_LIBRARY_PATH=<your path>` + ## Publish Docker Images Our CI tool will push latest images to DockerHub, so we only need to push a version tag like: diff --git a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst b/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst index 3571f81326a9f9ae31a8327c3e288e601f248e4b..aa9377c112856693cda72779bd399f2415d716f0 100644 --- a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst +++ b/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst @@ -9,8 +9,6 @@ Paddle 预测 API - 头文件 ``paddle_inference_api.h`` 定义了所有的接口 - 库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a`` -- 库文件 ``libpaddle_inference_api.so`` 或 - ``libpaddle_inference_api.a`` 编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。 @@ -97,8 +95,7 @@ engine CHECK(predictor->Run(slots, &outputs)); // 获取 outputs ... 
-编译时,联编 ``libpaddle_fluid.a/.so`` 和 -``libpaddle_inference_api.a/.so`` 便可。 +编译时,联编 ``libpaddle_fluid.a/.so`` 即可。 详细代码参考 ------------ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/index.md b/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md similarity index 68% rename from doc/fluid/new_docs/beginners_guide/basics/image_classification/index.md rename to doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md index ce0d2bb1dc0cf73151ee9aceea7e4d7b24af1926..4f20843596aa676962a36241f59560ec2a41257b 100644 --- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/index.md +++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md @@ -1,559 +1,576 @@ - -# 图像分类 - -本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。 - -## 背景介绍 - -图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们转递与交换信息的重要来源。在本教程中,我们专注于图像识别领域的一个重要问题,即图像分类。 - -图像分类是根据图像的语义信息将不同类别图像区分开来,是计算机视觉中重要的基本问题,也是图像检测、图像分割、物体跟踪、行为分析等其他高层视觉任务的基础。图像分类在很多领域有广泛应用,包括安防领域的人脸识别和智能视频分析等,交通领域的交通场景识别,互联网领域基于内容的图像检索和相册自动归类,医学领域的图像识别等。 - - -一般来说,图像分类通过手工特征或特征学习方法对整个图像进行全部描述,然后使用分类器判别物体类别,因此如何提取图像的特征至关重要。在深度学习算法之前使用较多的是基于词袋(Bag of Words)模型的物体分类方法。词袋方法从自然语言处理中引入,即一句话可以用一个装了词的袋子表示其特征,袋子中的词为句子中的单词、短语或字。对于图像而言,词袋方法需要构建字典。最简单的词袋模型框架可以设计为**底层特征抽取**、**特征编码**、**分类器设计**三个过程。 - -而基于深度学习的图像分类方法,可以通过有监督或无监督的方式**学习**层次化的特征描述,从而取代了手工设计或选择图像特征的工作。深度学习模型中的卷积神经网络(Convolution Neural Network, CNN)近年来在图像领域取得了惊人的成绩,CNN直接利用图像像素信息作为输入,最大程度上保留了输入图像的所有信息,通过卷积操作进行特征的提取和高层抽象,模型输出直接是图像识别的结果。这种基于"输入-输出"直接端到端的学习方法取得了非常好的效果,得到了广泛的应用。 - -本教程主要介绍图像分类的深度学习模型,以及如何使用PaddlePaddle训练CNN模型。 - -## 效果展示 - -图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果,即模型可以正确识别图像上的主要物体。 - -![dogCatClassification](./image/dog_cat.png) -

-图1. 通用图像分类展示 -

- - -图2展示了细粒度图像分类-花卉识别的效果,要求模型可以正确识别花的类别。 - -![flowersClassification](./image/flowers.png) -

-图2. 细粒度图像分类展示 -

- - -一个好的模型既要对不同类别识别正确,同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动,较好的模型会像聪明的人类一样能够正确识别。 - -![imageVariations](https://raw.githubusercontent.com/PaddlePaddle/book/develop/03.image_classification/image/variations.png) -

-图3. 扰动图片展示[22] -

- -## 模型概览 - -图像识别领域大量的研究成果都是建立在[PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/)、[ImageNet](http://image-net.org/)等公开的数据集上,很多图像识别算法通常在这些数据集上进行测试和比较。PASCAL VOC是2005年发起的一个视觉挑战赛,ImageNet是2010年发起的大规模视觉识别竞赛(ILSVRC)的数据集,在本章中我们基于这些竞赛的一些论文介绍图像分类模型。 - -在2012年之前的传统图像分类方法可以用背景描述中提到的三步完成,但通常完整建立图像识别模型一般包括底层特征学习、特征编码、空间约束、分类器设计、模型融合等几个阶段。 -1). **底层特征提取**: 通常从图像中按照固定步长、尺度提取大量局部特征描述。常用的局部特征包括SIFT(Scale-Invariant Feature Transform, 尺度不变特征转换) \[[1](#参考文献)\]、HOG(Histogram of Oriented Gradient, 方向梯度直方图) \[[2](#参考文献)\]、LBP(Local Bianray Pattern, 局部二值模式) \[[3](#参考文献)\] 等,一般也采用多种特征描述子,防止丢失过多的有用信息。 -2). **特征编码**: 底层特征中包含了大量冗余与噪声,为了提高特征表达的鲁棒性,需要使用一种特征变换算法对底层特征进行编码,称作特征编码。常用的特征编码包括向量量化编码 \[[4](#参考文献)\]、稀疏编码 \[[5](#参考文献)\]、局部线性约束编码 \[[6](#参考文献)\]、Fisher向量编码 \[[7](#参考文献)\] 等。 -3). **空间特征约束**: 特征编码之后一般会经过空间特征约束,也称作**特征汇聚**。特征汇聚是指在一个空间范围内,对每一维特征取最大值或者平均值,可以获得一定特征不变形的特征表达。金字塔特征匹配是一种常用的特征聚会方法,这种方法提出将图像均匀分块,在分块内做特征汇聚。 -4). **通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述,接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器,在传统图像分类任务上性能很好。 - -这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \[[18](#参考文献)\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征,两个非线性编码器以及SVM分类器获得图像分类的冠军 \[[8](#参考文献)\]。 - -Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。 - -![ilsvrc](./image/ilsvrc.png) -

-图4. ILSVRC图像分类Top-5错误率 -

- -### CNN - -传统CNN包含卷积层、全连接层等组件,并采用softmax多类别分类器和多类交叉熵损失函数,一个典型的卷积神经网络如图5所示,我们先介绍用来构造CNN的常见组件。 - -![cnnStructure](./image/lenet.png) -

-图5. CNN网络示例[20] -

- -- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征,发掘出图片局部关联性质和空间不变性质。 -- 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作,可以过滤掉一些不重要的高频信息。 -- 全连接层(fully-connected layer,或者fc layer): 输入层到隐藏层的神经元是全部连接的。 -- 非线性变化: 卷积层、全连接层后面一般都会接非线性变化层,例如Sigmoid、Tanh、ReLu等来增强网络的表达能力,在CNN里最常使用的为ReLu激活函数。 -- Dropout \[[10](#参考文献)\] : 在模型训练阶段随机让一些隐层节点权重不工作,提高网络的泛化能力,一定程度上防止过拟合。 - -另外,在训练过程中由于每层参数不断更新,会导致下一次输入分布发生变化,这样导致训练过程需要精心设计超参数。如2015年Sergey Ioffe和Christian Szegedy提出了Batch Normalization (BN)算法 \[[14](#参考文献)\] 中,每个batch对网络中的每一层特征都做归一化,使得每层分布相对稳定。BN算法不仅起到一定的正则作用,而且弱化了一些超参数的设计。经过实验证明,BN算法加速了模型收敛过程,在后来较深的模型中被广泛使用。 - -接下来我们主要介绍VGG,GoogleNet和ResNet网络结构。 - -### VGG - -牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构,它的核心是五组卷积操作,每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积,卷积核的数目由较浅组的64增多到最深组的512,同一组内的卷积核数目是一样的。卷积之后接两层全连接层,之后是分类层。由于每组内卷积层的不同,有11、13、16、19层这几种模型,下图展示一个16层的网络结构。VGG模型结构相对简洁,提出之后也有很多文章基于此模型进行研究,如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。 - -![vgg16](./image/vgg16.png) -

-图6. 基于ImageNet的VGG16模型 -

- -### GoogleNet - -GoogleNet \[[12](#参考文献)\] 在2014年ILSVRC的获得了冠军,在介绍该模型之前我们先来了解NIN(Network in Network)模型 \[[13](#参考文献)\] 和Inception模块,因为GoogleNet模型由多组Inception模块组成,模型设计借鉴了NIN的一些思想。 - -NIN模型主要有两个特点:1) 引入了多层感知卷积网络(Multi-Layer Perceptron Convolution, MLPconv)代替一层线性卷积网络。MLPconv是一个微小的多层卷积网络,即在线性卷积后面增加若干层1x1的卷积,这样可以提取出高度非线性特征。2) 传统的CNN最后几层一般都是全连接层,参数较多。而NIN模型设计最后一层卷积层包含类别维度大小的特征图,然后采用全局均值池化(Avg-Pooling)替代全连接层,得到类别维度大小的向量,再进行分类。这种替代全连接层的方式有利于减少参数。 - -Inception模块如下图7所示,图(a)是最简单的设计,输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数,拼接后会导致特征的通道数较大,经过几层这样的模块堆积后,通道数会越来越大,导致参数和计算量也随之增大。为了改善这个缺点,图(b)引入3个1x1卷积层进行降维,所谓的降维就是减少通道数,同时如NIN模型中提到的1x1卷积也可以修正线性特征。 - -![inception](./image/inception.png) -

-图7. Inception模块 -

- -GoogleNet由多组Inception模块堆积而成。另外,在网络最后也没有采用传统的多层全连接层,而是像NIN网络一样采用了均值池化层;但与NIN不同的是,池化层后面接了一层到类别数映射的全连接层。除了这两个特点之外,由于网络中间层特征也很有判别性,GoogleNet在中间层添加了两个辅助分类器,在后向传播中增强梯度并且增强正则化,而整个网络的损失函数是这个三个分类器的损失加权求和。 - -GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普通的卷积组成;接下来由三组子网络组成,第一组子网络包含2个Inception模块,第二组包含5个Inception模块,第三组包含2个Inception模块;然后接均值池化层、全连接层。 - -![googleNet](./image/googlenet.jpeg) -

-图8. GoogleNet[12] -

- - -上面介绍的是GoogleNet第一版模型(称作GoogleNet-v1)。GoogleNet-v2 \[[14](#参考文献)\] 引入BN层;GoogleNet-v3 \[[16](#参考文献)\] 对一些卷积层做了分解,进一步提高网络非线性能力和加深网络;GoogleNet-v4 \[[17](#参考文献)\] 引入下面要讲的ResNet设计思路。从v1到v4每一版的改进都会带来准确度的提升,介于篇幅,这里不再详细介绍v2到v4的结构。 - - -### ResNet - -ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类、图像物体定位和图像物体检测比赛的冠军。针对训练卷积神经网络时加深网络导致准确度下降的问题,ResNet提出了采用残差学习。在已有设计思路(BN, 小卷积核,全卷积网络)的基础上,引入了残差模块。每个残差模块包含两条路径,其中一条路径是输入特征的直连通路,另一条路径对该特征做两到三次卷积操作得到该特征的残差,最后再将两条路径上的特征相加。 - -残差模块如图9所示,左边是基本模块连接方式,由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式,之所以称为瓶颈,是因为上面的1x1卷积用来降维(图示例即256->64),下面的1x1卷积用来升维(图示例即64->256),这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。 - -![ResNetBlock](./image/resnet_block.jpg) -

-图9. 残差模块 -

- -图10展示了50、101、152层网络连接示意图,使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快,成功的训练了上百乃至近千层的卷积神经网络。 - -![ResNet](./image/resnet.png) -

-图10. 基于ImageNet的ResNet模型 -

- - -## 数据准备 - -通用图像分类公开的标准数据集常用的有[CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html)、[ImageNet](http://image-net.org/)、[COCO](http://mscoco.org/)等,常用的细粒度图像分类数据集包括[CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html)、[Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/)、[Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/)等。其中ImageNet数据集规模相对较大,如[模型概览](#模型概览)一章所讲,大量研究成果基于ImageNet。ImageNet数据从2010年来稍有变化,常用的是ImageNet-2012数据集,该数据集包含1000个类别:训练集包含1,281,167张图片,每个类别数据732至1300张不等,验证集包含50,000张图片,平均每个类别50张图片。 - -由于ImageNet数据集较大,下载和训练较慢,为了方便大家学习,我们使用[CIFAR10]()数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。 - -![CIFAR](https://raw.githubusercontent.com/PaddlePaddle/book/develop/03.image_classification/image/cifar.png) -

-图11. CIFAR10数据集[21] -

- -Paddle API提供了自动加载cifar数据集模块 `paddle.dataset.cifar`。 - -通过输入`python train.py`,就可以开始训练模型了,以下小节将详细介绍`train.py`的相关内容。 - -### 模型结构 - -#### Paddle 初始化 - -让我们从导入 Paddle Fluid API 和辅助模块开始。 - -```python -import paddle -import paddle.fluid as fluid -import numpy -import sys -``` - -本教程中我们提供了VGG和ResNet两个模型的配置。 - -#### VGG - -首先介绍VGG模型结构,由于CIFAR10图片大小和数量相比ImageNet数据小很多,因此这里的模型针对CIFAR10数据做了一定的适配。卷积部分引入了BN和Dropout操作。 -VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下: - -```python -def vgg_bn_drop(input): -def conv_block(ipt, num_filter, groups, dropouts): -return fluid.nets.img_conv_group( -input=ipt, -pool_size=2, -pool_stride=2, -conv_num_filter=[num_filter] * groups, -conv_filter_size=3, -conv_act='relu', -conv_with_batchnorm=True, -conv_batchnorm_drop_rate=dropouts, -pool_type='max') - -conv1 = conv_block(input, 64, 2, [0.3, 0]) -conv2 = conv_block(conv1, 128, 2, [0.4, 0]) -conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) -conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) -conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - -drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) -fc1 = fluid.layers.fc(input=drop, size=512, act=None) -bn = fluid.layers.batch_norm(input=fc1, act='relu') -drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) -fc2 = fluid.layers.fc(input=drop2, size=512, act=None) -predict = fluid.layers.fc(input=fc2, size=10, act='softmax') -return predict -``` - -1. 首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.networks`中预定义的模块,由若干组 Conv->BN->ReLu->Dropout 和 一组 Pooling 组成。 - -2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。 - -3. 最后接两层512维的全连接。 - -4. 
通过上面VGG网络提取高层特征,然后经过全连接层映射到类别维度大小的向量,再通过Softmax归一化得到每个类别的概率,也可称作分类器。 - -### ResNet - -ResNet模型的第1、3、4步和VGG模型相同,这里不再介绍。主要介绍第2步即CIFAR10数据集上ResNet核心模块。 - -先介绍`resnet_cifar10`中的一些基本函数,再介绍网络连接过程。 - -- `conv_bn_layer` : 带BN的卷积层。 -- `shortcut` : 残差模块的"直连"路径,"直连"实际分两种形式:残差模块输入和输出特征通道数不等时,采用1x1卷积的升维操作;残差模块输入和输出通道相等时,采用直连操作。 -- `basicblock` : 一个基础残差模块,即图9左边所示,由两组3x3卷积组成的路径和一条"直连"路径组成。 -- `bottleneck` : 一个瓶颈残差模块,即图9右边所示,由上下1x1卷积和中间3x3卷积组成的路径和一条"直连"路径组成。 -- `layer_warp` : 一组残差模块,由若干个残差模块堆积而成。每组中第一个残差模块滑动窗口大小与其他可以不同,以用来减少特征图在垂直和水平方向的大小。 - -```python -def conv_bn_layer(input, -ch_out, -filter_size, -stride, -padding, -act='relu', -bias_attr=False): -tmp = fluid.layers.conv2d( -input=input, -filter_size=filter_size, -num_filters=ch_out, -stride=stride, -padding=padding, -act=None, -bias_attr=bias_attr) -return fluid.layers.batch_norm(input=tmp, act=act) - - -def shortcut(input, ch_in, ch_out, stride): -if ch_in != ch_out: -return conv_bn_layer(input, ch_out, 1, stride, 0, None) -else: -return input - - -def basicblock(input, ch_in, ch_out, stride): -tmp = conv_bn_layer(input, ch_out, 3, stride, 1) -tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True) -short = shortcut(input, ch_in, ch_out, stride) -return fluid.layers.elementwise_add(x=tmp, y=short, act='relu') - - -def layer_warp(block_func, input, ch_in, ch_out, count, stride): -tmp = block_func(input, ch_in, ch_out, stride) -for i in range(1, count): -tmp = block_func(tmp, ch_out, ch_out, 1) -return tmp -``` - -`resnet_cifar10` 的连接结构主要有以下几个过程。 - -1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。 -2. 然后连接3组残差模块即下面配置3组 `layer_warp` ,每组采用图 10 左边残差模块组成。 -3. 
最后对网络做均值池化并返回该层。 - -注意:除过第一层卷积层和最后一层全连接层之外,要求三组 `layer_warp` 总的含参层数能够被6整除,即 `resnet_cifar10` 的 depth 要满足 `$(depth - 2) % 6 == 0$` 。 - -```python -def resnet_cifar10(ipt, depth=32): -# depth should be one of 20, 32, 44, 56, 110, 1202 -assert (depth - 2) % 6 == 0 -n = (depth - 2) / 6 -nStages = {16, 64, 128} -conv1 = conv_bn_layer(ipt, ch_out=16, filter_size=3, stride=1, padding=1) -res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) -res2 = layer_warp(basicblock, res1, 16, 32, n, 2) -res3 = layer_warp(basicblock, res2, 32, 64, n, 2) -pool = fluid.layers.pool2d( -input=res3, pool_size=8, pool_type='avg', pool_stride=1) -predict = fluid.layers.fc(input=pool, size=10, act='softmax') -return predict -``` - -## Infererence Program 配置 - -网络输入定义为 `data_layer` (数据层),在图像分类中即为图像像素信息。CIFRAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32)。 - -```python -def inference_program(): -# The image is 32 * 32 with RGB representation. -data_shape = [3, 32, 32] -images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') - -predict = resnet_cifar10(images, 32) -# predict = vgg_bn_drop(images) # un-comment to use vgg net -return predict -``` - -## Train Program 配置 - -然后我们需要设置训练程序 `train_program`。它首先从推理程序中进行预测。 -在训练期间,它将从预测中计算 `avg_cost`。 -在有监督训练中需要输入图像对应的类别信息,同样通过`fluid.layers.data`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。 - -**注意:** 训练程序应该返回一个数组,第一个返回参数必须是 `avg_cost`。训练器使用它来计算梯度。 - -```python -def train_program(): -predict = inference_program() - -label = fluid.layers.data(name='label', shape=[1], dtype='int64') -cost = fluid.layers.cross_entropy(input=predict, label=label) -avg_cost = fluid.layers.mean(cost) -accuracy = fluid.layers.accuracy(input=predict, label=label) -return [avg_cost, accuracy] -``` - -## Optimizer Function 配置 - -在下面的 `Adam optimizer`,`learning_rate` 是训练的速度,与网络的训练收敛速度有关系。 - -```python -def optimizer_program(): -return fluid.optimizer.Adam(learning_rate=0.001) -``` - -## 训练模型 - -### Trainer 配置 - -现在,我们需要配置 `Trainer`。`Trainer` 需要接受训练程序 
`train_program`, `place` 和优化器 `optimizer_func`。 - -```python -use_cuda = False -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() -trainer = fluid.Trainer( -train_func=train_program, -optimizer_func=optimizer_program, -place=place) -``` - -### Data Feeders 配置 - -`cifar.train10()` 每次产生一条样本,在完成shuffle和batch之后,作为训练的输入。 - -```python -# Each batch will yield 128 images -BATCH_SIZE = 128 - -# Reader for training -train_reader = paddle.batch( -paddle.reader.shuffle(paddle.dataset.cifar.train10(), buf_size=50000), -batch_size=BATCH_SIZE) - -# Reader for testing. A separated data set for testing. -test_reader = paddle.batch( -paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) -``` - -### Event Handler - -可以使用`event_handler`回调函数来观察训练过程,或进行测试等, 该回调函数是`trainer.train`函数里设定。 - -`event_handler_plot`可以用来利用回调数据来打点画图: - -![png](./image/train_and_test.png) - -```python -params_dirname = "image_classification_resnet.inference.model" - -from paddle.v2.plot import Ploter - -train_title = "Train cost" -test_title = "Test cost" -cost_ploter = Ploter(train_title, test_title) - -step = 0 -def event_handler_plot(event): -global step -if isinstance(event, fluid.EndStepEvent): -if step % 1 == 0: -cost_ploter.append(train_title, step, event.metrics[0]) -cost_ploter.plot() -step += 1 -if isinstance(event, fluid.EndEpochEvent): -avg_cost, accuracy = trainer.test( -reader=test_reader, -feed_order=['pixel', 'label']) -cost_ploter.append(test_title, step, avg_cost) - -# save parameters -if params_dirname is not None: -trainer.save_params(params_dirname) -``` - -`event_handler` 用来在训练过程中输出文本日志 - -```python -params_dirname = "image_classification_resnet.inference.model" - -# event handler to track training and testing process -def event_handler(event): -if isinstance(event, fluid.EndStepEvent): -if event.step % 100 == 0: -print("\nPass %d, Batch %d, Cost %f, Acc %f" % -(event.step, event.epoch, event.metrics[0], -event.metrics[1])) -else: -sys.stdout.write('.') -sys.stdout.flush() - -if 
isinstance(event, fluid.EndEpochEvent): -# Test against with the test dataset to get accuracy. -avg_cost, accuracy = trainer.test( -reader=test_reader, feed_order=['pixel', 'label']) - -print('\nTest with Pass {0}, Loss {1:2.2}, Acc {2:2.2}'.format(event.epoch, avg_cost, accuracy)) - -# save parameters -if params_dirname is not None: -trainer.save_params(params_dirname) -``` - -### 训练 - -通过`trainer.train`函数训练: - -**注意:** CPU,每个 Epoch 将花费大约15~20分钟。这部分可能需要一段时间。请随意修改代码,在GPU上运行测试,以提高培训速度。 - -```python -trainer.train( -reader=train_reader, -num_epochs=2, -event_handler=event_handler, -feed_order=['pixel', 'label']) -``` - -一轮训练log示例如下所示,经过1个pass, 训练集上平均 Accuracy 为0.59 ,测试集上平均 Accuracy 为0.6 。 - -```text -Pass 0, Batch 0, Cost 3.869598, Acc 0.164062 -................................................................................................... -Pass 100, Batch 0, Cost 1.481038, Acc 0.460938 -................................................................................................... -Pass 200, Batch 0, Cost 1.340323, Acc 0.523438 -................................................................................................... -Pass 300, Batch 0, Cost 1.223424, Acc 0.593750 -.......................................................................................... -Test with Pass 0, Loss 1.1, Acc 0.6 -``` - -图12是训练的分类错误率曲线图,运行到第200个pass后基本收敛,最终得到测试集上分类错误率为8.54%。 - -![CIFARErrorRate](./image/plot.png) -

-图12. CIFAR10数据集上VGG模型的分类错误率 -

- -## 应用模型 - -可以使用训练好的模型对图片进行分类,下面程序展示了如何使用 `fluid.Inferencer` 接口进行推断,可以打开注释,更改加载的模型。 - -### 生成预测输入数据 - -`dog.png` is an example image of a dog. Turn it into an numpy array to match the data feeder format. - -```python -# Prepare testing data. -from PIL import Image -import numpy as np -import os - -def load_image(file): -im = Image.open(file) -im = im.resize((32, 32), Image.ANTIALIAS) - -im = np.array(im).astype(np.float32) -# The storage order of the loaded image is W(width), -# H(height), C(channel). PaddlePaddle requires -# the CHW order, so transpose them. -im = im.transpose((2, 0, 1)) # CHW -im = im / 255.0 - -# Add one dimension to mimic the list format. -im = numpy.expand_dims(im, axis=0) -return im - -cur_dir = os.getcwd() -img = load_image(cur_dir + '/image/dog.png') -``` - -### Inferencer 配置和预测 - -`Inferencer` 需要一个 `infer_func` 和 `param_path` 来设置网络和经过训练的参数。 -我们可以简单地插入前面定义的推理程序。 -现在我们准备做预测。 - -```python -inferencer = fluid.Inferencer( -infer_func=inference_program, param_path=params_dirname, place=place) - -# inference -results = inferencer.infer({'pixel': img}) -print("infer results: ", results) -``` - -## 总结 - -传统图像分类方法由多个阶段构成,框架较为复杂,而端到端的CNN模型结构可一步到位,而且大幅度提升了分类准确率。本文我们首先介绍VGG、GoogleNet、ResNet三个经典的模型;然后基于CIFAR10数据集,介绍如何使用PaddlePaddle配置和训练CNN模型,尤其是VGG和ResNet模型;最后介绍如何使用PaddlePaddle的API接口对图片进行预测和特征提取。对于其他数据集比如ImageNet,配置和训练流程是同样的,大家可以自行进行实验。 - - -## 参考文献 - -[1] D. G. Lowe, [Distinctive image features from scale-invariant keypoints](http://www.cs.ubc.ca/~lowe/papers/ijcv04.pdf). IJCV, 60(2):91-110, 2004. - -[2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005. - -[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28. - -[4] J. Sivic, A. 
Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003. - -[5] B. Olshausen, D. Field, [Sparse Coding with an Overcomplete Basis Set: A Strategy Employed by V1?](http://redwood.psych.cornell.edu/papers/olshausen_field_1997.pdf), Vision Research, vol. 37, pp. 3311-3325, 1997. - -[6] Wang, J., Yang, J., Yu, K., Lv, F., Huang, T., and Gong, Y. (2010). [Locality-constrained Linear Coding for image classification](http://ieeexplore.ieee.org/abstract/document/5540018/). In CVPR. - -[7] Perronnin, F., Sánchez, J., & Mensink, T. (2010). [Improving the fisher kernel for large-scale image classification](http://dl.acm.org/citation.cfm?id=1888101). In ECCV (4). - -[8] Lin, Y., Lv, F., Cao, L., Zhu, S., Yang, M., Cour, T., Yu, K., and Huang, T. (2011). [Large-scale image clas- sification: Fast feature extraction and SVM training](http://ieeexplore.ieee.org/document/5995477/). In CVPR. - -[9] Krizhevsky, A., Sutskever, I., and Hinton, G. (2012). [ImageNet classification with deep convolutional neu- ral networks](http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf). In NIPS. - -[10] G.E. Hinton, N. Srivastava, A. Krizhevsky, I. Sutskever, and R.R. Salakhutdinov. [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). arXiv preprint arXiv:1207.0580, 2012. - -[11] K. Chatfield, K. Simonyan, A. Vedaldi, A. Zisserman. [Return of the Devil in the Details: Delving Deep into Convolutional Nets](https://arxiv.org/abs/1405.3531). BMVC, 2014。 - -[12] Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A., [Going deeper with convolutions](https://arxiv.org/abs/1409.4842). In: CVPR. (2015) - -[13] Lin, M., Chen, Q., and Yan, S. [Network in network](https://arxiv.org/abs/1312.4400). In Proc. 
ICLR, 2014. - -[14] S. Ioffe and C. Szegedy. [Batch normalization: Accelerating deep network training by reducing internal covariate shift](https://arxiv.org/abs/1502.03167). In ICML, 2015. - -[15] K. He, X. Zhang, S. Ren, J. Sun. [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). CVPR 2016. - -[16] Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z. [Rethinking the incep-tion architecture for computer vision](https://arxiv.org/abs/1512.00567). In: CVPR. (2016). - -[17] Szegedy, C., Ioffe, S., Vanhoucke, V. [Inception-v4, inception-resnet and the impact of residual connections on learning](https://arxiv.org/abs/1602.07261). arXiv:1602.07261 (2016). - -[18] Everingham, M., Eslami, S. M. A., Van Gool, L., Williams, C. K. I., Winn, J. and Zisserman, A. [The Pascal Visual Object Classes Challenge: A Retrospective]((http://link.springer.com/article/10.1007/s11263-014-0733-5)). International Journal of Computer Vision, 111(1), 98-136, 2015. - -[19] He, K., Zhang, X., Ren, S., and Sun, J. [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852). ArXiv e-prints, February 2015. - -[20] http://deeplearning.net/tutorial/lenet.html - -[21] https://www.cs.toronto.edu/~kriz/cifar.html - -[22] http://cs231n.github.io/classification/ - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 + +# 图像分类 + +本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/168.html)。 + +## 背景介绍 + +图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们转递与交换信息的重要来源。在本教程中,我们专注于图像识别领域的一个重要问题,即图像分类。 + +图像分类是根据图像的语义信息将不同类别图像区分开来,是计算机视觉中重要的基本问题,也是图像检测、图像分割、物体跟踪、行为分析等其他高层视觉任务的基础。图像分类在很多领域有广泛应用,包括安防领域的人脸识别和智能视频分析等,交通领域的交通场景识别,互联网领域基于内容的图像检索和相册自动归类,医学领域的图像识别等。 + + +一般来说,图像分类通过手工特征或特征学习方法对整个图像进行全部描述,然后使用分类器判别物体类别,因此如何提取图像的特征至关重要。在深度学习算法之前使用较多的是基于词袋(Bag of Words)模型的物体分类方法。词袋方法从自然语言处理中引入,即一句话可以用一个装了词的袋子表示其特征,袋子中的词为句子中的单词、短语或字。对于图像而言,词袋方法需要构建字典。最简单的词袋模型框架可以设计为**底层特征抽取**、**特征编码**、**分类器设计**三个过程。 + +而基于深度学习的图像分类方法,可以通过有监督或无监督的方式**学习**层次化的特征描述,从而取代了手工设计或选择图像特征的工作。深度学习模型中的卷积神经网络(Convolution Neural Network, CNN)近年来在图像领域取得了惊人的成绩,CNN直接利用图像像素信息作为输入,最大程度上保留了输入图像的所有信息,通过卷积操作进行特征的提取和高层抽象,模型输出直接是图像识别的结果。这种基于"输入-输出"直接端到端的学习方法取得了非常好的效果,得到了广泛的应用。 + +本教程主要介绍图像分类的深度学习模型,以及如何使用PaddlePaddle训练CNN模型。 + +## 效果展示 + +图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果,即模型可以正确识别图像上的主要物体。 + +

+
+图1. 通用图像分类展示 +

+ + +图2展示了细粒度图像分类-花卉识别的效果,要求模型可以正确识别花的类别。 + + +

+
+图2. 细粒度图像分类展示 +

+ + +一个好的模型既要对不同类别识别正确,同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动,较好的模型会像聪明的人类一样能够正确识别。 + +

+
+图3. 扰动图片展示[22] +

+ +## 模型概览 + +图像识别领域大量的研究成果都是建立在[PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/)、[ImageNet](http://image-net.org/)等公开的数据集上,很多图像识别算法通常在这些数据集上进行测试和比较。PASCAL VOC是2005年发起的一个视觉挑战赛,ImageNet是2010年发起的大规模视觉识别竞赛(ILSVRC)的数据集,在本章中我们基于这些竞赛的一些论文介绍图像分类模型。 + +在2012年之前的传统图像分类方法可以用背景描述中提到的三步完成,但通常完整建立图像识别模型一般包括底层特征学习、特征编码、空间约束、分类器设计、模型融合等几个阶段。 + + 1). **底层特征提取**: 通常从图像中按照固定步长、尺度提取大量局部特征描述。常用的局部特征包括SIFT(Scale-Invariant Feature Transform, 尺度不变特征转换) \[[1](#参考文献)\]、HOG(Histogram of Oriented Gradient, 方向梯度直方图) \[[2](#参考文献)\]、LBP(Local Binary Pattern, 局部二值模式) \[[3](#参考文献)\] 等,一般也采用多种特征描述子,防止丢失过多的有用信息。 + + 2). **特征编码**: 底层特征中包含了大量冗余与噪声,为了提高特征表达的鲁棒性,需要使用一种特征变换算法对底层特征进行编码,称作特征编码。常用的特征编码包括向量量化编码 \[[4](#参考文献)\]、稀疏编码 \[[5](#参考文献)\]、局部线性约束编码 \[[6](#参考文献)\]、Fisher向量编码 \[[7](#参考文献)\] 等。 + + 3). **空间特征约束**: 特征编码之后一般会经过空间特征约束,也称作**特征汇聚**。特征汇聚是指在一个空间范围内,对每一维特征取最大值或者平均值,可以获得一定特征不变性的特征表达。金字塔特征匹配是一种常用的特征汇聚方法,这种方法提出将图像均匀分块,在分块内做特征汇聚。 + + 4). **通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述,接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器,在传统图像分类任务上性能很好。 + +这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \[[18](#参考文献)\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征,两个非线性编码器以及SVM分类器获得图像分类的冠军 \[[8](#参考文献)\]。 + +Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。 + +

+
+图4. ILSVRC图像分类Top-5错误率 +

+ +### CNN + +传统CNN包含卷积层、全连接层等组件,并采用softmax多类别分类器和多类交叉熵损失函数,一个典型的卷积神经网络如图5所示,我们先介绍用来构造CNN的常见组件。 + +

+
+图5. CNN网络示例[20] +

+ +- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征,发掘出图片局部关联性质和空间不变性质。 +- 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作,可以过滤掉一些不重要的高频信息。 +- 全连接层(fully-connected layer,或者fc layer): 输入层到隐藏层的神经元是全部连接的。 +- 非线性变化: 卷积层、全连接层后面一般都会接非线性变化层,例如Sigmoid、Tanh、ReLu等来增强网络的表达能力,在CNN里最常使用的为ReLu激活函数。 +- Dropout \[[10](#参考文献)\] : 在模型训练阶段随机让一些隐层节点权重不工作,提高网络的泛化能力,一定程度上防止过拟合。 + +另外,在训练过程中由于每层参数不断更新,会导致下一次输入分布发生变化,这样导致训练过程需要精心设计超参数。如2015年Sergey Ioffe和Christian Szegedy提出了Batch Normalization (BN)算法 \[[14](#参考文献)\] 中,每个batch对网络中的每一层特征都做归一化,使得每层分布相对稳定。BN算法不仅起到一定的正则作用,而且弱化了一些超参数的设计。经过实验证明,BN算法加速了模型收敛过程,在后来较深的模型中被广泛使用。 + +接下来我们主要介绍VGG,GoogleNet和ResNet网络结构。 + +### VGG + +牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构,它的核心是五组卷积操作,每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积,卷积核的数目由较浅组的64增多到最深组的512,同一组内的卷积核数目是一样的。卷积之后接两层全连接层,之后是分类层。由于每组内卷积层的不同,有11、13、16、19层这几种模型,下图展示一个16层的网络结构。VGG模型结构相对简洁,提出之后也有很多文章基于此模型进行研究,如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。 + +

+
+图6. 基于ImageNet的VGG16模型 +

+ +### GoogleNet + +GoogleNet \[[12](#参考文献)\] 在2014年ILSVRC上获得了冠军,在介绍该模型之前我们先来了解NIN(Network in Network)模型 \[[13](#参考文献)\] 和Inception模块,因为GoogleNet模型由多组Inception模块组成,模型设计借鉴了NIN的一些思想。 + +NIN模型主要有两个特点: + +1) 引入了多层感知卷积网络(Multi-Layer Perceptron Convolution, MLPconv)代替一层线性卷积网络。MLPconv是一个微小的多层卷积网络,即在线性卷积后面增加若干层1x1的卷积,这样可以提取出高度非线性特征。 + +2) 传统的CNN最后几层一般都是全连接层,参数较多。而NIN模型设计最后一层卷积层包含类别维度大小的特征图,然后采用全局均值池化(Avg-Pooling)替代全连接层,得到类别维度大小的向量,再进行分类。这种替代全连接层的方式有利于减少参数。 + +Inception模块如下图7所示,图(a)是最简单的设计,输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数,拼接后会导致特征的通道数较大,经过几层这样的模块堆积后,通道数会越来越大,导致参数和计算量也随之增大。为了改善这个缺点,图(b)引入3个1x1卷积层进行降维,所谓的降维就是减少通道数,同时如NIN模型中提到的1x1卷积也可以修正线性特征。 + +

+
+图7. Inception模块 +

+ +GoogleNet由多组Inception模块堆积而成。另外,在网络最后也没有采用传统的多层全连接层,而是像NIN网络一样采用了均值池化层;但与NIN不同的是,池化层后面接了一层到类别数映射的全连接层。除了这两个特点之外,由于网络中间层特征也很有判别性,GoogleNet在中间层添加了两个辅助分类器,在后向传播中增强梯度并且增强正则化,而整个网络的损失函数是这三个分类器的损失加权求和。 + +GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普通的卷积组成;接下来由三组子网络组成,第一组子网络包含2个Inception模块,第二组包含5个Inception模块,第三组包含2个Inception模块;然后接均值池化层、全连接层。 + +

+
+图8. GoogleNet[12] +

+ + +上面介绍的是GoogleNet第一版模型(称作GoogleNet-v1)。GoogleNet-v2 \[[14](#参考文献)\] 引入BN层;GoogleNet-v3 \[[16](#参考文献)\] 对一些卷积层做了分解,进一步提高网络非线性能力和加深网络;GoogleNet-v4 \[[17](#参考文献)\] 引入下面要讲的ResNet设计思路。从v1到v4每一版的改进都会带来准确度的提升,限于篇幅,这里不再详细介绍v2到v4的结构。 + + +### ResNet + +ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类、图像物体定位和图像物体检测比赛的冠军。针对训练卷积神经网络时加深网络导致准确度下降的问题,ResNet提出了采用残差学习。在已有设计思路(BN, 小卷积核,全卷积网络)的基础上,引入了残差模块。每个残差模块包含两条路径,其中一条路径是输入特征的直连通路,另一条路径对该特征做两到三次卷积操作得到该特征的残差,最后再将两条路径上的特征相加。 + +残差模块如图9所示,左边是基本模块连接方式,由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式,之所以称为瓶颈,是因为上面的1x1卷积用来降维(图示例即256->64),下面的1x1卷积用来升维(图示例即64->256),这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。 + +

+
+图9. 残差模块 +

+ +图10展示了50、101、152层网络连接示意图,使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快,成功地训练了上百乃至近千层的卷积神经网络。 + +

+
+图10. 基于ImageNet的ResNet模型 +

+ + +## 数据准备 + +通用图像分类公开的标准数据集常用的有[CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html)、[ImageNet](http://image-net.org/)、[COCO](http://mscoco.org/)等,常用的细粒度图像分类数据集包括[CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html)、[Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/)、[Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/)等。其中ImageNet数据集规模相对较大,如[模型概览](#模型概览)一章所讲,大量研究成果基于ImageNet。ImageNet数据从2010年来稍有变化,常用的是ImageNet-2012数据集,该数据集包含1000个类别:训练集包含1,281,167张图片,每个类别数据732至1300张不等,验证集包含50,000张图片,平均每个类别50张图片。 + +由于ImageNet数据集较大,下载和训练较慢,为了方便大家学习,我们使用[CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html)数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10,000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。 + +

+
+图11. CIFAR10数据集[21] +

+ +Paddle API提供了自动加载cifar数据集模块 `paddle.dataset.cifar`。 + +通过输入`python train.py`,就可以开始训练模型了,以下小节将详细介绍`train.py`的相关内容。 + +### 模型结构 + +#### Paddle 初始化 + +让我们从导入 Paddle Fluid API 和辅助模块开始。 + +```python +import paddle +import paddle.fluid as fluid +import numpy +import sys +from __future__ import print_function +``` + +本教程中我们提供了VGG和ResNet两个模型的配置。 + +#### VGG + +首先介绍VGG模型结构,由于CIFAR10图片大小和数量相比ImageNet数据小很多,因此这里的模型针对CIFAR10数据做了一定的适配。卷积部分引入了BN和Dropout操作。 +VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下: + +```python +def vgg_bn_drop(input): + def conv_block(ipt, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=ipt, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + predict = fluid.layers.fc(input=fc2, size=10, act='softmax') + return predict +``` + + +1. 首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.networks`中预定义的模块,由若干组 Conv->BN->ReLu->Dropout 和 一组 Pooling 组成。 + +2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。 + +3. 最后接两层512维的全连接。 + +4. 
通过上面VGG网络提取高层特征,然后经过全连接层映射到类别维度大小的向量,再通过Softmax归一化得到每个类别的概率,也可称作分类器。 + +### ResNet + +ResNet模型的第1、3、4步和VGG模型相同,这里不再介绍。主要介绍第2步即CIFAR10数据集上ResNet核心模块。 + +先介绍`resnet_cifar10`中的一些基本函数,再介绍网络连接过程。 + + - `conv_bn_layer` : 带BN的卷积层。 + - `shortcut` : 残差模块的"直连"路径,"直连"实际分两种形式:残差模块输入和输出特征通道数不等时,采用1x1卷积的升维操作;残差模块输入和输出通道相等时,采用直连操作。 + - `basicblock` : 一个基础残差模块,即图9左边所示,由两组3x3卷积组成的路径和一条"直连"路径组成。 + - `bottleneck` : 一个瓶颈残差模块,即图9右边所示,由上下1x1卷积和中间3x3卷积组成的路径和一条"直连"路径组成。 + - `layer_warp` : 一组残差模块,由若干个残差模块堆积而成。每组中第一个残差模块滑动窗口大小与其他可以不同,以用来减少特征图在垂直和水平方向的大小。 + +```python +def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + bias_attr=False): + tmp = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=bias_attr) + return fluid.layers.batch_norm(input=tmp, act=act) + + +def shortcut(input, ch_in, ch_out, stride): + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None) + else: + return input + + +def basicblock(input, ch_in, ch_out, stride): + tmp = conv_bn_layer(input, ch_out, 3, stride, 1) + tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True) + short = shortcut(input, ch_in, ch_out, stride) + return fluid.layers.elementwise_add(x=tmp, y=short, act='relu') + + +def layer_warp(block_func, input, ch_in, ch_out, count, stride): + tmp = block_func(input, ch_in, ch_out, stride) + for i in range(1, count): + tmp = block_func(tmp, ch_out, ch_out, 1) + return tmp +``` + +`resnet_cifar10` 的连接结构主要有以下几个过程。 + +1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。 + +2. 然后连接3组残差模块即下面配置3组 `layer_warp` ,每组采用图 10 左边残差模块组成。 + +3. 
最后对网络做均值池化并返回该层。 + +注意:除了第一层卷积层和最后一层全连接层之外,要求三组 `layer_warp` 总的含参层数能够被6整除,即 `resnet_cifar10` 的 depth 要满足 $(depth - 2) % 6 == 0$ 。 + +```python +def resnet_cifar10(ipt, depth=32): + # depth should be one of 20, 32, 44, 56, 110, 1202 + assert (depth - 2) % 6 == 0 + n = (depth - 2) / 6 + nStages = {16, 64, 128} + conv1 = conv_bn_layer(ipt, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) + res2 = layer_warp(basicblock, res1, 16, 32, n, 2) + res3 = layer_warp(basicblock, res2, 32, 64, n, 2) + pool = fluid.layers.pool2d( + input=res3, pool_size=8, pool_type='avg', pool_stride=1) + predict = fluid.layers.fc(input=pool, size=10, act='softmax') + return predict +``` + +## Inference Program 配置 + +网络输入定义为 `data_layer` (数据层),在图像分类中即为图像像素信息。CIFAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32)。 + +```python +def inference_program(): + # The image is 32 * 32 with RGB representation. + data_shape = [3, 32, 32] + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + + predict = resnet_cifar10(images, 32) + # predict = vgg_bn_drop(images) # un-comment to use vgg net + return predict +``` + +## Train Program 配置 + +然后我们需要设置训练程序 `train_program`。它首先从推理程序中进行预测。 +在训练期间,它将从预测中计算 `avg_cost`。 +在有监督训练中需要输入图像对应的类别信息,同样通过`fluid.layers.data`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。 + +**注意:** 训练程序应该返回一个数组,第一个返回参数必须是 `avg_cost`。训练器使用它来计算梯度。 + +```python +def train_program(): + predict = inference_program() + + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=predict, label=label) + return [avg_cost, accuracy] +``` + +## Optimizer Function 配置 + +在下面的 `Adam optimizer`,`learning_rate` 是训练的速度,与网络的训练收敛速度有关系。 + +```python +def optimizer_program(): + return fluid.optimizer.Adam(learning_rate=0.001) +``` + +## 训练模型 + +### Trainer 配置 + +现在,我们需要配置 
`Trainer`。`Trainer` 需要接受训练程序 `train_program`, `place` 和优化器 `optimizer_func`。 + +```python +use_cuda = False +place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() +trainer = fluid.Trainer( + train_func=train_program, + optimizer_func=optimizer_program, + place=place) +``` + +### Data Feeders 配置 + +`cifar.train10()` 每次产生一条样本,在完成shuffle和batch之后,作为训练的输入。 + +```python +# Each batch will yield 128 images +BATCH_SIZE = 128 + +# Reader for training +train_reader = paddle.batch( + paddle.reader.shuffle(paddle.dataset.cifar.train10(), buf_size=50000), + batch_size=BATCH_SIZE) + +# Reader for testing. A separated data set for testing. +test_reader = paddle.batch( + paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) +``` + +### Event Handler + +可以使用`event_handler`回调函数来观察训练过程,或进行测试等, 该回调函数是`trainer.train`函数里设定。 + +`event_handler_plot`可以用来利用回调数据来打点画图: + +

+
+图12. 训练结果 +

+ + +```python +params_dirname = "image_classification_resnet.inference.model" + +from paddle.v2.plot import Ploter + +train_title = "Train cost" +test_title = "Test cost" +cost_ploter = Ploter(train_title, test_title) + +step = 0 +def event_handler_plot(event): + global step + if isinstance(event, fluid.EndStepEvent): + if step % 1 == 0: + cost_ploter.append(train_title, step, event.metrics[0]) + cost_ploter.plot() + step += 1 + if isinstance(event, fluid.EndEpochEvent): + avg_cost, accuracy = trainer.test( + reader=test_reader, + feed_order=['pixel', 'label']) + cost_ploter.append(test_title, step, avg_cost) + + # save parameters + if params_dirname is not None: + trainer.save_params(params_dirname) +``` + +`event_handler` 用来在训练过程中输出文本日志 + +```python +params_dirname = "image_classification_resnet.inference.model" + +# event handler to track training and testing process +def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + if event.step % 100 == 0: + print("\nPass %d, Batch %d, Cost %f, Acc %f" % + (event.step, event.epoch, event.metrics[0], + event.metrics[1])) + else: + sys.stdout.write('.') + sys.stdout.flush() + + if isinstance(event, fluid.EndEpochEvent): + # Test against with the test dataset to get accuracy. 
+ avg_cost, accuracy = trainer.test( + reader=test_reader, feed_order=['pixel', 'label']) + + print('\nTest with Pass {0}, Loss {1:2.2}, Acc {2:2.2}'.format(event.epoch, avg_cost, accuracy)) + + # save parameters + if params_dirname is not None: + trainer.save_params(params_dirname) +``` + +### 训练 + +通过`trainer.train`函数训练: + +**注意:** CPU,每个 Epoch 将花费大约15~20分钟。这部分可能需要一段时间。请随意修改代码,在GPU上运行测试,以提高训练速度。 + +```python +trainer.train( + reader=train_reader, + num_epochs=2, + event_handler=event_handler, + feed_order=['pixel', 'label']) +``` + +一轮训练log示例如下所示,经过1个pass, 训练集上平均 Accuracy 为0.59 ,测试集上平均 Accuracy 为0.6 。 + +```text +Pass 0, Batch 0, Cost 3.869598, Acc 0.164062 +................................................................................................... +Pass 100, Batch 0, Cost 1.481038, Acc 0.460938 +................................................................................................... +Pass 200, Batch 0, Cost 1.340323, Acc 0.523438 +................................................................................................... +Pass 300, Batch 0, Cost 1.223424, Acc 0.593750 +.......................................................................................... +Test with Pass 0, Loss 1.1, Acc 0.6 +``` + +图13是训练的分类错误率曲线图,运行到第200个pass后基本收敛,最终得到测试集上分类错误率为8.54%。 + +

+
+图13. CIFAR10数据集上VGG模型的分类错误率 +

+ +## 应用模型 + +可以使用训练好的模型对图片进行分类,下面程序展示了如何使用 `fluid.Inferencer` 接口进行推断,可以打开注释,更改加载的模型。 + +### 生成预测输入数据 + +`dog.png` is an example image of a dog. Turn it into an numpy array to match the data feeder format. + +```python +# Prepare testing data. +from PIL import Image +import numpy as np +import os + +def load_image(file): + im = Image.open(file) + im = im.resize((32, 32), Image.ANTIALIAS) + + im = np.array(im).astype(np.float32) + # The storage order of the loaded image is W(width), + # H(height), C(channel). PaddlePaddle requires + # the CHW order, so transpose them. + im = im.transpose((2, 0, 1)) # CHW + im = im / 255.0 + + # Add one dimension to mimic the list format. + im = numpy.expand_dims(im, axis=0) + return im + +cur_dir = os.getcwd() +img = load_image(cur_dir + '/image/dog.png') +``` + +### Inferencer 配置和预测 + +`Inferencer` 需要一个 `infer_func` 和 `param_path` 来设置网络和经过训练的参数。 +我们可以简单地插入前面定义的推理程序。 +现在我们准备做预测。 + +```python +inferencer = fluid.Inferencer( + infer_func=inference_program, param_path=params_dirname, place=place) +label_list = ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"] +# inference +results = inferencer.infer({'pixel': img}) +print("infer results: %s" % label_list[np.argmax(results[0])]) +``` + +## 总结 + +传统图像分类方法由多个阶段构成,框架较为复杂,而端到端的CNN模型结构可一步到位,而且大幅度提升了分类准确率。本文我们首先介绍VGG、GoogleNet、ResNet三个经典的模型;然后基于CIFAR10数据集,介绍如何使用PaddlePaddle配置和训练CNN模型,尤其是VGG和ResNet模型;最后介绍如何使用PaddlePaddle的API接口对图片进行预测和特征提取。对于其他数据集比如ImageNet,配置和训练流程是同样的,大家可以自行进行实验。 + + +## 参考文献 + +[1] D. G. Lowe, [Distinctive image features from scale-invariant keypoints](http://www.cs.ubc.ca/~lowe/papers/ijcv04.pdf). IJCV, 60(2):91-110, 2004. + +[2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005. + +[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). 
[Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28. + +[4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003. + +[5] B. Olshausen, D. Field, [Sparse Coding with an Overcomplete Basis Set: A Strategy Employed by V1?](http://redwood.psych.cornell.edu/papers/olshausen_field_1997.pdf), Vision Research, vol. 37, pp. 3311-3325, 1997. + +[6] Wang, J., Yang, J., Yu, K., Lv, F., Huang, T., and Gong, Y. (2010). [Locality-constrained Linear Coding for image classification](http://ieeexplore.ieee.org/abstract/document/5540018/). In CVPR. + +[7] Perronnin, F., Sánchez, J., & Mensink, T. (2010). [Improving the fisher kernel for large-scale image classification](http://dl.acm.org/citation.cfm?id=1888101). In ECCV (4). + +[8] Lin, Y., Lv, F., Cao, L., Zhu, S., Yang, M., Cour, T., Yu, K., and Huang, T. (2011). [Large-scale image clas- sification: Fast feature extraction and SVM training](http://ieeexplore.ieee.org/document/5995477/). In CVPR. + +[9] Krizhevsky, A., Sutskever, I., and Hinton, G. (2012). [ImageNet classification with deep convolutional neu- ral networks](http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf). In NIPS. + +[10] G.E. Hinton, N. Srivastava, A. Krizhevsky, I. Sutskever, and R.R. Salakhutdinov. [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). arXiv preprint arXiv:1207.0580, 2012. + +[11] K. Chatfield, K. Simonyan, A. Vedaldi, A. Zisserman. [Return of the Devil in the Details: Delving Deep into Convolutional Nets](https://arxiv.org/abs/1405.3531). 
BMVC, 2014。 + +[12] Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A., [Going deeper with convolutions](https://arxiv.org/abs/1409.4842). In: CVPR. (2015) + +[13] Lin, M., Chen, Q., and Yan, S. [Network in network](https://arxiv.org/abs/1312.4400). In Proc. ICLR, 2014. + +[14] S. Ioffe and C. Szegedy. [Batch normalization: Accelerating deep network training by reducing internal covariate shift](https://arxiv.org/abs/1502.03167). In ICML, 2015. + +[15] K. He, X. Zhang, S. Ren, J. Sun. [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). CVPR 2016. + +[16] Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z. [Rethinking the incep-tion architecture for computer vision](https://arxiv.org/abs/1512.00567). In: CVPR. (2016). + +[17] Szegedy, C., Ioffe, S., Vanhoucke, V. [Inception-v4, inception-resnet and the impact of residual connections on learning](https://arxiv.org/abs/1602.07261). arXiv:1602.07261 (2016). + +[18] Everingham, M., Eslami, S. M. A., Van Gool, L., Williams, C. K. I., Winn, J. and Zisserman, A. [The Pascal Visual Object Classes Challenge: A Retrospective]((http://link.springer.com/article/10.1007/s11263-014-0733-5)). International Journal of Computer Vision, 111(1), 98-136, 2015. + +[19] He, K., Zhang, X., Ren, S., and Sun, J. [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852). ArXiv e-prints, February 2015. + +[20] http://deeplearning.net/tutorial/lenet.html + +[21] https://www.cs.toronto.edu/~kriz/cifar.html + +[22] http://cs231n.github.io/classification/ + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png deleted file mode 100644 index ca8f858a902ea723d886d2b88c2c0a1005301c50..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png deleted file mode 100644 index 38b21f21604b1bb84fc3f6aa96bd5fce45d15a55..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png deleted file mode 100644 index 647c822e52cd55d50e5f207978f5e6ada86cf34c..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png deleted file mode 100644 index 04245cef60fe7126ae4c92ba8085273965078bee..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg deleted file mode 100644 index 249dbf96df61c3352ea5bd80470f6c4a1e03ff10..0000000000000000000000000000000000000000 Binary files 
a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png deleted file mode 100644 index 4660ac122e9d533023a21154d35eee29e3b08d27..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png deleted file mode 100644 index 9591a0c1e8c0165c40ca560be35a7b9a91cd5027..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png deleted file mode 100644 index 77f785e03bacd38c4c64a817874a58ff3298d2f3..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png deleted file mode 100644 index 57e45cc0c27dd99b9918de2ff1228bc6b65f7424..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png deleted file mode 100644 index 
0aeb4f254639fdbf18e916dc219ca61602596d85..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg deleted file mode 100644 index c500eb01a90190ff66150871fe83ec275e2de8d7..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png deleted file mode 100644 index c6336a9a69b95dc978719ce68896e3e752e67fed..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png deleted file mode 100644 index 6270eefcfd7071bc1643ee06567e5b81aaf4c177..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/index.rst b/doc/fluid/new_docs/beginners_guide/basics/index.rst index d16f8b947253a535567ddc8d7b227dd153d9b154..0fcb008e0a7773e81e5124da09fe07366130b924 100644 --- a/doc/fluid/new_docs/beginners_guide/basics/index.rst +++ b/doc/fluid/new_docs/beginners_guide/basics/index.rst @@ -10,9 +10,9 @@ .. 
toctree:: :maxdepth: 2 - image_classification/index.md - word2vec/index.md - recommender_system/index.md - understand_sentiment/index.md - label_semantic_roles/index.md - machine_translation/index.md + image_classification/README.cn.md + word2vec/README.cn.md + recommender_system/README.cn.md + understand_sentiment/README.cn.md + label_semantic_roles/README.cn.md + machine_translation/README.cn.md diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/index.md b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md similarity index 54% rename from doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/index.md rename to doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md index 828ca738317992270487647e66b08b6d2f80e209..0891f5b6b16a1b715b44db6c47ba079adfcad4c5 100644 --- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/index.md +++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md @@ -1,568 +1,562 @@ -# 语义角色标注 - -本教程源代码目录在[book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/07.label_semantic_roles), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。 - -## 背景介绍 - -自然语言分析技术大致分为三个层面:词法分析、句法分析和语义分析。语义角色标注是实现浅层语义分析的一种方式。在一个句子中,谓词是对主语的陈述或说明,指出“做什么”、“是什么”或“怎么样,代表了一个事件的核心,跟谓词搭配的名词称为论元。语义角色是指论元在动词所指事件中担任的角色。主要有:施事者(Agent)、受事者(Patient)、客体(Theme)、经验者(Experiencer)、受益者(Beneficiary)、工具(Instrument)、处所(Location)、目标(Goal)和来源(Source)等。 - -请看下面的例子,“遇到” 是谓词(Predicate,通常简写为“Pred”),“小明”是施事者(Agent),“小红”是受事者(Patient),“昨天” 是事件发生的时间(Time),“公园”是事情发生的地点(Location)。 - -$$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_{\mbox{Time}}\mbox{在[公园]}_{\mbox{Location}}\mbox{[遇到]}_{\mbox{Predicate}}\mbox{了[小红]}_{\mbox{Patient}}\mbox{。}$$ - -语义角色标注(Semantic Role Labeling,SRL)以句子的谓词为中心,不对句子所包含的语义信息进行深入分析,只分析句子中各成分与谓词之间的关系,即句子的谓词(Predicate)- 
论元(Argument)结构,并用语义角色来描述这些结构关系,是许多自然语言理解任务(如信息抽取,篇章分析,深度问答等)的一个重要中间步骤。在研究中一般都假定谓词是给定的,所要做的就是找出给定谓词的各个论元和它们的语义角色。 - -传统的SRL系统大多建立在句法分析基础之上,通常包括5个流程: - -1. 构建一棵句法分析树,例如,图1是对上面例子进行依存句法分析得到的一棵句法树。 -2. 从句法树上识别出给定谓词的候选论元。 -3. 候选论元剪除;一个句子中的候选论元可能很多,候选论元剪除就是从大量的候选项中剪除那些最不可能成为论元的候选项。 -4. 论元识别:这个过程是从上一步剪除之后的候选中判断哪些是真正的论元,通常当做一个二分类问题来解决。 -5. 对第4步的结果,通过多分类得到论元的语义角色标签。可以看到,句法分析是基础,并且后续步骤常常会构造的一些人工特征,这些特征往往也来自句法分析。 - -![dependencyParsing](./image/dependency_parsing.png) -
-图1. 依存句法分析句法树示例 -
- -然而,完全句法分析需要确定句子所包含的全部句法信息,并确定句子各成分之间的关系,是一个非常困难的任务,目前技术下的句法分析准确率并不高,句法分析的细微错误都会导致SRL的错误。为了降低问题的复杂度,同时获得一定的句法结构信息,“浅层句法分析”的思想应运而生。浅层句法分析也称为部分句法分析(partial parsing)或语块划分(chunking)。和完全句法分析得到一颗完整的句法树不同,浅层句法分析只需要识别句子中某些结构相对简单的独立成分,例如:动词短语,这些被识别出来的结构称为语块。为了回避 “无法获得准确率较高的句法树” 所带来的困难,一些研究\[[1](#参考文献)\]也提出了基于语块(chunk)的SRL方法。基于语块的SRL方法将SRL作为一个序列标注问题来解决。序列标注任务一般都会采用BIO表示方式来定义序列标注的标签集,我们先来介绍这种表示方法。在BIO表示法中,B代表语块的开始,I代表语块的中间,O代表语块结束。通过B、I、O 三种标记将不同的语块赋予不同的标签,例如:对于一个角色为A的论元,将它所包含的第一个语块赋予标签B-A,将它所包含的其它语块赋予标签I-A,不属于任何论元的语块赋予标签O。 - -我们继续以上面的这句话为例,图1展示了BIO表示方法。 - -![bioExample](./image/bio_example.png) -
-图2. BIO标注方法示例 -
- -从上面的例子可以看到,根据序列标注结果可以直接得到论元的语义角色标注结果,是一个相对简单的过程。这种简单性体现在:(1)依赖浅层句法分析,降低了句法分析的要求和难度;(2)没有了候选论元剪除这一步骤;(3)论元的识别和论元标注是同时实现的。这种一体化处理论元识别和论元标注的方法,简化了流程,降低了错误累积的风险,往往能够取得更好的结果。 - -与基于语块的SRL方法类似,在本教程中我们也将SRL看作一个序列标注问题,不同的是,我们只依赖输入文本序列,不依赖任何额外的语法解析结果或是复杂的人造特征,利用深度神经网络构建一个端到端学习的SRL系统。我们以[CoNLL-2004 and CoNLL-2005 Shared Tasks](http://www.cs.upc.edu/~srlconll/)任务中SRL任务的公开数据集为例,实践下面的任务:给定一句话和这句话里的一个谓词,通过序列标注的方式,从句子中找到谓词对应的论元,同时标注它们的语义角色。 - -## 模型概览 - -循环神经网络(Recurrent Neural Network)是一种对序列建模的重要模型,在自然语言处理任务中有着广泛地应用。不同于前馈神经网络(Feed-forward Neural Network),RNN能够处理输入之间前后关联的问题。LSTM是RNN的一种重要变种,常用来学习长序列中蕴含的长程依赖关系,我们在[情感分析](https://github.com/PaddlePaddle/book/tree/develop/05.understand_sentiment)一篇中已经介绍过,这一篇中我们依然利用LSTM来解决SRL问题。 - -### 栈式循环神经网络(Stacked Recurrent Neural Network) - -深层网络有助于形成层次化特征,网络上层在下层已经学习到的初级特征基础上,形成更复杂的高级特征。尽管LSTM沿时间轴展开后等价于一个非常“深”的前馈网络,但由于LSTM各个时间步参数共享,`$t-1$`时刻状态到`$t$`时刻的映射,始终只经过了一次非线性映射,也就是说单层LSTM对状态转移的建模是 “浅” 的。堆叠多个LSTM单元,令前一个LSTM`$t$`时刻的输出,成为下一个LSTM单元`$t$`时刻的输入,帮助我们构建起一个深层网络,我们把它称为第一个版本的栈式循环神经网络。深层网络提高了模型拟合复杂模式的能力,能够更好地建模跨不同时间步的模式\[[2](#参考文献)\]。 - -然而,训练一个深层LSTM网络并非易事。纵向堆叠多个LSTM单元可能遇到梯度在纵向深度上传播受阻的问题。通常,堆叠4层LSTM单元可以正常训练,当层数达到4~8层时,会出现性能衰减,这时必须考虑一些新的结构以保证梯度纵向顺畅传播,这是训练深层LSTM网络必须解决的问题。我们可以借鉴LSTM解决 “梯度消失梯度爆炸” 问题的智慧之一:在记忆单元(Memory Cell)这条信息传播的路线上没有非线性映射,当梯度反向传播时既不会衰减、也不会爆炸。因此,深层LSTM模型也可以在纵向上添加一条保证梯度顺畅传播的路径。 - -一个LSTM单元完成的运算可以被分为三部分:(1)输入到隐层的映射(input-to-hidden) :每个时间步输入信息`$x$`会首先经过一个矩阵映射,再作为遗忘门,输入门,记忆单元,输出门的输入,注意,这一次映射没有引入非线性激活;(2)隐层到隐层的映射(hidden-to-hidden):这一步是LSTM计算的主体,包括遗忘门,输入门,记忆单元更新,输出门的计算;(3)隐层到输出的映射(hidden-to-output):通常是简单的对隐层向量进行激活。我们在第一个版本的栈式网络的基础上,加入一条新的路径:除上一层LSTM输出之外,将前层LSTM的输入到隐层的映射作为的一个新的输入,同时加入一个线性映射去学习一个新的变换。 - -图3是最终得到的栈式循环神经网络结构示意图。 - -![lstmStructure](./image/stacked_lstm.png) -

-图3. 基于LSTM的栈式循环神经网络结构示意图 -

- -### 双向循环神经网络(Bidirectional Recurrent Neural Network) - -在LSTM中,`$t$`时刻的隐藏层向量编码了到`$t$`时刻为止所有输入的信息,但`$t$`时刻的LSTM可以看到历史,却无法看到未来。在绝大多数自然语言处理任务中,我们几乎总是能拿到整个句子。这种情况下,如果能够像获取历史信息一样,得到未来的信息,对序列学习任务会有很大的帮助。 - -为了克服这一缺陷,我们可以设计一种双向循环网络单元,它的思想简单且直接:对上一节的栈式循环神经网络进行一个小小的修改,堆叠多个LSTM单元,让每一层LSTM单元分别以:正向、反向、正向 …… 的顺序学习上一层的输出序列。于是,从第2层开始,`$t$`时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。 - -![lstmStructure](./image/bidirectional_stacked_lstm.png) -

-图4. 基于LSTM的双向循环神经网络结构示意图 -

- -需要说明的是,这种双向RNN结构和Bengio等人在机器翻译任务中使用的双向RNN结构\[[3](#参考文献), [4](#参考文献)\] 并不相同,我们会在后续[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)任务中,介绍另一种双向循环神经网络。 - -### 条件随机场 (Conditional Random Field) - -使用神经网络模型解决问题的思路通常是:前层网络学习输入的特征表示,网络的最后一层在特征基础上完成最终的任务。在SRL任务中,深层LSTM网络学习输入的特征表示,条件随机场(Conditional Random Filed, CRF)在特征的基础上完成序列标注,处于整个网络的末端。 - -CRF是一种概率化结构模型,可以看作是一个概率无向图模型,结点表示随机变量,边表示随机变量之间的概率依赖关系。简单来讲,CRF学习条件概率`$P(X|Y)$`,其中 `$X = (x_1, x_2, ... , x_n)$` 是输入序列,`$Y = (y_1, y_2, ... , y_n)$` 是标记序列;解码过程是给定 `$X$`序列求解令`$P(Y|X)$`最大的`$Y$`序列,即`$Y^* = \mbox{arg max}_{Y} P(Y | X)$`。 - -序列标注任务只需要考虑输入和输出都是一个线性序列,并且由于我们只是将输入序列作为条件,不做任何条件独立假设,因此输入序列的元素之间并不存在图结构。综上,在序列标注任务中使用的是如图5所示的定义在链式图上的CRF,称之为线性链条件随机场(Linear Chain Conditional Random Field)。 - -![linear_chain_crf](./image/linear_chain_crf.png) -

-图5. 序列标注任务中使用的线性链条件随机场 -

- -根据线性链条件随机场上的因子分解定理\[[5](#参考文献)\],在给定观测序列`$X$`时,一个特定标记序列`$Y$`的概率可以定义为: - -$$p(Y | X) = \frac{1}{Z(X)} \text{exp}\left(\sum_{i=1}^{n}\left(\sum_{j}\lambda_{j}t_{j} (y_{i - 1}, y_{i}, X, i) + \sum_{k} \mu_k s_k (y_i, X, i)\right)\right)$$ - -其中`$Z(X)$`是归一化因子,`$t_j$` 是定义在边上的特征函数,依赖于当前和前一个位置,称为转移特征,表示对于输入序列`$X$`及其标注序列在 `$i$`及`$i - 1$`位置上标记的转移概率。`$s_k$`是定义在结点上的特征函数,称为状态特征,依赖于当前位置,表示对于观察序列`$X$`及其`$i$`位置的标记概率。`$\lambda_j$` 和 `$\mu_k$` 分别是转移特征函数和状态特征函数对应的权值。实际上,`$t$`和`$s$`可以用相同的数学形式表示,再对转移特征和状态特在各个位置`$i$`求和有:`$f_{k}(Y, X) = \sum_{i=1}^{n}f_k({y_{i - 1}, y_i, X, i})$`,把`$f$`统称为特征函数,于是`$P(Y|X)$`可表示为: - -$$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$ - -`$\omega$`是特征函数对应的权值,是CRF模型要学习的参数。训练时,对于给定的输入序列和对应的标记序列集合`$D = \left[(X_1, Y_1), (X_2 , Y_2) , ... , (X_N, Y_N)\right]$` ,通过正则化的极大似然估计,求解如下优化目标: - -$$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$ - -这个优化目标可以通过反向传播算法和整个神经网络一起求解。解码时,对于给定的输入序列`$X$`,通过解码算法(通常有:维特比算法、Beam Search)求令出条件概率`$\bar{P}(Y|X)$`最大的输出序列 `$\bar{Y}$`。 - -### 深度双向LSTM(DB-LSTM)SRL模型 - -在SRL任务中,输入是 “谓词” 和 “一句话”,目标是从这句话中找到谓词的论元,并标注论元的语义角色。如果一个句子含有`$n$`个谓词,这个句子会被处理`$n$`次。一个最为直接的模型是下面这样: - -1. 构造输入; -- 输入1是谓词,输入2是句子 -- 将输入1扩展成和输入2一样长的序列,用one-hot方式表示; -2. one-hot方式的谓词序列和句子序列通过词表,转换为实向量表示的词向量序列; -3. 将步骤2中的2个词向量序列作为双向LSTM的输入,学习输入序列的特征表示; -4. CRF以步骤3中模型学习到的特征为输入,以标记序列为监督信号,实现序列标注; - -大家可以尝试上面这种方法。这里,我们提出一些改进,引入两个简单但对提高系统性能非常有效的特征: - -- 谓词上下文:上面的方法中,只用到了谓词的词向量表达谓词相关的所有信息,这种方法始终是非常弱的,特别是如果谓词在句子中出现多次,有可能引起一定的歧义。从经验出发,谓词前后若干个词的一个小片段,能够提供更丰富的信息,帮助消解歧义。于是,我们把这样的经验也添加到模型中,为每个谓词同时抽取一个“谓词上下文” 片段,也就是从这个谓词前后各取`$n$`个词构成的一个窗口片段; -- 谓词上下文区域标记:为句子中的每一个词引入一个0-1二值变量,表示它们是否在“谓词上下文”片段中; - -修改后的模型如下(图6是一个深度为4的模型结构示意图): - -1. 构造输入 -- 输入1是句子序列,输入2是谓词序列,输入3是谓词上下文,从句子中抽取这个谓词前后各`$n$`个词,构成谓词上下文,用one-hot方式表示,输入4是谓词上下文区域标记,标记了句子中每一个词是否在谓词上下文中; -- 将输入2~3均扩展为和输入1一样长的序列; -2. 输入1~4均通过词表取词向量转换为实向量表示的词向量序列;其中输入1、3共享同一个词表,输入2和4各自独有词表; -3. 
第2步的4个词向量序列作为双向LSTM模型的输入;LSTM模型学习输入序列的特征表示,得到新的特性表示序列; -4. CRF以第3步中LSTM学习到的特征为输入,以标记序列为监督信号,完成序列标注; - -![db_lstm_network](./image/db_lstm_network.png) -
-图6. SRL任务上的深层双向LSTM模型 -
- - -## 数据介绍 - -在此教程中,我们选用[CoNLL 2005](http://www.cs.upc.edu/~srlconll/)SRL任务开放出的数据集作为示例。需要特别说明的是,CoNLL 2005 SRL任务的训练数集和开发集在比赛之后并非免费进行公开,目前,能够获取到的只有测试集,包括Wall Street Journal的23节和Brown语料集中的3节。在本教程中,我们以测试集中的WSJ数据为训练集来讲解模型。但是,由于测试集中样本的数量远远不够,如果希望训练一个可用的神经网络SRL系统,请考虑付费获取全量数据。 - -原始数据中同时包括了词性标注、命名实体识别、语法解析树等多种信息。本教程中,我们使用test.wsj文件夹中的数据进行训练和测试,并只会用到words文件夹(文本序列)和props文件夹(标注结果)下的数据。本教程使用的数据目录如下: - -```text -conll05st-release/ -└── test.wsj -├── props # 标注结果 -└── words # 输入文本序列 -``` - -标注信息源自Penn TreeBank\[[7](#参考文献)\]和PropBank\[[8](#参考文献)\]的标注结果。PropBank标注结果的标签和我们在文章一开始示例中使用的标注结果标签不同,但原理是相同的,关于标注结果标签含义的说明,请参考论文\[[9](#参考文献)\]。 - -原始数据需要进行数据预处理才能被PaddlePaddle处理,预处理包括下面几个步骤: - -1. 将文本序列和标记序列其合并到一条记录中; -2. 一个句子如果含有`$n$`个谓词,这个句子会被处理`$n$`次,变成`$n$`条独立的训练样本,每个样本一个不同的谓词; -3. 抽取谓词上下文和构造谓词上下文区域标记; -4. 构造以BIO法表示的标记; -5. 依据词典获取词对应的整数索引。 - - -```python -# import paddle.v2.dataset.conll05 as conll05 -# conll05.corpus_reader函数完成上面第1步和第2步. -# conll05.reader_creator函数完成上面第3步到第5步. -# conll05.test函数可以获取处理之后的每条样本来供PaddlePaddle训练. -``` - -预处理完成之后一条训练样本包含9个特征,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 - -| 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 | -|---|---|---|---|---| -| A | set | n't been set . × | 0 | B-A1 | -| record | set | n't been set . × | 0 | I-A1 | -| date | set | n't been set . × | 0 | I-A1 | -| has | set | n't been set . × | 0 | O | -| n't | set | n't been set . × | 1 | B-AM-NEG | -| been | set | n't been set . × | 1 | O | -| set | set | n't been set . × | 1 | B-V | -| . | set | n't been set . 
× | 1 | O | - - -除数据之外,我们同时提供了以下资源: - -| 文件名称 | 说明 | -|---|---| -| word_dict | 输入句子的词典,共计44068个词 | -| label_dict | 标记的词典,共计106个标记 | -| predicate_dict | 谓词的词典,共计3162个词 | -| emb | 一个训练好的词表,32维 | - -我们在英文维基百科上训练语言模型得到了一份词向量用来初始化SRL模型。在SRL模型训练过程中,词向量不再被更新。关于语言模型和词向量可以参考[词向量](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md) 这篇教程。我们训练语言模型的语料共有995,000,000个token,词典大小控制为4900,000词。CoNLL 2005训练语料中有5%的词不在这4900,000个词中,我们将它们全部看作未登录词,用``表示。 - -获取词典,打印词典大小: - -```python -import math, os -import numpy as np -import paddle -import paddle.v2.dataset.conll05 as conll05 -import paddle.fluid as fluid -import time - -with_gpu = os.getenv('WITH_GPU', '0') != '0' - -word_dict, verb_dict, label_dict = conll05.get_dict() -word_dict_len = len(word_dict) -label_dict_len = len(label_dict) -pred_dict_len = len(verb_dict) - -print word_dict_len -print label_dict_len -print pred_dict_len -``` - -## 模型配置说明 - -- 定义输入数据维度及模型超参数。 - -```python -mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 -word_dim = 32 # 词向量维度 -mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 -hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 -depth = 8 # 栈式LSTM的深度 -mix_hidden_lr = 1e-3 - -IS_SPARSE = True -PASS_NUM = 10 -BATCH_SIZE = 10 - -embedding_name = 'emb' -``` - -这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维,关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。 - -- 如上文提到,我们用基于英文维基百科训练好的词向量来初始化序列输入、谓词上下文总共6个特征的embedding层参数,在训练中不更新。 - -```python -# 这里加载PaddlePaddle上版保存的二进制模型 -def load_parameter(file_name, h, w): -with open(file_name, 'rb') as f: -f.read(16) # skip header. 
-return np.fromfile(f, dtype=np.float32).reshape(h, w) -``` - -- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。 - -```python -def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, -**ignored): -# 8 features -predicate_embedding = fluid.layers.embedding( -input=predicate, -size=[pred_dict_len, word_dim], -dtype='float32', -is_sparse=IS_SPARSE, -param_attr='vemb') - -mark_embedding = fluid.layers.embedding( -input=mark, -size=[mark_dict_len, mark_dim], -dtype='float32', -is_sparse=IS_SPARSE) - -word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] -# Since word vector lookup table is pre-trained, we won't update it this time. -# trainable being False prevents updating the lookup table during training. -emb_layers = [ -fluid.layers.embedding( -size=[word_dict_len, word_dim], -input=x, -param_attr=fluid.ParamAttr( -name=embedding_name, trainable=False)) for x in word_input -] -emb_layers.append(predicate_embedding) -emb_layers.append(mark_embedding) - -# 8 LSTM units are trained through alternating left-to-right / right-to-left order -# denoted by the variable `reverse`. -hidden_0_layers = [ -fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') -for emb in emb_layers -] - -hidden_0 = fluid.layers.sums(input=hidden_0_layers) - -lstm_0 = fluid.layers.dynamic_lstm( -input=hidden_0, -size=hidden_dim, -candidate_activation='relu', -gate_activation='sigmoid', -cell_activation='sigmoid') - -# stack L-LSTM and R-LSTM with direct edges -input_tmp = [hidden_0, lstm_0] - -# In PaddlePaddle, state features and transition features of a CRF are implemented -# by a fully connected layer and a CRF layer seperately. The fully connected layer -# with linear activation learns the state features, here we use fluid.layers.sums -# (fluid.layers.fc can be uesed as well), and the CRF layer in PaddlePaddle: -# fluid.layers.linear_chain_crf only -# learns the transition features, which is a cost layer and is the last layer of the network. 
-# fluid.layers.linear_chain_crf outputs the log probability of true tag sequence -# as the cost by given the input sequence and it requires the true tag sequence -# as target in the learning process. - -for i in range(1, depth): -mix_hidden = fluid.layers.sums(input=[ -fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), -fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') -]) - -lstm = fluid.layers.dynamic_lstm( -input=mix_hidden, -size=hidden_dim, -candidate_activation='relu', -gate_activation='sigmoid', -cell_activation='sigmoid', -is_reverse=((i % 2) == 1)) - -input_tmp = [mix_hidden, lstm] - -# 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射, -# 经过一个全连接层映射到标记字典的维度,来学习 CRF 的状态特征 -feature_out = fluid.layers.sums(input=[ -fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), -fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') -]) - -return feature_out -``` - -## 训练模型 - -- 我们根据网络拓扑结构和模型参数来构造出trainer用来训练,在构造时还需指定优化方法,这里使用最基本的SGD方法(momentum设置为0),同时设定了学习率、正则等。 - -- 数据介绍部分提到CoNLL 2005训练集付费,这里我们使用测试集训练供大家学习。conll05.test()每次产生一条样本,包含9个特征,shuffle和组完batch后作为训练的输入。 - -- 通过feeding来指定每一个数据和data_layer的对应关系。 例如 下面feeding表示: conll05.test()产生数据的第0列对应word_data层的特征。 - -- 可以使用event_handler回调函数来观察训练过程,或进行测试等。这里我们打印了训练过程的cost,该回调函数是trainer.train函数里设定。 - -- 通过trainer.train函数训练 - -```python -def train(use_cuda, save_dirname=None, is_local=True): -# define network topology - -# 句子序列 -word = fluid.layers.data( -name='word_data', shape=[1], dtype='int64', lod_level=1) - -# 谓词 -predicate = fluid.layers.data( -name='verb_data', shape=[1], dtype='int64', lod_level=1) - -# 谓词上下文5个特征 -ctx_n2 = fluid.layers.data( -name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) -ctx_n1 = fluid.layers.data( -name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) -ctx_0 = fluid.layers.data( -name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) -ctx_p1 = fluid.layers.data( -name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) -ctx_p2 = 
fluid.layers.data( -name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) - -# 谓词上下区域标志 -mark = fluid.layers.data( -name='mark_data', shape=[1], dtype='int64', lod_level=1) - -# define network topology -feature_out = db_lstm(**locals()) - -# 标注序列 -target = fluid.layers.data( -name='target', shape=[1], dtype='int64', lod_level=1) - -# 学习 CRF 的转移特征 -crf_cost = fluid.layers.linear_chain_crf( -input=feature_out, -label=target, -param_attr=fluid.ParamAttr( -name='crfw', learning_rate=mix_hidden_lr)) - -avg_cost = fluid.layers.mean(crf_cost) - -sgd_optimizer = fluid.optimizer.SGD( -learning_rate=fluid.layers.exponential_decay( -learning_rate=0.01, -decay_steps=100000, -decay_rate=0.5, -staircase=True)) - -sgd_optimizer.minimize(avg_cost) - -# The CRF decoding layer is used for evaluation and inference. -# It shares weights with CRF layer. The sharing of parameters among multiple layers -# is specified by using the same parameter name in these layers. If true tag sequence -# is provided in training process, `fluid.layers.crf_decoding` calculates labelling error -# for each input token and sums the error over the entire sequence. -# Otherwise, `fluid.layers.crf_decoding` generates the labelling tags. 
-crf_decode = fluid.layers.crf_decoding( -input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) - -train_data = paddle.batch( -paddle.reader.shuffle( -paddle.dataset.conll05.test(), buf_size=8192), -batch_size=BATCH_SIZE) - -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - - -feeder = fluid.DataFeeder( -feed_list=[ -word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target -], -place=place) -exe = fluid.Executor(place) - -def train_loop(main_program): -exe.run(fluid.default_startup_program()) -embedding_param = fluid.global_scope().find_var( -embedding_name).get_tensor() -embedding_param.set( -load_parameter(conll05.get_embedding(), word_dict_len, word_dim), -place) - -start_time = time.time() -batch_id = 0 -for pass_id in xrange(PASS_NUM): -for data in train_data(): -cost = exe.run(main_program, -feed=feeder.feed(data), -fetch_list=[avg_cost]) -cost = cost[0] - -if batch_id % 10 == 0: -print("avg_cost:" + str(cost)) -if batch_id != 0: -print("second per batch: " + str((time.time( -) - start_time) / batch_id)) -# Set the threshold low to speed up the CI test -if float(cost) < 60.0: -if save_dirname is not None: -fluid.io.save_inference_model(save_dirname, [ -'word_data', 'verb_data', 'ctx_n2_data', -'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', -'ctx_p2_data', 'mark_data' -], [feature_out], exe) -return - -batch_id = batch_id + 1 - -train_loop(fluid.default_main_program()) -``` - - -## 应用模型 - -训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。以下我们给出一个使用训练后的模型进行预测的示例。 - -```python -def infer(use_cuda, save_dirname=None): -if save_dirname is None: -return - -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() -exe = fluid.Executor(place) - -inference_scope = fluid.core.Scope() -with fluid.scope_guard(inference_scope): -# Use fluid.io.load_inference_model to obtain the inference program desc, -# the feed_target_names (the names of variables that will be fed -# data using feed operators), and the fetch_targets (variables 
that -# we want to obtain data from using fetch operators). -[inference_program, feed_target_names, -fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - -# Setup inputs by creating LoDTensors to represent sequences of words. -# Here each word is the basic element of these LoDTensors and the shape of -# each word (base_shape) should be [1] since it is simply an index to -# look up for the corresponding word vector. -# Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], -# which has only one lod level. Then the created LoDTensors will have only -# one higher level structure (sequence of words, or sentence) than the basic -# element (word). Hence the LoDTensor will hold data for three sentences of -# length 3, 4 and 2, respectively. -# Note that lod info should be a list of lists. -lod = [[3, 4, 2]] -base_shape = [1] -# The range of random integers is [low, high] -word = fluid.create_random_int_lodtensor( -lod, base_shape, place, low=0, high=word_dict_len - 1) -pred = fluid.create_random_int_lodtensor( -lod, base_shape, place, low=0, high=pred_dict_len - 1) -ctx_n2 = fluid.create_random_int_lodtensor( -lod, base_shape, place, low=0, high=word_dict_len - 1) -ctx_n1 = fluid.create_random_int_lodtensor( -lod, base_shape, place, low=0, high=word_dict_len - 1) -ctx_0 = fluid.create_random_int_lodtensor( -lod, base_shape, place, low=0, high=word_dict_len - 1) -ctx_p1 = fluid.create_random_int_lodtensor( -lod, base_shape, place, low=0, high=word_dict_len - 1) -ctx_p2 = fluid.create_random_int_lodtensor( -lod, base_shape, place, low=0, high=word_dict_len - 1) -mark = fluid.create_random_int_lodtensor( -lod, base_shape, place, low=0, high=mark_dict_len - 1) - -# Construct feed as a dictionary of {feed_target_name: feed_target_data} -# and results will contain a list of data corresponding to fetch_targets. 
-assert feed_target_names[0] == 'word_data' -assert feed_target_names[1] == 'verb_data' -assert feed_target_names[2] == 'ctx_n2_data' -assert feed_target_names[3] == 'ctx_n1_data' -assert feed_target_names[4] == 'ctx_0_data' -assert feed_target_names[5] == 'ctx_p1_data' -assert feed_target_names[6] == 'ctx_p2_data' -assert feed_target_names[7] == 'mark_data' - -results = exe.run(inference_program, -feed={ -feed_target_names[0]: word, -feed_target_names[1]: pred, -feed_target_names[2]: ctx_n2, -feed_target_names[3]: ctx_n1, -feed_target_names[4]: ctx_0, -feed_target_names[5]: ctx_p1, -feed_target_names[6]: ctx_p2, -feed_target_names[7]: mark -}, -fetch_list=fetch_targets, -return_numpy=False) -print(results[0].lod()) -np_data = np.array(results[0]) -print("Inference Shape: ", np_data.shape) -``` - -整个程序的入口如下: - -```python -def main(use_cuda, is_local=True): -if use_cuda and not fluid.core.is_compiled_with_cuda(): -return - -# Directory for saving the trained model -save_dirname = "label_semantic_roles.inference.model" - -train(use_cuda, save_dirname, is_local) -infer(use_cuda, save_dirname) - - -main(use_cuda=False) -``` - -## 总结 - -语义角色标注是许多自然语言理解任务的重要中间步骤。这篇教程中我们以语义角色标注任务为例,介绍如何利用PaddlePaddle进行序列标注任务。教程中所介绍的模型来自我们发表的论文\[[10](#参考文献)\]。由于 CoNLL 2005 SRL任务的训练数据目前并非完全开放,教程中只使用测试数据作为示例。在这个过程中,我们希望减少对其它自然语言处理工具的依赖,利用神经网络数据驱动、端到端学习的能力,得到一个和传统方法可比、甚至更好的模型。在论文中我们证实了这种可能性。关于模型更多的信息和讨论可以在论文中找到。 - -## 参考文献 -1. Sun W, Sui Z, Wang M, et al. [Chinese semantic role labeling with shallow parsing](http://www.aclweb.org/anthology/D09-1#page=1513)[C]//Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 3-Volume 3. Association for Computational Linguistics, 2009: 1475-1483. -2. Pascanu R, Gulcehre C, Cho K, et al. [How to construct deep recurrent neural networks](https://arxiv.org/abs/1312.6026)[J]. arXiv preprint arXiv:1312.6026, 2013. -3. Cho K, Van Merriënboer B, Gulcehre C, et al. 
[Learning phrase representations using RNN encoder-decoder for statistical machine translation](https://arxiv.org/abs/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014. -4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[J]. arXiv preprint arXiv:1409.0473, 2014. -5. Lafferty J, McCallum A, Pereira F. [Conditional random fields: Probabilistic models for segmenting and labeling sequence data](http://www.jmlr.org/papers/volume15/doppa14a/source/biblio.bib.old)[C]//Proceedings of the eighteenth international conference on machine learning, ICML. 2001, 1: 282-289. -6. 李航. 统计学习方法[J]. 清华大学出版社, 北京, 2012. -7. Marcus M P, Marcinkiewicz M A, Santorini B. [Building a large annotated corpus of English: The Penn Treebank](http://repository.upenn.edu/cgi/viewcontent.cgi?article=1246&context=cis_reports)[J]. Computational linguistics, 1993, 19(2): 313-330. -8. Palmer M, Gildea D, Kingsbury P. [The proposition bank: An annotated corpus of semantic roles](http://www.mitpressjournals.org/doi/pdfplus/10.1162/0891201053630264)[J]. Computational linguistics, 2005, 31(1): 71-106. -9. Carreras X, Màrquez L. [Introduction to the CoNLL-2005 shared task: Semantic role labeling](http://www.cs.upc.edu/~srlconll/st05/papers/intro.pdf)[C]//Proceedings of the Ninth Conference on Computational Natural Language Learning. Association for Computational Linguistics, 2005: 152-164. -10. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 +# 语义角色标注 + +本教程源代码目录在[book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/07.label_semantic_roles), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/178.html)。 + +## 背景介绍 + +自然语言分析技术大致分为三个层面:词法分析、句法分析和语义分析。语义角色标注是实现浅层语义分析的一种方式。在一个句子中,谓词是对主语的陈述或说明,指出“做什么”、“是什么”或“怎么样,代表了一个事件的核心,跟谓词搭配的名词称为论元。语义角色是指论元在动词所指事件中担任的角色。主要有:施事者(Agent)、受事者(Patient)、客体(Theme)、经验者(Experiencer)、受益者(Beneficiary)、工具(Instrument)、处所(Location)、目标(Goal)和来源(Source)等。 + +请看下面的例子,“遇到” 是谓词(Predicate,通常简写为“Pred”),“小明”是施事者(Agent),“小红”是受事者(Patient),“昨天” 是事件发生的时间(Time),“公园”是事情发生的地点(Location)。 + +$$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mbox{Time}\mbox{在[公园]}_{\mbox{Location}}\mbox{[遇到]}_{\mbox{Predicate}}\mbox{了[小红]}_{\mbox{Patient}}\mbox{。}$$ + +语义角色标注(Semantic Role Labeling,SRL)以句子的谓词为中心,不对句子所包含的语义信息进行深入分析,只分析句子中各成分与谓词之间的关系,即句子的谓词(Predicate)- 论元(Argument)结构,并用语义角色来描述这些结构关系,是许多自然语言理解任务(如信息抽取,篇章分析,深度问答等)的一个重要中间步骤。在研究中一般都假定谓词是给定的,所要做的就是找出给定谓词的各个论元和它们的语义角色。 + +传统的SRL系统大多建立在句法分析基础之上,通常包括5个流程: + +1. 构建一棵句法分析树,例如,图1是对上面例子进行依存句法分析得到的一棵句法树。 +2. 从句法树上识别出给定谓词的候选论元。 +3. 候选论元剪除;一个句子中的候选论元可能很多,候选论元剪除就是从大量的候选项中剪除那些最不可能成为论元的候选项。 +4. 论元识别:这个过程是从上一步剪除之后的候选中判断哪些是真正的论元,通常当做一个二分类问题来解决。 +5. 对第4步的结果,通过多分类得到论元的语义角色标签。可以看到,句法分析是基础,并且后续步骤常常会构造的一些人工特征,这些特征往往也来自句法分析。 + +
+
+图1. 依存句法分析句法树示例 +
+ +然而,完全句法分析需要确定句子所包含的全部句法信息,并确定句子各成分之间的关系,是一个非常困难的任务,目前技术下的句法分析准确率并不高,句法分析的细微错误都会导致SRL的错误。为了降低问题的复杂度,同时获得一定的句法结构信息,“浅层句法分析”的思想应运而生。浅层句法分析也称为部分句法分析(partial parsing)或语块划分(chunking)。和完全句法分析得到一棵完整的句法树不同,浅层句法分析只需要识别句子中某些结构相对简单的独立成分,例如:动词短语,这些被识别出来的结构称为语块。为了回避 “无法获得准确率较高的句法树” 所带来的困难,一些研究\[[1](#参考文献)\]也提出了基于语块(chunk)的SRL方法。基于语块的SRL方法将SRL作为一个序列标注问题来解决。序列标注任务一般都会采用BIO表示方式来定义序列标注的标签集,我们先来介绍这种表示方法。在BIO表示法中,B代表语块的开始,I代表语块的中间,O代表不属于任何语块。通过B、I、O 三种标记将不同的语块赋予不同的标签,例如:对于一个由角色A拓展得到的语块组,将它所包含的第一个语块赋予标签B-A,将它所包含的其它语块赋予标签I-A,不属于任何论元的语块赋予标签O。 + +我们继续以上面的这句话为例,图2展示了BIO表示方法。 + +
+
+图2. BIO标注方法示例 +
+ +从上面的例子可以看到,根据序列标注结果可以直接得到论元的语义角色标注结果,是一个相对简单的过程。这种简单性体现在:(1)依赖浅层句法分析,降低了句法分析的要求和难度;(2)没有了候选论元剪除这一步骤;(3)论元的识别和论元标注是同时实现的。这种一体化处理论元识别和论元标注的方法,简化了流程,降低了错误累积的风险,往往能够取得更好的结果。 + +与基于语块的SRL方法类似,在本教程中我们也将SRL看作一个序列标注问题,不同的是,我们只依赖输入文本序列,不依赖任何额外的语法解析结果或是复杂的人造特征,利用深度神经网络构建一个端到端学习的SRL系统。我们以[CoNLL-2004 and CoNLL-2005 Shared Tasks](http://www.cs.upc.edu/~srlconll/)任务中SRL任务的公开数据集为例,实践下面的任务:给定一句话和这句话里的一个谓词,通过序列标注的方式,从句子中找到谓词对应的论元,同时标注它们的语义角色。 + +## 模型概览 + +循环神经网络(Recurrent Neural Network)是一种对序列建模的重要模型,在自然语言处理任务中有着广泛的应用。不同于前馈神经网络(Feed-forward Neural Network),RNN能够处理输入之间前后关联的问题。LSTM是RNN的一种重要变种,常用来学习长序列中蕴含的长程依赖关系,我们在[情感分析](https://github.com/PaddlePaddle/book/tree/develop/05.understand_sentiment)一篇中已经介绍过,这一篇中我们依然利用LSTM来解决SRL问题。 + +### 栈式循环神经网络(Stacked Recurrent Neural Network) + +深层网络有助于形成层次化特征,网络上层在下层已经学习到的初级特征基础上,形成更复杂的高级特征。尽管LSTM沿时间轴展开后等价于一个非常“深”的前馈网络,但由于LSTM各个时间步参数共享,$t-1$时刻状态到$t$时刻的映射,始终只经过了一次非线性映射,也就是说单层LSTM对状态转移的建模是 “浅” 的。堆叠多个LSTM单元,令前一个LSTM$t$时刻的输出,成为下一个LSTM单元$t$时刻的输入,帮助我们构建起一个深层网络,我们把它称为第一个版本的栈式循环神经网络。深层网络提高了模型拟合复杂模式的能力,能够更好地建模跨不同时间步的模式\[[2](#参考文献)\]。 + +然而,训练一个深层LSTM网络并非易事。纵向堆叠多个LSTM单元可能遇到梯度在纵向深度上传播受阻的问题。通常,堆叠4层LSTM单元可以正常训练,当层数达到4~8层时,会出现性能衰减,这时必须考虑一些新的结构以保证梯度纵向顺畅传播,这是训练深层LSTM网络必须解决的问题。我们可以借鉴LSTM解决 “梯度消失/梯度爆炸” 问题的智慧之一:在记忆单元(Memory Cell)这条信息传播的路线上没有非线性映射,当梯度反向传播时既不会衰减、也不会爆炸。因此,深层LSTM模型也可以在纵向上添加一条保证梯度顺畅传播的路径。 + +一个LSTM单元完成的运算可以被分为三部分:(1)输入到隐层的映射(input-to-hidden) :每个时间步输入信息$x$会首先经过一个矩阵映射,再作为遗忘门,输入门,记忆单元,输出门的输入,注意,这一次映射没有引入非线性激活;(2)隐层到隐层的映射(hidden-to-hidden):这一步是LSTM计算的主体,包括遗忘门,输入门,记忆单元更新,输出门的计算;(3)隐层到输出的映射(hidden-to-output):通常是简单的对隐层向量进行激活。我们在第一个版本的栈式网络的基础上,加入一条新的路径:除上一层LSTM输出之外,将前层LSTM的输入到隐层的映射作为一个新的输入,同时加入一个线性映射去学习一个新的变换。 + +图3是最终得到的栈式循环神经网络结构示意图。 + +

+
+图3. 基于LSTM的栈式循环神经网络结构示意图 +

+ +### 双向循环神经网络(Bidirectional Recurrent Neural Network) + +在LSTM中,$t$时刻的隐藏层向量编码了到$t$时刻为止所有输入的信息,但$t$时刻的LSTM可以看到历史,却无法看到未来。在绝大多数自然语言处理任务中,我们几乎总是能拿到整个句子。这种情况下,如果能够像获取历史信息一样,得到未来的信息,对序列学习任务会有很大的帮助。 + +为了克服这一缺陷,我们可以设计一种双向循环网络单元,它的思想简单且直接:对上一节的栈式循环神经网络进行一个小小的修改,堆叠多个LSTM单元,让每一层LSTM单元分别以:正向、反向、正向 …… 的顺序学习上一层的输出序列。于是,从第2层开始,$t$时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。 + +

+
+图4. 基于LSTM的双向循环神经网络结构示意图 +

+ +需要说明的是,这种双向RNN结构和Bengio等人在机器翻译任务中使用的双向RNN结构\[[3](#参考文献), [4](#参考文献)\] 并不相同,我们会在后续[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)任务中,介绍另一种双向循环神经网络。 + +### 条件随机场 (Conditional Random Field) + +使用神经网络模型解决问题的思路通常是:前层网络学习输入的特征表示,网络的最后一层在特征基础上完成最终的任务。在SRL任务中,深层LSTM网络学习输入的特征表示,条件随机场(Conditional Random Field, CRF)在特征的基础上完成序列标注,处于整个网络的末端。 + +CRF是一种概率化结构模型,可以看作是一个概率无向图模型,结点表示随机变量,边表示随机变量之间的概率依赖关系。简单来讲,CRF学习条件概率$P(Y|X)$,其中 $X = (x_1, x_2, ... , x_n)$ 是输入序列,$Y = (y_1, y_2, ... , y_n)$ 是标记序列;解码过程是给定 $X$序列求解令$P(Y|X)$最大的$Y$序列,即$Y^* = \mbox{arg max}_{Y} P(Y | X)$。 + +序列标注任务只需要考虑输入和输出都是一个线性序列,并且由于我们只是将输入序列作为条件,不做任何条件独立假设,因此输入序列的元素之间并不存在图结构。综上,在序列标注任务中使用的是如图5所示的定义在链式图上的CRF,称之为线性链条件随机场(Linear Chain Conditional Random Field)。 + +

+
+图5. 序列标注任务中使用的线性链条件随机场 +

+ +根据线性链条件随机场上的因子分解定理\[[5](#参考文献)\],在给定观测序列$X$时,一个特定标记序列$Y$的概率可以定义为: + +$$p(Y | X) = \frac{1}{Z(X)} \text{exp}\left(\sum_{i=1}^{n}\left(\sum_{j}\lambda_{j}t_{j} (y_{i - 1}, y_{i}, X, i) + \sum_{k} \mu_k s_k (y_i, X, i)\right)\right)$$ + +其中$Z(X)$是归一化因子,$t_j$ 是定义在边上的特征函数,依赖于当前和前一个位置,称为转移特征,表示对于输入序列$X$及其标注序列在 $i$及$i - 1$位置上标记的转移概率。$s_k$是定义在结点上的特征函数,称为状态特征,依赖于当前位置,表示对于观察序列$X$及其$i$位置的标记概率。$\lambda_j$ 和 $\mu_k$ 分别是转移特征函数和状态特征函数对应的权值。实际上,$t$和$s$可以用相同的数学形式表示,再对转移特征和状态特征在各个位置$i$求和有:$f_{k}(Y, X) = \sum_{i=1}^{n}f_k({y_{i - 1}, y_i, X, i})$,把$f$统称为特征函数,于是$P(Y|X)$可表示为: + +$$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$ + +$\omega$是特征函数对应的权值,是CRF模型要学习的参数。训练时,对于给定的输入序列和对应的标记序列集合$D = \left[(X_1, Y_1), (X_2 , Y_2) , ... , (X_N, Y_N)\right]$ ,通过正则化的极大似然估计,求解如下优化目标: + +$$\DeclareMathOperator*{\argmax}{arg\,max} L(W, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$ + +这个优化目标可以通过反向传播算法和整个神经网络一起求解。解码时,对于给定的输入序列$X$,通过解码算法(通常有:维特比算法、Beam Search)求出令条件概率$\bar{P}(Y|X)$最大的输出序列 $\bar{Y}$。 + +### 深度双向LSTM(DB-LSTM)SRL模型 + +在SRL任务中,输入是 “谓词” 和 “一句话”,目标是从这句话中找到谓词的论元,并标注论元的语义角色。如果一个句子含有$n$个谓词,这个句子会被处理$n$次。一个最为直接的模型是下面这样: + +1. 构造输入; + - 输入1是谓词,输入2是句子 + - 将输入1扩展成和输入2一样长的序列,用one-hot方式表示; +2. one-hot方式的谓词序列和句子序列通过词表,转换为实向量表示的词向量序列; +3. 将步骤2中的2个词向量序列作为双向LSTM的输入,学习输入序列的特征表示; +4. CRF以步骤3中模型学习到的特征为输入,以标记序列为监督信号,实现序列标注; + +大家可以尝试上面这种方法。这里,我们提出一些改进,引入两个简单但对提高系统性能非常有效的特征: + +- 谓词上下文:上面的方法中,只用到了谓词的词向量表达谓词相关的所有信息,这种方法始终是非常弱的,特别是如果谓词在句子中出现多次,有可能引起一定的歧义。从经验出发,谓词前后若干个词的一个小片段,能够提供更丰富的信息,帮助消解歧义。于是,我们把这样的经验也添加到模型中,为每个谓词同时抽取一个“谓词上下文” 片段,也就是从这个谓词前后各取$n$个词构成的一个窗口片段; +- 谓词上下文区域标记:为句子中的每一个词引入一个0-1二值变量,表示它们是否在“谓词上下文”片段中; + +修改后的模型如下(图6是一个深度为4的模型结构示意图): + +1. 构造输入 + - 输入1是句子序列,输入2是谓词序列,输入3是谓词上下文,从句子中抽取这个谓词前后各$n$个词,构成谓词上下文,用one-hot方式表示,输入4是谓词上下文区域标记,标记了句子中每一个词是否在谓词上下文中; + - 将输入2~3均扩展为和输入1一样长的序列; +2. 输入1~4均通过词表取词向量转换为实向量表示的词向量序列;其中输入1、3共享同一个词表,输入2和4各自独有词表; +3. 第2步的4个词向量序列作为双向LSTM模型的输入;LSTM模型学习输入序列的特征表示,得到新的特征表示序列; +4. 
CRF以第3步中LSTM学习到的特征为输入,以标记序列为监督信号,完成序列标注; + +
+
+图6. SRL任务上的深层双向LSTM模型 +
+ + +## 数据介绍 + +在此教程中,我们选用[CoNLL 2005](http://www.cs.upc.edu/~srlconll/)SRL任务开放出的数据集作为示例。需要特别说明的是,CoNLL 2005 SRL任务的训练集和开发集在比赛之后并非免费进行公开,目前,能够获取到的只有测试集,包括Wall Street Journal的23节和Brown语料集中的3节。在本教程中,我们以测试集中的WSJ数据为训练集来讲解模型。但是,由于测试集中样本的数量远远不够,如果希望训练一个可用的神经网络SRL系统,请考虑付费获取全量数据。 + +原始数据中同时包括了词性标注、命名实体识别、语法解析树等多种信息。本教程中,我们使用test.wsj文件夹中的数据进行训练和测试,并只会用到words文件夹(文本序列)和props文件夹(标注结果)下的数据。本教程使用的数据目录如下: + +```text +conll05st-release/ +└── test.wsj + ├── props # 标注结果 + └── words # 输入文本序列 +``` + +标注信息源自Penn TreeBank\[[7](#参考文献)\]和PropBank\[[8](#参考文献)\]的标注结果。PropBank标注结果的标签和我们在文章一开始示例中使用的标注结果标签不同,但原理是相同的,关于标注结果标签含义的说明,请参考论文\[[9](#参考文献)\]。 + +原始数据需要进行数据预处理才能被PaddlePaddle处理,预处理包括下面几个步骤: + +1. 将文本序列和标记序列合并到一条记录中; +2. 一个句子如果含有$n$个谓词,这个句子会被处理$n$次,变成$n$条独立的训练样本,每个样本一个不同的谓词; +3. 抽取谓词上下文和构造谓词上下文区域标记; +4. 构造以BIO法表示的标记; +5. 依据词典获取词对应的整数索引。 + +预处理完成之后一条训练样本包含9个特征,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下文区域标记、标注序列。下表是一条训练样本的示例。 + +| 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 | +|---|---|---|---|---| +| A | set | n't been set . × | 0 | B-A1 | +| record | set | n't been set . × | 0 | I-A1 | +| date | set | n't been set . × | 0 | I-A1 | +| has | set | n't been set . × | 0 | O | +| n't | set | n't been set . × | 1 | B-AM-NEG | +| been | set | n't been set . × | 1 | O | +| set | set | n't been set . × | 1 | B-V | +| . | set | n't been set . 
× | 1 | O | + + +除数据之外,我们同时提供了以下资源: + +| 文件名称 | 说明 | +|---|---| +| word_dict | 输入句子的词典,共计44068个词 | +| label_dict | 标记的词典,共计106个标记 | +| predicate_dict | 谓词的词典,共计3162个词 | +| emb | 一个训练好的词表,32维 | + +我们在英文维基百科上训练语言模型得到了一份词向量用来初始化SRL模型。在SRL模型训练过程中,词向量不再被更新。关于语言模型和词向量可以参考[词向量](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md) 这篇教程。我们训练语言模型的语料共有995,000,000个token,词典大小控制为4900,000词。CoNLL 2005训练语料中有5%的词不在这4900,000个词中,我们将它们全部看作未登录词,用``表示。 + +获取词典,打印词典大小: + +```python +from __future__ import print_function + +import math, os +import numpy as np +import paddle +import paddle.v2.dataset.conll05 as conll05 +import paddle.fluid as fluid +import time + +with_gpu = os.getenv('WITH_GPU', '0') != '0' + +word_dict, verb_dict, label_dict = conll05.get_dict() +word_dict_len = len(word_dict) +label_dict_len = len(label_dict) +pred_dict_len = len(verb_dict) + +print('word_dict_len: ', word_dict_len) +print('label_dict_len: ', label_dict_len) +print('pred_dict_len: ', pred_dict_len) +``` + +## 模型配置说明 + +- 定义输入数据维度及模型超参数。 + +```python +mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 +word_dim = 32 # 词向量维度 +mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 +hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 +depth = 8 # 栈式LSTM的深度 +mix_hidden_lr = 1e-3 + +IS_SPARSE = True +PASS_NUM = 10 +BATCH_SIZE = 10 + +embedding_name = 'emb' +``` + +这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维,关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。 + +- 如上文提到,我们用基于英文维基百科训练好的词向量来初始化序列输入、谓词上下文总共6个特征的embedding层参数,在训练中不更新。 + +```python +# 这里加载PaddlePaddle上版保存的二进制模型 +def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header. 
+ return np.fromfile(f, dtype=np.float32).reshape(h, w) +``` + +- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。 + +```python +def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, + **ignored): + # 8 features + predicate_embedding = fluid.layers.embedding( + input=predicate, + size=[pred_dict_len, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='vemb') + + mark_embedding = fluid.layers.embedding( + input=mark, + size=[mark_dict_len, mark_dim], + dtype='float32', + is_sparse=IS_SPARSE) + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + # Since word vector lookup table is pre-trained, we won't update it this time. + # trainable being False prevents updating the lookup table during training. + emb_layers = [ + fluid.layers.embedding( + size=[word_dict_len, word_dim], + input=x, + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + # 8 LSTM units are trained through alternating left-to-right / right-to-left order + # denoted by the variable `reverse`. + hidden_0_layers = [ + fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') + for emb in emb_layers + ] + + hidden_0 = fluid.layers.sums(input=hidden_0_layers) + + lstm_0 = fluid.layers.dynamic_lstm( + input=hidden_0, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid') + + # stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + # In PaddlePaddle, state features and transition features of a CRF are implemented + # by a fully connected layer and a CRF layer seperately. 
The fully connected layer + # with linear activation learns the state features, here we use fluid.layers.sums + # (fluid.layers.fc can be uesed as well), and the CRF layer in PaddlePaddle: + # fluid.layers.linear_chain_crf only + # learns the transition features, which is a cost layer and is the last layer of the network. + # fluid.layers.linear_chain_crf outputs the log probability of true tag sequence + # as the cost by given the input sequence and it requires the true tag sequence + # as target in the learning process. + + for i in range(1, depth): + mix_hidden = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') + ]) + + lstm = fluid.layers.dynamic_lstm( + input=mix_hidden, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid', + is_reverse=((i % 2) == 1)) + + input_tmp = [mix_hidden, lstm] + + # 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射, + # 经过一个全连接层映射到标记字典的维度,来学习 CRF 的状态特征 + feature_out = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') + ]) + + return feature_out +``` + +## 训练模型 + +- 我们根据网络拓扑结构和模型参数来构造出trainer用来训练,在构造时还需指定优化方法,这里使用最基本的SGD方法(momentum设置为0),同时设定了学习率、正则等。 + +- 数据介绍部分提到CoNLL 2005训练集付费,这里我们使用测试集训练供大家学习。conll05.test()每次产生一条样本,包含9个特征,shuffle和组完batch后作为训练的输入。 + +- 通过feeding来指定每一个数据和data_layer的对应关系。 例如 下面feeding表示: conll05.test()产生数据的第0列对应word_data层的特征。 + +- 可以使用event_handler回调函数来观察训练过程,或进行测试等。这里我们打印了训练过程的cost,该回调函数是trainer.train函数里设定。 + +- 通过trainer.train函数训练 + +```python +def train(use_cuda, save_dirname=None, is_local=True): + # define network topology + + # 句子序列 + word = fluid.layers.data( + name='word_data', shape=[1], dtype='int64', lod_level=1) + + # 谓词 + predicate = fluid.layers.data( + name='verb_data', shape=[1], dtype='int64', lod_level=1) + + # 谓词上下文5个特征 + ctx_n2 = 
fluid.layers.data( + name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) + ctx_n1 = fluid.layers.data( + name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) + ctx_0 = fluid.layers.data( + name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) + ctx_p1 = fluid.layers.data( + name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) + ctx_p2 = fluid.layers.data( + name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) + + # 谓词上下区域标志 + mark = fluid.layers.data( + name='mark_data', shape=[1], dtype='int64', lod_level=1) + + # define network topology + feature_out = db_lstm(**locals()) + + # 标注序列 + target = fluid.layers.data( + name='target', shape=[1], dtype='int64', lod_level=1) + + # 学习 CRF 的转移特征 + crf_cost = fluid.layers.linear_chain_crf( + input=feature_out, + label=target, + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=mix_hidden_lr)) + + avg_cost = fluid.layers.mean(crf_cost) + + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=0.01, + decay_steps=100000, + decay_rate=0.5, + staircase=True)) + + sgd_optimizer.minimize(avg_cost) + + # The CRF decoding layer is used for evaluation and inference. + # It shares weights with CRF layer. The sharing of parameters among multiple layers + # is specified by using the same parameter name in these layers. If true tag sequence + # is provided in training process, `fluid.layers.crf_decoding` calculates labelling error + # for each input token and sums the error over the entire sequence. + # Otherwise, `fluid.layers.crf_decoding` generates the labelling tags. 
+ crf_decode = fluid.layers.crf_decoding( + input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), + batch_size=BATCH_SIZE) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + + feeder = fluid.DataFeeder( + feed_list=[ + word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target + ], + place=place) + exe = fluid.Executor(place) + + def train_loop(main_program): + exe.run(fluid.default_startup_program()) + embedding_param = fluid.global_scope().find_var( + embedding_name).get_tensor() + embedding_param.set( + load_parameter(conll05.get_embedding(), word_dict_len, word_dim), + place) + + start_time = time.time() + batch_id = 0 + for pass_id in xrange(PASS_NUM): + for data in train_data(): + cost = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[avg_cost]) + cost = cost[0] + + if batch_id % 10 == 0: + print("avg_cost: " + str(cost)) + if batch_id != 0: + print("second per batch: " + str((time.time( + ) - start_time) / batch_id)) + # Set the threshold low to speed up the CI test + if float(cost) < 60.0: + if save_dirname is not None: + fluid.io.save_inference_model(save_dirname, [ + 'word_data', 'verb_data', 'ctx_n2_data', + 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', + 'ctx_p2_data', 'mark_data' + ], [feature_out], exe) + return + + batch_id = batch_id + 1 + + train_loop(fluid.default_main_program()) +``` + + +## 应用模型 + +训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。以下我们给出一个使用训练后的模型进行预测的示例。 + +```python +def infer(use_cuda, save_dirname=None): + if save_dirname is None: + return + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be fed + # data using 
feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + # Setup inputs by creating LoDTensors to represent sequences of words. + # Here each word is the basic element of these LoDTensors and the shape of + # each word (base_shape) should be [1] since it is simply an index to + # look up for the corresponding word vector. + # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], + # which has only one lod level. Then the created LoDTensors will have only + # one higher level structure (sequence of words, or sentence) than the basic + # element (word). Hence the LoDTensor will hold data for three sentences of + # length 3, 4 and 2, respectively. + # Note that lod info should be a list of lists. + lod = [[3, 4, 2]] + base_shape = [1] + # The range of random integers is [low, high] + word = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + pred = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=pred_dict_len - 1) + ctx_n2 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + ctx_n1 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + ctx_0 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + ctx_p1 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + ctx_p2 = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=word_dict_len - 1) + mark = fluid.create_random_int_lodtensor( + lod, base_shape, place, low=0, high=mark_dict_len - 1) + + # Construct feed as a dictionary of {feed_target_name: feed_target_data} + # and results will contain a list of data corresponding to fetch_targets. 
+ assert feed_target_names[0] == 'word_data' + assert feed_target_names[1] == 'verb_data' + assert feed_target_names[2] == 'ctx_n2_data' + assert feed_target_names[3] == 'ctx_n1_data' + assert feed_target_names[4] == 'ctx_0_data' + assert feed_target_names[5] == 'ctx_p1_data' + assert feed_target_names[6] == 'ctx_p2_data' + assert feed_target_names[7] == 'mark_data' + + results = exe.run(inference_program, + feed={ + feed_target_names[0]: word, + feed_target_names[1]: pred, + feed_target_names[2]: ctx_n2, + feed_target_names[3]: ctx_n1, + feed_target_names[4]: ctx_0, + feed_target_names[5]: ctx_p1, + feed_target_names[6]: ctx_p2, + feed_target_names[7]: mark + }, + fetch_list=fetch_targets, + return_numpy=False) + print(results[0].lod()) + np_data = np.array(results[0]) + print("Inference Shape: ", np_data.shape) +``` + +整个程序的入口如下: + +```python +def main(use_cuda, is_local=True): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + # Directory for saving the trained model + save_dirname = "label_semantic_roles.inference.model" + + train(use_cuda, save_dirname, is_local) + infer(use_cuda, save_dirname) + + +main(use_cuda=False) +``` + +## 总结 + +语义角色标注是许多自然语言理解任务的重要中间步骤。这篇教程中我们以语义角色标注任务为例,介绍如何利用PaddlePaddle进行序列标注任务。教程中所介绍的模型来自我们发表的论文\[[10](#参考文献)\]。由于 CoNLL 2005 SRL任务的训练数据目前并非完全开放,教程中只使用测试数据作为示例。在这个过程中,我们希望减少对其它自然语言处理工具的依赖,利用神经网络数据驱动、端到端学习的能力,得到一个和传统方法可比、甚至更好的模型。在论文中我们证实了这种可能性。关于模型更多的信息和讨论可以在论文中找到。 + +## 参考文献 +1. Sun W, Sui Z, Wang M, et al. [Chinese semantic role labeling with shallow parsing](http://www.aclweb.org/anthology/D09-1#page=1513)[C]//Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 3-Volume 3. Association for Computational Linguistics, 2009: 1475-1483. +2. Pascanu R, Gulcehre C, Cho K, et al. [How to construct deep recurrent neural networks](https://arxiv.org/abs/1312.6026)[J]. arXiv preprint arXiv:1312.6026, 2013. +3. Cho K, Van Merriënboer B, Gulcehre C, et al. 
[Learning phrase representations using RNN encoder-decoder for statistical machine translation](https://arxiv.org/abs/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014. +4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[J]. arXiv preprint arXiv:1409.0473, 2014. +5. Lafferty J, McCallum A, Pereira F. [Conditional random fields: Probabilistic models for segmenting and labeling sequence data](http://www.jmlr.org/papers/volume15/doppa14a/source/biblio.bib.old)[C]//Proceedings of the eighteenth international conference on machine learning, ICML. 2001, 1: 282-289. +6. 李航. 统计学习方法[J]. 清华大学出版社, 北京, 2012. +7. Marcus M P, Marcinkiewicz M A, Santorini B. [Building a large annotated corpus of English: The Penn Treebank](http://repository.upenn.edu/cgi/viewcontent.cgi?article=1246&context=cis_reports)[J]. Computational linguistics, 1993, 19(2): 313-330. +8. Palmer M, Gildea D, Kingsbury P. [The proposition bank: An annotated corpus of semantic roles](http://www.mitpressjournals.org/doi/pdfplus/10.1162/0891201053630264)[J]. Computational linguistics, 2005, 31(1): 71-106. +9. Carreras X, Màrquez L. [Introduction to the CoNLL-2005 shared task: Semantic role labeling](http://www.cs.upc.edu/~srlconll/st05/papers/intro.pdf)[C]//Proceedings of the Ninth Conference on Computational Natural Language Learning. Association for Computational Linguistics, 2005: 152-164. +10. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png deleted file mode 100644 index e63f5ebd6d00f2e4ecf97b9ab2027e74683013f2..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png deleted file mode 100755 index f0a195c24d9ee493f96bb93c28a99e70566be7a4..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png deleted file mode 100755 index e5f7151c9fcc50a7cf7af485cbbc7e4fccab0c20..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png deleted file mode 100755 index 93b44dd4874402ef29ad7bd7d94147609b92e309..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png 
b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png deleted file mode 100644 index 592f7ee23bdc88a9a35059612e5ab880bbc9d34b..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png deleted file mode 100755 index c3646312e48db977402fb353dc0c9b4d02269bf4..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png deleted file mode 100755 index 9265b671735940ed6549e2980064d2ce08baae64..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png deleted file mode 100755 index 23f4f45b603e3d60702af2b2464d10fc8deed061..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png deleted file mode 100644 index 0778fda74b2ad22ce4b631791a7b028cdef780a5..0000000000000000000000000000000000000000 Binary files 
a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png deleted file mode 100644 index 3d2914c726b5f4c46e66dfa85d4e88649fede6b3..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png deleted file mode 100755 index 0b944ef91e8b5ba4b14d2a35bd8879f261cf8f61..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/index.md b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md similarity index 69% rename from doc/fluid/new_docs/beginners_guide/basics/machine_translation/index.md rename to doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md index fc161aaae9c37b0e1a596204e7138025a98adb1d..fa2b930be0d26d816566599cece8afbedc1157e0 100644 --- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/index.md +++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md @@ -1,448 +1,470 @@ -# 机器翻译 - -本教程源代码目录在[book/machine_translation](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。 - -## 背景介绍 - -机器翻译(machine translation, MT)是用计算机来实现不同语言之间翻译的技术。被翻译的语言通常称为源语言(source language),翻译成的结果语言称为目标语言(target language)。机器翻译即实现从源语言到目标语言转换的过程,是自然语言处理的重要研究领域之一。 - 
-早期机器翻译系统多为基于规则的翻译系统,需要由语言学家编写两种语言之间的转换规则,再将这些规则录入计算机。该方法对语言学家的要求非常高,而且我们几乎无法总结一门语言会用到的所有规则,更何况两种甚至更多的语言。因此,传统机器翻译方法面临的主要挑战是无法得到一个完备的规则集合\[[1](#参考文献)\]。 - -为解决以上问题,统计机器翻译(Statistical Machine Translation, SMT)技术应运而生。在统计机器翻译技术中,转化规则是由机器自动从大规模的语料中学习得到的,而非我们人主动提供规则。因此,它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题,但仍然存在许多挑战:1)人为设计许多特征(feature),但永远无法覆盖所有的语言现象;2)难以利用全局的特征;3)依赖于许多预处理环节,如词语对齐、分词或符号化(tokenization)、规则抽取、句法分析等,而每个环节的错误会逐步累积,对翻译的影响也越来越大。 - -近年来,深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类:1)仍以统计机器翻译系统为框架,只是利用神经网络来改进其中的关键模块,如语言模型、调序模型等(见图1的左半部分);2)不再以统计机器翻译系统为框架,而是直接用神经网络将源语言映射到目标语言,即端到端的神经网络机器翻译(End-to-End Neural Machine Translation, End-to-End NMT)(见图1的右半部分),简称为NMT模型。 -![nmt](./image/nmt.png) -

-图1. 基于神经网络的机器翻译系统 -

- -本教程主要介绍NMT模型,以及如何用PaddlePaddle来训练一个NMT模型。 - -## 效果展示 - -以中英翻译(中文翻译到英文)的模型为例,当模型训练完毕时,如果输入如下已分词的中文句子: -```text -这些 是 希望 的 曙光 和 解脱 的 迹象 . -``` -如果设定显示翻译结果的条数(即[柱搜索算法](#柱搜索算法)的宽度)为3,生成的英语句子如下: -```text -0 -5.36816 These are signs of hope and relief . -1 -6.23177 These are the light of hope and relief . -2 -7.7914 These are the light of hope and the relief of hope . -``` -- 左起第一列是生成句子的序号;左起第二列是该条句子的得分(从大到小),分值越高越好;左起第三列是生成的英语句子。 -- 另外有两个特殊标志:``表示句子的结尾,``表示未登录词(unknown word),即未在训练字典中出现的词。 - -## 模型概览 - -本节依次介绍双向循环神经网络(Bi-directional Recurrent Neural Network),NMT模型中典型的编码器-解码器(Encoder-Decoder)框架以及柱搜索(beam search)算法。 - -### 双向循环神经网络 - -我们已经在[语义角色标注](https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/README.cn.md)一章中介绍了一种双向循环神经网络,这里介绍Bengio团队在论文\[[2](#参考文献),[4](#参考文献)\]中提出的另一种结构。该结构的目的是输入一个序列,得到其在每个时刻的特征表示,即输出的每个时刻都用定长向量表示到该时刻的上下文语义信息。 - -具体来说,该双向循环神经网络分别在时间维以顺序和逆序——即前向(forward)和后向(backward)——依次处理输入序列,并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点,都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN,其中有六个权重矩阵:输入到前向隐层和后向隐层的权重矩阵(`$W_1, W_3$`),隐层到隐层自己的权重矩阵(`$W_2,W_5$`),前向隐层和后向隐层到输出层的权重矩阵(`$W_4, W_6$`)。注意,该网络的前向隐层和后向隐层之间没有连接。 - -![bi_rnn](./image/bi_rnn.png) -

-图3. 按时间步展开的双向循环神经网络 -

- -### 编码器-解码器框架 - -编码器-解码器(Encoder-Decoder)\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量,解码阶段通过最大化预测序列概率,从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。 -![encoder_decoder](./image/encoder_decoder.png) -

-图4. 编码器-解码器框架 -

- -#### 编码器 - -编码阶段分为三步: - -1. one-hot vector表示:将源语言句子`$x=\left \{ x_1,x_2,...,x_T \right \}$`的每个词`$x_i$`表示成一个列向量`$w_i\epsilon \left \{ 0,1 \right \}^{\left | V \right |},i=1,2,...,T$`。这个向量`$w_i$`的维度与词汇表大小`$\left | V \right |$` 相同,并且只有一个维度上有值1(该位置对应该词在词汇表中的位置),其余全是0。 - -2. 映射到低维语义空间的词向量:one-hot vector表示存在两个问题,1)生成的向量维度往往很大,容易造成维数灾难;2)难以刻画词与词之间的关系(如语义相似性,也就是无法很好地表达语义)。因此,需再one-hot vector映射到低维的语义空间,由一个固定维度的稠密向量(称为词向量)表示。记映射矩阵为`$C\epsilon R^{K\times \left | V \right |}$`,用`$s_i=Cw_i$`表示第`$i$`个词的词向量,`$K$`为向量维度。 - -3. 用RNN编码源语言词序列:这一过程的计算公式为`$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$`,其中`$h_0$`是一个全零的向量,`$\varnothing _\theta$`是一个非线性激活函数,最后得到的`$\mathbf{h}=\left \{ h_1,..., h_T \right \}$`就是RNN依次读入源语言`$T$`个词的状态编码序列。整句话的向量表示可以采用`$\mathbf{h}$`在最后一个时间步`$T$`的状态编码,或使用时间维上的池化(pooling)结果。 - -第3步也可以使用双向循环神经网络实现更复杂的句编码表示,具体可以用双向GRU实现。前向GRU按照词序列`$(x_1,x_2,...,x_T)$`的顺序依次编码源语言端词,并得到一系列隐层状态`$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$`。类似的,后向GRU按照`$(x_T,x_{T-1},...,x_1)$`的顺序依次编码源语言端词,得到`$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$`。最后对于词`$x_i$`,通过拼接两个GRU的结果得到它的隐层状态,即`$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$`。 - -![encoder_attention](./image/encoder_attention.png) -

-图5. 使用双向GRU的编码器 -

- -#### 解码器 - -机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是: - -1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)`$c$`、真实目标语言序列的第`$i$`个词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。计算公式如下: - -$$z_{i+1}=\phi _{\theta '}\left ( c,u_i,z_i \right )$$ - -其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用[注意力机制](#注意力机制)时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$`;`$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记``,表示解码开始;`$z_i$`是`$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。 - -2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下: - -$$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ - -其中`$W_sz_{i+1}+b_z$`是对每个可能的输出单词进行打分,再用softmax归一化就可以得到第`$i+1$`个词的概率`$p_{i+1}$`。 - -3. 根据`$p_{i+1}$`和`$u_{i+1}$`计算代价。 -4. 重复步骤1~3,直到目标语言序列中的所有词处理完毕。 - -机器翻译任务的生成过程,通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异,具体介绍请见[柱搜索算法](#柱搜索算法)。 - -### 柱搜索算法 - -柱搜索([beam search](http://en.wikipedia.org/wiki/Beam_search))是一种启发式图搜索算法,用于在图或树中搜索有限集合中的最优扩展节点,通常用在解空间非常大的系统(如机器翻译、语音识别)中,原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`你好`”,就算目标语言字典中只有3个词(``, ``, `hello`),也可能生成无限句话(`hello`循环出现的次数不定),为了找到其中较好的翻译结果,我们可采用柱搜索算法。 - -柱搜索算法使用广度优先策略建立搜索树,在树的每一层,按照启发代价(heuristic cost)(本教程中,为生成词的log概率之和)对节点进行排序,然后仅留下预先确定的个数(文献中通常称为beam width、beam size、柱宽度等)的节点。只有这些节点会在下一层继续扩展,其他节点就被剪掉了,也就是说保留了质量较高的节点,剪枝了质量较差的节点。因此,搜索所占用的空间和时间大幅减少,但缺点是无法保证一定获得最优解。 - -使用柱搜索算法的解码阶段,目标是最大化生成序列的概率。思路是: - -1. 每一个时刻,根据源语言句子的编码信息`$c$`、生成的第`$i$`个目标语言序列单词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。 -2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。 -3. 根据`$p_{i+1}$`采样出单词`$u_{i+1}$`。 -4. 
重复步骤1~3,直到获得句子结束标记``或超过句子的最大生成长度为止。 - -注意:`$z_{i+1}$`和`$p_{i+1}$`的计算公式同[解码器](#解码器)中的一样。且由于生成时的每一步都是通过贪心法实现的,因此并不能保证得到全局最优解。 - -## 数据介绍 - -本教程使用[WMT-14](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/)数据集中的[bitexts(after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)作为训练集,[dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)作为测试集和生成集。 - -### 数据预处理 - -我们的预处理流程包括两步: -- 将每个源语言到目标语言的平行语料库文件合并为一个文件: -- 合并每个`XXX.src`和`XXX.trg`文件为`XXX`。 -- `XXX`中的第`$i$`行内容为`XXX.src`中的第`$i$`行和`XXX.trg`中的第`$i$`行连接,用'\t'分隔。 -- 创建训练数据的“源字典”和“目标字典”。每个字典都有**DICTSIZE**个单词,包括:语料中词频最高的(DICTSIZE - 3)个单词,和3个特殊符号``(序列的开始)、``(序列的结束)和``(未登录词)。 - -### 示例数据 - -因为完整的数据集数据量较大,为了验证训练流程,PaddlePaddle接口paddle.dataset.wmt14中默认提供了一个经过预处理的[较小规模的数据集](http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz)。 - -该数据集有193319条训练数据,6003条测试数据,词典长度为30000。因为数据规模限制,使用该数据集训练出来的模型效果无法保证。 - -## 模型配置说明 - -下面我们开始根据输入数据的形式配置模型。首先引入所需的库函数以及定义全局变量。 - -```python -import contextlib - -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.fluid.framework as framework -import paddle.fluid.layers as pd -from paddle.fluid.executor import Executor -from functools import partial -import os - -dict_size = 30000 -source_dict_dim = target_dict_dim = dict_size -hidden_dim = 32 -word_dim = 16 -batch_size = 2 -max_length = 8 -topk_size = 50 -beam_size = 2 - -decoder_size = hidden_dim -``` - -然后如下实现编码器框架: - -```python -def encoder(is_sparse): -src_word_id = pd.data( -name="src_word_id", shape=[1], dtype='int64', lod_level=1) -src_embedding = pd.embedding( -input=src_word_id, -size=[dict_size, word_dim], -dtype='float32', -is_sparse=is_sparse, -param_attr=fluid.ParamAttr(name='vemb')) - -fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') -lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4) -encoder_out = pd.sequence_last_step(input=lstm_hidden0) -return encoder_out -``` - -再实现训练模式下的解码器: - 
-```python -def train_decoder(context, is_sparse): -trg_language_word = pd.data( -name="target_language_word", shape=[1], dtype='int64', lod_level=1) -trg_embedding = pd.embedding( -input=trg_language_word, -size=[dict_size, word_dim], -dtype='float32', -is_sparse=is_sparse, -param_attr=fluid.ParamAttr(name='vemb')) - -rnn = pd.DynamicRNN() -with rnn.block(): -current_word = rnn.step_input(trg_embedding) -pre_state = rnn.memory(init=context) -current_state = pd.fc(input=[current_word, pre_state], -size=decoder_size, -act='tanh') - -current_score = pd.fc(input=current_state, -size=target_dict_dim, -act='softmax') -rnn.update_memory(pre_state, current_state) -rnn.output(current_score) - -return rnn() -``` - -实现推测模式下的解码器: - -```python -def decode(context, is_sparse): -init_state = context -array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) -counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) - -# fill the first element with init_state -state_array = pd.create_array('float32') -pd.array_write(init_state, array=state_array, i=counter) - -# ids, scores as memory -ids_array = pd.create_array('int64') -scores_array = pd.create_array('float32') - -init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) -init_scores = pd.data( -name="init_scores", shape=[1], dtype="float32", lod_level=2) - -pd.array_write(init_ids, array=ids_array, i=counter) -pd.array_write(init_scores, array=scores_array, i=counter) - -cond = pd.less_than(x=counter, y=array_len) - -while_op = pd.While(cond=cond) -with while_op.block(): -pre_ids = pd.array_read(array=ids_array, i=counter) -pre_state = pd.array_read(array=state_array, i=counter) -pre_score = pd.array_read(array=scores_array, i=counter) - -# expand the lod of pre_state to be the same with pre_score -pre_state_expanded = pd.sequence_expand(pre_state, pre_score) - -pre_ids_emb = pd.embedding( -input=pre_ids, -size=[dict_size, word_dim], -dtype='float32', -is_sparse=is_sparse) - -# use rnn unit 
to update rnn -current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb], -size=decoder_size, -act='tanh') -current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score) -# use score to do beam search -current_score = pd.fc(input=current_state_with_lod, -size=target_dict_dim, -act='softmax') -topk_scores, topk_indices = pd.topk(current_score, k=topk_size) -selected_ids, selected_scores = pd.beam_search( -pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) - -pd.increment(x=counter, value=1, in_place=True) - -# update the memories -pd.array_write(current_state, array=state_array, i=counter) -pd.array_write(selected_ids, array=ids_array, i=counter) -pd.array_write(selected_scores, array=scores_array, i=counter) - -pd.less_than(x=counter, y=array_len, cond=cond) - -translation_ids, translation_scores = pd.beam_search_decode( -ids=ids_array, scores=scores_array) - -return translation_ids, translation_scores -``` - -进而,我们定义一个`train_program`来使用`inference_program`计算出的结果,在标记数据的帮助下来计算误差。我们还定义了一个`optimizer_func`来定义优化器。 - -```python -def train_program(is_sparse): -context = encoder(is_sparse) -rnn_out = train_decoder(context, is_sparse) -label = pd.data( -name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) -cost = pd.cross_entropy(input=rnn_out, label=label) -avg_cost = pd.mean(cost) -return avg_cost - - -def optimizer_func(): -return fluid.optimizer.Adagrad( -learning_rate=1e-4, -regularization=fluid.regularizer.L2DecayRegularizer( -regularization_coeff=0.1)) -``` - -## 训练模型 - -### 定义训练环境 -定义您的训练环境,可以指定训练是发生在CPU还是GPU上。 - -```python -use_cuda = False -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() -``` - -### 定义数据提供器 -下一步是为训练和测试定义数据提供器。提供器读入一个大小为 `BATCH_SIZE`的数据。`paddle.dataset.wmt.train` 每次会在乱序化后提供一个大小为`BATCH_SIZE`的数据,乱序化的大小为缓存大小`buf_size`。 - -```python -train_reader = paddle.batch( -paddle.reader.shuffle( -paddle.dataset.wmt14.train(dict_size), buf_size=1000), -batch_size=batch_size) -``` - -### 构造训练器(trainer) 
-训练器需要一个训练程序和一个训练优化函数。 - -```python -is_sparse = False -trainer = fluid.Trainer( -train_func=partial(train_program, is_sparse), -place=place, -optimizer_func=optimizer_func) -``` - -### 提供数据 - -`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如,`wmt14.train`产生的第一列的数据对应的是`src_word_id`这个特征。 - -```python -feed_order = [ -'src_word_id', 'target_language_word', 'target_language_next_word' -] -``` - -### 事件处理器 -回调函数`event_handler`在一个之前定义好的事件发生后会被调用。例如,我们可以在每步训练结束后查看误差。 - -```python -def event_handler(event): -if isinstance(event, fluid.EndStepEvent): -if event.step % 10 == 0: -print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step)) - -if event.step == 20: -trainer.stop() -``` - -### 开始训练 -最后,我们传入训练循环数(`num_epoch`)和一些别的参数,调用 `trainer.train` 来开始训练。 - -```python -EPOCH_NUM = 1 - -trainer.train( -reader=train_reader, -num_epochs=EPOCH_NUM, -event_handler=event_handler, -feed_order=feed_order) -``` - -## 应用模型 - -### 定义解码部分 - -使用上面定义的 `encoder` 和 `decoder` 函数来推测翻译后的对应id和分数. - -```python -context = encoder(is_sparse) -translation_ids, translation_scores = decode(context, is_sparse) -``` - -### 定义数据 - -我们先初始化id和分数来生成tensors来作为输入数据。在这个预测例子中,我们用`wmt14.test`数据中的第一个记录来做推测,最后我们用"源字典"和"目标字典"来列印对应的句子结果。 - -```python -init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64') -init_scores_data = np.array( -[1. 
for _ in range(batch_size)], dtype='float32') -init_ids_data = init_ids_data.reshape((batch_size, 1)) -init_scores_data = init_scores_data.reshape((batch_size, 1)) -init_lod = [1] * batch_size -init_lod = [init_lod, init_lod] - -init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) -init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) - -test_data = paddle.batch( -paddle.reader.shuffle( -paddle.dataset.wmt14.test(dict_size), buf_size=1000), -batch_size=batch_size) - -feed_order = ['src_word_id'] -feed_list = [ -framework.default_main_program().global_block().var(var_name) -for var_name in feed_order -] -feeder = fluid.DataFeeder(feed_list, place) - -src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) -``` - -### 测试 -现在我们可以进行预测了。我们要在`feed_order`提供对应参数,放在`executor`上运行以取得id和分数结果 - -```python -exe = Executor(place) -exe.run(framework.default_startup_program()) - -for data in test_data(): -feed_data = map(lambda x: [x[0]], data) -feed_dict = feeder.feed(feed_data) -feed_dict['init_ids'] = init_ids -feed_dict['init_scores'] = init_scores - -results = exe.run( -framework.default_main_program(), -feed=feed_dict, -fetch_list=[translation_ids, translation_scores], -return_numpy=False) - -result_ids = np.array(results[0]) -result_scores = np.array(results[1]) - -print("Original sentence:") -print(" ".join([src_dict[w] for w in feed_data[0][0]])) -print("Translated sentence:") -print(" ".join([trg_dict[w] for w in result_ids])) -print("Corresponding score: ", result_scores) - -break -``` - -## 总结 - -端到端的神经网络机器翻译是近几年兴起的一种全新的机器翻译方法。本章中,我们介绍了NMT中典型的“编码器-解码器”框架。由于NMT是一个典型的Seq2Seq(Sequence to Sequence,序列到序列)学习问题,因此,Seq2Seq中的query改写(query rewriting)、摘要、单轮对话等问题都可以用本教程的模型来解决。 - -## 参考文献 - -1. Koehn P. [Statistical machine translation](https://books.google.com.hk/books?id=4v_Cx1wIMLkC&printsec=frontcover&hl=zh-CN&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false)[M]. Cambridge University Press, 2009. -2. 
Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://www.aclweb.org/anthology/D/D14/D14-1179.pdf)[C]//Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2014: 1724-1734. -3. Chung J, Gulcehre C, Cho K H, et al. [Empirical evaluation of gated recurrent neural networks on sequence modeling](https://arxiv.org/abs/1412.3555)[J]. arXiv preprint arXiv:1412.3555, 2014. -4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[C]//Proceedings of ICLR 2015, 2015. -5. Papineni K, Roukos S, Ward T, et al. [BLEU: a method for automatic evaluation of machine translation](http://dl.acm.org/citation.cfm?id=1073135)[C]//Proceedings of the 40th annual meeting on association for computational linguistics. Association for Computational Linguistics, 2002: 311-318. - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 +# 机器翻译 + +本教程源代码目录在[book/machine_translation](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。 + +## 背景介绍 + +机器翻译(machine translation, MT)是用计算机来实现不同语言之间翻译的技术。被翻译的语言通常称为源语言(source language),翻译成的结果语言称为目标语言(target language)。机器翻译即实现从源语言到目标语言转换的过程,是自然语言处理的重要研究领域之一。 + +早期机器翻译系统多为基于规则的翻译系统,需要由语言学家编写两种语言之间的转换规则,再将这些规则录入计算机。该方法对语言学家的要求非常高,而且我们几乎无法总结一门语言会用到的所有规则,更何况两种甚至更多的语言。因此,传统机器翻译方法面临的主要挑战是无法得到一个完备的规则集合\[[1](#参考文献)\]。 + +为解决以上问题,统计机器翻译(Statistical Machine Translation, SMT)技术应运而生。在统计机器翻译技术中,转化规则是由机器自动从大规模的语料中学习得到的,而非我们人主动提供规则。因此,它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题,但仍然存在许多挑战:1)人为设计许多特征(feature),但永远无法覆盖所有的语言现象;2)难以利用全局的特征;3)依赖于许多预处理环节,如词语对齐、分词或符号化(tokenization)、规则抽取、句法分析等,而每个环节的错误会逐步累积,对翻译的影响也越来越大。 + +近年来,深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类:1)仍以统计机器翻译系统为框架,只是利用神经网络来改进其中的关键模块,如语言模型、调序模型等(见图1的左半部分);2)不再以统计机器翻译系统为框架,而是直接用神经网络将源语言映射到目标语言,即端到端的神经网络机器翻译(End-to-End Neural Machine Translation, End-to-End NMT)(见图1的右半部分),简称为NMT模型。 +
+
+图1. 基于神经网络的机器翻译系统 +
+ +本教程主要介绍NMT模型,以及如何用PaddlePaddle来训练一个NMT模型。 + +## 效果展示 + +以中英翻译(中文翻译到英文)的模型为例,当模型训练完毕时,如果输入如下已分词的中文句子: +```text +这些 是 希望 的 曙光 和 解脱 的 迹象 . +``` +如果设定显示翻译结果的条数(即[柱搜索算法](#柱搜索算法)的宽度)为3,生成的英语句子如下: +```text +0 -5.36816 These are signs of hope and relief . <e> +1 -6.23177 These are the light of hope and relief . <e> +2 -7.7914 These are the light of hope and the relief of hope . <e> +``` + +- 左起第一列是生成句子的序号;左起第二列是该条句子的得分(从大到小),分值越高越好;左起第三列是生成的英语句子。 + +- 另外有两个特殊标志:`<e>`表示句子的结尾,`<unk>`表示未登录词(unknown word),即未在训练字典中出现的词。 + +## 模型概览 + +本节依次介绍双向循环神经网络(Bi-directional Recurrent Neural Network),NMT模型中典型的编码器-解码器(Encoder-Decoder)框架以及柱搜索(beam search)算法。 + +### 双向循环神经网络 + +我们已经在[语义角色标注](https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/README.cn.md)一章中介绍了一种双向循环神经网络,这里介绍Bengio团队在论文\[[2](#参考文献),[4](#参考文献)\]中提出的另一种结构。该结构的目的是输入一个序列,得到其在每个时刻的特征表示,即输出的每个时刻都用定长向量表示到该时刻的上下文语义信息。 + +具体来说,该双向循环神经网络分别在时间维以顺序和逆序——即前向(forward)和后向(backward)——依次处理输入序列,并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点,都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN,其中有六个权重矩阵:输入到前向隐层和后向隐层的权重矩阵(`$W_1, W_3$`),隐层到隐层自己的权重矩阵(`$W_2,W_5$`),前向隐层和后向隐层到输出层的权重矩阵(`$W_4, W_6$`)。注意,该网络的前向隐层和后向隐层之间没有连接。 + + +
+
+图2. 按时间步展开的双向循环神经网络 +
+ +### 编码器-解码器框架 + +编码器-解码器(Encoder-Decoder)\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量,解码阶段通过最大化预测序列概率,从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。 +![encoder_decoder](./image/encoder_decoder.png) +
+
+图3. 编码器-解码器框架 +
+ +#### 编码器 + +编码阶段分为三步: + +1. one-hot vector表示:将源语言句子`$x=\left \{ x_1,x_2,...,x_T \right \}$`的每个词`$x_i$`表示成一个列向量`$w_i\in \left \{ 0,1 \right \}^{\left | V \right |},i=1,2,...,T$`。这个向量`$w_i$`的维度与词汇表大小`$\left | V \right |$` 相同,并且只有一个维度上有值1(该位置对应该词在词汇表中的位置),其余全是0。 + +2. 映射到低维语义空间的词向量:one-hot vector表示存在两个问题,1)生成的向量维度往往很大,容易造成维数灾难;2)难以刻画词与词之间的关系(如语义相似性,也就是无法很好地表达语义)。因此,需再将one-hot vector映射到低维的语义空间,由一个固定维度的稠密向量(称为词向量)表示。记映射矩阵为`$C\in R^{K\times \left | V \right |}$`,用`$s_i=Cw_i$`表示第`$i$`个词的词向量,`$K$`为向量维度。 + +3. 用RNN编码源语言词序列:这一过程的计算公式为`$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$`,其中`$h_0$`是一个全零的向量,`$\varnothing _\theta$`是一个非线性激活函数,最后得到的`$\mathbf{h}=\left \{ h_1,..., h_T \right \}$`就是RNN依次读入源语言`$T$`个词的状态编码序列。整句话的向量表示可以采用`$\mathbf{h}$`在最后一个时间步`$T$`的状态编码,或使用时间维上的池化(pooling)结果。 + +第3步也可以使用双向循环神经网络实现更复杂的句编码表示,具体可以用双向GRU实现。前向GRU按照词序列`$(x_1,x_2,...,x_T)$`的顺序依次编码源语言端词,并得到一系列隐层状态`$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$`。类似的,后向GRU按照`$(x_T,x_{T-1},...,x_1)$`的顺序依次编码源语言端词,得到`$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$`。最后对于词`$x_i$`,通过拼接两个GRU的结果得到它的隐层状态,即`$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$`。 +
+
+图4. 使用双向GRU的编码器 +
+ +#### 解码器 + +机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是: +1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)`$c$`、真实目标语言序列的第`$i$`个词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。计算公式如下: +$$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$ +其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用[注意力机制](#注意力机制)时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$`;`$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记`<s>`,表示解码开始;`$z_i$`是`$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。 + +2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下: +$$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ +其中`$W_sz_{i+1}+b_z$`是对每个可能的输出单词进行打分,再用softmax归一化就可以得到第`$i+1$`个词的概率`$p_{i+1}$`。 + +3. 根据`$p_{i+1}$`和`$u_{i+1}$`计算代价。 + +4. 重复步骤1~3,直到目标语言序列中的所有词处理完毕。 + +机器翻译任务的生成过程,通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异,具体介绍请见[柱搜索算法](#柱搜索算法)。 + +### 柱搜索算法 + +柱搜索([beam search](http://en.wikipedia.org/wiki/Beam_search))是一种启发式图搜索算法,用于在图或树中搜索有限集合中的最优扩展节点,通常用在解空间非常大的系统(如机器翻译、语音识别)中,原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`你好`”,就算目标语言字典中只有3个词(`<s>`, `<e>`, `hello`),也可能生成无限句话(`hello`循环出现的次数不定),为了找到其中较好的翻译结果,我们可采用柱搜索算法。 + +柱搜索算法使用广度优先策略建立搜索树,在树的每一层,按照启发代价(heuristic cost)(本教程中,为生成词的log概率之和)对节点进行排序,然后仅留下预先确定的个数(文献中通常称为beam width、beam size、柱宽度等)的节点。只有这些节点会在下一层继续扩展,其他节点就被剪掉了,也就是说保留了质量较高的节点,剪枝了质量较差的节点。因此,搜索所占用的空间和时间大幅减少,但缺点是无法保证一定获得最优解。 + +使用柱搜索算法的解码阶段,目标是最大化生成序列的概率。思路是: +1. 每一个时刻,根据源语言句子的编码信息`$c$`、生成的第`$i$`个目标语言序列单词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。 + +2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。 + +3. 根据`$p_{i+1}$`采样出单词`$u_{i+1}$`。 + +4. 
重复步骤1~3,直到获得句子结束标记``或超过句子的最大生成长度为止。 + +注意:`$z_{i+1}$`和`$p_{i+1}$`的计算公式同[解码器](#解码器)中的一样。且由于生成时的每一步都是通过贪心法实现的,因此并不能保证得到全局最优解。 + +## 数据介绍 + +本教程使用[WMT-14](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/)数据集中的[bitexts(after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)作为训练集,[dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)作为测试集和生成集。 + +### 数据预处理 + +我们的预处理流程包括两步: + +- 将每个源语言到目标语言的平行语料库文件合并为一个文件: + +- 合并每个`XXX.src`和`XXX.trg`文件为`XXX`。 + +- `XXX`中的第`$i$`行内容为`XXX.src`中的第`$i$`行和`XXX.trg`中的第`$i$`行连接,用'\t'分隔。 + +- 创建训练数据的“源字典”和“目标字典”。每个字典都有**DICTSIZE**个单词,包括:语料中词频最高的(DICTSIZE - 3)个单词,和3个特殊符号``(序列的开始)、``(序列的结束)和``(未登录词)。 + +### 示例数据 + +因为完整的数据集数据量较大,为了验证训练流程,PaddlePaddle接口paddle.dataset.wmt14中默认提供了一个经过预处理的[较小规模的数据集](http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz)。 + +该数据集有193319条训练数据,6003条测试数据,词典长度为30000。因为数据规模限制,使用该数据集训练出来的模型效果无法保证。 + +## 模型配置说明 + +下面我们开始根据输入数据的形式配置模型。首先引入所需的库函数以及定义全局变量。 + +```python +from __future__ import print_function +import contextlib + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.layers as pd +from paddle.fluid.executor import Executor +from functools import partial +import os + +dict_size = 30000 +source_dict_dim = target_dict_dim = dict_size +hidden_dim = 32 +word_dim = 16 +batch_size = 2 +max_length = 8 +topk_size = 50 +beam_size = 2 + +decoder_size = hidden_dim +``` + +然后如下实现编码器框架: + + ```python + def encoder(is_sparse): + src_word_id = pd.data( + name="src_word_id", shape=[1], dtype='int64', lod_level=1) + src_embedding = pd.embedding( + input=src_word_id, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=is_sparse, + param_attr=fluid.ParamAttr(name='vemb')) + + fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = 
pd.sequence_last_step(input=lstm_hidden0) + return encoder_out + ``` + +再实现训练模式下的解码器: + +```python + def train_decoder(context, is_sparse): + trg_language_word = pd.data( + name="target_language_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = pd.embedding( + input=trg_language_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=is_sparse, + param_attr=fluid.ParamAttr(name='vemb')) + + rnn = pd.DynamicRNN() + with rnn.block(): + current_word = rnn.step_input(trg_embedding) + pre_state = rnn.memory(init=context) + current_state = pd.fc(input=[current_word, pre_state], + size=decoder_size, + act='tanh') + + current_score = pd.fc(input=current_state, + size=target_dict_dim, + act='softmax') + rnn.update_memory(pre_state, current_state) + rnn.output(current_score) + + return rnn() +``` + +实现推测模式下的解码器: + +```python +def decode(context, is_sparse): + init_state = context + array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) + counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) + + # fill the first element with init_state + state_array = pd.create_array('float32') + pd.array_write(init_state, array=state_array, i=counter) + + # ids, scores as memory + ids_array = pd.create_array('int64') + scores_array = pd.create_array('float32') + + init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = pd.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + + pd.array_write(init_ids, array=ids_array, i=counter) + pd.array_write(init_scores, array=scores_array, i=counter) + + cond = pd.less_than(x=counter, y=array_len) + + while_op = pd.While(cond=cond) + with while_op.block(): + pre_ids = pd.array_read(array=ids_array, i=counter) + pre_state = pd.array_read(array=state_array, i=counter) + pre_score = pd.array_read(array=scores_array, i=counter) + + # expand the lod of pre_state to be the same with pre_score + pre_state_expanded = pd.sequence_expand(pre_state, pre_score) + + 
pre_ids_emb = pd.embedding( + input=pre_ids, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=is_sparse) + + # use rnn unit to update rnn + current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb], + size=decoder_size, + act='tanh') + current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score) + # use score to do beam search + current_score = pd.fc(input=current_state_with_lod, + size=target_dict_dim, + act='softmax') + topk_scores, topk_indices = pd.topk(current_score, k=beam_size) + # calculate accumulated scores after topk to reduce computation cost + accu_scores = pd.elementwise_add( + x=pd.log(topk_scores), y=pd.reshape(pre_score, shape=[-1]), axis=0) + selected_ids, selected_scores = pd.beam_search( + pre_ids, + pre_score, + topk_indices, + accu_scores, + beam_size, + end_id=10, + level=0) + + pd.increment(x=counter, value=1, in_place=True) + + # update the memories + pd.array_write(current_state, array=state_array, i=counter) + pd.array_write(selected_ids, array=ids_array, i=counter) + pd.array_write(selected_scores, array=scores_array, i=counter) + + # update the break condition: up to the max length or all candidates of + # source sentences have ended. 
+ length_cond = pd.less_than(x=counter, y=array_len) + finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) + pd.logical_and(x=length_cond, y=finish_cond, out=cond) + + translation_ids, translation_scores = pd.beam_search_decode( + ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) + + return translation_ids, translation_scores +``` + +进而,我们定义一个`train_program`来使用`inference_program`计算出的结果,在标记数据的帮助下来计算误差。我们还定义了一个`optimizer_func`来定义优化器。 + +```python +def train_program(is_sparse): + context = encoder(is_sparse) + rnn_out = train_decoder(context, is_sparse) + label = pd.data( + name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) + cost = pd.cross_entropy(input=rnn_out, label=label) + avg_cost = pd.mean(cost) + return avg_cost + + +def optimizer_func(): + return fluid.optimizer.Adagrad( + learning_rate=1e-4, + regularization=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.1)) +``` + +## 训练模型 + +### 定义训练环境 +定义您的训练环境,可以指定训练是发生在CPU还是GPU上。 + +```python +use_cuda = False +place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() +``` + +### 定义数据提供器 +下一步是为训练和测试定义数据提供器。提供器读入一个大小为 `BATCH_SIZE`的数据。`paddle.dataset.wmt.train` 每次会在乱序化后提供一个大小为`BATCH_SIZE`的数据,乱序化的大小为缓存大小`buf_size`。 + +```python +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) +``` + +### 构造训练器(trainer) +训练器需要一个训练程序和一个训练优化函数。 + +```python +is_sparse = False +trainer = fluid.Trainer( + train_func=partial(train_program, is_sparse), + place=place, + optimizer_func=optimizer_func) +``` + +### 提供数据 + +`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如,`wmt14.train`产生的第一列的数据对应的是`src_word_id`这个特征。 + +```python +feed_order = [ + 'src_word_id', 'target_language_word', 'target_language_next_word' + ] +``` + +### 事件处理器 +回调函数`event_handler`在一个之前定义好的事件发生后会被调用。例如,我们可以在每步训练结束后查看误差。 + +```python +def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + if event.step % 10 
== 0: + print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step)) + + if event.step == 20: + trainer.stop() +``` + +### 开始训练 +最后,我们传入训练循环数(`num_epochs`)和一些别的参数,调用 `trainer.train` 来开始训练。 + +```python +EPOCH_NUM = 1 + +trainer.train( + reader=train_reader, + num_epochs=EPOCH_NUM, + event_handler=event_handler, + feed_order=feed_order) +``` + +## 应用模型 + +### 定义解码部分 + +使用上面定义的 `encoder` 和 `decode` 函数来推测翻译后的对应id和分数。 + +```python +context = encoder(is_sparse) +translation_ids, translation_scores = decode(context, is_sparse) +``` + +### 定义数据 + +我们先初始化id和分数来生成tensors来作为输入数据。在这个预测例子中,我们用`wmt14.test`数据中的第一个记录来做推测,最后我们用"源字典"和"目标字典"来列印对应的句子结果。 + +```python +init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64') +init_scores_data = np.array( + [1. for _ in range(batch_size)], dtype='float32') +init_ids_data = init_ids_data.reshape((batch_size, 1)) +init_scores_data = init_scores_data.reshape((batch_size, 1)) +init_lod = [1] * batch_size +init_lod = [init_lod, init_lod] + +init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) +init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) + +test_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(dict_size), buf_size=1000), + batch_size=batch_size) + +feed_order = ['src_word_id'] +feed_list = [ + framework.default_main_program().global_block().var(var_name) + for var_name in feed_order +] +feeder = fluid.DataFeeder(feed_list, place) + +src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) +``` + +### 测试 +现在我们可以进行预测了。我们要在`feed_order`提供对应参数,放在`executor`上运行以取得id和分数结果。 + +```python +exe = Executor(place) +exe.run(framework.default_startup_program()) + +for data in test_data(): + feed_data = map(lambda x: [x[0]], data) + feed_dict = feeder.feed(feed_data) + feed_dict['init_ids'] = init_ids + feed_dict['init_scores'] = init_scores + + results = exe.run( + framework.default_main_program(), + feed=feed_dict, + fetch_list=[translation_ids, translation_scores], 
+ return_numpy=False) + + result_ids = np.array(results[0]) + result_scores = np.array(results[1]) + + print("Original sentence:") + print(" ".join([src_dict[w] for w in feed_data[0][0][1:-1]])) + print("Translated score and sentence:") + for i in xrange(beam_size): + start_pos = result_ids_lod[1][i] + 1 + end_pos = result_ids_lod[1][i+1] + print("%d\t%.4f\t%s\n" % (i+1, result_scores[end_pos-1], + " ".join([trg_dict[w] for w in result_ids[start_pos:end_pos]]))) + + break +``` + +## 总结 + +端到端的神经网络机器翻译是近几年兴起的一种全新的机器翻译方法。本章中,我们介绍了NMT中典型的“编码器-解码器”框架。由于NMT是一个典型的Seq2Seq(Sequence to Sequence,序列到序列)学习问题,因此,Seq2Seq中的query改写(query rewriting)、摘要、单轮对话等问题都可以用本教程的模型来解决。 + +## 参考文献 + +1. Koehn P. [Statistical machine translation](https://books.google.com.hk/books?id=4v_Cx1wIMLkC&printsec=frontcover&hl=zh-CN&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false)[M]. Cambridge University Press, 2009. +2. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://www.aclweb.org/anthology/D/D14/D14-1179.pdf)[C]//Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2014: 1724-1734. +3. Chung J, Gulcehre C, Cho K H, et al. [Empirical evaluation of gated recurrent neural networks on sequence modeling](https://arxiv.org/abs/1412.3555)[J]. arXiv preprint arXiv:1412.3555, 2014. +4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[C]//Proceedings of ICLR 2015, 2015. +5. Papineni K, Roukos S, Ward T, et al. [BLEU: a method for automatic evaluation of machine translation](http://dl.acm.org/citation.cfm?id=1073135)[C]//Proceedings of the 40th annual meeting on association for computational linguistics. Association for Computational Linguistics, 2002: 311-318. + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png deleted file mode 100644 index 9d8efd50a49d0305586f550344472ab94c93bed3..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png deleted file mode 100755 index 4b35c88fc8ea2c503473c0c15711744e784d6af6..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png deleted file mode 100644 index 1b355e7786d25487a3f564af758c2c52c43b4690..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png deleted file mode 100755 index 3728f782ee09d9308d02b42305027b2735467ead..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png deleted file mode 100644 index 
28d7a15a3bd65262bde22a3f41b5aa78b46b368a..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png deleted file mode 100755 index ea8585565da1ecaf241654c278c6f9b15e283286..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png deleted file mode 100755 index 60aee0017de73f462e35708b1055aff8992c03e1..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png deleted file mode 100755 index 6b73798fe632e0873b35c117b86f347c8cf3116a..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png deleted file mode 100644 index 0cde685b84106650a4df18ce335a23e6338d3d11..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png and /dev/null differ diff --git 
a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png deleted file mode 100755 index a6af429f23f0f7e82650139bbd8dcbef27a34abe..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png deleted file mode 100644 index bf56d73ebf297fadf522389c7b6836dd379aa097..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png deleted file mode 100755 index 557310e044b2b6687e5ea6895417ed946ac7bc11..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/index.md b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md similarity index 66% rename from doc/fluid/new_docs/beginners_guide/basics/recommender_system/index.md rename to doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md index 09a07f3dc30abc57ab3731af054dd83491acc9a6..4b79e62f74e587fcd939d9f9e911af80992ea6a3 100644 --- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/index.md +++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md @@ -1,528 +1,537 @@ -# 个性化推荐 - -本教程源代码目录在[book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system), 
初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。 - -## 背景介绍 - -在网络技术不断发展和电子商务规模不断扩大的背景下,商品数量和种类快速增长,用户需要花费大量时间才能找到自己想买的商品,这就是信息超载问题。为了解决这个难题,推荐系统(Recommender System)应运而生。 - -个性化推荐系统是信息过滤系统(Information Filtering System)的子集,它可以用在很多领域,如电影、音乐、电商和 Feed 流推荐等。推荐系统通过分析、挖掘用户行为,发现用户的个性化需求与兴趣特点,将用户可能感兴趣的信息或商品推荐给用户。与搜索引擎不同,推荐系统不需要用户准确地描述出自己的需求,而是根据分析历史行为建模,主动提供满足用户兴趣和需求的信息。 - -传统的推荐系统方法主要有: - -- 协同过滤推荐(Collaborative Filtering Recommendation):该方法收集分析用户历史行为、活动、偏好,计算一个用户与其他用户的相似度,利用目标用户的相似用户对商品评价的加权评价值,来预测目标用户对特定商品的喜好程度。优点是可以给用户推荐未浏览过的新产品;缺点是对于没有任何行为的新用户存在冷启动的问题,同时也存在用户与商品之间的交互数据不够多造成的稀疏问题,会导致模型难以找到相近用户。 -- 基于内容过滤推荐[[1](#参考文献)](Content-based Filtering Recommendation):该方法利用商品的内容描述,抽象出有意义的特征,通过计算用户的兴趣和商品描述之间的相似度,来给用户做推荐。优点是简单直接,不需要依据其他用户对商品的评价,而是通过商品属性进行商品相似度度量,从而推荐给用户所感兴趣商品的相似商品;缺点是对于没有任何行为的新用户同样存在冷启动的问题。 -- 组合推荐[[2](#参考文献)](Hybrid Recommendation):运用不同的输入和技术共同进行推荐,以弥补各自推荐技术的缺点。 - -其中协同过滤是应用最广泛的技术之一,它又可以分为多个子类:基于用户 (User-Based)的推荐[[3](#参考文献)] 、基于物品(Item-Based)的推荐[[4](#参考文献)]、基于社交网络关系(Social-Based)的推荐[[5](#参考文献)]、基于模型(Model-based)的推荐等。1994年明尼苏达大学推出的GroupLens系统[[3](#参考文献)]一般被认为是推荐系统成为一个相对独立的研究方向的标志。该系统首次提出了基于协同过滤来完成推荐任务的思想,此后,基于该模型的协同过滤推荐引领了推荐系统十几年的发展方向。 - -深度学习具有优秀的自动提取特征的能力,能够学习多层次的抽象特征表示,并对异质或跨域的内容信息进行学习,可以一定程度上处理推荐系统冷启动问题[[6](#参考文献)]。本教程主要介绍个性化推荐的深度学习模型,以及如何使用PaddlePaddle实现模型。 - -## 效果展示 - -我们使用包含用户信息、电影信息与电影评分的数据集作为个性化推荐的应用场景。当我们训练好模型后,只需要输入对应的用户ID和电影ID,就可以得出一个匹配的分数(范围[0,5],分数越高视为兴趣越大),然后根据所有电影的推荐得分排序,推荐给用户可能感兴趣的电影。 - -``` -Input movie_id: 1962 -Input user_id: 1 -Prediction Score is 4.25 -``` - -## 模型概览 - -本章中,我们首先介绍YouTube的视频推荐系统[[7](#参考文献)],然后介绍我们实现的融合推荐模型。 - -### YouTube的深度神经网络推荐系统 - -YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成:候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选,排序网络对候选进行打分排序,输出排名最高的数十个结果。系统结构如图1所示: - -![YouTube_Overview](./image/YouTube_Overview.png) -

-图1. YouTube 推荐系统结构 -

- -#### 候选生成网络(Candidate Generation Network) - -候选生成网络将推荐问题建模为一个类别数极大的多类分类问题:对于一个Youtube用户,使用其观看历史(视频ID)、搜索词记录(search tokens)、人口学信息(如地理位置、用户登录设备)、二值特征(如性别,是否登录)和连续特征(如用户年龄)等,对视频库中所有视频进行多分类,得到每一类别的分类结果(即每一个视频的推荐概率),最终输出概率较高的几百个视频。 - -首先,将观看历史及搜索词记录这类历史信息,映射为向量后取平均值得到定长表示;同时,输入人口学特征以优化新用户的推荐效果,并将二值特征和连续特征归一化处理到[0, 1]范围。接下来,将所有特征表示拼接为一个向量,并输入给非线形多层感知器(MLP,详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md)教程)处理。最后,训练时将MLP的输出给softmax做分类,预测时计算用户的综合特征(MLP的输出)与所有视频的相似度,取得分最高的`$k$`个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。 - -![Deep_candidate_generation_model_architecture](./image/Deep_candidate_generation_model_architecture.png) -

-图2. 候选生成网络结构 -

- -对于一个用户`$U$`,预测此刻用户要观看的视频`$\omega$`为视频`$i$`的概率公式为: - -$$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$ - -其中`$u$`为用户`$U$`的特征表示,`$V$`为视频库集合,`$v_i$`为视频库中第`$i$`个视频的特征表示。`$u$`和`$v_i$`为长度相等的向量,两者点积可以通过全连接层实现。 - -考虑到softmax分类的类别数非常多,为了保证一定的计算效率:1)训练阶段,使用负样本类别采样将实际计算的类别数缩小至数千;2)推荐(预测)阶段,忽略softmax的归一化计算(不影响结果),将类别打分问题简化为点积(dot product)空间中的最近邻(nearest neighbor)搜索问题,取与`$u$`最近的`$k$`个视频作为生成的候选。 - -#### 排序网络(Ranking Network) -排序网络的结构类似于候选生成网络,但是它的目标是对候选进行更细致的打分排序。和传统广告排序中的特征抽取方法类似,这里也构造了大量的用于视频排序的相关特征(如视频 ID、上次观看时间等)。这些特征的处理方式和候选生成网络类似,不同之处是排序网络的顶部是一个加权逻辑回归(weighted logistic regression),它对所有候选视频进行打分,从高到底排序后将分数较高的一些视频返回给用户。 - -### 融合推荐模型 -本节会使卷积神经网络(Convolutional Neural Networks)来学习电影名称的表示。下面会依次介绍文本卷积神经网络以及融合推荐模型。 - -#### 文本卷积神经网络(CNN) - -卷积神经网络经常用来处理具有类似网格拓扑结构(grid-like topology)的数据。例如,图像可以视为二维网格的像素点,自然语言可以视为一维的词序列。卷积神经网络可以提取多种局部特征,并对其进行组合抽象得到更高级的特征表示。实验表明,卷积神经网络能高效地对图像及文本问题进行建模处理。 - -卷积神经网络主要由卷积(convolution)和池化(pooling)操作构成,其应用及组合方式灵活多变,种类繁多。本小结我们以如图3所示的网络进行讲解: - -![text_cnn](./image/text_cnn.png) -

-图3. 卷积神经网络文本分类模型 -

- -假设待处理句子的长度为`$n$`,其中第`$i$`个词的词向量(word embedding)为`$x_i\in\mathbb{R}^k$`,`$k$`为维度大小。 - -首先,进行词向量的拼接操作:将每`$h$`个词拼接起来形成一个大小为`$h$`的词窗口,记为`$x_{i:i+h-1}$`,它表示词序列`$x_{i},x_{i+1},\ldots,x_{i+h-1}$`的拼接,其中,`$i$`表示词窗口中第一个词在整个句子中的位置,取值范围从`$1$`到`$n-h+1$`,`$x_{i:i+h-1}\in\mathbb{R}^{hk}$`。 - -其次,进行卷积操作:把卷积核(kernel)`$w\in\mathbb{R}^{hk}$`应用于包含`$h$`个词的窗口`$x_{i:i+h-1}$`,得到特征`$c_i=f(w\cdot x_{i:i+h-1}+b)$`,其中`$b\in\mathbb{R}$`为偏置项(bias),`$f$`为非线性激活函数,如`$sigmoid$`。将卷积核应用于句子中所有的词窗口`${x_{1:h},x_{2:h+1},\ldots,x_{n-h+1:n}}$`,产生一个特征图(feature map): - -$$c=[c_1,c_2,\ldots,c_{n-h+1}], c \in \mathbb{R}^{n-h+1}$$ - -接下来,对特征图采用时间维度上的最大池化(max pooling over time)操作得到此卷积核对应的整句话的特征`$\hat c$`,它是特征图中所有元素的最大值: - -$$\hat c=max(c)$$ - -#### 模型概览 - -在融合推荐模型的电影推荐系统中: - -1. 首先,使用用户特征和电影特征作为神经网络的输入,其中: - -- 用户特征融合了四个属性信息,分别是用户ID、性别、职业和年龄。 - -- 电影特征融合了三个属性信息,分别是电影ID、电影类型ID和电影名称。 - -2. 对用户特征,将用户ID映射为维度大小为256的向量表示,输入全连接层,并对其他三个属性也做类似的处理。然后将四个属性的特征表示分别全连接并相加。 - -3. 对电影特征,将电影ID以类似用户ID的方式进行处理,电影类型ID以向量的形式直接输入全连接层,电影名称用文本卷积神经网络得到其定长向量表示。然后将三个属性的特征表示分别全连接并相加。 - -4. 得到用户和电影的向量表示后,计算二者的余弦相似度作为推荐系统的打分。最后,用该相似度打分和用户真实打分的差异的平方作为该回归模型的损失函数。 - -![rec_regression_network](./image/rec_regression_network.png) -

-图4. 融合推荐模型 -

- -## 数据准备 - -### 数据介绍与下载 - -我们以 [MovieLens 百万数据集(ml-1m)](http://files.grouplens.org/datasets/movielens/ml-1m.zip)为例进行介绍。ml-1m 数据集包含了 6,000 位用户对 4,000 部电影的 1,000,000 条评价(评分范围 1~5 分,均为整数),由 GroupLens Research 实验室搜集整理。 - -Paddle在API中提供了自动加载数据的模块。数据模块为 `paddle.dataset.movielens` - - -```python -import paddle -movie_info = paddle.dataset.movielens.movie_info() -print movie_info.values()[0] -``` - - -```python -# Run this block to show dataset's documentation -# help(paddle.dataset.movielens) -``` - -在原始数据中包含电影的特征数据,用户的特征数据,和用户对电影的评分。 - -例如,其中某一个电影特征为: - - -```python -movie_info = paddle.dataset.movielens.movie_info() -print movie_info.values()[0] -``` - - - - -这表示,电影的id是1,标题是《Toy Story》,该电影被分为到三个类别中。这三个类别是动画,儿童,喜剧。 - - -```python -user_info = paddle.dataset.movielens.user_info() -print user_info.values()[0] -``` - - - - -这表示,该用户ID是1,女性,年龄比18岁还年轻。职业ID是10。 - - -其中,年龄使用下列分布 -* 1: "Under 18" -* 18: "18-24" -* 25: "25-34" -* 35: "35-44" -* 45: "45-49" -* 50: "50-55" -* 56: "56+" - -职业是从下面几种选项里面选则得出: -* 0: "other" or not specified -* 1: "academic/educator" -* 2: "artist" -* 3: "clerical/admin" -* 4: "college/grad student" -* 5: "customer service" -* 6: "doctor/health care" -* 7: "executive/managerial" -* 8: "farmer" -* 9: "homemaker" -* 10: "K-12 student" -* 11: "lawyer" -* 12: "programmer" -* 13: "retired" -* 14: "sales/marketing" -* 15: "scientist" -* 16: "self-employed" -* 17: "technician/engineer" -* 18: "tradesman/craftsman" -* 19: "unemployed" -* 20: "writer" - -而对于每一条训练/测试数据,均为 <用户特征> + <电影特征> + 评分。 - -例如,我们获得第一条训练数据: - - -```python -train_set_creator = paddle.dataset.movielens.train() -train_sample = next(train_set_creator()) -uid = train_sample[0] -mov_id = train_sample[len(user_info[uid].value())] -print "User %s rates Movie %s with Score %s"%(user_info[uid], movie_info[mov_id], train_sample[-1]) -``` - -User rates Movie with Score [5.0] - - -即用户1对电影1193的评价为5分。 - -## 模型配置说明 - -下面我们开始根据输入数据的形式配置模型。首先引入所需的库函数以及定义全局变量。 - - -```python -import math -import sys -import 
numpy as np -import paddle -import paddle.fluid as fluid -import paddle.fluid.layers as layers -import paddle.fluid.nets as nets - -IS_SPARSE = True -USE_GPU = False -BATCH_SIZE = 256 -``` - -然后为我们的用户特征综合模型定义模型配置 - -```python -def get_usr_combined_features(): - -USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1 - -uid = layers.data(name='user_id', shape=[1], dtype='int64') - -usr_emb = layers.embedding( -input=uid, -dtype='float32', -size=[USR_DICT_SIZE, 32], -param_attr='user_table', -is_sparse=IS_SPARSE) - -usr_fc = layers.fc(input=usr_emb, size=32) - -USR_GENDER_DICT_SIZE = 2 - -usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64') - -usr_gender_emb = layers.embedding( -input=usr_gender_id, -size=[USR_GENDER_DICT_SIZE, 16], -param_attr='gender_table', -is_sparse=IS_SPARSE) - -usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) - -USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table) -usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64") - -usr_age_emb = layers.embedding( -input=usr_age_id, -size=[USR_AGE_DICT_SIZE, 16], -is_sparse=IS_SPARSE, -param_attr='age_table') - -usr_age_fc = layers.fc(input=usr_age_emb, size=16) - -USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1 -usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64") - -usr_job_emb = layers.embedding( -input=usr_job_id, -size=[USR_JOB_DICT_SIZE, 16], -param_attr='job_table', -is_sparse=IS_SPARSE) - -usr_job_fc = layers.fc(input=usr_job_emb, size=16) - -concat_embed = layers.concat( -input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1) - -usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") - -return usr_combined_features -``` - -如上述代码所示,对于每个用户,我们输入4维特征。其中包括user_id,gender_id,age_id,job_id。这几维特征均是简单的整数值。为了后续神经网络处理这些特征方便,我们借鉴NLP中的语言模型,将这几维离散的整数值,变换成embedding取出。分别形成usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb。 - -然后,我们对于所有的用户特征,均输入到一个全连接层(fc)中。将所有特征融合为一个200维度的特征。 - -进而,我们对每一个电影特征做类似的变换,网络配置为: - - 
-```python -def get_mov_combined_features(): - -MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1 - -mov_id = layers.data(name='movie_id', shape=[1], dtype='int64') - -mov_emb = layers.embedding( -input=mov_id, -dtype='float32', -size=[MOV_DICT_SIZE, 32], -param_attr='movie_table', -is_sparse=IS_SPARSE) - -mov_fc = layers.fc(input=mov_emb, size=32) - -CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories()) - -category_id = layers.data( -name='category_id', shape=[1], dtype='int64', lod_level=1) - -mov_categories_emb = layers.embedding( -input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE) - -mov_categories_hidden = layers.sequence_pool( -input=mov_categories_emb, pool_type="sum") - -MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) - -mov_title_id = layers.data( -name='movie_title', shape=[1], dtype='int64', lod_level=1) - -mov_title_emb = layers.embedding( -input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE) - -mov_title_conv = nets.sequence_conv_pool( -input=mov_title_emb, -num_filters=32, -filter_size=3, -act="tanh", -pool_type="sum") - -concat_embed = layers.concat( -input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1) - -mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") - -return mov_combined_features -``` - -电影标题名称(title)是一个序列的整数,整数代表的是这个词在索引序列中的下标。这个序列会被送入 `sequence_conv_pool` 层,这个层会在时间维度上使用卷积和池化。因为如此,所以输出会是固定长度,尽管输入的序列长度各不相同。 - -最后,我们定义一个`inference_program`来使用余弦相似度计算用户特征与电影特征的相似性。 - -```python -def inference_program(): -usr_combined_features = get_usr_combined_features() -mov_combined_features = get_mov_combined_features() - -inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) -scale_infer = layers.scale(x=inference, scale=5.0) - -return scale_infer -``` - -进而,我们定义一个`train_program`来使用`inference_program`计算出的结果,在标记数据的帮助下来计算误差。我们还定义了一个`optimizer_func`来定义优化器。 - -```python -def train_program(): - -scale_infer = 
inference_program() - -label = layers.data(name='score', shape=[1], dtype='float32') -square_cost = layers.square_error_cost(input=scale_infer, label=label) -avg_cost = layers.mean(square_cost) - -return [avg_cost, scale_infer] - - -def optimizer_func(): -return fluid.optimizer.SGD(learning_rate=0.2) -``` - - -## 训练模型 - -### 定义训练环境 -定义您的训练环境,可以指定训练是发生在CPU还是GPU上。 - -```python -use_cuda = False -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() -``` - -### 定义数据提供器 -下一步是为训练和测试定义数据提供器。提供器读入一个大小为 `BATCH_SIZE`的数据。`paddle.dataset.movielens.train` 每次会在乱序化后提供一个大小为`BATCH_SIZE`的数据,乱序化的大小为缓存大小`buf_size`。 - -```python -train_reader = paddle.batch( -paddle.reader.shuffle( -paddle.dataset.movielens.train(), buf_size=8192), -batch_size=BATCH_SIZE) - -test_reader = paddle.batch( -paddle.dataset.movielens.test(), batch_size=BATCH_SIZE) -``` - -### 构造训练器(trainer) -训练器需要一个训练程序和一个训练优化函数。 - -```python -trainer = fluid.Trainer( -train_func=train_program, place=place, optimizer_func=optimizer_func) -``` - -### 提供数据 - -`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如,`movielens.train`产生的第一列的数据对应的是`user_id`这个特征。 - -```python -feed_order = [ -'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id', -'movie_title', 'score' -] -``` - -### 事件处理器 -回调函数`event_handler`在一个之前定义好的事件发生后会被调用。例如,我们可以在每步训练结束后查看误差。 - -```python -# Specify the directory path to save the parameters -params_dirname = "recommender_system.inference.model" - -from paddle.v2.plot import Ploter -test_title = "Test cost" -plot_cost = Ploter(test_title) - - -def event_handler(event): -if isinstance(event, fluid.EndStepEvent): -avg_cost_set = trainer.test( -reader=test_reader, feed_order=feed_order) - -# get avg cost -avg_cost = np.array(avg_cost_set).mean() - -plot_cost.append(test_title, event.step, avg_cost_set[0]) -plot_cost.plot() - -print("avg_cost: %s" % avg_cost) -print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1, -float(avg_cost))) - -if event.step == 20: # Adjust this number for 
accuracy -trainer.save_params(params_dirname) -trainer.stop() -``` - -### 开始训练 -最后,我们传入训练循环数(`num_epoch`)和一些别的参数,调用 `trainer.train` 来开始训练。 - -```python -trainer.train( -num_epochs=1, -event_handler=event_handler, -reader=train_reader, -feed_order=feed_order) -``` - -## 应用模型 - -### 构建预测器 -传入`inference_program`和`params_dirname`来初始化一个预测器, `params_dirname`用来存放训练过程中的各个参数。 - -```python -inferencer = fluid.Inferencer( -inference_program, param_path=params_dirname, place=place) -``` - -### 生成测试用输入数据 -使用 create_lod_tensor(data, lod, place) 的API来生成细节层次的张量。`data`是一个序列,每个元素是一个索引号的序列。`lod`是细节层次的信息,对应于`data`。比如,data = [[10, 2, 3], [2, 3]] 意味着它包含两个序列,长度分别是3和2。于是相应地 lod = [[3, 2]],它表明其包含一层细节信息,意味着 `data` 有两个序列,长度分别是3和2。 - -在这个预测例子中,我们试着预测用户ID为1的用户对于电影'Hunchback of Notre Dame'的评分 - -```python -infer_movie_id = 783 -infer_movie_name = paddle.dataset.movielens.movie_info()[infer_movie_id].title -user_id = fluid.create_lod_tensor([[1]], [[1]], place) -gender_id = fluid.create_lod_tensor([[1]], [[1]], place) -age_id = fluid.create_lod_tensor([[0]], [[1]], place) -job_id = fluid.create_lod_tensor([[10]], [[1]], place) -movie_id = fluid.create_lod_tensor([[783]], [[1]], place) # Hunchback of Notre Dame -category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place) # Animation, Children's, Musical -movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]], -place) # 'hunchback','of','notre','dame','the' -``` - -### 测试 -现在我们可以进行预测了。我们要提供的`feed_order`应该和训练过程一致。 - - -```python -results = inferencer.infer( -{ -'user_id': user_id, -'gender_id': gender_id, -'age_id': age_id, -'job_id': job_id, -'movie_id': movie_id, -'category_id': category_id, -'movie_title': movie_title -}, -return_numpy=False) -``` - -## 总结 - -本章介绍了传统的推荐系统方法和YouTube的深度神经网络推荐系统,并以电影推荐为例,使用PaddlePaddle训练了一个个性化推荐神经网络模型。推荐系统几乎涵盖了电商系统、社交网络、广告推荐、搜索引擎等领域的方方面面,而在图像处理、自然语言处理等领域已经发挥重要作用的深度学习技术,也将会在推荐系统领域大放异彩。 - -## 参考文献 - -1. [Peter Brusilovsky](https://en.wikipedia.org/wiki/Peter_Brusilovsky) (2007). 
*The Adaptive Web*. p. 325. -2. Robin Burke , [Hybrid Web Recommender Systems](http://www.dcs.warwick.ac.uk/~acristea/courses/CS411/2010/Book%20-%20The%20Adaptive%20Web/HybridWebRecommenderSystems.pdf), pp. 377-408, The Adaptive Web, Peter Brusilovsky, Alfred Kobsa, Wolfgang Nejdl (Ed.), Lecture Notes in Computer Science, Springer-Verlag, Berlin, Germany, Lecture Notes in Computer Science, Vol. 4321, May 2007, 978-3-540-72078-2. -3. P. Resnick, N. Iacovou, etc. “[GroupLens: An Open Architecture for Collaborative Filtering of Netnews](http://ccs.mit.edu/papers/CCSWP165.html)”, Proceedings of ACM Conference on Computer Supported Cooperative Work, CSCW 1994. pp.175-186. -4. Sarwar, Badrul, et al. "[Item-based collaborative filtering recommendation algorithms.](http://files.grouplens.org/papers/www10_sarwar.pdf)" *Proceedings of the 10th international conference on World Wide Web*. ACM, 2001. -5. Kautz, Henry, Bart Selman, and Mehul Shah. "[Referral Web: combining social networks and collaborative filtering.](http://www.cs.cornell.edu/selman/papers/pdf/97.cacm.refweb.pdf)" Communications of the ACM 40.3 (1997): 63-65. APA -6. Yuan, Jianbo, et al. ["Solving Cold-Start Problem in Large-scale Recommendation Engines: A Deep Learning Approach."](https://arxiv.org/pdf/1611.05480v1.pdf) *arXiv preprint arXiv:1611.05480* (2016). -7. Covington P, Adams J, Sargin E. [Deep neural networks for youtube recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)[C]//Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016: 191-198. - - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 +# 个性化推荐 + +本教程源代码目录在[book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/176.html)。 + +## 背景介绍 + +在网络技术不断发展和电子商务规模不断扩大的背景下,商品数量和种类快速增长,用户需要花费大量时间才能找到自己想买的商品,这就是信息超载问题。为了解决这个难题,推荐系统(Recommender System)应运而生。 + +个性化推荐系统是信息过滤系统(Information Filtering System)的子集,它可以用在很多领域,如电影、音乐、电商和 Feed 流推荐等。推荐系统通过分析、挖掘用户行为,发现用户的个性化需求与兴趣特点,将用户可能感兴趣的信息或商品推荐给用户。与搜索引擎不同,推荐系统不需要用户准确地描述出自己的需求,而是根据分析历史行为建模,主动提供满足用户兴趣和需求的信息。 + +传统的推荐系统方法主要有: + +- 协同过滤推荐(Collaborative Filtering Recommendation):该方法收集分析用户历史行为、活动、偏好,计算一个用户与其他用户的相似度,利用目标用户的相似用户对商品评价的加权评价值,来预测目标用户对特定商品的喜好程度。优点是可以给用户推荐未浏览过的新产品;缺点是对于没有任何行为的新用户存在冷启动的问题,同时也存在用户与商品之间的交互数据不够多造成的稀疏问题,会导致模型难以找到相近用户。 +- 基于内容过滤推荐[[1](#参考文献)](Content-based Filtering Recommendation):该方法利用商品的内容描述,抽象出有意义的特征,通过计算用户的兴趣和商品描述之间的相似度,来给用户做推荐。优点是简单直接,不需要依据其他用户对商品的评价,而是通过商品属性进行商品相似度度量,从而推荐给用户所感兴趣商品的相似商品;缺点是对于没有任何行为的新用户同样存在冷启动的问题。 +- 组合推荐[[2](#参考文献)](Hybrid Recommendation):运用不同的输入和技术共同进行推荐,以弥补各自推荐技术的缺点。 + +其中协同过滤是应用最广泛的技术之一,它又可以分为多个子类:基于用户 (User-Based)的推荐[[3](#参考文献)] 、基于物品(Item-Based)的推荐[[4](#参考文献)]、基于社交网络关系(Social-Based)的推荐[[5](#参考文献)]、基于模型(Model-based)的推荐等。1994年明尼苏达大学推出的GroupLens系统[[3](#参考文献)]一般被认为是推荐系统成为一个相对独立的研究方向的标志。该系统首次提出了基于协同过滤来完成推荐任务的思想,此后,基于该模型的协同过滤推荐引领了推荐系统十几年的发展方向。 + +深度学习具有优秀的自动提取特征的能力,能够学习多层次的抽象特征表示,并对异质或跨域的内容信息进行学习,可以一定程度上处理推荐系统冷启动问题[[6](#参考文献)]。本教程主要介绍个性化推荐的深度学习模型,以及如何使用PaddlePaddle实现模型。 + +## 效果展示 + +我们使用包含用户信息、电影信息与电影评分的数据集作为个性化推荐的应用场景。当我们训练好模型后,只需要输入对应的用户ID和电影ID,就可以得出一个匹配的分数(范围[0,5],分数越高视为兴趣越大),然后根据所有电影的推荐得分排序,推荐给用户可能感兴趣的电影。 + +``` +Input movie_id: 1962 +Input user_id: 1 +Prediction Score is 4.25 +``` + +## 模型概览 + +本章中,我们首先介绍YouTube的视频推荐系统[[7](#参考文献)],然后介绍我们实现的融合推荐模型。 + +### YouTube的深度神经网络推荐系统 + 
+YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成:候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选,排序网络对候选进行打分排序,输出排名最高的数十个结果。系统结构如图1所示: + +

+
+图1. YouTube 推荐系统结构 +

+ +#### 候选生成网络(Candidate Generation Network) + +候选生成网络将推荐问题建模为一个类别数极大的多类分类问题:对于一个YouTube用户,使用其观看历史(视频ID)、搜索词记录(search tokens)、人口学信息(如地理位置、用户登录设备)、二值特征(如性别,是否登录)和连续特征(如用户年龄)等,对视频库中所有视频进行多分类,得到每一类别的分类结果(即每一个视频的推荐概率),最终输出概率较高的几百个视频。 + +首先,将观看历史及搜索词记录这类历史信息,映射为向量后取平均值得到定长表示;同时,输入人口学特征以优化新用户的推荐效果,并将二值特征和连续特征归一化处理到[0, 1]范围。接下来,将所有特征表示拼接为一个向量,并输入给非线性多层感知器(MLP,详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md)教程)处理。最后,训练时将MLP的输出给softmax做分类,预测时计算用户的综合特征(MLP的输出)与所有视频的相似度,取得分最高的$k$个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。 + +

+
+图2. 候选生成网络结构 +

+ +对于一个用户$U$,预测此刻用户要观看的视频$\omega$为视频$i$的概率公式为: + +$$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$ + +其中$u$为用户$U$的特征表示,$V$为视频库集合,$v_i$为视频库中第$i$个视频的特征表示。$u$和$v_i$为长度相等的向量,两者点积可以通过全连接层实现。 + +考虑到softmax分类的类别数非常多,为了保证一定的计算效率:1)训练阶段,使用负样本类别采样将实际计算的类别数缩小至数千;2)推荐(预测)阶段,忽略softmax的归一化计算(不影响结果),将类别打分问题简化为点积(dot product)空间中的最近邻(nearest neighbor)搜索问题,取与$u$最近的$k$个视频作为生成的候选。 + +#### 排序网络(Ranking Network) +排序网络的结构类似于候选生成网络,但是它的目标是对候选进行更细致的打分排序。和传统广告排序中的特征抽取方法类似,这里也构造了大量的用于视频排序的相关特征(如视频 ID、上次观看时间等)。这些特征的处理方式和候选生成网络类似,不同之处是排序网络的顶部是一个加权逻辑回归(weighted logistic regression),它对所有候选视频进行打分,从高到低排序后将分数较高的一些视频返回给用户。 + +### 融合推荐模型 +本节会使用卷积神经网络(Convolutional Neural Networks)来学习电影名称的表示。下面会依次介绍文本卷积神经网络以及融合推荐模型。 + +#### 文本卷积神经网络(CNN) + +卷积神经网络经常用来处理具有类似网格拓扑结构(grid-like topology)的数据。例如,图像可以视为二维网格的像素点,自然语言可以视为一维的词序列。卷积神经网络可以提取多种局部特征,并对其进行组合抽象得到更高级的特征表示。实验表明,卷积神经网络能高效地对图像及文本问题进行建模处理。 + +卷积神经网络主要由卷积(convolution)和池化(pooling)操作构成,其应用及组合方式灵活多变,种类繁多。本小节我们以如图3所示的网络进行讲解: + +

+
+图3. 卷积神经网络文本分类模型 +

+ +假设待处理句子的长度为$n$,其中第$i$个词的词向量(word embedding)为$x_i\in\mathbb{R}^k$,$k$为维度大小。 + +首先,进行词向量的拼接操作:将每$h$个词拼接起来形成一个大小为$h$的词窗口,记为$x_{i:i+h-1}$,它表示词序列$x_{i},x_{i+1},\ldots,x_{i+h-1}$的拼接,其中,$i$表示词窗口中第一个词在整个句子中的位置,取值范围从$1$到$n-h+1$,$x_{i:i+h-1}\in\mathbb{R}^{hk}$。 + +其次,进行卷积操作:把卷积核(kernel)$w\in\mathbb{R}^{hk}$应用于包含$h$个词的窗口$x_{i:i+h-1}$,得到特征$c_i=f(w\cdot x_{i:i+h-1}+b)$,其中$b\in\mathbb{R}$为偏置项(bias),$f$为非线性激活函数,如$sigmoid$。将卷积核应用于句子中所有的词窗口${x_{1:h},x_{2:h+1},\ldots,x_{n-h+1:n}}$,产生一个特征图(feature map): + +$$c=[c_1,c_2,\ldots,c_{n-h+1}], c \in \mathbb{R}^{n-h+1}$$ + +接下来,对特征图采用时间维度上的最大池化(max pooling over time)操作得到此卷积核对应的整句话的特征$\hat c$,它是特征图中所有元素的最大值: + +$$\hat c=max(c)$$ + +#### 模型概览 + +在融合推荐模型的电影推荐系统中: + +1. 首先,使用用户特征和电影特征作为神经网络的输入,其中: + + - 用户特征融合了四个属性信息,分别是用户ID、性别、职业和年龄。 + + - 电影特征融合了三个属性信息,分别是电影ID、电影类型ID和电影名称。 + +2. 对用户特征,将用户ID映射为维度大小为256的向量表示,输入全连接层,并对其他三个属性也做类似的处理。然后将四个属性的特征表示分别全连接并相加。 + +3. 对电影特征,将电影ID以类似用户ID的方式进行处理,电影类型ID以向量的形式直接输入全连接层,电影名称用文本卷积神经网络得到其定长向量表示。然后将三个属性的特征表示分别全连接并相加。 + +4. 得到用户和电影的向量表示后,计算二者的余弦相似度作为推荐系统的打分。最后,用该相似度打分和用户真实打分的差异的平方作为该回归模型的损失函数。 + +

+ +
+图4. 融合推荐模型 +

+ +## 数据准备 + +### 数据介绍与下载 + +我们以 [MovieLens 百万数据集(ml-1m)](http://files.grouplens.org/datasets/movielens/ml-1m.zip)为例进行介绍。ml-1m 数据集包含了 6,000 位用户对 4,000 部电影的 1,000,000 条评价(评分范围 1~5 分,均为整数),由 GroupLens Research 实验室搜集整理。 + +Paddle在API中提供了自动加载数据的模块。数据模块为 `paddle.dataset.movielens` + + +```python +import paddle +movie_info = paddle.dataset.movielens.movie_info() +print movie_info.values()[0] +``` + + +```python +# Run this block to show dataset's documentation +# help(paddle.dataset.movielens) +``` + +在原始数据中包含电影的特征数据,用户的特征数据,和用户对电影的评分。 + +例如,其中某一个电影特征为: + + +```python +movie_info = paddle.dataset.movielens.movie_info() +print movie_info.values()[0] +``` + + + + +这表示,电影的id是1,标题是《Toy Story》,该电影被分到三个类别中。这三个类别是动画,儿童,喜剧。 + + +```python +user_info = paddle.dataset.movielens.user_info() +print user_info.values()[0] +``` + + + + +这表示,该用户ID是1,女性,年龄比18岁还年轻。职业ID是10。 + + +其中,年龄使用下列分布 + +* 1: "Under 18" +* 18: "18-24" +* 25: "25-34" +* 35: "35-44" +* 45: "45-49" +* 50: "50-55" +* 56: "56+" + +职业是从下面几种选项里面选择得出: + +* 0: "other" or not specified +* 1: "academic/educator" +* 2: "artist" +* 3: "clerical/admin" +* 4: "college/grad student" +* 5: "customer service" +* 6: "doctor/health care" +* 7: "executive/managerial" +* 8: "farmer" +* 9: "homemaker" +* 10: "K-12 student" +* 11: "lawyer" +* 12: "programmer" +* 13: "retired" +* 14: "sales/marketing" +* 15: "scientist" +* 16: "self-employed" +* 17: "technician/engineer" +* 18: "tradesman/craftsman" +* 19: "unemployed" +* 20: "writer" + +而对于每一条训练/测试数据,均为 <用户特征> + <电影特征> + 评分。 + +例如,我们获得第一条训练数据: + + +```python +train_set_creator = paddle.dataset.movielens.train() +train_sample = next(train_set_creator()) +uid = train_sample[0] +mov_id = train_sample[len(user_info[uid].value())] +print "User %s rates Movie %s with Score %s"%(user_info[uid], movie_info[mov_id], train_sample[-1]) +``` + + User rates Movie with Score [5.0] + + +即用户1对电影1193的评价为5分。 + +## 模型配置说明 + +下面我们开始根据输入数据的形式配置模型。首先引入所需的库函数以及定义全局变量。 + + +```python +from __future__ import 
print_function +import math +import sys +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.nets as nets + +IS_SPARSE = True +USE_GPU = False +BATCH_SIZE = 256 +``` + +然后为我们的用户特征综合模型定义模型配置 + +```python +def get_usr_combined_features(): + + USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1 + + uid = layers.data(name='user_id', shape=[1], dtype='int64') + + usr_emb = layers.embedding( + input=uid, + dtype='float32', + size=[USR_DICT_SIZE, 32], + param_attr='user_table', + is_sparse=IS_SPARSE) + + usr_fc = layers.fc(input=usr_emb, size=32) + + USR_GENDER_DICT_SIZE = 2 + + usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64') + + usr_gender_emb = layers.embedding( + input=usr_gender_id, + size=[USR_GENDER_DICT_SIZE, 16], + param_attr='gender_table', + is_sparse=IS_SPARSE) + + usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) + + USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table) + usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64") + + usr_age_emb = layers.embedding( + input=usr_age_id, + size=[USR_AGE_DICT_SIZE, 16], + is_sparse=IS_SPARSE, + param_attr='age_table') + + usr_age_fc = layers.fc(input=usr_age_emb, size=16) + + USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1 + usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64") + + usr_job_emb = layers.embedding( + input=usr_job_id, + size=[USR_JOB_DICT_SIZE, 16], + param_attr='job_table', + is_sparse=IS_SPARSE) + + usr_job_fc = layers.fc(input=usr_job_emb, size=16) + + concat_embed = layers.concat( + input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1) + + usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") + + return usr_combined_features +``` + +如上述代码所示,对于每个用户,我们输入4维特征。其中包括user_id,gender_id,age_id,job_id。这几维特征均是简单的整数值。为了后续神经网络处理这些特征方便,我们借鉴NLP中的语言模型,将这几维离散的整数值,变换成embedding取出。分别形成usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb。 + 
+然后,我们对于所有的用户特征,均输入到一个全连接层(fc)中。将所有特征融合为一个200维度的特征。 + +进而,我们对每一个电影特征做类似的变换,网络配置为: + + +```python +def get_mov_combined_features(): + + MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1 + + mov_id = layers.data(name='movie_id', shape=[1], dtype='int64') + + mov_emb = layers.embedding( + input=mov_id, + dtype='float32', + size=[MOV_DICT_SIZE, 32], + param_attr='movie_table', + is_sparse=IS_SPARSE) + + mov_fc = layers.fc(input=mov_emb, size=32) + + CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories()) + + category_id = layers.data( + name='category_id', shape=[1], dtype='int64', lod_level=1) + + mov_categories_emb = layers.embedding( + input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE) + + mov_categories_hidden = layers.sequence_pool( + input=mov_categories_emb, pool_type="sum") + + MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) + + mov_title_id = layers.data( + name='movie_title', shape=[1], dtype='int64', lod_level=1) + + mov_title_emb = layers.embedding( + input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE) + + mov_title_conv = nets.sequence_conv_pool( + input=mov_title_emb, + num_filters=32, + filter_size=3, + act="tanh", + pool_type="sum") + + concat_embed = layers.concat( + input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1) + + mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") + + return mov_combined_features +``` + +电影标题名称(title)是一个序列的整数,整数代表的是这个词在索引序列中的下标。这个序列会被送入 `sequence_conv_pool` 层,这个层会在时间维度上使用卷积和池化。因为如此,所以输出会是固定长度,尽管输入的序列长度各不相同。 + +最后,我们定义一个`inference_program`来使用余弦相似度计算用户特征与电影特征的相似性。 + +```python +def inference_program(): + usr_combined_features = get_usr_combined_features() + mov_combined_features = get_mov_combined_features() + + inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) + scale_infer = layers.scale(x=inference, scale=5.0) + + return scale_infer +``` + 
+进而,我们定义一个`train_program`来使用`inference_program`计算出的结果,在标记数据的帮助下来计算误差。我们还定义了一个`optimizer_func`来定义优化器。 + +```python +def train_program(): + + scale_infer = inference_program() + + label = layers.data(name='score', shape=[1], dtype='float32') + square_cost = layers.square_error_cost(input=scale_infer, label=label) + avg_cost = layers.mean(square_cost) + + return [avg_cost, scale_infer] + + +def optimizer_func(): + return fluid.optimizer.SGD(learning_rate=0.2) +``` + + +## 训练模型 + +### 定义训练环境 +定义您的训练环境,可以指定训练是发生在CPU还是GPU上。 + +```python +use_cuda = False +place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() +``` + +### 定义数据提供器 +下一步是为训练和测试定义数据提供器。提供器读入一个大小为 `BATCH_SIZE`的数据。`paddle.dataset.movielens.train` 每次会在乱序化后提供一个大小为`BATCH_SIZE`的数据,乱序化的大小为缓存大小`buf_size`。 + +```python +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.movielens.train(), buf_size=8192), + batch_size=BATCH_SIZE) + +test_reader = paddle.batch( + paddle.dataset.movielens.test(), batch_size=BATCH_SIZE) +``` + +### 构造训练器(trainer) +训练器需要一个训练程序和一个训练优化函数。 + +```python +trainer = fluid.Trainer( + train_func=train_program, place=place, optimizer_func=optimizer_func) +``` + +### 提供数据 + +`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如,`movielens.train`产生的第一列的数据对应的是`user_id`这个特征。 + +```python +feed_order = [ + 'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id', + 'movie_title', 'score' +] +``` + +### 事件处理器 +回调函数`event_handler`在一个之前定义好的事件发生后会被调用。例如,我们可以在每步训练结束后查看误差。 + +```python +# Specify the directory path to save the parameters +params_dirname = "recommender_system.inference.model" + +from paddle.v2.plot import Ploter +test_title = "Test cost" +plot_cost = Ploter(test_title) + + +def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + avg_cost_set = trainer.test( + reader=test_reader, feed_order=feed_order) + + # get avg cost + avg_cost = np.array(avg_cost_set).mean() + + plot_cost.append(test_title, event.step, avg_cost_set[0]) + 
plot_cost.plot() + + print("avg_cost: %s" % avg_cost) + print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1, + float(avg_cost))) + + if event.step == 20: # Adjust this number for accuracy + trainer.save_params(params_dirname) + trainer.stop() +``` + +### 开始训练 +最后,我们传入训练循环数(`num_epoch`)和一些别的参数,调用 `trainer.train` 来开始训练。 + +```python +trainer.train( + num_epochs=1, + event_handler=event_handler, + reader=train_reader, + feed_order=feed_order) +``` + +## 应用模型 + +### 构建预测器 +传入`inference_program`和`params_dirname`来初始化一个预测器, `params_dirname`用来存放训练过程中的各个参数。 + +```python +inferencer = fluid.Inferencer( + inference_program, param_path=params_dirname, place=place) +``` + +### 生成测试用输入数据 +使用 create_lod_tensor(data, lod, place) 的API来生成细节层次的张量。`data`是一个序列,每个元素是一个索引号的序列。`lod`是细节层次的信息,对应于`data`。比如,data = [[10, 2, 3], [2, 3]] 意味着它包含两个序列,长度分别是3和2。于是相应地 lod = [[3, 2]],它表明其包含一层细节信息,意味着 `data` 有两个序列,长度分别是3和2。 + +在这个预测例子中,我们试着预测用户ID为1的用户对于电影'Hunchback of Notre Dame'的评分 + +```python +infer_movie_id = 783 +infer_movie_name = paddle.dataset.movielens.movie_info()[infer_movie_id].title +user_id = fluid.create_lod_tensor([[1]], [[1]], place) +gender_id = fluid.create_lod_tensor([[1]], [[1]], place) +age_id = fluid.create_lod_tensor([[0]], [[1]], place) +job_id = fluid.create_lod_tensor([[10]], [[1]], place) +movie_id = fluid.create_lod_tensor([[783]], [[1]], place) # Hunchback of Notre Dame +category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place) # Animation, Children's, Musical +movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]], + place) # 'hunchback','of','notre','dame','the' +``` + +### 测试 +现在我们可以进行预测了。我们要提供的`feed_order`应该和训练过程一致。 + + +```python +results = inferencer.infer( + { + 'user_id': user_id, + 'gender_id': gender_id, + 'age_id': age_id, + 'job_id': job_id, + 'movie_id': movie_id, + 'category_id': category_id, + 'movie_title': movie_title + }, + return_numpy=False) + +predict_rating = np.array(results[0]) +print("Predict Rating of user id 
1 on movie \"" + infer_movie_name + "\" is " + str(predict_rating[0][0])) +print("Actual Rating of user id 1 on movie \"" + infer_movie_name + "\" is 4.") + +``` + +## 总结 + +本章介绍了传统的推荐系统方法和YouTube的深度神经网络推荐系统,并以电影推荐为例,使用PaddlePaddle训练了一个个性化推荐神经网络模型。推荐系统几乎涵盖了电商系统、社交网络、广告推荐、搜索引擎等领域的方方面面,而在图像处理、自然语言处理等领域已经发挥重要作用的深度学习技术,也将会在推荐系统领域大放异彩。 + +## 参考文献 + +1. [Peter Brusilovsky](https://en.wikipedia.org/wiki/Peter_Brusilovsky) (2007). *The Adaptive Web*. p. 325. +2. Robin Burke , [Hybrid Web Recommender Systems](http://www.dcs.warwick.ac.uk/~acristea/courses/CS411/2010/Book%20-%20The%20Adaptive%20Web/HybridWebRecommenderSystems.pdf), pp. 377-408, The Adaptive Web, Peter Brusilovsky, Alfred Kobsa, Wolfgang Nejdl (Ed.), Lecture Notes in Computer Science, Springer-Verlag, Berlin, Germany, Lecture Notes in Computer Science, Vol. 4321, May 2007, 978-3-540-72078-2. +3. P. Resnick, N. Iacovou, etc. “[GroupLens: An Open Architecture for Collaborative Filtering of Netnews](http://ccs.mit.edu/papers/CCSWP165.html)”, Proceedings of ACM Conference on Computer Supported Cooperative Work, CSCW 1994. pp.175-186. +4. Sarwar, Badrul, et al. "[Item-based collaborative filtering recommendation algorithms.](http://files.grouplens.org/papers/www10_sarwar.pdf)" *Proceedings of the 10th international conference on World Wide Web*. ACM, 2001. +5. Kautz, Henry, Bart Selman, and Mehul Shah. "[Referral Web: combining social networks and collaborative filtering.](http://www.cs.cornell.edu/selman/papers/pdf/97.cacm.refweb.pdf)" Communications of the ACM 40.3 (1997): 63-65. APA +6. Yuan, Jianbo, et al. ["Solving Cold-Start Problem in Large-scale Recommendation Engines: A Deep Learning Approach."](https://arxiv.org/pdf/1611.05480v1.pdf) *arXiv preprint arXiv:1611.05480* (2016). +7. Covington P, Adams J, Sargin E. 
[Deep neural networks for youtube recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)[C]//Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016: 191-198. + + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png deleted file mode 100644 index c213608e769f69fb2cfe8597f8e696ee53730e3d..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png deleted file mode 100644 index 8aedb2204371e7691140ceffa5992f6080bbf097..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png deleted file mode 100644 index 4298567ac5600173343299999965b20612e7affe..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png deleted file mode 100644 index a98e7cc67606b31e4c945f7eb907563e46dcef56..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png and /dev/null differ diff --git 
a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png deleted file mode 100644 index 7fd97b9cc3a0b9105b41591af4e8f8e4646bd681..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png deleted file mode 100644 index 90c9b09fb78db98391ee199934f2d16efd6d6652..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png deleted file mode 100755 index 6fc8e11967000ec48c1c0a6fa3c2eaecb80cbb84..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png deleted file mode 100644 index 61e63d9147cbc2901706ef80776d706e5368c3c5..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png deleted file mode 100644 index 
fbcae2be81141be955076e877b94b0ea5d7e4d4a..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/index.md b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md similarity index 70% rename from doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/index.md rename to doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md index 624de7e4d439953c7255481fb0c9d62ce94f3900..9900dfb9a67dc6f8940bd7dd3abfa15ac8a3488f 100644 --- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/index.md +++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md @@ -1,354 +1,356 @@ -# 情感分析 - -本教程源代码目录在[book/understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。 - -## 背景介绍 - -在自然语言处理中,情感分析一般是指判断一段文本所表达的情绪状态。其中,一段文本可以是一个句子,一个段落或一个文档。情绪状态可以是两类,如(正面,负面),(高兴,悲伤);也可以是三类,如(积极,消极,中性)等等。情感分析的应用场景十分广泛,如把用户在购物网站(亚马逊、天猫、淘宝等)、旅游网站、电影评论网站上发表的评论分成正面评论和负面评论;或为了分析用户对于某一产品的整体使用感受,抓取产品的用户评论并进行情感分析等等。表格1展示了对电影评论进行情感分析的例子: - -| 电影评论 | 类别 | -| -------- | ----- | -| 在冯小刚这几年的电影里,算最好的一部的了| 正面 | -| 很不好看,好像一个地方台的电视剧 | 负面 | -| 圆方镜头全程炫技,色调背景美则美矣,但剧情拖沓,口音不伦不类,一直努力却始终无法入戏| 负面| -|剧情四星。但是圆镜视角加上婺源的风景整个非常有中国写意山水画的感觉,看得实在太舒服了。。|正面| - -

表格 1 电影评论情感分析

- -在自然语言处理中,情感分析属于典型的**文本分类**问题,即把需要进行情感分析的文本划分为其所属类别。文本分类涉及文本表示和分类方法两个问题。在深度学习的方法出现之前,主流的文本表示方法为词袋模型BOW(bag of words),话题模型等等;分类方法有SVM(support vector machine), LR(logistic regression)等等。 - -对于一段文本,BOW表示会忽略其词顺序、语法和句法,将这段文本仅仅看做是一个词集合,因此BOW方法并不能充分表示文本的语义信息。例如,句子“这部电影糟糕透了”和“一个乏味,空洞,没有内涵的作品”在情感分析中具有很高的语义相似度,但是它们的BOW表示的相似度为0。又如,句子“一个空洞,没有内涵的作品”和“一个不空洞而且有内涵的作品”的BOW相似度很高,但实际上它们的意思很不一样。 - -本章我们所要介绍的深度学习模型克服了BOW表示的上述缺陷,它在考虑词顺序的基础上把文本映射到低维度的语义空间,并且以端对端(end to end)的方式进行文本表示及分类,其性能相对于传统方法有显著的提升\[[1](#参考文献)\]。 - -## 模型概览 -本章所使用的文本表示模型为卷积神经网络(Convolutional Neural Networks)和循环神经网络(Recurrent Neural Networks)及其扩展。下面依次介绍这几个模型。 - -### 文本卷积神经网络简介(CNN) - -我们在[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节介绍过应用于文本数据的卷积神经网络模型的计算过程,这里进行一个简单的回顾。 - -对卷积神经网络来说,首先使用卷积处理输入的词向量序列,产生一个特征图(feature map),对特征图采用时间维度上的最大池化(max pooling over time)操作得到此卷积核对应的整句话的特征,最后,将所有卷积核得到的特征拼接起来即为文本的定长向量表示,对于文本分类问题,将其连接至softmax即构建出完整的模型。在实际应用中,我们会使用多个卷积核来处理句子,窗口大小相同的卷积核堆叠起来形成一个矩阵,这样可以更高效的完成运算。另外,我们也可使用窗口大小不同的卷积核来处理句子,[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节的图3作为示意画了四个卷积核,不同颜色表示不同大小的卷积核操作。 - -对于一般的短文本分类问题,上文所述的简单的文本卷积网络即可达到很高的正确率\[[1](#参考文献)\]。若想得到更抽象更高级的文本特征表示,可以构建深层文本卷积神经网络\[[2](#参考文献),[3](#参考文献)\]。 - -### 循环神经网络(RNN) - -循环神经网络是一种能对序列数据进行精确建模的有力工具。实际上,循环神经网络的理论计算能力是图灵完备的\[[4](#参考文献)\]。自然语言是一种典型的序列数据(词序列),近年来,循环神经网络及其变体(如long short term memory\[[5](#参考文献)\]等)在自然语言处理的多个领域,如语言模型、句法解析、语义角色标注(或一般的序列标注)、语义表示、图文生成、对话、机器翻译等任务上均表现优异甚至成为目前效果最好的方法。 - -![rnn](./image/rnn.png) -

-图1. 循环神经网络按时间展开的示意图 -

- -循环神经网络按时间展开后如图1所示:在第`$t$`时刻,网络读入第`$t$`个输入`$x_t$`(向量表示)及前一时刻隐层的状态值`$h_{t-1}$`(向量表示,`$h_0$`一般初始化为`$0$`向量),计算得出本时刻隐层的状态值`$h_t$`,重复这一步骤直至读完所有输入。如果将循环神经网络所表示的函数记为`$f$`,则其公式可表示为: - -$$h_t=f(x_t,h_{t-1})=\sigma(W_{xh}x_t+W_{hh}h_{t-1}+b_h)$$ - -其中`$W_{xh}$`是输入到隐层的矩阵参数,`$W_{hh}$`是隐层到隐层的矩阵参数,`$b_h$`为隐层的偏置向量(bias)参数,`$\sigma$`为`$sigmoid$`函数。 - -在处理自然语言时,一般会先将词(one-hot表示)映射为其词向量(word embedding)表示,然后再作为循环神经网络每一时刻的输入`$x_t$`。此外,可以根据实际需要的不同在循环神经网络的隐层上连接其它层。如,可以把一个循环神经网络的隐层输出连接至下一个循环神经网络的输入构建深层(deep or stacked)循环神经网络,或者提取最后一个时刻的隐层状态作为句子表示进而使用分类模型等等。 - -### 长短期记忆网络(LSTM) - -对于较长的序列数据,循环神经网络的训练过程中容易出现梯度消失或爆炸现象\[[6](#参考文献)\]。为了解决这一问题,Hochreiter S, Schmidhuber J. (1997)提出了LSTM(long short term memory\[[5](#参考文献)\])。 - -相比于简单的循环神经网络,LSTM增加了记忆单元`$c$`、输入门`$i$`、遗忘门`$f$`及输出门`$o$`。这些门及记忆单元组合起来大大提升了循环神经网络处理长序列数据的能力。若将基于LSTM的循环神经网络表示的函数记为`$F$`,则其公式为: - -$$ h_t=F(x_t,h_{t-1})$$ - -`$F$`由下列公式组合而成\[[7](#参考文献)\]: -$$ i_t = \sigma{(W_{xi}x_t+W_{hi}h_{t-1}+W_{ci}c_{t-1}+b_i)} $$ -$$ f_t = \sigma(W_{xf}x_t+W_{hf}h_{t-1}+W_{cf}c_{t-1}+b_f) $$ -$$ c_t = f_t\odot c_{t-1}+i_t\odot tanh(W_{xc}x_t+W_{hc}h_{t-1}+b_c) $$ -$$ o_t = \sigma(W_{xo}x_t+W_{ho}h_{t-1}+W_{co}c_{t}+b_o) $$ -$$ h_t = o_t\odot tanh(c_t) $$ -其中,`$i_t, f_t, c_t, o_t$`分别表示输入门,遗忘门,记忆单元及输出门的向量值,带角标的`$W$`及`$b$`为模型参数,`$tanh$`为双曲正切函数,`$\odot$`表示逐元素(elementwise)的乘法操作。输入门控制着新输入进入记忆单元`$c$`的强度,遗忘门控制着记忆单元维持上一时刻值的强度,输出门控制着输出记忆单元的强度。三种门的计算方式类似,但有着完全不同的参数,它们各自以不同的方式控制着记忆单元`$c$`,如图2所示: - -![lstm](./image/lstm.png) -

-图2. 时刻`$t$`的LSTM [7] -

- -LSTM通过给简单的循环神经网络增加记忆及控制门的方式,增强了其处理远距离依赖问题的能力。类似原理的改进还有Gated Recurrent Unit (GRU)\[[8](#参考文献)\],其设计更为简洁一些。**这些改进虽然各有不同,但是它们的宏观描述却与简单的循环神经网络一样(如图2所示),即隐状态依据当前输入及前一时刻的隐状态来改变,不断地循环这一过程直至输入处理完毕:** - -$$ h_t=Recrurent(x_t,h_{t-1})$$ - -其中,`$Recrurent$`可以表示简单的循环神经网络、GRU或LSTM。 - -### 栈式双向LSTM(Stacked Bidirectional LSTM) - -对于正常顺序的循环神经网络,`$h_t$`包含了`$t$`时刻之前的输入信息,也就是上文信息。同样,为了得到下文信息,我们可以使用反方向(将输入逆序处理)的循环神经网络。结合构建深层循环神经网络的方法(深层神经网络往往能得到更抽象和高级的特征表示),我们可以通过构建更加强有力的基于LSTM的栈式双向循环神经网络\[[9](#参考文献)\],来对时序数据进行建模。 - -如图3所示(以三层为例),奇数层LSTM正向,偶数层LSTM反向,高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入,对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示(这一表示充分融合了文本的上下文信息,并且对文本进行了深层次抽象),最后我们将文本表示连接至softmax构建分类模型。 - -![stacked_lstm](./image/stacked_lstm.jpg) -

-图3. 栈式双向LSTM用于文本分类 -

- - -## 数据集介绍 - -我们以[IMDB情感分析数据集](http://ai.stanford.edu/%7Eamaas/data/sentiment/)为例进行介绍。IMDB数据集的训练集和测试集分别包含25000个已标注过的电影评论。其中,负面评论的得分小于等于4,正面评论的得分大于等于7,满分10分。 -```text -aclImdb -|- test -|-- neg -|-- pos -|- train -|-- neg -|-- pos -``` -Paddle在`dataset/imdb.py`中提实现了imdb数据集的自动下载和读取,并提供了读取字典、训练数据、测试数据等API。 - -## 配置模型 - -在该示例中,我们实现了两种文本分类算法,分别基于[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节介绍过的文本卷积神经网络,以及[栈式双向LSTM](#栈式双向LSTM(Stacked Bidirectional LSTM))。我们首先引入要用到的库和定义全局变量: - -```python -import paddle -import paddle.fluid as fluid -from functools import partial -import numpy as np - -CLASS_DIM = 2 -EMB_DIM = 128 -HID_DIM = 512 -BATCH_SIZE = 128 -USE_GPU = False -``` - - -### 文本卷积神经网络 -我们构建神经网络`convolution_net`,示例代码如下。 -需要注意的是:`fluid.nets.sequence_conv_pool` 包含卷积和池化层两个操作。 - -```python -def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim): -emb = fluid.layers.embedding( -input=data, size=[input_dim, emb_dim], is_sparse=True) -conv_3 = fluid.nets.sequence_conv_pool( -input=emb, -num_filters=hid_dim, -filter_size=3, -act="tanh", -pool_type="sqrt") -conv_4 = fluid.nets.sequence_conv_pool( -input=emb, -num_filters=hid_dim, -filter_size=4, -act="tanh", -pool_type="sqrt") -prediction = fluid.layers.fc( -input=[conv_3, conv_4], size=class_dim, act="softmax") -return prediction -``` - -网络的输入`input_dim`表示的是词典的大小,`class_dim`表示类别数。这里,我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。 - -### 栈式双向LSTM - -栈式双向神经网络`stacked_lstm_net`的代码片段如下: - -```python -def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num): - -emb = fluid.layers.embedding( -input=data, size=[input_dim, emb_dim], is_sparse=True) - -fc1 = fluid.layers.fc(input=emb, size=hid_dim) -lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim) - -inputs = [fc1, lstm1] - -for i in range(2, stacked_num + 1): -fc = fluid.layers.fc(input=inputs, 
size=hid_dim) -lstm, cell = fluid.layers.dynamic_lstm( -input=fc, size=hid_dim, is_reverse=(i % 2) == 0) -inputs = [fc, lstm] - -fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max') -lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max') - -prediction = fluid.layers.fc(input=[fc_last, lstm_last], -size=class_dim, -act='softmax') -return prediction -``` -以上的栈式双向LSTM抽象出了高级特征并把其映射到和分类类别数同样大小的向量上。`paddle.activation.Softmax`函数用来计算分类属于某个类别的概率。 - -重申一下,此处我们可以调用`convolution_net`或`stacked_lstm_net`的任何一个。我们以`convolution_net`为例。 - -接下来我们定义预测程序(`inference_program`)。预测程序使用`convolution_net`来对`fluid.layer.data`的输入进行预测。 - -```python -def inference_program(word_dict): -data = fluid.layers.data( -name="words", shape=[1], dtype="int64", lod_level=1) - -dict_dim = len(word_dict) -net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM) -return net -``` - -我们这里定义了`training_program`。它使用了从`inference_program`返回的结果来计算误差。我们同时定义了优化函数`optimizer_func`。 - -因为是有监督的学习,训练集的标签也在`paddle.layer.data`中定义了。在训练过程中,交叉熵用来在`paddle.layer.classification_cost`中作为损失函数。 - -在测试过程中,分类器会计算各个输出的概率。第一个返回的数值规定为 损耗(cost)。 - -```python -def train_program(word_dict): -prediction = inference_program(word_dict) -label = fluid.layers.data(name="label", shape=[1], dtype="int64") -cost = fluid.layers.cross_entropy(input=prediction, label=label) -avg_cost = fluid.layers.mean(cost) -accuracy = fluid.layers.accuracy(input=prediction, label=label) -return [avg_cost, accuracy] - - -def optimizer_func(): -return fluid.optimizer.Adagrad(learning_rate=0.002) -``` - -## 训练模型 - -### 定义训练环境 - -定义您的训练是在CPU上还是在GPU上: - - -```python -use_cuda = False -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() -``` - -### 定义数据提供器 - -下一步是为训练和测试定义数据提供器。提供器读入一个大小为 BATCH_SIZE的数据。paddle.dataset.imdb.train 每次会在乱序化后提供一个大小为BATCH_SIZE的数据,乱序化的大小为缓存大小buf_size。 - -注意:读取IMDB的数据可能会花费几分钟的时间,请耐心等待。 - -```python -print("Loading IMDB word dict....") -word_dict = paddle.dataset.imdb.word_dict() - -print ("Reading 
training data....") -train_reader = paddle.batch( -paddle.reader.shuffle( -paddle.dataset.imdb.train(word_dict), buf_size=25000), -batch_size=BATCH_SIZE) -``` - -### 构造训练器(trainer) -训练器需要一个训练程序和一个训练优化函数。 - -```python -trainer = fluid.Trainer( -train_func=partial(train_program, word_dict), -place=place, -optimizer_func=optimizer_func) -``` - -### 提供数据 - -`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如,`imdb.train`产生的第一列的数据对应的是`words`这个特征。 - -```python -feed_order = ['words', 'label'] -``` - -### 事件处理器 - -回调函数event_handler在一个之前定义好的事件发生后会被调用。例如,我们可以在每步训练结束后查看误差。 - -```python -# Specify the directory path to save the parameters -params_dirname = "understand_sentiment_conv.inference.model" - -def event_handler(event): -if isinstance(event, fluid.EndStepEvent): -print("Step {0}, Epoch {1} Metrics {2}".format( -event.step, event.epoch, map(np.array, event.metrics))) - -if event.step == 10: -trainer.save_params(params_dirname) -trainer.stop() -``` - -### 开始训练 - -最后,我们传入训练循环数(num_epoch)和一些别的参数,调用 trainer.train 来开始训练。 - -```python -trainer.train( -num_epochs=1, -event_handler=event_handler, -reader=train_reader, -feed_order=feed_order) -``` - -## 应用模型 - -### 构建预测器 - -传入`inference_program`和`params_dirname`来初始化一个预测器, `params_dirname`用来存放训练过程中的各个参数。 - -```python -inferencer = fluid.Inferencer( -inference_program, param_path=params_dirname, place=place) -``` - -### 生成测试用输入数据 - -为了进行预测,我们任意选取3个评论。请随意选取您看好的3个。我们把评论中的每个词对应到`word_dict`中的id。如果词典中没有这个词,则设为`unknown`。 -然后我们用`create_lod_tensor`来创建细节层次的张量。 - -```python -reviews_str = [ -'read the book forget the movie', 'this is a great movie', 'this is very bad' -] -reviews = [c.split() for c in reviews_str] - -UNK = word_dict[''] -lod = [] -for c in reviews: -lod.append([word_dict.get(words, UNK) for words in c]) - -base_shape = [[len(c) for c in lod]] - -tensor_words = fluid.create_lod_tensor(lod, base_shape, place) -``` - -## 应用模型 - -现在我们可以对每一条评论进行正面或者负面的预测啦。 - -```python -results = inferencer.infer({'words': tensor_words}) - 
-for i, r in enumerate(results[0]): -print("Predict probability of ", r[0], " to be positive and ", r[1], " to be negative for review \'", reviews_str[i], "\'") - -``` - - -## 总结 - -本章我们以情感分析为例,介绍了使用深度学习的方法进行端对端的短文本分类,并且使用PaddlePaddle完成了全部相关实验。同时,我们简要介绍了两种文本处理模型:卷积神经网络和循环神经网络。在后续的章节中我们会看到这两种基本的深度学习模型在其它任务上的应用。 - - -## 参考文献 -1. Kim Y. [Convolutional neural networks for sentence classification](http://arxiv.org/pdf/1408.5882)[J]. arXiv preprint arXiv:1408.5882, 2014. -2. Kalchbrenner N, Grefenstette E, Blunsom P. [A convolutional neural network for modelling sentences](http://arxiv.org/pdf/1404.2188.pdf?utm_medium=App.net&utm_source=PourOver)[J]. arXiv preprint arXiv:1404.2188, 2014. -3. Yann N. Dauphin, et al. [Language Modeling with Gated Convolutional Networks](https://arxiv.org/pdf/1612.08083v1.pdf)[J] arXiv preprint arXiv:1612.08083, 2016. -4. Siegelmann H T, Sontag E D. [On the computational power of neural nets](http://research.cs.queensu.ca/home/akl/cisc879/papers/SELECTED_PAPERS_FROM_VARIOUS_SOURCES/05070215382317071.pdf)[C]//Proceedings of the fifth annual workshop on Computational learning theory. ACM, 1992: 440-449. -5. Hochreiter S, Schmidhuber J. [Long short-term memory](http://web.eecs.utk.edu/~itamar/courses/ECE-692/Bobby_paper1.pdf)[J]. Neural computation, 1997, 9(8): 1735-1780. -6. Bengio Y, Simard P, Frasconi P. [Learning long-term dependencies with gradient descent is difficult](http://www-dsi.ing.unifi.it/~paolo/ps/tnn-94-gradient.pdf)[J]. IEEE transactions on neural networks, 1994, 5(2): 157-166. -7. Graves A. [Generating sequences with recurrent neural networks](http://arxiv.org/pdf/1308.0850)[J]. arXiv preprint arXiv:1308.0850, 2013. -8. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://arxiv.org/pdf/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014. -9. Zhou J, Xu W. 
[End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 +# 情感分析 + +本教程源代码目录在[book/understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/177.html)。 + +## 背景介绍 + +在自然语言处理中,情感分析一般是指判断一段文本所表达的情绪状态。其中,一段文本可以是一个句子,一个段落或一个文档。情绪状态可以是两类,如(正面,负面),(高兴,悲伤);也可以是三类,如(积极,消极,中性)等等。情感分析的应用场景十分广泛,如把用户在购物网站(亚马逊、天猫、淘宝等)、旅游网站、电影评论网站上发表的评论分成正面评论和负面评论;或为了分析用户对于某一产品的整体使用感受,抓取产品的用户评论并进行情感分析等等。表格1展示了对电影评论进行情感分析的例子: + +| 电影评论 | 类别 | +| -------- | ----- | +| 在冯小刚这几年的电影里,算最好的一部的了| 正面 | +| 很不好看,好像一个地方台的电视剧 | 负面 | +| 圆方镜头全程炫技,色调背景美则美矣,但剧情拖沓,口音不伦不类,一直努力却始终无法入戏| 负面| +|剧情四星。但是圆镜视角加上婺源的风景整个非常有中国写意山水画的感觉,看得实在太舒服了。。|正面| + +

表格 1 电影评论情感分析

+ +在自然语言处理中,情感分析属于典型的**文本分类**问题,即把需要进行情感分析的文本划分为其所属类别。文本分类涉及文本表示和分类方法两个问题。在深度学习的方法出现之前,主流的文本表示方法为词袋模型BOW(bag of words),话题模型等等;分类方法有SVM(support vector machine), LR(logistic regression)等等。 + +对于一段文本,BOW表示会忽略其词顺序、语法和句法,将这段文本仅仅看做是一个词集合,因此BOW方法并不能充分表示文本的语义信息。例如,句子“这部电影糟糕透了”和“一个乏味,空洞,没有内涵的作品”在情感分析中具有很高的语义相似度,但是它们的BOW表示的相似度为0。又如,句子“一个空洞,没有内涵的作品”和“一个不空洞而且有内涵的作品”的BOW相似度很高,但实际上它们的意思很不一样。 + +本章我们所要介绍的深度学习模型克服了BOW表示的上述缺陷,它在考虑词顺序的基础上把文本映射到低维度的语义空间,并且以端对端(end to end)的方式进行文本表示及分类,其性能相对于传统方法有显著的提升\[[1](#参考文献)\]。 + +## 模型概览 +本章所使用的文本表示模型为卷积神经网络(Convolutional Neural Networks)和循环神经网络(Recurrent Neural Networks)及其扩展。下面依次介绍这几个模型。 + +### 文本卷积神经网络简介(CNN) + +我们在[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节介绍过应用于文本数据的卷积神经网络模型的计算过程,这里进行一个简单的回顾。 + +对卷积神经网络来说,首先使用卷积处理输入的词向量序列,产生一个特征图(feature map),对特征图采用时间维度上的最大池化(max pooling over time)操作得到此卷积核对应的整句话的特征,最后,将所有卷积核得到的特征拼接起来即为文本的定长向量表示,对于文本分类问题,将其连接至softmax即构建出完整的模型。在实际应用中,我们会使用多个卷积核来处理句子,窗口大小相同的卷积核堆叠起来形成一个矩阵,这样可以更高效的完成运算。另外,我们也可使用窗口大小不同的卷积核来处理句子,[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节的图3作为示意画了四个卷积核,不同颜色表示不同大小的卷积核操作。 + +对于一般的短文本分类问题,上文所述的简单的文本卷积网络即可达到很高的正确率\[[1](#参考文献)\]。若想得到更抽象更高级的文本特征表示,可以构建深层文本卷积神经网络\[[2](#参考文献),[3](#参考文献)\]。 + +### 循环神经网络(RNN) + +循环神经网络是一种能对序列数据进行精确建模的有力工具。实际上,循环神经网络的理论计算能力是图灵完备的\[[4](#参考文献)\]。自然语言是一种典型的序列数据(词序列),近年来,循环神经网络及其变体(如long short term memory\[[5](#参考文献)\]等)在自然语言处理的多个领域,如语言模型、句法解析、语义角色标注(或一般的序列标注)、语义表示、图文生成、对话、机器翻译等任务上均表现优异甚至成为目前效果最好的方法。 + +

+
+图1. 循环神经网络按时间展开的示意图 +

+ +循环神经网络按时间展开后如图1所示:在第$t$时刻,网络读入第$t$个输入$x_t$(向量表示)及前一时刻隐层的状态值$h_{t-1}$(向量表示,$h_0$一般初始化为$0$向量),计算得出本时刻隐层的状态值$h_t$,重复这一步骤直至读完所有输入。如果将循环神经网络所表示的函数记为$f$,则其公式可表示为: + +$$h_t=f(x_t,h_{t-1})=\sigma(W_{xh}x_t+W_{hh}h_{t-1}+b_h)$$ + +其中$W_{xh}$是输入到隐层的矩阵参数,$W_{hh}$是隐层到隐层的矩阵参数,$b_h$为隐层的偏置向量(bias)参数,$\sigma$为$sigmoid$函数。 + +在处理自然语言时,一般会先将词(one-hot表示)映射为其词向量(word embedding)表示,然后再作为循环神经网络每一时刻的输入$x_t$。此外,可以根据实际需要的不同在循环神经网络的隐层上连接其它层。如,可以把一个循环神经网络的隐层输出连接至下一个循环神经网络的输入构建深层(deep or stacked)循环神经网络,或者提取最后一个时刻的隐层状态作为句子表示进而使用分类模型等等。 + +### 长短期记忆网络(LSTM) + +对于较长的序列数据,循环神经网络的训练过程中容易出现梯度消失或爆炸现象\[[6](#参考文献)\]。为了解决这一问题,Hochreiter S, Schmidhuber J. (1997)提出了LSTM(long short term memory\[[5](#参考文献)\])。 + +相比于简单的循环神经网络,LSTM增加了记忆单元$c$、输入门$i$、遗忘门$f$及输出门$o$。这些门及记忆单元组合起来大大提升了循环神经网络处理长序列数据的能力。若将基于LSTM的循环神经网络表示的函数记为$F$,则其公式为: + +$$ h_t=F(x_t,h_{t-1})$$ + +$F$由下列公式组合而成\[[7](#参考文献)\]: +$$ i_t = \sigma{(W_{xi}x_t+W_{hi}h_{t-1}+W_{ci}c_{t-1}+b_i)} $$ +$$ f_t = \sigma(W_{xf}x_t+W_{hf}h_{t-1}+W_{cf}c_{t-1}+b_f) $$ +$$ c_t = f_t\odot c_{t-1}+i_t\odot tanh(W_{xc}x_t+W_{hc}h_{t-1}+b_c) $$ +$$ o_t = \sigma(W_{xo}x_t+W_{ho}h_{t-1}+W_{co}c_{t}+b_o) $$ +$$ h_t = o_t\odot tanh(c_t) $$ +其中,$i_t, f_t, c_t, o_t$分别表示输入门,遗忘门,记忆单元及输出门的向量值,带角标的$W$及$b$为模型参数,$tanh$为双曲正切函数,$\odot$表示逐元素(elementwise)的乘法操作。输入门控制着新输入进入记忆单元$c$的强度,遗忘门控制着记忆单元维持上一时刻值的强度,输出门控制着输出记忆单元的强度。三种门的计算方式类似,但有着完全不同的参数,它们各自以不同的方式控制着记忆单元$c$,如图2所示: + +

+
+图2. 时刻$t$的LSTM [7] +

+ +LSTM通过给简单的循环神经网络增加记忆及控制门的方式,增强了其处理远距离依赖问题的能力。类似原理的改进还有Gated Recurrent Unit (GRU)\[[8](#参考文献)\],其设计更为简洁一些。**这些改进虽然各有不同,但是它们的宏观描述却与简单的循环神经网络一样(如图2所示),即隐状态依据当前输入及前一时刻的隐状态来改变,不断地循环这一过程直至输入处理完毕:** + +$$ h_t=Recurrent(x_t,h_{t-1})$$ + +其中,$Recurrent$可以表示简单的循环神经网络、GRU或LSTM。 + +### 栈式双向LSTM(Stacked Bidirectional LSTM) + +对于正常顺序的循环神经网络,$h_t$包含了$t$时刻之前的输入信息,也就是上文信息。同样,为了得到下文信息,我们可以使用反方向(将输入逆序处理)的循环神经网络。结合构建深层循环神经网络的方法(深层神经网络往往能得到更抽象和高级的特征表示),我们可以通过构建更加强有力的基于LSTM的栈式双向循环神经网络\[[9](#参考文献)\],来对时序数据进行建模。 + +如图3所示(以三层为例),奇数层LSTM正向,偶数层LSTM反向,高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入,对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示(这一表示充分融合了文本的上下文信息,并且对文本进行了深层次抽象),最后我们将文本表示连接至softmax构建分类模型。 + +

+
+图3. 栈式双向LSTM用于文本分类 +

+ + +## 数据集介绍 + +我们以[IMDB情感分析数据集](http://ai.stanford.edu/%7Eamaas/data/sentiment/)为例进行介绍。IMDB数据集的训练集和测试集分别包含25000个已标注过的电影评论。其中,负面评论的得分小于等于4,正面评论的得分大于等于7,满分10分。 +```text +aclImdb +|- test + |-- neg + |-- pos +|- train + |-- neg + |-- pos +``` +Paddle在`dataset/imdb.py`中提实现了imdb数据集的自动下载和读取,并提供了读取字典、训练数据、测试数据等API。 + +## 配置模型 + +在该示例中,我们实现了两种文本分类算法,分别基于[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节介绍过的文本卷积神经网络,以及[栈式双向LSTM](#栈式双向LSTM(Stacked Bidirectional LSTM))。我们首先引入要用到的库和定义全局变量: + +```python +from __future__ import print_function +import paddle +import paddle.fluid as fluid +from functools import partial +import numpy as np + +CLASS_DIM = 2 +EMB_DIM = 128 +HID_DIM = 512 +STACKED_NUM = 3 +BATCH_SIZE = 128 +USE_GPU = False +``` + + +### 文本卷积神经网络 +我们构建神经网络`convolution_net`,示例代码如下。 +需要注意的是:`fluid.nets.sequence_conv_pool` 包含卷积和池化层两个操作。 + +```python +def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim): + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + conv_4 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=4, + act="tanh", + pool_type="sqrt") + prediction = fluid.layers.fc( + input=[conv_3, conv_4], size=class_dim, act="softmax") + return prediction +``` + +网络的输入`input_dim`表示的是词典的大小,`class_dim`表示类别数。这里,我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。 + +### 栈式双向LSTM + +栈式双向神经网络`stacked_lstm_net`的代码片段如下: + +```python +def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num): + + emb = fluid.layers.embedding( + input=data, size=[input_dim, emb_dim], is_sparse=True) + + fc1 = fluid.layers.fc(input=emb, size=hid_dim) + lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim) + + inputs = [fc1, lstm1] + + 
for i in range(2, stacked_num + 1): + fc = fluid.layers.fc(input=inputs, size=hid_dim) + lstm, cell = fluid.layers.dynamic_lstm( + input=fc, size=hid_dim, is_reverse=(i % 2) == 0) + inputs = [fc, lstm] + + fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max') + lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max') + + prediction = fluid.layers.fc( + input=[fc_last, lstm_last], size=class_dim, act='softmax') + return prediction +``` +以上的栈式双向LSTM抽象出了高级特征并把其映射到和分类类别数同样大小的向量上。`paddle.activation.Softmax`函数用来计算分类属于某个类别的概率。 + +重申一下,此处我们可以调用`convolution_net`或`stacked_lstm_net`的任何一个。我们以`convolution_net`为例。 + +接下来我们定义预测程序(`inference_program`)。预测程序使用`convolution_net`来对`fluid.layer.data`的输入进行预测。 + +```python +def inference_program(word_dict): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + dict_dim = len(word_dict) + net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM) + # net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM) + return net +``` + +我们这里定义了`training_program`。它使用了从`inference_program`返回的结果来计算误差。我们同时定义了优化函数`optimizer_func`。 + +因为是有监督的学习,训练集的标签也在`paddle.layer.data`中定义了。在训练过程中,交叉熵用来在`paddle.layer.classification_cost`中作为损失函数。 + +在测试过程中,分类器会计算各个输出的概率。第一个返回的数值规定为 损耗(cost)。 + +```python +def train_program(word_dict): + prediction = inference_program(word_dict) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(cost) + accuracy = fluid.layers.accuracy(input=prediction, label=label) + return [avg_cost, accuracy] + + +def optimizer_func(): + return fluid.optimizer.Adagrad(learning_rate=0.002) +``` + +## 训练模型 + +### 定义训练环境 + +定义您的训练是在CPU上还是在GPU上: + + +```python +use_cuda = False +place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() +``` + +### 定义数据提供器 + +下一步是为训练和测试定义数据提供器。提供器读入一个大小为 BATCH_SIZE的数据。paddle.dataset.imdb.train 
每次会在乱序化后提供一个大小为BATCH_SIZE的数据,乱序化的大小为缓存大小buf_size。 + +注意:读取IMDB的数据可能会花费几分钟的时间,请耐心等待。 + +```python +print("Loading IMDB word dict....") +word_dict = paddle.dataset.imdb.word_dict() + +print ("Reading training data....") +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=25000), + batch_size=BATCH_SIZE) +``` + +### 构造训练器(trainer) +训练器需要一个训练程序和一个训练优化函数。 + +```python +trainer = fluid.Trainer( + train_func=partial(train_program, word_dict), + place=place, + optimizer_func=optimizer_func) +``` + +### 提供数据 + +`feed_order`用来定义每条产生的数据和`paddle.layer.data`之间的映射关系。比如,`imdb.train`产生的第一列的数据对应的是`words`这个特征。 + +```python +feed_order = ['words', 'label'] +``` + +### 事件处理器 + +回调函数event_handler在一个之前定义好的事件发生后会被调用。例如,我们可以在每步训练结束后查看误差。 + +```python +# Specify the directory path to save the parameters +params_dirname = "understand_sentiment_conv.inference.model" + +def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + print("Step {0}, Epoch {1} Metrics {2}".format( + event.step, event.epoch, map(np.array, event.metrics))) + + if event.step == 10: + trainer.save_params(params_dirname) + trainer.stop() +``` + +### 开始训练 + +最后,我们传入训练循环数(num_epoch)和一些别的参数,调用 trainer.train 来开始训练。 + +```python +trainer.train( + num_epochs=1, + event_handler=event_handler, + reader=train_reader, + feed_order=feed_order) +``` + +## 应用模型 + +### 构建预测器 + +传入`inference_program`和`params_dirname`来初始化一个预测器, `params_dirname`用来存放训练过程中的各个参数。 + +```python +inferencer = fluid.Inferencer( + infer_func=partial(inference_program, word_dict), param_path=params_dirname, place=place) +``` + +### 生成测试用输入数据 + +为了进行预测,我们任意选取3个评论。请随意选取您看好的3个。我们把评论中的每个词对应到`word_dict`中的id。如果词典中没有这个词,则设为`unknown`。 +然后我们用`create_lod_tensor`来创建细节层次的张量。 + +```python +reviews_str = [ + 'read the book forget the movie', 'this is a great movie', 'this is very bad' +] +reviews = [c.split() for c in reviews_str] + +UNK = word_dict[''] +lod = [] +for c in reviews: + 
lod.append([word_dict.get(words, UNK) for words in c]) + +base_shape = [[len(c) for c in lod]] + +tensor_words = fluid.create_lod_tensor(lod, base_shape, place) +``` + +## 应用模型 + +现在我们可以对每一条评论进行正面或者负面的预测啦。 + +```python +results = inferencer.infer({'words': tensor_words}) + +for i, r in enumerate(results[0]): + print("Predict probability of ", r[0], " to be positive and ", r[1], " to be negative for review \'", reviews_str[i], "\'") + +``` + + +## 总结 + +本章我们以情感分析为例,介绍了使用深度学习的方法进行端对端的短文本分类,并且使用PaddlePaddle完成了全部相关实验。同时,我们简要介绍了两种文本处理模型:卷积神经网络和循环神经网络。在后续的章节中我们会看到这两种基本的深度学习模型在其它任务上的应用。 + + +## 参考文献 +1. Kim Y. [Convolutional neural networks for sentence classification](http://arxiv.org/pdf/1408.5882)[J]. arXiv preprint arXiv:1408.5882, 2014. +2. Kalchbrenner N, Grefenstette E, Blunsom P. [A convolutional neural network for modelling sentences](http://arxiv.org/pdf/1404.2188.pdf?utm_medium=App.net&utm_source=PourOver)[J]. arXiv preprint arXiv:1404.2188, 2014. +3. Yann N. Dauphin, et al. [Language Modeling with Gated Convolutional Networks](https://arxiv.org/pdf/1612.08083v1.pdf)[J] arXiv preprint arXiv:1612.08083, 2016. +4. Siegelmann H T, Sontag E D. [On the computational power of neural nets](http://research.cs.queensu.ca/home/akl/cisc879/papers/SELECTED_PAPERS_FROM_VARIOUS_SOURCES/05070215382317071.pdf)[C]//Proceedings of the fifth annual workshop on Computational learning theory. ACM, 1992: 440-449. +5. Hochreiter S, Schmidhuber J. [Long short-term memory](http://web.eecs.utk.edu/~itamar/courses/ECE-692/Bobby_paper1.pdf)[J]. Neural computation, 1997, 9(8): 1735-1780. +6. Bengio Y, Simard P, Frasconi P. [Learning long-term dependencies with gradient descent is difficult](http://www-dsi.ing.unifi.it/~paolo/ps/tnn-94-gradient.pdf)[J]. IEEE transactions on neural networks, 1994, 5(2): 157-166. +7. Graves A. [Generating sequences with recurrent neural networks](http://arxiv.org/pdf/1308.0850)[J]. arXiv preprint arXiv:1308.0850, 2013. +8. 
Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://arxiv.org/pdf/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014. +9. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png deleted file mode 100644 index 98fbea413a98a619004ca669c67f5f867fe974c9..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png deleted file mode 100755 index d73a00bf2c1fca2f9b8c26bccf5ea844fa1db50b..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png deleted file mode 100755 index 26c904102a6e6c4e30f0048b81373ae8c148b355..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg deleted file mode 100644 index 6b2adf70f2b5112a2e82505da5cff9f5fd0c6298..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png deleted file mode 100755 index 8b5dbd726178b5555c513294e7b10a81acc96ff5..0000000000000000000000000000000000000000 Binary files 
a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/index.md b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md similarity index 55% rename from doc/fluid/new_docs/beginners_guide/basics/word2vec/index.md rename to doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md index e73a6334ca1acd49379604f24d3d4e463192a902..2c68cdac4f10319359b74bc92569dfd3f65380b5 100644 --- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/index.md +++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md @@ -1,440 +1,444 @@ - -# 词向量 - -本教程源代码目录在[book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。 - -## 背景介绍 - -本章我们介绍词的向量表征,也称为word embedding。词向量是自然语言处理中常见的一个操作,是搜索引擎、广告系统、推荐系统等互联网服务背后常见的基础技术。 - -在这些互联网服务里,我们经常要比较两个词或者两段文本之间的相关性。为了做这样的比较,我们往往先要把词表示成计算机适合处理的方式。最自然的方式恐怕莫过于向量空间模型(vector space model)。 -在这种方式里,每个词被表示成一个实数向量(one-hot vector),其长度为字典大小,每个维度对应一个字典里的每个词,除了这个词对应维度上的值是1,其他元素都是0。 - -One-hot vector虽然自然,但是用处有限。比如,在互联网广告系统里,如果用户输入的query是“母亲节”,而有一个广告的关键词是“康乃馨”。虽然按照常理,我们知道这两个词之间是有联系的——母亲节通常应该送给母亲一束康乃馨;但是这两个词对应的one-hot vectors之间的距离度量,无论是欧氏距离还是余弦相似度(cosine similarity),由于其向量正交,都认为这两个词毫无相关性。 得出这种与我们相悖的结论的根本原因是:每个词本身的信息量都太小。所以,仅仅给定两个词,不足以让我们准确判别它们是否相关。要想精确计算相关性,我们还需要更多的信息——从大量数据里通过机器学习方法归纳出来的知识。 - -在机器学习领域里,各种“知识”被各种模型表示,词向量模型(word embedding model)就是其中的一类。通过词向量模型可将一个 one-hot vector映射到一个维度更低的实数向量(embedding vector),如`$embedding(Mother's\ Day) = [0.3, 4.2, -1.5, ...], embedding(Carnation) = [0.2, 5.6, -2.3, ...]$`。在这个映射到的实数向量表示中,希望两个语义(或用法)上相似的词对应的词向量“更像”,这样如“母亲节”和“康乃馨”的对应词向量的余弦相似度就不再为零了。 - -词向量模型可以是概率模型、共生矩阵(co-occurrence matrix)模型或神经元网络模型。在用神经网络求词向量之前,传统做法是统计一个词语的共生矩阵`$X$`。`$X$`是一个`$|V| \times |V|$` 大小的矩阵,`$X_{ij}$`表示在所有语料中,词汇表`V`(vocabulary)中第i个词和第j个词同时出现的词数,`$|V|$`为词汇表的大小。对`$X$`做矩阵分解(如奇异值分解,Singular Value 
Decomposition \[[5](#参考文献)\]),得到的`$U$`即视为所有词的词向量: - -$$X = USV^T$$ - -但这样的传统做法有很多问题:
-1) 由于很多词没有出现,导致矩阵极其稀疏,因此需要对词频做额外处理来达到好的矩阵分解效果;
-2) 矩阵非常大,维度太高(通常达到`$10^6*10^6$`的数量级);
-3) 需要手动去掉停用词(如although, a,...),不然这些频繁出现的词也会影响矩阵分解的效果。 - - -基于神经网络的模型不需要计算存储一个在全语料上统计的大表,而是通过学习语义信息得到词向量,因此能很好地解决以上问题。在本章里,我们将展示基于神经网络训练词向量的细节,以及如何用PaddlePaddle训练一个词向量模型。 - - -## 效果展示 - -本章中,当词向量训练好后,我们可以用数据可视化算法t-SNE\[[4](#参考文献)\]画出词语特征在二维上的投影(如下图所示)。从图中可以看出,语义相关的词语(如a, the, these; big, huge)在投影上距离很近,语意无关的词(如say, business; decision, japan)在投影上的距离很远。 - -![2d_similarity](./image/2d_similarity.png) -

-图1. 词向量的二维投影 -

- -另一方面,我们知道两个向量的余弦值在`$[-1,1]$`的区间内:两个完全相同的向量余弦值为1, 两个相互垂直的向量之间余弦值为0,两个方向完全相反的向量余弦值为-1,即相关性和余弦值大小成正比。因此我们还可以计算两个词向量的余弦相似度: - -``` -similarity: 0.899180685161 -please input two words: big huge - -please input two words: from company -similarity: -0.0997506977351 -``` - -以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到,我们将在[应用模型](#应用模型)中详细描述用法。 - - -## 模型概览 - -在这里我们介绍三个训练词向量的模型:N-gram模型,CBOW模型和Skip-gram模型,它们的中心思想都是通过上下文得到一个词出现的概率。对于N-gram模型,我们会先介绍语言模型的概念,并在之后的[训练模型](#训练模型)中,带大家用PaddlePaddle实现它。而后两个模型,是近年来最有名的神经元词向量模型,由 Tomas Mikolov 在Google 研发\[[3](#参考文献)\],虽然它们很浅很简单,但训练效果很好。 - -### 语言模型 - -在介绍词向量模型之前,我们先来引入一个概念:语言模型。 -语言模型旨在为语句的联合概率函数`$P(w_1, ..., w_T)$`建模, 其中`$w_i$`表示句子中的第i个词。语言模型的目标是,希望模型对有意义的句子赋予大概率,对没意义的句子赋予小概率。 -这样的模型可以应用于很多领域,如机器翻译、语音识别、信息检索、词性标注、手写识别等,它们都希望能得到一个连续序列的概率。 以信息检索为例,当你在搜索“how long is a football bame”时(bame是一个医学名词),搜索引擎会提示你是否希望搜索"how long is a football game", 这是因为根据语言模型计算出“how long is a football bame”的概率很低,而与bame近似的,可能引起错误的词中,game会使该句生成的概率最大。 - -对语言模型的目标概率`$P(w_1, ..., w_T)$`,如果假设文本中每个词都是相互独立的,则整句话的联合概率可以表示为其中所有词语条件概率的乘积,即: - -$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t)$$ - -然而我们知道语句中的每个词出现的概率都与其前面的词紧密相关, 所以实际上通常用条件概率表示语言模型: - -$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t | w_1, ... 
, w_{t-1})$$ - - - -### N-gram neural model - -在计算语言学中,n-gram是一种重要的文本表示方法,表示一个文本中连续的n个项。基于具体的应用场景,每一项可以是一个字母、单词或者音节。 n-gram模型也是统计语言模型中的一种重要方法,用n-gram训练语言模型时,一般用每个n-gram的历史n-1个词语组成的内容来预测第n个词。 - -Yoshua Bengio等科学家就于2003年在著名论文 Neural Probabilistic Language Models \[[1](#参考文献)\] 中介绍如何学习一个神经元网络表示的词向量模型。文中的神经概率语言模型(Neural Network Language Model,NNLM)通过一个线性映射和一个非线性隐层连接,同时学习了语言模型和词向量,即通过学习大量语料得到词语的向量表达,通过这些向量得到整个句子的概率。用这种方法学习语言模型可以克服维度灾难(curse of dimensionality),即训练和测试数据不同导致的模型不准。注意:由于“神经概率语言模型”说法较为泛泛,我们在这里不用其NNLM的本名,考虑到其具体做法,本文中称该模型为N-gram neural model。 - -我们在上文中已经讲到用条件概率建模语言模型,即一句话中第`$t$`个词的概率和该句话的前`$t-1$`个词相关。可实际上越远的词语其实对该词的影响越小,那么如果考虑一个n-gram, 每个词都只受其前面`n-1`个词的影响,则有: - -$$P(w_1, ..., w_T) = \prod_{t=n}^TP(w_t|w_{t-1}, w_{t-2}, ..., w_{t-n+1})$$ - -给定一些真实语料,这些语料中都是有意义的句子,N-gram模型的优化目标则是最大化目标函数: - -$$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$ - -其中`$f(w_t, w_{t-1}, ..., w_{t-n+1})$`表示根据历史n-1个词得到当前词`$w_t$`的条件概率,`$R(\theta)$`表示参数正则项。 - -![nnlm](./image/nnlm.png) -

-图2. N-gram神经网络模型 -

- -图2展示了N-gram神经网络模型,从下往上看,该模型分为以下几个部分: -- 对于每个样本,模型输入`$w_{t-n+1},...w_{t-1}$`, 输出句子第t个词为字典中`|V|`个词的概率。 - -每个输入词`$w_{t-n+1},...w_{t-1}$`首先通过映射矩阵映射到词向量`$C(w_{t-n+1}),...C(w_{t-1})$`。 - -- 然后所有词语的词向量连接成一个大向量,并经过一个非线性映射得到历史词语的隐层表示: - -$$g=Utanh(\theta^Tx + b_1) + Wx + b_2$$ - -其中,`$x$`为所有词语的词向量连接成的大向量,表示文本历史特征;`$\theta$`、`$U$`、`$b_1$`、`$b_2$`和`$W$`分别为词向量层到隐层连接的参数。`$g$`表示未经归一化的所有输出单词概率,`$g_i$`表示未经归一化的字典中第`$i$`个单词的输出概率。 - -- 根据softmax的定义,通过归一化`$g_i$`, 生成目标词`$w_t$`的概率为: - -$$P(w_t | w_1, ..., w_{t-n+1}) = \frac{e^{g_{w_t}}}{\sum_i^{|V|} e^{g_i}}$$ - -- 整个网络的损失值(cost)为多类分类交叉熵,用公式表示为 - -$$J(\theta) = -\sum_{i=1}^N\sum_{c=1}^{|V|}y_k^{i}log(softmax(g_k^i))$$ - -其中`$y_k^i$`表示第`$i$`个样本第`$k$`类的真实标签(0或1),`$softmax(g_k^i)$`表示第i个样本第k类softmax输出的概率。 - - - -### Continuous Bag-of-Words model(CBOW) - -CBOW模型通过一个词的上下文(各N个词)预测当前词。当N=2时,模型如下图所示: - -![cbow](./image/cbow.png) -

-图3. CBOW模型 -

- -具体来说,不考虑上下文的词语输入顺序,CBOW是用上下文词语的词向量的均值来预测当前词。即: - -$$context = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$ - -其中`$x_t$`为第`$t$`个词的词向量,分类分数(score)向量 `$z=U*context$`,最终的分类`$y$`采用softmax,损失函数采用多类分类交叉熵。 - -### Skip-gram model - -CBOW的好处是对上下文词语的分布在词向量上进行了平滑,去掉了噪声,因此在小数据集上很有效。而Skip-gram的方法中,用一个词预测其上下文,得到了当前词上下文的很多样本,因此可用于更大的数据集。 - -![skipgram](./image/skipgram.png) -

-图4. Skip-gram模型 -

- -如上图所示,Skip-gram模型的具体做法是,将一个词的词向量映射到`$2n$`个词的词向量(`$2n$`表示当前输入词的前后各`$n$`个词),然后分别通过softmax得到这`$2n$`个词的分类损失值之和。 - - -## 数据准备 - -### 数据介绍 - -本教程使用Penn Treebank (PTB)(经Tomas Mikolov预处理过的版本)数据集。PTB数据集较小,训练速度快,应用于Mikolov的公开语言模型训练工具\[[2](#参考文献)\]中。其统计情况如下: - -

- - - - - - - - - - - - - - - - -
训练数据验证数据测试数据
ptb.train.txtptb.valid.txtptb.test.txt
42068句3370句3761句
-

- - -### 数据预处理 - -本章训练的是5-gram模型,表示在PaddlePaddle训练时,每条数据的前4个词用来预测第5个词。PaddlePaddle提供了对应PTB数据集的python包`paddle.dataset.imikolov`,自动做数据的下载与预处理,方便大家使用。 - -预处理会把数据集中的每一句话前后加上开始符号``以及结束符号``。然后依据窗口大小(本教程中为5),从头到尾每次向右滑动窗口并生成一条数据。 - -如"I have a dream that one day" 一句提供了5条数据: - -```text - I have a dream -I have a dream that -have a dream that one -a dream that one day -dream that one day -``` - -最后,每个输入会按其单词次在字典里的位置,转化成整数的索引序列,作为PaddlePaddle的输入。 - -## 编程实现 - -本配置的模型结构如下图所示: - -![ngram](./image/ngram.png) -

-图5. 模型配置中的N-gram神经网络模型 -

- -首先,加载所需要的包: - -```python -import paddle -import paddle.fluid as fluid -import numpy -from functools import partial -import math -import os -import sys -``` - -然后,定义参数: -```python -EMBED_SIZE = 32 # word vector dimension -HIDDEN_SIZE = 256 # hidden layer dimension -N = 5 # train 5-gram -BATCH_SIZE = 32 # batch size - -# can use CPU or GPU -use_cuda = os.getenv('WITH_GPU', '0') != '0' - -word_dict = paddle.dataset.imikolov.build_dict() -dict_size = len(word_dict) -``` - -不同于之前的PaddlePaddle v2版本,在新的Fluid版本里,我们不必再手动计算词向量。PaddlePaddle提供了一个内置的方法`fluid.layers.embedding`,我们就可以直接用它来构造 N-gram 神经网络。 - -- 我们来定义我们的 N-gram 神经网络结构。这个结构在训练和预测中都会使用到。因为词向量比较稀疏,我们传入参数 `is_sparse == True`, 可以加速稀疏矩阵的更新。 - -```python -def inference_program(is_sparse): -first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') -second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') -third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') -fourth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64') - -embed_first = fluid.layers.embedding( -input=first_word, -size=[dict_size, EMBED_SIZE], -dtype='float32', -is_sparse=is_sparse, -param_attr='shared_w') -embed_second = fluid.layers.embedding( -input=second_word, -size=[dict_size, EMBED_SIZE], -dtype='float32', -is_sparse=is_sparse, -param_attr='shared_w') -embed_third = fluid.layers.embedding( -input=third_word, -size=[dict_size, EMBED_SIZE], -dtype='float32', -is_sparse=is_sparse, -param_attr='shared_w') -embed_fourth = fluid.layers.embedding( -input=fourth_word, -size=[dict_size, EMBED_SIZE], -dtype='float32', -is_sparse=is_sparse, -param_attr='shared_w') - -concat_embed = fluid.layers.concat( -input=[embed_first, embed_second, embed_third, embed_fourth], axis=1) -hidden1 = fluid.layers.fc(input=concat_embed, -size=HIDDEN_SIZE, -act='sigmoid') -predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax') -return predict_word -``` - -- 基于以上的神经网络结构,我们可以如下定义我们的`训练`方法 - 
-```python -def train_program(is_sparse): -# The declaration of 'next_word' must be after the invoking of inference_program, -# or the data input order of train program would be [next_word, firstw, secondw, -# thirdw, fourthw], which is not correct. -predict_word = inference_program(is_sparse) -next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') -cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) -avg_cost = fluid.layers.mean(cost) -return avg_cost -``` - -- 现在我们可以开始训练啦。如今的版本较之以前就简单了许多。我们有现成的训练和测试集:`paddle.dataset.imikolov.train()`和`paddle.dataset.imikolov.test()`。两者都会返回一个读取器。在PaddlePaddle中,读取器是一个Python的函数,每次调用,会读取下一条数据。它是一个Python的generator。 - -`paddle.batch` 会读入一个读取器,然后输出一个批次化了的读取器。`event_handler`亦可以一并传入`trainer.train`来时不时的输出每个步骤,批次的训练情况。 - -```python -def optimizer_func(): -# Note here we need to choose more sophisticated optimizers -# such as AdaGrad with a decay rate. The normal SGD converges -# very slowly. -# optimizer=fluid.optimizer.SGD(learning_rate=0.001), -return fluid.optimizer.AdagradOptimizer( -learning_rate=3e-3, -regularization=fluid.regularizer.L2DecayRegularizer(8e-4)) - - -def train(use_cuda, train_program, params_dirname): -train_reader = paddle.batch( -paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) -test_reader = paddle.batch( -paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE) - -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - -def event_handler(event): -if isinstance(event, fluid.EndStepEvent): -# We output cost every 10 steps. -if event.step % 10 == 0: -outs = trainer.test( -reader=test_reader, -feed_order=['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']) -avg_cost = outs[0] - -print "Step %d: Average Cost %f" % (event.step, avg_cost) - -# If average cost is lower than 5.8, we consider the model good enough to stop. -# Note 5.8 is a relatively high value. In order to get a better model, one should -# aim for avg_cost lower than 3.5. 
But the training could take longer time. -if avg_cost < 5.8: -trainer.save_params(params_dirname) -trainer.stop() - -if math.isnan(avg_cost): -sys.exit("got NaN loss, training failed.") - -trainer = fluid.Trainer( -train_func=train_program, -optimizer_func=optimizer_func, -place=place) - -trainer.train( -reader=train_reader, -num_epochs=1, -event_handler=event_handler, -feed_order=['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']) -``` - -- `trainer.train`将会开始训练。从`event_handler`返回的监控情况如下: - -```python -Step 0: Average Cost 7.337213 -Step 10: Average Cost 6.136128 -Step 20: Average Cost 5.766995 -... -``` - -## 模型应用 -在模型训练后,我们可以用它做一些预测。 - -### 预测下一个词 -我们可以用我们训练过的模型,在得知之前的 N-gram 后,预测下一个词。 - -```python -def infer(use_cuda, inference_program, params_dirname=None): -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() -inferencer = fluid.Inferencer( -infer_func=inference_program, param_path=params_dirname, place=place) - -# Setup inputs by creating 4 LoDTensors representing 4 words. Here each word -# is simply an index to look up for the corresponding word vector and hence -# the shape of word (base_shape) should be [1]. The length-based level of -# detail (lod) info of each LoDtensor should be [[1]] meaning there is only -# one lod_level and there is only one sequence of one word on this level. -# Note that lod info should be a list of lists. 
- -data1 = [[211]] # 'among' -data2 = [[6]] # 'a' -data3 = [[96]] # 'group' -data4 = [[4]] # 'of' -lod = [[1]] - -first_word = fluid.create_lod_tensor(data1, lod, place) -second_word = fluid.create_lod_tensor(data2, lod, place) -third_word = fluid.create_lod_tensor(data3, lod, place) -fourth_word = fluid.create_lod_tensor(data4, lod, place) - -result = inferencer.infer( -{ -'firstw': first_word, -'secondw': second_word, -'thirdw': third_word, -'fourthw': fourth_word -}, -return_numpy=False) - -print(numpy.array(result[0])) -most_possible_word_index = numpy.argmax(result[0]) -print(most_possible_word_index) -print([ -key for key, value in word_dict.iteritems() -if value == most_possible_word_index -][0]) -``` - -在经历3分钟的短暂训练后,我们得到如下的预测。我们的模型预测 `among a group of` 的下一个词是`a`。这比较符合文法规律。如果我们训练时间更长,比如几个小时,那么我们会得到的下一个预测是 `workers`。 - - -```python -[[0.00106646 0.0007907 0.00072041 ... 0.00049024 0.00041355 0.00084464]] -6 -a -``` - -整个程序的入口很简单: - -```python -def main(use_cuda, is_sparse): -if use_cuda and not fluid.core.is_compiled_with_cuda(): -return - -params_dirname = "word2vec.inference.model" - -train( -use_cuda=use_cuda, -train_program=partial(train_program, is_sparse), -params_dirname=params_dirname) - -infer( -use_cuda=use_cuda, -inference_program=partial(inference_program, is_sparse), -params_dirname=params_dirname) - - -main(use_cuda=use_cuda, is_sparse=True) -``` - - -## 总结 -本章中,我们介绍了词向量、语言模型和词向量的关系、以及如何通过训练神经网络模型获得词向量。在信息检索中,我们可以根据向量间的余弦夹角,来判断query和文档关键词这二者间的相关性。在句法分析和语义分析中,训练好的词向量可以用来初始化模型,以得到更好的效果。在文档分类中,有了词向量之后,可以用聚类的方法将文档中同义词进行分组,也可以用 N-gram 来预测下一个词。希望大家在本章后能够自行运用词向量进行相关领域的研究。 - - -## 参考文献 -1. Bengio Y, Ducharme R, Vincent P, et al. [A neural probabilistic language model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)[J]. journal of machine learning research, 2003, 3(Feb): 1137-1155. -2. Mikolov T, Kombrink S, Deoras A, et al. 
[Rnnlm-recurrent neural network language modeling toolkit](http://www.fit.vutbr.cz/~imikolov/rnnlm/rnnlm-demo.pdf)[C]//Proc. of the 2011 ASRU Workshop. 2011: 196-201. -3. Mikolov T, Chen K, Corrado G, et al. [Efficient estimation of word representations in vector space](https://arxiv.org/pdf/1301.3781.pdf)[J]. arXiv preprint arXiv:1301.3781, 2013. -4. Maaten L, Hinton G. [Visualizing data using t-SNE](https://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf)[J]. Journal of Machine Learning Research, 2008, 9(Nov): 2579-2605. -5. https://en.wikipedia.org/wiki/Singular_value_decomposition - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 + +# 词向量 + +本教程源代码目录在[book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/175.html)。 + +## 背景介绍 + +本章我们介绍词的向量表征,也称为word embedding。词向量是自然语言处理中常见的一个操作,是搜索引擎、广告系统、推荐系统等互联网服务背后常见的基础技术。 + +在这些互联网服务里,我们经常要比较两个词或者两段文本之间的相关性。为了做这样的比较,我们往往先要把词表示成计算机适合处理的方式。最自然的方式恐怕莫过于向量空间模型(vector space model)。 +在这种方式里,每个词被表示成一个实数向量(one-hot vector),其长度为字典大小,每个维度对应一个字典里的每个词,除了这个词对应维度上的值是1,其他元素都是0。 + +One-hot vector虽然自然,但是用处有限。比如,在互联网广告系统里,如果用户输入的query是“母亲节”,而有一个广告的关键词是“康乃馨”。虽然按照常理,我们知道这两个词之间是有联系的——母亲节通常应该送给母亲一束康乃馨;但是这两个词对应的one-hot vectors之间的距离度量,无论是欧氏距离还是余弦相似度(cosine similarity),由于其向量正交,都认为这两个词毫无相关性。 得出这种与我们相悖的结论的根本原因是:每个词本身的信息量都太小。所以,仅仅给定两个词,不足以让我们准确判别它们是否相关。要想精确计算相关性,我们还需要更多的信息——从大量数据里通过机器学习方法归纳出来的知识。 + +在机器学习领域里,各种“知识”被各种模型表示,词向量模型(word embedding model)就是其中的一类。通过词向量模型可将一个 one-hot vector映射到一个维度更低的实数向量(embedding vector),如$embedding(母亲节) = [0.3, 4.2, -1.5, ...], embedding(康乃馨) = [0.2, 5.6, -2.3, ...]$。在这个映射到的实数向量表示中,希望两个语义(或用法)上相似的词对应的词向量“更像”,这样如“母亲节”和“康乃馨”的对应词向量的余弦相似度就不再为零了。 + +词向量模型可以是概率模型、共生矩阵(co-occurrence matrix)模型或神经元网络模型。在用神经网络求词向量之前,传统做法是统计一个词语的共生矩阵$X$。$X$是一个$|V| \times |V|$ 大小的矩阵,$X_{ij}$表示在所有语料中,词汇表`V`(vocabulary)中第i个词和第j个词同时出现的词数,$|V|$为词汇表的大小。对$X$做矩阵分解(如奇异值分解,Singular Value Decomposition \[[5](#参考文献)\]),得到的$U$即视为所有词的词向量: + +$$X = USV^T$$ + +但这样的传统做法有很多问题: + +1) 由于很多词没有出现,导致矩阵极其稀疏,因此需要对词频做额外处理来达到好的矩阵分解效果; + +2) 矩阵非常大,维度太高(通常达到$10^6 \times 10^6$的数量级); + +3) 需要手动去掉停用词(如although, a,...),不然这些频繁出现的词也会影响矩阵分解的效果。 + +基于神经网络的模型不需要计算存储一个在全语料上统计的大表,而是通过学习语义信息得到词向量,因此能很好地解决以上问题。在本章里,我们将展示基于神经网络训练词向量的细节,以及如何用PaddlePaddle训练一个词向量模型。 + + +## 效果展示 + +本章中,当词向量训练好后,我们可以用数据可视化算法t-SNE\[[4](#参考文献)\]画出词语特征在二维上的投影(如下图所示)。从图中可以看出,语义相关的词语(如a, the, these; big, huge)在投影上距离很近,语意无关的词(如say, business; decision, japan)在投影上的距离很远。 + +

+
+ 图1. 词向量的二维投影 +

+ +另一方面,我们知道两个向量的余弦值在$[-1,1]$的区间内:两个完全相同的向量余弦值为1, 两个相互垂直的向量之间余弦值为0,两个方向完全相反的向量余弦值为-1,即相关性和余弦值大小成正比。因此我们还可以计算两个词向量的余弦相似度: + +``` + +please input two words: big huge +similarity: 0.899180685161 + +please input two words: from company +similarity: -0.0997506977351 + +``` + +以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到,我们将在[应用模型](#应用模型)中详细描述用法。 + + +## 模型概览 + +在这里我们介绍三个训练词向量的模型:N-gram模型,CBOW模型和Skip-gram模型,它们的中心思想都是通过上下文得到一个词出现的概率。对于N-gram模型,我们会先介绍语言模型的概念,并在之后的[训练模型](#训练模型)中,带大家用PaddlePaddle实现它。而后两个模型,是近年来最有名的神经元词向量模型,由 Tomas Mikolov 在Google 研发\[[3](#参考文献)\],虽然它们很浅很简单,但训练效果很好。 + +### 语言模型 + +在介绍词向量模型之前,我们先来引入一个概念:语言模型。 +语言模型旨在为语句的联合概率函数$P(w_1, ..., w_T)$建模, 其中$w_i$表示句子中的第i个词。语言模型的目标是,希望模型对有意义的句子赋予大概率,对没意义的句子赋予小概率。 +这样的模型可以应用于很多领域,如机器翻译、语音识别、信息检索、词性标注、手写识别等,它们都希望能得到一个连续序列的概率。 以信息检索为例,当你在搜索“how long is a football bame”时(bame是一个医学名词),搜索引擎会提示你是否希望搜索"how long is a football game", 这是因为根据语言模型计算出“how long is a football bame”的概率很低,而与bame近似的,可能引起错误的词中,game会使该句生成的概率最大。 + +对语言模型的目标概率$P(w_1, ..., w_T)$,如果假设文本中每个词都是相互独立的,则整句话的联合概率可以表示为其中所有词语条件概率的乘积,即: + +$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t)$$ + +然而我们知道语句中的每个词出现的概率都与其前面的词紧密相关, 所以实际上通常用条件概率表示语言模型: + +$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t | w_1, ... 
, w_{t-1})$$ + + + +### N-gram neural model + +在计算语言学中,n-gram是一种重要的文本表示方法,表示一个文本中连续的n个项。基于具体的应用场景,每一项可以是一个字母、单词或者音节。 n-gram模型也是统计语言模型中的一种重要方法,用n-gram训练语言模型时,一般用每个n-gram的历史n-1个词语组成的内容来预测第n个词。 + +Yoshua Bengio等科学家就于2003年在著名论文 Neural Probabilistic Language Models \[[1](#参考文献)\] 中介绍如何学习一个神经元网络表示的词向量模型。文中的神经概率语言模型(Neural Network Language Model,NNLM)通过一个线性映射和一个非线性隐层连接,同时学习了语言模型和词向量,即通过学习大量语料得到词语的向量表达,通过这些向量得到整个句子的概率。用这种方法学习语言模型可以克服维度灾难(curse of dimensionality),即训练和测试数据不同导致的模型不准。注意:由于“神经概率语言模型”说法较为泛泛,我们在这里不用其NNLM的本名,考虑到其具体做法,本文中称该模型为N-gram neural model。 + +我们在上文中已经讲到用条件概率建模语言模型,即一句话中第$t$个词的概率和该句话的前$t-1$个词相关。可实际上越远的词语其实对该词的影响越小,那么如果考虑一个n-gram, 每个词都只受其前面`n-1`个词的影响,则有: + +$$P(w_1, ..., w_T) = \prod_{t=n}^TP(w_t|w_{t-1}, w_{t-2}, ..., w_{t-n+1})$$ + +给定一些真实语料,这些语料中都是有意义的句子,N-gram模型的优化目标则是最大化目标函数: + +$$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$ + +其中$f(w_t, w_{t-1}, ..., w_{t-n+1})$表示根据历史n-1个词得到当前词$w_t$的条件概率,$R(\theta)$表示参数正则项。 + +

+
+ 图2. N-gram神经网络模型 +

+ +图2展示了N-gram神经网络模型,从下往上看,该模型分为以下几个部分: + - 对于每个样本,模型输入$w_{t-n+1},...w_{t-1}$, 输出句子第t个词为字典中`|V|`个词的概率。 + + 每个输入词$w_{t-n+1},...w_{t-1}$首先通过映射矩阵映射到词向量$C(w_{t-n+1}),...C(w_{t-1})$。 + + - 然后所有词语的词向量连接成一个大向量,并经过一个非线性映射得到历史词语的隐层表示: + + $$g=Utanh(\theta^Tx + b_1) + Wx + b_2$$ + + 其中,$x$为所有词语的词向量连接成的大向量,表示文本历史特征;$\theta$、$U$、$b_1$、$b_2$和$W$分别为词向量层到隐层连接的参数。$g$表示未经归一化的所有输出单词概率,$g_i$表示未经归一化的字典中第$i$个单词的输出概率。 + + - 根据softmax的定义,通过归一化$g_i$, 生成目标词$w_t$的概率为: + + $$P(w_t | w_{t-1}, ..., w_{t-n+1}) = \frac{e^{g_{w_t}}}{\sum_i^{|V|} e^{g_i}}$$ + + - 整个网络的损失值(cost)为多类分类交叉熵,用公式表示为 + + $$J(\theta) = -\sum_{i=1}^N\sum_{k=1}^{|V|}y_k^{i}log(softmax(g_k^i))$$ + + 其中$y_k^i$表示第$i$个样本第$k$类的真实标签(0或1),$softmax(g_k^i)$表示第i个样本第k类softmax输出的概率。 + + + +### Continuous Bag-of-Words model(CBOW) + +CBOW模型通过一个词的上下文(各N个词)预测当前词。当N=2时,模型如下图所示: + +

+
+ 图3. CBOW模型 +

+ +具体来说,不考虑上下文的词语输入顺序,CBOW是用上下文词语的词向量的均值来预测当前词。即: + +$$context = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$ + +其中$x_t$为第$t$个词的词向量,分类分数(score)向量 $z=U*context$,最终的分类$y$采用softmax,损失函数采用多类分类交叉熵。 + +### Skip-gram model + +CBOW的好处是对上下文词语的分布在词向量上进行了平滑,去掉了噪声,因此在小数据集上很有效。而Skip-gram的方法中,用一个词预测其上下文,得到了当前词上下文的很多样本,因此可用于更大的数据集。 + +

+
+ 图4. Skip-gram模型 +

+ +如上图所示,Skip-gram模型的具体做法是,将一个词的词向量映射到$2n$个词的词向量($2n$表示当前输入词的前后各$n$个词),然后分别通过softmax得到这$2n$个词的分类损失值之和。 + + +## 数据准备 + +### 数据介绍 + +本教程使用Penn Treebank (PTB)(经Tomas Mikolov预处理过的版本)数据集。PTB数据集较小,训练速度快,应用于Mikolov的公开语言模型训练工具\[[2](#参考文献)\]中。其统计情况如下: + +

+ + + + + + + + + + + + + + + + +
训练数据验证数据测试数据
ptb.train.txtptb.valid.txtptb.test.txt
42068句3370句3761句
+

+ + +### 数据预处理 + +本章训练的是5-gram模型,表示在PaddlePaddle训练时,每条数据的前4个词用来预测第5个词。PaddlePaddle提供了对应PTB数据集的python包`paddle.dataset.imikolov`,自动做数据的下载与预处理,方便大家使用。 + +预处理会把数据集中的每一句话前后加上开始符号`<s>`以及结束符号`<e>`。然后依据窗口大小(本教程中为5),从头到尾每次向右滑动窗口并生成一条数据。 + +如"I have a dream that one day" 一句提供了5条数据: + +```text +<s> I have a dream +I have a dream that +have a dream that one +a dream that one day +dream that one day <e> +``` + +最后,每个输入会按其单词在字典里的位置,转化成整数的索引序列,作为PaddlePaddle的输入。 + +## 编程实现 + +本配置的模型结构如下图所示: + +

+
+ 图5. 模型配置中的N-gram神经网络模型 +

+ +首先,加载所需要的包: + +```python +import paddle +import paddle.fluid as fluid +import numpy +from functools import partial +import math +import os +import sys +from __future__ import print_function +``` + +然后,定义参数: +```python +EMBED_SIZE = 32 # word vector dimension +HIDDEN_SIZE = 256 # hidden layer dimension +N = 5 # train 5-gram +BATCH_SIZE = 32 # batch size + +# can use CPU or GPU +use_cuda = os.getenv('WITH_GPU', '0') != '0' + +word_dict = paddle.dataset.imikolov.build_dict() +dict_size = len(word_dict) +``` + +不同于之前的PaddlePaddle v2版本,在新的Fluid版本里,我们不必再手动计算词向量。PaddlePaddle提供了一个内置的方法`fluid.layers.embedding`,我们就可以直接用它来构造 N-gram 神经网络。 + +- 我们来定义我们的 N-gram 神经网络结构。这个结构在训练和预测中都会使用到。因为词向量比较稀疏,我们传入参数 `is_sparse == True`, 可以加速稀疏矩阵的更新。 + +```python +def inference_program(is_sparse): + first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') + second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') + third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') + fourth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64') + + embed_first = fluid.layers.embedding( + input=first_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=is_sparse, + param_attr='shared_w') + embed_second = fluid.layers.embedding( + input=second_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=is_sparse, + param_attr='shared_w') + embed_third = fluid.layers.embedding( + input=third_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=is_sparse, + param_attr='shared_w') + embed_fourth = fluid.layers.embedding( + input=fourth_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=is_sparse, + param_attr='shared_w') + + concat_embed = fluid.layers.concat( + input=[embed_first, embed_second, embed_third, embed_fourth], axis=1) + hidden1 = fluid.layers.fc(input=concat_embed, + size=HIDDEN_SIZE, + act='sigmoid') + predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax') 
+ return predict_word +``` + +- 基于以上的神经网络结构,我们可以如下定义我们的`训练`方法 + +```python +def train_program(is_sparse): + # The declaration of 'next_word' must be after the invoking of inference_program, + # or the data input order of train program would be [next_word, firstw, secondw, + # thirdw, fourthw], which is not correct. + predict_word = inference_program(is_sparse) + next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') + cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) + avg_cost = fluid.layers.mean(cost) + return avg_cost +``` + +- 现在我们可以开始训练啦。如今的版本较之以前就简单了许多。我们有现成的训练和测试集:`paddle.dataset.imikolov.train()`和`paddle.dataset.imikolov.test()`。两者都会返回一个读取器。在PaddlePaddle中,读取器是一个Python的函数,每次调用,会读取下一条数据。它是一个Python的generator。 + +`paddle.batch` 会读入一个读取器,然后输出一个批次化了的读取器。`event_handler`亦可以一并传入`trainer.train`来时不时的输出每个步骤,批次的训练情况。 + +```python +def optimizer_func(): + # Note here we need to choose more sophisticated optimizers + # such as AdaGrad with a decay rate. The normal SGD converges + # very slowly. + # optimizer=fluid.optimizer.SGD(learning_rate=0.001), + return fluid.optimizer.AdagradOptimizer( + learning_rate=3e-3, + regularization=fluid.regularizer.L2DecayRegularizer(8e-4)) + + +def train(use_cuda, train_program, params_dirname): + train_reader = paddle.batch( + paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + # We output cost every 10 steps. + if event.step % 10 == 0: + outs = trainer.test( + reader=test_reader, + feed_order=['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']) + avg_cost = outs[0] + + print("Step %d: Average Cost %f" % (event.step, avg_cost)) + + # If average cost is lower than 5.8, we consider the model good enough to stop. + # Note 5.8 is a relatively high value. 
In order to get a better model, one should + # aim for avg_cost lower than 3.5. But the training could take longer time. + if avg_cost < 5.8: + trainer.save_params(params_dirname) + trainer.stop() + + if math.isnan(avg_cost): + sys.exit("got NaN loss, training failed.") + + trainer = fluid.Trainer( + train_func=train_program, + optimizer_func=optimizer_func, + place=place) + + trainer.train( + reader=train_reader, + num_epochs=1, + event_handler=event_handler, + feed_order=['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']) +``` + +- `trainer.train`将会开始训练。从`event_handler`返回的监控情况如下: + +```text +Step 0: Average Cost 7.337213 +Step 10: Average Cost 6.136128 +Step 20: Average Cost 5.766995 +... +``` + +## 模型应用 +在模型训练后,我们可以用它做一些预测。 + +### 预测下一个词 +我们可以用我们训练过的模型,在得知之前的 N-gram 后,预测下一个词。 + +```python +def infer(use_cuda, inference_program, params_dirname=None): + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + inferencer = fluid.Inferencer( + infer_func=inference_program, param_path=params_dirname, place=place) + + # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word + # is simply an index to look up for the corresponding word vector and hence + # the shape of word (base_shape) should be [1]. The length-based level of + # detail (lod) info of each LoDtensor should be [[1]] meaning there is only + # one lod_level and there is only one sequence of one word on this level. + # Note that lod info should be a list of lists. 
+ + data1 = [[211]] # 'among' + data2 = [[6]] # 'a' + data3 = [[96]] # 'group' + data4 = [[4]] # 'of' + lod = [[1]] + + first_word = fluid.create_lod_tensor(data1, lod, place) + second_word = fluid.create_lod_tensor(data2, lod, place) + third_word = fluid.create_lod_tensor(data3, lod, place) + fourth_word = fluid.create_lod_tensor(data4, lod, place) + + result = inferencer.infer( + { + 'firstw': first_word, + 'secondw': second_word, + 'thirdw': third_word, + 'fourthw': fourth_word + }, + return_numpy=False) + + print(numpy.array(result[0])) + most_possible_word_index = numpy.argmax(result[0]) + print(most_possible_word_index) + print([ + key for key, value in word_dict.iteritems() + if value == most_possible_word_index + ][0]) +``` + +在经历3分钟的短暂训练后,我们得到如下的预测。我们的模型预测 `among a group of` 的下一个词是`a`。这比较符合文法规律。如果我们训练时间更长,比如几个小时,那么我们会得到的下一个预测是 `workers`。 + +```text +[[0.00106646 0.0007907 0.00072041 ... 0.00049024 0.00041355 0.00084464]] +6 +a +``` + +整个程序的入口很简单: + +```python +def main(use_cuda, is_sparse): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + + params_dirname = "word2vec.inference.model" + + train( + use_cuda=use_cuda, + train_program=partial(train_program, is_sparse), + params_dirname=params_dirname) + + infer( + use_cuda=use_cuda, + inference_program=partial(inference_program, is_sparse), + params_dirname=params_dirname) + + +main(use_cuda=use_cuda, is_sparse=True) +``` + + +## 总结 +本章中,我们介绍了词向量、语言模型和词向量的关系、以及如何通过训练神经网络模型获得词向量。在信息检索中,我们可以根据向量间的余弦夹角,来判断query和文档关键词这二者间的相关性。在句法分析和语义分析中,训练好的词向量可以用来初始化模型,以得到更好的效果。在文档分类中,有了词向量之后,可以用聚类的方法将文档中同义词进行分组,也可以用 N-gram 来预测下一个词。希望大家在本章后能够自行运用词向量进行相关领域的研究。 + + +## 参考文献 +1. Bengio Y, Ducharme R, Vincent P, et al. [A neural probabilistic language model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)[J]. journal of machine learning research, 2003, 3(Feb): 1137-1155. +2. Mikolov T, Kombrink S, Deoras A, et al. 
[Rnnlm-recurrent neural network language modeling toolkit](http://www.fit.vutbr.cz/~imikolov/rnnlm/rnnlm-demo.pdf)[C]//Proc. of the 2011 ASRU Workshop. 2011: 196-201. +3. Mikolov T, Chen K, Corrado G, et al. [Efficient estimation of word representations in vector space](https://arxiv.org/pdf/1301.3781.pdf)[J]. arXiv preprint arXiv:1301.3781, 2013. +4. Maaten L, Hinton G. [Visualizing data using t-SNE](https://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf)[J]. Journal of Machine Learning Research, 2008, 9(Nov): 2579-2605. +5. https://en.wikipedia.org/wiki/Singular_value_decomposition + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png deleted file mode 100644 index 384f59919a2c8dedb198e97d51434616648932e1..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png deleted file mode 100644 index 76b7d4bc0f99372465bd9aa34721513d39ad0776..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png deleted file mode 100755 index d985c393e618e9b79df05e4ff0ae57ccc93744d0..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png deleted file mode 100755 index 2e16ab2f443732b8ef5404a8e7cd2457bc5eee23..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png deleted file mode 100644 index 2449dce6a86b43b1b997ff418ed0dba56848463f..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png 
b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png deleted file mode 100644 index 1e0b40a8f7aefdf46d42761305511f281c08e595..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png deleted file mode 100755 index 158bd64b8f8729dea67834a8d591d21bce8b8564..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png deleted file mode 100644 index ce4a8bf4769183cbaff91793753d2350a3ce936c..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png deleted file mode 100644 index a3ab385845d3dc8b5c670bae91225bc8dd47a8bb..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png deleted file mode 100755 index 3c36c6d1f66eb98ea78c0673965d02a4ee3aa288..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/install/install_doc.rst b/doc/fluid/new_docs/beginners_guide/install/install_doc.rst index 
8a66a95f45ea18dbfdc2450694517d5df8c47efd..18788d2eae048ac5120b0b7afd63cd784a235798 100644 --- a/doc/fluid/new_docs/beginners_guide/install/install_doc.rst +++ b/doc/fluid/new_docs/beginners_guide/install/install_doc.rst @@ -57,7 +57,28 @@ paddlepaddle-gpu==0.11.0 使用CUDA 7.5和cuDNN 5编译的0.11.0版 您可以在 `Release History `_ 中找到paddlepaddle-gpu的各个发行版本。 -如果需要获取并安装最新的PaddlePaddle开发分支,可以从我们的 `CI系统 `_ 中下载最新的whl安装包和c-api开发包并安装。如需登录,请点击“Log in as guest”。 +如果需要获取并安装最新的(开发分支)PaddlePaddle,可以从我们的CI系统中下载最新的whl +安装包和c-api开发包并安装,您可以从下面的表格中找到需要的版本: + +如果在点击下面链接时出现如下登陆界面,点击“Log in as guest”即可开始下载: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. csv-table:: 各个版本最新的whl包 + :header: "版本说明", "cp27-cp27mu", "cp27-cp27m" + :widths: 1, 3, 3 + + "stable_cuda9.0_cudnn7", "`paddlepaddle_gpu-0.14.0-cp27-cp27mu-manylinux1_x86_64.whl `__", "`paddlepaddle_gpu-0.14.0-cp27-cp27m-manylinux1_x86_64.whl `__" + "stable_cuda8.0_cudnn7", "`paddlepaddle_gpu-0.14.0.post87-cp27-cp27mu-manylinux1_x86_64.whl `__", "`paddlepaddle_gpu-0.14.0.post87-cp27-cp27m-manylinux1_x86_64.whl `__" + "stable_cuda8.0_cudnn5", "`paddlepaddle_gpu-0.14.0.post85-cp27-cp27mu-manylinux1_x86_64.whl `__", "`paddlepaddle_gpu-0.14.0.post85-cp27-cp27m-manylinux1_x86_64.whl `__" + "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" + "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda9.0_cudnn7_avx_mkl", 
"`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" .. _FAQ: diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md index ba43ada5100ed1db7192de9c795b4b8a6596d705..9574dbea2f9a39bb196b61bb4fd12ba7c378f75a 100644 --- a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md +++ b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md @@ -1,329 +1,288 @@ -```eval_rst -.. _quick_start_fit_a_line: -``` -# 线性回归 -让我们从经典的线性回归(Linear Regression \[[1](#参考文献)\])模型开始这份教程。在这一章里,你将使用真实的数据集建立起一个房价预测模型,并且了解到机器学习中的若干重要概念。 - -本教程源代码目录在[book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。 - -## 背景介绍 -给定一个大小为`$n$`的数据集 `${\{y_{i}, x_{i1}, ..., x_{id}\}}_{i=1}^{n}$`,其中`$x_{i1}, \ldots, x_{id}$`是第`$i$`个样本`$d$`个属性上的取值,`$y_i$`是该样本待预测的目标。线性回归模型假设目标`$y_i$`可以被属性间的线性组合描述,即 - -$$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b, i=1,\ldots,n$$ - -例如,在我们将要建模的房价预测问题里,`$x_{ij}$`是描述房子`$i$`的各种属性(比如房间的个数、周围学校和医院的个数、交通状况等),而 `$y_i$`是房屋的价格。 - -初看起来,这个假设实在过于简单了,变量间的真实关系很难是线性的。但由于线性回归模型有形式简单和易于建模分析的优点,它在实际问题中得到了大量的应用。很多经典的统计学习、机器学习书籍\[[2,3,4](#参考文献)\]也选择对线性模型独立成章重点讲解。 - -## 效果展示 -我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中,每个点的横坐标表示同一类房屋真实价格的中位数,纵坐标表示线性回归模型根据特征预测的结果,当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确,则点离虚线越近。 - -![BostonHousePricePredictions](./image/predictions.png) -

图1. 预测值 V.S. 真实值

- -## 模型概览 - -### 模型定义 - -在波士顿房价数据集中,和房屋相关的值共有14个:前13个用来描述房屋相关的各种信息,即模型中的 `$x_i$`;最后一个值为我们要预测的该类房屋价格的中位数,即模型中的 `$y_i$`。因此,我们的模型就可以表示成: - -$$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$ - -`$\hat{Y}$` 表示模型的预测结果,用来和真实值`$Y$`区分。模型要学习的参数即:`$\omega_1, \ldots, \omega_{13}, b$`。 - -建立模型后,我们需要给模型一个优化目标,使得学到的参数能够让预测值`$\hat{Y}$`尽可能地接近真实值`$Y$`。这里我们引入损失函数([Loss Function](https://en.wikipedia.org/wiki/Loss_function),或Cost Function)这个概念。 输入任意一个数据样本的目标值`$y_{i}$`和模型给出的预测值`$\hat{y_{i}}$`,损失函数输出一个非负的实值。这个实值通常用来反映模型误差的大小。 - -对于线性回归模型来讲,最常见的损失函数就是均方误差(Mean Squared Error, [MSE](https://en.wikipedia.org/wiki/Mean_squared_error))了,它的形式是: - -$$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$ - -即对于一个大小为`$n$`的测试集,`$MSE$`是`$n$`个数据预测结果误差平方的均值。 - -### 训练过程 - -定义好模型结构之后,我们要通过以下几个步骤进行模型训练 -1. 初始化参数,其中包括权重`$\omega_i$`和偏置`$b$`,对其进行初始化(如0均值,1方差)。 -2. 网络正向传播计算网络输出和损失函数。 -3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。 -4. 重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。 - -## 数据集 - -### 数据集介绍 -这份数据集共506行,每行包含了波士顿郊区的一类房屋的相关信息及该类房屋价格的中位数。其各维属性的意义如下: - -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
属性名解释类型
CRIM该镇的人均犯罪率连续值
ZN占地面积超过25,000平方呎的住宅用地比例连续值
INDUS非零售商业用地比例连续值
CHAS是否邻近 Charles River离散值,1=邻近;0=不邻近
NOX一氧化氮浓度连续值
RM每栋房屋的平均客房数连续值
AGE1940年之前建成的自用单位比例连续值
DIS到波士顿5个就业中心的加权距离连续值
RAD到径向公路的可达性指数连续值
TAX全值财产税率连续值
PTRATIO学生与教师的比例连续值
B1000(BK - 0.63)^2,其中BK为黑人占比连续值
LSTAT低收入人群占比连续值
MEDV同类房屋价格的中位数连续值
-

- -### 数据预处理 -#### 连续值与离散值 -观察一下数据,我们的第一个发现是:所有的13维属性中,有12维的连续值和1维的离散值(CHAS)。离散值虽然也常使用类似0、1、2这样的数字表示,但是其含义与连续值是不同的,因为这里的差值没有实际意义。例如,我们用0、1、2来分别表示红色、绿色和蓝色的话,我们并不能因此说“蓝色和红色”比“绿色和红色”的距离更远。所以通常对一个有`$d$`个可能取值的离散属性,我们会将它们转为`$d$`个取值为0或1的二值属性或者将每个可能取值映射为一个多维向量。不过就这里而言,因为CHAS本身就是一个二值属性,就省去了这个麻烦。 - -#### 属性的归一化 -另外一个稍加观察即可发现的事实是,各维属性的取值范围差别很大(如图2所示)。例如,属性B的取值范围是[0.32, 396.90],而属性NOX的取值范围是[0.3850, 0.8170]。这里就要用到一个常见的操作-归一化(normalization)了。归一化的目标是把各位属性的取值范围放缩到差不多的区间,例如[-0.5,0.5]。这里我们使用一种很常见的操作方法:减掉均值,然后除以原取值范围。 - -做归一化(或 [Feature scaling](https://en.wikipedia.org/wiki/Feature_scaling))至少有以下3个理由: -- 过大或过小的数值范围会导致计算时的浮点上溢或下溢。 -- 不同的数值范围会导致不同属性对模型的重要性不同(至少在训练的初始阶段如此),而这个隐含的假设常常是不合理的。这会对优化的过程造成困难,使训练时间大大的加长。 -- 很多的机器学习技巧/模型(例如L1,L2正则项,向量空间模型-Vector Space Model)都基于这样的假设:所有的属性取值都差不多是以0为均值且取值范围相近的。 - -![featureScale](./image/ranges.png) -

图2. 各维属性的取值范围

- -#### 整理训练集与测试集 -我们将数据集分割为两份:一份用于调整模型的参数,即进行模型的训练,模型在这份数据集上的误差被称为**训练误差**;另外一份被用来测试,模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据,所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素:更多的训练数据会降低参数估计的方差,从而得到更可信的模型;而更多的测试数据会降低测试误差的方差,从而得到更可信的测试误差。我们这个例子中设置的分割比例为`$8:2$` - - -在更复杂的模型训练过程中,我们往往还会多使用一种数据集:验证集。因为复杂的模型中常常还有一些超参数([Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization))需要调节,所以我们会尝试多种超参数的组合来分别训练多个模型,然后对比它们在验证集上的表现选择相对最好的一组超参数,最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单,我们暂且忽略掉这个过程。 - -## 训练 - -`fit_a_line/trainer.py`演示了训练的整体过程。 - -### 配置数据提供器(Datafeeder) -首先我们引入必要的库: -```python -import paddle -import paddle.fluid as fluid -import numpy -``` - -我们通过uci_housing模块引入了数据集合[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) - -其中,在uci_housing模块中封装了: - -1. 数据下载的过程。下载数据保存在~/.cache/paddle/dataset/uci_housing/housing.data。 -2. [数据预处理](#数据预处理)的过程。 - -接下来我们定义了用于训练和测试的数据提供器。提供器每次读入一个大小为`BATCH_SIZE`的数据批次。如果用户希望加一些随机性,她可以同时定义一个批次大小和一个缓存大小。这样的话,每次数据提供器会从缓存中随机读取批次大小那么多的数据。 - -```python -BATCH_SIZE = 20 - -train_reader = paddle.batch( -paddle.reader.shuffle( -paddle.dataset.uci_housing.train(), buf_size=500), -batch_size=BATCH_SIZE) - -test_reader = paddle.batch( -paddle.reader.shuffle( -paddle.dataset.uci_housing.test(), buf_size=500), -batch_size=BATCH_SIZE) -``` - -### 配置训练程序 -训练程序的目的是定义一个训练模型的网络结构。对于线性回归来讲,它就是一个从输入到输出的简单的全连接层。更加复杂的结果,比如卷积神经网络,递归神经网络等会在随后的章节中介绍。训练程序必须返回`平均损失`作为第一个返回值,因为它会被后面反向传播算法所用到。 - -```python -def train_program(): -y = fluid.layers.data(name='y', shape=[1], dtype='float32') - -# feature vector of length 13 -x = fluid.layers.data(name='x', shape=[13], dtype='float32') -y_predict = fluid.layers.fc(input=x, size=1, act=None) - -loss = fluid.layers.square_error_cost(input=y_predict, label=y) -avg_loss = fluid.layers.mean(loss) - -return avg_loss -``` - -### 定义运算场所 -我们可以定义运算是发生在CPU还是GPU - -```python -use_cuda = False -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() -``` - -### 创建训练器 -训练器会读入一个训练程序和一些必要的其他参数: - 
-```python -trainer = fluid.Trainer( -train_func=train_program, -place=place, -optimizer_func=fluid.optimizer.SGD(learning_rate=0.001)) -``` - -### 开始提供数据 -PaddlePaddle提供了读取数据者发生器机制来读取训练数据。读取数据者会一次提供多列数据,因此我们需要一个Python的list来定义读取顺序。 - -```python -feed_order=['x', 'y'] -``` - -除此之外,可以定义一个事件相应器来处理类似`打印训练进程`的事件: - -```python -# Specify the directory path to save the parameters -params_dirname = "fit_a_line.inference.model" - -# Plot data -from paddle.v2.plot import Ploter -train_title = "Train cost" -test_title = "Test cost" -plot_cost = Ploter(train_title, test_title) - -step = 0 - -# event_handler to print training and testing info -def event_handler_plot(event): -global step -if isinstance(event, fluid.EndStepEvent): -if event.step % 10 == 0: # every 10 batches, record a test cost -test_metrics = trainer.test( -reader=test_reader, feed_order=feed_order) - -plot_cost.append(test_title, step, test_metrics[0]) -plot_cost.plot() - -if test_metrics[0] < 10.0: -# If the accuracy is good enough, we can stop the training. -print('loss is less than 10.0, stop') -trainer.stop() - -# We can save the trained parameters for the inferences later -if params_dirname is not None: -trainer.save_params(params_dirname) - -step += 1 -``` - -### 开始训练 -我们现在可以通过调用`trainer.train()`来开始训练 - -```python -%matplotlib inline - -# The training could take up to a few minutes. 
-trainer.train( -reader=train_reader, -num_epochs=100, -event_handler=event_handler_plot, -feed_order=feed_order) -``` - -![trainTestCost](./image/train_and_test.png) - -## 预测 -提供一个`inference_program`和一个`params_dirname`来初始化预测器。`params_dirname`用来存储我们的参数。 - -### 设定预测程序 -类似于`trainer.train`,预测器需要一个预测程序来做预测。我们可以稍加修改我们的训练程序来把预测值包含进来。 - - -```python -def inference_program(): -x = fluid.layers.data(name='x', shape=[13], dtype='float32') -y_predict = fluid.layers.fc(input=x, size=1, act=None) -return y_predict -``` - -### 预测 -预测器会从`params_dirname`中读取已经训练好的模型,来对从未遇见过的数据进行预测。 - -```python -inferencer = fluid.Inferencer( -infer_func=inference_program, param_path=params_dirname, place=place) - -batch_size = 10 -tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32") - -results = inferencer.infer({'x': tensor_x}) -print("infer results: ", results[0]) -``` - -## 总结 -在这章里,我们借助波士顿房价这一数据集,介绍了线性回归模型的基本概念,以及如何使用PaddlePaddle实现训练和测试的过程。很多的模型和技巧都是从简单的线性回归模型演化而来,因此弄清楚线性模型的原理和局限非常重要。 - - -## 参考文献 -1. https://en.wikipedia.org/wiki/Linear_regression -2. Friedman J, Hastie T, Tibshirani R. The elements of statistical learning[M]. Springer, Berlin: Springer series in statistics, 2001. -3. Murphy K P. Machine learning: a probabilistic perspective[M]. MIT press, 2012. -4. Bishop C M. Pattern recognition[J]. Machine Learning, 2006, 128. - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 +# 线性回归 +让我们从经典的线性回归(Linear Regression \[[1](#参考文献)\])模型开始这份教程。在这一章里,你将使用真实的数据集建立起一个房价预测模型,并且了解到机器学习中的若干重要概念。 + +本教程源代码目录在[book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/137.html)。 + +## 背景介绍 +给定一个大小为$n$的数据集 ${\{y_{i}, x_{i1}, ..., x_{id}\}}_{i=1}^{n}$,其中$x_{i1}, \ldots, x_{id}$是第$i$个样本$d$个属性上的取值,$y_i$是该样本待预测的目标。线性回归模型假设目标$y_i$可以被属性间的线性组合描述,即 + +$$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b, i=1,\ldots,n$$ + +例如,在我们将要建模的房价预测问题里,$x_{ij}$是描述房子$i$的各种属性(比如房间的个数、周围学校和医院的个数、交通状况等),而 $y_i$是房屋的价格。 + +初看起来,这个假设实在过于简单了,变量间的真实关系很难是线性的。但由于线性回归模型有形式简单和易于建模分析的优点,它在实际问题中得到了大量的应用。很多经典的统计学习、机器学习书籍\[[2,3,4](#参考文献)\]也选择对线性模型独立成章重点讲解。 + +## 效果展示 +我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中,每个点的横坐标表示同一类房屋真实价格的中位数,纵坐标表示线性回归模型根据特征预测的结果,当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确,则点离虚线越近。 +

+
+ 图1. 预测值 V.S. 真实值 +

+ +## 模型概览 + +### 模型定义 + +在波士顿房价数据集中,和房屋相关的值共有14个:前13个用来描述房屋相关的各种信息,即模型中的 $x_i$;最后一个值为我们要预测的该类房屋价格的中位数,即模型中的 $y_i$。因此,我们的模型就可以表示成: + +$$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$ + +$\hat{Y}$ 表示模型的预测结果,用来和真实值$Y$区分。模型要学习的参数即:$\omega_1, \ldots, \omega_{13}, b$。 + +建立模型后,我们需要给模型一个优化目标,使得学到的参数能够让预测值$\hat{Y}$尽可能地接近真实值$Y$。这里我们引入损失函数([Loss Function](https://en.wikipedia.org/wiki/Loss_function),或Cost Function)这个概念。 输入任意一个数据样本的目标值$y_{i}$和模型给出的预测值$\hat{y_{i}}$,损失函数输出一个非负的实值。这个实值通常用来反映模型误差的大小。 + +对于线性回归模型来讲,最常见的损失函数就是均方误差(Mean Squared Error, [MSE](https://en.wikipedia.org/wiki/Mean_squared_error))了,它的形式是: + +$$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$ + +即对于一个大小为$n$的测试集,$MSE$是$n$个数据预测结果误差平方的均值。 + +### 训练过程 + +定义好模型结构之后,我们要通过以下几个步骤进行模型训练 + 1. 初始化参数,其中包括权重$\omega_i$和偏置$b$,对其进行初始化(如0均值,1方差)。 + 2. 网络正向传播计算网络输出和损失函数。 + 3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。 + 4. 重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。 + +## 数据集 + +### 数据集介绍 +这份数据集共506行,每行包含了波士顿郊区的一类房屋的相关信息及该类房屋价格的中位数。其各维属性的意义如下: + +| 属性名 | 解释 | 类型 | +| ------| ------ | ------ | +| CRIM | 该镇的人均犯罪率 | 连续值 | +| ZN | 占地面积超过25,000平方呎的住宅用地比例 | 连续值 | +| INDUS | 非零售商业用地比例 | 连续值 | +| CHAS | 是否邻近 Charles River | 离散值,1=邻近;0=不邻近 | +| NOX | 一氧化氮浓度 | 连续值 | +| RM | 每栋房屋的平均客房数 | 连续值 | +| AGE | 1940年之前建成的自用单位比例 | 连续值 | +| DIS | 到波士顿5个就业中心的加权距离 | 连续值 | +| RAD | 到径向公路的可达性指数 | 连续值 | +| TAX | 全值财产税率 | 连续值 | +| PTRATIO | 学生与教师的比例 | 连续值 | +| B | 1000(BK - 0.63)^2,其中BK为黑人占比 | 连续值 | +| LSTAT | 低收入人群占比 | 连续值 | +| MEDV | 同类房屋价格的中位数 | 连续值 | + +### 数据预处理 +#### 连续值与离散值 +观察一下数据,我们的第一个发现是:所有的13维属性中,有12维的连续值和1维的离散值(CHAS)。离散值虽然也常使用类似0、1、2这样的数字表示,但是其含义与连续值是不同的,因为这里的差值没有实际意义。例如,我们用0、1、2来分别表示红色、绿色和蓝色的话,我们并不能因此说“蓝色和红色”比“绿色和红色”的距离更远。所以通常对一个有$d$个可能取值的离散属性,我们会将它们转为$d$个取值为0或1的二值属性或者将每个可能取值映射为一个多维向量。不过就这里而言,因为CHAS本身就是一个二值属性,就省去了这个麻烦。 + +#### 属性的归一化 +另外一个稍加观察即可发现的事实是,各维属性的取值范围差别很大(如图2所示)。例如,属性B的取值范围是[0.32, 396.90],而属性NOX的取值范围是[0.3850, 
0.8170]。这里就要用到一个常见的操作-归一化(normalization)了。归一化的目标是把各维属性的取值范围放缩到差不多的区间,例如[-0.5,0.5]。这里我们使用一种很常见的操作方法:减掉均值,然后除以原取值范围。 + +做归一化(或 [Feature scaling](https://en.wikipedia.org/wiki/Feature_scaling))至少有以下3个理由: +- 过大或过小的数值范围会导致计算时的浮点上溢或下溢。 +- 不同的数值范围会导致不同属性对模型的重要性不同(至少在训练的初始阶段如此),而这个隐含的假设常常是不合理的。这会对优化的过程造成困难,使训练时间大大地加长。 +- 很多的机器学习技巧/模型(例如L1,L2正则项,向量空间模型-Vector Space Model)都基于这样的假设:所有的属性取值都差不多是以0为均值且取值范围相近的。 + +

+
+ 图2. 各维属性的取值范围 +

+ +#### 整理训练集与测试集 +我们将数据集分割为两份:一份用于调整模型的参数,即进行模型的训练,模型在这份数据集上的误差被称为**训练误差**;另外一份被用来测试,模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据,所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素:更多的训练数据会降低参数估计的方差,从而得到更可信的模型;而更多的测试数据会降低测试误差的方差,从而得到更可信的测试误差。我们这个例子中设置的分割比例为$8:2$ + + +在更复杂的模型训练过程中,我们往往还会多使用一种数据集:验证集。因为复杂的模型中常常还有一些超参数([Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization))需要调节,所以我们会尝试多种超参数的组合来分别训练多个模型,然后对比它们在验证集上的表现选择相对最好的一组超参数,最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单,我们暂且忽略掉这个过程。 + +## 训练 + +`fit_a_line/trainer.py`演示了训练的整体过程。 + +### 配置数据提供器(Datafeeder) +首先我们引入必要的库: +```python +import paddle +import paddle.fluid as fluid +import numpy +from __future__ import print_function +``` + +我们通过uci_housing模块引入了数据集合[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) + +其中,在uci_housing模块中封装了: + +1. 数据下载的过程。下载数据保存在~/.cache/paddle/dataset/uci_housing/housing.data。 +2. [数据预处理](#数据预处理)的过程。 + +接下来我们定义了用于训练和测试的数据提供器。提供器每次读入一个大小为`BATCH_SIZE`的数据批次。如果用户希望加一些随机性,她可以同时定义一个批次大小和一个缓存大小。这样的话,每次数据提供器会从缓存中随机读取批次大小那么多的数据。 + +```python +BATCH_SIZE = 20 + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=BATCH_SIZE) + +test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.test(), buf_size=500), + batch_size=BATCH_SIZE) +``` + +### 配置训练程序 +训练程序的目的是定义一个训练模型的网络结构。对于线性回归来讲,它就是一个从输入到输出的简单的全连接层。更加复杂的结果,比如卷积神经网络,递归神经网络等会在随后的章节中介绍。训练程序必须返回`平均损失`作为第一个返回值,因为它会被后面反向传播算法所用到。 + +```python +def train_program(): + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + + # feature vector of length 13 + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + + loss = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_loss = fluid.layers.mean(loss) + + return avg_loss +``` + +### Optimizer Function 配置 + +在下面的 `SGD optimizer`,`learning_rate` 是训练的速度,与网络的训练收敛速度有关系。 + +```python +def 
optimizer_program(): + return fluid.optimizer.SGD(learning_rate=0.001) +``` + +### 定义运算场所 +我们可以定义运算是发生在CPU还是GPU + +```python +use_cuda = False +place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() +``` + +### 创建训练器 +训练器会读入一个训练程序和一些必要的其他参数: + +```python +trainer = fluid.Trainer( + train_func=train_program, + place=place, + optimizer_func=optimizer_program) +``` + +### 开始提供数据 +PaddlePaddle提供了读取数据者发生器机制来读取训练数据。读取数据者会一次提供多列数据,因此我们需要一个Python的list来定义读取顺序。 + +```python +feed_order=['x', 'y'] +``` + +除此之外,可以定义一个事件相应器来处理类似`打印训练进程`的事件: + +```python +# Specify the directory to save the parameters +params_dirname = "fit_a_line.inference.model" + +# Plot data +from paddle.v2.plot import Ploter +train_title = "Train cost" +test_title = "Test cost" +plot_cost = Ploter(train_title, test_title) + +step = 0 + +# event_handler prints training and testing info +def event_handler_plot(event): + global step + if isinstance(event, fluid.EndStepEvent): + if step % 10 == 0: # record a train cost every 10 batches + plot_cost.append(train_title, step, event.metrics[0]) + + if step % 100 == 0: # record a test cost every 100 batches + test_metrics = trainer.test( + reader=test_reader, feed_order=feed_order) + plot_cost.append(test_title, step, test_metrics[0]) + plot_cost.plot() + + if test_metrics[0] < 10.0: + # If the accuracy is good enough, we can stop the training. + print('loss is less than 10.0, stop') + trainer.stop() + step += 1 + + if isinstance(event, fluid.EndEpochEvent): + if event.epoch % 10 == 0: + # We can save the trained parameters for the inferences later + if params_dirname is not None: + trainer.save_params(params_dirname) +``` + +### 开始训练 +我们现在可以通过调用`trainer.train()`来开始训练 + +```python +%matplotlib inline + +# The training could take up to a few minutes. +trainer.train( + reader=train_reader, + num_epochs=100, + event_handler=event_handler_plot, + feed_order=feed_order) +``` +
+
+图3 训练结果 +
+ + +## 预测 +提供一个`inference_program`和一个`params_dirname`来初始化预测器。`params_dirname`用来存储我们的参数。 + +### 设定预测程序 +类似于`trainer.train`,预测器需要一个预测程序来做预测。我们可以稍加修改我们的训练程序来把预测值包含进来。 + + +```python +def inference_program(): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + return y_predict +``` + +### 预测 +预测器会从`params_dirname`中读取已经训练好的模型,来对从未遇见过的数据进行预测。 + +```python +inferencer = fluid.Inferencer( + infer_func=inference_program, param_path=params_dirname, place=place) + +batch_size = 10 +test_reader = paddle.batch(paddle.dataset.uci_housing.test(),batch_size=batch_size) +test_data = test_reader().next() +test_x = numpy.array([data[0] for data in test_data]).astype("float32") +test_y = numpy.array([data[1] for data in test_data]).astype("float32") + +results = inferencer.infer({'x': test_x}) + +print("infer results: (House Price)") +for idx, val in enumerate(results[0]): + print("%d: %.2f" % (idx, val)) + +print("\nground truth:") +for idx, val in enumerate(test_y): + print("%d: %.2f" % (idx, val)) +``` + +## 总结 +在这章里,我们借助波士顿房价这一数据集,介绍了线性回归模型的基本概念,以及如何使用PaddlePaddle实现训练和测试的过程。很多的模型和技巧都是从简单的线性回归模型演化而来,因此弄清楚线性模型的原理和局限非常重要。 + + +## 参考文献 +1. https://en.wikipedia.org/wiki/Linear_regression +2. Friedman J, Hastie T, Tibshirani R. The elements of statistical learning[M]. Springer, Berlin: Springer series in statistics, 2001. +3. Murphy K P. Machine learning: a probabilistic perspective[M]. MIT press, 2012. +4. Bishop C M. Pattern recognition[J]. Machine Learning, 2006, 128. + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png deleted file mode 100644 index 27e4acb1313794f52ad9ad9e874cdadd197ff41f..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png deleted file mode 100644 index 5d86b12715f46afbafb7d50e2938e184219b5b95..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test.png deleted file mode 100644 index bcd304a6a0baf30ecfbc43e08fc0aca179d05958..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md index c04a949a3f6550048f2a3447070829aeb640b995..e6f89b23a95d1a07565f3e0a285e9c3f921930df 100644 --- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md +++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md @@ -1,453 +1,447 @@ -# 识别数字 - -本教程源代码目录在[book/recognize_digits](https://github.com/PaddlePaddle/book/tree/develop/02.recognize_digits), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)。 - -## 背景介绍 -当我们学习编程的时候,编写的第一个程序一般是实现打印"Hello 
World"。而机器学习(或深度学习)的入门教程,一般都是 [MNIST](http://yann.lecun.com/exdb/mnist/) 数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题,比较简单,同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集,包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵,标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。 - -![MNIST](./image/mnist_example_image.png) -

图1. MNIST图片示例

- -MNIST数据集是从 [NIST](https://www.nist.gov/srd/nist-special-database-19) 的Special Database 3(SD-3)和Special Database 1(SD-1)构建而来。由于SD-3是由美国人口调查局的员工进行标注,SD-1是由美国高中生进行标注,因此SD-3比SD-1更干净也更容易识别。Yann LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集(60000条数据)和测试集(10000条数据),其中训练集来自250位不同的标注员,此外还保证了训练集和测试集的标注员是不完全相同的。 - -Yann LeCun早先在手写字符识别上做了很多研究,并在研究过程中提出了卷积神经网络(Convolutional Neural Network),大幅度地提高了手写字符的识别能力,也因此成为了深度学习领域的奠基人之一。如今的深度学习领域,卷积神经网络占据了至关重要的地位,从最早Yann LeCun提出的简单LeNet,到如今ImageNet大赛上的优胜模型VGGNet、GoogLeNet、ResNet等(请参见[图像分类](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification) 教程),人们在图像分类领域,利用卷积神经网络得到了一系列惊人的结果。 - -有很多算法在MNIST上进行实验。1998年,LeCun分别用单层线性分类器、多层感知器(Multilayer Perceptron, MLP)和多层卷积神经网络LeNet进行实验,使得测试集上的误差不断下降(从12%下降到0.7%)\[[1](#参考文献)\]。此后,科学家们又基于K近邻(K-Nearest Neighbors)算法\[[2](#参考文献)\]、支持向量机(SVM)\[[3](#参考文献)\]、神经网络\[[4-7](#参考文献)\]和Boosting方法\[[8](#参考文献)\]等做了大量实验,并采用多种预处理方法(如去除歪曲、去噪、模糊等)来提高识别的准确率。 - -本教程中,我们从简单的模型Softmax回归开始,带大家入门手写字符识别,并逐步进行模型优化。 - - -## 模型概览 - -基于MNIST数据训练一个分类器,在介绍本教程使用的三个基本图像分类网络前,我们先给出一些定义: -- `$X$`是输入:MNIST图片是`$28\times28$` 的二维图像,为了进行计算,我们将其转化为`$784$`维向量,即`$X=\left ( x_0, x_1, \dots, x_{783} \right )$`。 -- `$Y$`是输出:分类器的输出是10类数字(0-9),即`$Y=\left ( y_0, y_1, \dots, y_9 \right )$`,每一维`$y_i$`代表图片分类为第`$i$`类数字的概率。 -- `$L$`是图片的真实标签:`$L=\left ( l_0, l_1, \dots, l_9 \right )$`也是10维,但只有一维为1,其他都为0。 - -### Softmax回归(Softmax Regression) - -最简单的Softmax回归模型是先将输入层经过一个全连接层得到的特征,然后直接通过softmax 函数进行多分类\[[9](#参考文献)\]。 - -输入层的数据`$X$`传到输出层,在激活操作之前,会乘以相应的权重 `$W$` ,并加上偏置变量 `$b$` ,具体如下: - -$$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$ - -其中 `$ \text{softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}} $` - -对于有 `$N$` 个类别的多分类问题,指定 `$N$` 个输出节点,`$N$` 维结果向量经过softmax将归一化为 `$N$` 个[0,1]范围内的实数值,分别表示该样本属于这 `$N$` 个类别的概率。此处的 `$y_i$` 即对应该图片为数字 `$i$` 的预测概率。 - -在分类问题中,我们一般采用交叉熵代价损失函数(cross entropy),公式如下: - -$$ \text{crossentropy}(label, y) = -\sum_i label_ilog(y_i) $$ - -图2为softmax回归的网络图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。 - 
-![softmaxRegression](./image/softmax_regression.png) -

图2. softmax回归网络结构图

- -### 多层感知器(Multilayer Perceptron, MLP) - -Softmax回归模型采用了最简单的两层神经网络,即只有输入层和输出层,因此其拟合能力有限。为了达到更好的识别效果,我们考虑在输入层和输出层中间加上若干个隐藏层\[[10](#参考文献)\]。 - -1. 经过第一个隐藏层,可以得到 `$ H_1 = \phi(W_1X + b_1) $`,其中`$\phi$`代表激活函数,常见的有sigmoid、tanh或ReLU等函数。 -2. 经过第二个隐藏层,可以得到 `$ H_2 = \phi(W_2H_1 + b_2) $`。 -3. 最后,再经过输出层,得到的`$Y=\text{softmax}(W_3H_2 + b_3)$`,即为最后的分类结果向量。 - - -图3为多层感知器的网络结构图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。 - -![multilayerPerceptron](./image/mlp.png) -

图3. 多层感知器网络结构图

- -### 卷积神经网络(Convolutional Neural Network, CNN) - -在多层感知器模型中,将图像展开成一维向量输入到网络中,忽略了图像的位置和结构信息,而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图4显示了其结构:输入的二维图像,先经过两次卷积层到池化层,再经过全连接层,最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。 - -![cnnStructure](./image/cnn.png) -

图4. LeNet-5卷积神经网络结构

- -#### 卷积层 - -卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积,即离散二维滤波器(也称作卷积核)与二维图像做卷积操作,简单的讲是二维滤波器滑动到二维图像上所有位置,并在每个位置上与该像素点及其领域像素点做内积。卷积操作被广泛应用与图像处理领域,不同卷积核可以提取不同的特征,例如边沿、线性、角等特征。在深层卷积神经网络中,通过卷积操作可以提取出图像低级到复杂的特征。 - -![cnn](https://raw.githubusercontent.com/PaddlePaddle/book/develop/02.recognize_digits/image/conv_layer.png) -

图5. 卷积层图片

- -图5给出一个卷积计算过程的示例图,输入图像大小为`$H=5,W=5,D=3$`,即`$5 \times 5$`大小的3通道(RGB,也称作深度)彩色图像。这个示例图中包含两(用`$K$`表示)组卷积核,即图中滤波器`$W_0$`和`$W_1$`。在卷积计算中,通常对不同的输入通道采用不同的卷积核,如图示例中每组卷积核包含(`$D=3$`)个`$3 \times 3$`(用`$F \times F$`表示)大小的卷积核。另外,这个示例中卷积核在图像的水平方向(`$W$`方向)和垂直方向(`$H$`方向)的滑动步长为2(用`$S$`表示);对输入图像周围各填充1(用`$P$`表示)个0,即图中输入层原始数据为蓝色部分,灰色部分是进行了大小为1的扩展,用0来进行扩展。经过卷积操作得到输出为`$3 \times 3 \times 2$`(用`$H_{o} \times W_{o} \times K$`表示)大小的特征图,即`$3 \times 3$`大小的2通道特征图,其中`$H_o$`计算公式为:`$H_o = (H - F + 2 \times P)/S + 1$`,`$W_o$`同理。 而输出特征图中的每个像素,是每组滤波器与输入图像每个特征图的内积再求和,再加上偏置`$b_o$`,偏置通常对于每个输出特征图是共享的。输出特征图`$o[:,:,0]$`中的最后一个`$-2$`计算如图5右下角公式所示。 - -在卷积操作中卷积核是可学习的参数,经过上面示例介绍,每层卷积的参数大小为`$D \times F \times F \times K$`。在多层感知器模型中,神经元通常是全部连接,参数较多。而卷积层的参数较少,这也是由卷积层的主要特性即局部连接和共享权重所决定。 - -- 局部连接:每个神经元仅与输入神经元的一块区域连接,这块局部区域称作感受野(receptive field)。在图像卷积操作中,即神经元在空间维度(spatial dimension,即上图示例H和W所在的平面)是局部连接,但在深度上是全部连接。对于二维图像本身而言,也是局部像素关联较强。这种局部连接保证了学习后的过滤器能够对于局部的输入特征有最强的响应。局部连接的思想,也是受启发于生物学里面的视觉系统结构,视觉皮层的神经元就是局部接受信息的。 - -- 权重共享:计算同一个深度切片的神经元时采用的滤波器是共享的。例如图4中计算`$o[:,:,0]$`的每个每个神经元的滤波器均相同,都为`$W_0$`,这样可以很大程度上减少参数。共享权重在一定程度上讲是有意义的,例如图片的底层边缘特征与特征在图中的具体位置无关。但是在一些场景中是无意的,比如输入的图片是人脸,眼睛和头发位于不同的位置,希望在不同的位置学到不同的特征 (参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/))。请注意权重只是对于同一深度切片的神经元是共享的,在卷积层,通常采用多组卷积核提取不同特征,即对应不同深度切片的特征,不同深度切片的神经元权重是不共享。另外,偏重对同一深度切片的所有神经元都是共享的。 - -通过介绍卷积计算过程及其特性,可以看出卷积是线性操作,并具有平移不变性(shift-invariant),平移不变性即在图像每个位置执行相同的操作。卷积层的局部连接和权重共享使得需要学习的参数大大减小,这样也有利于训练较大卷积神经网络。 - -#### 池化层 - -![pooling](./image/max_pooling.png) -

图6. 池化层图片

- -池化是非线性下采样的一种形式,主要作用是通过减少网络的参数来减小计算量,并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域,对于每个矩形框的数取最大值作为输出层,如图6所示。 - -更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md)教程。 - -### 常见激活函数介绍 -- sigmoid激活函数: `$ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $` - -- tanh激活函数: `$ f(x) = tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $` - -实际上,tanh函数只是规模变化的sigmoid函数,将sigmoid函数值放大2倍之后再向下平移1个单位:tanh(x) = 2sigmoid(2x) - 1 。 - -- ReLU激活函数: `$ f(x) = max(0, x) $` - -更详细的介绍请参考[维基百科激活函数](https://en.wikipedia.org/wiki/Activation_function)。 - -## 数据介绍 - -PaddlePaddle在API中提供了自动加载[MNIST](http://yann.lecun.com/exdb/mnist/)数据的模块`paddle.dataset.mnist`。加载后的数据位于`/home/username/.cache/paddle/dataset/mnist`下: - -

- - - - - - - - - - - - - - - - - - - - - - - - - - -
文件名称说明
train-images-idx3-ubyte训练数据图片,60,000条数据
train-labels-idx1-ubyte训练数据标签,60,000条数据
t10k-images-idx3-ubyte测试数据图片,10,000条数据
t10k-labels-idx1-ubyte测试数据标签,10,000条数据
-

- -## Fluid API 概述 - -演示将使用最新的 `Fluid API`。Fluid API是最新的 PaddlePaddle API。它在不牺牲性能的情况下简化了模型配置。 -我们建议使用 Fluid API,因为它更容易学起来。 - -下面是快速的 Fluid API 概述。 -1. `inference_program`:指定如何从数据输入中获得预测的函数。 -这是指定网络流的地方。 - -1. `train_program`:指定如何从 `inference_program` 和`标签值`中获取 `loss` 的函数。 -这是指定损失计算的地方。 - -1. `optimizer_func`: “指定优化器配置的函数。优化器负责减少损失并驱动培训。Paddle 支持多种不同的优化器。 - -1. `Trainer`:PaddlePaddle Trainer 管理由 `train_program` 和 `optimizer` 指定的训练过程。 -通过 `event_handler` 回调函数,用户可以监控培训的进展。 - -1. `Inferencer`:Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。 -然后,它可以推断数据和返回预测。 - -在这个演示中,我们将深入了解它们。 - -## 配置说明 -加载 PaddlePaddle 的 Fluid API 包。 - -```python -import paddle -import paddle.fluid as fluid -``` - -### Program Functions 配置 - -我们需要设置“推理程序”函数。我们想用这个程序来演示三个不同的分类器,每个分类器都定义为 Python 函数。 -我们需要将图像数据馈送到分类器。Paddle 为读取数据提供了一个特殊的层 `layer.data` 层。 -让我们创建一个数据层来读取图像并将其连接到分类网络。 - -- Softmax回归:只通过一层简单的以softmax为激活函数的全连接层,就可以得到分类的结果。 - -```python -def softmax_regression(): -img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') -predict = fluid.layers.fc( -input=img, size=10, act='softmax') -return predict -``` - -- 多层感知器:下面代码实现了一个含有两个隐藏层(即全连接层)的多层感知器。其中两个隐藏层的激活函数均采用ReLU,输出层的激活函数用Softmax。 - -```python -def multilayer_perceptron(): -img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') -# 第一个全连接层,激活函数为ReLU -hidden = fluid.layers.fc(input=img, size=200, act='relu') -# 第二个全连接层,激活函数为ReLU -hidden = fluid.layers.fc(input=hidden, size=200, act='relu') -# 以softmax为激活函数的全连接输出层,输出层的大小必须为数字的个数10 -prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') -return prediction -``` - -- 卷积神经网络LeNet-5: 输入的二维图像,首先经过两次卷积层到池化层,再经过全连接层,最后使用以softmax为激活函数的全连接层作为输出层。 - -```python -def convolutional_neural_network(): -img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') -# 第一个卷积-池化层 -conv_pool_1 = fluid.nets.simple_img_conv_pool( -input=img, -filter_size=5, -num_filters=20, -pool_size=2, -pool_stride=2, -act="relu") -conv_pool_1 = 
fluid.layers.batch_norm(conv_pool_1) -# 第二个卷积-池化层 -conv_pool_2 = fluid.nets.simple_img_conv_pool( -input=conv_pool_1, -filter_size=5, -num_filters=50, -pool_size=2, -pool_stride=2, -act="relu") -# 以softmax为激活函数的全连接输出层,输出层的大小必须为数字的个数10 -prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') -return prediction -``` - -#### Train Program 配置 -然后我们需要设置训练程序 `train_program`。它首先从分类器中进行预测。 -在训练期间,它将从预测中计算 `avg_cost`。 - -**注意:** 训练程序应该返回一个数组,第一个返回参数必须是 `avg_cost`。训练器使用它来计算梯度。 - -请随意修改代码,测试 Softmax 回归 `softmax_regression`, `MLP` 和 卷积神经网络 `convolutional neural network` 分类器之间的不同结果。 - -```python -def train_program(): -label = fluid.layers.data(name='label', shape=[1], dtype='int64') - -# predict = softmax_regression() # uncomment for Softmax回归 -# predict = multilayer_perceptron() # uncomment for 多层感知器 -predict = convolutional_neural_network() # uncomment for LeNet5卷积神经网络 -cost = fluid.layers.cross_entropy(input=predict, label=label) -avg_cost = fluid.layers.mean(cost) -acc = fluid.layers.accuracy(input=predict, label=label) -return [avg_cost, acc] - - -# 该模型运行在单个CPU上 -``` - -#### Optimizer Function 配置 - -在下面的 `Adam optimizer`,`learning_rate` 是训练的速度,与网络的训练收敛速度有关系。 - -```python -def optimizer_program(): -return fluid.optimizer.Adam(learning_rate=0.001) -``` - -### 数据集 Feeders 配置 - -下一步,我们开始训练过程。`paddle.dataset.movielens.train()`和`paddle.dataset.movielens.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数,每次调用的时候返回一个Python yield generator。 - -下面`shuffle`是一个reader decorator,它接受一个reader A,返回另一个reader B —— reader B 每次读入`buffer_size`条训练数据到一个buffer里,然后随机打乱其顺序,并且逐条输出。 - -`batch`是一个特殊的decorator,它的输入是一个reader,输出是一个batched reader —— 在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minibatch。 - -```python -train_reader = paddle.batch( -paddle.reader.shuffle( -paddle.dataset.mnist.train(), buf_size=500), -batch_size=64) - -test_reader = paddle.batch( -paddle.dataset.mnist.test(), batch_size=64) -``` - -### Trainer 配置 - -现在,我们需要配置 
`Trainer`。`Trainer` 需要接受训练程序 `train_program`, `place` 和优化器 `optimizer`。 - -```python -# 该模型运行在单个CPU上 -use_cuda = False # set to True if training with GPU -place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - -trainer = fluid.Trainer( -train_func=train_program, place=place, optimizer_func=optimizer_program) -``` - -#### Event Handler 配置 - -Fluid API 在训练期间为回调函数提供了一个钩子。用户能够通过机制监控培训进度。 -我们将在这里演示两个 `event_handler` 程序。请随意修改 Jupyter 笔记本 ,看看有什么不同。 - -`event_handler` 用来在训练过程中输出训练结果 - -```python -# Save the parameter into a directory. The Inferencer can load the parameters from it to do infer -params_dirname = "recognize_digits_network.inference.model" -lists = [] -def event_handler(event): -if isinstance(event, fluid.EndStepEvent): -if event.step % 100 == 0: -# event.metrics maps with train program return arguments. -# event.metrics[0] will yeild avg_cost and event.metrics[1] will yeild acc in this example. -print "Pass %d, Batch %d, Cost %f" % ( -event.step, event.epoch, event.metrics[0]) - -if isinstance(event, fluid.EndEpochEvent): -avg_cost, acc = trainer.test( -reader=test_reader, feed_order=['img', 'label']) - -print("Test with Epoch %d, avg_cost: %s, acc: %s" % (event.epoch, avg_cost, acc)) - -# save parameters -trainer.save_params(params_dirname) -lists.append((event.epoch, avg_cost, acc)) -``` - -`event_handler_plot` 可以用来在训练过程中画图如下: - -![png](./image/train_and_test.png) - -```python -from paddle.v2.plot import Ploter - -train_title = "Train cost" -test_title = "Test cost" -cost_ploter = Ploter(train_title, test_title) -step = 0 -lists = [] - -# event_handler to plot a figure -def event_handler_plot(event): -global step -if isinstance(event, fluid.EndStepEvent): -if step % 100 == 0: -# event.metrics maps with train program return arguments. -# event.metrics[0] will yeild avg_cost and event.metrics[1] will yeild acc in this example. 
-cost_ploter.append(train_title, step, event.metrics[0]) -cost_ploter.plot() -step += 1 -if isinstance(event, fluid.EndEpochEvent): -# save parameters -trainer.save_params(params_dirname) - -avg_cost, acc = trainer.test( -reader=test_reader, feed_order=['img', 'label']) -cost_ploter.append(test_title, step, avg_cost) -lists.append((event.epoch, avg_cost, acc)) -``` - -#### 开始训练 - -既然我们设置了 `event_handler` 和 `data reader`,我们就可以开始训练模型了。 - -`feed_order` 用于将数据目录映射到 `train_program` - -```python -trainer.train( -num_epochs=5, -event_handler=event_handler, -reader=train_reader, -feed_order=['img', 'label']) -``` - -训练过程是完全自动的,event_handler里打印的日志类似如下所示: - -``` -Pass 0, Batch 0, Cost 0.125650 -Pass 100, Batch 0, Cost 0.161387 -Pass 200, Batch 0, Cost 0.040036 -Pass 300, Batch 0, Cost 0.023391 -Pass 400, Batch 0, Cost 0.005856 -Pass 500, Batch 0, Cost 0.003315 -Pass 600, Batch 0, Cost 0.009977 -Pass 700, Batch 0, Cost 0.020959 -Pass 800, Batch 0, Cost 0.105560 -Pass 900, Batch 0, Cost 0.239809 -Test with Epoch 0, avg_cost: 0.053097883707459624, acc: 0.9822850318471338 -``` - -训练之后,检查模型的预测准确度。用 MNIST 训练的时候,一般 softmax回归模型的分类准确率为约为 92.34%,多层感知器为97.66%,卷积神经网络可以达到 99.20%。 - - -## 应用模型 - -可以使用训练好的模型对手写体数字图片进行分类,下面程序展示了如何使用 `fluid.Inferencer` 接口进行推断。 - -### Inference 配置 - -`Inference` 需要一个 `infer_func` 和 `param_path` 来设置网络和经过训练的参数。 -我们可以简单地插入在此之前定义的分类器。 - -```python -inferencer = fluid.Inferencer( -# infer_func=softmax_regression, # uncomment for softmax regression -# infer_func=multilayer_perceptron, # uncomment for MLP -infer_func=convolutional_neural_network, # uncomment for LeNet5 -param_path=params_dirname, -place=place) -``` - -### 生成预测输入数据 - -`infer_3.png` 是数字 3 的一个示例图像。把它变成一个 numpy 数组以匹配数据馈送格式。 - -```python -# Prepare the test image -import os -import numpy as np -from PIL import Image -def load_image(file): -im = Image.open(file).convert('L') -im = im.resize((28, 28), Image.ANTIALIAS) -im = np.array(im).reshape(1, 1, 28, 28).astype(np.float32) -im = im / 255.0 * 2.0 - 1.0 
-return im - -cur_dir = cur_dir = os.getcwd() -img = load_image(cur_dir + '/image/infer_3.png') -``` - -### 预测 - -现在我们准备做预测。 - -```python -results = inferencer.infer({'img': img}) -lab = np.argsort(results) # probs and lab are the results of one batch data -print "Label of image/infer_3.png is: %d" % lab[0][0][-1] -``` - -## 总结 - -本教程的softmax回归、多层感知器和卷积神经网络是最基础的深度学习模型,后续章节中复杂的神经网络都是从它们衍生出来的,因此这几个模型对之后的学习大有裨益。同时,我们也观察到从最简单的softmax回归变换到稍复杂的卷积神经网络的时候,MNIST数据集上的识别准确率有了大幅度的提升,原因是卷积层具有局部连接和共享权重的特性。在之后学习新模型的时候,希望大家也要深入到新模型相比原模型带来效果提升的关键之处。此外,本教程还介绍了PaddlePaddle模型搭建的基本流程,从dataprovider的编写、网络层的构建,到最后的训练和预测。对这个流程熟悉以后,大家就可以用自己的数据,定义自己的网络模型,并完成自己的训练和预测任务了。 - -## 参考文献 - -1. LeCun, Yann, Léon Bottou, Yoshua Bengio, and Patrick Haffner. ["Gradient-based learning applied to document recognition."](http://ieeexplore.ieee.org/abstract/document/726791/) Proceedings of the IEEE 86, no. 11 (1998): 2278-2324. -2. Wejéus, Samuel. ["A Neural Network Approach to Arbitrary SymbolRecognition on Modern Smartphones."](http://www.diva-portal.org/smash/record.jsf?pid=diva2%3A753279&dswid=-434) (2014). -3. Decoste, Dennis, and Bernhard Schölkopf. ["Training invariant support vector machines."](http://link.springer.com/article/10.1023/A:1012454411458) Machine learning 46, no. 1-3 (2002): 161-190. -4. Simard, Patrice Y., David Steinkraus, and John C. Platt. ["Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.8494&rep=rep1&type=pdf) In ICDAR, vol. 3, pp. 958-962. 2003. -5. Salakhutdinov, Ruslan, and Geoffrey E. Hinton. ["Learning a Nonlinear Embedding by Preserving Class Neighbourhood Structure."](http://www.jmlr.org/proceedings/papers/v2/salakhutdinov07a/salakhutdinov07a.pdf) In AISTATS, vol. 11. 2007. -6. Cireşan, Dan Claudiu, Ueli Meier, Luca Maria Gambardella, and Jürgen Schmidhuber. 
["Deep, big, simple neural nets for handwritten digit recognition."](http://www.mitpressjournals.org/doi/abs/10.1162/NECO_a_00052) Neural computation 22, no. 12 (2010): 3207-3220. -7. Deng, Li, Michael L. Seltzer, Dong Yu, Alex Acero, Abdel-rahman Mohamed, and Geoffrey E. Hinton. ["Binary coding of speech spectrograms using a deep auto-encoder."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.185.1908&rep=rep1&type=pdf) In Interspeech, pp. 1692-1695. 2010. -8. Kégl, Balázs, and Róbert Busa-Fekete. ["Boosting products of base classifiers."](http://dl.acm.org/citation.cfm?id=1553439) In Proceedings of the 26th Annual International Conference on Machine Learning, pp. 497-504. ACM, 2009. -9. Rosenblatt, Frank. ["The perceptron: A probabilistic model for information storage and organization in the brain."](http://psycnet.apa.org/journals/rev/65/6/386/) Psychological review 65, no. 6 (1958): 386. -10. Bishop, Christopher M. ["Pattern recognition."](http://users.isr.ist.utl.pt/~wurmd/Livros/school/Bishop%20-%20Pattern%20Recognition%20And%20Machine%20Learning%20-%20Springer%20%202006.pdf) Machine Learning 128 (2006): 1-58. - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 +# 识别数字 + +本教程源代码目录在[book/recognize_digits](https://github.com/PaddlePaddle/book/tree/develop/02.recognize_digits), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/167.html)。 + +## 背景介绍 +当我们学习编程的时候,编写的第一个程序一般是实现打印"Hello World"。而机器学习(或深度学习)的入门教程,一般都是 [MNIST](http://yann.lecun.com/exdb/mnist/) 数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题,比较简单,同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集,包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵,标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。 + +

+
+图1. MNIST图片示例 +

+ +MNIST数据集是从 [NIST](https://www.nist.gov/srd/nist-special-database-19) 的Special Database 3(SD-3)和Special Database 1(SD-1)构建而来。由于SD-3是由美国人口调查局的员工进行标注,SD-1是由美国高中生进行标注,因此SD-3比SD-1更干净也更容易识别。Yann LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集(60000条数据)和测试集(10000条数据),其中训练集来自250位不同的标注员,此外还保证了训练集和测试集的标注员是不完全相同的。 + +Yann LeCun早先在手写字符识别上做了很多研究,并在研究过程中提出了卷积神经网络(Convolutional Neural Network),大幅度地提高了手写字符的识别能力,也因此成为了深度学习领域的奠基人之一。如今的深度学习领域,卷积神经网络占据了至关重要的地位,从最早Yann LeCun提出的简单LeNet,到如今ImageNet大赛上的优胜模型VGGNet、GoogLeNet、ResNet等(请参见[图像分类](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification) 教程),人们在图像分类领域,利用卷积神经网络得到了一系列惊人的结果。 + +有很多算法在MNIST上进行实验。1998年,LeCun分别用单层线性分类器、多层感知器(Multilayer Perceptron, MLP)和多层卷积神经网络LeNet进行实验,使得测试集上的误差不断下降(从12%下降到0.7%)\[[1](#参考文献)\]。此后,科学家们又基于K近邻(K-Nearest Neighbors)算法\[[2](#参考文献)\]、支持向量机(SVM)\[[3](#参考文献)\]、神经网络\[[4-7](#参考文献)\]和Boosting方法\[[8](#参考文献)\]等做了大量实验,并采用多种预处理方法(如去除歪曲、去噪、模糊等)来提高识别的准确率。 + +本教程中,我们从简单的模型Softmax回归开始,带大家入门手写字符识别,并逐步进行模型优化。 + + +## 模型概览 + +基于MNIST数据训练一个分类器,在介绍本教程使用的三个基本图像分类网络前,我们先给出一些定义: +- $X$是输入:MNIST图片是$28\times28$ 的二维图像,为了进行计算,我们将其转化为$784$维向量,即$X=\left ( x_0, x_1, \dots, x_{783} \right )$。 +- $Y$是输出:分类器的输出是10类数字(0-9),即$Y=\left ( y_0, y_1, \dots, y_9 \right )$,每一维$y_i$代表图片分类为第$i$类数字的概率。 +- $L$是图片的真实标签:$L=\left ( l_0, l_1, \dots, l_9 \right )$也是10维,但只有一维为1,其他都为0。 + +### Softmax回归(Softmax Regression) + +最简单的Softmax回归模型是先将输入层经过一个全连接层得到的特征,然后直接通过softmax 函数进行多分类\[[9](#参考文献)\]。 + +输入层的数据$X$传到输出层,在激活操作之前,会乘以相应的权重 $W$ ,并加上偏置变量 $b$ ,具体如下: + +$$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$ + +其中 $ \text{softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}} $ + +对于有 $N$ 个类别的多分类问题,指定 $N$ 个输出节点,$N$ 维结果向量经过softmax将归一化为 $N$ 个[0,1]范围内的实数值,分别表示该样本属于这 $N$ 个类别的概率。此处的 $y_i$ 即对应该图片为数字 $i$ 的预测概率。 + +在分类问题中,我们一般采用交叉熵代价损失函数(cross entropy loss),公式如下: + +$$ L_{cross-entropy}(label, y) = -\sum_i label_ilog(y_i) $$ + +图2为softmax回归的网络图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。 + +

+
+图2. softmax回归网络结构图
+

+ +### 多层感知器(Multilayer Perceptron, MLP) + +Softmax回归模型采用了最简单的两层神经网络,即只有输入层和输出层,因此其拟合能力有限。为了达到更好的识别效果,我们考虑在输入层和输出层中间加上若干个隐藏层\[[10](#参考文献)\]。 + +1. 经过第一个隐藏层,可以得到 $ H_1 = \phi(W_1X + b_1) $,其中$\phi$代表激活函数,常见的有sigmoid、tanh或ReLU等函数。 +2. 经过第二个隐藏层,可以得到 $ H_2 = \phi(W_2H_1 + b_2) $。 +3. 最后,再经过输出层,得到的$Y=\text{softmax}(W_3H_2 + b_3)$,即为最后的分类结果向量。 + + +图3为多层感知器的网络结构图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。 + +

+
+图3. 多层感知器网络结构图
+

+ +### 卷积神经网络(Convolutional Neural Network, CNN) + +在多层感知器模型中,将图像展开成一维向量输入到网络中,忽略了图像的位置和结构信息,而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图4显示了其结构:输入的二维图像,先经过两次卷积层到池化层,再经过全连接层,最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。 + +

+
+图4. LeNet-5卷积神经网络结构
+

+ +#### 卷积层 + +卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积,即离散二维滤波器(也称作卷积核)与二维图像做卷积操作,简单地讲是二维滤波器滑动到二维图像上所有位置,并在每个位置上与该像素点及其邻域像素点做内积。卷积操作被广泛应用于图像处理领域,不同卷积核可以提取不同的特征,例如边沿、线性、角等特征。在深层卷积神经网络中,通过卷积操作可以提取出图像低级到复杂的特征。 + +

+
+图5. 卷积层图片
+

+ +图5给出一个卷积计算过程的示例图,输入图像大小为$H=5,W=5,D=3$,即$5 \times 5$大小的3通道(RGB,也称作深度)彩色图像。这个示例图中包含两(用$K$表示)组卷积核,即图中滤波器$W_0$和$W_1$。在卷积计算中,通常对不同的输入通道采用不同的卷积核,如图示例中每组卷积核包含($D=3$)个$3 \times 3$(用$F \times F$表示)大小的卷积核。另外,这个示例中卷积核在图像的水平方向($W$方向)和垂直方向($H$方向)的滑动步长为2(用$S$表示);对输入图像周围各填充1(用$P$表示)个0,即图中输入层原始数据为蓝色部分,灰色部分是进行了大小为1的扩展,用0来进行扩展。经过卷积操作得到输出为$3 \times 3 \times 2$(用$H_{o} \times W_{o} \times K$表示)大小的特征图,即$3 \times 3$大小的2通道特征图,其中$H_o$计算公式为:$H_o = (H - F + 2 \times P)/S + 1$,$W_o$同理。 而输出特征图中的每个像素,是每组滤波器与输入图像每个特征图的内积再求和,再加上偏置$b_o$,偏置通常对于每个输出特征图是共享的。输出特征图$o[:,:,0]$中的最后一个$-2$计算如图5右下角公式所示。 + +在卷积操作中卷积核是可学习的参数,经过上面示例介绍,每层卷积的参数大小为$D \times F \times F \times K$。在多层感知器模型中,神经元通常是全部连接,参数较多。而卷积层的参数较少,这也是由卷积层的主要特性即局部连接和共享权重所决定。 + +- 局部连接:每个神经元仅与输入神经元的一块区域连接,这块局部区域称作感受野(receptive field)。在图像卷积操作中,即神经元在空间维度(spatial dimension,即上图示例H和W所在的平面)是局部连接,但在深度上是全部连接。对于二维图像本身而言,也是局部像素关联较强。这种局部连接保证了学习后的过滤器能够对于局部的输入特征有最强的响应。局部连接的思想,也是受启发于生物学里面的视觉系统结构,视觉皮层的神经元就是局部接受信息的。 + +- 权重共享:计算同一个深度切片的神经元时采用的滤波器是共享的。例如图5中计算$o[:,:,0]$的每个神经元的滤波器均相同,都为$W_0$,这样可以很大程度上减少参数。共享权重在一定程度上讲是有意义的,例如图片的底层边缘特征与特征在图中的具体位置无关。但是在一些场景中是无意义的,比如输入的图片是人脸,眼睛和头发位于不同的位置,希望在不同的位置学到不同的特征 (参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/))。请注意权重只是对于同一深度切片的神经元是共享的,在卷积层,通常采用多组卷积核提取不同特征,即对应不同深度切片的特征,不同深度切片的神经元权重是不共享的。另外,偏置对同一深度切片的所有神经元都是共享的。 + +通过介绍卷积计算过程及其特性,可以看出卷积是线性操作,并具有平移不变性(shift-invariant),平移不变性即在图像每个位置执行相同的操作。卷积层的局部连接和权重共享使得需要学习的参数大大减小,这样也有利于训练较大卷积神经网络。 + +#### 池化层 + +

+
+图6. 池化层图片
+

+ +池化是非线性下采样的一种形式,主要作用是通过减少网络的参数来减小计算量,并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域,对于每个矩形框的数取最大值作为输出层,如图6所示。 + +更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md)教程。 + +### 常见激活函数介绍 +- sigmoid激活函数: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $ + +- tanh激活函数: $ f(x) = tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $ + + 实际上,tanh函数只是规模变化的sigmoid函数,将sigmoid函数值放大2倍之后再向下平移1个单位:tanh(x) = 2sigmoid(2x) - 1 。 + +- ReLU激活函数: $ f(x) = max(0, x) $ + +更详细的介绍请参考[维基百科激活函数](https://en.wikipedia.org/wiki/Activation_function)。 + +## 数据介绍 + +PaddlePaddle在API中提供了自动加载[MNIST](http://yann.lecun.com/exdb/mnist/)数据的模块`paddle.dataset.mnist`。加载后的数据位于`/home/username/.cache/paddle/dataset/mnist`下: + + +| 文件名称 | 说明 | +|----------------------|-------------------------| +|train-images-idx3-ubyte| 训练数据图片,60,000条数据 | +|train-labels-idx1-ubyte| 训练数据标签,60,000条数据 | +|t10k-images-idx3-ubyte | 测试数据图片,10,000条数据 | +|t10k-labels-idx1-ubyte | 测试数据标签,10,000条数据 | + +## Fluid API 概述 + +演示将使用最新的 `Fluid API`。Fluid API是最新的 PaddlePaddle API。它在不牺牲性能的情况下简化了模型配置。 +我们建议使用 Fluid API,因为它更容易上手。 + +下面是快速的 Fluid API 概述。 +1. `inference_program`:指定如何从数据输入中获得预测的函数。 +这是指定网络流的地方。 + +1. `train_program`:指定如何从 `inference_program` 和`标签值`中获取 `loss` 的函数。 +这是指定损失计算的地方。 + +1. `optimizer_func`: 指定优化器配置的函数。优化器负责减少损失并驱动训练。Paddle 支持多种不同的优化器。 + +1. `Trainer`:PaddlePaddle Trainer 管理由 `train_program` 和 `optimizer` 指定的训练过程。 +通过 `event_handler` 回调函数,用户可以监控训练的进展。 + +1. 
`Inferencer`:Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。 +然后,它可以推断数据和返回预测。 + +在这个演示中,我们将深入了解它们。 + +## 配置说明 +加载 PaddlePaddle 的 Fluid API 包。 + +```python +from __future__ import print_function +import paddle +import paddle.fluid as fluid +``` + +### Program Functions 配置 + +我们需要设置“推理程序”函数。我们想用这个程序来演示三个不同的分类器,每个分类器都定义为 Python 函数。 +我们需要将图像数据馈送到分类器。Paddle 为读取数据提供了一个特殊的层 `layer.data` 层。 +让我们创建一个数据层来读取图像并将其连接到分类网络。 + +- Softmax回归:只通过一层简单的以softmax为激活函数的全连接层,就可以得到分类的结果。 + +```python +def softmax_regression(): + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + predict = fluid.layers.fc( + input=img, size=10, act='softmax') + return predict +``` + +- 多层感知器:下面代码实现了一个含有两个隐藏层(即全连接层)的多层感知器。其中两个隐藏层的激活函数均采用ReLU,输出层的激活函数用Softmax。 + +```python +def multilayer_perceptron(): + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + # 第一个全连接层,激活函数为ReLU + hidden = fluid.layers.fc(input=img, size=200, act='relu') + # 第二个全连接层,激活函数为ReLU + hidden = fluid.layers.fc(input=hidden, size=200, act='relu') + # 以softmax为激活函数的全连接输出层,输出层的大小必须为数字的个数10 + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return prediction +``` + +- 卷积神经网络LeNet-5: 输入的二维图像,首先经过两次卷积层到池化层,再经过全连接层,最后使用以softmax为激活函数的全连接层作为输出层。 + +```python +def convolutional_neural_network(): + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + # 第一个卷积-池化层 + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + # 第二个卷积-池化层 + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + # 以softmax为激活函数的全连接输出层,输出层的大小必须为数字的个数10 + prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + return prediction +``` + +#### Train Program 配置 +然后我们需要设置训练程序 `train_program`。它首先从分类器中进行预测。 +在训练期间,它将从预测中计算 `avg_cost`。 + 
+**注意:** 训练程序应该返回一个数组,第一个返回参数必须是 `avg_cost`。训练器使用它来计算梯度。 + +请随意修改代码,测试 Softmax 回归 `softmax_regression`, `MLP` 和 卷积神经网络 `convolutional neural network` 分类器之间的不同结果。 + +```python +def train_program(): + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # predict = softmax_regression() # uncomment for Softmax回归 + # predict = multilayer_perceptron() # uncomment for 多层感知器 + predict = convolutional_neural_network() # uncomment for LeNet5卷积神经网络 + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(cost) + acc = fluid.layers.accuracy(input=predict, label=label) + return [avg_cost, acc] + + +``` + +#### Optimizer Function 配置 + +在下面的 `Adam optimizer`,`learning_rate` 是训练的速度,与网络的训练收敛速度有关系。 + +```python +def optimizer_program(): + return fluid.optimizer.Adam(learning_rate=0.001) +``` + +### 数据集 Feeders 配置 + +下一步,我们开始训练过程。`paddle.dataset.mnist.train()`和`paddle.dataset.mnist.test()`分别用作训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数,每次调用的时候返回一个Python yield generator。 + +下面`shuffle`是一个reader decorator,它接受一个reader A,返回另一个reader B。reader B 每次读入`buf_size`条训练数据到一个buffer里,然后随机打乱其顺序,并且逐条输出。 + +`batch`是一个特殊的decorator,它的输入是一个reader,输出是一个batched reader。在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minibatch。 + +```python +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=64) + +test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=64) +``` + +### Trainer 配置 + +现在,我们需要配置 `Trainer`。`Trainer` 需要接受训练程序 `train_program`, `place` 和优化器 `optimizer`。 + +```python +# 该模型运行在单个CPU上 +use_cuda = False # set to True if training with GPU +place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + +trainer = fluid.Trainer( + train_func=train_program, place=place, optimizer_func=optimizer_program) +``` + +#### Event Handler 配置 + +Fluid API 在训练期间为回调函数提供了一个钩子。用户能够通过该机制监控训练进度。 +我们将在这里演示两个 `event_handler` 程序。请随意修改 Jupyter 笔记本 
,看看有什么不同。 + +`event_handler` 用来在训练过程中输出训练结果 + +```python +# Save the parameter into a directory. The Inferencer can load the parameters from it to do infer +params_dirname = "recognize_digits_network.inference.model" +lists = [] +def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + if event.step % 100 == 0: + # event.metrics maps with train program return arguments. + # event.metrics[0] will yeild avg_cost and event.metrics[1] will yeild acc in this example. + print("Pass %d, Batch %d, Cost %f" % ( + event.step, event.epoch, event.metrics[0])) + + if isinstance(event, fluid.EndEpochEvent): + avg_cost, acc = trainer.test( + reader=test_reader, feed_order=['img', 'label']) + + print("Test with Epoch %d, avg_cost: %s, acc: %s" % (event.epoch, avg_cost, acc)) + + # save parameters + trainer.save_params(params_dirname) + lists.append((event.epoch, avg_cost, acc)) +``` + +`event_handler_plot` 可以用来在训练过程中画图如下: + +
+
+图7 训练结果 +
+ + +```python +from paddle.v2.plot import Ploter + +train_title = "Train cost" +test_title = "Test cost" +cost_ploter = Ploter(train_title, test_title) +step = 0 +lists = [] + +# event_handler to plot a figure +def event_handler_plot(event): + global step + if isinstance(event, fluid.EndStepEvent): + if step % 100 == 0: + # event.metrics maps with train program return arguments. + # event.metrics[0] will yeild avg_cost and event.metrics[1] will yeild acc in this example. + cost_ploter.append(train_title, step, event.metrics[0]) + cost_ploter.plot() + step += 1 + if isinstance(event, fluid.EndEpochEvent): + # save parameters + trainer.save_params(params_dirname) + + avg_cost, acc = trainer.test( + reader=test_reader, feed_order=['img', 'label']) + cost_ploter.append(test_title, step, avg_cost) + lists.append((event.epoch, avg_cost, acc)) +``` + +#### 开始训练 + +既然我们设置了 `event_handler` 和 `data reader`,我们就可以开始训练模型了。 + +`feed_order` 用于将数据目录映射到 `train_program` + +```python +trainer.train( + num_epochs=5, + event_handler=event_handler, + reader=train_reader, + feed_order=['img', 'label']) +``` + +训练过程是完全自动的,event_handler里打印的日志类似如下所示: + +``` +Pass 0, Batch 0, Cost 0.125650 +Pass 100, Batch 0, Cost 0.161387 +Pass 200, Batch 0, Cost 0.040036 +Pass 300, Batch 0, Cost 0.023391 +Pass 400, Batch 0, Cost 0.005856 +Pass 500, Batch 0, Cost 0.003315 +Pass 600, Batch 0, Cost 0.009977 +Pass 700, Batch 0, Cost 0.020959 +Pass 800, Batch 0, Cost 0.105560 +Pass 900, Batch 0, Cost 0.239809 +Test with Epoch 0, avg_cost: 0.053097883707459624, acc: 0.9822850318471338 +``` + +训练之后,检查模型的预测准确度。用 MNIST 训练的时候,一般 softmax回归模型的分类准确率为约为 92.34%,多层感知器为97.66%,卷积神经网络可以达到 99.20%。 + + +## 应用模型 + +可以使用训练好的模型对手写体数字图片进行分类,下面程序展示了如何使用 `fluid.Inferencer` 接口进行推断。 + +### Inference 配置 + +`Inference` 需要一个 `infer_func` 和 `param_path` 来设置网络和经过训练的参数。 +我们可以简单地插入在此之前定义的分类器。 + +```python +inferencer = fluid.Inferencer( + # infer_func=softmax_regression, # uncomment for softmax regression + # 
infer_func=multilayer_perceptron, # uncomment for MLP + infer_func=convolutional_neural_network, # uncomment for LeNet5 + param_path=params_dirname, + place=place) +``` + +### 生成预测输入数据 + +`infer_3.png` 是数字 3 的一个示例图像。把它变成一个 numpy 数组以匹配数据馈送格式。 + +```python +# Prepare the test image +import os +import numpy as np +from PIL import Image +def load_image(file): + im = Image.open(file).convert('L') + im = im.resize((28, 28), Image.ANTIALIAS) + im = np.array(im).reshape(1, 1, 28, 28).astype(np.float32) + im = im / 255.0 * 2.0 - 1.0 + return im + +cur_dir = cur_dir = os.getcwd() +img = load_image(cur_dir + '/image/infer_3.png') +``` + +### 预测 + +现在我们准备做预测。 + +```python +results = inferencer.infer({'img': img}) +lab = np.argsort(results) # probs and lab are the results of one batch data +print ("Inference result of image/infer_3.png is: %d" % lab[0][0][-1]) +``` + +## 总结 + +本教程的softmax回归、多层感知器和卷积神经网络是最基础的深度学习模型,后续章节中复杂的神经网络都是从它们衍生出来的,因此这几个模型对之后的学习大有裨益。同时,我们也观察到从最简单的softmax回归变换到稍复杂的卷积神经网络的时候,MNIST数据集上的识别准确率有了大幅度的提升,原因是卷积层具有局部连接和共享权重的特性。在之后学习新模型的时候,希望大家也要深入到新模型相比原模型带来效果提升的关键之处。此外,本教程还介绍了PaddlePaddle模型搭建的基本流程,从dataprovider的编写、网络层的构建,到最后的训练和预测。对这个流程熟悉以后,大家就可以用自己的数据,定义自己的网络模型,并完成自己的训练和预测任务了。 + +## 参考文献 + +1. LeCun, Yann, Léon Bottou, Yoshua Bengio, and Patrick Haffner. ["Gradient-based learning applied to document recognition."](http://ieeexplore.ieee.org/abstract/document/726791/) Proceedings of the IEEE 86, no. 11 (1998): 2278-2324. +2. Wejéus, Samuel. ["A Neural Network Approach to Arbitrary Symbol Recognition on Modern Smartphones."](http://www.diva-portal.org/smash/record.jsf?pid=diva2%3A753279&dswid=-434) (2014). +3. Decoste, Dennis, and Bernhard Schölkopf. ["Training invariant support vector machines."](http://link.springer.com/article/10.1023/A:1012454411458) Machine learning 46, no. 1-3 (2002): 161-190. +4. Simard, Patrice Y., David Steinkraus, and John C. Platt.
["Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.8494&rep=rep1&type=pdf) In ICDAR, vol. 3, pp. 958-962. 2003. +5. Salakhutdinov, Ruslan, and Geoffrey E. Hinton. ["Learning a Nonlinear Embedding by Preserving Class Neighbourhood Structure."](http://www.jmlr.org/proceedings/papers/v2/salakhutdinov07a/salakhutdinov07a.pdf) In AISTATS, vol. 11. 2007. +6. Cireşan, Dan Claudiu, Ueli Meier, Luca Maria Gambardella, and Jürgen Schmidhuber. ["Deep, big, simple neural nets for handwritten digit recognition."](http://www.mitpressjournals.org/doi/abs/10.1162/NECO_a_00052) Neural computation 22, no. 12 (2010): 3207-3220. +7. Deng, Li, Michael L. Seltzer, Dong Yu, Alex Acero, Abdel-rahman Mohamed, and Geoffrey E. Hinton. ["Binary coding of speech spectrograms using a deep auto-encoder."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.185.1908&rep=rep1&type=pdf) In Interspeech, pp. 1692-1695. 2010. +8. Kégl, Balázs, and Róbert Busa-Fekete. ["Boosting products of base classifiers."](http://dl.acm.org/citation.cfm?id=1553439) In Proceedings of the 26th Annual International Conference on Machine Learning, pp. 497-504. ACM, 2009. +9. Rosenblatt, Frank. ["The perceptron: A probabilistic model for information storage and organization in the brain."](http://psycnet.apa.org/journals/rev/65/6/386/) Psychological review 65, no. 6 (1958): 386. +10. Bishop, Christopher M. ["Pattern recognition."](http://users.isr.ist.utl.pt/~wurmd/Livros/school/Bishop%20-%20Pattern%20Recognition%20And%20Machine%20Learning%20-%20Springer%20%202006.pdf) Machine Learning 128 (2006): 1-58. + +
+知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png deleted file mode 100644 index 3f5cdaacdc6acce41c5c6c99649be46685cf9903..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png deleted file mode 100644 index 65bd17eacd41bbdbdb042bd1ba366eb53663b410..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png deleted file mode 100644 index 030cd60d3b4af9aecd4941204da4ad15f6e1189f..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png deleted file mode 100644 index 90b02fa2a735cfcc9efb2de90906325dedcb358c..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png deleted file mode 100644 index 
9f4d26cd8da32201d0a5e9c72d466301dd2b42a1..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png deleted file mode 100644 index f5a478fdc24f29c17555a2f1451f3f5a079faed9..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png deleted file mode 100644 index 4edd7cabf8a2282f6392ac1421c7ca4afb288589..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png deleted file mode 100644 index 40b98298288b9c406fce1cbca9c913753020a94d..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png and /dev/null differ diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png deleted file mode 100644 index 47204941af7f22e68386a70a06ec4f122b83e262..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png and /dev/null differ diff --git 
a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test.png deleted file mode 100644 index 5cb87b450d0398bcfaec0e647c362052069797e7..0000000000000000000000000000000000000000 Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test.png and /dev/null differ diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 8c5cc44528a754f7612a23b1de09c247ca3f0c8e..b6ae930b7155d15d24b287cc3eed50f2aeaa5599 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -36,6 +36,7 @@ paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=Non paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) @@ -55,9 +56,10 @@ paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 
'endpoint'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True)) +paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) paddle.fluid.InferenceTranspiler.__init__ paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) @@ -113,6 +115,7 @@ paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 
'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)) +paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)) paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)) @@ -143,9 +146,12 @@ paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_ paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None)) +paddle.fluid.layers.squeeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.lrn ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)) paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], 
varargs=None, keywords=None, defaults=(0.0, None)) +paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)) paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)) paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)) paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)) @@ -162,7 +168,10 @@ paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)) +paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None)) paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) +paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None)) +paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], 
varargs=None, keywords=None, defaults=(1, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) @@ -251,6 +260,7 @@ paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwarg paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) +paddle.fluid.layers.sampling_id ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) @@ -293,8 +303,10 @@ paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'neg paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)) paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)) paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 
'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) -paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3)) +paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'anchor_var', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3)) paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) +paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'gt_boxes', 'im_scales', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None)) +paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) @@ -330,9 +342,10 @@ paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, 
defaults=None) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) -paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True)) +paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) paddle.fluid.transpiler.InferenceTranspiler.__init__ paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) @@ -377,7 +390,7 @@ paddle.fluid.LoDTensor.__init__ 1. 
__init__(self: paddle.fluid.core.LoDTensor, a paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]] paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]] -paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 15. 
set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None +paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CPUPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 11. 
set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 22. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 23. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None 24. 
set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPinnedPlace) -> None paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int] diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 675018be087d5d09cea060dfbac121048ea4dd3a..b344661f184806219e3741b0d13fc02fbb8b42d1 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,3 +1,4 @@ + # windows treat symbolic file as a real file, which is different with unix # We create a hidden file and compile it instead of origin source file. function(windows_symbolic TARGET) @@ -140,11 +141,11 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method 
graph_to_program_pass) endif() if (NOT WIN32) diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index cd00b7de7338982308acfa1f1e8c38e010c6a43b..c9e3a8ac1d1e5228725bff49ecc6d91e640dfe57 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -46,7 +46,7 @@ struct CastDataLayout { const std::vector axis_; template - void operator()() { + void apply() { auto place = ctx_->GetPlace(); if (platform::is_cpu_place(place)) { diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 1a9ce746ea840bc088d222cc4e9bc05159d64734..28f3da88fa18021f6b71e458fdb467be86d4dbf0 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -64,6 +64,7 @@ static DataTypeMap* InitDataTypeMap() { RegType(size_t, proto::VarType::SIZE_T); RegType(int16_t, proto::VarType::INT16); RegType(uint8_t, proto::VarType::UINT8); + RegType(int8_t, proto::VarType::INT8); #undef RegType return retv; diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 84c2e7f22721a81eca4a2b86b59ddec39f1c02b7..5024399d8b96709cd27ca6bd94058d0ee49e68b2 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -31,28 +31,31 @@ template inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { switch (type) { case proto::VarType::FP16: - visitor.template operator()(); + visitor.template apply(); break; case proto::VarType::FP32: - visitor.template operator()(); + visitor.template apply(); break; case proto::VarType::FP64: - visitor.template operator()(); + visitor.template apply(); break; case proto::VarType::INT32: - visitor.template operator()(); + visitor.template apply(); break; case proto::VarType::INT64: - visitor.template operator()(); + visitor.template apply(); break; case proto::VarType::BOOL: - visitor.template operator()(); + visitor.template 
apply(); break; case proto::VarType::UINT8: - visitor.template operator()(); + visitor.template apply(); break; case proto::VarType::INT16: - visitor.template operator()(); + visitor.template apply(); + break; + case proto::VarType::INT8: + visitor.template apply(); break; default: PADDLE_THROW("Not supported %d", type); diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 8213c82ec1f04244555d4a0a9079ac1d96113726..d79f8cacb5f4727defc77380371e57bcea65f068 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -37,7 +37,7 @@ struct CastDataType { const platform::DeviceContext* ctx_; template - void apply()() { + void apply() { auto* in_begin = in_.data(); auto* in_end = in_begin + in_.numel(); auto* out_begin = out_->mutable_data(in_.place()); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index bc61b0eacbf6c8a1fd4487ad5a442fed1b536345..0bfff745493d069e948e6d277ec2bbfb0673a70b 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -625,19 +625,11 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph, void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( ir::Graph *result, const std::string &loss_grad_name) const { for (size_t i = 0; i < places_.size(); ++i) { -// Insert ScaleCost OpHandle -#ifdef PADDLE_WITH_CUDA - auto *communication_dev_ctx = - nccl_ctxs_ ? 
nccl_ctxs_->DevCtx(places_[i]) - : platform::DeviceContextPool::Instance().Get(places_[i]); -#else - auto *communication_dev_ctx = - platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); -#endif + // Insert ScaleCost OpHandle + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *op_handle = new ScaleLossGradOpHandle( result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), - local_scopes_.size(), local_scopes_[i], places_[i], - communication_dev_ctx); + local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx); result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale @@ -744,7 +736,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, .emplace(varname, op_dev_id); } } else { - PADDLE_ENFORCE( + PADDLE_THROW( "the distribute training related op should be in [split_byref, " "concat]."); } @@ -754,17 +746,26 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, node->Op()->Type()); CreateComputationalOp(result, node, op_dev_id); - if (node->Op()->Type() == "concat") { - ConnectOp(result, result->Get(kGraphOps).back().get(), - "fetch_barrier"); +} + +void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { + auto *op_handle = result->Get(kGraphOps).back().get(); + for (ir::Node *input : node->inputs) { + VarHandle *var = nullptr; + for (int place_offset = 0; place_offset < num_places; ++place_offset) { + auto &var_holders = result->Get(kGraphVars)[place_offset]; + auto &var_holder = var_holders[input->Name()]; + if (!var_holder.empty()) { + var = var_holder.rbegin()->get(); + op_handle->AddInput(var); + } + } } } // Create RPC related op handles that connects its in ops and out ops. void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { - // FIXME(typhoonzero): Cleanup this deps for both sync mode and async mode - // put them into transpiler. 
int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. @@ -799,8 +800,6 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, } auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - // FIXME(typhoonzero): assume each recv op output one param - // Use the same place as send. if (recv_param_grad.size() == 2U) { op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]); VLOG(10) << "recv param " << recv_param_grad[0] @@ -814,34 +813,44 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, .emplace(varname, op_dev_id); } } else { - // send_barrier and fetch_barrier op can be scheduled on device 0 + // send_barrier, fetch_barrier will run on place 0; op_dev_id = 0; } PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", node->Op()->Type()); - result->Get(kGraphOps).emplace_back(new RPCOpHandle( result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id], node->Op()->Type(), places_[op_dev_id])); - // TODO(panyx0718): This might not be needed anymore. - if (node->Op()->Type() == "send_barrier") { - ConnectOp(result, result->Get(kGraphOps).back().get(), "send"); - } else if (node->Op()->Type() == "recv") { - ConnectOp(result, result->Get(kGraphOps).back().get(), - "send_barrier"); - } else if (node->Op()->Type() == "fetch_barrier") { - ConnectOp(result, result->Get(kGraphOps).back().get(), "recv"); - } else if (node->Op()->Type() == "send") { - // do nothing + if (node->Op()->Type() == "send") { + CreateOpHandleIOs(result, node, op_dev_id); } else { - PADDLE_THROW( - "rpc op should be in [" - "send, send_barrier. 
recv, fetch_barrier]"); - } + // send_barrier, recv, fetch_barrier's inputs are deps var, get them from + // all places + auto p = places_[op_dev_id]; + auto *op_handle = result->Get(kGraphOps).back().get(); + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); - CreateOpHandleIOs(result, node, op_dev_id); + SetOpInputsAllPlaces(result, node, places_.size()); + for (ir::Node *output : node->outputs) { + int outvar_dev_id = op_dev_id; + if (node->Op()->Type() == "fetch_barrier") { + outvar_dev_id = GetVarDeviceID(*result, output->Name()); + PADDLE_ENFORCE_NE(outvar_dev_id, -1); + } + p = places_[outvar_dev_id]; + ir::Node *new_node = nullptr; + if (output->Var()) { + new_node = result->CreateVarNode(output->Var()); + } else { + new_node = + result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); + } + CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id); + } + } } bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index e28264eb32756f77ef5baed3dff77ba9f0943160..bd6153c0c736f6e32378eebcbf6c4d7e402c9b42 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -31,7 +31,7 @@ struct ReduceLoDTensor { : src_tensors_(src), dst_tensor_(*dst) {} template - void operator()() const { + void apply() const { PADDLE_ENFORCE(!src_tensors_.empty()); auto &t0 = *src_tensors_[0]; PADDLE_ENFORCE_NE(t0.numel(), 0); diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 214ca3dc492c31d4c683790a6ae051be467401c9..f95808c199b9de693ec653c29374c9130be7fd59 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -60,6 +60,7 @@ class Executor { void Run(const ProgramDesc& prog, Scope* scope, int block_id, bool create_local_scope = true, bool create_vars = true); + // This API is 
very slow. void Run(const ProgramDesc& program, Scope* scope, std::map* feed_targets, std::map* fetch_targets, @@ -79,6 +80,7 @@ class Executor { bool create_local_scope = true, bool create_vars = true, bool keep_kids = false); + // This API is very slow. void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, std::map* feed_targets, std::map* fetch_targets, diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 2cf14bd371831ab682166f4256d6966b5ab278c8..c6588435819a982166cf2d2368a82b4402fdc2bc 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -107,6 +107,7 @@ message VarType { // Tensor is used in C++. SIZE_T = 19; UINT8 = 20; + INT8 = 21; // Other types that may need additional descriptions LOD_TENSOR = 7; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index da0955a9a000e0d0bff3fe9d0bc3bd25171be3d2..bfc649017f19d67660bd11d590134cf56772bb27 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -3,14 +3,18 @@ cc_library(graph SRCS graph.cc DEPS node) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(pass SRCS pass.cc DEPS graph node graph_helper) cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper) +cc_library(graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper) cc_library(graph_traits SRCS graph_traits.cc DEPS graph) -cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits) -cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detecter) +cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits) +cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector) +cc_library(attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector) 
cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass) - +cc_library(fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector) +cc_library(seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector) cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) -cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter) -cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detecter graph pass graph_traits framework_proto) +cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) +cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) +cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detector graph pass graph_traits framework_proto) diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..d4e205170bbf053840e2dcf68e4e782c4c212d3e --- /dev/null +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -0,0 +1,273 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/api/helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +struct Param { + std::string X = "concat_0.tmp_0"; + std::string C0 = "cell_init"; + std::string H0 = "hidden_init"; + std::string AttentionWeight = "attention_fc.w_0"; + std::string AttentionBias = "attention_fc.b_0"; + std::string AttentionScalar = "attention_output.w_0"; + std::string AttentionScalarBias = "attention_output.b_0"; + std::string LSTMWeight = "attention_w.new"; + std::string LSTMBias = "attention_b.new"; + std::string Hidden = "array_to_lod_tensor_0.tmp_0"; + std::string Cell = "at.cell.new"; + std::string AttentionedX = "at.x.new"; + std::string AttentionFCOut = "at.fc.new"; + std::string LSTMX = "at.lstmx.new"; + std::string LSTMOUT = "at.lstmout.new"; +}; + +void PrepareParameters(Graph* graph, const Param& param); + +void FindWhileOp(Graph* graph) { + GraphPatternDetector gpd; + std::unordered_set fused_external_ops( + {35, 36, 37, 38, 43, 44, 49, 45, 46, 47, 41, 42, 53, 54, 48, + 57, 55, 56, 52, 74, 80, 77, 78, 79, 50, 77, 39, 40, 51}); + + gpd.mutable_pattern()->NewNode( + [&](Node* n) { return fused_external_ops.count(n->id()); }, "while"); + + if (!graph->Has(kGraphvizMarkedNodeAttr)) { + graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t); + } + auto& marked_nodes = + graph->Get(kGraphvizMarkedNodeAttr); + + auto handle = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + auto* while_pat_node = gpd.pattern().RetrieveNode("while"); + auto* while_node = subgraph.at(while_pat_node); + marked_nodes.insert(while_node); + }; + gpd(graph, handle); + + Param param; + 
// Add AttentionLSTM node + OpDesc op_desc; + op_desc.SetType("attention_lstm"); + +#define OP_SET_IN(x) op_desc.SetInput(#x, {param.x}); +#define OP_SET_OUT(x) op_desc.SetOutput(#x, {param.x}); + OP_SET_IN(X); + OP_SET_IN(C0); + OP_SET_IN(H0); + OP_SET_IN(AttentionWeight); + OP_SET_IN(AttentionBias); + OP_SET_IN(AttentionScalar); + OP_SET_IN(AttentionScalarBias); + OP_SET_IN(LSTMWeight); + OP_SET_IN(LSTMBias); + + OP_SET_OUT(Hidden); + OP_SET_OUT(Cell); + OP_SET_OUT(AttentionedX); + OP_SET_OUT(AttentionFCOut); + OP_SET_OUT(LSTMX); + OP_SET_OUT(LSTMOUT); +#undef OP_SET_IN +#undef OP_SET_OUT + + auto* X = graph->RetriveNode(34); + auto* LSTMOUT = graph->RetriveNode(81); + auto* cell_init = graph->RetriveNode(6); + auto* hidden_init = graph->RetriveNode(8); + +#define LINK_TO(node0, node1) \ + node0->outputs.push_back(node1); \ + node1->inputs.push_back(node0); + + auto* lstm_op = graph->CreateOpNode(&op_desc); + PrepareParameters(graph, param); + + LINK_TO(X, lstm_op); + LINK_TO(cell_init, lstm_op); + LINK_TO(hidden_init, lstm_op); + LINK_TO(lstm_op, LSTMOUT); + + GraphSafeRemoveNodes(graph, marked_nodes); +} + +#define CHECK_P1(x) PADDLE_ENFORCE_NOT_NULL(x); +#define CHECK_P2(x0, x1) \ + CHECK_P1(x0); \ + CHECK_P1(x1); +#define CHECK_P3(x0, x1, x2) \ + CHECK_P2(x0, x1); \ + CHECK_P1(x2); +#define CHECK_P4(x0, x1, x2, x3) \ + CHECK_P3(x0, x1, x2); \ + CHECK_P1(x3); +#define CHECK_P5(x0, x1, x2, x3, x4) \ + CHECK_P4(x0, x1, x2, x3); \ + CHECK_P1(x4); + +void PrepareLSTMWeight(const LoDTensor& W_forget_w0, + const LoDTensor& W_forget_w1, + const LoDTensor& W_input_w0, const LoDTensor& W_input_w1, + const LoDTensor& W_output_w0, + const LoDTensor& W_output_w1, const LoDTensor& W_cell_w0, + const LoDTensor& W_cell_w1, LoDTensor* out); + +void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, + const LoDTensor& B_output, const LoDTensor& B_cell, + LoDTensor* out); + +void PrepareParameters(Graph* graph, const Param& param) { + // Check parameters + 
PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); + + // Create new parameters. + scope->Var(param.LSTMWeight)->GetMutable(); + scope->Var(param.LSTMBias)->GetMutable(); + scope->Var(param.Hidden)->GetMutable(); + scope->Var(param.Cell)->GetMutable(); + scope->Var(param.AttentionedX)->GetMutable(); + scope->Var(param.AttentionFCOut)->GetMutable(); + scope->Var(param.LSTMX)->GetMutable(); + scope->Var(param.LSTMOUT)->GetMutable(); + +#define GATE_W(name__) \ + auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0"); \ + auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1"); \ + auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0"); \ + CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \ + VLOG(4) << #name__ "_w0" \ + << " shape: " << W_##name__##_w0->Get().dims(); \ + VLOG(4) << #name__ "_w1" \ + << " shape: " << W_##name__##_w1->Get().dims(); \ + VLOG(4) << #name__ "_b0" \ + << " shape: " << W_##name__##_b0->Get().dims(); \ + auto& W_##name__##_w0_t = W_##name__##_w0->Get(); \ + auto& W_##name__##_w1_t = W_##name__##_w1->Get(); \ + auto& W_##name__##_b0_t = W_##name__##_b0->Get(); + + GATE_W(forget); + GATE_W(input); + GATE_W(output); + GATE_W(c); +#undef GATE_W + + auto* attention_fc_w = scope->FindVar("attention_fc.w_0"); + auto* attention_fc_b = scope->FindVar("attention_fc.b_0"); + auto* attention_output_w = scope->FindVar("attention_output.w_0"); + auto* attention_output_b = scope->FindVar("attention_output.b_0"); + CHECK_P4(attention_fc_w, attention_fc_b, attention_output_w, + attention_output_b); + + auto* lstm_weight = scope->Var(param.LSTMWeight); + auto* lstm_weight_t = lstm_weight->GetMutable(); + auto* lstm_bias = scope->Var(param.LSTMBias); + auto* lstm_bias_t = lstm_bias->GetMutable(); + + // reshape attention_bias + auto* attention_bias_t = + scope->FindVar(param.AttentionBias)->GetMutable(); + PADDLE_ENFORCE_EQ(attention_bias_t->dims().size(), 1); + attention_bias_t->Resize(make_ddim({1, 
attention_bias_t->dims()[0]})); + + auto* attention_scalar_bias_t = + scope->FindVar(param.AttentionScalarBias)->GetMutable(); + attention_scalar_bias_t->Resize( + make_ddim({1, attention_scalar_bias_t->dims()[0]})); + + PrepareLSTMWeight(W_forget_w0_t, W_forget_w1_t, W_input_w0_t, W_input_w1_t, + W_output_w0_t, W_output_w1_t, W_c_w0_t, W_c_w1_t, + lstm_weight_t); + PrepareLSTMBias(W_forget_b0_t, W_input_b0_t, W_output_b0_t, W_c_b0_t, + lstm_bias_t); +} + +// Prepare parameters +void PrepareLSTMWeight(const LoDTensor& W_forget_w0, + const LoDTensor& W_forget_w1, + const LoDTensor& W_input_w0, const LoDTensor& W_input_w1, + const LoDTensor& W_output_w0, + const LoDTensor& W_output_w1, const LoDTensor& W_cell_w0, + const LoDTensor& W_cell_w1, LoDTensor* out) { + int D = W_forget_w0.dims()[0]; + int M = W_forget_w1.dims()[0]; + out->Resize(make_ddim({D + M, 4 * D})); + VLOG(3) << "LSTMWeight resized to " << out->dims(); + + float* out_data = out->mutable_data(platform::CPUPlace()); + std::array tensors( + {{W_forget_w0.data(), W_input_w0.data(), + W_output_w0.data(), W_cell_w0.data()}}); + std::array tensors1( + {{W_forget_w1.data(), W_input_w1.data(), + W_output_w1.data(), W_cell_w1.data()}}); + + for (int row = 0; row < D; row++) { + for (int col = 0; col < 4; col++) { + float* dst = out_data + 4 * D * row + D * col; + const float* src = tensors[col] + D * row; + memcpy(dst, src, D * sizeof(float)); + } + } + + for (int row = 0; row < M; row++) { + for (int col = 0; col < 4; col++) { + float* dst = out_data + 4 * D * (D + row) + D * col; + const float* src = tensors1[col] + D * row; + memcpy(dst, src, D * sizeof(float)); + } + } +} + +void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, + const LoDTensor& B_output, const LoDTensor& B_cell, + LoDTensor* out) { + std::array tensors( + {{B_forget.data(), B_input.data(), B_output.data(), + B_cell.data()}}); + + PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); + int D = B_forget.dims()[0]; + 
out->Resize(make_ddim({1, 4 * D})); + auto* out_data = out->mutable_data(platform::CPUPlace()); + for (size_t i = 0; i < tensors.size(); i++) { + memcpy(out_data + D * i, tensors[i], D * sizeof(float)); + } +} + +// Parameters + +std::unique_ptr AttentionLSTMFusePass::ApplyImpl( + std::unique_ptr graph) const { + PDPattern external_pattern, subblock_pattern; + + FindWhileOp(graph.get()); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(attention_lstm_fuse_pass, + paddle::framework::ir::AttentionLSTMFusePass); diff --git a/paddle/fluid/inference/analysis/dot.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h similarity index 62% rename from paddle/fluid/inference/analysis/dot.cc rename to paddle/fluid/framework/ir/attention_lstm_fuse_pass.h index d5471ffcb594a6915e9e65c0fee5adc5f5bdf40c..a756dfc1b98e1de55c809c73e2c4df1e628950ae 100644 --- a/paddle/fluid/inference/analysis/dot.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,12 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/inference/analysis/dot.h" +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" namespace paddle { -namespace inference { -namespace analysis { -size_t Dot::counter = 0; -} // namespace analysis -} // namespace inference +namespace framework { +namespace ir { + +class AttentionLSTMFusePass : public FusePassBase { + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index f4327742eac843f27385c165216ce48ceb97ea71..513742bab69d465aac1bfb7bcef2fe89108c14a0 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -31,81 +31,36 @@ bool VarOutLinksToOp(Node* node, const std::string& op_type) { } void BuildFCPattern(PDPattern* pattern) { - // make sure the selected MUL op has one input argument is a parameter. - auto* mul_parameter_var = pattern->NewNode( - [](Node* node) { - return node->IsVar() && node->outputs.size() == 1UL && - node->outputs.front()->Op()->Type() == "mul" && node->Var() && - node->Var()->Persistable(); // check is a parameter - }, - "mul_weight" /*name*/); - - auto* mul_tmp_input_var = pattern->NewNode( - [](Node* node) { - bool result = - node->IsVar() && node->outputs.size() >= 1UL && node->Var() && - !node->Var()->Persistable(); // this input is not an parameter. - if (!result) return false; - // check whether one output is MUL op. - for (auto* op : node->outputs) { - if (op->IsOp() && op->Op()->Type() == "mul") return true; - } - return false; - }, - "mul_tmp_var" /*name*/); - - // select a MUL op - auto* mul_op = pattern->NewNode( - [](Node* node) { - return node->IsOp() && // start from an Op - node->Op()->Type() == "mul"; // type is mul - // the output should be consumed only by one element_add, that check - // leaves in a Var PDNode. 
- }, - "mul" /*name*/); - - // make sure the MUL op's output has only one consumer and links to an - // ELEMENTWISE_ADD op. - auto* mul_out_var = pattern->NewNode( - [](Node* node) { - return node->IsVar() && // starts from a Var - node->outputs.size() == 1UL && // only has one consumer - node->outputs.front()->IsOp() && // check basic logic - node->Var() && // not a ControlDepVar - node->outputs.front()->Op()->Type() == - "elementwise_add"; // a very strong validation - }, - "mul_out"); - // this check is not essential, just to make the corresponding variable Node - // retrival easier. - auto* elementwise_add_tmp_var = pattern->NewNode( - [](Node* node) { - return node->IsVar() && node->outputs.size() >= 1UL && node->Var() && - VarOutLinksToOp(node, "elementwise_add"); - }, - "elementwise_add_tmpvar"); - - // select an ELEMENTWISE_ADD op - auto* elementwise_add_op = pattern->NewNode( - [](Node* node) { - return node->IsOp() && node->Op()->Type() == "elementwise_add"; - }, - "elementwise_add" /*name*/); - - // get the ELEMENTWISE_ADD op's output - auto* elementwise_add_out_var = pattern->NewNode( - [](Node* node) { - return node->IsVar() && node->inputs.size() == 1UL && node->Var() && - node->inputs.front()->Op()->Type() == "elementwise_add"; - }, - "elementwise_add_out"); - - pattern->AddEdge(mul_parameter_var, mul_op); - pattern->AddEdge(mul_tmp_input_var, mul_op); - pattern->AddEdge(mul_op, mul_out_var); - pattern->AddEdge(mul_out_var, elementwise_add_op); - pattern->AddEdge(elementwise_add_tmp_var, elementwise_add_op); - pattern->AddEdge(elementwise_add_op, elementwise_add_out_var); + // Create Operators + auto* mul_op = pattern->NewNode("mul")->assert_is_op("mul"); + auto* elementwise_add_op = + pattern->NewNode("elementwise_add")->assert_is_op("elementwise_add"); + // Create variables + // w + auto* mul_weight_var = pattern->NewNode("mul_weight") + ->AsInput() + ->assert_is_op_nth_input("mul", "Y", 0); + // x + auto* mul_tmp_var = 
pattern->NewNode("mul_tmp_var") + ->AsInput() + ->assert_is_op_nth_input("mul", "X", 0); + // intermediate variable, will be removed in the IR after fuse. + auto* mul_out_var = pattern->NewNode("mul_out") + ->AsIntermediate() + ->assert_is_only_output_of_op("mul") + ->assert_is_op_input("elementwise_add"); + // bias + auto* elementwise_add_tmp_var = pattern->NewNode("elementwise_add_tmpvar") + ->assert_is_op_input("elementwise_add") + ->AsInput(); + // output + auto* elementwise_add_out_var = pattern->NewNode("elementwise_add_out") + ->AsOutput() + ->assert_is_op_output("elementwise_add"); + + mul_op->LinksFrom({mul_weight_var, mul_tmp_var}).LinksTo({mul_out_var}); + elementwise_add_op->LinksFrom({mul_out_var, elementwise_add_tmp_var}) + .LinksTo({elementwise_add_out_var}); } // Replace the node `from` in the links to `to` @@ -122,19 +77,21 @@ bool LinksReplace(std::vector* links, Node* from, Node* to) { std::unique_ptr FCFusePass::ApplyImpl( std::unique_ptr graph) const { PADDLE_ENFORCE(graph.get()); + FusePassBase::Init("fc", graph.get()); std::unordered_set nodes2delete; - GraphPatternDetecter gpd; + GraphPatternDetector gpd; BuildFCPattern(gpd.mutable_pattern()); -#define GET_NODE(id) \ - PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetriveNode(#id)), \ - "pattern has no Node called %s", #id); \ - auto* id = subgraph.at(gpd.pattern().RetriveNode(#id)); \ +#define GET_NODE(id) \ + PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode(#id)), \ + "pattern has no Node called %s", #id); \ + auto* id = subgraph.at(gpd.pattern().RetrieveNode(#id)); \ PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); - auto handler = [&](const GraphPatternDetecter::subgraph_t& subgraph, + int found_fc_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { VLOG(4) << "handle FC fuse"; // Currently, there is no FC op available, so I will just simulate the @@ -178,10 +135,13 @@ std::unique_ptr FCFusePass::ApplyImpl( 
graph->RemoveNode(mul); graph->RemoveNode(elementwise_add); graph->RemoveNode(mul_out); // tmp variable + + found_fc_count++; }; gpd(graph.get(), handler); + AddStatis(found_fc_count); return graph; } diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h index eb43dd4486cda578804fb9f6438c67e9e4a03091..6c69539d1e48268afc2435f8f73b3818d13107cd 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detecter.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" namespace paddle { @@ -23,7 +24,7 @@ namespace ir { /* * Fuse the MUL and ELEMENTWISE_ADD to a FCOp. */ -class FCFusePass : public Pass { +class FCFusePass : public FusePassBase { public: virtual ~FCFusePass() {} diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index 87ba417b1a43475f48380009f8e5cd84699b8e40..06286a109d01af638e74e06ccc83e2a5500663ea 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -25,8 +25,13 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::vector& outputs) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); - op->SetInput("Xs", inputs); - op->SetOutput("Ys", outputs); + if (type == "mul") { + op->SetInput("X", {inputs[0]}); + op->SetInput("Y", {inputs[1]}); + } else if (type == "elementwise_add") { + op->SetInput("X", inputs); + } + op->SetOutput("Out", outputs); } // a->OP0->b diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc new file mode 100644 index 
0000000000000000000000000000000000000000..5852705b6b8d1c650faeae3dc810aac65353b459 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr FCLstmFusePass::ApplyImpl( + std::unique_ptr graph) const { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + std::unordered_set fused_ops({// first lstm + 13, 15, 16, + // second lstm + 23, 25, 26}); + + pattern->NewNode([&](Node* x) { return fused_ops.count(x->id()); }, + "any_node"); + + std::unordered_set marked_nodes; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + + auto* id = subgraph.at(gpd.pattern().RetrieveNode("any_node")); + marked_nodes.insert(id); + }; + gpd(graph.get(), handler); + + // Create New OpDesc + auto lstm_creator = [&](int lstm, int input, int weight_x, int weight_h, + int bias, int hidden, int cell, int xx) { +#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x); + GET_NODE(input); + GET_NODE(weight_x); + GET_NODE(weight_h); + GET_NODE(bias); + GET_NODE(hidden); + GET_NODE(cell); + GET_NODE(xx); + GET_NODE(lstm); + + OpDesc op_desc; + op_desc.SetType("fusion_lstm"); +#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()}); + SET_IN(X, 
input); + SET_IN(WeightX, weight_x); + SET_IN(WeightH, weight_h); + SET_IN(Bias, bias); +#undef GET_NODE +#undef SET_IN + + VLOG(4) << "hidden_n: " << hidden_n->Name(); + VLOG(4) << "cell: " << cell_n->Name(); + VLOG(4) << "xx: " << xx_n->Name(); + + op_desc.SetInput("H0", {}); + op_desc.SetInput("C0", {}); + op_desc.SetOutput("Hidden", {hidden_n->Name()}); + op_desc.SetOutput("Cell", {cell_n->Name()}); + op_desc.SetOutput("XX", {xx_n->Name()}); + op_desc.SetOutput("BatchedGate", {"blstm_0.tmp_2"}); + op_desc.SetOutput("BatchCellPreAct", {"blstm_1.tmp_2"}); + op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse")); + op_desc.SetAttr("use_peepholes", false); + auto* op = graph->CreateOpNode(&op_desc); + +#define LINK_TO(a, b) \ + a->outputs.push_back(b); \ + b->inputs.push_back(a); + LINK_TO(input_n, op); + LINK_TO(weight_x_n, op); + LINK_TO(weight_h_n, op); + LINK_TO(bias_n, op); + LINK_TO(op, hidden_n); +#undef LINK_TO + return op; + + }; + + lstm_creator(16, 12, 14, 18, 17, 22, 21, 19); + lstm_creator(26, 12, 24, 28, 27, 32, 31, 29); + + // remove all the nodes + + for (auto* node : marked_nodes) { + graph->RemoveNode(const_cast(node)); + } + + for (auto* node : graph->Nodes()) { + for (auto it = node->inputs.begin(); it != node->inputs.end();) { + if (marked_nodes.count(*it)) { + it = const_cast(node)->inputs.erase(it); + } else + it++; + } + for (auto it = node->outputs.begin(); it != node->outputs.end();) { + if (marked_nodes.count(*it)) { + it = const_cast(node)->outputs.erase(it); + } else + it++; + } + } + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass); diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..74b08ae558b12c9328db58687cd01edbc37291a8 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h 
@@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class FCLstmFusePass : public Pass { + public: + virtual ~FCLstmFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h new file mode 100644 index 0000000000000000000000000000000000000000..877bbeb502252cac77095981641d7ce283ca1eb7 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_pass_base.h @@ -0,0 +1,60 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace ir { + +static const char kParamScopeAttr[] = "__param_scope__"; +static const char kFuseStatisAttr[] = "__fuse_statis__"; + +class FusePassBase : public Pass { + public: + void Init(const std::string& repr, Graph* graph) const { + repr_ = repr; + graph_ = graph; + } + + Scope* param_scope() const { + PADDLE_ENFORCE(graph_->Has(kParamScopeAttr)); + return graph_->Get(kParamScopeAttr); + } + + void AddStatis(int count_of_fused) const { + PADDLE_ENFORCE(graph_); + PADDLE_ENFORCE(!repr_.empty()); + if (!graph_->Has(kFuseStatisAttr)) { + graph_->Set(kFuseStatisAttr, new std::unordered_map); + } + auto& info = + graph_->Get>(kFuseStatisAttr); + info[repr_] = count_of_fused; + } + + virtual ~FusePassBase() {} + + protected: + mutable Graph* graph_; + mutable std::string repr_; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 2a6bf4ac230df81b38751000bf4b663f24984db3..398f7095968e62f92d610f560d7574b27706d13e 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -87,6 +87,9 @@ bool IsDistTrainOp(ir::Node *node, const std::vector &send_vars, } Graph::Graph(const ProgramDesc &program) : program_(program) { + // Make the nodes id start from 0. 
+ Node::ResetId(); + VLOG(3) << "block in program:" << program_.Size(); std::unordered_map all_vars; for (auto *var : program.Block(0).AllVars()) { @@ -132,63 +135,6 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { } } - std::vector send_ops; - ir::Node *send_bar = nullptr; - std::vector recv_ops; - ir::Node *fetch_bar = nullptr; - for (ir::Node *node : Nodes()) { - if (node->Name() == "send") { - send_ops.push_back(node); - } else if (node->Name() == "send_barrier") { - PADDLE_ENFORCE(!send_bar, "only has one send barrier"); - send_bar = node; - } else if (node->Name() == "recv") { - recv_ops.push_back(node); - } else if (node->Name() == "fetch_barrier") { - PADDLE_ENFORCE(!fetch_bar, "only has one fetch barrier"); - fetch_bar = node; - } - } - if (send_bar) { - for (ir::Node *send : send_ops) { - ir::Node *dep_var = CreateControlDepVar(); - send->outputs.push_back(dep_var); - dep_var->inputs.push_back(send); - send_bar->inputs.push_back(dep_var); - dep_var->outputs.push_back(send_bar); - } - for (ir::Node *recv : recv_ops) { - ir::Node *dep_var = CreateControlDepVar(); - recv->inputs.push_back(dep_var); - dep_var->outputs.push_back(recv); - send_bar->outputs.push_back(dep_var); - dep_var->inputs.push_back(send_bar); - } - } - if (fetch_bar) { - for (ir::Node *recv : recv_ops) { - ir::Node *dep_var = CreateControlDepVar(); - recv->outputs.push_back(dep_var); - dep_var->inputs.push_back(recv); - fetch_bar->inputs.push_back(dep_var); - dep_var->outputs.push_back(fetch_bar); - } - } - - std::vector send_vars = FindDistTrainSendVars(send_ops); - std::vector recv_vars = FindDistTrainRecvVars(recv_ops); - for (ir::Node *node : Nodes()) { - if (IsDistTrainOp(node, send_vars, recv_vars)) { - if (fetch_bar && node->Name() == "concat") { - ir::Node *dep_var = CreateControlDepVar(); - fetch_bar->outputs.push_back(dep_var); - dep_var->inputs.push_back(fetch_bar); - node->inputs.push_back(dep_var); - dep_var->outputs.push_back(node); - } - } - } - /** * We 
should handle write after read(WAR) and write after write(WAW) here. * Because some of the operators of the program can be executed parallelly. diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 0d27be5fc007746d6ca41ff0dbcea5c5f45599ef..55e495a0ed75c3a09703438dcfe01ca8f9d36118 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -142,6 +142,16 @@ class Graph { nodes_.erase(node); } + // NOTE low performance, but simple and secure. + Node *RetriveNode(int id) { + for (auto &node : nodes_) { + if (node.second->id() == id) { + return node.second.get(); + } + } + return nullptr; + } + private: // This method takes ownership of `node`. ir::Node *AddNode(ir::Node *node) { @@ -157,6 +167,7 @@ class Graph { std::map> attr_dels_; std::map> nodes_; std::unordered_set node_set_; + int node_count_{0}; }; bool IsControlDepVar(const ir::Node &var); diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index dc81a2cac585b50b81f79f8f204ce1145d93eab0..62f94a1c0e5a300438bbe5fea34b9a07df5d9ebf 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -103,10 +103,10 @@ std::map> BuildOperationAdjList( for (auto &var : n->inputs) { for (auto &adj_n : var->inputs) { PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); - adj_list[n].insert(adj_n); VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) << " -> " << n->Name() << reinterpret_cast(n) << " via " << var->Name() << reinterpret_cast(var); + adj_list[n].insert(adj_n); } } } diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter.cc b/paddle/fluid/framework/ir/graph_pattern_detecter.cc deleted file mode 100644 index e197861251fe5c9f98eaaba2a10b4af371dcbcba..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/graph_pattern_detecter.cc +++ /dev/null @@ -1,220 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include - -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/graph_pattern_detecter.h" -#include "paddle/fluid/framework/ir/graph_traits.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { -namespace ir { - -size_t PDPattern::id_ = 0UL; - -PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) { - if (!name.empty()) { - PADDLE_ENFORCE_EQ(node_map_.count(name), 0, - "PDNode's name should be unique, get duplicate [%s]", - name); - } - - nodes_.emplace_back(new PDNode(std::move(teller), name)); - auto* cur = nodes_.back().get(); - node_map_[name] = cur; - return cur; -} - -PDNode* PDPattern::RetriveNode(const std::string& id) const { - auto it = node_map_.find(id); - if (it == node_map_.end()) { - return nullptr; - } - - return it->second; -} - -void PDPattern::AddEdge(PDNode* a, PDNode* b) { - PADDLE_ENFORCE(a); - PADDLE_ENFORCE(b); - PADDLE_ENFORCE(a != b, "can't connect to the same nodes."); - edges_.emplace_back(a, b); -} - -void GraphPatternDetecter::operator()(Graph* graph, - GraphPatternDetecter::handle_t handler) { - if (!MarkPDNodesInGraph(*graph)) return; - auto subgraphs = DetectPatterns(); - UniquePatterns(&subgraphs); - RemoveOverlappedMatch(&subgraphs); - - for (auto& g : subgraphs) { - handler(g, graph); - } -} - -bool 
GraphPatternDetecter::MarkPDNodesInGraph(const ir::Graph& graph) { - VLOG(4) << "mark pdnodes in graph"; - if (graph.Nodes().empty()) return false; - - for (auto& node : GraphTraits::DFS(graph)) { - for (const auto& pdnode : pattern_.nodes()) { - if (pdnode->Tell(&node)) { - VLOG(4) << "pdnode " << pdnode->name() << " marked"; - pdnodes2nodes_[pdnode.get()].insert(&node); - } - } - } - VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; - return !pdnodes2nodes_.empty(); -} - -struct HitGroup { - std::unordered_map roles; - - bool Match(Node* node, PDNode* pat) { - if (nodes_.count(node)) { - if (!roles.count(pat)) return false; - return roles[pat] == node; - } - return !roles.count(pat) || roles.at(pat) == node; - } - - void Register(Node* node, PDNode* pat) { - roles[pat] = node; - nodes_.insert(node); - } - - private: - std::unordered_set nodes_; -}; - -// Tell whether Node a links to b. -bool IsNodesLink(Node* a, Node* b) { - for (auto* node : a->outputs) { - if (b == node) { - return true; - } - } - return false; -} - -std::vector -GraphPatternDetecter::DetectPatterns() { - // Init empty subgraphs. - std::vector result; - std::vector init_groups; - PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed"); - auto* first_pnode = pattern_.edges().front().first; - if (!pdnodes2nodes_.count(first_pnode)) return result; - for (auto* node : pdnodes2nodes_[first_pnode]) { - HitGroup group; - group.roles[first_pnode] = node; - init_groups.emplace_back(group); - } - - int step = 0; - std::array, 2> bi_records; - bi_records[0] = std::move(init_groups); - - // Extend a PDNode to subgraphs by deducing the connection relations defined - // in edges of PDNodes. - for (const auto& edge : pattern_.edges()) { - VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name(); - // Each role has two PDNodes, which indicates two roles. - // Detect two Nodes that can match these two roles and they are connected. 
- auto& pre_groups = bi_records[step % 2]; - auto& cur_groups = bi_records[1 - (step++ % 2)]; - cur_groups.clear(); - // source -> target - for (Node* source : pdnodes2nodes_[edge.first]) { - for (Node* target : pdnodes2nodes_[edge.second]) { - // TODO(Superjomn) add some prune strategies. - for (const auto& group : pre_groups) { - HitGroup new_group = group; - if (IsNodesLink(source, target) && - new_group.Match(source, edge.first)) { - new_group.Register(source, edge.first); - if (new_group.Match(target, edge.second)) { - new_group.Register(target, edge.second); - cur_groups.push_back(new_group); - // TODO(Superjomn) need to unique - } - } - } - } - } - VLOG(3) << "step " << step << " get records: " << cur_groups.size(); - } - - for (auto& group : bi_records[step % 2]) { - GraphPatternDetecter::subgraph_t subgraph; - for (auto& role : group.roles) { - subgraph.emplace(role.first, role.second); - } - result.emplace_back(subgraph); - } - return result; -} - -void GraphPatternDetecter::UniquePatterns( - std::vector* subgraphs) { - if (subgraphs->empty()) return; - std::vector result; - - std::unordered_set set; - for (auto& g : *subgraphs) { - size_t key = 0; - for (auto& item : g) { - key ^= std::hash{}(item.first); - key ^= std::hash{}(item.second); - } - if (!set.count(key)) { - result.emplace_back(g); - set.insert(key); - } - } - *subgraphs = result; -} - -void GraphPatternDetecter::RemoveOverlappedMatch( - std::vector* subgraphs) { - std::vector result; - std::unordered_set node_set; - - for (const auto& subgraph : *subgraphs) { - bool valid = true; - for (auto& item : subgraph) { - if (node_set.count(item.second)) { - valid = false; - break; - } - } - if (valid) { - for (auto& item : subgraph) { - node_set.insert(item.second); - } - result.push_back(subgraph); - } - } - *subgraphs = result; -} - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter.h 
b/paddle/fluid/framework/ir/graph_pattern_detecter.h deleted file mode 100644 index 68c39902b5a79bf25ca7f08529a958274ac64e33..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/graph_pattern_detecter.h +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#ifdef PADDLE_WITH_TESTING -#include -#endif - -#include -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/node.h" - -namespace paddle { -namespace framework { -namespace ir { - -// Some basic torminolygies: -// - PDPattern: a pattern defined as a data flow graph. -// - PDNode: the node in the pattern, each PDNode represents an `ir::Node` -// that meets some conditions defined in `PDNode.teller`. -// - A pattern is defined with PDNodes with edges. - -// Pattern detector node. This node helps to build a pattern. -struct PDNode { - // tell whether an ir::Node* is a candidation for a PDNode. 
- using teller_t = std::function; - - PDNode(teller_t&& teller, const std::string& name = "") - : teller_(teller), name_(name) { - PADDLE_ENFORCE(teller_ != nullptr, "invalid teller functer is set."); - } - - PDNode(PDNode&& other) = default; - - std::vector inlinks; - std::vector outlinks; - - bool Tell(Node* node) const { - PADDLE_ENFORCE(teller_ != nullptr, "teller should be set for a PDNode"); - return teller_(node); - } - - const std::string& name() const { return name_; } - - PDNode(const PDNode&) = delete; - PDNode& operator=(const PDNode&) = delete; - - private: - teller_t teller_; - std::string name_; -}; - -/* - * A pattern in a graph, which defined with PDNode and edges. Most graph - * patterns can be divided into PDNodes and link relations between them. - * - * For example, the FC fusion need to filter the MUL and ELEMENTWISE_ADD - * operators from the computation graph, the MUL's output should have only one - * consumer which is the ELEMENTWISE_ADD. - * This pattern can be defined as with the following pseudo codes - * - * // Create two operator PDNodes. - * MUL = PDPattern.NewNode() - * ELE = PDPattern.NewNode() - * // Create the variable PDNodes. - * MUL_out = PDPattern.NewNode() - * // Add teller to define some rules that help to filter the target Nodes. - * MUL.teller = lambda(node): node->IsOp() && node->Op()->Type == "mul"; - * ELE.teller = lambda(node): \ - * node->IsOp() && node->Op()->Type == "elementwise_add"; - * MUL_out.teller = lambda(node): node->IsVar() && (MUL in node->inputs) - * && (ELE in node->outputs) - * - * One can add more specific tellers for PDNodes or edges, both the Operator - * and Variable Nodes can be ruled in PDNode.teller. - * - * PDPattern can record the general patterns, such as the pattern represents - * - Op in CPU -> Op in GPU -> Op in CPU, to findout the IO abnormal place. 
- * - Ops whose inputs and outputs share the same variables - */ -class PDPattern { - public: - using edge_t = std::pair; - - void AddEdge(PDNode* a, PDNode* b); - - PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID()); - PDNode* RetriveNode(const std::string& id) const; - - const std::vector>& nodes() const { return nodes_; } - const std::vector& edges() const { return edges_; } - - private: -#ifdef PADDLE_WITH_TESTING - FRIEND_TEST(PDPattern, AddEdge); - FRIEND_TEST(PDPattern, NewNode); -#endif - - static std::string NewID() { return "pdnode-" + std::to_string(id_++); } - - std::vector> nodes_; - std::vector edges_; - std::unordered_map node_map_; - static size_t id_; -}; - -/* - * GraphPatternDetecter helps to detect the specific patterns in the graph. - * Input a pattern, output a list of the matched subgraphs/nodes. - * This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.). - * - * The algorithm has three phases: - * 1. Mark the nodes that match the defined PDNodes in a PDPattern, - * 2. Extend a PDNode to subgraphs by deducing the connection relation defined - * in PAPattern(the edges), - * 3. Get the filtered subgraphs and treat them with a pre-defined handler. - * - * Usage: - * // Create a detector - * GraphPatternDetecter detector; - * // Define the detector's pattern, by adding PDNode and define the edges. - * auto* node0 = detector.mutable_pattern().AddNode(...) - * auto* node1 = detector.mutable_pattern().AddNode(...) - * node0->teller = some lambda. - * node1->teller = some lambda. - * detector.mutable_pattern().AddEdge(node0, node1); - * // Create an handler, to define the behavior of treating the filtered - * // subgraphs that comply with the patterns. - * GraphPatternDetecter::handle_t handler = some labmda - * // Execute the detector. - * detector(&graph, handler); - */ -class GraphPatternDetecter { - public: - using subgraph_t = std::unordered_map; - - // Operate on the detected pattern. 
- using handle_t = - std::function; - - void operator()(Graph* graph, handle_t handler); - - const PDPattern& pattern() const { return pattern_; } - PDPattern* mutable_pattern() { return &pattern_; } - - private: - // Mark the nodes that fits the pattern. - bool MarkPDNodesInGraph(const ir::Graph& graph); - - // Detect all the pattern and output the hit records. - std::vector DetectPatterns(); - - // Remove duplicate patterns. - void UniquePatterns(std::vector* subgraphs); - - // Remove overlapped match subgraphs, when overlapped, keep the previous one. - void RemoveOverlappedMatch(std::vector* subgraphs); - -#ifdef PADDLE_WITH_TESTING - FRIEND_TEST(GraphPatternDetecter, MarkPDNodesInGraph); - FRIEND_TEST(GraphPatternDetecter, DetectPatterns); -#endif - - private: - using hit_rcd_t = - std::pair; - PDPattern pattern_; - std::vector marked_records_; - std::unordered_map> pdnodes2nodes_; -}; - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc new file mode 100644 index 0000000000000000000000000000000000000000..945ab110b148c320b6626cadaa47d483df68419e --- /dev/null +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -0,0 +1,444 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +size_t PDPattern::id_ = 0UL; + +PDNode* PDPattern::NewNode(const std::string& name) { + if (!name.empty()) { + PADDLE_ENFORCE_EQ(node_map_.count(name), 0, + "PDNode's name should be unique, get duplicate [%s]", + name); + } + + nodes_.emplace_back(new PDNode(this, name)); + auto* cur = nodes_.back().get(); + node_map_[name] = cur; + return cur; +} + +PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) { + if (!name.empty()) { + PADDLE_ENFORCE_EQ(node_map_.count(name), 0, + "PDNode's name should be unique, get duplicate [%s]", + name); + } + + nodes_.emplace_back(new PDNode(std::move(teller), this, name)); + auto* cur = nodes_.back().get(); + node_map_[name] = cur; + return cur; +} + +PDNode* PDPattern::RetrieveNode(const std::string& id) const { + auto it = node_map_.find(id); + if (it == node_map_.end()) { + return nullptr; + } + + return it->second; +} + +void PDPattern::AddEdge(PDNode* a, PDNode* b) { + PADDLE_ENFORCE(a); + PADDLE_ENFORCE(b); + PADDLE_ENFORCE(a != b, "can't connect to the same nodes."); + edges_.emplace_back(a, b); +} + +void GraphPatternDetector::operator()(Graph* graph, + GraphPatternDetector::handle_t handler) { + if (!MarkPDNodesInGraph(*graph)) return; + auto subgraphs = DetectPatterns(); + UniquePatterns(&subgraphs); + RemoveOverlappedMatch(&subgraphs); + ValidateByNodeRole(&subgraphs); + + if (subgraphs.empty()) return; + LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern"; + int id = 0; + for (auto& g : subgraphs) { + LOG(INFO) << "optimizing #" << id++ << " subgraph"; + handler(g, graph); + } +} + +bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) { + VLOG(4) 
<< "mark pdnodes in graph"; + if (graph.Nodes().empty()) return false; + + for (auto& node : GraphTraits::DFS(graph)) { + for (const auto& pdnode : pattern_.nodes()) { + if (pdnode->Tell(&node)) { + VLOG(4) << "pdnode " << pdnode->name() << " marked"; + pdnodes2nodes_[pdnode.get()].insert(&node); + } + } + } + // Check to early stop if some PDNode can't find matched Node. + for (auto& pdnode : pattern_.nodes()) { + if (!pdnodes2nodes_.count(pdnode.get())) { + VLOG(4) << pdnode->name() << " can't find matched Node, early stop"; + + return false; + } + } + VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; + return !pdnodes2nodes_.empty(); +} + +// The intermediate Nodes can only link to the nodes inside the pattern, or this +// subgraph will be droped. +void GraphPatternDetector::ValidateByNodeRole( + std::vector* subgraphs) { + std::vector result; + + subgraphs->erase( + std::remove_if( + subgraphs->begin(), subgraphs->end(), + [](const GraphPatternDetector::subgraph_t& subgraph) -> bool { + // Collect the inputs and outputs. + std::unordered_set ios; + for (auto& item : subgraph) { + if (!item.first->IsIntermediate()) { + ios.insert(item.second); + } + } + for (auto& item : subgraph) { + if (item.first->IsIntermediate()) { + for (auto* x : item.second->inputs) { + if (!ios.count(x)) { + return true; + } + } + for (auto* x : item.second->outputs) { + if (!ios.count(x)) { + return true; + } + } + } + } + return false; + }), + subgraphs->end()); +} + +struct HitGroup { + std::unordered_map roles; + + bool Match(Node* node, PDNode* pat) { + if (nodes_.count(node)) { + if (!roles.count(pat)) return false; + return roles[pat] == node; + } + return !roles.count(pat) || roles.at(pat) == node; + } + + void Register(Node* node, PDNode* pat) { + roles[pat] = node; + nodes_.insert(node); + } + + private: + std::unordered_set nodes_; +}; + +// Tell whether Node a links to b. 
+bool IsNodesLink(Node* a, Node* b) { + for (auto* node : a->outputs) { + if (b == node) { + return true; + } + } + return false; +} + +std::vector +GraphPatternDetector::DetectPatterns() { + // Init empty subgraphs. + std::vector result; + std::vector init_groups; + std::array, 2> bi_records; + // PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed"); + auto* first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get() + : pattern_.edges().front().first; + if (!pdnodes2nodes_.count(first_pnode)) return result; + for (auto* node : pdnodes2nodes_[first_pnode]) { + HitGroup group; + group.roles[first_pnode] = node; + init_groups.emplace_back(group); + } + + int step = 0; + bi_records[0] = std::move(init_groups); + + // Extend a PDNode to subgraphs by deducing the connection relations defined + // in edges of PDNodes. + for (const auto& edge : pattern_.edges()) { + VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name(); + // TODO(Superjomn) Fix bug here, the groups might be duplicate here. + // Each role has two PDNodes, which indicates two roles. + // Detect two Nodes that can match these two roles and they are connected. + auto& pre_groups = bi_records[step % 2]; + auto& cur_groups = bi_records[1 - (step++ % 2)]; + cur_groups.clear(); + if (pre_groups.empty()) break; + // source -> target + for (Node* source : pdnodes2nodes_[edge.first]) { + for (Node* target : pdnodes2nodes_[edge.second]) { + VLOG(8) << "check " << source->id() << " -- " << target->id(); + // TODO(Superjomn) add some prune strategies. 
+ for (const auto& group : pre_groups) { + HitGroup new_group = group; + if (IsNodesLink(source, target) && + new_group.Match(source, edge.first)) { + new_group.Register(source, edge.first); + if (new_group.Match(target, edge.second)) { + new_group.Register(target, edge.second); + cur_groups.push_back(new_group); + // TODO(Superjomn) need to unique + } + } + } + } + } + VLOG(3) << "step " << step << " get records: " << cur_groups.size(); + for (auto& group : cur_groups) { + for (auto& item : group.roles) { + VLOG(4) << "node " << item.second->id() << " as " << item.first->name(); + } + VLOG(4) << "========================================================="; + } + } + + for (auto& group : bi_records[step % 2]) { + GraphPatternDetector::subgraph_t subgraph; + for (auto& role : group.roles) { + subgraph.emplace(role.first, role.second); + } + result.emplace_back(subgraph); + } + return result; +} + +void GraphPatternDetector::UniquePatterns( + std::vector* subgraphs) { + if (subgraphs->empty()) return; + std::vector result; + + std::unordered_set set; + for (auto& g : *subgraphs) { + size_t key = 0; + for (auto& item : g) { + key ^= std::hash{}(item.first); + key ^= std::hash{}(item.second); + } + if (!set.count(key)) { + result.emplace_back(g); + set.insert(key); + } + } + *subgraphs = result; +} + +void GraphPatternDetector::RemoveOverlappedMatch( + std::vector* subgraphs) { + std::vector result; + std::unordered_set node_set; + + for (const auto& subgraph : *subgraphs) { + bool valid = true; + for (auto& item : subgraph) { + if (node_set.count(item.second)) { + valid = false; + break; + } + } + if (valid) { + for (auto& item : subgraph) { + node_set.insert(item.second); + } + result.push_back(subgraph); + } + } + *subgraphs = result; +} + +std::string PDPattern::DotString() const { + using inference::analysis::Dot; + Dot dot; + int id = 0; + // Create Nodes + std::unordered_map node2dot; + for (const auto& node : nodes()) { + std::string node_id = "Node" + 
std::to_string(id++); + dot.AddNode(node_id, {}, node->name()); + node2dot[node.get()] = node_id; + } + // Create Edges + for (const auto& edge : edges()) { + if (!node2dot.count(edge.first) || !node2dot.count(edge.second)) { + LOG(ERROR) << "no node " << edge.first << " " << edge.second; + continue; + } + auto& src = node2dot.at(edge.first); + auto& trg = node2dot.at(edge.second); + dot.AddEdge(src, trg, {}); + } + return dot.Build(); +} + +PDNode& PDNode::LinksTo(const std::vector& others) { + // extend outlinks. + for (PDNode* x : others) { + pattern_->AddEdge(this, x); + } + return *this; +} + +PDNode& PDNode::LinksFrom(const std::vector& others) { + // extend outlinks. + for (PDNode* x : others) { + pattern_->AddEdge(x, this); + } + return *this; +} + +PDNode* PDNode::assert_is_op() { + asserts_.emplace_back([this](Node* x) { return x && x->IsOp(); }); + return this; +} +PDNode* PDNode::assert_is_op(const std::string& op_type) { + asserts_.emplace_back([this, op_type](Node* x) { + return x && x->IsOp() && x->Op()->Type() == op_type; + }); + return this; +} +PDNode* PDNode::assert_is_var() { + asserts_.emplace_back([this](Node* x) { return x && x->IsVar(); }); + return this; +} +PDNode* PDNode::assert_var_not_persistable() { + assert_is_var(); + asserts_.emplace_back([this](Node* x) { return !x->Var()->Persistable(); }); + return this; +} +PDNode* PDNode::assert_is_persistable_var() { + assert_is_var(); + asserts_.emplace_back([=](Node* x) { return x->Var()->Persistable(); }); + return this; +} +PDNode* PDNode::assert_is_op_nth_input(const std::string& op_type, + const std::string& argument, int nth) { + assert_is_var(); + assert_is_op_input(op_type); + asserts_.emplace_back([=](Node* x) { + for (auto* op : x->outputs) { + if (IsNthInput(x, op, argument, nth)) return true; + } + return false; + }); + return this; +} +PDNode* PDNode::assert_is_op_nth_output(const std::string& op_type, + const std::string& argument, int nth) { + assert_is_var(); + 
asserts_.emplace_back([=](Node* x) { + for (auto* op : x->inputs) { + if (IsNthOutput(x, op, argument, nth)) return true; + } + return false; + }); + return this; +} +PDNode* PDNode::assert_is_only_input_of_op(const std::string& op_type) { + assert_is_var(); + asserts_.emplace_back([=](Node* x) { + for (auto* op : x->outputs) { + if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type && + op->inputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} +PDNode* PDNode::assert_is_only_output_of_op(const std::string& op_type) { + assert_is_var(); + asserts_.emplace_back([=](Node* x) { + for (auto* op : x->inputs) { + if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type && + op->outputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} +PDNode* PDNode::assert_is_op_output(const std::string& op_type) { + assert_is_var(); + asserts_.emplace_back([=](Node* x) { + for (auto* op : x->inputs) { + if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) { + return true; + } + } + return false; + }); + return this; +} +PDNode* PDNode::assert_is_op_input(const std::string& op_type) { + assert_is_var(); + asserts_.emplace_back([=](Node* x) { + for (auto* op : x->outputs) { + if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) { + return true; + } + } + return false; + }); + return this; +} +PDNode* PDNode::assert_op_has_n_inputs(const std::string& op_type, size_t n) { + assert_is_op(op_type); + asserts_.emplace_back([=](Node* x) { return x->inputs.size() == n; }); + return this; +} +PDNode* PDNode::assert_op_has_n_outputs(const std::string& op_type, size_t n) { + assert_is_op(op_type); + asserts_.emplace_back([=](Node* x) { return x->outputs.size() == n; }); + return this; +} +PDNode* PDNode::assert_more(PDNode::teller_t&& teller) { + asserts_.emplace_back(std::move(teller)); + return this; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git 
a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h new file mode 100644 index 0000000000000000000000000000000000000000..f8488c84962d1caa6e7817b3c0349d6da3a59182 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -0,0 +1,321 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_TESTING +#include +#endif + +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/inference/analysis/dot.h" + +namespace paddle { +namespace framework { +namespace ir { +class PDPattern; + +// Some basic terminologies: +// - PDPattern: a pattern defined as a data flow graph. +// - PDNode: the node in the pattern, each PDNode represents an `ir::Node` +// that meets some conditions defined in `PDNode.teller`. +// - A pattern is defined with PDNodes with edges. + +// Pattern detector node. This node helps to build a pattern. +struct PDNode { + // tell whether an ir::Node* is a candidation for a PDNode. + using teller_t = std::function; + enum class Type { kOp, kVar }; + enum class Role { + kUnknown, // No role, + kInput, // an input and will be retained, + kOutput, // an output and will be retained, + kIntermediate // will be removed after handler. 
+ }; + + // this link to others + PDNode& LinksTo(const std::vector& others); + PDNode& LinksFrom(const std::vector& others); + + bool Tell(Node* node) const { + if (teller_) return teller_(node); + + for (auto& asrt : asserts_) { + if (!asrt(node)) return false; + } + return true; + } + + bool IsOp() const { return type_ == Type::kOp; } + bool IsVar() const { return type_ == Type::kVar; } + + const std::string& name() const { return name_; } + + PDNode& operator=(const PDNode&) = delete; + PDNode(const PDNode&) = delete; + + // Mark this node is an Input of a subgraph and will be retained. + PDNode* AsInput() { + role_ = Role::kInput; + return this; + } + // Mark this node is an Output of a subgraph and will be retained. + PDNode* AsOutput() { + role_ = Role::kOutput; + return this; + } + // Mark this node will be removed, so all the links should be inside a matched + // sub-graph. + PDNode* AsIntermediate() { + role_ = Role::kIntermediate; + return this; + } + + bool IsIntermediate() const { return role_ == Role::kIntermediate; } + bool IsInput() const { return role_ == Role::kInput; } + bool IsOutput() const { return role_ == Role::kOutput; } + + // Assertions, helper functions to simplify the pattern definition. 
+ PDNode* assert_is_op(); + PDNode* assert_is_op(const std::string& op_type); + PDNode* assert_is_var(); + PDNode* assert_var_not_persistable(); + PDNode* assert_is_persistable_var(); + PDNode* assert_is_op_output(const std::string& op_type); + PDNode* assert_is_op_input(const std::string& op_type); + PDNode* assert_is_op_nth_input(const std::string& op_type, + const std::string& argument, int nth); + PDNode* assert_is_op_nth_output(const std::string& op_type, + const std::string& argument, int nth); + PDNode* assert_is_only_input_of_op(const std::string& op_type); + PDNode* assert_is_only_output_of_op(const std::string& op_type); + PDNode* assert_op_has_n_inputs(const std::string& op_type, size_t n); + PDNode* assert_op_has_n_outputs(const std::string& op_type, size_t n); + PDNode* assert_more(teller_t&& teller); + + private: + PDNode(PDPattern* pattern, const std::string& name = "", + Type type = Type::kVar) + : pattern_(pattern), name_(name), type_(type) {} + PDNode(teller_t&& teller, PDPattern* pattern, const std::string& name = "", + Type type = Type::kVar) + : teller_(std::move(teller)), + pattern_(pattern), + name_(name), + type_(type) { + PADDLE_ENFORCE(teller_ != nullptr, "invalid teller functer is set."); + } + + PDNode(PDNode&& other) = default; + + friend class PDPattern; + + // Will removed latter. + teller_t teller_; + std::vector asserts_; + PDPattern* pattern_; + std::string name_; + Type type_; + Role role_{Role::kUnknown}; +}; + +/* + * A pattern in a graph, which defined with PDNode and edges. Most graph + * patterns can be divided into PDNodes and link relations between them. + * + * For example, the FC fusion need to filter the MUL and ELEMENTWISE_ADD + * operators from the computation graph, the MUL's output should have only one + * consumer which is the ELEMENTWISE_ADD. + * This pattern can be defined as with the following pseudo codes + * + * // Create two operator PDNodes. 
+ * MUL = PDPattern.NewNode().assert_is_op("mul"); + * ELE = PDPattern.NewNode().assert_is_op("elementwise_add"); + * // Create the variable PDNodes. + * MUL_out = PDPattern.NewNode().assert_is_op_output("mul") \ + * .assert_is_op_input("elementwise_add") \ + * .AsIntermediate(); + * // Add relations. + * MUL->LinksTo({MUL_out}); + * MUL_out->LinksTo({ELE}); + * + * One can add more specific asserts for PDNodes or edges, both the Operator + * and Variable Nodes can be ruled in PDNode.assert_more(...). + * + * PDPattern can record the general patterns, such as the pattern represents + * - Op in CPU -> Op in GPU -> Op in CPU, to findout the IO abnormal place. + * - Ops whose inputs and outputs share the same variables + */ +class PDPattern { + public: + using edge_t = std::pair; + + void AddEdge(PDNode* a, PDNode* b); + + PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID()); + PDNode* NewNode(const std::string& name = NewID()); + PDNode* RetrieveNode(const std::string& id) const; + + const std::vector>& nodes() const { return nodes_; } + const std::vector& edges() const { return edges_; } + + std::string DotString() const; + + private: +#ifdef PADDLE_WITH_TESTING + FRIEND_TEST(PDPattern, AddEdge); + FRIEND_TEST(PDPattern, NewNode); +#endif + + static std::string NewID() { return "pdnode-" + std::to_string(id_++); } + + std::vector> nodes_; + std::vector edges_; + std::unordered_map node_map_; + static size_t id_; +}; + +/* + * GraphPatternDetector helps to detect the specific patterns in the graph. + * Input a pattern, output a list of the matched subgraphs/nodes. + * This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.). + * + * The algorithm has three phases: + * 1. Mark the nodes that match the defined PDNodes in a PDPattern, + * 2. Extend a PDNode to subgraphs by deducing the connection relation defined + * in PAPattern(the edges), + * 3. Get the filtered subgraphs and treat them with a pre-defined handler. 
+ * + * Usage: + * // Create a detector + * GraphPatternDetector detector; + * // Define the detector's pattern, by adding PDNode and define the edges. + * auto* node0 = detector.mutable_pattern().AddNode(...) + * auto* node1 = detector.mutable_pattern().AddNode(...) + * node0->teller = some lambda. + * node1->teller = some lambda. + * detector.mutable_pattern().AddEdge(node0, node1); + * // Create an handler, to define the behavior of treating the filtered + * // subgraphs that comply with the patterns. + * GraphPatternDetector::handle_t handler = some labmda + * // Execute the detector. + * detector(&graph, handler); + */ +class GraphPatternDetector { + public: + using subgraph_t = std::unordered_map; + + // Operate on the detected pattern. + using handle_t = + std::function; + + void operator()(Graph* graph, handle_t handler); + + const PDPattern& pattern() const { return pattern_; } + PDPattern* mutable_pattern() { return &pattern_; } + + private: + // Mark the nodes that fits the pattern. + bool MarkPDNodesInGraph(const ir::Graph& graph); + + // Detect all the pattern and output the hit records. + std::vector DetectPatterns(); + + // Remove duplicate patterns. + void UniquePatterns(std::vector* subgraphs); + + // Remove overlapped match subgraphs, when overlapped, keep the previous one. + void RemoveOverlappedMatch(std::vector* subgraphs); + + // Validate whether the intermediate nodes are linked by external nodes. + void ValidateByNodeRole(std::vector* subgraphs); + +#ifdef PADDLE_WITH_TESTING + FRIEND_TEST(GraphPatternDetecter, MarkPDNodesInGraph); + FRIEND_TEST(GraphPatternDetecter, DetectPatterns); +#endif + + private: + using hit_rcd_t = + std::pair; + PDPattern pattern_; + std::unordered_map> pdnodes2nodes_; +}; + +// some helper methods. + +// Op's input. 
+static bool VarLinksToOp(Node* node, const std::string& op_type) { + for (auto* out : node->outputs) { + if (out->IsOp() && out->Op()->Type() == op_type) { + return true; + } + } + return false; +} + +// Op's output. +static bool VarLinksFromOp(Node* node, const std::string& op_type) { + for (auto* out : node->inputs) { + if (out->IsOp() && out->Op()->Type() == op_type) { + return true; + } + } + return false; +} + +// Check whether a var node is a op node's nth input. +static bool IsNthInput(Node* var, Node* op, const std::string& argument, + size_t nth) { + PADDLE_ENFORCE(var->IsVar()); + PADDLE_ENFORCE(op->IsOp()); + if (op->inputs.size() <= nth) return false; + return var->Name() == op->Op()->Input(argument)[nth]; +} + +static bool IsNthOutput(Node* var, Node* op, const std::string& argument, + size_t nth) { + PADDLE_ENFORCE(var->IsVar()); + PADDLE_ENFORCE(op->IsOp()); + if (op->inputs.size() <= nth) return false; + return var->Name() == op->Op()->Output(argument)[nth]; +} + +static void GraphSafeRemoveNodes(Graph* graph, + const std::unordered_set& nodes) { + for (auto* node : nodes) { + graph->RemoveNode(const_cast(node)); + } + + for (auto* node : graph->Nodes()) { + for (auto it = node->inputs.begin(); it != node->inputs.end();) { + if (nodes.count(*it)) { + it = const_cast(node)->inputs.erase(it); + } else + it++; + } + for (auto it = node->outputs.begin(); it != node->outputs.end();) { + if (nodes.count(*it)) { + it = const_cast(node)->outputs.erase(it); + } else + it++; + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc similarity index 79% rename from paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc rename to paddle/fluid/framework/ir/graph_pattern_detector_tester.cc index 06f9df5546910f492c9dd1da3e694623898d3d1d..7e5c86b033a7c69a306491cf4bf8d099018c5f19 100644 --- 
a/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/graph_pattern_detecter.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include @@ -82,7 +82,7 @@ TEST(PDPattern, AddEdge) { } TEST(GraphPatternDetecter, MarkPDNodesInGraph) { - GraphPatternDetecter x; + GraphPatternDetector x; // mark o2, o3, v2 // The pattern is a graph: @@ -131,7 +131,7 @@ TEST(GraphPatternDetecter, MultiSubgraph) { Graph graph(program); BuildGraph(&graph); - GraphPatternDetecter x; + GraphPatternDetector x; // The pattern is a graph: // op -> var @@ -149,8 +149,8 @@ TEST(GraphPatternDetecter, MultiSubgraph) { x.mutable_pattern()->AddEdge(any_var, any_op1); int count = 0; - GraphPatternDetecter::handle_t handle = [&]( - const GraphPatternDetecter::subgraph_t& s, Graph* g) { + GraphPatternDetector::handle_t handle = [&]( + const GraphPatternDetector::subgraph_t& s, Graph* g) { LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> " << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name(); count++; @@ -167,6 +167,39 @@ TEST(GraphPatternDetecter, MultiSubgraph) { ASSERT_LE(count, 2); } +TEST(GraphPatternDetector, IntermediateCheck) { + ProgramDesc program; + Graph graph(program); + BuildGraph(&graph); + + // o2->v2->o3 + // o2->v2->o4 + // check o2+o3 fuse, should fail because v2 also link to o4. 
+ GraphPatternDetector detector; + auto* op2 = detector.mutable_pattern()->NewNode( + [](Node* x) { return x && x->IsOp() && x->Name() == "op2"; }, "op2"); + auto* op3 = detector.mutable_pattern()->NewNode( + [](Node* x) { return x && x->IsOp() && x->Name() == "op3"; }, "op3"); + auto* v2 = + detector.mutable_pattern() + ->NewNode( + [](Node* x) { return x && x->IsVar() && x->Name() == "var2"; }, + "var2") + ->AsIntermediate(); + v2->LinksFrom({op2}).LinksTo({op3}); + + int count = 0; + detector(&graph, [&](const GraphPatternDetector::subgraph_t& g, + Graph* graph) { ++count; }); + EXPECT_EQ(count, 0); + + count = 0; + v2->AsInput(); + detector(&graph, [&](const GraphPatternDetector::subgraph_t& g, + Graph* graph) { ++count; }); + ASSERT_EQ(count, 1); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..414d8f79b15de091c62af5fe099ffae144156e4e --- /dev/null +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" + +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr GraphToProgramPass::ApplyImpl( + std::unique_ptr graph) const { + ProgramDesc& program = Get("program"); + + std::unique_ptr program_pb( + new proto::ProgramDesc(*program.Proto())); + + auto block = program_pb->mutable_blocks(kRootBlockIndex); + block->clear_vars(); + std::unordered_set visited_vars; + for (ir::Node* n : graph->Nodes()) { + if (n->NodeType() == ir::Node::Type::kVariable) { + if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) { + visited_vars.insert(n->Var()->Name()); + block->add_vars()->MergeFrom(*n->Var()->Proto()); + } + } + } + + block->clear_ops(); + std::vector nodes = TopologySortOperations(*graph); + for (ir::Node* n : nodes) { + if (!n->Op()) { + continue; + } + block->add_ops()->MergeFrom(*n->Op()->Proto()); + } + + program.CopyFrom(*program_pb); + return graph; +} +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(graph_to_program_pass, paddle::framework::ir::GraphToProgramPass); diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..124ec5a8e771fb768b31fa2e9f5143db96154490 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_to_program_pass.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class GraphToProgramPass : public Pass { + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5d51d9751a28d2b1549096b1984d67b55f913da6 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" + +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +void BuildNoCircleGraph(Graph* g) { + OpDesc op1; + op1.SetType("op1"); + OpDesc op2; + op2.SetType("op2"); + OpDesc op3; + op3.SetType("op3"); + OpDesc op4; + op4.SetType("op4"); + OpDesc op5; + op5.SetType("op5"); + VarDesc var1("var1"); + VarDesc var2("var2"); + VarDesc var3("var3"); + VarDesc var4("var4"); + + ir::Node* o1 = g->CreateOpNode(&op1); + ir::Node* o2 = g->CreateOpNode(&op2); + ir::Node* o3 = g->CreateOpNode(&op3); + ir::Node* o4 = g->CreateOpNode(&op4); + ir::Node* o5 = g->CreateOpNode(&op5); + ir::Node* v1 = g->CreateVarNode(&var1); + ir::Node* v2 = g->CreateVarNode(&var2); + ir::Node* v3 = g->CreateVarNode(&var3); + ir::Node* v4 = g->CreateVarNode(&var4); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + v2->inputs.push_back(o2); + // o4->v3->o5 + o4->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o4); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +TEST(GraphToProgramPass, Basic) { + ProgramDesc prog; + std::unique_ptr g(new Graph(prog)); + BuildNoCircleGraph(g.get()); + + auto pass = paddle::framework::ir::PassRegistry::Instance().Get( + "graph_to_program_pass"); + + ProgramDesc compiled_prog; + pass->SetNotOwned("program", &compiled_prog); + pass->Apply(std::move(g)); + std::vector ops = compiled_prog.Block(0).AllOps(); + EXPECT_EQ(ops[0]->Type(), "op1"); + EXPECT_EQ(ops[1]->Type(), "op2"); + if (ops[2]->Type() == "op3") { + 
EXPECT_EQ(ops[3]->Type(), "op4"); + } else if (ops[2]->Type() == "op4") { + EXPECT_EQ(ops[3]->Type(), "op3"); + } + EXPECT_EQ(ops[4]->Type(), "op5"); + + std::unordered_set vars; + for (VarDesc* v : compiled_prog.Block(0).AllVars()) { + vars.insert(v->Name()); + } + EXPECT_TRUE(vars.find("var1") != vars.end()); + EXPECT_TRUE(vars.find("var2") != vars.end()); + EXPECT_TRUE(vars.find("var3") != vars.end()); +} +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index e7ff0c1dac134334e3baad88886862ebff0fe367..4c7ffe69e933de3d52c8f762a1eeb73de17e0561 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -16,11 +16,27 @@ limitations under the License. */ #include #include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/inference/analysis/dot.h" +#include "paddle/fluid/string/printf.h" namespace paddle { namespace framework { namespace ir { -static const char kGraphVizPath[] = "graph_viz_path"; +using inference::analysis::Dot; +namespace { +const char kGraphVizPath[] = "graph_viz_path"; + +std::string FormatName(const Node* node) { + if (!node->IsOp() || !node->Op() || + !node->Op()->HasAttr(OpProtoAndCheckerMaker::OpNamescopeAttrName())) { + return node->Name(); + } + const std::string full_scope = boost::get( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpNamescopeAttrName())); + return string::Sprintf("%s%s", full_scope.c_str(), node->Name().c_str()); +} +} // namespace std::unique_ptr GraphVizPass::ApplyImpl( std::unique_ptr graph) const { @@ -30,41 +46,65 @@ std::unique_ptr GraphVizPass::ApplyImpl( PADDLE_ENFORCE(fout->good()); std::ostream& sout = *fout; - size_t var_id = 0; - std::unordered_map vars; - - sout << "digraph G {\n"; - - for (const ir::Node* n : graph->Nodes()) { - if 
(n->NodeType() != ir::Node::Type::kVariable) continue; - size_t cur_var_id = var_id++; - vars[n] = cur_var_id; - - sout << "var_" << cur_var_id << " [label=\"" << n->Name() << "\"]" - << std::endl; - } - - size_t op_id = 0; - for (const ir::Node* n : graph->Nodes()) { - if (n->NodeType() != ir::Node::Type::kOperation) continue; - std::string op_name = "op_" + std::to_string(op_id++); - sout << op_name << " [label=\"" << n->Name() << "\", shape=rect]" - << std::endl; - for (auto in : n->inputs) { - std::string var_name = "var_" + std::to_string(vars[in]); - sout << var_name << " -> " << op_name << std::endl; + std::unordered_map node2dot; + + Dot dot; + + std::vector op_attrs({Dot::Attr("style", "filled"), + Dot::Attr("shape", "box"), + Dot::Attr("fillcolor", "red")}); + std::vector var_attrs({Dot::Attr("style", "filled,rounded"), + // Dot::Attr("shape", "diamond"), + Dot::Attr("fillcolor", "yellow")}); + + std::vector marked_op_attrs({Dot::Attr("style", "filled"), + Dot::Attr("shape", "box"), + Dot::Attr("fillcolor", "lightgray")}); + std::vector marked_var_attrs( + {Dot::Attr("style", "filled,rounded"), + // Dot::Attr("shape", "diamond"), + Dot::Attr("fillcolor", "lightgray")}); + + auto marked_nodes = ConsumeMarkedNodes(graph.get()); + // Create nodes + for (const Node* n : graph->Nodes()) { + std::string node_id = FormatName(n) + "(" + std::to_string(n->id()) + ")"; + if (n->IsOp()) { + decltype(op_attrs) attr = + marked_nodes.count(n) ? marked_op_attrs : op_attrs; + dot.AddNode(node_id, attr, node_id); + } else if (n->IsVar()) { + decltype(op_attrs) attr = + marked_nodes.count(n) ? 
marked_var_attrs : var_attrs; + dot.AddNode(node_id, attr, node_id); } - - for (auto out : n->outputs) { - std::string var_name = "var_" + std::to_string(vars[out]); - sout << op_name << " -> " << var_name << std::endl; + node2dot[n] = node_id; + } + // Create edges + for (const Node* n : graph->Nodes()) { + const auto& src_id = node2dot.at(n); + for (auto* out : n->outputs) { + const auto& trg_id = node2dot.at(out); + dot.AddEdge(src_id, trg_id, {}); } } - sout << "}\n"; + sout << dot.Build(); + return graph; } +GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes( + Graph* graph) const { + marked_nodes_t res; + if (graph->Has(kGraphvizMarkedNodeAttr)) { + auto& attr = graph->Get(kGraphvizMarkedNodeAttr); + res = attr; + attr.clear(); + } + return res; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h index 1fd8c8a26e9581ccf605d4271a49ec2e90d8b997..8d885cb9e4ee6e01de386b0f22423988dbe60ca6 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.h +++ b/paddle/fluid/framework/ir/graph_viz_pass.h @@ -27,10 +27,19 @@ namespace paddle { namespace framework { namespace ir { +const char kGraphvizMarkedNodeAttr[] = "__graphviz__marked_node__"; + class GraphVizPass : public Pass { + public: + using marked_nodes_t = std::unordered_set; + protected: std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; + + // Tell whether there are any marked nodes in the graph. Consume the + // corresponding attribute. + marked_nodes_t ConsumeMarkedNodes(Graph* graph) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 84748089d30cb8931c4aa458217522bb8d7cb975..03ed6da1046495ade890f1e8e0c34654883254b7 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -17,7 +17,9 @@ limitations under the License. 
*/ namespace paddle { namespace framework { namespace ir { -char Node::kControlDepVarName[] = "__control_var"; + +constexpr char Node::kControlDepVarName[]; +int Node::count_ = 0; } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index fc3cefea46413811df0df5d72f16d4f2c8205d62..e053478f89823edd2ebf2319dc1f7b5f363d149e 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -30,34 +30,42 @@ class Node { static char kControlDepVarName[]; explicit Node(const std::string& name, Type type) - : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} + : name_(name), + var_desc_(nullptr), + op_desc_(nullptr), + type_(type), + id_(count_++) {} explicit Node(VarDesc* var_desc) : name_(var_desc->Name()), var_desc_(new VarDesc(*var_desc)), op_desc_(nullptr), - type_(Type::kVariable) {} + type_(Type::kVariable), + id_(count_++) {} explicit Node(OpDesc* op_desc) : name_(op_desc->Type()), var_desc_(nullptr), op_desc_(new OpDesc(*op_desc, op_desc->Block())), - type_(Type::kOperation) {} + type_(Type::kOperation), + id_(count_++) {} Type NodeType() const { return type_; } std::string Name() const { return name_; } VarDesc* Var() { - PADDLE_ENFORCE(type_ == Type::kVariable); + PADDLE_ENFORCE(IsVar()); return var_desc_.get(); } - OpDesc* Op() { + OpDesc* Op() const { PADDLE_ENFORCE(IsOp()); return op_desc_.get(); } + int id() const { return id_; } + bool IsOp() const { return type_ == Type::kOperation; } bool IsVar() const { return type_ == Type::kVariable; } @@ -69,8 +77,12 @@ class Node { std::unique_ptr var_desc_; std::unique_ptr op_desc_; Type type_; + int id_; private: + friend class Graph; + static int count_; + static void ResetId() { count_ = 0; } DISABLE_COPY_AND_ASSIGN(Node); }; diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc new file mode 100644 index 
0000000000000000000000000000000000000000..a776a898a5ee13b4dde12460dce71433268fb9d4 --- /dev/null +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc @@ -0,0 +1,256 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +struct FuseExpr {}; + +// sequence expand, concat fuse pattern, return concat's output +PDNode* BuildSeqExpandConcatPattern(PDPattern* pattern) { + // The following operators will be fused: + // concat + // sequence_expand + // sequence_expand + + // The following variables will be treat as inputs: + // concat mid input, 0th input for fused op + // sequence_expand input, 1th input for fused op + // sequence_expand input, 2th input for fused op + + // The following variables will be treat as outputs: + // concat output + + // So the following variables will be removed: + // sequence-expand output + // sequence-expand output + + // Three operators + auto* sequence_expand0 = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "sequence_expand"; + }, + "sequence_expand0"); + + auto* sequence_expand1 = 
pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "sequence_expand"; + }, + "sequence_expand1"); + + auto* concat = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "concat" && // basic check + x->Op()->Input("X").size() == 3; // Special case + }, + "concat"); + + auto* sequence_expand0_in = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && VarLinksToOp(x, "sequence_expand"); + }, + "sequence_expand0_in"); + auto* sequence_expand1_in = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && VarLinksToOp(x, "sequence_expand"); + }, + "sequence_expand1_in"); + + // The variables + auto* sequence_expand0_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && + VarLinksFromOp(x, "sequence_expand") && // basic check + VarLinksToOp(x, "concat") && // is concat's input + IsNthInput(x, x->outputs[0], "X", 1); // X[0] + }, + "sequence_expand0_out"); + + auto* sequence_expand1_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && + VarLinksFromOp(x, "sequence_expand") && // basic check + VarLinksToOp(x, "concat") && // is concat's input + IsNthInput(x, x->outputs[0], "X", 2); // x[2] + }, + "sequence_expand1_out"); + + auto* concat_in0 = pattern->NewNode( + [](Node* x) { return x && x->IsVar() && VarLinksToOp(x, "concat"); }, + "concat_in0"); + + auto* concat_out = pattern->NewNode( + [](Node* x) { return x && x->IsVar() && VarLinksFromOp(x, "concat"); }, + "concat_out"); + + // Links + sequence_expand0->LinksFrom({sequence_expand0_in}) + .LinksTo({sequence_expand0_out}); + sequence_expand1->LinksFrom({sequence_expand1_in}) + .LinksTo({sequence_expand1_out}); + concat->LinksFrom({sequence_expand0_out, sequence_expand1_out, concat_in0}) + .LinksTo({concat_out}); + return concat_out; +} + +PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) { + PDNode* fc_w = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksToOp(x, "mul") && // link + 
x->Var()->Proto()->persistable(); // is a parameter + }, + "fc_w"); + + PDNode* mul_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksFromOp(x, "mul") && // link + VarLinksToOp(x, "elementwise_add") && // + !x->Var()->Proto()->persistable(); // is a parameter + }, + "mul_out"); + + PDNode* fc_mul = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "mul"; // basic + }, + "fc_mul"); + + PDNode* fc_bias = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksToOp(x, "elementwise_add") && // link + x->Var()->Proto()->persistable(); // is a parameter + }, + "fc_bias"); + + PDNode* elementwise_add = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "elementwise_add"; + }, + "elementwise_add"); + + PDNode* add_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksFromOp(x, "elementwise_add") && // link + !x->Var()->Proto()->persistable(); // is a parameter + }, + "add_out"); + + std::set acts({"sigmoid", "tanh", "relu", "identity"}); + PDNode* act = pattern->NewNode( + [=](Node* x) { + return x && x->IsOp() && acts.count(x->Op()->Type()); + + }, + "act"); + + PDNode* fc_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + !x->Var()->Proto()->persistable(); // is a parameter + }, + "fc_out"); + + fc_mul->LinksFrom({fc_w, fc_x}).LinksTo({mul_out}); + elementwise_add->LinksFrom({mul_out, fc_bias}).LinksTo({add_out}); + act->LinksFrom({add_out}).LinksTo({fc_out}); + return fc_out; +} + +std::unique_ptr SeqConcatFcFusePass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init("seq_concat_fc_fuse", graph.get()); + GraphPatternDetector detector; + auto* pattern = detector.mutable_pattern(); + auto* concat_out = BuildSeqExpandConcatPattern(pattern); + BuildFCPattern(pattern, concat_out); + +#define GET_NODE(id, pattern) \ + PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \ + 
"pattern has no Node called %s", #id); \ + auto* id = subgraph.at(pattern.RetrieveNode(#id)); \ + PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); + + detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "get one concat pattern"; + // fc + GET_NODE(fc_w, detector.pattern()); + GET_NODE(fc_bias, detector.pattern()); + GET_NODE(act, detector.pattern()); + GET_NODE(fc_out, detector.pattern()); + + // concat + GET_NODE(concat_in0, detector.pattern()); + GET_NODE(sequence_expand0_in, detector.pattern()); + GET_NODE(sequence_expand1_in, detector.pattern()); + + OpDesc op_desc; + op_desc.SetType("fusion_seqexpand_concat_fc"); + op_desc.SetInput("X", {concat_in0->Name(), sequence_expand0_in->Name(), + sequence_expand1_in->Name()}); + op_desc.SetInput("FCWeight", {fc_w->Name()}); + op_desc.SetInput("FCBias", {fc_bias->Name()}); + const std::string fc_out_tmp = fc_out->Name() + ".tmp"; + param_scope()->Var(fc_out_tmp)->GetMutable(); + op_desc.SetOutput("FCOut", {fc_out_tmp}); + op_desc.SetOutput("Out", {fc_out->Name()}); + op_desc.SetAttr("fc_activation", act->Op()->Type()); + + auto* op_node = graph->CreateOpNode(&op_desc); +// Add links +#define NODE_LINKS(a, b) \ + a->outputs.push_back(b); \ + b->inputs.push_back(a); + NODE_LINKS(fc_w, op_node); + NODE_LINKS(fc_bias, op_node); + NODE_LINKS(concat_in0, op_node); + NODE_LINKS(sequence_expand0_in, op_node); + NODE_LINKS(sequence_expand1_in, op_node); + NODE_LINKS(op_node, fc_out); + + // Clean nodes. 
+ std::unordered_set marked_nodes; + for (auto& item : subgraph) { + marked_nodes.insert(item.second); + } + marked_nodes.erase(fc_w); + marked_nodes.erase(fc_bias); + marked_nodes.erase(concat_in0); + marked_nodes.erase(sequence_expand0_in); + marked_nodes.erase(sequence_expand1_in); + marked_nodes.erase(fc_out); + + GraphSafeRemoveNodes(graph, marked_nodes); + }); + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(seq_concat_fc_fuse_pass, + paddle::framework::ir::SeqConcatFcFusePass); diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..9f5fd1a29adf918806d8f30097d8c7f002f48f3e --- /dev/null +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class SeqConcatFcFusePass : public FusePassBase { + public: + virtual ~SeqConcatFcFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index dc22e8e581663e6be1a5bcd4d81f753dc2e49d2e..495fb755059c473d448c8adf0b18c789bd89482d 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -25,7 +25,8 @@ limitations under the License. */ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" -#if defined(_WIN32) + +#if !defined(_WIN32) #include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/writer.h" #endif // _WIN32 diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 122dc161b41246e5f08bd0ae8b763489e9ee22f9..555faba9624b9c76a9efdf4a62cd319f9682566e 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -95,6 +95,12 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, need_update_ = true; } +OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) { + CopyFrom(other); + block_ = block; + need_update_ = true; +} + void OpDesc::CopyFrom(const OpDesc &op_desc) { desc_.set_type(op_desc.Type()); inputs_ = op_desc.inputs_; @@ -131,8 +137,9 @@ OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block) for (const proto::OpDesc::Attr &attr : desc_.attrs()) { std::string attr_name = attr.name(); // The sub_block referred to by the BLOCK attr hasn't been added - // to ProgramDesc class yet, we skip setting BLOCK attr here. 
- if (attr.type() != proto::AttrType::BLOCK) { + // to ProgramDesc class yet, we skip setting BLOCK/BLOCKS attr here. + if (attr.type() != proto::AttrType::BLOCK && + attr.type() != proto::AttrType::BLOCKS) { attrs_[attr_name] = GetAttrValue(attr); } } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 2422392e24d864dc3e7973ab35e038ecf2c0392a..b4205aba83e774fb9c08193124adb93935c00157 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -37,11 +37,7 @@ class OpDesc { explicit OpDesc(BlockDesc *block) : block_(block) {} - OpDesc(const OpDesc &other, BlockDesc *block) { - *this = other; - block_ = block; - need_update_ = true; - } + OpDesc(const OpDesc &other, BlockDesc *block); void CopyFrom(const OpDesc &op_desc); diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 2288c7fe6609a765612b468d69ad35101b92b384..4fa047bf3ee3d06ac4aec5d2cc6a355965836d42 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -129,6 +129,9 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, "Optimized for variable") .SetDefault({}); + AddAttr(OpNamescopeAttrName(), "Operator name with namesope.") + .SetDefault(""); + Validate(); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 80970291c9c234f1306162f4ffa3c2528f88c35f..18827385ad659922230ff68709a2926a8c9013ac 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -39,6 +39,7 @@ class OpProtoAndCheckerMaker { public: static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleVarAttrName() { return "op_role_var"; } + static const char *OpNamescopeAttrName() { return "op_namescope"; } void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 
d04f7744961b2561977f4d36d0073a97557043da..d58d6e4f3e684b97fcc1121e51355bdf3aae3fce 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -74,6 +74,12 @@ static DDim GetDims(const Scope& scope, const std::string& name, } } +static bool VarInited(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) return false; + return var->IsInitialized(); +} + static std::string GetDtype(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); if (var == nullptr) { @@ -87,8 +93,12 @@ static std::string GetDtype(const Scope& scope, const std::string& name) { } return DataTypeToString(ToDataType(tensor.type())); } else if (var->IsType()) { - return DataTypeToString( - ToDataType(var->Get().value().type())); + auto tensor = var->Get().value(); + if (UNLIKELY(!tensor.IsInitialized())) { + return "uninited"; + } else { + return DataTypeToString(ToDataType(tensor.type())); + } } else { return ""; } @@ -197,16 +207,21 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { auto& input = *it; ss << input.first << "["; for (size_t i = 0; i < input.second.size(); ++i) { - ss << input.second[i]; + auto var_name = input.second[i]; + ss << var_name; if (scope) { - int row_size = GetRowSize(*scope, input.second[i]); - if (row_size >= 0) { - ss << "[row_size=" << row_size << "]"; + if (!VarInited(*scope, var_name)) { + ss << "[uninited]"; + } else { + int row_size = GetRowSize(*scope, var_name); + if (row_size >= 0) { + ss << "[row_size=" << row_size << "]"; + } + std::string dtype = GetDtype(*scope, var_name); + ss << ":" << dtype; + ss << "[" << GetDims(*scope, var_name, true) << "]"; + ss << "(" << GetLoD(*scope, var_name) << ")"; } - std::string dtype = GetDtype(*scope, input.second[i]); - ss << ":" << dtype; - ss << "[" << GetDims(*scope, input.second[i], true) << "]"; - ss << "(" << GetLoD(*scope, input.second[i]) << ")"; } if (i != input.second.size() - 
1) { ss << ", "; @@ -223,14 +238,19 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { auto& output = *it; ss << output.first << "["; for (size_t i = 0; i < output.second.size(); ++i) { - ss << output.second[i]; + auto var_name = output.second[i]; + ss << var_name; if (scope) { - int row_size = GetRowSize(*scope, output.second[i]); - if (row_size >= 0) { - ss << "[row_size=" << row_size << "]"; + if (!VarInited(*scope, var_name)) { + ss << "[uninited]"; + } else { + int row_size = GetRowSize(*scope, output.second[i]); + if (row_size >= 0) { + ss << "[row_size=" << row_size << "]"; + } + ss << "[" << GetDims(*scope, var_name, true) << "]"; + ss << "(" << GetLoD(*scope, var_name) << ")"; } - ss << "[" << GetDims(*scope, output.second[i], true) << "]"; - ss << "(" << GetLoD(*scope, output.second[i]) << ")"; } if (i != output.second.size() - 1) { ss << ", "; diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 344c001a69b53c82967ee983783892a514c2490b..a63944eaee6132c1082947fddcad4e0d72e26df1 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -80,6 +80,12 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { InitFromProto(); } +void ProgramDesc::CopyFrom(const proto::ProgramDesc &desc) { + blocks_.clear(); + desc_ = desc; + InitFromProto(); +} + ProgramDesc::ProgramDesc(const std::string &binary_str) { PADDLE_ENFORCE(desc_.ParseFromString(binary_str), "Fail to parse program_desc from binary string."); @@ -111,10 +117,16 @@ void ProgramDesc::InitFromProto() { const std::vector ProgramDesc::GetFeedTargetNames() { auto &global_block = Block(0); + // The order of feed_target_names must follow the index specified in `col`. + // since feed operator's order doesn't necessary follow 'col'. 
std::vector feed_target_names; for (auto *op : global_block.AllOps()) { if (op->Type() == kFeedOpType) { - feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]); + int col = boost::get(op->GetAttr("col")); + if (col >= feed_target_names.size()) { + feed_target_names.resize(col + 1); + } + feed_target_names[col] = op->Output("Out")[0]; } } return feed_target_names; @@ -122,10 +134,16 @@ const std::vector ProgramDesc::GetFeedTargetNames() { const std::vector ProgramDesc::GetFetchTargetNames() { auto &global_block = Block(0); + // The order of fetch_target_names must follow the index specified in `col`. + // since fetch operator's order doesn't necessary follow 'col'. std::vector fetch_target_names; for (auto *op : global_block.AllOps()) { if (op->Type() == kFetchOpType) { - fetch_target_names.push_back(op->Input("X")[0]); + int col = boost::get(op->GetAttr("col")); + if (col >= fetch_target_names.size()) { + fetch_target_names.resize(col + 1); + } + fetch_target_names[col] = op->Input("X")[0]; } } return fetch_target_names; diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index f3afc85eb924e4b03b7597e043ffd4e267adc977..a0e81cade18c0ca5eb1b98fee8325ae2d917d1a2 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -53,6 +53,8 @@ class ProgramDesc { void Flush(); + void CopyFrom(const proto::ProgramDesc &desc); + proto::ProgramDesc *Proto(); // The output variable of feed_op is referenced as feed_target. 
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index a4319ffabb04f39437b76d97845e021ef9de66d3..8c290bb095d554a973e66a3a19606a06759fd668 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -49,7 +49,7 @@ struct TensorCopyVisitor { size_(size) {} template - void operator()() const { + void apply() const { // TODO(Yancey1989): support other place platform::CPUPlace cpu; memory::Copy(cpu, dst_->mutable_data(cpu) + dst_offset_, cpu, diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index d61dbb98a235ca9af089d35318b7f4c68cb125cc..b6ba0df033af12d48e88eb57a3b97b559077250d 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -40,7 +40,11 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type, "When calling this method, the Tensor's numel must be " "equal or larger than zero. " "Please check Tensor::Resize has been called first."); - size_t size = requested_size ? requested_size : numel() * SizeOfType(type); + size_t size = numel() * SizeOfType(type); + if (requested_size) { + PADDLE_ENFORCE_GE(requested_size, size); + size = requested_size; + } /* some versions of boost::variant don't have operator!= */ if (holder_ == nullptr || !(holder_->place() == place) || holder_->size() < size + offset_) { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 5d1e72505d8159e01f2dc7aaacc780cae050e5e1..f136b11bd07978aa1163452f58de2eb9ad33b113 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -150,7 +150,11 @@ struct AnyDTypeVisitor { : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} template +<<<<<<< HEAD void apply()() const { +======= + void apply() const { +>>>>>>> origin/develop auto t = EigenVector::Flatten(tensor_); auto o = EigenScalar::From(*out_); // return any of predicate_(t) is true. 
diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 429997c8b89fef7aa164e878095ab3b5c9998e5b..e9550dbfb976bee70741158b94b04084919e8271 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -26,7 +26,7 @@ namespace paddle { namespace framework { template -bool IsType(const std::type_index& type_index) { +inline bool IsType(const std::type_index& type_index) { return type_index == std::type_index(typeid(T)); } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 5cb5993512855ad5306d1d89d9aabb6f1a07a2d8..0b515b79c636289801719f309934f59017b7c4b4 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -10,7 +10,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor) # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal? cc_library(paddle_fluid_api SRCS io.cc - DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) + DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} graph_to_program_pass) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THRID_PARTYS) @@ -21,9 +21,9 @@ endif(WIN32) # paddle_fluid_origin exclude inference api interface cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) -if(NOT APPLE) +#if(APPLE) add_subdirectory(api) -endif() +#endif() # Create static library diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 4feaed2b0d9cdec735bd3fadc98aa2bad715c209..d43ecc722ea3c78541835fb3f5efc9a3529fbf11 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,5 +1,8 @@ cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) -cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc +set(analysis_deps + framework_proto proto_desc 
ir_pass_manager graph pass paddle_fluid_api executor) + +cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc analyzer.cc helper.cc # passes @@ -10,11 +13,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph tensorrt_subgraph_node_mark_pass.cc fluid_to_ir_pass.cc model_store_pass.cc - DEPS framework_proto proto_desc ir_pass_manager graph pass) + DEPS ${analysis_deps}) cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) -cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis) +cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) @@ -31,47 +34,49 @@ function (inference_analysis_test TARGET) endif() cc_test(${TARGET} SRCS "${analysis_test_SRCS}" - DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detecter pass ${analysis_test_EXTRA_DEPS} + DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS} ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt}) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) endif(WITH_TESTING) endfunction(inference_analysis_test) -set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz") -set(DITU_RNN_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz") -set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/ditu_rnn" CACHE PATH "Ditu RNN model and data root." 
FORCE) -set(DITU_RNN_MODEL ${DITU_INSTALL_DIR}/model) -set(DITU_RNN_DATA ${DITU_INSTALL_DIR}/data.txt) - -function (inference_download_and_uncompress target url gz_filename) +function (inference_download_and_uncompress install_dir url gz_filename) message(STATUS "Download inference test stuff ${gz_filename} from ${url}") - execute_process(COMMAND bash -c "mkdir -p ${DITU_INSTALL_DIR}") - execute_process(COMMAND bash -c "cd ${DITU_INSTALL_DIR} && wget -q ${url}") - execute_process(COMMAND bash -c "cd ${DITU_INSTALL_DIR} && tar xzf ${gz_filename}") + execute_process(COMMAND bash -c "mkdir -p ${install_dir}") + execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}") + execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${gz_filename}") message(STATUS "finish downloading ${gz_filename}") endfunction(inference_download_and_uncompress) +set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz") +set(DITU_RNN_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz") +set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ditu_rnn" CACHE PATH "Ditu RNN model and data root." 
FORCE) if (NOT EXISTS ${DITU_INSTALL_DIR}) - inference_download_and_uncompress(ditu_rnn_model ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz") - inference_download_and_uncompress(ditu_rnn_data ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz") + inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz") + inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz") endif() inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis + analysis_predictor # ir fc_fuse_pass + fc_lstm_fuse_pass + seq_concat_fc_fuse_pass graph_viz_pass infer_clean_graph_pass - graph_pattern_detecter - infer_clean_graph_pass + graph_pattern_detector + infer_clean_graph_pass + attention_lstm_fuse_pass + paddle_inference_api pass ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) -inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) -inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) +inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc EXTRA_DEPS paddle_inference_api) +inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc EXTRA_DEPS paddle_fluid) inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) @@ -79,3 +84,17 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_ 
inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) + +set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") +set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") +set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE) +if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR}) + inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz") + inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz") +endif() + +inference_analysis_test(test_chinese_ner SRCS chinese_ner_tester.cc + EXTRA_DEPS paddle_inference_api paddle_fluid_api + ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model + --infer_model=${CHINESE_NER_INSTALL_DIR}/model + --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index 7d16364609463e9c48720e772cebee7731dfd452..e6e63544ffa2de09e39b02769aaaf0793d6b1111 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -72,7 +72,7 @@ class DfgPassManagerImpl final : public DfgPassManager { auto trt_teller = [&](const Node* node) { std::unordered_set teller_set( {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax", - "depthwise_conv2d", "batch_norm"}); + "depthwise_conv2d", "batch_norm", "concat"}); if (!node->IsFunction()) return false; const auto* func = static_cast(node); @@ -93,7 +93,6 @@ class DfgPassManagerImpl final : public DfgPassManager { void AddGraphvizDebugerPass(Pass* pass) { auto* debuger_pass = 
pass->CreateGraphvizDebugerPass(); if (debuger_pass) { - LOG(INFO) << " - register debug pass [" << debuger_pass->repr() << "]"; Register(debuger_pass->repr(), debuger_pass); } } @@ -102,6 +101,19 @@ class DfgPassManagerImpl final : public DfgPassManager { Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } void Analyzer::Run(Argument* argument) { + // Ugly support fluid-to-ir-pass + argument->Set(kFluidToIrPassesAttr, + new std::vector({ + // Manual update the passes here. + "graph_viz_pass", // + "infer_clean_graph_pass", "graph_viz_pass", // + "attention_lstm_fuse_pass", "graph_viz_pass", // + "fc_lstm_fuse_pass", "graph_viz_pass", // + "seq_concat_fc_fuse_pass", "graph_viz_pass", // + "fc_fuse_pass", "graph_viz_pass" // + + })); + for (auto& x : data_) { PADDLE_ENFORCE(x->Initialize(argument)); x->RunAll(); diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index baa7600283a9bc0b81833b419a2ea64692ed2203..1a65e85dd237eb1bacd3c15b4538a9835ec4b9e0 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -16,10 +16,14 @@ #include #include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/platform/profiler.h" DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN"); DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN"); @@ -30,6 +34,8 @@ namespace paddle { namespace inference { namespace analysis { +using namespace framework; + TEST(Analyzer, analysis_without_tensorrt) { FLAGS_IA_enable_tensorrt_subgraph_engine = false; Argument argument; @@ -195,13 
+201,13 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, minute_tensor.lod.assign({one_batch.lod3}); // clang-format on // assign data - TensorAssignData(&lod_attention_tensor, - std::vector>({{0, 0}})); + TensorAssignData(&lod_attention_tensor, + std::vector>({{0, 0}})); std::vector tmp_zeros(batch_size * 15, 0.); - TensorAssignData(&init_zero_tensor, {tmp_zeros}); - TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data); - TensorAssignData(&week_tensor, one_batch.rnn_week_datas); - TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas); + TensorAssignData(&init_zero_tensor, {tmp_zeros}); + TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data); + TensorAssignData(&week_tensor, one_batch.rnn_week_datas); + TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas); // Set inputs. auto init_zero_tensor1 = init_zero_tensor; init_zero_tensor1.name = "hidden_init"; @@ -264,39 +270,24 @@ void TestDituRNNPrediction(const std::string &model_path, const std::string &data_path, int batch_size, bool use_analysis, bool activate_ir, int num_times = 1) { - FLAGS_IA_enable_ir = activate_ir; - FLAGS_IA_enable_tensorrt_subgraph_engine = false; - FLAGS_IA_output_storage_path = "./analysis.out"; - - std::string model_out; - if (use_analysis) { - Argument argument(model_path); - argument.model_output_store_path.reset(new std::string("./analysis.out")); - - Analyzer analyzer; - analyzer.Run(&argument); - - // Should get the transformed model stored to ./analysis.out - model_out = "./analysis.out"; - ASSERT_TRUE(PathExists(model_out)); - } else { - model_out = FLAGS_infer_ditu_rnn_model; - } - NativeConfig config; - config.prog_file = model_out + "/__model__"; - config.param_file = model_out + "/param"; + config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__"; + config.param_file = FLAGS_infer_ditu_rnn_model + "/param"; config.use_gpu = false; config.device = 0; config.specify_input_name = true; - auto predictor = + auto base_predictor = 
CreatePaddlePredictor(config); + auto predictor = + CreatePaddlePredictor(config); std::vector input_slots; DataRecord data(data_path, batch_size); // Prepare inputs. PrepareInputs(&input_slots, &data, batch_size); - std::vector outputs; + std::vector outputs, base_outputs; + + base_predictor->Run(input_slots, &base_outputs); Timer timer; timer.tic(); @@ -308,35 +299,37 @@ void TestDituRNNPrediction(const std::string &model_path, << ", latency: " << timer.toc() / num_times << "ms"; LOG(INFO) << "====================================="; - for (auto &out : outputs) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &base_out = base_outputs[i]; size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, [](int a, int b) { return a * b; }); + size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), + 1, [](int a, int b) { return a * b; }); + PADDLE_ENFORCE_EQ(size, size1); + PADDLE_ENFORCE_GT(size, 0); float *data = static_cast(out.data.data()); - for (size_t i = 0; - i < std::min(sizeof(ditu_rnn_target_data) / sizeof(float), size); - i++) { - EXPECT_NEAR(data[i], ditu_rnn_target_data[i], 1e-3); + float *base_data = static_cast(base_out.data.data()); + for (size_t i = 0; i < size; i++) { + EXPECT_NEAR(data[i], base_data[i], 1e-3); } } -} - -// Turn on the IR pass supportion, run a real inference and check the result. 
-TEST(Analyzer, SupportIRPass) { - FLAGS_IA_enable_ir = true; - FLAGS_IA_enable_tensorrt_subgraph_engine = false; - FLAGS_IA_output_storage_path = "./analysis.out"; - Argument argument(FLAGS_inference_model_dir); - argument.model_output_store_path.reset(new std::string("./analysis.out")); - - Analyzer analyzer; - analyzer.Run(&argument); - - // Should get the transformed model stored to ./analysis.out - ASSERT_TRUE(PathExists("./analysis.out")); + if (use_analysis && activate_ir) { + AnalysisPredictor *analysis_predictor = + dynamic_cast(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } - // Inference from this path. - TestWord2vecPrediction("./analysis.out"); + ASSERT_TRUE(fuse_statis.count("fc")); + EXPECT_EQ(fuse_statis.at("fc"), 1); + } } // Directly infer with the original model. @@ -365,5 +358,8 @@ TEST(Analyzer, DituRNN_with_analysis_with_IR) { } // namespace paddle USE_PASS(fc_fuse_pass); +USE_PASS(seq_concat_fc_fuse_pass); +USE_PASS(fc_lstm_fuse_pass); USE_PASS(graph_viz_pass); USE_PASS(infer_clean_graph_pass); +USE_PASS(attention_lstm_fuse_pass); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 25787321db24ea71a2fcbca4ecf9e6731b01f5db..3242aced39e82099f838a2adb612868ebc47c888 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -27,6 +27,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/variant.h" namespace paddle { namespace inference { @@ -59,6 +60,47 @@ struct Argument { // The output storage path of ModelStorePass. std::unique_ptr model_output_store_path; + + // Support for any other attributes. 
+ template + void Set(const std::string& key, T* data) { + PADDLE_ENFORCE_NOT_NULL(data); + PADDLE_ENFORCE(!attrs_.count(key), "Duplicate set Argument's attr [%s]", + key); + attrs_[key] = data; + attr_deleters_[key] = [data, key]() { + VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + VLOG(3) << "argument delete attr: " << key; + delete data; + }; + } + + bool Has(const std::string& name) const { return attrs_.count(name); } + + template + T* Release(const std::string& key) { + PADDLE_ENFORCE(attrs_.count(key)); + auto* res = boost::any_cast(attrs_.at(key)); + attrs_.erase(key); + attr_deleters_.erase(key); + return res; + } + + template + T& Get(const std::string& key) { + PADDLE_ENFORCE(Has(key)); + return *boost::any_cast(attrs_.at(key)); + } + + ~Argument() { + for (auto& item : attr_deleters_) { + item.second(); + } + } + + private: + std::unordered_map attrs_; + std::unordered_map> attr_deleters_; }; #define ANALYSIS_ARGUMENT_CHECK_FIELD(field__) \ diff --git a/paddle/fluid/inference/analysis/chinese_ner_tester.cc b/paddle/fluid/inference/analysis/chinese_ner_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..9088a29d504309bc2c7b96fd49a0bf44e7cf0da9 --- /dev/null +++ b/paddle/fluid/inference/analysis/chinese_ner_tester.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_string(infer_model, "", "model path"); +DEFINE_string(infer_data, "", "data path"); +DEFINE_int32(batch_size, 10, "batch size."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); + +namespace paddle { +namespace inference { + +struct DataRecord { + std::vector> word_data_all, mention_data_all; + std::vector> rnn_word_datas, rnn_mention_datas; + std::vector lod; // two inputs have the same lod info. + size_t batch_iter{0}; + size_t batch_size{1}; + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. 
+ if (batch_end <= word_data_all.size()) { + data.word_data_all.assign(word_data_all.begin() + batch_iter, + word_data_all.begin() + batch_end); + data.mention_data_all.assign(mention_data_all.begin() + batch_iter, + mention_data_all.begin() + batch_end); + // Prepare LoDs + data.lod.push_back(0); + CHECK(!data.word_data_all.empty()); + CHECK(!data.mention_data_all.empty()); + CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size()); + for (size_t j = 0; j < data.word_data_all.size(); j++) { + data.rnn_word_datas.push_back(data.word_data_all[j]); + data.rnn_mention_datas.push_back(data.mention_data_all[j]); + // calculate lod + data.lod.push_back(data.lod.back() + data.word_data_all[j].size()); + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ';', &data); + // load word data + std::vector word_data; + split_to_int64(data[1], ' ', &word_data); + // load mention data + std::vector mention_data; + split_to_int64(data[3], ' ', &mention_data); + word_data_all.push_back(std::move(word_data)); + mention_data_all.push_back(std::move(mention_data)); + } + } +}; + +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor lod_word_tensor, lod_mention_tensor; + lod_word_tensor.name = "word"; + lod_mention_tensor.name = "mention"; + auto one_batch = data->NextBatch(); + int size = one_batch.lod[one_batch.lod.size() - 1]; // token batch size + lod_word_tensor.shape.assign({size, 1}); + lod_word_tensor.lod.assign({one_batch.lod}); + lod_mention_tensor.shape.assign({size, 1}); + lod_mention_tensor.lod.assign({one_batch.lod}); + // assign data + TensorAssignData(&lod_word_tensor, one_batch.rnn_word_datas); + TensorAssignData(&lod_mention_tensor, one_batch.rnn_mention_datas); + // Set inputs. 
+ input_slots->assign({lod_word_tensor, lod_mention_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::INT64; + } +} + +// the first inference result +const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, + 48, 39, 38, 16, 25}; + +void TestChineseNERPrediction() { + NativeConfig config; + config.prog_file = FLAGS_infer_model + "/__model__"; + config.param_file = FLAGS_infer_model + "/param"; + config.use_gpu = false; + config.device = 0; + config.specify_input_name = true; + + auto predictor = + CreatePaddlePredictor(config); + std::vector input_slots; + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + // Prepare inputs. + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + std::vector outputs; + + Timer timer; + timer.tic(); + for (int i = 0; i < FLAGS_repeat; i++) { + predictor->Run(input_slots, &outputs); + } + LOG(INFO) << "===========profile result==========="; + LOG(INFO) << "batch_size: " << FLAGS_batch_size + << ", repeat: " << FLAGS_repeat + << ", latency: " << timer.toc() / FLAGS_repeat << "ms"; + LOG(INFO) << "====================================="; + + PADDLE_ENFORCE(outputs.size(), 1UL); + auto &out = outputs[0]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + PADDLE_ENFORCE_GT(size, 0); + int64_t *result = static_cast(out.data.data()); + for (size_t i = 0; i < std::min(11UL, size); i++) { + PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]); + } +} + +// Directly infer with the original model. 
+TEST(Analyzer, Chinese_ner) { TestChineseNERPrediction(); } + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 8c7dd146e429a7f5cd28bdd418e457e8ea5680bd..80c85555e722433f3657e880520b3fe459f6ce1a 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -15,10 +15,12 @@ #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" +#include "paddle/fluid/inference/io.h" namespace paddle { namespace inference { @@ -33,7 +35,6 @@ std::vector ExtractParameters( bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument) ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc) - PADDLE_ENFORCE(!argument->transformed_program_desc); // The transformed_program_desc should inherit all the VarDesc and BlockDesc // from the original program desc. The operators of the main block(the first // block) should rewritten by data flow graph. 
@@ -65,6 +66,10 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { } } + if (argument_->Has(framework::ir::kParamScopeAttr)) { + LOG(WARNING) << "parameter changes in the scope takes effect"; + } + PADDLE_ENFORCE(argument_->transformed_program_desc.get()); } diff --git a/paddle/fluid/inference/analysis/dot.h b/paddle/fluid/inference/analysis/dot.h index 4bf1840fdda8508b52d7274a338c5b1c95baf354..4693729cb43d7a9df96b11c4bf3064a70d1db4c3 100644 --- a/paddle/fluid/inference/analysis/dot.h +++ b/paddle/fluid/inference/analysis/dot.h @@ -29,13 +29,13 @@ namespace paddle { namespace inference { namespace analysis { +static size_t dot_node_counter{0}; + /* * A Dot template that helps to build a DOT graph definition. */ class Dot { public: - static size_t counter; - struct Attr { std::string key; std::string value; @@ -57,7 +57,7 @@ class Dot { Node(const std::string& name, const std::vector& attrs) : name(name), attrs(attrs), - id_("node_" + std::to_string(Dot::counter++)) {} + id_("node_" + std::to_string(dot_node_counter++)) {} std::string id() const { return id_; } @@ -65,6 +65,10 @@ class Dot { std::stringstream ss; CHECK(!name.empty()); ss << id_; + if (attrs.empty()) { + ss << "[label=" << '"' << name << '"' << "]"; + return ss.str(); + } for (size_t i = 0; i < attrs.size(); i++) { if (i == 0) { ss << "[label=" << '"' << name << '"' << " "; @@ -108,9 +112,11 @@ class Dot { explicit Dot(const std::vector& attrs) : attrs_(attrs) {} - void AddNode(const std::string& name, const std::vector& attrs) { - CHECK(!nodes_.count(name)) << "duplicate Node '" << name << "'"; - nodes_.emplace(name, Node{name, attrs}); + void AddNode(const std::string& id, const std::vector& attrs, + std::string label = "") { + CHECK(!nodes_.count(id)) << "duplicate Node '" << id << "'"; + if (label.empty()) label = id; + nodes_.emplace(id, Node{label, attrs}); } void AddEdge(const std::string& source, const std::string& target, diff --git 
a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc index 073f49752872cbb65fddc74be75ec28d4dd0bbaf..fc60ca3bd0bf706407defb2655a093d999aef7c2 100644 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc @@ -13,3 +13,48 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void FluidToIrPass::EnableParamModify(const std::string &model_dir, + const std::string &prog_file, + const std::string ¶m_file) { + PADDLE_ENFORCE(argument_); + argument_->Set(framework::ir::kParamScopeAttr, new framework::Scope); + // Load parameters. + VLOG(3) << "Loading parameters from " << model_dir; + LoadParams(&argument_->Get(framework::ir::kParamScopeAttr), + model_dir, prog_file, param_file); +} + +bool FluidToIrPass::LoadParams(framework::Scope *scope, const std::string &dir, + const std::string &prog_file, + const std::string ¶m_file) { + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + framework::Executor executor(place); + PADDLE_ENFORCE(argument_->origin_program_desc.get()); + framework::ProgramDesc program(*argument_->origin_program_desc); + if ((!prog_file.empty()) && (!param_file.empty())) { + LOG(INFO) << "load single model file from " << prog_file; + Load(&executor, scope, prog_file, param_file); + } else if (!dir.empty()) { + LOG(INFO) << "load from dir " << dir; + Load(&executor, scope, dir); + } else { + LOG(ERROR) << "failed to load parameters"; + return false; + } + return true; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git 
a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h index fa3f8d313bbdd6733fa3878dd7023e125b6ced36..6731b1f759363eec5dd8645783212a72ace67b2f 100644 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h @@ -14,12 +14,16 @@ #pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include "paddle/fluid/inference/analysis/pass.h" namespace paddle { namespace inference { namespace analysis { +using namespace framework; + +static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__"; class FluidToIrPass final : public DataFlowGraphPass { public: @@ -27,6 +31,9 @@ class FluidToIrPass final : public DataFlowGraphPass { bool Initialize(Argument *argument) override { ANALYSIS_ARGUMENT_CHECK_FIELD(argument); + PADDLE_ENFORCE(argument->Has(kFluidToIrPassesAttr), + "argument need the attr %s", kFluidToIrPassesAttr); + argument_ = argument; if (argument->origin_program_desc) { LOG(WARNING) << "argument's origin_program_desc is already set, might " "duplicate called"; @@ -40,18 +47,26 @@ class FluidToIrPass final : public DataFlowGraphPass { ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); // Load program. auto program = LoadProgramDesc(*argument->fluid_model_program_path); - argument->origin_program_desc.reset( - new framework::proto::ProgramDesc(program)); + argument->origin_program_desc.reset(new proto::ProgramDesc(program)); // Create main data flow graph. if (!argument->main_dfg) { argument->main_dfg.reset(new DataFlowGraph); } - // Persist the ProgramDesc in graph's attribute. The IR graph just keep the - // address, will segfault if the original ProgramDesc destroys. 
- auto &ir_program_p = argument->main_dfg->Attr("ir_program_desc").Pointer(); - ir_program_p = new framework::ProgramDesc(program); + argument->Set("ir_program_desc", new ProgramDesc(program)); + + LOG(INFO) << "Loading parameters"; + // Load parameters to argument if needed. + if (argument->fluid_model_dir || (argument->fluid_model_program_path && + argument->fluid_model_param_path)) { +#define SAFE_GET(ATTR) std::string ATTR = argument->ATTR ? *argument->ATTR : ""; + SAFE_GET(fluid_model_dir); + SAFE_GET(fluid_model_program_path); + SAFE_GET(fluid_model_param_path); +#undef SAFE_GET + EnableParamModify(fluid_model_dir, fluid_model_program_path, + fluid_model_param_path); + } - argument_ = argument; return true; } @@ -59,20 +74,44 @@ class FluidToIrPass final : public DataFlowGraphPass { void Run(DataFlowGraph *graph) override { // Call all the IR Passes - IRPassManager ir_passes(*static_cast( - argument_->main_dfg->Attr("ir_program_desc").Pointer())); - ir_passes.Apply(std::vector( - {// Manual update the passes here. - "graph_viz_pass", "infer_clean_graph_pass", "graph_viz_pass", - "fc_fuse_pass", "graph_viz_pass"})); + IRPassManager ir_passes(argument_->Get("ir_program_desc"), + nullptr); + // Pass the scope from analysis to IR if needed. + if (argument_->Has(ir::kParamScopeAttr)) { + // Here the address is passed, attention that IR doesn't own the scope, so + // the real scope in analysis should live during the IR phase. + ir_passes.graph().Set( + ir::kParamScopeAttr, + new Scope *(&argument_->Get(ir::kParamScopeAttr))); + } + + const auto &ir_passes_to_apply = + argument_->Get>(kFluidToIrPassesAttr); + ir_passes.Apply(ir_passes_to_apply); PADDLE_ENFORCE(argument_->main_dfg.get()); argument_->main_dfg->Build(ir_passes.graph()); - // PADDLE_ENFORCE(argument_->main_dfg->IsFullyConnected()); + // inherit the arguments from ir. 
+ if (ir_passes.graph().Has(ir::kFuseStatisAttr)) { + argument_->Set( + ir::kFuseStatisAttr, + new std::unordered_map( + ir_passes.graph().Get>( + ir::kFuseStatisAttr))); + } } + void EnableParamModify(const std::string &model_dir, + const std::string &prog_file, + const std::string ¶m_file); + std::string repr() const override { return "fluid-to-ir-pass"; } + private: + // Load parameters from a single file or from a directory. + bool LoadParams(Scope *scope, const std::string &dir, + const std::string &prog_file, const std::string ¶m_file); + private: Argument *argument_{nullptr}; }; diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc index af934f261baa3807059ce6ab036545594630df58..6a13c60e7b2ebf645b12d5ddf83ef6ab3a2e83bd 100644 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc @@ -24,6 +24,8 @@ namespace analysis { TEST(FluidToIrPass, Test) { FluidToIrPass pass; Argument argument(FLAGS_inference_model_dir); + argument.Set(kFluidToIrPassesAttr, + new std::vector({"infer_clean_graph_pass"})); pass.Initialize(&argument); pass.Run(argument.main_dfg.get()); } @@ -32,6 +34,9 @@ TEST(FluidToIrPass, Test) { } // namespace inference } // namespace paddle -USE_PASS(fc_fuse_pass); USE_PASS(graph_viz_pass); USE_PASS(infer_clean_graph_pass); +USE_PASS(attention_lstm_fuse_pass); +USE_PASS(fc_lstm_fuse_pass); +USE_PASS(seq_concat_fc_fuse_pass); +USE_PASS(fc_fuse_pass); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index d849b637bcf3fe3944ad11680bbe041e19a71e24..ea0f2241d7dbab8f79ec9349effbe96112748e34 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -14,20 +14,26 @@ #include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include +#include 
"paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/scope.h" namespace paddle { namespace inference { namespace analysis { -IRPassManager::IRPassManager(const ProgramDesc& program) { +IRPassManager::IRPassManager(const ProgramDesc &program, + framework::Scope *scope) + : program_(program) { graph_.reset(new framework::ir::Graph(program)); + if (scope) + graph_->Set(framework::ir::kParamScopeAttr, new framework::Scope *(scope)); } -void IRPassManager::Apply(const std::vector& passes) { - graph_->Set("graph_viz_path", new std::string("./1.dot")); +void IRPassManager::Apply(const std::vector &passes) { // Apply all the passes std::string pre_pass; - for (const std::string& pass_name : passes) { + for (const std::string &pass_name : passes) { LOG(WARNING) << "Running IR pass [" << pass_name << "]"; auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); if (pass_name == "graph_viz_pass") { diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index 3338e37ecf1c591a631fd829a05b07e562af703e..bb230283b7c2cc783d0b68ea0aa3cca1cabc75e6 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" namespace paddle { namespace inference { @@ -31,14 +32,15 @@ using framework::ProgramDesc; class IRPassManager final { public: - IRPassManager(const ProgramDesc& program); + IRPassManager(const ProgramDesc &program, framework::Scope *scope); - void Apply(const std::vector& passes); + void Apply(const std::vector &passes); - framework::ir::Graph& graph() const { return *graph_; } + framework::ir::Graph &graph() const { return *graph_; } private: std::unique_ptr graph_; + ProgramDesc 
program_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc index cfdca33882ea00a28e3ea51ca5fd77ec9605bf3a..ff5ec94265a4f05c1294ad6c8ac5f86c249b84b6 100644 --- a/paddle/fluid/inference/analysis/pass_manager.cc +++ b/paddle/fluid/inference/analysis/pass_manager.cc @@ -33,9 +33,9 @@ bool PassManager::Initialize(Argument* argument) { void DfgPassManager::RunAll() { PADDLE_ENFORCE(argument_); - LOG(INFO) << "Total " << data_.size() << " passes"; + LOG(INFO) << "Total " << data_.size() << " Analysys passes"; for (auto& pass : data_) { - LOG(WARNING) << "Running pass [" << pass->repr() << "]"; + LOG(WARNING) << "Running Analysis pass [" << pass->repr() << "]"; pass->Run(argument_->main_dfg.get()); } } diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 0ca1af455ca10fa6995ad3a1c33825108a3fd7ad..adfe4392448557a30cd834022b9a5d21d9086b95 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -20,7 +20,7 @@ endif(APPLE) set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager graph_viz_pass fc_fuse_pass - infer_clean_graph_pass + infer_clean_graph_pass ) if(WITH_GPU AND TENSORRT_FOUND) @@ -46,7 +46,8 @@ function(inference_api_test TARGET_NAME) endif(WITH_TESTING) endfunction(inference_api_test) -cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api) cc_test(test_paddle_inference_api SRCS api_tester.cc diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc new file mode 100644 index 0000000000000000000000000000000000000000..33862232bdaae817b9ca72879605386c32ed3e8b --- /dev/null +++ 
b/paddle/fluid/inference/api/analysis_predictor.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { + +bool AnalysisPredictor::Init( + const std::shared_ptr& parent_scope) { + VLOG(3) << "Predictor::init()"; + if (config_.use_gpu) { + place_ = paddle::platform::CUDAPlace(config_.device); + } else { + place_ = paddle::platform::CPUPlace(); + } + PADDLE_ENFORCE(!parent_scope); + if (parent_scope) { + scope_ = parent_scope; + sub_scope_ = &(parent_scope->NewScope()); + } else { + paddle::framework::InitDevices(false); + scope_.reset(new paddle::framework::Scope()); + } + + executor_.reset(new paddle::framework::Executor(place_)); + + // Initialize the inference program + if (!config_.model_dir.empty()) { + // Parameters are saved in separate files sited in + // the specified `dirname`. + inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(), + config_.model_dir); + } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + // All parameters are saved in a single file. 
+ // The file names should be consistent with that used + // in Python API `fluid.io.save_inference_model`. + inference_program_ = paddle::inference::Load( + executor_.get(), scope_.get(), config_.prog_file, config_.param_file); + } else { + LOG(ERROR) << "fail to load inference model."; + return false; + } + + OptimizeInferenceProgram(); + ctx_ = executor_->Prepare(*inference_program_, 0); + + VLOG(5) << "to create variables"; + PADDLE_ENFORCE(scope_.get()); + executor_->CreateVariables(*inference_program_, + sub_scope_ ? sub_scope_ : scope_.get(), 0); + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + return true; +} + +void AnalysisPredictor::OptimizeInferenceProgram() { + LOG(INFO) << "optimize begin"; + FLAGS_IA_enable_ir = true; + FLAGS_IA_enable_tensorrt_subgraph_engine = false; + FLAGS_IA_output_storage_path = ""; // Don't output the model. + // Analyze inference_program + if (!config_.model_dir.empty()) { + argument_.fluid_model_dir.reset(new std::string(config_.model_dir)); + } else { + PADDLE_ENFORCE( + !config_.param_file.empty(), + "Either model_dir or (param_file, prog_file) should be set."); + PADDLE_ENFORCE(!config_.prog_file.empty()); + argument_.fluid_model_program_path.reset( + new std::string(config_.prog_file)); + argument_.fluid_model_param_path.reset(new std::string(config_.param_file)); + } + argument_.origin_program_desc.reset( + new ProgramDesc(*inference_program_->Proto())); + Analyzer().Run(&argument_); + CHECK(argument_.transformed_program_desc); + VLOG(5) << "to prepare executor"; + // LOG(INFO) << "transformed_parogram_desc " << + // argument.transformed_program_desc->DebugString(); + inference_program_.reset( + new framework::ProgramDesc(*argument_.transformed_program_desc)); + PADDLE_ENFORCE(argument_.Has(framework::ir::kParamScopeAttr)); + // Update scope. 
+ scope_.reset( + argument_.Release(framework::ir::kParamScopeAttr)); + LOG(INFO) << "optimize end =="; +} + +template <> +std::unique_ptr CreatePaddlePredictor< + NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) { + VLOG(3) << "create NativePredictor"; + if (config.use_gpu) { + // 1. GPU memeroy + PADDLE_ENFORCE_GT( + config.fraction_of_gpu_memory, 0.f, + "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); + PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); + std::vector flags; + if (config.fraction_of_gpu_memory >= 0.0f || + config.fraction_of_gpu_memory <= 0.95f) { + flags.push_back("dummpy"); + std::string flag = "--fraction_of_gpu_memory_to_use=" + + std::to_string(config.fraction_of_gpu_memory); + flags.push_back(flag); + VLOG(3) << "set flag: " << flag; + framework::InitGflags(flags); + } + } + + std::unique_ptr predictor(new AnalysisPredictor(config)); + if (!dynamic_cast(predictor.get())->Init(nullptr)) { + return nullptr; + } + return predictor; +} + +} // namespace paddle + +USE_PASS(fc_fuse_pass); +USE_PASS(graph_viz_pass); +USE_PASS(infer_clean_graph_pass); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h new file mode 100644 index 0000000000000000000000000000000000000000..e32b6185f6044ab3577bde0a8f8dcf2391688aa8 --- /dev/null +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { + +using inference::analysis::Argument; +using inference::analysis::Analyzer; +using framework::proto::ProgramDesc; + +/* This predictor is based on the original native predictor with IR and Analysis + * support. It will optimize IR and Parameters in the runtime. + * TODO(Superjomn) Replace the Navive predictor? + */ +class AnalysisPredictor : public NativePaddlePredictor { + public: + explicit AnalysisPredictor(const NativeConfig& config) + : NativePaddlePredictor(config), config_(config) {} + + bool Init(const std::shared_ptr& parent_scope); + + bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) override { + return NativePaddlePredictor::Run(inputs, output_data, batch_size); + } + + void OptimizeInferenceProgram(); + + Argument& analysis_argument() { return argument_; } + + private: + NativeConfig config_; + Argument argument_; +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 23fe740b178355fae1320b9c60e859109635aa19..bc939f417be063193cb60a72a5d99d2678284ca8 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/timer.h" #include "paddle/fluid/platform/profiler.h" @@ -38,6 +39,25 @@ std::string num2str(T a) { } } // namespace +void NativePaddlePredictor::PrepareFeedFetch() { + for (auto *op : inference_program_->Block(0).AllOps()) { + if (op->Type() == "feed") { + int idx = boost::get(op->GetAttr("col")); + if (feeds_.size() <= (size_t)idx) { + feeds_.resize(idx + 1); + } + feeds_[idx] = op; + feed_names_[op->Output("Out")[0]] = idx; + } else if (op->Type() == "fetch") { + int idx = boost::get(op->GetAttr("col")); + if (fetchs_.size() <= (size_t)idx) { + fetchs_.resize(idx + 1); + } + fetchs_[idx] = op; + } + } +} + bool NativePaddlePredictor::Init( std::shared_ptr parent_scope) { VLOG(3) << "Predictor::init()"; @@ -90,8 +110,7 @@ bool NativePaddlePredictor::Init( sub_scope_ ? sub_scope_ : scope_.get(), 0); // Get the feed_target_names and fetch_target_names - feed_target_names_ = inference_program_->GetFeedTargetNames(); - fetch_target_names_ = inference_program_->GetFetchTargetNames(); + PrepareFeedFetch(); return true; } @@ -114,36 +133,21 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, Timer timer; timer.tic(); // set feed variable - std::map feed_targets; std::vector feeds; - if (!SetFeed(inputs, &feeds)) { + framework::Scope *scope = sub_scope_ != nullptr ? 
sub_scope_ : scope_.get(); + if (!SetFeed(inputs, scope)) { LOG(ERROR) << "fail to set feed"; return false; } - for (size_t i = 0; i < feed_target_names_.size(); ++i) { - if (config_.specify_input_name) { - feed_targets[inputs[i].name] = &feeds[i]; - } else { - feed_targets[feed_target_names_[i]] = &feeds[i]; - } - } - // get fetch variable - std::map fetch_targets; - std::vector fetchs; - fetchs.resize(fetch_target_names_.size()); - for (size_t i = 0; i < fetch_target_names_.size(); ++i) { - fetch_targets[fetch_target_names_[i]] = &fetchs[i]; - } // Run the inference program // if share variables, we need not create variables VLOG(4) << "Run prepared context"; - executor_->RunPreparedContext( - ctx_.get(), sub_scope_ != nullptr ? sub_scope_ : scope_.get(), - &feed_targets, &fetch_targets, - false, /* don't create local scope each time*/ - false /* don't create variable eatch time */); + executor_->RunPreparedContext(ctx_.get(), scope, + false, /* don't create local scope each time*/ + false /* don't create variable eatch time */); VLOG(4) << "Finish prepared context"; - if (!GetFetch(fetchs, output_data)) { + // get fetch variable + if (!GetFetch(output_data, scope)) { LOG(ERROR) << "fail to get fetches"; return false; } @@ -159,18 +163,23 @@ std::unique_ptr NativePaddlePredictor::Clone() { LOG(ERROR) << "fail to call Init"; return nullptr; } +#ifdef __clang__ + // fix clang compile error + return cls; +#else // fix manylinux compile error. 
return std::move(cls); +#endif } bool NativePaddlePredictor::SetFeed(const std::vector &inputs, - std::vector *feeds) { + framework::Scope *scope) { VLOG(3) << "Predictor::set_feed"; - if (inputs.size() != feed_target_names_.size()) { + if (inputs.size() != feeds_.size()) { LOG(ERROR) << "wrong feed input size."; return false; } - for (size_t i = 0; i < feed_target_names_.size(); ++i) { + for (size_t i = 0; i < inputs.size(); ++i) { framework::LoDTensor input; framework::DDim ddim = framework::make_ddim(inputs[i].shape); void *input_ptr; @@ -192,78 +201,93 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, lod.emplace_back(level); } input.set_lod(lod); - - feeds->push_back(input); + int idx = -1; + if (config_.specify_input_name) { + idx = feed_names_[inputs[i].name]; + } else { + idx = boost::get(feeds_[i]->GetAttr("col")); + } + framework::SetFeedVariable(scope, input, "feed", idx); } return true; } - -bool NativePaddlePredictor::GetFetch( - const std::vector &fetchs, - std::vector *outputs) { - VLOG(3) << "Predictor::get_fetch"; - outputs->resize(fetchs.size()); - for (size_t i = 0; i < fetchs.size(); ++i) { - // TODO(panyx0718): Support fetch of other types. 
- if (fetchs[i].type() != typeid(float)) { - LOG(ERROR) << "only support fetching float now."; - return false; +template +void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch, + PaddleTensor *output) { + std::vector shape; + auto dims_i = fetch.dims(); + auto lod = fetch.lod(); + const T *output_ptr = fetch.data(); + auto num = fetch.numel(); + std::vector data; + if (0 == lod.size()) { + std::copy(output_ptr, output_ptr + num, std::back_inserter(data)); + for (int j = 0; j < dims_i.size(); ++j) { + shape.push_back(dims_i[j]); } - std::vector shape; - auto dims_i = fetchs[i].dims(); - auto lod = fetchs[i].lod(); - const float *output_ptr = fetchs[i].data(); - // const int64_t* output_ptr = fetchs[i].data(); - auto num = fetchs[i].numel(); - std::vector data; - if (0 == lod.size()) { - std::copy(output_ptr, output_ptr + num, std::back_inserter(data)); - for (int j = 0; j < dims_i.size(); ++j) { - shape.push_back(dims_i[j]); - } - } else { - // for batch detection - // image[0] -> output[0] shape {145, 6} - // image[1] -> output[1] shape {176, 6} - // then, - // the batch output shape {321, 6} - // the lod {{0, 145, 321}} - // so we should append output[0] to {176, 6} - size_t max_dim = 0; - for (size_t j = 1; j < lod[0].size(); j++) { - max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]); - } - size_t common_dim = lod[0].back() == 0 ? 
0 : num / lod[0].back(); - if (max_dim > 0) { - data.resize((lod[0].size() - 1) * max_dim * common_dim, 0); - } - for (size_t j = 1; j < lod[0].size(); j++) { - size_t start = lod[0][j - 1] * common_dim; - size_t end = lod[0][j] * common_dim; - if (end > start) { - std::copy(output_ptr + start, output_ptr + end, - data.begin() + (j - 1) * max_dim * common_dim); - } - } - shape.push_back(lod[0].size() - 1); - shape.push_back(max_dim); - for (int j = 1; j < dims_i.size(); ++j) { - shape.push_back(dims_i[j]); + } else { + // for batch detection + // image[0] -> output[0] shape {145, 6} + // image[1] -> output[1] shape {176, 6} + // then, + // the batch output shape {321, 6} + // the lod {{0, 145, 321}} + // so we should append output[0] to {176, 6} + size_t max_dim = 0; + for (size_t j = 1; j < lod[0].size(); j++) { + max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]); + } + size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back(); + if (max_dim > 0) { + data.resize((lod[0].size() - 1) * max_dim * common_dim, 0); + } + for (size_t j = 1; j < lod[0].size(); j++) { + size_t start = lod[0][j - 1] * common_dim; + size_t end = lod[0][j] * common_dim; + if (end > start) { + std::copy(output_ptr + start, output_ptr + end, + data.begin() + (j - 1) * max_dim * common_dim); } } - - outputs->at(i).shape = shape; - auto &buffer = outputs->at(i).data; - if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) { - buffer.Resize(sizeof(float) * data.size()); + shape.push_back(lod[0].size() - 1); + shape.push_back(max_dim); + for (int j = 1; j < dims_i.size(); ++j) { + shape.push_back(dims_i[j]); } - std::memcpy(buffer.data(), data.data(), buffer.length()); - // copy LoD - for (const auto &level : fetchs[i].lod()) { - outputs->at(i).lod.emplace_back(level); + } + + output->shape = shape; + auto &buffer = output->data; + if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) { + buffer.Resize(sizeof(T) * data.size()); + } + std::memcpy(buffer.data(), 
data.data(), buffer.length()); + // copy LoD + for (const auto &level : fetch.lod()) { + output->lod.emplace_back(level); + } +} + +bool NativePaddlePredictor::GetFetch(std::vector *outputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::get_fetch"; + outputs->resize(fetchs_.size()); + for (size_t i = 0; i < fetchs_.size(); ++i) { + int idx = boost::get(fetchs_[i]->GetAttr("col")); + PADDLE_ENFORCE((size_t)idx == i); + framework::LoDTensor &fetch = + framework::GetFetchVariable(*scope, "fetch", idx); + auto type = fetch.type(); + auto output = &(outputs->at(i)); + if (type == typeid(float)) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::FLOAT32; + } else if (type == typeid(int64_t)) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::INT64; + } else { + LOG(ERROR) << "unknown type, only support float32 and int64 now."; } - outputs->at(i).dtype = PaddleDType::FLOAT32; - // TODO(panyx0718): support other types? fill tensor name? avoid a copy. } return true; } @@ -294,7 +318,12 @@ std::unique_ptr CreatePaddlePredictor< if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } +#ifdef __clang__ + // fix clang compile error + return predictor; +#else return std::move(predictor); +#endif } } // namespace paddle diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 4f28c3cd34bade4189871210e6168c6c1c610c2c..ec801c58857e716241d28404510530e551ed25aa 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -47,9 +48,13 @@ class NativePaddlePredictor : public PaddlePredictor { protected: bool SetFeed(const std::vector &input_datas, - std::vector *feeds); - bool GetFetch(const std::vector &fetchs, - std::vector *output_data); + framework::Scope *scope); + bool GetFetch(std::vector *output_data, + framework::Scope *scope); + template + void GetFetchOne(const 
framework::LoDTensor &fetchs, + PaddleTensor *output_data); + void PrepareFeedFetch(); NativeConfig config_; platform::Place place_; @@ -57,8 +62,9 @@ class NativePaddlePredictor : public PaddlePredictor { std::shared_ptr scope_; std::unique_ptr ctx_; std::unique_ptr inference_program_; - std::vector feed_target_names_; - std::vector fetch_target_names_; + std::vector feeds_; + std::map feed_names_; + std::vector fetchs_; // Do not use unique_ptr, use parent scope to delete framework::Scope *sub_scope_{nullptr}; }; diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 9ac037297167fe7de29925ffe36f4d39efb65313..abee375313850f1490bacec11f737706c061a5e9 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -32,6 +32,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { : NativePaddlePredictor(config), config_(config) {} bool Init(const std::shared_ptr& parent_scope) { + FLAGS_IA_enable_tensorrt_subgraph_engine = true; VLOG(3) << "Predictor::init()"; FLAGS_tensorrt_max_batch_size = config_.max_batch_size; FLAGS_tensorrt_workspace_size = config_.workspace_size; @@ -73,10 +74,8 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { VLOG(5) << "to create variables"; executor_->CreateVariables(*inference_program_, sub_scope_ ? 
sub_scope_ : scope_.get(), 0); - // Get the feed_target_names and fetch_target_names - feed_target_names_ = inference_program_->GetFeedTargetNames(); - fetch_target_names_ = inference_program_->GetFetchTargetNames(); + PrepareFeedFetch(); return true; } @@ -161,3 +160,4 @@ USE_TRT_CONVERTER(fc); USE_TRT_CONVERTER(pool2d); USE_TRT_CONVERTER(softmax); USE_TRT_CONVERTER(batch_norm); +USE_TRT_CONVERTER(concat); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index 8f1a72316d6c146ebc9a86ced739ef088a3b4267..9e7425eddd2df07ffe897f908aad360abe42117a 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -37,6 +37,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { config1.use_gpu = true; config1.fraction_of_gpu_memory = 0.3; config1.device = 0; + config1.max_batch_size = 10; auto predictor0 = CreatePaddlePredictor(config0); diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index f0e98cfbfd0729f47de57d5f22b4c01a62dd2995..afb46a7139f6ab8e6b3697fdc56fe1c78a05cd64 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -28,7 +28,6 @@ include_directories("${PADDLE_LIB}") include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") include_directories("${PADDLE_LIB}/third_party/install/glog/include") include_directories("${PADDLE_LIB}/third_party/install/gflags/include") -message("gflags " "${PADDLE_LIB}/third_party/install/gflags/include") if (NOT WIN32) include_directories("${PADDLE_LIB}/third_party/install/snappy/include") include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc new file mode 100644 index 
0000000000000000000000000000000000000000..9cc491e10d691a206dd903b78c0ea570741da44c --- /dev/null +++ b/paddle/fluid/inference/api/helper.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/helper.h" + +namespace paddle { +namespace inference { + +template <> +std::string to_string>( + const std::vector> &vec) { + std::stringstream ss; + for (const auto &piece : vec) { + ss << to_string(piece) << "\n"; + } + return ss.str(); +} + +template <> +std::string to_string>>( + const std::vector>> &vec) { + std::stringstream ss; + for (const auto &line : vec) { + for (const auto &rcd : line) { + ss << to_string(rcd) << ";\t"; + } + ss << '\n'; + } + return ss.str(); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index de05514826f4e4b032f22127c404bf14a3bc3bcc..90c4b56d5363ebaccd64c2cf6b70abad0be4602a 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -25,7 +26,8 @@ namespace paddle { namespace inference { -void split(const std::string &str, char sep, std::vector *pieces) { + +static void split(const std::string &str, char sep, std::vector *pieces) { pieces->clear(); if (str.empty()) { return; @@ -41,12 +43,20 @@ void split(const std::string 
&str, char sep, std::vector *pieces) { pieces->push_back(str.substr(pos)); } } -void split_to_float(const std::string &str, char sep, std::vector *fs) { +static void split_to_float(const std::string &str, char sep, + std::vector *fs) { std::vector pieces; split(str, sep, &pieces); std::transform(pieces.begin(), pieces.end(), std::back_inserter(*fs), [](const std::string &v) { return std::stof(v); }); } +static void split_to_int64(const std::string &str, char sep, + std::vector *is) { + std::vector pieces; + split(str, sep, &pieces); + std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is), + [](const std::string &v) { return std::stoi(v); }); +} template std::string to_string(const std::vector &vec) { std::stringstream ss; @@ -57,36 +67,23 @@ std::string to_string(const std::vector &vec) { } template <> std::string to_string>( - const std::vector> &vec) { - std::stringstream ss; - for (const auto &piece : vec) { - ss << to_string(piece) << "\n"; - } - return ss.str(); -} + const std::vector> &vec); + template <> std::string to_string>>( - const std::vector>> &vec) { - std::stringstream ss; - for (const auto &line : vec) { - for (const auto &rcd : line) { - ss << to_string(rcd) << ";\t"; - } - ss << '\n'; - } - return ss.str(); -} + const std::vector>> &vec); -void TensorAssignData(PaddleTensor *tensor, - const std::vector> &data) { +template +static void TensorAssignData(PaddleTensor *tensor, + const std::vector> &data) { // Assign buffer int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1, [](int a, int b) { return a * b; }); - tensor->data.Resize(sizeof(float) * dim); + tensor->data.Resize(sizeof(T) * dim); int c = 0; for (const auto &f : data) { - for (float v : f) { - static_cast(tensor->data.data())[c++] = v; + for (T v : f) { + static_cast(tensor->data.data())[c++] = v; } } } diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 
36fd0727aa7beef4a06a5f2e63ec0c43583ddf84..1baa64c249f291ec1bc874be5031abe6d4368274 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -77,6 +77,7 @@ enum class PaddleEngineKind { kNative = 0, // Use the native Fluid facility. kAnakin, // Use Anakin for inference. kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. + kAnalysis // TODO(Superjomn) support following engines latter. // kTensorRT, // Use TensorRT for inference. // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 181868977dd8f2568486ed0c4e1f260a69795896..cef7b2a7e3a29da05628d7540f5545dc9adda27e 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -143,5 +143,21 @@ std::unique_ptr Load( return main_program; } +void SaveVars(const framework::Scope& scope, + const std::vector& vars, const std::string& dirname, + bool predicate) { + framework::ProgramDesc prog; + auto* block = prog.MutableBlock(0); + auto* op = block->AppendOp(); + op->SetType("save_combine"); + op->SetInput("X", vars); + op->SetAttr("file_path", dirname + "/param"); + op->CheckAttrs(); + + platform::CPUPlace place; + framework::Executor exe(place); + exe.Run(prog, const_cast(&scope), 0, true, true); +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h index 01b50b3670cb9da2e0be232a61ea6129dd83aa20..ab492577c1476abee30d6dd1c740394391e5a93a 100644 --- a/paddle/fluid/inference/io.h +++ b/paddle/fluid/inference/io.h @@ -41,5 +41,10 @@ std::unique_ptr Load(framework::Executor* executor, const std::string& prog_filename, const std::string& param_filename); +// Save the variables from a scope to disk. 
+void SaveVars(const framework::Scope& scope, + const std::vector& vars, const std::string& dirname, + bool predicate = true); + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 2a449eb95e3537a11962912a6a3f29e89958fbd8..9d7be2d03cf7bb12afe7e52d9630f184d689dc25 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,7 +1,7 @@ # Add TRT tests nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc -batch_norm_op.cc activation_op.cc softmax_op.cc +batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc DEPS tensorrt_engine operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS @@ -18,12 +18,12 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL) nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL) - nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL) - nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL) - nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL) + +nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..bb9627bf957b63993b2c8d23e7ec8122eb004eaf --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc 
@@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * ConcatOp, IConcatenationLayer in TRT. This layer doesn't have weights. + */ +class ConcatOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + std::vector itensors; + for (auto& input_name : op_desc.Input("X")) { + itensors.push_back(engine_->GetITensor(input_name)); + } + int axis = boost::get(op_desc.GetAttr("axis")); + PADDLE_ENFORCE(axis > 0, + "The axis attr of Concat op should be large than 0 for trt"); + + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(), + itensors.size()); + axis = axis - 1; // Remove batch dim + layer->setAxis(axis); + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. 
+ engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(concat, ConcatOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 41faaf7212accaaec238062b1340e8da8fa6be33..d309d94c560f2b484fac6b6cd40cc2704d641069 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -79,6 +79,14 @@ class OpConverter { it = Registry::Lookup("elementwise_" + op_type + "_tensor"); } + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", + op_desc.Type()); + } + + if (op_desc.Type() == "depthwise_conv2d") { + it = Registry::Lookup("conv2d"); + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", + op_desc.Type()); } if (!it) { diff --git a/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f284a4db5758e072915d7fd0f16115b8a36ba8b --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(concat_op, test) { + std::unordered_set parameters({""}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("concat_x1", nvinfer1::DimsCHW(10, 3, 1)); + validator.DeclInputVar("concat_x2", nvinfer1::DimsCHW(3, 3, 1)); + validator.DeclInputVar("concat_x3", nvinfer1::DimsCHW(7, 3, 1)); + validator.DeclOutputVar("concat_out", nvinfer1::DimsCHW(20, 3, 1)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("concat"); + desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"}); + desc.SetOutput("Out", {"concat_out"}); + + int axis = 1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + + validator.Execute(5); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(concat); diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index e2a3e9d46ef9f303d191d59253ffbe9f4826184b..cbcfc964c91c33ab41a72ad7fec759086ad887cc 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -21,6 +21,8 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/framework/feed_fetch_method.h" + DEFINE_string(model_path, "", "Directory of the inference model."); DEFINE_string(data_file, "", "File of input index data."); DEFINE_int32(repeat, 100, "Running the inference program repeat times"); @@ -124,14 +126,35 @@ void ThreadRunInfer( std::map feed_targets; PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL); + // map the data of feed_targets to feed_holder + for (auto* op : inference_program->Block(0).AllOps()) { + if (op->Type() == "feed") { + std::string feed_target_name = op->Output("Out")[0]; + int idx = boost::get(op->GetAttr("col")); + paddle::framework::SetFeedVariable(scope, *feed_targets[feed_target_name], + "feed", idx); + } + } + auto& inputs = jobs[tid]; auto start_ms = GetCurrentMs(); for (size_t i = 0; i < inputs.size(); ++i) { feed_targets[feed_target_names[0]] = inputs[i]; - executor.RunPreparedContext(ctx.get(), &sub_scope, &feed_targets, - &fetch_targets, false /*create_local_scope*/); + executor.RunPreparedContext(ctx.get(), &sub_scope, + false /*create_local_scope*/); } auto stop_ms = GetCurrentMs(); + + // obtain the data of fetch_targets from fetch_holder + for (auto* op : inference_program->Block(0).AllOps()) { + if (op->Type() == "fetch") { + std::string fetch_target_name = op->Input("X")[0]; + int idx = boost::get(op->GetAttr("col")); + *fetch_targets[fetch_target_name] = + paddle::framework::GetFetchVariable(*scope, "fetch", idx); + } + } + scope->DeleteScope(&sub_scope); LOG(INFO) << "Tid: " << tid << ", process " << inputs.size() << " samples, avg time per sample: " diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 695790a37dce889e838462b401ca4e89f09271d5..94f0550df57e79fa68c135f5c9c4b7effe6ac156 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -18,6 +18,7 @@ 
limitations under the License. */ #include #include +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/profiler.h" @@ -135,6 +136,15 @@ std::vector> GetFeedTargetShapes( return feed_target_shapes; } +void Compile(paddle::framework::ProgramDesc* program) { + std::unique_ptr g( + new paddle::framework::ir::Graph(*program)); + auto pass = paddle::framework::ir::PassRegistry::Instance().Get( + "graph_to_program_pass"); + pass->SetNotOwned("program", program); + pass->Apply(std::move(g)); +} + template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, @@ -172,6 +182,8 @@ void TestInference(const std::string& dirname, paddle::platform::DeviceContextPool::Instance().Get(place)); inference_program = InitProgram(&executor, scope, dirname, is_combined); } + Compile(inference_program.get()); + // Disable the profiler and print the timing information paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, "load_program_profiler"); @@ -249,3 +261,5 @@ void TestInference(const std::string& dirname, delete scope; } + +USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index bf7a9e8264763628ce76e61863d6e3b936799105..1b96798d23cec34a1863f56c1e4027ce32b2eec5 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ #ifdef _WIN32 #include -#include +#include // VirtualLock/VirtualUnlock #else #include // for mlock and munlock #endif diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index fd48b72c564c6cffb789e6ee99955abb58707cdb..d3ca385937c45c5acd4082191293a584d7cd785e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -85,9 +85,13 @@ function(op_library TARGET) #remove windows unsupported op if (WIN32) +<<<<<<< HEAD # no nccl, no avx instructions ops. foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "lstm_op" "fusion_lstm_op") +======= + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") +>>>>>>> origin/develop if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() @@ -295,6 +299,8 @@ op_library(unsqueeze_op DEPS reshape_op) op_library(squeeze_op DEPS reshape_op) op_library(extract_rows_op DEPS memory) op_library(flatten_op DEPS reshape_op) +op_library(sequence_pad_op DEPS sequence_padding) +op_library(unstack_op DEPS stack_op) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 288b3b1f0b9fa821727f95b3656448641aaf4573..39b0c856996c11c6efdb530f1396afd5731c778d 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -55,7 +55,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { const int D = w_dims[1] / 4; PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(LSTMWeight)'s rank must be 2."); PADDLE_ENFORCE_EQ(w_dims[0], D + M, - "LSTMWeight dims should be (%d + %d) * %d.", D + M, 4 * D); + "LSTMWeight dims should be (%d + %d) * %d.", D, M, 4 * D); auto b_dims = ctx->GetInputDim("LSTMBias"); PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(LSTMBias)'s rank must be 2."); @@ 
-231,40 +231,28 @@ use lstm_x_t as input and compute as standard LSTM. template inline void bias_relu(const int n, const T* x, const T* bias, T* y) { if (bias) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] + bias[0]; - } - math::vec_relu(n, y, y); + math::vec_add_bias(n, *bias, x, y); + math::vec_relu(n, y, y); } else { - math::vec_relu(n, x, y); + math::vec_relu(n, x, y); } } -template -inline void vec_softmax(const math::BlasT& blas, const int n, - const T* x, T* y) { +template +inline void vec_softmax(const int n, const T* x, T* y) { T scalar = x[0]; // max for (int i = 1; i < n; ++i) { scalar = scalar < x[i] ? x[i] : scalar; } - - // sub - for (int i = 0; i < n; ++i) { - y[i] = x[i] - scalar; - } - - // exp - blas.VEXP(n, y, y); - + math::vec_add_bias(n, -scalar, x, y); // sub + math::vec_exp(n, y, y); // exp // sum scalar = T(0); for (int i = 0; i < n; ++i) { scalar += y[i]; } - - // scale - blas.SCAL(n, static_cast(1) / scalar, y); + math::vec_scal(n, static_cast(1) / scalar, y); // scale } template @@ -310,11 +298,21 @@ class AttentionLSTMKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D); fc_out->Resize({max_seq_len, 1}); - math::VecActivations act_functor; std::function act_gate, act_cell, act_cand; - act_gate = act_functor(ctx.Attr("gate_activation")); - act_cell = act_functor(ctx.Attr("cell_activation")); - act_cand = act_functor(ctx.Attr("candidate_activation")); + auto& act_gate_str = ctx.Attr("gate_activation"); + auto& act_cell_str = ctx.Attr("cell_activation"); + auto& act_cand_str = ctx.Attr("candidate_activation"); + if (platform::jit::MayIUse(platform::jit::avx)) { + math::VecActivations act_functor; + act_gate = act_functor(act_gate_str); + act_cell = act_functor(act_cell_str); + act_cand = act_functor(act_cand_str); + } else { + math::VecActivations act_functor; + act_gate = act_functor(act_gate_str); + act_cell = act_functor(act_cell_str); + act_cand = act_functor(act_cand_str); + } 
const T* x_data = x->data(); const T* h0_data = h0 ? h0->data() : NULL; @@ -362,7 +360,7 @@ class AttentionLSTMKernel : public framework::OpKernel { fc_out_data); } // 1d. softmax - vec_softmax(blas, seq_len, fc_out_data, fc_out_data); + vec_softmax(seq_len, fc_out_data, fc_out_data); // mul x(seq_len*M) and sum pool math::FCCompute(blas, 1, M, seq_len, fc_out_data, cur_x_data, lstm_x_data); diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index 9ab2179b5fe689762704039c5f67dd080e530aa5..de641cb08e4cc3322cc8387d873f2aaab279e1dd 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -37,6 +37,95 @@ struct bn_type_traits { using op_prim = typename op_type::primitive_desc; }; +class BatchNormMKLDNNHandler : public platform::MKLDNNHandler { + public: + BatchNormMKLDNNHandler( + std::shared_ptr batch_norm_pd, + const platform::MKLDNNDeviceContext &dev_ctx, mkldnn::engine engine, + const std::string &base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key) { + batch_norm_pd_ = batch_norm_pd; + } + + std::shared_ptr AcquireScaleshiftMemoryFromPrimitive(void *ptr) { + return this->AcquireMemoryFromPrimitive( + batch_norm_pd_->weights_primitive_desc(), ptr, "@scaleshift_mem_p"); + } + + std::shared_ptr AcquireMeanMemoryFromPrimitive(void *ptr) { + return this->AcquireMemoryFromPrimitive( + batch_norm_pd_->mean_primitive_desc(), ptr, "@mean_mem_p"); + } + + std::shared_ptr AcquireVarianceMemoryFromPrimitive(void *ptr) { + return this->AcquireMemoryFromPrimitive( + batch_norm_pd_->variance_primitive_desc(), ptr, "@variance_mem_p"); + } + + std::shared_ptr AcquireTestTrainingBatchNormFwd( + std::shared_ptr src_memory, + std::shared_ptr scaleshift_memory, + std::shared_ptr dst_memory, std::shared_ptr mean_memory, + std::shared_ptr variance_memory, bool is_test) { + auto prim_key = key_ + "@batch_norm_p"; + auto batch_norm_p = + 
std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + + PADDLE_ENFORCE((batch_norm_p != nullptr) || !is_reusing_, + "Fail to find batch norm primitive in device context"); + + if (batch_norm_p == nullptr) { + if (is_test) { + batch_norm_p = std::make_shared( + *batch_norm_pd_, *src_memory, + (const mkldnn::primitive::at &)*mean_memory, + (const mkldnn::primitive::at &)*variance_memory, *scaleshift_memory, + *dst_memory); + } else { + batch_norm_p = std::make_shared( + *batch_norm_pd_, *src_memory, *scaleshift_memory, *dst_memory, + *mean_memory, *variance_memory); + } + + dev_ctx_.SetBlob(prim_key, batch_norm_p); + } else { + is_reusing_ = true; + } + + return batch_norm_p; + } + + static std::string GetHash(const memory::dims &input_dims, float epsilon, + unsigned flag, bool is_test, memory::format format, + const std::string &suffix = "") { + auto dims2str = [](const memory::dims &operand_dims) { + std::string dstr = ""; + for (size_t i = 0; i < operand_dims.size(); ++i) { + dstr += std::to_string(operand_dims[i]) + "-"; + } + return dstr; + }; + return dims2str(input_dims) + std::to_string(epsilon) + + std::to_string(flag) + std::to_string(is_test) + + std::to_string(format) + suffix; + } + + private: + std::shared_ptr batch_norm_pd_; +}; + +std::shared_ptr UpdateMemoryData( + const platform::MKLDNNDeviceContext &dev_ctx, const std::string &key, + void *new_ptr) { + auto mem = std::static_pointer_cast(dev_ctx.GetBlob(key)); + PADDLE_ENFORCE( + mem != nullptr, + (std::string("Fail to find memory in device context [key: ") + key + "]") + .c_str()); + mem->set_data_handle(new_ptr); + return mem; +} + template void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end, Container *c) { @@ -48,15 +137,6 @@ void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end, std::inserter(*c, std::next(it, std::distance(scale_begin, scale_end)))); } -template -void run_batch_norm_op(Args &&... 
args) { - Op batch_norm_op{args...}; - - std::vector pipeline; - pipeline.push_back(batch_norm_op); - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); -} - } // namespace template @@ -110,6 +190,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); const unsigned int ic = scale_tz[0]; + // MKLDNN requires a single piece of memory for scale and shift/bias data + const size_t scaleshift_size = 2 * ic; + std::vector scaleshift_data; + scaleshift_data.reserve(scaleshift_size); + + copy_to_weights(scale->data(), scale->data() + ic, shift->data(), + shift->data() + ic, &scaleshift_data); + unsigned flags = mkldnn::use_scale_shift; if (is_test) flags |= mkldnn::use_global_stats; if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; @@ -118,64 +206,69 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn::memory::format input_format = platform::MKLDNNFormatForSize(src_tz.size(), x->format()); - auto src_memory = memory( - {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, - to_void_cast(x_data)); + // keys for backward pass + const std::string key = BatchNormMKLDNNHandler::GetHash( + src_tz, epsilon, flags, is_test, input_format, + ctx.op().Output("SavedMean")); + const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), input_format); // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; - auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ - propagation, src_memory.get_primitive_desc().desc(), epsilon, flags}; - std::shared_ptr batch_norm_fwd_pd = - std::shared_ptr( - new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc, - mkldnn_engine)); - - // Save the pd to be used in backward pass - const std::string key = ctx.op().Output("SavedMean"); - const std::string key_batch_norm_fwd_pd = key + 
"@bn_fwd_pd"; + auto batch_norm_fwd_desc = + bn_fwd_types::op_desc{propagation, user_src_md, epsilon, flags}; + auto batch_norm_fwd_pd = std::make_shared( + batch_norm_fwd_desc, mkldnn_engine); + // Save conv_pd/src_memory/weights_memory for backward pass dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd); - // MKLDNN requires a single piece of memory for scale and shift/bias data - const size_t scaleshift_size = 2 * ic; - std::vector scaleshift_data; - scaleshift_data.reserve(scaleshift_size); + BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine, + key); - copy_to_weights(scale->data(), scale->data() + ic, shift->data(), - shift->data() + ic, &scaleshift_data); + auto src_memory = + handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data)); // crate mkldnn memory for weights(scale/shift) - auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(), - scaleshift_data.data()); + auto scaleshift_memory = + handler.AcquireScaleshiftMemoryFromPrimitive(scaleshift_data.data()); // create mkldnn memory for output y tensor - auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data); + auto dst_memory = handler.AcquireDstMemory( + batch_norm_fwd_pd->dst_primitive_desc().desc(), y_data); + std::shared_ptr batch_norm_p; if (is_test) { // create mkldnn memory for stats (as input) - auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(), - to_void_cast(mean_data)); - auto variance_memory = - memory(batch_norm_fwd_pd->variance_primitive_desc(), - to_void_cast(variance_data)); - - run_batch_norm_op( - *batch_norm_fwd_pd, src_memory, - (const mkldnn::primitive::at &)mean_memory, - (const mkldnn::primitive::at &)variance_memory, scaleshift_memory, - dst_memory); + std::shared_ptr mean_memory = + handler.AcquireMeanMemoryFromPrimitive(to_void_cast(mean_data)); + std::shared_ptr variance_memory = + handler.AcquireVarianceMemoryFromPrimitive( + to_void_cast(variance_data)); + + batch_norm_p = 
handler.AcquireTestTrainingBatchNormFwd( + src_memory, scaleshift_memory, dst_memory, mean_memory, + variance_memory, true); } else { // create mkldnn memory for stats (as output) - auto mean_memory = - memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data); - auto variance_memory = memory( - batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data); - - run_batch_norm_op(*batch_norm_fwd_pd, src_memory, - scaleshift_memory, dst_memory, - mean_memory, variance_memory); + std::shared_ptr mean_memory = + handler.AcquireMeanMemoryFromPrimitive(batch_mean_data); + std::shared_ptr variance_memory = + handler.AcquireVarianceMemoryFromPrimitive(batch_variance_data); + + batch_norm_p = handler.AcquireTestTrainingBatchNormFwd( + src_memory, scaleshift_memory, dst_memory, mean_memory, + variance_memory, false); } + y->set_layout(DataLayout::kMKLDNN); + y->set_format(platform::GetMKLDNNFormat(*dst_memory)); + + std::vector pipeline; + pipeline.push_back(*batch_norm_p); + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + if (!is_test) { // mkldnn only compute stats for current batch // so we need compute momentum stats via Eigen lib @@ -192,10 +285,6 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { running_variance_e = variance_e * momentum + batch_variance_e * one_minus_momentum; } - - y->set_layout(DataLayout::kMKLDNN); - y->set_format( - (memory::format)dst_memory.get_primitive_desc().desc().data.format); } }; @@ -242,61 +331,48 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { const unsigned int ic = scale_tz[0]; - // Retrieve bn_fwd_pd from device context - const std::string key = ctx.op().Input("SavedMean"); - const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; - auto batch_norm_fwd_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_batch_norm_fwd_pd)); - PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr, - "Fail to find batch_norm_fwd_pd in device context"); - using 
bn_bwd_types = bn_type_traits; - // create mkldnn memory from input diff_y tensor - mkldnn::memory::format dst_format = platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); - auto user_diff_dst_memory = memory( - {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, - to_void_cast(diff_y_data)); - - // create mkldnn memory from input x tensor mkldnn::memory::format input_format = platform::MKLDNNFormatForSize(src_tz.size(), x->format()); - auto src_memory = memory( - {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, - to_void_cast(x_data)); + unsigned flags = mkldnn::use_scale_shift; - // for diff_dst, try to use same format as dst in forward pass - auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); - auto diff_dst_md = diff_dst_pd.desc(); + // keys from forward pass + const std::string key = BatchNormMKLDNNHandler::GetHash( + src_tz, epsilon, flags, false, input_format, + ctx.op().Input("SavedMean")); + const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + + // keys for primitives reuse + const std::string key_with_hash = + key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false, + input_format); + const std::string key_batch_norm_bwd_p = + key_with_hash + "@batch_norm_bwd_p"; + const std::string key_batch_norm_src_mem_p = + key_with_hash + "@batch_norm_bwd_src_mem_p"; + const std::string key_batch_norm_mean_mem_p = + key_with_hash + "@batch_norm_bwd_mean_mem_p"; + const std::string key_batch_norm_variance_mem_p = + key_with_hash + "@batch_norm_bwd_variance_mem_p"; + const std::string key_batch_norm_scaleshift_mem_p = + key_with_hash + "@batch_norm_bwd_scaleshift_mem_p"; + const std::string key_batch_norm_diff_scaleshift_mem_p = + key_with_hash + "@batch_norm_bwd_diff_scaleshift_mem_p"; + const std::string key_batch_norm_diff_src_mem_p = + key_with_hash + "@batch_norm_bwd_diff_src_mem_p"; + const std::string key_batch_norm_diff_dst_mem_p = + key_with_hash + 
"@batch_norm_bwd_diff_dst_mem_p"; - // create primitive descriptor for batch norm backward - unsigned flags = mkldnn::use_scale_shift; - auto batch_norm_bwd_desc = bn_bwd_types::op_desc{ - mkldnn::prop_kind::backward, diff_dst_md, - src_memory.get_primitive_desc().desc(), epsilon, flags}; - auto batch_norm_bwd_pd = bn_bwd_types::op_prim{ - batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd}; - - // reorder user_diff_dst if it's not in preferred format - auto diff_dst_memory = user_diff_dst_memory; primitive reorder_diff_dst; bool is_diff_dst_reordered = false; - if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) { - diff_dst_memory = memory(diff_dst_pd); - reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory); - is_diff_dst_reordered = true; - } - - // create mkldnn memory for input tensors (src/mean/variance) - auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(), - to_void_cast(batch_mean_data)); - auto variance_memory = memory(batch_norm_bwd_pd.variance_primitive_desc(), - to_void_cast(batch_variance_data)); + auto user_diff_dst_memory = memory( + {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, + to_void_cast(diff_y_data)); // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; @@ -306,30 +382,118 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic, &scaleshift_data); - // create mkldnn memory for input tensors (scale/shift) - auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(), - scaleshift_data.data()); - - // create mkldnn memory for output diff weights (combined scale/shift) std::vector diff_scaleshift_data; diff_scaleshift_data.reserve(scaleshift_size); - auto diff_scaleshift_memory = - memory(batch_norm_bwd_pd.diff_weights_primitive_desc(), - diff_scaleshift_data.data()); - // here assume diff_src is in the same format 
of src - auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data); + auto batch_norm_fwd_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_batch_norm_fwd_pd)); + PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr, + "Fail to find batch_norm_fwd_pd in device context"); - // finally create batch_norm backward primitive - auto batch_norm_bwd_prim = - batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory, - variance_memory, diff_dst_memory, scaleshift_memory, - diff_src_memory, diff_scaleshift_memory); + auto batch_norm_bwd_p = std::static_pointer_cast( + dev_ctx.GetBlob(key_batch_norm_bwd_p)); + + if (batch_norm_bwd_p == nullptr) { + auto src_memory = std::shared_ptr(new memory( + {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, + to_void_cast(x_data))); + + // for diff_dst, try to use same format as dst in forward pass + auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); + auto diff_dst_md = diff_dst_pd.desc(); + + // create primitive descriptor for batch norm backward + auto batch_norm_bwd_desc = bn_bwd_types::op_desc{ + mkldnn::prop_kind::backward, diff_dst_md, + src_memory->get_primitive_desc().desc(), epsilon, flags}; + auto batch_norm_bwd_pd = bn_bwd_types::op_prim{ + batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd}; + + // reorder user_diff_dst if it's not in preferred format + auto diff_dst_memory = std::make_shared(user_diff_dst_memory); + if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = std::make_shared(diff_dst_pd); + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } + + // create mkldnn memory for input tensors (src/mean/variance) + auto mean_memory = + std::make_shared(batch_norm_bwd_pd.mean_primitive_desc(), + to_void_cast(batch_mean_data)); + auto variance_memory = + std::make_shared(batch_norm_bwd_pd.variance_primitive_desc(), + to_void_cast(batch_variance_data)); + + // create mkldnn memory for input 
tensors (scale/shift) + auto scaleshift_memory = std::make_shared( + batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()); + + // create mkldnn memory for output diff weights (combined scale/shift) + auto diff_scaleshift_memory = std::make_shared( + batch_norm_bwd_pd.diff_weights_primitive_desc(), + diff_scaleshift_data.data()); + + // here assume diff_src is in the same format of src + auto diff_src_memory = std::make_shared( + src_memory->get_primitive_desc(), diff_x_data); + + // finally create batch_norm backward primitive + batch_norm_bwd_p = std::make_shared( + batch_norm_bwd_pd, *src_memory, *mean_memory, *variance_memory, + *diff_dst_memory, *scaleshift_memory, *diff_src_memory, + *diff_scaleshift_memory); + + dev_ctx.SetBlob(key_batch_norm_bwd_p, batch_norm_bwd_p); + dev_ctx.SetBlob(key_batch_norm_src_mem_p, src_memory); + dev_ctx.SetBlob(key_batch_norm_mean_mem_p, mean_memory); + dev_ctx.SetBlob(key_batch_norm_variance_mem_p, variance_memory); + dev_ctx.SetBlob(key_batch_norm_scaleshift_mem_p, scaleshift_memory); + dev_ctx.SetBlob(key_batch_norm_diff_scaleshift_mem_p, + diff_scaleshift_memory); + dev_ctx.SetBlob(key_batch_norm_diff_src_mem_p, diff_src_memory); + dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory); + + // set layout/format of output tensors + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format); + } else { + // primitives already exist + UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data)); + UpdateMemoryData(dev_ctx, key_batch_norm_mean_mem_p, + to_void_cast(batch_mean_data)); + UpdateMemoryData(dev_ctx, key_batch_norm_variance_mem_p, + to_void_cast(batch_variance_data)); + UpdateMemoryData(dev_ctx, key_batch_norm_scaleshift_mem_p, + scaleshift_data.data()); + UpdateMemoryData(dev_ctx, key_batch_norm_diff_scaleshift_mem_p, + diff_scaleshift_data.data()); + auto diff_src_memory = UpdateMemoryData( + 
dev_ctx, key_batch_norm_diff_src_mem_p, to_void_cast(diff_x_data)); + auto diff_dst_memory = UpdateMemoryData( + dev_ctx, key_batch_norm_diff_dst_mem_p, to_void_cast(diff_y_data)); + + // reorder user_diff_dst if it's not in preferred format + if (diff_dst_memory->get_primitive_desc() != + user_diff_dst_memory.get_primitive_desc()) { + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } + + // set layout/format of output tensors + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format); + } // execute optional reorder and batch_norm backward primitive std::vector pipeline; if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst); - pipeline.push_back(batch_norm_bwd_prim); + pipeline.push_back(*batch_norm_bwd_p); stream(stream::kind::eager).submit(pipeline).wait(); // copy back diff sacle/shift to output tensors (diff scale/shift) @@ -338,12 +502,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::copy(it, std::next(it, ic), diff_scale_data); std::copy(std::next(it, ic), std::end(diff_scaleshift_data), diff_shift_data); - - // set layout/format of output tensors - diff_x->set_layout(DataLayout::kMKLDNN); - diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc() - .desc() - .data.format); } }; } // namespace operators diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 10d678111f5325e495b24286e6ecf651230393fe..b6cb935814e25b31d4104f9ce24fe952680cb491 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -74,7 +74,7 @@ struct BeamSearchDecodeFunctor { } template - void operator()() const; + void apply() const; bool tensor_on_gpu_; size_t beam_size_; @@ -88,7 +88,7 @@ struct BeamSearchDecodeFunctor { }; template -void 
BeamSearchDecodeFunctor::operator()() const { +void BeamSearchDecodeFunctor::apply() const { BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); // Check if the tensor is on GPU. If so, use the CPU copy instead if (tensor_on_gpu_) { @@ -101,7 +101,7 @@ void BeamSearchDecodeFunctor::operator()() const { } template <> -void BeamSearchDecodeFunctor::operator()() const { +void BeamSearchDecodeFunctor::apply() const { PADDLE_THROW("beam search decode op does not support bool!"); } diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index abc209d58d08693ae8dcd7b4615882bfeff8c7fd..469fe13774f717f858d5725cad0af40e5d603ece 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -37,7 +37,7 @@ struct CastOpFunctor { : in_(in), out_(out), ctx_(ctx) {} template - void operator()() const { + void apply() const { auto* in_begin = in_->data(); auto numel = in_->numel(); auto* in_end = in_begin + numel; diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index a44d84cd7b99107fef09a6b4dfa60172fabd718b..f4983c65432991a45f226d97f0fb05b08a30ca89 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -29,6 +29,7 @@ target_assign_op.cu) detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc polygon_box_transform_op.cu) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) - -# Export local libraries to parent +detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) +detection_library(generate_proposals_op SRCS generate_proposals_op.cc) +#Export local libraries to parent set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..0571c46f6be99c9a06b7dd2abb310eeda506ecd5 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -0,0 +1,515 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/concat.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +const int kBoxDim = 4; + +template +void AppendRois(LoDTensor* out, int64_t offset, Tensor* to_add) { + auto* out_data = out->data(); + auto* to_add_data = to_add->data(); + memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); +} + +class GenerateProposalLabelsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("RpnRois"), + "Input(RpnRois) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("GtClasses"), + "Input(GtClasses) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("GtBoxes"), + "Input(GtBoxes) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("ImScales"), + "Input(ImScales) shouldn't be null."); + + PADDLE_ENFORCE(ctx->HasOutput("Rois"), + 
"Output(Rois) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("LabelsInt32"), + "Output(LabelsInt32) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("BboxTargets"), + "Output(BboxTargets) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("BboxInsideWeights"), + "Output(BboxInsideWeights) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("BboxOutsideWeights"), + "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null"); + + auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); + auto gt_classes_dims = ctx->GetInputDim("GtClasses"); + auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); + auto im_scales_dims = ctx->GetInputDim("ImScales"); + + PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2, + "The rank of Input(RpnRois) must be 2."); + PADDLE_ENFORCE_EQ(gt_classes_dims.size(), 1, + "The rank of Input(GtClasses) must be 1."); + PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, + "The rank of Input(GtBoxes) must be 2."); + PADDLE_ENFORCE_EQ(im_scales_dims.size(), 1, + "The rank of Input(ImScales) must be 1."); + + int class_nums = ctx->Attrs().Get("class_nums"); + + ctx->SetOutputDim("Rois", {-1, 4}); + ctx->SetOutputDim("LabelsInt32", {-1}); + ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums}); + ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums}); + ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("RpnRois")); + return framework::OpKernelType(data_type, platform::CPUPlace()); + } +}; + +template +void Concat(const platform::CPUDeviceContext& context, + const Tensor& in_tensor_a, const Tensor& in_tensor_b, + Tensor* out_tensor) { + int axis = 0; + std::vector inputs; + inputs.emplace_back(in_tensor_a); + inputs.emplace_back(in_tensor_b); + 
math::ConcatFunctor concat_functor; + concat_functor(context, inputs, axis, out_tensor); +} + +template +void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes, + Tensor* overlaps) { + auto r_boxes_et = framework::EigenTensor::From(r_boxes); + auto c_boxes_et = framework::EigenTensor::From(c_boxes); + auto overlaps_et = framework::EigenTensor::From(*overlaps); + int r_num = r_boxes.dims()[0]; + int c_num = c_boxes.dims()[0]; + auto zero = static_cast(0.0); + T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h, + inter_area; + for (int i = 0; i < r_num; ++i) { + r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) * + (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1); + for (int j = 0; j < c_num; ++j) { + c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) * + (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1); + x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0)); + y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1)); + x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2)); + y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3)); + inter_w = std::max(x_max - x_min + 1, zero); + inter_h = std::max(y_max - y_min + 1, zero); + inter_area = inter_w * inter_h; + overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); + } + } +} + +template +void BoxToDelta(int box_num, const Tensor& ex_boxes, const Tensor& gt_boxes, + const std::vector& weights, Tensor* box_delta) { + auto ex_boxes_et = framework::EigenTensor::From(ex_boxes); + auto gt_boxes_et = framework::EigenTensor::From(gt_boxes); + auto box_delta_et = framework::EigenTensor::From(*box_delta); + T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y; + for (int64_t i = 0; i < box_num; ++i) { + ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + 1; + ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + 1; + ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w; + ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h; + + gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + 1; + gt_h = gt_boxes_et(i, 3) - 
gt_boxes_et(i, 1) + 1; + gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w; + gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h; + + box_delta_et(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0]; + box_delta_et(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1]; + box_delta_et(i, 2) = log(gt_w / ex_w) / ex_w / weights[2]; + box_delta_et(i, 3) = log(gt_h / ex_h) / ex_h / weights[3]; + } +} + +template +std::vector> SampleFgBgGt( + const platform::CPUDeviceContext& context, Tensor* iou, + const int batch_size_per_im, const float fg_fraction, const float fg_thresh, + const float bg_thresh_hi, const float bg_thresh_lo, + std::minstd_rand engine) { + std::vector fg_inds; + std::vector bg_inds; + std::vector gt_inds; + T* proposal_to_gt_overlaps = iou->mutable_data(context.GetPlace()); + int64_t row = iou->dims()[0]; + int64_t col = iou->dims()[1]; + float epsilon = 0.00001; + + // Follow the Faster RCNN's implementation + for (int64_t i = 0; i < row; ++i) { + const T* v = proposal_to_gt_overlaps + i * col; + T max_overlap = *std::max_element(v, v + col); + if (max_overlap > fg_thresh) { + for (int64_t j = 0; j < col; ++j) { + T val = proposal_to_gt_overlaps[i * col + j]; + auto diff = std::abs(max_overlap - val); + if (diff < epsilon) { + fg_inds.emplace_back(i); + gt_inds.emplace_back(j); + break; + } + } + } else { + if ((max_overlap >= bg_thresh_lo) && (max_overlap < bg_thresh_hi)) { + bg_inds.emplace_back(i); + } + } + } + + // Reservoir Sampling + int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction); + int fg_rois_this_image = fg_inds.size(); + int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image); + std::uniform_real_distribution uniform(0, 1); + const int64_t fg_size = static_cast(fg_inds.size()); + if (fg_size > fg_rois_per_this_image) { + for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) { + std::iter_swap(fg_inds.begin() + rng_ind, 
fg_inds.begin() + i); + std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i); + } + } + } + std::vector new_fg_inds(fg_inds.begin(), + fg_inds.begin() + fg_rois_per_this_image); + std::vector new_gt_inds(gt_inds.begin(), + gt_inds.begin() + fg_rois_per_this_image); + + int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image; + int bg_rois_this_image = bg_inds.size(); + int bg_rois_per_this_image = std::min(bg_rois_per_image, bg_rois_this_image); + const int64_t bg_size = static_cast(bg_inds.size()); + if (bg_size > bg_rois_per_this_image) { + for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) + std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); + } + } + std::vector new_bg_inds(bg_inds.begin(), + bg_inds.begin() + bg_rois_per_this_image); + std::vector> res; + res.emplace_back(new_fg_inds); + res.emplace_back(new_bg_inds); + res.emplace_back(new_gt_inds); + return res; +} + +template +void GatherBoxesLabels(const platform::CPUDeviceContext& context, + const Tensor& boxes, const Tensor& gt_boxes, + const Tensor& gt_classes, + const std::vector& fg_inds, + const std::vector& bg_inds, + const std::vector& gt_inds, Tensor* sampled_boxes, + Tensor* sampled_labels, Tensor* sampled_gts) { + int fg_num = fg_inds.size(); + int bg_num = bg_inds.size(); + int gt_num = fg_num + bg_num; + Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t; + int* fg_inds_data = fg_inds_t.mutable_data({fg_num}, context.GetPlace()); + int* bg_inds_data = bg_inds_t.mutable_data({bg_num}, context.GetPlace()); + int* gt_box_inds_data = + gt_box_inds_t.mutable_data({gt_num}, context.GetPlace()); + int* gt_label_inds_data = + gt_label_inds_t.mutable_data({fg_num}, context.GetPlace()); + std::copy(fg_inds.begin(), fg_inds.end(), fg_inds_data); + std::copy(bg_inds.begin(), bg_inds.end(), bg_inds_data); + std::copy(gt_inds.begin(), gt_inds.end(), gt_box_inds_data); 
+ std::copy(gt_inds.begin(), gt_inds.end(), gt_label_inds_data); + + Tensor fg_boxes, bg_boxes, fg_labels, bg_labels; + fg_boxes.mutable_data({fg_num, kBoxDim}, context.GetPlace()); + CPUGather(context, boxes, fg_inds_t, &fg_boxes); + bg_boxes.mutable_data({bg_num, kBoxDim}, context.GetPlace()); + CPUGather(context, boxes, bg_inds_t, &bg_boxes); + Concat(context, fg_boxes, bg_boxes, sampled_boxes); + CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); + fg_labels.mutable_data({fg_num}, context.GetPlace()); + CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); + bg_labels.mutable_data({bg_num}, context.GetPlace()); + math::set_constant(context, &bg_labels, 0); + Concat(context, fg_labels, bg_labels, sampled_labels); +} + +template +std::vector SampleRoisForOneImage( + const platform::CPUDeviceContext& context, Tensor* rpn_rois, + Tensor* gt_classes, Tensor* gt_boxes, Tensor* im_scale, + const int batch_size_per_im, const float fg_fraction, const float fg_thresh, + const float bg_thresh_hi, const float bg_thresh_lo, + const std::vector& bbox_reg_weights, const int class_nums, + std::minstd_rand engine) { + auto rpn_rois_et = framework::EigenTensor::From(*rpn_rois); + auto im_scale_data = im_scale->data()[0]; + rpn_rois_et = rpn_rois_et / im_scale_data; + + Tensor boxes; + int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0]; + boxes.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); + Concat(context, *gt_boxes, *rpn_rois, &boxes); + + // Overlaps + Tensor proposal_to_gt_overlaps; + proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes->dims()[0]}, + context.GetPlace()); + BboxOverlaps(boxes, *gt_boxes, &proposal_to_gt_overlaps); + + // Generate proposal index + std::vector> fg_bg_gt = SampleFgBgGt( + context, &proposal_to_gt_overlaps, batch_size_per_im, fg_fraction, + fg_thresh, bg_thresh_hi, bg_thresh_lo, engine); + std::vector fg_inds = fg_bg_gt[0]; + std::vector bg_inds = fg_bg_gt[1]; + std::vector gt_inds = fg_bg_gt[2]; + 
+ // Gather boxes and labels + Tensor sampled_boxes, sampled_labels, sampled_gts; + int boxes_num = fg_inds.size() + bg_inds.size(); + framework::DDim bbox_dim({boxes_num, kBoxDim}); + sampled_boxes.mutable_data(bbox_dim, context.GetPlace()); + sampled_labels.mutable_data({boxes_num}, context.GetPlace()); + sampled_gts.mutable_data(bbox_dim, context.GetPlace()); + GatherBoxesLabels(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds, + gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts); + + // Compute targets + Tensor bbox_targets_single; + bbox_targets_single.mutable_data(bbox_dim, context.GetPlace()); + BoxToDelta(boxes_num, sampled_boxes, sampled_gts, bbox_reg_weights, + &bbox_targets_single); + + // Scale rois + Tensor sampled_rois; + sampled_rois.mutable_data(sampled_boxes.dims(), context.GetPlace()); + auto sampled_rois_et = framework::EigenTensor::From(sampled_rois); + auto sampled_boxes_et = framework::EigenTensor::From(sampled_boxes); + sampled_rois_et = sampled_boxes_et * im_scale_data; + + // Expand box targets + Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights; + framework::DDim bbox_expand_dim({boxes_num, kBoxDim * class_nums}); + bbox_targets.mutable_data(bbox_expand_dim, context.GetPlace()); + bbox_inside_weights.mutable_data(bbox_expand_dim, context.GetPlace()); + bbox_outside_weights.mutable_data(bbox_expand_dim, context.GetPlace()); + math::set_constant(context, &bbox_targets, 0.0); + math::set_constant(context, &bbox_inside_weights, 0.0); + math::set_constant(context, &bbox_outside_weights, 0.0); + + auto* bbox_targets_single_data = bbox_targets_single.data(); + auto* sampled_labels_data = sampled_labels.data(); + auto* bbox_targets_data = bbox_targets.data(); + auto* bbox_inside_weights_data = bbox_inside_weights.data(); + auto* bbox_outside_weights_data = bbox_outside_weights.data(); + int width = kBoxDim * class_nums; + for (int64_t i = 0; i < boxes_num; ++i) { + int label = sampled_labels_data[i]; + if (label > 0) { + 
int dst_idx = i * width + kBoxDim * label; + int src_idx = kBoxDim * i; + bbox_targets_data[dst_idx] = bbox_targets_single_data[src_idx]; + bbox_targets_data[dst_idx + 1] = bbox_targets_single_data[src_idx + 1]; + bbox_targets_data[dst_idx + 2] = bbox_targets_single_data[src_idx + 2]; + bbox_targets_data[dst_idx + 3] = bbox_targets_single_data[src_idx + 3]; + bbox_inside_weights_data[dst_idx] = 1; + bbox_inside_weights_data[dst_idx + 1] = 1; + bbox_inside_weights_data[dst_idx + 2] = 1; + bbox_inside_weights_data[dst_idx + 3] = 1; + bbox_outside_weights_data[dst_idx] = 1; + bbox_outside_weights_data[dst_idx + 1] = 1; + bbox_outside_weights_data[dst_idx + 2] = 1; + bbox_outside_weights_data[dst_idx + 3] = 1; + } + } + std::vector res; + res.emplace_back(sampled_rois); + res.emplace_back(sampled_labels); + res.emplace_back(bbox_targets); + res.emplace_back(bbox_inside_weights); + res.emplace_back(bbox_outside_weights); + return res; +} + +template +class GenerateProposalLabelsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* rpn_rois = context.Input("RpnRois"); + auto* gt_classes = context.Input("GtClasses"); + auto* gt_boxes = context.Input("GtBoxes"); + auto* im_scales = context.Input("ImScales"); + + auto* rois = context.Output("Rois"); + auto* labels_int32 = context.Output("LabelsInt32"); + auto* bbox_targets = context.Output("BboxTargets"); + auto* bbox_inside_weights = context.Output("BboxInsideWeights"); + auto* bbox_outside_weights = + context.Output("BboxOutsideWeights"); + + int batch_size_per_im = context.Attr("batch_size_per_im"); + float fg_fraction = context.Attr("fg_fraction"); + float fg_thresh = context.Attr("fg_thresh"); + float bg_thresh_hi = context.Attr("bg_thresh_hi"); + float bg_thresh_lo = context.Attr("bg_thresh_lo"); + std::vector bbox_reg_weights = + context.Attr>("bbox_reg_weights"); + int class_nums = context.Attr("class_nums"); + + 
PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL, + "GenerateProposalLabelsOp rpn_rois needs 1 level of LoD"); + PADDLE_ENFORCE_EQ( + gt_classes->lod().size(), 1UL, + "GenerateProposalLabelsOp gt_classes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, + "GenerateProposalLabelsOp gt_boxes needs 1 level of LoD"); + int64_t n = static_cast(rpn_rois->lod().back().size() - 1); + + rois->mutable_data({n * batch_size_per_im, kBoxDim}, context.GetPlace()); + labels_int32->mutable_data({n * batch_size_per_im}, + context.GetPlace()); + bbox_targets->mutable_data({n * batch_size_per_im, kBoxDim * class_nums}, + context.GetPlace()); + bbox_inside_weights->mutable_data( + {n * batch_size_per_im, kBoxDim * class_nums}, context.GetPlace()); + bbox_outside_weights->mutable_data( + {n * batch_size_per_im, kBoxDim * class_nums}, context.GetPlace()); + + std::random_device rnd; + std::minstd_rand engine; + int seed = + context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + engine.seed(seed); + + framework::LoD lod; + std::vector lod0(1, 0); + + int64_t num_rois = 0; + auto& dev_ctx = context.device_context(); + + auto rpn_rois_lod = rpn_rois->lod().back(); + auto gt_classes_lod = gt_classes->lod().back(); + auto gt_boxes_lod = gt_boxes->lod().back(); + for (size_t i = 0; i < n; ++i) { + Tensor rpn_rois_slice = + rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); + Tensor gt_classes_slice = + gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); + Tensor gt_boxes_slice = + gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); + Tensor im_scales_slice = im_scales->Slice(i, i + 1); + std::vector tensor_output = SampleRoisForOneImage( + dev_ctx, &rpn_rois_slice, >_classes_slice, >_boxes_slice, + &im_scales_slice, batch_size_per_im, fg_fraction, fg_thresh, + bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, engine); + Tensor sampled_rois = tensor_output[0]; + Tensor sampled_labels_int32 = tensor_output[1]; + Tensor sampled_bbox_targets = 
tensor_output[2]; + Tensor sampled_bbox_inside_weights = tensor_output[3]; + Tensor sampled_bbox_outside_weights = tensor_output[4]; + + AppendRois(rois, kBoxDim * num_rois, &sampled_rois); + AppendRois(labels_int32, num_rois, &sampled_labels_int32); + AppendRois(bbox_targets, kBoxDim * num_rois * class_nums, + &sampled_bbox_targets); + AppendRois(bbox_inside_weights, kBoxDim * num_rois * class_nums, + &sampled_bbox_inside_weights); + AppendRois(bbox_outside_weights, kBoxDim * num_rois * class_nums, + &sampled_bbox_outside_weights); + + num_rois += sampled_rois.dims()[0]; + lod0.emplace_back(num_rois); + } + + lod.emplace_back(lod0); + rois->set_lod(lod); + labels_int32->set_lod(lod); + bbox_targets->set_lod(lod); + bbox_inside_weights->set_lod(lod); + bbox_outside_weights->set_lod(lod); + rois->Resize({num_rois, kBoxDim}); + labels_int32->Resize({num_rois}); + bbox_targets->Resize({num_rois, kBoxDim * class_nums}); + bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums}); + bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums}); + } +}; + +class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + // TODO(buxingyuan): Add Document + AddInput("RpnRois", "RpnRois."); + AddInput("GtClasses", "GtClasses."); + AddInput("GtBoxes", "GtBoxes."); + AddInput("ImScales", "ImScales."); + + AddOutput("Rois", "Rois."); + AddOutput("LabelsInt32", "LabelsInt32."); + AddOutput("BboxTargets", "BboxTargets."); + AddOutput("BboxInsideWeights", "BboxInsideWeights."); + AddOutput("BboxOutsideWeights", "BboxOutsideWeights."); + + AddAttr("batch_size_per_im", "batch_size_per_im"); + AddAttr("fg_fraction", "fg_fraction"); + AddAttr("fg_thresh", "fg_thresh"); + AddAttr("bg_thresh_hi", "bg_thresh_hi"); + AddAttr("bg_thresh_lo", "bg_thresh_lo"); + AddAttr>("bbox_reg_weights", "bbox_reg_weights"); + AddAttr("class_nums", "class_nums"); + AddAttr("fix_seed", "fix_seed").SetDefault(false); + AddAttr("seed", 
"seed").SetDefault(0); + + AddComment(R"DOC( +Generate Proposals Labels Operator. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(generate_proposal_labels, ops::GenerateProposalLabelsOp, + ops::GenerateProposalLabelsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(generate_proposal_labels, + ops::GenerateProposalLabelsKernel, + ops::GenerateProposalLabelsKernel); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fcdcafae7273afa6887ee531dfc37ef833b92d68 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -0,0 +1,485 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +struct AppendProposalsFunctor { + LoDTensor *out_; + int64_t offset_; + Tensor *to_add_; + + AppendProposalsFunctor(LoDTensor *out, int64_t offset, Tensor *to_add) + : out_(out), offset_(offset), to_add_(to_add) {} + + template + void apply() const { + auto *out_data = out_->data(); + auto *to_add_data = to_add_->data(); + memcpy(out_data + offset_, to_add_data, to_add_->numel() * sizeof(T)); + } +}; + +class GenerateProposalsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Scores"), "Input(Scores) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("BboxDeltas"), + "Input(BboxDeltas) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Anchors"), + "Input(Anchors) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Variances"), + "Input(Variances) shouldn't be null."); + + auto scores_dims = ctx->GetInputDim("Scores"); + auto bbox_deltas_dims = ctx->GetInputDim("BboxDeltas"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); + auto anchors_dims = ctx->GetInputDim("Anchors"); + auto variances_dims = ctx->GetInputDim("Variances"); + + ctx->SetOutputDim("RpnRois", {-1, 4}); + ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Anchors")->type()), + platform::CPUPlace()); + } +}; + +template +void BoxCoder(const platform::DeviceContext &ctx, 
Tensor *all_anchors, + Tensor *bbox_deltas, Tensor *variances, Tensor *proposals) { + T *proposals_data = proposals->mutable_data(ctx.GetPlace()); + + int64_t row = all_anchors->dims()[0]; + int64_t len = all_anchors->dims()[1]; + + auto *bbox_deltas_data = bbox_deltas->data(); + auto *anchor_data = all_anchors->data(); + const T *variances_data = nullptr; + if (variances) { + variances_data = variances->data(); + } + + for (int64_t i = 0; i < row; ++i) { + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len]; + T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1]; + + T anchor_center_x = (anchor_data[i * len + 2] + anchor_data[i * len]) / 2; + T anchor_center_y = + (anchor_data[i * len + 3] + anchor_data[i * len + 1]) / 2; + + T bbox_center_x = 0, bbox_center_y = 0; + T bbox_width = 0, bbox_height = 0; + + if (variances) { + bbox_center_x = + variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data[i * len + 1] * + bbox_deltas_data[i * len + 1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2]) * + anchor_width; + bbox_height = std::exp(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3]) * + anchor_height; + } else { + bbox_center_x = + bbox_deltas_data[i * len] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; + bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width; + bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height; + } + + proposals_data[i * len] = bbox_center_x - bbox_width / 2; + proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2; + } + // return proposals; +} + +template +void ClipTiledBoxes(const platform::DeviceContext &ctx, const Tensor 
&im_info, + Tensor *boxes) { + T *boxes_data = boxes->mutable_data(ctx.GetPlace()); + const T *im_info_data = im_info.data(); + for (int64_t i = 0; i < boxes->numel(); ++i) { + if (i % 4 == 0) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f); + } else if (i % 4 == 1) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f); + } else if (i % 4 == 2) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), 0.0f); + } else { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[0] - 1), 0.0f); + } + } +} + +template +void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, + float min_size, const Tensor &im_info, Tensor *keep) { + const T *im_info_data = im_info.data(); + T *boxes_data = boxes->mutable_data(ctx.GetPlace()); + min_size *= im_info_data[2]; + keep->Resize({boxes->dims()[0], 1}); + int *keep_data = keep->mutable_data(ctx.GetPlace()); + + int keep_len = 0; + for (int i = 0; i < boxes->dims()[0]; ++i) { + T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; + T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; + T x_ctr = boxes_data[4 * i] + ws / 2; + T y_ctr = boxes_data[4 * i + 1] + hs / 2; + if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] && + y_ctr <= im_info_data[0]) { + keep_data[keep_len++] = i; + } + } + keep->Resize({keep_len}); +} + +bool SortScorePairDescend(const std::pair &pair1, + const std::pair &pair2) { + return pair1.first > pair2.first; +} + +template +void GetMaxScoreIndex(const std::vector &scores, + std::vector> *sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); +} + +template +T BBoxArea(const T *box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // 
If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +T JaccardOverlap(const T *box1, const T *box2, const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores, + const T nms_threshold, const float eta) { + PADDLE_ENFORCE_NOT_NULL(bbox); + int64_t num_boxes = bbox->dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox->dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores->data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, &sorted_indices); + + std::vector selected_indices; + int selected_num = 0; + T adaptive_threshold = nms_threshold; + const T *bbox_data = bbox->data(); + bool flag; + while (sorted_indices.size() != 0) { + int idx = sorted_indices.front().second; + flag = true; + for (size_t k = 0; k < selected_indices.size(); ++k) { + if (flag) { + const int kept_idx = selected_indices[k]; + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, false); + flag = (overlap <= 
adaptive_threshold); + } else { + break; + } + } + if (flag) { + selected_indices.push_back(idx); + selected_num++; + } + sorted_indices.erase(sorted_indices.begin()); + if (flag && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + Tensor keep_nms; + keep_nms.Resize({selected_num}); + int *keep_data = keep_nms.mutable_data(ctx.GetPlace()); + for (int i = 0; i < selected_num; ++i) { + keep_data[i] = selected_indices[i]; + } + + return keep_nms; +} + +template +class GenerateProposalsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_info = context.Input("ImInfo"); + auto *anchors = context.Input("Anchors"); + auto *variances = context.Input("Variances"); + + auto *rpn_rois = context.Output("RpnRois"); + auto *rpn_roi_probs = context.Output("RpnRoiProbs"); + + int pre_nms_top_n = context.Attr("pre_nms_topN"); + int post_nms_top_n = context.Attr("post_nms_topN"); + float nms_thresh = context.Attr("nms_thresh"); + float min_size = context.Attr("min_size"); + float eta = context.Attr("eta"); + + auto &dev_ctx = context.template device_context(); + + auto scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, + context.GetPlace()); + rpn_roi_probs->mutable_data({scores->numel() / 4, 1}, + context.GetPlace()); + + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, + dev_ctx.GetPlace()); + scores_swap.mutable_data({num, h_score, w_score, c_score}, + dev_ctx.GetPlace()); + + math::Transpose trans; + std::vector axis = {0, 2, 
3, 1}; + trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); + trans(dev_ctx, *scores, &scores_swap, axis); + + framework::LoD lod; + std::vector lod0(1, 0); + Tensor *anchor = const_cast(anchors); + anchor->Resize({anchors->numel() / 4, 4}); + Tensor *var = const_cast(variances); + var->Resize({var->numel() / 4, 4}); + + int64_t num_proposals = 0; + for (int64_t i = 0; i < num; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair tensor_pair = + ProposalForOneImage(dev_ctx, im_info_slice, *anchor, *var, + bbox_deltas_slice, scores_slice, pre_nms_top_n, + post_nms_top_n, nms_thresh, min_size, eta); + Tensor proposals = tensor_pair.first; + Tensor scores = tensor_pair.second; + + framework::VisitDataType( + framework::ToDataType(rpn_rois->type()), + AppendProposalsFunctor(rpn_rois, 4 * num_proposals, &proposals)); + framework::VisitDataType( + framework::ToDataType(rpn_roi_probs->type()), + AppendProposalsFunctor(rpn_roi_probs, num_proposals, &scores)); + + num_proposals += proposals.dims()[0]; + lod0.emplace_back(num_proposals); + } + + lod.emplace_back(lod0); + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); + } + + std::pair ProposalForOneImage( + const DeviceContext &ctx, const Tensor &im_info_slice, + const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas_slice, // [M, 4] + const Tensor &scores_slice, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) const { + auto *scores_data = scores_slice.data(); + + // Sort index + Tensor index_t; + index_t.Resize({scores_slice.numel()}); + int *index = index_t.mutable_data(ctx.GetPlace()); + for (int i = 0; i 
< scores_slice.numel(); ++i) { + index[i] = i; + } + std::function compare = + [scores_data](const int64_t &i, const int64_t &j) { + return scores_data[i] > scores_data[j]; + }; + + if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) { + std::sort(index, index + scores_slice.numel(), compare); + } else { + std::nth_element(index, index + pre_nms_top_n, + index + scores_slice.numel(), compare); + index_t.Resize({pre_nms_top_n}); + } + + Tensor scores_sel, bbox_sel, anchor_sel, var_sel; + scores_sel.mutable_data({index_t.numel(), 1}, ctx.GetPlace()); + bbox_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + + CPUGather(ctx, scores_slice, index_t, &scores_sel); + CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + CPUGather(ctx, anchors, index_t, &anchor_sel); + CPUGather(ctx, variances, index_t, &var_sel); + + Tensor proposals; + proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); + + ClipTiledBoxes(ctx, im_info_slice, &proposals); + + Tensor keep; + FilterBoxes(ctx, &proposals, min_size, im_info_slice, &keep); + + Tensor scores_filter; + bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); + scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); + CPUGather(ctx, proposals, keep, &bbox_sel); + CPUGather(ctx, scores_sel, keep, &scores_filter); + if (nms_thresh <= 0) { + return std::make_pair(bbox_sel, scores_sel); + } + + Tensor keep_nms = NMS(ctx, &bbox_sel, &scores_filter, nms_thresh, eta); + + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { + keep_nms.Resize({post_nms_top_n}); + } + + proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); + scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); + CPUGather(ctx, bbox_sel, keep_nms, &proposals); + CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + 
+ return std::make_pair(proposals, scores_sel); + } +}; + +class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Scores", "The scores of anchors should be foreground."); + AddInput("BboxDeltas", "bbox_deltas."); + AddInput("ImInfo", "Information for image reshape."); + AddInput("Anchors", "All anchors."); + AddInput("Variances", " variances"); + + AddOutput("RpnRois", "Anchors."); + AddOutput("RpnRoiProbs", "Anchors."); + AddAttr("pre_nms_topN", "pre_nms_topN"); + AddAttr("post_nms_topN", "post_nms_topN"); + AddAttr("nms_thresh", "nms_thres"); + AddAttr("min_size", "min size"); + AddAttr("eta", "eta"); + AddComment(R"DOC( +Generate Proposals OP + +This operator proposes rois according to each box with their probability to be a foreground object and +the box can be calculated by anchors. Bbox_deltais and scores are the output of RPN. Final proposals +could be used to train detection net. + +Scores is the probability for each box to be an object. In format of (N, A, H, W) where N is batch size, A is number +of anchors, H and W are height and width of the feature map. +BboxDeltas is the differece between predicted box locatoin and anchor location. In format of (N, 4*A, H, W) + +For generating proposals, this operator transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) and + calculate box locations as proposals candidates. Then clip boxes to image and remove predicted boxes with small area. +Finally, apply nms to get final proposals as output. 
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(generate_proposals, ops::GenerateProposalsOp, + ops::GenerateProposalsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + generate_proposals, + ops::GenerateProposalsKernel); diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 9a1643d5b35c067ba9064286bab32019fb34fbe8..177ff7cf187bc9daf69889e99ca57ae18766de90 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -86,7 +86,7 @@ class RpnTargetAssignKernel : public framework::OpKernel { std::minstd_rand engine, std::vector* inds) const { std::uniform_real_distribution uniform(0, 1); - const int64_t size = static_cast(inds->size()); + const int64_t size = static_cast(inds->size() - offset); if (size > num) { for (int64_t i = num; i < size; ++i) { int rng_ind = std::floor(uniform(engine) * i); @@ -126,7 +126,7 @@ class RpnTargetAssignKernel : public framework::OpKernel { neg_threshold, target_label_data, fg_inds, bg_inds); // Reservoir Sampling ReservoirSampling(fg_num, fg_offset, engine, fg_inds); - int bg_num = rpn_batch_size - fg_inds->size(); + int bg_num = rpn_batch_size - (fg_inds->size() - fg_offset); ReservoirSampling(bg_num, bg_offset, engine, bg_inds); } diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index 8e38b3713f28b045e9214db68aec50f0ba6c06f6..1617cc1b95216b118cf2c2122dbe8b6c106554c3 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -151,6 +151,7 @@ bool VariableResponse::CopySelectRowsData( ::google::protobuf::io::CodedInputStream* input, const platform::DeviceContext& ctx, int length) { auto* slr = GetVar()->GetMutable(); + 
slr->mutable_rows()->clear(); slr->mutable_rows()->resize(length / framework::SizeOfType(typeid(int64_t))); // int64 int64_t* rows_data = slr->mutable_rows()->data(); diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 4a29d606fb65ad6227ee20ad1e7e28ada91e6d44..52d2de60f6bf011645e4eb47d1ae1bd8ecdbf0f8 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -48,9 +48,9 @@ namespace operators { * pre=2*3, n=4*5, post=1 * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) */ -inline void get_mid_dims(const framework::DDim& x_dims, - const framework::DDim& y_dims, const int axis, - int* pre, int* n, int* post) { +inline void get_mid_dims(const framework::DDim &x_dims, + const framework::DDim &y_dims, const int axis, + int *pre, int *n, int *post) { *pre = 1; *n = 1; *post = 1; @@ -70,7 +70,7 @@ inline void get_mid_dims(const framework::DDim& x_dims, } inline framework::DDim trim_trailing_singular_dims( - const framework::DDim& dims) { + const framework::DDim &dims) { // Remove trailing dimensions of size 1 for y auto actual_dims_size = dims.size(); for (; actual_dims_size != 0; --actual_dims_size) { @@ -91,6 +91,7 @@ inline framework::DDim trim_trailing_singular_dims( template class RowwiseTransformIterator; + template class MidWiseTransformIterator; @@ -100,9 +101,9 @@ class RowwiseTransformIterator : public std::iterator { public: - RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {} + RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {} - RowwiseTransformIterator& operator++() { + RowwiseTransformIterator &operator++() { ++i_; if (UNLIKELY(i_ == n_)) { i_ = 0; @@ -110,20 +111,20 @@ class RowwiseTransformIterator return *this; } - bool operator==(const RowwiseTransformIterator& - rhs) const { + bool operator==(const RowwiseTransformIterator + &rhs) const { return (ptr_ + i_) == &(*rhs); } 
- bool operator!=(const RowwiseTransformIterator& - rhs) const { + bool operator!=(const RowwiseTransformIterator + &rhs) const { return (ptr_ + i_) != &(*rhs); } - const T& operator*() { return ptr_[i_]; } + const T &operator*() { return ptr_[i_]; } private: - const T* ptr_; + const T *ptr_; int i_; int64_t n_; }; @@ -133,10 +134,10 @@ class MidWiseTransformIterator : public std::iterator { public: - MidWiseTransformIterator(const T* ptr, int n, int post) + MidWiseTransformIterator(const T *ptr, int n, int post) : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} - MidWiseTransformIterator& operator++() { + MidWiseTransformIterator &operator++() { ++j_; if (UNLIKELY(j_ == post_)) { ++i_; @@ -148,20 +149,20 @@ class MidWiseTransformIterator return *this; } - bool operator==(const MidWiseTransformIterator& - rhs) const { + bool operator==(const MidWiseTransformIterator + &rhs) const { return (ptr_ + i_) == &(*rhs); } - bool operator!=(const MidWiseTransformIterator& - rhs) const { + bool operator!=(const MidWiseTransformIterator + &rhs) const { return (ptr_ + i_) != &(*rhs); } - const T& operator*() { return ptr_[i_]; } + const T &operator*() { return ptr_[i_]; } private: - const T* ptr_; + const T *ptr_; int64_t i_; int64_t j_; int64_t n_; @@ -172,18 +173,18 @@ class MidWiseTransformIterator template class RowwiseTransformIterator : public thrust::iterator_adaptor< - RowwiseTransformIterator, const T*> { + RowwiseTransformIterator, const T *> { public: typedef thrust::iterator_adaptor< - RowwiseTransformIterator, const T*> + RowwiseTransformIterator, const T *> super_t; - HOSTDEVICE RowwiseTransformIterator(const T* x, int n) + HOSTDEVICE RowwiseTransformIterator(const T *x, int n) : super_t(x), begin_(x), n_(n) {} friend class thrust::iterator_core_access; private: unsigned int n_; - const T* begin_; + const T *begin_; HOSTDEVICE typename super_t::reference dereference() const { return *(begin_ + (this->base() - begin_) % n_); } @@ -192,19 +193,19 @@ class 
RowwiseTransformIterator template class MidWiseTransformIterator : public thrust::iterator_adaptor< - MidWiseTransformIterator, const T*> { + MidWiseTransformIterator, const T *> { public: typedef thrust::iterator_adaptor< - MidWiseTransformIterator, const T*> + MidWiseTransformIterator, const T *> super_t; - HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post) + HOSTDEVICE MidWiseTransformIterator(const T *x, int n, int post) : super_t(x), begin_(x), n_(n), post_(post) {} friend class thrust::iterator_core_access; private: unsigned int post_; unsigned int n_; - const T* begin_; + const T *begin_; HOSTDEVICE typename super_t::reference dereference() const { return *(begin_ + (((this->base() - begin_) / post_) % n_)); } @@ -215,8 +216,8 @@ template class TransformFunctor { public: - TransformFunctor(const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z, const DeviceContext& ctx, Functor func) + TransformFunctor(const framework::Tensor *x, const framework::Tensor *y, + framework::Tensor *z, const DeviceContext &ctx, Functor func) : x_(x->data()), y_(y->data()), z_(z->mutable_data(ctx.GetPlace())), @@ -242,20 +243,20 @@ class TransformFunctor { } private: - const T* x_; - const T* y_; - OutType* z_; + const T *x_; + const T *y_; + OutType *z_; int64_t nx_; - const DeviceContext& ctx_; + const DeviceContext &ctx_; Functor func_; }; #define EIGEN_FUNCTOR(name, eigen_op) \ struct Eigen##name##Functor { \ template \ - inline void Run(const framework::Tensor* x, const framework::Tensor* y, \ - framework::Tensor* z, \ - const framework::ExecutionContext& ctx) { \ + inline void Run(const framework::Tensor *x, const framework::Tensor *y, \ + framework::Tensor *z, \ + const framework::ExecutionContext &ctx) { \ auto x_e = framework::EigenVector::Flatten(*x); \ auto y_e = framework::EigenVector::Flatten(*y); \ auto z_e = framework::EigenVector::Flatten(*z); \ @@ -264,9 +265,9 @@ class TransformFunctor { eigen_op(x_e, y_e); \ } \ template \ - 
inline void RunBroadCast(const framework::Tensor* x, \ - const framework::Tensor* y, framework::Tensor* z, \ - const framework::ExecutionContext& ctx, int pre, \ + inline void RunBroadCast(const framework::Tensor *x, \ + const framework::Tensor *y, framework::Tensor *z, \ + const framework::ExecutionContext &ctx, int pre, \ int n) { \ auto x_e = framework::EigenVector::Flatten(*x); \ auto y_e = framework::EigenVector::Flatten(*y); \ @@ -279,10 +280,10 @@ class TransformFunctor { eigen_op(x_e, y_bcast); \ } \ template \ - inline void RunBroadCast2(const framework::Tensor* x, \ - const framework::Tensor* y, \ - framework::Tensor* z, \ - const framework::ExecutionContext& ctx, int pre, \ + inline void RunBroadCast2(const framework::Tensor *x, \ + const framework::Tensor *y, \ + framework::Tensor *z, \ + const framework::ExecutionContext &ctx, int pre, \ int n, int post) { \ auto x_e = framework::EigenVector::Flatten(*x); \ auto y_e = framework::EigenVector::Flatten(*y); \ @@ -297,23 +298,27 @@ class TransformFunctor { } #define EIGEN_ADD(x, y) ((x) + (y)) + EIGEN_FUNCTOR(Add, EIGEN_ADD); #define EIGEN_SUB(x, y) ((x) - (y)) + EIGEN_FUNCTOR(Sub, EIGEN_SUB); #define EIGEN_MUL(x, y) ((x) * (y)) + EIGEN_FUNCTOR(Mul, EIGEN_MUL); #define EIGEN_DIV(x, y) ((x) / (y)) + EIGEN_FUNCTOR(Div, EIGEN_DIV); template struct ElemwiseGradNoBroadcast { - const T* x_; - const T* y_; - const T* out_; - const T* dout_; + const T *x_; + const T *y_; + const T *out_; + const T *dout_; HOSTDEVICE void operator()(size_t i) { if (dx_ != nullptr) { @@ -326,14 +331,14 @@ struct ElemwiseGradNoBroadcast { DX_OP dx_op_; DY_OP dy_op_; - T* dx_; - T* dy_; + T *dx_; + T *dy_; }; template -static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, - const T* dout, int h, int w, DX_OP dx_op, - DY_OP dy_op, T* dx, T* dy) { +static void ElemwiseGradBroadcast1CPU(const T *x, const T *y, const T *out, + const T *dout, int h, int w, DX_OP dx_op, + DY_OP dy_op, T *dx, T *dy) { for (int i = 0; i 
< h; ++i) { for (int j = 0; j < w; ++j) { int x_offset = i * w + j; @@ -355,8 +360,8 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, #ifdef __NVCC__ template static __global__ void ElemwiseGradBroadcast1CUDAKernel( - const T* x, const T* y, const T* out, const T* dout, int h, int w, - DX_OP dx_op, DY_OP dy_op, T* dx, T* dy) { + const T *x, const T *y, const T *out, const T *dout, int h, int w, + DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { int j = blockIdx.x; int i = threadIdx.x; int tid = threadIdx.x; @@ -383,10 +388,10 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel( } template -static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T* x, - const T* y, const T* out, const T* dout, +static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x, + const T *y, const T *out, const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op, - T* dx, T* dy) { + T *dx, T *dy) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); int gird_size = w; ElemwiseGradBroadcast1CUDAKernel<<>>( @@ -396,9 +401,9 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T* x, #endif template -static void ElemwiseGradBroadcast2CPU(const T* x, const T* y, const T* out, - const T* dout, int pre, int n, int post, - DX_OP dx_op, DY_OP dy_op, T* dx, T* dy) { +static void ElemwiseGradBroadcast2CPU(const T *x, const T *y, const T *out, + const T *dout, int pre, int n, int post, + DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { for (int i = 0; i < pre; ++i) { for (int j = 0; j < n; ++j) { for (int k = 0; k < post; ++k) { @@ -423,8 +428,8 @@ static void ElemwiseGradBroadcast2CPU(const T* x, const T* y, const T* out, #ifdef __NVCC__ template static __global__ void ElemwiseGradBroadcast2CUDAKernel( - const T* x, const T* y, const T* out, const T* dout, int pre, int n, - int post, DX_OP dx_op, DY_OP dy_op, T* dx, T* dy) { + const T *x, const T *y, const T *out, const T *dout, int pre, int n, + int post, DX_OP dx_op, DY_OP dy_op, T *dx, T 
*dy) { int tid = threadIdx.x; int j = blockIdx.x; @@ -460,10 +465,10 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( } template -static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x, - const T* y, const T* out, const T* dout, +static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T *x, + const T *y, const T *out, const T *dout, int pre, int n, int post, DX_OP dx_op, - DY_OP dy_op, T* dx, T* dy) { + DY_OP dy_op, T *dx, T *dy) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); int gird_size = n; ElemwiseGradBroadcast2CUDAKernel<<>>( @@ -474,11 +479,11 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x, template void ElemwiseGradComputeNoBroadcast( - const framework::ExecutionContext& ctx, const framework::DDim& x_dim, - const framework::DDim& y_dim, const framework::Tensor& x, - const framework::Tensor& y, const framework::Tensor& out, - const framework::Tensor& dout, int axis, framework::Tensor* dx, - framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) { + const framework::ExecutionContext &ctx, const framework::DDim &x_dim, + const framework::DDim &y_dim, const framework::Tensor &x, + const framework::Tensor &y, const framework::Tensor &out, + const framework::Tensor &dout, int axis, framework::Tensor *dx, + framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { size_t N = static_cast(framework::product(x_dim)); #if !defined(_WIN32) platform::ForRange for_range( @@ -495,11 +500,11 @@ void ElemwiseGradComputeNoBroadcast( template void ElemwiseGradComputeWithBroadcast( - const framework::ExecutionContext& ctx, const framework::DDim& x_dim, - const framework::DDim& y_dim_untrimed, const framework::Tensor& x, - const framework::Tensor& y, const framework::Tensor& out, - const framework::Tensor& dout, int axis, framework::Tensor* dx, - framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) { + const framework::ExecutionContext &ctx, const framework::DDim &x_dim, + const framework::DDim &y_dim_untrimed, 
const framework::Tensor &x, + const framework::Tensor &y, const framework::Tensor &out, + const framework::Tensor &dout, int axis, framework::Tensor *dx, + framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis); auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); axis = (y_dim.size() == 0) ? x_dim.size() : axis; @@ -543,14 +548,14 @@ void ElemwiseGradComputeWithBroadcast( } template -void ElemwiseGradCompute(const framework::ExecutionContext& ctx, - const framework::Tensor& x, const framework::Tensor& y, - const framework::Tensor& out, - const framework::Tensor& dout, int axis, - framework::Tensor* dx, framework::Tensor* dy, +void ElemwiseGradCompute(const framework::ExecutionContext &ctx, + const framework::Tensor &x, const framework::Tensor &y, + const framework::Tensor &out, + const framework::Tensor &dout, int axis, + framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - const framework::DDim& x_dim = x.dims(); - const framework::DDim& y_dim = y.dims(); + const framework::DDim &x_dim = x.dims(); + const framework::DDim &y_dim = y.dims(); if (x.dims() == y.dims()) { ElemwiseGradComputeNoBroadcast( ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); @@ -565,27 +570,27 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx, // In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse // elementwise code. 
template -void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx, - const framework::Tensor& x, - const framework::Tensor& y, - const framework::Tensor& out, - const framework::Tensor& dout, int axis, - framework::Tensor* dx, framework::Tensor* dy, +void ElemwiseExplicitGradCompute(const framework::ExecutionContext &ctx, + const framework::Tensor &x, + const framework::Tensor &y, + const framework::Tensor &out, + const framework::Tensor &dout, int axis, + framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { if (dy == nullptr) { - const framework::DDim& dx_dims = dout.dims(); + const framework::DDim &dx_dims = dout.dims(); auto dy_dims = dx_dims; ElemwiseGradComputeNoBroadcast( ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } else { if (dout.dims() == dy->dims()) { - const framework::DDim& dx_dims = dout.dims(); - const framework::DDim& dy_dims = dy->dims(); + const framework::DDim &dx_dims = dout.dims(); + const framework::DDim &dy_dims = dy->dims(); ElemwiseGradComputeNoBroadcast( ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } else { // Y is a scalar auto dx_dims = dout.dims(); - const framework::DDim& dy_dims = dy->dims(); + const framework::DDim &dy_dims = dy->dims(); ElemwiseGradComputeWithBroadcast( ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } @@ -595,13 +600,13 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx, // Deprecated template -void ElementwiseGradCompute(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, int axis, - framework::Tensor* dx, framework::Tensor* dy) { - auto& place = *ctx.template device_context().eigen_device(); +void ElementwiseGradCompute(const framework::ExecutionContext &ctx, + const framework::Tensor *x, + const framework::Tensor *y, + const framework::Tensor *out, + const 
framework::Tensor *dout, int axis, + framework::Tensor *dx, framework::Tensor *dy) { + auto &place = *ctx.template device_context().eigen_device(); auto x_dims = x->dims(); auto y_dims = y->dims(); @@ -639,17 +644,13 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx, template -void ElementwiseComputeEx(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, int axis, Functor func, - framework::Tensor* z) { -#if !defined(_WIN32) + +void ElementwiseComputeEx(const framework::ExecutionContext &ctx, + const framework::Tensor *x, + const framework::Tensor *y, int axis, Functor func, + framework::Tensor *z) { TransformFunctor functor( x, y, z, ctx.template device_context(), func); -#else - TransformFunctor functor( - x, y, z, ctx.device_context(), func); -#endif // !_WIN32 auto x_dims = x->dims(); auto y_dims_untrimed = y->dims(); PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(), @@ -677,5 +678,823 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx, } } +// FusedElemwiseAndAct +// --- forward +template +struct FusedElemwiseAndActNoBroadcast { + HOSTDEVICE void operator()(size_t i) { + T y_val = y_[i]; + T x_val = x_[i]; + if (KeepIntermediateOut) { + T intermeidiate_out = compound_functor_.GetIntermediateOut(x_val, y_val); + intermediate_out_[i] = intermeidiate_out; + out_[i] = + compound_functor_.GetOutUseIntermediateOut(x_val, intermeidiate_out); + } else { + out_[i] = compound_functor_.GetOut(x_val, y_val); + } + } + + const T *x_; + const T *y_; + CompoundFunctor compound_functor_; + T *out_; + T *intermediate_out_; +}; + +// FusedElemwiseAndActBroadcast1: +// In this case, X and Y can be reshaped to a matrix. 
+// For example shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) and axis = -1 or 2, +// X can be reshaped to (6, 20) and Y can be reshaped to (1, 20) +template +static void FusedElemwiseAndActBroadcast1CPU(const T *x, const T *y, + CompoundFunctor compound_functor, + int h, int w, T *out, + T *intermediate_out) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int offset = i * w + j; + + T y_val = BcastY ? y[j] : y[offset]; + T x_val = BcastY ? x[offset] : x[j]; + int64_t intermediate_out_offset; + if (KeepIntermediateOut) { + T intermeidiate_out = compound_functor.GetIntermediateOut(x_val, y_val); + + if (SameShapeOfIntermediateOutAndOut) { + // for the case of f1(f2(x, y)) + intermediate_out_offset = offset; + } else if (BcastY) { + intermediate_out_offset = j; + } else { + intermediate_out_offset = offset; + } + + intermediate_out[intermediate_out_offset] = intermeidiate_out; + out[offset] = + compound_functor.GetOutUseIntermediateOut(x_val, intermeidiate_out); + } else { + out[offset] = compound_functor.GetOut(x_val, y_val); + } + } + } +} + +// FusedElemwiseAndActBroadcast2 +// In this case, X and Y can be reshaped to a matrix. +// For example shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4) and axis = 1, +// X can be reshaped to (2, 12, 5) and Y can be reshaped to (1, 12, 1) +// pre = 2, n = 12, post = 5 +template +static void FusedElemwiseAndActBroadcast2CPU(const T *x, const T *y, int pre, + int n, int post, + CompoundFunctor compound_functor, + T *out, T *intermediate_out) { + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int offset = i * n * post + j * post + k; + + T y_val = BcastY ? y[j] : y[offset]; + T x_val = BcastY ? 
x[offset] : x[j]; + int64_t intermediate_out_offset; + + if (KeepIntermediateOut) { + T intermeidiate_out = + compound_functor.GetIntermediateOut(x_val, y_val); + + if (SameShapeOfIntermediateOutAndOut) { + // for the case of f1(f2(x, y)) + intermediate_out_offset = offset; + } else if (BcastY) { + intermediate_out_offset = j; + } else { + intermediate_out_offset = offset; + } + + intermediate_out[intermediate_out_offset] = intermeidiate_out; + out[offset] = compound_functor.GetOutUseIntermediateOut( + x_val, intermeidiate_out); + } else { + out[offset] = compound_functor.GetOut(x_val, y_val); + } + } + } + } +} + +#ifdef __NVCC__ +template +static __global__ void FusedElemwiseAndActBroadcast1CUDAKernel( + const T *x, const T *y, int h, int w, CompoundFunctor compound_functor, + T *out, T *intermediate_out) { + int j = blockIdx.x; + int i = threadIdx.x; + + while (i < h) { + int offset = i * w + j; + + T y_val = BcastY ? y[j] : y[offset]; + T x_val = BcastY ? x[offset] : x[j]; + int64_t intermediate_out_offset; + + if (KeepIntermediateOut) { + T intermeidiate_out = compound_functor.GetIntermediateOut(x_val, y_val); + + if (SameShapeOfIntermediateOutAndOut) { + // for the case of f1(f2(x, y)) + intermediate_out_offset = offset; + } else if (BcastY) { + intermediate_out_offset = j; + } else { + intermediate_out_offset = offset; + } + + intermediate_out[intermediate_out_offset] = intermeidiate_out; + out[offset] = + compound_functor.GetOutUseIntermediateOut(x_val, intermeidiate_out); + } else { + out[offset] = compound_functor.GetOut(x_val, y_val); + } + + i += ELEMWISE_MAX_BLOCK_DIM; + } +} + +template +static void FusedElemwiseAndActBroadcast1CUDA(cudaStream_t stream, const T *x, + const T *y, + CompoundFunctor compound_functor, + int h, int w, T *out, + T *intermediate_out) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int gird_size = w; + FusedElemwiseAndActBroadcast1CUDAKernel< + T, CompoundFunctor, BcastY, KeepIntermediateOut, + 
SameShapeOfIntermediateOutAndOut><<>>( + x, y, h, w, compound_functor, out, intermediate_out); +} + +template +static __global__ void FusedElemwiseAndActBroadcast2CUDAKernel( + const T *x, const T *y, CompoundFunctor compound_functor, int pre, int n, + int post, T *out, T *intermediate_out) { + int tid = threadIdx.x; + int j = blockIdx.x; + + while (true) { + int i = tid / post; + int k = tid % post; + if (i >= pre) break; + + int offset = i * n * post + j * post + k; + + T y_val = BcastY ? y[j] : y[offset]; + T x_val = BcastY ? x[offset] : x[j]; + int64_t intermediate_out_offset; + + if (KeepIntermediateOut) { + T intermeidiate_out = compound_functor.GetIntermediateOut(x_val, y_val); + + if (SameShapeOfIntermediateOutAndOut) { + // for the case of f1(f2(x, y)) + intermediate_out_offset = offset; + } else if (BcastY) { + intermediate_out_offset = j; + } else { + intermediate_out_offset = offset; + } + + intermediate_out[intermediate_out_offset] = intermeidiate_out; + out[offset] = + compound_functor.GetOutUseIntermediateOut(x_val, intermeidiate_out); + } else { + out[offset] = compound_functor.GetOut(x_val, y_val); + } + + tid += ELEMWISE_MAX_BLOCK_DIM; + } +} + +template +static void FusedElemwiseAndActBroadcast2CUDA(cudaStream_t stream, const T *x, + const T *y, int pre, int n, + int post, + CompoundFunctor compound_functor, + T *out, T *intermediate_out) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); + int gird_size = n; + + FusedElemwiseAndActBroadcast2CUDAKernel< + T, CompoundFunctor, BcastY, KeepIntermediateOut, + SameShapeOfIntermediateOutAndOut><<>>( + x, y, compound_functor, pre, n, post, out, intermediate_out); +} + +#endif + +template +void FusedElemwiseAndActComputeNoBroadcast( + const framework::ExecutionContext &ctx, const framework::DDim &x_dim, + const framework::Tensor &x, const framework::Tensor &y, + CompoundFunctor compound_functor, framework::Tensor *out, + framework::Tensor *intermediate_out) { + size_t N = 
static_cast(framework::product(x_dim)); + + platform::ForRange for_range( + ctx.template device_context(), N); + + for_range( + FusedElemwiseAndActNoBroadcast{ + x.data(), y.data(), compound_functor, + out->mutable_data(ctx.GetPlace()), + intermediate_out == nullptr + ? nullptr + : intermediate_out->mutable_data(ctx.GetPlace())}); +} + +template +void FusedElemwiseAndActComputeWithBroadcast( + const framework::ExecutionContext &ctx, const framework::DDim &x_dim, + const framework::DDim &y_dim_untrimed, const framework::Tensor &x, + const framework::Tensor &y, CompoundFunctor compound_functor, int axis, + framework::Tensor *out, framework::Tensor *intermediate_out) { + axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis); + auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); + axis = (y_dim.size() == 0) ? x_dim.size() : axis; + + int pre, n, post; + get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); + + if (post == 1) { + int h = pre; + int w = n; + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + FusedElemwiseAndActBroadcast1CUDA( + ctx.template device_context().stream(), x.data(), + y.data(), compound_functor, h, w, + out->mutable_data(ctx.GetPlace()), + intermediate_out == nullptr + ? nullptr + : intermediate_out->mutable_data(ctx.GetPlace())); +#endif + } else { + FusedElemwiseAndActBroadcast1CPU( + x.data(), y.data(), compound_functor, h, w, + out->mutable_data(ctx.GetPlace()), + intermediate_out == nullptr + ? nullptr + : intermediate_out->mutable_data(ctx.GetPlace())); + } + } else { + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + FusedElemwiseAndActBroadcast2CUDA( + ctx.template device_context().stream(), x.data(), + y.data(), pre, n, post, compound_functor, + out->mutable_data(ctx.GetPlace()), + intermediate_out == nullptr + ? 
nullptr + : intermediate_out->mutable_data(ctx.GetPlace())); +#endif + } else { + FusedElemwiseAndActBroadcast2CPU( + x.data(), y.data(), pre, n, post, compound_functor, + out->mutable_data(ctx.GetPlace()), + intermediate_out == nullptr + ? nullptr + : intermediate_out->mutable_data(ctx.GetPlace())); + } + } +} + +// --- backward +template +struct FusedElemwiseAndActGradNoBroadcast { + HOSTDEVICE void operator()(size_t i) { + if (dx_ != nullptr) { + dx_[i] = UseIntermediateOut ? dx_op_(x_[i], y_[i], intermediate_out_[i], + out_[i], dout_[i]) + : dx_op_(x_[i], y_[i], out_[i], dout_[i]); + } + if (dy_ != nullptr) { + dy_[i] = UseIntermediateOut ? dy_op_(x_[i], y_[i], intermediate_out_[i], + out_[i], dout_[i]) + : dy_op_(x_[i], y_[i], out_[i], dout_[i]); + } + } + + const T *x_; + const T *y_; + const T *intermediate_out_; + const T *out_; + const T *dout_; + DX_OP dx_op_; + DY_OP dy_op_; + T *dx_; + T *dy_; +}; + +template +void FusedElemwiseAndActGradComputeNoBroadcast( + const framework::ExecutionContext &ctx, const framework::DDim &x_dim, + const framework::DDim &y_dim, const framework::Tensor *x, + const framework::Tensor *y, const framework::Tensor *intermediate_out, + const framework::Tensor *out, const framework::Tensor *dout, int axis, + framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { + size_t N = static_cast(framework::product(x_dim)); + platform::ForRange for_range( + ctx.template device_context(), N); + for_range( + FusedElemwiseAndActGradNoBroadcast{ + x->data(), y->data(), + intermediate_out ? intermediate_out->data() : nullptr, + out->data(), dout->data(), dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace())}); +} + +template +static void FusedElemwiseAndActGradBroadcast1CPU(const T *x, const T *y, + const T *intermediate_out, + const T *out, const T *dout, + int h, int w, DX_OP dx_op, + DY_OP dy_op, T *dx, T *dy) { + int64_t tmp_out_idx, x_idx, y_idx; + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int offset = i * w + j; + + tmp_out_idx = BcastY ? j : offset; + y_idx = BcastY ? j : offset; + x_idx = BcastY ? offset : j; + + if (SameShapeOfIntermediateOutAndOut) { + tmp_out_idx = offset; + } + + if (dx != nullptr) { + T tmp = UseIntermediateOut + ? dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + + if (BcastY) { + dx[x_idx] = tmp; + } else { + if (i == 0) { + dx[x_idx] = tmp; + } else { + dx[x_idx] += tmp; + } + } + } + if (dy != nullptr) { + T tmp = UseIntermediateOut + ? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + if (BcastY) { + if (i == 0) { + dy[y_idx] = tmp; + } else { + dy[y_idx] += tmp; + } + } else { + dy[y_idx] = tmp; + } + } + } + } +} + +template +static void FusedElemwiseAndActGradBroadcast2CPU(const T *x, const T *y, + const T *intermediate_out, + const T *out, const T *dout, + int pre, int n, int post, + DX_OP dx_op, DY_OP dy_op, + T *dx, T *dy) { + int64_t tmp_out_idx, x_idx, y_idx; + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int offset = i * n * post + j * post + k; + + tmp_out_idx = BcastY ? j : offset; + y_idx = BcastY ? j : offset; + x_idx = BcastY ? offset : j; + + if (SameShapeOfIntermediateOutAndOut) { + tmp_out_idx = offset; + } + + if (dx != nullptr) { + T tmp = UseIntermediateOut + ? 
dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + + if (BcastY) { + dx[x_idx] = tmp; + } else { + if (i == 0 && k == 0) { + dx[x_idx] = tmp; + } else { + dx[x_idx] += tmp; + } + } + } + if (dy != nullptr) { + T tmp = UseIntermediateOut + ? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + if (BcastY) { + if (i == 0 && k == 0) { + dy[y_idx] = tmp; + } else { + dy[y_idx] += tmp; + } + } else { + dy[y_idx] = tmp; + } + } + } + } + } +} + +#ifdef __NVCC__ +template +static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( + const T *x, const T *y, const T *intermediate_out, const T *out, + const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { + int j = blockIdx.x; + int i = threadIdx.x; + int tid = threadIdx.x; + T val(0); + int64_t tmp_out_idx, x_idx, y_idx; + + do { + int offset = i * w + j; + + tmp_out_idx = BcastY ? j : offset; + y_idx = BcastY ? j : offset; + x_idx = BcastY ? offset : j; + + if (SameShapeOfIntermediateOutAndOut) { + tmp_out_idx = offset; + } + + if (dx != nullptr) { + T tmp = UseIntermediateOut + ? dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + + if (BcastY) { + dx[x_idx] = tmp; + } else { + val += tmp; + } + } + if (dy != nullptr) { + T tmp = UseIntermediateOut + ? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + if (BcastY) { + val += tmp; + } else { + dy[y_idx] = tmp; + } + } + + i += ELEMWISE_MAX_BLOCK_DIM; + } while (i < h); + + if (BcastY) { + if (dy) { + h = h > ELEMWISE_MAX_BLOCK_DIM ? 
ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dy[j] = val; + } + } + } else { + if (dx) { + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dx[j] = val; + } + } + } +} + +template +static void FusedElemwiseAndActGradBroadcast1CUDA(cudaStream_t stream, + const T *x, const T *y, + const T *intermediate_out, + const T *out, const T *dout, + int h, int w, DX_OP dx_op, + DY_OP dy_op, T *dx, T *dy) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int gird_size = w; + FusedElemwiseAndActGradBroadcast1CUDAKernel< + T, DX_OP, DY_OP, UseIntermediateOut, BcastY, + SameShapeOfIntermediateOutAndOut><<>>( + x, y, intermediate_out, out, dout, h, w, dx_op, dy_op, dx, dy); +} + +template +static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( + const T *x, const T *y, const T *intermediate_out, const T *out, + const T *dout, int pre, int n, int post, DX_OP dx_op, DY_OP dy_op, T *dx, + T *dy) { + int tid = threadIdx.x; + int j = blockIdx.x; + + T val(0); + int ttid = tid; + int64_t tmp_out_idx, x_idx, y_idx; + while (true) { + int i = ttid / post; + int k = ttid % post; + if (i >= pre) break; + + int offset = i * n * post + j * post + k; + + tmp_out_idx = BcastY ? j : offset; + y_idx = BcastY ? j : offset; + x_idx = BcastY ? offset : j; + + if (SameShapeOfIntermediateOutAndOut) { + tmp_out_idx = offset; + } + + if (dx != nullptr) { + T tmp = UseIntermediateOut + ? dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + + if (BcastY) { + dx[x_idx] = tmp; + } else { + val += tmp; + } + } + if (dy != nullptr) { + T tmp = UseIntermediateOut + ? 
dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); + if (BcastY) { + val += tmp; + } else { + dy[y_idx] = tmp; + } + } + + ttid += ELEMWISE_MAX_BLOCK_DIM; + } + + if (BcastY) { + if (dy) { + int h = pre * post; + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dy[j] = val; + } + } + } else { + if (dx) { + int h = pre * post; + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dx[j] = val; + } + } + } +} + +template +static void FusedElemwiseAndActGradBroadcast2CUDA( + cudaStream_t stream, const T *x, const T *y, const T *intermediate_out, + const T *out, const T *dout, int pre, int n, int post, DX_OP dx_op, + DY_OP dy_op, T *dx, T *dy) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); + int gird_size = n; + FusedElemwiseAndActGradBroadcast2CUDAKernel< + T, DX_OP, DY_OP, UseIntermediateOut, BcastY, + SameShapeOfIntermediateOutAndOut><<>>( + x, y, intermediate_out, out, dout, pre, n, post, dx_op, dy_op, dx, dy); +} +#endif + +template +void FusedElemwiseAndActGradComputeWithBroadcast( + const framework::ExecutionContext &ctx, const framework::DDim &x_dim, + const framework::DDim &y_dim_untrimed, const framework::Tensor *x, + const framework::Tensor *y, const framework::Tensor *intermediate_out, + const framework::Tensor *out, const framework::Tensor *dout, int axis, + framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { + axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis); + auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); + axis = (y_dim.size() == 0) ? 
x_dim.size() : axis; + + int pre, n, post; + get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); + if (post == 1) { + int h = pre; + int w = n; + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + FusedElemwiseAndActGradBroadcast1CUDA( + ctx.template device_context().stream(), x->data(), + y->data(), + intermediate_out == nullptr ? nullptr : intermediate_out->data(), + out->data(), dout->data(), h, w, dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); +#endif + } else { + FusedElemwiseAndActGradBroadcast1CPU( + x->data(), y->data(), + intermediate_out == nullptr ? nullptr : intermediate_out->data(), + out->data(), dout->data(), h, w, dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } + } else { + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + FusedElemwiseAndActGradBroadcast2CUDA( + ctx.template device_context().stream(), x->data(), + y->data(), + intermediate_out == nullptr ? nullptr : intermediate_out->data(), + out->data(), dout->data(), pre, n, post, dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); +#endif + } else { + FusedElemwiseAndActGradBroadcast2CPU( + x->data(), y->data(), + intermediate_out == nullptr ? nullptr : intermediate_out->data(), + out->data(), dout->data(), pre, n, post, dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace())); + } + } +} + +template +void FusedElemwiseAndActGradComputeEx( + const framework::ExecutionContext &ctx, const framework::Tensor *x, + const framework::Tensor *y, const framework::Tensor *out, + const framework::Tensor *intermediate_out, const framework::Tensor *dout, + int axis, framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, + DY_OP dy_op) { + const framework::DDim &x_dim = x->dims(); + const framework::DDim &y_dim = y->dims(); + if (UseIntermediateOut) { + PADDLE_ENFORCE(intermediate_out, "intermediate_out should not be nullptr"); + } + if (x_dim == y_dim) { + FusedElemwiseAndActGradComputeNoBroadcast( + ctx, x_dim, y_dim, x, y, intermediate_out, out, dout, axis, dx, dy, + dx_op, dy_op); + } else { // Y is a scalar + bool bcast_y = x_dim.size() >= y_dim.size(); + if (x_dim.size() == y_dim.size()) { + for (int i = 0; i < x_dim.size(); ++i) { + if (x_dim[i] < y_dim[i]) { + bcast_y = false; + break; + } + } + } + + // z = f1(x, f2(y)) + // z = f1(f2(x, y)) + if (bcast_y) { // Y should be broadcast. 
+ FusedElemwiseAndActGradComputeWithBroadcast< + DeviceContext, T, DX_OP, DY_OP, UseIntermediateOut, true /*BcastY*/, + SameShapeOfIntermediateOutAndOut>(ctx, x_dim, y_dim, x, y, + intermediate_out, out, dout, axis, + dx, dy, dx_op, dy_op); + } else { + FusedElemwiseAndActGradComputeWithBroadcast< + DeviceContext, T, DX_OP, DY_OP, UseIntermediateOut, false /*BcastY*/, + SameShapeOfIntermediateOutAndOut>(ctx, y_dim, x_dim, x, y, + intermediate_out, out, dout, axis, + dx, dy, dx_op, dy_op); + } + } +} + +template +void FusedElemwiseAndActComputeEx(const framework::ExecutionContext &ctx, + const framework::Tensor &x, + const framework::Tensor &y, int axis, + CompoundFunctor compound_functor, + framework::Tensor *out, + framework::Tensor *intermediate_out) { + if (KeepIntermediateOut) { + PADDLE_ENFORCE(intermediate_out, + "The keep_intermediate_value is opened, " + "intermediate_out should not be nullptr."); + } + + const framework::DDim &x_dim = x.dims(); + const framework::DDim &y_dim = y.dims(); + if (x.dims() == y.dims()) { + FusedElemwiseAndActComputeNoBroadcast( + ctx, x_dim, x, y, compound_functor, out, intermediate_out); + } else { + // Whether the shape of Y is a continuous subsequence of X, + // For more information please refer to the op's introduction. + bool bcast_y = x.dims().size() >= y.dims().size(); + if (x.dims().size() == y.dims().size()) { + for (int i = 0; i < x.dims().size(); ++i) { + if (x.dims()[i] < y.dims()[i]) { + bcast_y = false; + break; + } + } + } + + // z = f1(x, f2(y)) + // z = f1(f2(x, y)) + if (bcast_y) { // Y should be broadcast. + // In this case, + // for 'f2(y)', the shape of intermediate_out should be equal to the shape + // of Y. + // for 'f2(x, y)', the shape of intermediate_out should be equal to the + // shape of Out. + // the shape of Out should be equal to the shape of X. 
+ FusedElemwiseAndActComputeWithBroadcast< + DeviceContext, T, CompoundFunctor, true /*BcastY*/, + KeepIntermediateOut, SameShapeOfIntermediateOutAndOut>( + ctx, x_dim /*OutShape*/, y_dim, x, y, compound_functor, axis, out, + intermediate_out); + } else { + // In this case, + // for 'f2(y)', the shape of intermediate_out should be equal to the shape + // of Out. + // for 'f2(x, y)', the shape of intermediate_out should be equal to the + // shape of Out. + // the shape of Out should be equal to the shape of Y. + FusedElemwiseAndActComputeWithBroadcast< + DeviceContext, T, CompoundFunctor, false /*BcastY*/, + KeepIntermediateOut, SameShapeOfIntermediateOutAndOut>( + ctx, y_dim /*OutShape*/, x_dim, x, y, compound_functor, axis, out, + intermediate_out); + } + } +} } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index 43f949111104ee56efc8625bdd609e412ef7f37d..2008e7027524ffd1f80a6eede015801b8a0b0254 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -18,15 +18,32 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +template +struct DequantizeFunctor { + void operator()(const platform::CPUDeviceContext& dev_ctx, + const framework::Tensor* in, const framework::Tensor* scale, + T max_range, framework::Tensor* out) { + auto in_e = framework::EigenVector::Flatten(*in); + const T* scale_factor = scale->data(); + auto out_e = framework::EigenVector::Flatten(*out); + + auto& dev = *dev_ctx.eigen_device(); + out_e.device(dev) = (scale_factor[0] / max_range) * in_e; + } +}; + +template struct DequantizeFunctor; +template struct DequantizeFunctor; + class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel { public: - FakeDequantizeMaxAbsOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) + FakeDequantizeMaxAbsOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of FakeDequantizeMaxAbsOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -42,21 +59,17 @@ class FakeDequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor) The input with float-32/64 type is the " "low precision tensor."); + AddInput("Scale", "(float) The scale in quantization stage."); AddOutput("Out", "(Tensor) The output is the dequantized high " "precision tensor."); - AddAttr("num_bits", - "(int) `num_bits` is the quantization level bits, " - "such as 2, 5, 8."); - AddAttr("scale", - "(float) The maximum absolute value of low precision tensor." 
- "It is usually calculated by the fake_quantize_max_abs_op."); + AddAttr("max_range", "(float) The max range in quantization stage."); AddComment(R"DOC( FakeDequantizeMaxAbsOp operator. This calculation is an opposite operation of FakeQuantizeMaxAbsOp: -$$Out = \frac{scale*X}{2^{num_bits} - 1}$$ +$$Out = \frac{scale*X}{ max_range }$$ )DOC"); } diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index 1bd38d1bd2c3a6f90d2fbad415d61efaead3afe9..225bcc45bc65bc9268d1e866a4358731eaf0c3ef 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -14,6 +14,42 @@ limitations under the License. */ #include "paddle/fluid/operators/fake_dequantize_op.h" +namespace paddle { +namespace operators { + +template +__global__ void KeDequantize(const T* in, const T* scale, T max_range, int num, + T* out) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < num) { + out[idx] = in[idx] * scale[0] / max_range; + } +} + +template +struct DequantizeFunctor { + void operator()(const platform::CUDADeviceContext& dev_ctx, + const framework::Tensor* in, const framework::Tensor* scale, + T max_range, framework::Tensor* out) { + const T* in_data = in->data(); + const T* scale_factor = scale->data(); + T* out_data = out->mutable_data(dev_ctx.GetPlace()); + + int num = in->numel(); + int block = 512; + int grid = (num + block - 1) / block; + + KeDequantize<<>>( + in_data, scale_factor, max_range, num, out_data); + } +}; + +template struct DequantizeFunctor; +template struct DequantizeFunctor; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs, diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index 0901e68b3761159c3cc9c6684567bee38ec3f16d..d9923a10daa01ca06ebabb27cf9285b0628634bc 100644 
--- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -19,22 +19,29 @@ limitations under the License. */ namespace paddle { namespace operators { + +template +struct DequantizeFunctor { + void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in, + const framework::Tensor* scale, T max_range, + framework::Tensor* out); +}; + template class FakeDequantizeMaxAbsKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { auto* in = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); auto* out = ctx.Output("Out"); - out->mutable_data(in->place()); - int num_bits = ctx.Attr("num_bits"); - T scale = static_cast(ctx.Attr("scale")); - int range = std::pow(2, num_bits) - 1; + float max_range = ctx.Attr("max_range"); + + auto& dev_ctx = ctx.template device_context(); + out->mutable_data(dev_ctx.GetPlace()); - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& dev = *ctx.template device_context().eigen_device(); - eigen_out.device(dev) = (scale / range) * eigen_in; + DequantizeFunctor()(dev_ctx, in, scale, + static_cast(max_range), out); } }; diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc index d9cd956dfdff3d009d38ee5088f5396080580483..9d7ac7ab6194593747548fac3cefc8d4ed3058d8 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -52,6 +52,8 @@ class FetchBarrierOp : public framework::OperatorBase { class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { + AddOutput("Out", "(Any) Dummy outputs, used for control dependency") + .AsDuplicable(); AddComment(R"DOC( SendBarrier operator diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 
130f18dde4f979a6a9925ede9cbf745fcec14d48..2826b82117db113d4d8c10095e89f610ca895775 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { @@ -41,19 +40,33 @@ class FillConstantOp : public framework::OperatorBase { static_cast(Attr("dtype")); auto value = Attr("value"); auto force_cpu = Attr("force_cpu"); - auto &out = - *scope.FindVar(Output("Out"))->GetMutable(); - out.Resize(framework::make_ddim(Attr>("shape"))); + + framework::Tensor *tensor = nullptr; + + auto &out_var = *scope.FindVar(Output("Out")); + + if (out_var.IsType()) { + tensor = out_var.GetMutable(); + tensor->Resize(framework::make_ddim(Attr>("shape"))); + } else if (out_var.IsType()) { + tensor = out_var.GetMutable()->mutable_value(); + tensor->Resize(framework::make_ddim(Attr>("shape"))); + } else { + PADDLE_THROW( + "fill constant op's output only" + "supports SelectedRows and LoDTensor"); + } + if (force_cpu) { auto cpu = platform::CPUPlace(); - out.mutable_data(cpu, framework::ToTypeIndex(data_type)); + tensor->mutable_data(cpu, framework::ToTypeIndex(data_type)); } else { - out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); + tensor->mutable_data(dev_place, framework::ToTypeIndex(data_type)); } platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); - math::set_constant(dev_ctx, &out, value); + math::set_constant(dev_ctx, tensor, value); } }; diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc index 925dc19061e2196a40411f415eb6e5ad59ab52ff..adc7cb1f9e48ba5fabeb91c5e3ecec016db34a45 100644 --- a/paddle/fluid/operators/fill_op.cc +++ 
b/paddle/fluid/operators/fill_op.cc @@ -25,7 +25,7 @@ struct FillOpVisitor { : tensor_(tensor), value_(value) {} template - void operator()() const { + void apply() const { platform::CPUPlace cpu; auto *data = tensor_->mutable_data(cpu); std::transform(value_.data(), value_.data() + tensor_->numel(), data, diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused_elemwise_activation_op.cc index a6fd0aeb021dce40339c32251af130d5984dccd2..b54f0091b3fe21222b4690f4dcff1c081d4799e7 100644 --- a/paddle/fluid/operators/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused_elemwise_activation_op.cc @@ -12,14 +12,60 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/fused_elemwise_activation_op.h" #include #include -#include "paddle/fluid/operators/fused_elemwise_activation_op.h" - namespace paddle { namespace operators { +/* + * Whether the compound function is Unary(Binary(X, Y)). + * For Unary(Binary(X, Y)), the intermediate_out's shape is the same the final + * out. + */ +static bool IsUnaryCompound(const std::vector &functor_list) { + PADDLE_ENFORCE_EQ(functor_list.size(), 2); + static std::unordered_set binary_fun = { + "elementwise_add", "elementwise_mul", "elementwise_add_grad", + "elementwise_mul_grad"}; + return binary_fun.count(functor_list[1]) != 0; +} + +/* + * Whether the Input(X) could be absent. + */ +static bool InputXCanBeAbsent(const std::vector &functor_list) { + PADDLE_ENFORCE_EQ(functor_list.size(), 2); + static std::unordered_set binary_fun = {"elementwise_add_grad"}; + return binary_fun.count(functor_list[0]) != 0 || + binary_fun.count(functor_list[1]) != 0; +} + +/* + * Whether the compound function is supported. + * For Unary(Binary(X, Y)), the intermediate_out's shape is the same the final + * out. 
+ */ +static bool IsSupportedCompound(const std::vector &functors) { + static std::unordered_set unary_fun = {"scale", "relu"}; + static std::unordered_set binary_fun = {"elementwise_add", + "elementwise_mul"}; + + std::string unary_fun_str; + if (binary_fun.count(functors[0])) { + unary_fun_str = functors[1]; + } else if (binary_fun.count(functors[1])) { + unary_fun_str = functors[0]; + } else { + PADDLE_THROW("%s and %s are not included in fused_list.", functors[0], + functors[1]); + } + PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str), 1, + "%s is not included in fused_list.", unary_fun_str); + return true; +} + class FusedElemwiseActivationOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -37,11 +83,44 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel { auto x_dim = ctx->GetInputDim("X"); auto y_dim = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), - "Rank of first input must >= rank of second input."); - ctx->SetOutputDim("Out", x_dim); - ctx->ShareLoD("X", /*->*/ "Out"); + // Whether the shape of Y is a continuous subsequence of X, + // For more information please refer to the op's introduction. + bool bcast_y = x_dim.size() >= y_dim.size(); + if (x_dim.size() == y_dim.size()) { + for (int i = 0; i < x_dim.size(); ++i) { + if (x_dim[i] < y_dim[i]) { + bcast_y = false; + break; + } + } + } + + auto &out_dim = bcast_y ? x_dim : y_dim; + std::string out_lod = bcast_y ? "X" : "Y"; + + if (ctx->Attrs().Get("keep_intermediate_value")) { + PADDLE_ENFORCE(ctx->HasOutput("IntermediateOut"), + "Output(IntermediateOut) of FusedElemwiseActivationOp " + "should not be null."); + + if (IsUnaryCompound( + ctx->Attrs().Get>("functor_list"))) { + // for Unary(Binary(X, Y)), the shape and lod of out and + // intermediate_out are the same. 
+ ctx->SetOutputDim("IntermediateOut", out_dim); + // set the lod of intermediate_out + ctx->ShareLoD(out_lod, /*->*/ "IntermediateOut"); + } else { + // for Binary(X, Unary(Y)), the shape and lod of Y and + // intermediate_out are the same. + ctx->SetOutputDim("IntermediateOut", y_dim); + // set the lod of intermediate_out + ctx->ShareLoD("Y", /*->*/ "IntermediateOut"); + } + } + ctx->SetOutputDim("Out", out_dim); + ctx->ShareLoD(out_lod, /*->*/ "Out"); } protected: @@ -59,29 +138,42 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel { class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "(vector)"); - AddInput("Y", "(vector)"); - AddOutput("Out", "vector"); + AddInput( + "X", + "(Tensor) The input tensor of fused_elemwise_activation operator."); + AddInput( + "Y", + "(Tensor) The input tensor of fused_elemwise_activation operator."); + AddOutput("Out", + "vector The output tensor of fused_elemwise_activation " + "operator."); + AddOutput("IntermediateOut", + "Tensor The IntermediateOut tensor of fused_elemwise_activation " + "operator.") + .AsIntermediate(); AddAttr("axis", "axis is used by elementwise_op, the default value is -1.") .SetDefault(-1); AddAttr("scale", "scale is used by scale_op, the default value is 0.0.") .SetDefault(0.0); - AddAttr("recomputation", - "Whether to recompute the Out." - "fused_elemwise_activation_grad has two methods to get the " - "dx and dy, one " - "is to use the 'Out', and the other is not to use it. " - "The former method will save the time of recomputing the " - "'Out', but it must occupy the memory to store the 'out'. " - "While, the later method can avoid occupying the memory, " - "but it must recompute the 'Out'. The default value is true.") + AddAttr( + "recomputation", + "Whether to recompute the Out." 
+ "The computation of fused_elemwise_activation_grad has two methods to " + "get the dx and dy, one is to use the 'Out', and the other is not. " + "The former method will save the time of recomputing the 'Out', but it " + "must occupy the memory to store the 'out'. While, the later method " + "can avoid occupying the memory, but it must recompute the 'Out'. " + "It is useful for Unary(Binary(X, Y)). The default value is true.") .SetDefault(true); + AddAttr("keep_intermediate_value", + "Whether to save the intermediate_out.") + .SetDefault(false); AddAttr>("functor_list", "The functors that should be fused.") .AddCustomChecker([&](const std::vector &functor_list) { - PADDLE_ENFORCE(ValidCheck(functor_list)); + PADDLE_ENFORCE(IsSupportedCompound(functor_list)); }); AddComment(R"DOC( @@ -93,30 +185,38 @@ operators (elementwise_op and activation_op): Z = Binary(X, Unary(Y)) Z = Unary(Binary(X, Y)) -The attributions of activation_op can be get from fused_elemwise_activation_op's -attributions. functor_list records the functors to be fused, for example -"scale,elementwise_add". +There are two cases for this operator: -)DOC"); - } +1. The shape of $Y$ and $X$ is the same. +2. The shape of $Y$ is a continuous subsequence of $X$ or the shape of $X$ is a continuous subsequence of $Y$. - private: - bool ValidCheck(const std::vector &functors) { - std::unordered_set unary_fun = {"scale", "relu"}; - std::unordered_set binary_fun = {"elementwise_add"}; +For case 2 (assume that the shape of $Y$ is a continuous subsequence of $X$ ): - std::string unary_fun_str; - if (binary_fun.count(functors[0])) { - unary_fun_str = functors[1]; - } else if (binary_fun.count(functors[1])) { - unary_fun_str = functors[0]; - } else { - PADDLE_THROW("%s and %s are not included in fused_list.", functors[0], - functors[1]); - } - PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str), 1, - "%s is not included in fused_list.", unary_fun_str); - return true; +1. 
Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index + for broadcasting $Y$ onto $X$. +2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$. +3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of + subsequence, such as shape(Y) = (2, 1) => (2). + +For example: + + .. code-block:: python + + shape(X) = (2, 3, 4, 5), shape(Y) = (,) + shape(X) = (2, 3, 4, 5), shape(Y) = (5,) + shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2 + shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 + shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 + + +The inputs $X$ and $Y$ can carry the different LoD information. +But the output only shares the LoD information with the one whose shape is the same with Out. +The attributions of activation_op can be get from fused_elemwise_activation_op's. +The functor_list records the functions to be fused, for example +["scale", "elementwise_add"]. 
+ +)DOC"); } }; @@ -141,6 +241,7 @@ class FusedElemwiseActivationGradMaker op_desc_ptr->SetInput(framework::GradVarName(output_param), this->OutputGrad(output_param)); } + op_desc_ptr->SetAttrMap(this->Attrs()); std::vector functor_names = @@ -158,40 +259,59 @@ class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), - "Rank of first input must >= rank of second input."); + "Input(Out@Grad) should not be null"); + if (ctx->Attrs().Get("keep_intermediate_value")) { + PADDLE_ENFORCE(ctx->HasInput("IntermediateOut"), + "Input(IntermediateOut) should not be null"); + } else { + PADDLE_ENFORCE_EQ(ctx->Inputs(framework::GradVarName("Out")).size(), 1); + } + auto funtor_list = + ctx->Attrs().Get>("functor_list"); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); + if (ctx->HasInputs("X")) { + ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); + ctx->ShareLoD("X", x_grad_name); + } else { + // Node: If "X" is absence, the shape of Y should be a continuous + // subsequence of X, if not, we could not infer the shape of dx. + + // Currently, only when Binary is elementwise_add or elementwise_sub, + // the "X" could be absent. 
+ PADDLE_ENFORCE(InputXCanBeAbsent(funtor_list), + "Only when BinaryFunctor is elementwise_add, the 'X' " + "could be absent."); + + // For Unary(Binary(X, Y)), IntermediateOut should not be empty. + if (IsUnaryCompound(funtor_list)) { + PADDLE_ENFORCE( + ctx->HasInputs("IntermediateOut"), + "If the compound_functor is Unary(Binary(X, Y)) and Binary " + "is elementwise_add, the intermediate_out must be not absent."); + } + + ctx->SetOutputDim(x_grad_name, + ctx->GetInputDim(framework::GradVarName("Out"))); + ctx->ShareLoD(framework::GradVarName("Out"), x_grad_name); + } } if (ctx->HasOutput(y_grad_name)) { - ctx->SetOutputDim(y_grad_name, y_dims); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + ctx->SetOutputDim(y_grad_name, ctx->GetInputDim("Y")); + ctx->ShareLoD("Y", y_grad_name); } } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type_index = ctx.Input("X")->type(); - PADDLE_ENFORCE_EQ(input_data_type_index, - ctx.Input("Y")->type(), - "The element's type of input should be the same."); - PADDLE_ENFORCE_EQ( - input_data_type_index, - ctx.Input(framework::GradVarName("Out"))->type(), - "The element's type of input should be the same."); - + // PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + auto input_data_type_index = ctx.Input("Y")->type(); auto input_data_type = framework::ToDataType(input_data_type_index); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused_elemwise_activation_op.h index fe0017b824532b1210d0ae3e51983d63d081f12a..6321541aab7e31cd703289bb8951245215ecb3e2 100644 --- a/paddle/fluid/operators/fused_elemwise_activation_op.h +++ b/paddle/fluid/operators/fused_elemwise_activation_op.h @@ -20,208 +20,114 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/operators/math/compound_functors.h" #include "paddle/fluid/operators/math/functors.h" -namespace math = paddle::operators::math; - namespace paddle { namespace operators { -// CompoundFunctors -// For example: Z = Binary(X, Unary(Y)) -template -struct BinaryCompoundFunctor { - BinaryCompoundFunctor(const BinaryFun &binary_fun, const UnaryFun &unary_fun) - : binary_fun_(binary_fun), unary_fun_(unary_fun) {} - - inline HOSTDEVICE T operator()(T x, T y) { - return binary_fun_(x, unary_fun_(y)); - } - - private: - BinaryFun binary_fun_; - UnaryFun unary_fun_; -}; - -// For example: Z = Unary(Binary(X, Y)) -template -struct UnaryCompoundFunctor { - UnaryCompoundFunctor(const UnaryFun &unary_fun, const BinaryFun &binary_fun) - : unary_fun_(unary_fun), binary_fun_(binary_fun) {} - - inline HOSTDEVICE T operator()(T x, T y) { - return unary_fun_(binary_fun_(x, y)); - } - - private: - UnaryFun unary_fun_; - BinaryFun binary_fun_; -}; - -// FIXME(zcd): DBinaryFun and DUnaryFun have to method to get -// the dx, one is to use the 'out', and the other is not to use it. -// the former method will save the time of recomputing the -// 'out', but it must occupy the memory to store the 'out'. -// While the later method can avoid occupying this memory, -// but it must recompute the 'out'. 
- -template -struct BinaryCompoundGradDxFunctor { - BinaryCompoundGradDxFunctor(const DBinaryFun &d_binary_fun, - const UnaryFun &unary_fun) - : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {} - - inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { - if (Recomputation) { - return dout * d_binary_fun_(x, unary_fun_(y)); - } else { - return dout * d_binary_fun_(x, unary_fun_(y), out); - } - } - - private: - DBinaryFun d_binary_fun_; - UnaryFun unary_fun_; -}; - -template -struct BinaryCompoundGradDyFunctor { - BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun, - const UnaryFun &unary_fun, - const DUnaryFun &d_unary_fun) - : d_binary_fun_(d_binary_fun), - unary_fun_(unary_fun), - d_unary_fun_(d_unary_fun) {} - - inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { - if (Recomputation) { - return dout * d_binary_fun_(unary_fun_(y), x) * d_unary_fun_(y); - } else { - return dout * d_binary_fun_(unary_fun_(y), x, out) * d_unary_fun_(y); - } - } - - private: - DBinaryFun d_binary_fun_; - UnaryFun unary_fun_; - DUnaryFun d_unary_fun_; -}; - -template -struct UnaryCompoundGradDxFunctor { - UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun, - const BinaryFun &binary_fun, - const DBinaryFun &d_binary_fun) - : d_unary_fun_(d_unary_fun), - binary_fun_(binary_fun), - d_binary_fun_(d_binary_fun) {} - - inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { - T base; - if (Recomputation) { - base = dout * d_unary_fun_(binary_fun_(x, y)); - } else { - base = dout * d_unary_fun_(binary_fun_(x, y), out); - } - return base * d_binary_fun_(x, y); - } - - private: - DUnaryFun d_unary_fun_; - BinaryFun binary_fun_; - DBinaryFun d_binary_fun_; -}; - -template -struct UnaryCompoundGradDyFunctor { - UnaryCompoundGradDyFunctor(const DUnaryFun &d_unary_fun, - const BinaryFun &binary_fun, - const DBinaryFun &d_binary_fun) - : d_unary_fun_(d_unary_fun), - binary_fun_(binary_fun), - d_binary_fun_(d_binary_fun) {} - - inline HOSTDEVICE T operator()(T x, T y, T 
out, T dout) { - T base; - if (Recomputation) { - base = dout * d_unary_fun_(binary_fun_(x, y)); - } else { - base = dout * d_unary_fun_(binary_fun_(x, y), out); - } - return base * d_binary_fun_(y, x); - } - - private: - DUnaryFun d_unary_fun_; - BinaryFun binary_fun_; - DBinaryFun d_binary_fun_; -}; - template -static void RunBinaryCompoundFunctor(const framework::ExecutionContext &ctx, - const BinaryFunctor &binary_functor, - const UnaryFunctor &unary_functor, - const framework::Tensor *in_x, - const framework::Tensor *in_y, - framework::Tensor *output) { +static void RunBinaryCompoundFunctor( + const framework::ExecutionContext &ctx, const BinaryFunctor &binary_functor, + const UnaryFunctor &unary_functor, const framework::Tensor &in_x, + const framework::Tensor &in_y, std::vector *outputs) { + // Z = Binary(X, Unary(Y)) + // intermediate_out = Unary(Y) + // out = Binary(X, Unary(Y)) + // In this case, the shape of intermediate_out and out are different. + paddle::operators::math::BinaryCompoundFunctor + compound_func(binary_functor, unary_functor); int axis = ctx.Attr("axis"); - using BinaryCompoundFunctor = - BinaryCompoundFunctor; - - ElementwiseComputeEx( - ctx, in_x, in_y, axis, - BinaryCompoundFunctor(binary_functor, unary_functor), output); + if (ctx.Attr("keep_intermediate_value")) { + FusedElemwiseAndActComputeEx, + true /*KeepIntermediateValue*/, + false /*SameShapeOfIntermediateOutAndOut*/>( + ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); + } else { + FusedElemwiseAndActComputeEx, + false /*KeepIntermediateValue*/, + false /*SameShapeOfIntermediateOutAndOut*/>( + ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); + } } template -static void RunUnaryCompoundFunctors(const framework::ExecutionContext &ctx, - const UnaryFunctor &unary_functor, - const BinaryFunctor &binary_functor, - const framework::Tensor *in_x, - const framework::Tensor *in_y, - framework::Tensor *output) { +static void 
RunUnaryCompoundFunctors( + const framework::ExecutionContext &ctx, const UnaryFunctor &unary_functor, + const BinaryFunctor &binary_functor, const framework::Tensor &in_x, + const framework::Tensor &in_y, std::vector *outputs) { + // Z = Unary(Binary(X, Y)) + // intermediate_out = Binary(X, Y) + // out = Unary(Binary(X, Y)) + // In this case, the shape of intermediate_out and out are the same. int axis = ctx.Attr("axis"); - using UnaryCompoundFunctor = - UnaryCompoundFunctor; + paddle::operators::math::UnaryCompoundFunctor + compound_func(unary_functor, binary_functor); - ElementwiseComputeEx( - ctx, in_x, in_y, axis, - UnaryCompoundFunctor(unary_functor, binary_functor), output); + if (ctx.Attr("keep_intermediate_value")) { + FusedElemwiseAndActComputeEx, + true /*KeepIntermediateValue*/, + true /*SameShapeOfIntermediateOutAndOut*/>( + ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); + } else { + FusedElemwiseAndActComputeEx, + false /*KeepIntermediateValue*/, + true /*SameShapeOfIntermediateOutAndOut*/>( + ctx, in_x, in_y, axis, compound_func, (*outputs)[0], (*outputs)[1]); + } } template + typename UnaryFunctor, typename UnaryGradFunctor> static void RunBinaryCompoundGradFunctors( const framework::ExecutionContext &ctx, const BinaryGradFunctor &binary_grad_functor, const UnaryFunctor &unary_functor, const UnaryGradFunctor &unary_grad_functor, const framework::Tensor *in_x, const framework::Tensor *in_y, const framework::Tensor *in_out, + const framework::Tensor *in_intermediate_out, const framework::Tensor *in_out_grad, framework::Tensor *x_grad, framework::Tensor *y_grad) { + // Z = Binary(X, Unary(Y)) int axis = ctx.Attr("axis"); using BinaryCompoundDxFunctor = - BinaryCompoundGradDxFunctor; + paddle::operators::math::BinaryCompoundGradDxFunctor; using BinaryCompoundDyFunctor = - BinaryCompoundGradDyFunctor; - - ElemwiseGradCompute( - ctx, *in_x, *in_y, *in_out, *in_out_grad, axis, x_grad, y_grad, - 
BinaryCompoundDxFunctor(binary_grad_functor, unary_functor), - BinaryCompoundDyFunctor(binary_grad_functor, unary_functor, - unary_grad_functor)); + paddle::operators::math::BinaryCompoundGradDyFunctor< + T, BinaryGradFunctor, UnaryFunctor, UnaryGradFunctor>; + + if (in_intermediate_out) { + FusedElemwiseAndActGradComputeEx< + DeviceContext, T, BinaryCompoundDxFunctor, BinaryCompoundDyFunctor, + true /*UseIntermediateOut*/, + false /*SameShapeOfIntermediateOutAndOut*/>( + ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, + y_grad, BinaryCompoundDxFunctor(binary_grad_functor, unary_functor), + BinaryCompoundDyFunctor(binary_grad_functor, unary_functor, + unary_grad_functor)); + } else { + FusedElemwiseAndActGradComputeEx< + DeviceContext, T, BinaryCompoundDxFunctor, BinaryCompoundDyFunctor, + false /*UseIntermediateOut*/, + false /*SameShapeOfIntermediateOutAndOut*/>( + ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, + y_grad, BinaryCompoundDxFunctor(binary_grad_functor, unary_functor), + BinaryCompoundDyFunctor(binary_grad_functor, unary_functor, + unary_grad_functor)); + } } template ("axis"); using UnaryCompoundDxFunctor = - UnaryCompoundGradDxFunctor; + paddle::operators::math::UnaryCompoundGradDxFunctor< + T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, Recomputation>; using UnaryCompoundDyFunctor = - UnaryCompoundGradDyFunctor; - - ElemwiseGradCompute( - ctx, *in_x, *in_y, *in_out, *in_out_grad, axis, x_grad, y_grad, - UnaryCompoundDxFunctor(unary_grad_functor, binary_functor, - binary_grad_functor), - UnaryCompoundDyFunctor(unary_grad_functor, binary_functor, - binary_grad_functor)); + paddle::operators::math::UnaryCompoundGradDyFunctor< + T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, Recomputation>; + + if (in_intermediate_out) { + FusedElemwiseAndActGradComputeEx< + DeviceContext, T, UnaryCompoundDxFunctor, UnaryCompoundDyFunctor, + true /*UseIntermediateOut*/, true 
/*SameShapeOfIntermediateOutAndOut*/>( + ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, + y_grad, UnaryCompoundDxFunctor(unary_grad_functor, binary_functor, + binary_grad_functor), + UnaryCompoundDyFunctor(unary_grad_functor, binary_functor, + binary_grad_functor)); + } else { + FusedElemwiseAndActGradComputeEx( + ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, + y_grad, UnaryCompoundDxFunctor(unary_grad_functor, binary_functor, + binary_grad_functor), + UnaryCompoundDyFunctor(unary_grad_functor, binary_functor, + binary_grad_functor)); + } } template static void RunFunctors(const framework::ExecutionContext &ctx, - const framework::Tensor *in_x, - const framework::Tensor *in_y, - framework::Tensor *output) { + const framework::Tensor &in_x, + const framework::Tensor &in_y, + std::vector *outputs) { auto &functors = ctx.Attr>("functor_list"); - auto funcs_str = functors[0] + "," + functors[1]; + // TODO(zcd): The following code can be refined. 
+ auto funcs_str = functors[0] + "," + functors[1]; if (funcs_str == "elementwise_add,scale") { // Z = Binary(X, Unary(Y)) T scale = static_cast(ctx.Attr("scale")); - RunBinaryCompoundFunctor, - math::ScaleFunctor>( - ctx, math::AddFunctor(), math::ScaleFunctor(scale), in_x, in_y, - output); + RunBinaryCompoundFunctor, + paddle::operators::math::ScaleFunctor>( + ctx, paddle::operators::math::AddFunctor(), + paddle::operators::math::ScaleFunctor(scale), in_x, in_y, outputs); } else if (funcs_str == "scale,elementwise_add") { // Z = Unary(Binary(X, Y)) T scale = static_cast(ctx.Attr("scale")); - RunUnaryCompoundFunctors, - math::AddFunctor>( - ctx, math::ScaleFunctor(scale), math::AddFunctor(), in_x, in_y, - output); + RunUnaryCompoundFunctors, + paddle::operators::math::AddFunctor>( + ctx, paddle::operators::math::ScaleFunctor(scale), + paddle::operators::math::AddFunctor(), in_x, in_y, outputs); } else if (funcs_str == "elementwise_add,relu") { - RunBinaryCompoundFunctor, - math::ReluFunctor>( - ctx, math::AddFunctor(), math::ReluFunctor(), in_x, in_y, output); + // Z = Binary(X, Unary(Y)) + RunBinaryCompoundFunctor, + paddle::operators::math::ReluFunctor>( + ctx, paddle::operators::math::AddFunctor(), + paddle::operators::math::ReluFunctor(), in_x, in_y, outputs); } else if (funcs_str == "relu,elementwise_add") { - RunUnaryCompoundFunctors, - math::AddFunctor>( - ctx, math::ReluFunctor(), math::AddFunctor(), in_x, in_y, output); + // Z = Unary(Binary(X, Y)) + RunUnaryCompoundFunctors, + paddle::operators::math::AddFunctor>( + ctx, paddle::operators::math::ReluFunctor(), + paddle::operators::math::AddFunctor(), in_x, in_y, outputs); + } else if (funcs_str == "elementwise_mul,scale") { + // Z = Binary(X, Unary(Y)) + T scale = static_cast(ctx.Attr("scale")); + RunBinaryCompoundFunctor, + paddle::operators::math::ScaleFunctor>( + ctx, paddle::operators::math::MulFunctor(), + paddle::operators::math::ScaleFunctor(scale), in_x, in_y, outputs); } else { PADDLE_THROW("%s 
has not been implemented.", funcs_str); } } -template +template static void RunGradFunctors(const framework::ExecutionContext &ctx, const framework::Tensor *in_x, const framework::Tensor *in_y, const framework::Tensor *in_out, + const framework::Tensor *in_intermediate_out, const framework::Tensor *in_out_grad, framework::Tensor *x_grad, framework::Tensor *y_grad) { auto &functors = ctx.Attr>("functor_list"); auto funcs_str = functors[0] + "," + functors[1]; - bool recomputation = ctx.Attr("recomputation"); - - // TODO(zcd): The following code can be refined. for example, use registion + // TODO(zcd): The following code can be refined. for example, use registrition if (funcs_str == "elementwise_add_grad,scale_grad") { // The backward of Z = Binary(X, Unary(Y)) T scale = static_cast(ctx.Attr("scale")); - if (recomputation) { - RunBinaryCompoundGradFunctors, - math::ScaleFunctor, - math::ScaleGradFunctor, true>( - ctx, math::AddGradFunctor(), math::ScaleFunctor(scale), - math::ScaleGradFunctor(scale), in_x, in_y, in_out, in_out_grad, - x_grad, y_grad); - } else { - RunBinaryCompoundGradFunctors, - math::ScaleFunctor, - math::ScaleGradFunctor, false>( - ctx, math::AddGradFunctor(), math::ScaleFunctor(scale), - math::ScaleGradFunctor(scale), in_x, in_y, in_out, in_out_grad, - x_grad, y_grad); - } + RunBinaryCompoundGradFunctors, + paddle::operators::math::ScaleFunctor, + paddle::operators::math::ScaleGradFunctor>( + ctx, paddle::operators::math::AddGradFunctor(), + paddle::operators::math::ScaleFunctor(scale), + paddle::operators::math::ScaleGradFunctor(scale), in_x, in_y, in_out, + in_intermediate_out, in_out_grad, x_grad, y_grad); } else if (funcs_str == "scale_grad,elementwise_add_grad") { // The backward of Z = Unary(Binary(X, Y)) T scale = static_cast(ctx.Attr("scale")); - if (recomputation) { - RunUnaryCompoundGradFunctors, - math::AddFunctor, math::AddGradFunctor, - true>(ctx, math::ScaleGradFunctor(scale), - math::AddFunctor(), - math::AddGradFunctor(), in_x, 
in_y, - in_out, in_out_grad, x_grad, y_grad); - } else { - RunUnaryCompoundGradFunctors, - math::AddFunctor, math::AddGradFunctor, - false>(ctx, math::ScaleGradFunctor(scale), - math::AddFunctor(), - math::AddGradFunctor(), in_x, in_y, - in_out, in_out_grad, x_grad, y_grad); - } + RunUnaryCompoundGradFunctors, + paddle::operators::math::AddFunctor, + paddle::operators::math::AddGradFunctor, + ReComputation /*Recomputation*/>( + ctx, paddle::operators::math::ScaleGradFunctor(scale), + paddle::operators::math::AddFunctor(), + paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, + in_intermediate_out, in_out_grad, x_grad, y_grad); } else if (funcs_str == "elementwise_add_grad,relu_grad") { - if (recomputation) { - RunBinaryCompoundGradFunctors, - math::ReluFunctor, - math::ReluGradFunctor, true>( - ctx, math::AddGradFunctor(), math::ReluFunctor(), - math::ReluGradFunctor(), in_x, in_y, in_out, in_out_grad, x_grad, - y_grad); - } else { - RunBinaryCompoundGradFunctors, - math::ReluFunctor, - math::ReluGradFunctor, false>( - ctx, math::AddGradFunctor(), math::ReluFunctor(), - math::ReluGradFunctor(), in_x, in_y, in_out, in_out_grad, x_grad, - y_grad); - } + RunBinaryCompoundGradFunctors, + paddle::operators::math::ReluFunctor, + paddle::operators::math::ReluGradFunctor>( + ctx, paddle::operators::math::AddGradFunctor(), + paddle::operators::math::ReluFunctor(), + paddle::operators::math::ReluGradFunctor(), in_x, in_y, in_out, + in_intermediate_out, in_out_grad, x_grad, y_grad); } else if (funcs_str == "relu_grad,elementwise_add_grad") { - if (recomputation) { - RunUnaryCompoundGradFunctors, - math::AddFunctor, math::AddGradFunctor, - true>(ctx, math::ReluGradFunctor(), - math::AddFunctor(), - math::AddGradFunctor(), in_x, in_y, - in_out, in_out_grad, x_grad, y_grad); - } else { - RunUnaryCompoundGradFunctors, - math::AddFunctor, math::AddGradFunctor, - false>(ctx, math::ReluGradFunctor(), - math::AddFunctor(), - math::AddGradFunctor(), in_x, in_y, - in_out, 
in_out_grad, x_grad, y_grad); - } + RunUnaryCompoundGradFunctors, + paddle::operators::math::AddFunctor, + paddle::operators::math::AddGradFunctor, + ReComputation /*Recomputation*/>( + ctx, paddle::operators::math::ReluGradFunctor(), + paddle::operators::math::AddFunctor(), + paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, + in_intermediate_out, in_out_grad, x_grad, y_grad); + } else if (funcs_str == "elementwise_mul_grad,scale_grad") { + // The backward of Z = Binary(X, Unary(Y)) + T scale = static_cast(ctx.Attr("scale")); + RunBinaryCompoundGradFunctors, + paddle::operators::math::ScaleFunctor, + paddle::operators::math::ScaleGradFunctor>( + ctx, paddle::operators::math::MulGradFunctor(), + paddle::operators::math::ScaleFunctor(scale), + paddle::operators::math::ScaleGradFunctor(scale), in_x, in_y, in_out, + in_intermediate_out, in_out_grad, x_grad, y_grad); } else { PADDLE_THROW("%s has not been implemented.", funcs_str); } @@ -385,11 +307,23 @@ class FusedElemwiseActivationKernel : public framework::OpKernel { auto &in_y = detail::Ref(ctx.Input("Y"), "Cannot get input tensor %s, variable name = %s", "Y", ctx.op().Input("Y")); - auto &output = detail::Ref(ctx.Output("Out"), - "Cannot get input tensor %s, variable name = %s", - "Out", ctx.op().Output("Out")); + PADDLE_ENFORCE(ctx.HasOutput("Out"), "The output(Out) should not be empty"); + auto output = ctx.Output("Out"); + + std::vector outputs; + outputs.emplace_back(output); + + if (ctx.Attr("keep_intermediate_value")) { + PADDLE_ENFORCE(ctx.HasOutput("IntermediateOut"), + "The keep_intermediate_value is enable, so the " + "IntermediateOut should not be empty."); + auto intermediate_out = ctx.Output("IntermediateOut"); + outputs.emplace_back(intermediate_out); + } else { + outputs.emplace_back(nullptr); + } - RunFunctors(ctx, &in_x, &in_y, &output); + RunFunctors(ctx, in_x, in_y, &outputs); } }; @@ -397,28 +331,66 @@ template class FusedElemwiseActivationGradKernel : public framework::OpKernel { 
public: void Compute(const framework::ExecutionContext &ctx) const override { - auto &in_x = detail::Ref(ctx.Input("X"), - "Cannot get input tensor %s, variable name = %s", - "X", ctx.op().Input("X")); - auto &in_y = detail::Ref(ctx.Input("Y"), - "Cannot get input tensor %s, variable name = %s", - "Y", ctx.op().Input("Y")); - auto &in_out = detail::Ref(ctx.Input("Out"), - "Cannot get input tensor %s, variable name = %s", - "Out", ctx.op().Input("Out")); - auto &in_out_grad = - detail::Ref(ctx.Input(framework::GradVarName("Out")), - "Cannot get input tensor %s, variable name = %s", - framework::GradVarName("Out"), - ctx.op().Input(framework::GradVarName("Out"))); + auto x = ctx.Input("X"); + auto y = ctx.Input("Y"); + + auto in_out = ctx.Input("Out"); + auto in_out_grad = + ctx.Input(framework::GradVarName("Out")); framework::Tensor *x_grad = ctx.Output(framework::GradVarName("X")); framework::Tensor *y_grad = ctx.Output(framework::GradVarName("Y")); - RunGradFunctors(ctx, &in_x, &in_y, &in_out, &in_out_grad, - x_grad, y_grad); + PADDLE_ENFORCE(y != nullptr, "Input(Y) should not be nullptr."); + + if (ctx.Attr("recomputation")) { + PADDLE_ENFORCE( + x != nullptr, + "The recomputation is opened, so Input(X) should not be absent."); + } else { + PADDLE_ENFORCE(in_out != nullptr, + "The recomputation is disabled, so the Input('Out') " + "should not be empty."); + } + + framework::Tensor *in_x; + auto functor_list = ctx.Attr>("functor_list"); + + // If functor_list contains elementwise_add, the backward doesn't use + // in_x, and in_outs. 
+ if (x == nullptr) { + PADDLE_ENFORCE(functor_list[0] == "elementwise_add_grad" || + functor_list[1] == "elementwise_add_grad", + "Only when the compoundfunctor contains " + "elementwise_add_grad, the 'X' could be absent."); + in_x = const_cast(in_out_grad); + in_out = const_cast(in_out_grad); + } else { + in_x = const_cast(x); + } + + framework::Tensor *in_intermediate_out; + if (ctx.Attr("keep_intermediate_value")) { + in_intermediate_out = const_cast( + ctx.Input("IntermediateOut")); + PADDLE_ENFORCE(in_intermediate_out != nullptr, + "The option of 'keep_intermediate_value' is opened, " + "so the number of 'Out' should be two."); + } else { + in_intermediate_out = nullptr; + } + + if (ctx.Attr("recomputation")) { + RunGradFunctors( + ctx, in_x, y, in_out, in_intermediate_out, in_out_grad, x_grad, + y_grad); + } else { + RunGradFunctors( + ctx, in_x, y, in_out, in_intermediate_out, in_out_grad, x_grad, + y_grad); + } } }; } // namespace operators diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3a34aa86b6331e4fe2813eea97cb6644323807c3 --- /dev/null +++ b/paddle/fluid/operators/fusion_gru_op.cc @@ -0,0 +1,332 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fusion_gru_op.h" +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" +#include "paddle/fluid/operators/math/detail/gru_kernel.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +namespace paddle { +namespace operators { + +void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasInput("WeightX"), + "Input(WeightX) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasInput("WeightH"), + "Input(WeightH) of GRU should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"), + "Output(BatchedGate) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"), + "Output(BatchResetHiddenPrev) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), + "Output(BatchedHidden) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Output(Hidden) of GRU should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + + auto wx_dims = ctx->GetInputDim("WeightX"); + PADDLE_ENFORCE_EQ(wx_dims.size(), 2, + "The rank of Input(WeightX) should be 2."); + PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], + "The first dimension of Input(WeightX) " + "should be %d.", + x_dims[1]); + + int frame_size = wx_dims[1] / 3; + auto wh_dims = ctx->GetInputDim("WeightH"); + PADDLE_ENFORCE_EQ(wh_dims.size(), 2, + "The rank of Input(WeightH) should be 2."); + 
PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, + "The first dimension of Input(WeightH) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(wh_dims[1], 3 * frame_size, + "The second dimension of Input(WeightH) " + "should be 3 * %d.", + frame_size); + + if (ctx->HasInput("H0")) { + auto h0_dims = ctx->GetInputDim("H0"); + PADDLE_ENFORCE_EQ(h0_dims[1], frame_size, + "The width of H0 must be equal to frame_size."); + } + if (ctx->HasInput("Bias")) { + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + PADDLE_ENFORCE_EQ(b_dims[1], frame_size * 3, + "The shape of Bias must be [1, frame_size * 3]."); + } + framework::DDim out_dims({x_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]}); + ctx->SetOutputDim("BatchedHidden", out_dims); + ctx->SetOutputDim("BatchResetHiddenPrev", out_dims); + ctx->ShareLoD("X", "Hidden"); + + int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; + ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->ShareLoD("X", "XX"); +} + +framework::OpKernelType FusionGRUOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); +} + +void FusionGRUOpMaker::Make() { + AddInput("X", + "(LoDTensor) the input is a LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X M), where T is the " + "total time steps in this mini-batch, M is the dim size of x."); + AddInput("H0", + "(Tensor, optional) The initial hidden state is an optional " + "input. 
This is a tensor with shape (N x D), where N is the " + "batch size, D is the hidden size.") + .AsDispensable(); + AddInput("WeightX", + "(Tensor) The FC weight with shape (M x 3D)," + "where M is the dim size of x, D is the hidden size. "); + AddInput("WeightH", + "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. "); + AddInput("Bias", + "(Tensor, optional) (1 x 3D)." + "Almost same as GRUOp." + "Note: if have FC bias it should be added on this bias.") + .AsDispensable(); + AddOutput("XX", + "(LoDTensor) the result after X * WeightX (size is T x 4D)" + " or batched_X (size is T x M), this will be automatically chosen," + " where T is the total time steps in this mini-batch," + " D is the hidden size, M is the dim size of x input.") + .AsIntermediate(); + AddOutput("BatchedGate", "(LoDTensor) Same as GRUOp").AsIntermediate(); + AddOutput("BatchResetHiddenPrev", "(LoDTensor) (T x 3D) Same as GRUOp.") + .AsIntermediate(); + AddOutput("BatchedHidden", "(LoDTensor) (T X D) Same as GRUOp.") + .AsIntermediate(); + AddOutput("Hidden", "(LoDTensor) (T x D) Same as GRUOp"); + AddAttr("activation", + "(string, default tanh) " + "The activation type used for output candidate {h}_t.") + .SetDefault("tanh"); + AddAttr( + "gate_activation", + "(string, default sigmoid) " + "The activation type used in update gate and reset gate.") + .SetDefault("sigmoid"); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed GRU.") + .SetDefault(false); + AddComment(R"DOC( +The Fusion complete GRU Operator. +This operator fuse the fully-connected operator into GRU, +more details can refer to GRU op. 
+)DOC"); +} + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, + framework::Vector index_lod, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + row_shuffle(ctx, src, index_lod, dst, indexed_src); +} + +template +class FusionGRUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* wx = ctx.Input("WeightX"); + auto* wh = ctx.Input("WeightH"); + auto* bias = ctx.Input("Bias"); + auto* h0 = ctx.Input("H0"); + + auto* xx = ctx.Output("XX"); + auto* batched_gate = ctx.Output("BatchedGate"); + auto* batch_reset_hidden_prev = + ctx.Output("BatchResetHiddenPrev"); + auto* batch_hidden = ctx.Output("BatchedHidden"); + auto* hidden_out = ctx.Output("Hidden"); + bool is_reverse = ctx.Attr("is_reverse"); + + T* xx_data = xx->mutable_data(ctx.GetPlace()); + T* batched_gate_data = batched_gate->mutable_data(ctx.GetPlace()); + batch_reset_hidden_prev->mutable_data(ctx.GetPlace()); + batch_hidden->mutable_data(ctx.GetPlace()); + hidden_out->mutable_data(ctx.GetPlace()); + + const T* x_data = x->data(); + const T* wx_data = wx->data(); + const T* wh_data = wh->data(); + auto x_dims = x->dims(); + auto wx_dims = wx->dims(); + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + math::LoDTensor2BatchFunctor to_batch; + if (x_dims[1] > wx_dims[1]) { + math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], + x_data, wx_data, xx_data, + bias ? bias->data() : NULL); + to_batch(dev_ctx, *xx, batched_gate, true, is_reverse); + } else { + to_batch(dev_ctx, *x, xx, true, is_reverse); + batched_gate->set_lod(xx->lod()); + math::FCCompute(blas, x_dims[0], wx_dims[1], x_dims[1], + xx_data, wx_data, batched_gate_data, + bias ? 
bias->data() : NULL); + } + + int frame_size = static_cast(wx_dims[1] / 3); + math::GRUMetaValue gru_value; + gru_value.gate_weight = const_cast(wh_data); + gru_value.state_weight = + const_cast(wh_data + 2 * frame_size * frame_size); + Tensor ordered_h0; + + framework::Vector order(batched_gate->lod()[2]); + + if (h0) { + ReorderInitState( + ctx.template device_context(), *h0, order, &ordered_h0, + true); + gru_value.prev_out_value = ordered_h0.data(); + } else { + gru_value.prev_out_value = nullptr; + } + auto batch_starts = batched_gate->lod()[0]; + size_t seq_len = batch_starts.size() - 1; + auto active_node = + math::detail::GetActivationType(ctx.Attr("activation")); + auto active_gate = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + +#ifdef PADDLE_WITH_MKLML + // use MKL packed to speedup GEMM + if (FLAGS_paddle_num_threads >= 4) { + auto blas = math::GetBlas(dev_ctx); + T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, + frame_size * 2 /*width of weight*/, + frame_size /*height of height*/); + PADDLE_ENFORCE(packed_gate); + blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2, + frame_size, T(1.0), gru_value.gate_weight, frame_size * 2, + packed_gate); + T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, + frame_size /*width of weight*/, + frame_size /*height of height*/); + PADDLE_ENFORCE(packed_state); + blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size, + frame_size, T(1.0), gru_value.state_weight, frame_size, + packed_state); + for (size_t n = 0; n < seq_len; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + Tensor gate_t = batched_gate->Slice(bstart, bend); + Tensor reset_hidden_prev_t = + batch_reset_hidden_prev->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + 
gru_value.reset_output_value = reset_hidden_prev_t.data(); + + if (gru_value.prev_out_value) { + blas.GEMM_COMPUTE( + CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2, + frame_size, gru_value.prev_out_value, frame_size, packed_gate, + frame_size * 2, T(1), gru_value.gate_value, frame_size * 3); + } + + math::detail::forward_reset_output( + math::detail::forward::gru_resetOutput(), gru_value, frame_size, + cur_batch_size, active_gate); + + if (gru_value.prev_out_value) { + blas.GEMM_COMPUTE( + CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size, + gru_value.reset_output_value, frame_size, packed_state, + frame_size, T(1), gru_value.gate_value + frame_size * 2, + frame_size * 3); + } + + math::detail::forward_final_output( + math::detail::forward::gru_finalOutput(), gru_value, frame_size, + cur_batch_size, active_node); + + gru_value.prev_out_value = gru_value.output_value; + } + + blas.GEMM_FREE(packed_gate); + blas.GEMM_FREE(packed_state); + } else { +#endif + for (size_t n = 0; n < seq_len; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + Tensor gate_t = batched_gate->Slice(bstart, bend); + Tensor reset_hidden_prev_t = + batch_reset_hidden_prev->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + + math::GRUUnitFunctor::compute( + dev_ctx, gru_value, frame_size, cur_batch_size, active_node, + active_gate); + + gru_value.prev_out_value = gru_value.output_value; + } +#ifdef PADDLE_WITH_MKLML + } +#endif + math::Batch2LoDTensorFunctor to_seq; + batch_hidden->set_lod(batched_gate->lod()); + to_seq(dev_ctx, *batch_hidden, hidden_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, 
ops::FusionGRUOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OP_CPU_KERNEL( + fusion_gru, ops::FusionGRUKernel, + ops::FusionGRUKernel); diff --git a/paddle/fluid/operators/fusion_gru_op.h b/paddle/fluid/operators/fusion_gru_op.h new file mode 100644 index 0000000000000000000000000000000000000000..eaa59cd412f8f2fd0089428f5e25202c70f032c7 --- /dev/null +++ b/paddle/fluid/operators/fusion_gru_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusionGRUOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusionGRUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index 3888333ec5626f1d8d35db215085f483c985cf0a..e4e4ac8e333ba423e151dea05e40a0e41042570e 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -15,10 +15,14 @@ limitations under the License. */ #include "paddle/fluid/operators/fusion_lstm_op.h" #include #include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/fc_compute.h" #include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/platform/cpu_info.h" + +DEFINE_bool(seq_mode, true, "Use sequence mode"); namespace paddle { namespace operators { @@ -98,7 +102,12 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Cell"); - int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; + int xx_width; + if (FLAGS_seq_mode) { + xx_width = wx_dims[1]; + } else { + xx_width = x_dims[1] > wx_dims[1] ? 
wx_dims[1] : x_dims[1]; + } ctx->SetOutputDim("XX", {x_dims[0], xx_width}); ctx->ShareLoD("X", "XX"); } @@ -205,10 +214,138 @@ inline void ReorderInitState(const DeviceContext& ctx, row_shuffle(ctx, src, index_lod, dst, indexed_src); } -template +template class FuisonLSTMKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void SeqCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = paddle::platform::CPUDeviceContext; + auto* x = ctx.Input("X"); + auto* h0 = ctx.Input("H0"); + auto* c0 = ctx.Input("C0"); + auto* wx = ctx.Input("WeightX"); + auto* wh = ctx.Input("WeightH"); + auto* bias = ctx.Input("Bias"); + + auto* xx = ctx.Output("XX"); + auto* hidden_out = ctx.Output("Hidden"); + auto* cell_out = ctx.Output("Cell"); + bool is_reverse = ctx.Attr("is_reverse"); + + std::function act_gate, act_cell, act_cand; + auto& act_gate_str = ctx.Attr("gate_activation"); + auto& act_cell_str = ctx.Attr("cell_activation"); + auto& act_cand_str = ctx.Attr("candidate_activation"); + if (platform::jit::MayIUse(platform::jit::avx)) { + math::VecActivations act_functor; + act_gate = act_functor(act_gate_str); + act_cell = act_functor(act_cell_str); + act_cand = act_functor(act_cand_str); + } else { + math::VecActivations act_functor; + act_gate = act_functor(act_gate_str); + act_cell = act_functor(act_cell_str); + act_cand = act_functor(act_cand_str); + } + + auto x_lod = x->lod(); + auto x_dims = x->dims(); // T x M + auto wh_dims = wh->dims(); // D x 4D + const int total_T = x_dims[0]; + const int N = x_lod[0].size() - 1; // batch size + const int M = x_dims[1]; // x frame size + const int D = wh_dims[0]; + const int D2 = D * 2; + const int D3 = D * 3; + const int D4 = wh_dims[1]; + + const T* x_data = x->data(); + const T* h0_data = h0 ? h0->data() : NULL; + const T* c0_data = c0 ? 
c0->data() : NULL; + const T* wx_data = wx->data(); + const T* wh_data = wh->data(); + T* xx_data = xx->mutable_data(ctx.GetPlace()); + T* hidden_out_data = hidden_out->mutable_data(ctx.GetPlace()); + T* cell_out_data = cell_out->mutable_data(ctx.GetPlace()); + + auto blas = math::GetBlas(ctx); + math::FCCompute(blas, total_T, D4, M, x_data, wx_data, + xx_data, bias->data()); + int xx_offset = D4; + int gate_offset = D; + if (is_reverse) { + const int offset = (total_T - 1) * D; + xx_data = xx_data + offset * 4; + hidden_out_data = hidden_out_data + offset; + cell_out_data = cell_out_data + offset; + xx_offset = -D4; + gate_offset = -D; + } + + auto move_step = [&]() { + xx_data = xx_data + xx_offset; + hidden_out_data = hidden_out_data + gate_offset; + cell_out_data = cell_out_data + gate_offset; + }; + + for (int i = 0; i < N; ++i) { + int bid = is_reverse ? N - 1 - i : i; + int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; + const T* prev_cell_data = NULL; + const T* prev_hidden_data = NULL; + int tstart = 0; + if (h0_data) { + prev_hidden_data = h0_data + bid * D; + prev_cell_data = c0_data + bid * D; + } else { + // W_ch, W_ih, W_fh, W_oh + act_gate(D3, xx_data + D, xx_data + D); + act_cand(D, xx_data, xx_data); + // cell out= input*tilde + blas.VMUL(D, xx_data, xx_data + D, cell_out_data); + // hidden out= act_state(cellout) * outgate + act_cell(D, cell_out_data, xx_data + D2); + blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data); + + // prev + prev_hidden_data = hidden_out_data; + prev_cell_data = cell_out_data; + tstart = 1; + + move_step(); + } + for (int step = tstart; step < seq_len; ++step) { + blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast(1), + prev_hidden_data, D, wh_data, D4, static_cast(1), xx_data, + D4); + + // W_ch, W_ih, W_fh, W_oh + act_gate(D3, xx_data + D, xx_data + D); + act_cand(D, xx_data, xx_data); + + // a = forget * prev_cell + blas.VMUL(D, xx_data + D2, prev_cell_data, xx_data + D2); + + // b = input * tilde + 
blas.VMUL(D, xx_data, xx_data + D, xx_data + D); + + // cell out= a+b + blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data); + + // hidden out= act_state(cellout) * outgate + act_cell(D, cell_out_data, xx_data + D2); + blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data); + + // prev + prev_hidden_data = hidden_out_data; + prev_cell_data = cell_out_data; + + move_step(); + } + } + } + + void BatchCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = platform::CPUDeviceContext; auto* x = ctx.Input("X"); auto* wx = ctx.Input("WeightX"); auto* wh = ctx.Input("WeightH"); @@ -339,6 +476,13 @@ class FuisonLSTMKernel : public framework::OpKernel { // restore the output cell state in LoDTensor from the batch cell to_seq(dev_ctx, batch_cell, cell_out); } + void Compute(const framework::ExecutionContext& ctx) const override { + if (FLAGS_seq_mode) { + SeqCompute(ctx); + } else { + BatchCompute(ctx); + } + } }; } // namespace operators @@ -348,7 +492,5 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OP_CPU_KERNEL( - fusion_lstm, - ops::FuisonLSTMKernel, - ops::FuisonLSTMKernel); +REGISTER_OP_CPU_KERNEL(fusion_lstm, ops::FuisonLSTMKernel, + ops::FuisonLSTMKernel); diff --git a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cd3d3887cf5167c779a8b20442fdb458cd7eab4 --- /dev/null +++ b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc @@ -0,0 +1,206 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h" +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { + +void FusionSeqExpandConcatFCOp::InferShape( + framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE_GT( + ctx->Inputs("X").size(), 1UL, + "Inputs(X) of FusionSeqExpandConcatFCOp should larger than 1."); + PADDLE_ENFORCE( + ctx->HasInput("FCWeight"), + "Input(FCWeight) of FusionSeqExpandConcatFCOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FusionSeqExpandConcatFCOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("FCOut"), + "Output(FCOut) of FusionSeqExpandConcatFCOp should not be null."); + + auto ins_dims = ctx->GetInputsDim("X"); + auto w_dims = ctx->GetInputDim("FCWeight"); // (M0+M1+M2+..) 
x D + PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Input(FCWeight)'s rank must be 2."); + const int D = w_dims[1]; + int sum = ins_dims[0][1]; + for (size_t i = 1; i < ins_dims.size(); ++i) { + sum += ins_dims[i][1]; + } + PADDLE_ENFORCE_EQ(sum, w_dims[0], + "FC height should be sum of all inputs width."); + if (ctx->HasInput("FCBias")) { + auto b_dims = ctx->GetInputDim("FCBias"); + PADDLE_ENFORCE(b_dims.size() == 1 || b_dims.size() == 2, + "b_dims should be 1 or 2, get %d", b_dims.size()); + if (b_dims.size() == 1) { + PADDLE_ENFORCE_EQ(b_dims[0], D, "FCBias shapes must be %d.", D); + } else { + PADDLE_ENFORCE_EQ(b_dims[0], 1, "FCBias shapes must be 1x%d.", D); + PADDLE_ENFORCE_EQ(b_dims[1], D, "FCBias shapes must be 1x%d.", D); + } + } + + ctx->SetOutputDim("Out", {ins_dims[0][0], D}); + // fcout should be reshape when run since can not get lod in infershape + // explicit share the ref lod + ctx->ShareLoD("X", "Out", 0); +} + +framework::OpKernelType FusionSeqExpandConcatFCOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType(ctx.MultiInput("X")[0]->type()), + ctx.device_context()); +} + +void FusionSeqExpandConcatFCOpMaker::Make() { + AddInput("X", + "(LoDTensor) input LodDTensors, the first one must be have ref lod " + "for sequence expand, and the rest input should have same lod.") + .AsDuplicable(); + AddInput("FCWeight", "(Tensor) the weights of fc."); + AddInput("FCBias", "(Tensor, optional) the bias of fc.").AsDispensable(); + AddOutput("Out", "(LoDTensor) Output LodTensor."); + AddOutput( + "FCOut", + "(Tensor) the intermediate tensor to keep the result of fc." + "Shape is (N x D), where N is the batch size, D is the output dim of fc") + .AsIntermediate(); + AddAttr("fc_activation", + "(string, default: identity)" + "The activation for the result of fc." 
+ "`identity` by default.") + .SetDefault("identity") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Fusion Sequence expand + concat + fc Operator. + +All below conditions should be meet: + +The ref_level of seq_expand should be 0. + +The ref lod of seq_expand level is the first input of concat. + +The other inputs should have same lod and same batch size of ref lod. + +The seq len of other inputs should be 1. + +The concat axis should be 1. + +)DOC"); +} + +template +class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using DeviceContext = paddle::platform::CPUDeviceContext; + auto ins = ctx.MultiInput("X"); + auto* w = ctx.Input("FCWeight"); + auto* b = ctx.Input("FCBias"); + auto* out = ctx.Output("Out"); + auto* fc_out = ctx.Output("FCOut"); + + auto* ref_in = ins[0]; + auto ref_lod = ref_in->lod(); + auto in1_lod = ins[1]->lod(); + auto ref_dims = ref_in->dims(); // T x M0 + auto in1_dims = ins[1]->dims(); // N x M1 + auto w_dims = w->dims(); + const int N = ref_lod[0].size() - 1; + const int total_T = ref_dims[0]; + const int M0 = ref_dims[1]; + const int M1 = in1_dims[1]; + const int D = w_dims[1]; + + // some check and fcout should be reshape here + // since infershape can not get lod info + PADDLE_ENFORCE_EQ(ref_lod.size(), 1UL, "Only support input lod size is 1."); + PADDLE_ENFORCE_EQ(in1_lod.size(), 1UL, "Only support input lod size is 1."); + PADDLE_ENFORCE_EQ(in1_lod[0].size() - 1, N, + "Batch size of all inputs should be equal."); + PADDLE_ENFORCE_EQ(in1_lod[0][N], N, + "Seq_length of other inputs should be 1."); + PADDLE_ENFORCE_EQ(in1_dims[0], N, "input height should be batch size."); + for (size_t i = 2; i < ins.size(); ++i) { + PADDLE_ENFORCE_EQ(ins[i]->dims()[0], N, + "All other inputs height should be equal"); + PADDLE_ENFORCE_EQ(ins[i]->lod(), in1_lod, + "All other inputs should have same lod"); + } + fc_out->Resize({N, 
D}); + + std::function fc_act; + auto& fc_act_str = ctx.Attr("fc_activation"); + if (platform::jit::MayIUse(platform::jit::avx)) { + math::VecActivations act_functor; + fc_act = act_functor(fc_act_str); + } else { + math::VecActivations act_functor; + fc_act = act_functor(fc_act_str); + } + + const T* ref_in_data = ref_in->data(); + const T* in1_data = ins[1]->data(); + const T* w_data = w->data(); + T* out_data = out->mutable_data(ctx.GetPlace()); + T* fc_out_data = fc_out->mutable_data(ctx.GetPlace()); + + auto blas = math::GetBlas(ctx); + math::FCCompute(blas, total_T, D, M0, ref_in_data, w_data, + out_data, b ? b->data() : NULL); + w_data = w_data + M0 * D; + // first write on + blas.MatMul(N, D, M1, in1_data, w_data, fc_out_data); + w_data = w_data + M1 * D; + for (size_t i = 2; i < ins.size(); ++i) { + // add on + const T* in_data = ins[i]->data(); + const int K = ins[i]->dims()[1]; + blas.GEMM(CblasNoTrans, CblasNoTrans, N, D, K, static_cast(1), in_data, + K, w_data, D, static_cast(1), fc_out_data, D); + w_data = w_data + K * D; + } + T* cur_out_data = out_data; + for (int i = 0; i < N; ++i) { + int seq_len = ref_lod[0][i + 1] - ref_lod[0][i]; + T* src = fc_out_data + i * D; + for (int step = 0; step < seq_len; ++step) { + blas.VADD(D, cur_out_data, src, cur_out_data); + cur_out_data = cur_out_data + D; + } + } + fc_act(total_T * D, out_data, out_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fusion_seqexpand_concat_fc, ops::FusionSeqExpandConcatFCOp, + ops::FusionSeqExpandConcatFCOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL(fusion_seqexpand_concat_fc, + ops::FusionSeqExpandConcatFCOpKernel, + ops::FusionSeqExpandConcatFCOpKernel); diff --git a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h new file mode 100644 index 
0000000000000000000000000000000000000000..f78e820f603354944bd7fc23aff2d1d72e5ba750 --- /dev/null +++ b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusionSeqExpandConcatFCOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusionSeqExpandConcatFCOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index aa3e05b83b23569a4dd9c83294916e289f993abc..089b541a0a61adb5efda6b2e027c913d5808dff0 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -101,5 +101,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(gather_grad, ops::GatherGradOp); -REGISTER_OP_CPU_KERNEL(gather, 
ops::GatherOpKernel); -REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel); +REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, + ops::GatherOpKernel, ops::GatherOpKernel); +REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel); diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index f196e18fe122af9536230752096a2d90de8ab527..4cc2159d9f22809a640f82ad19415f3e5a2d9999 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -165,12 +165,13 @@ void ListenAndServOp::RunSyncLoop( recv_scope); VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; - rpc_service_->SetCond(distributed::kRequestGet); - rpc_service_->WaitBarrier(distributed::kRequestGet); - rpc_service_->ResetBarrierCounter(); // reset received sparse vars to avoid reuse it in the next mini-batch dynamic_cast(request_send_handler_.get()) ->ResetSparseVarRecorder(); + + rpc_service_->SetCond(distributed::kRequestGet); + rpc_service_->WaitBarrier(distributed::kRequestGet); + rpc_service_->ResetBarrierCounter(); } // while(true) } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index c1f0d44c5b5d5cc2d221736b28eabc0c0eea5b12..d7f0f3c6280db7d121bf8821ec6d578e22a33da6 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -73,3 +73,4 @@ if(WITH_GPU) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat) +cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) diff --git a/paddle/fluid/operators/math/compound_functors.h b/paddle/fluid/operators/math/compound_functors.h new file mode 100644 index 0000000000000000000000000000000000000000..1d32a9585b08a9d27730076d9f7baa6056270a42 
--- /dev/null +++ b/paddle/fluid/operators/math/compound_functors.h @@ -0,0 +1,185 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace operators { +namespace math { + +template +struct BinaryCompoundFunctor { + BinaryCompoundFunctor(const BinaryFunctor func1, const UnaryFunctor func2) + : func1_(func1), func2_(func2) {} + // Z = BinaryFunctor(X, UnaryFunctor(Y)) + + inline HOSTDEVICE T GetOut(T x, T y) { return func1_(x, func2_(y)); } + + inline HOSTDEVICE T GetOutUseIntermediateOut(T x, T intermediat_out) { + return func1_(x, intermediat_out); + } + + inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return func2_(y); } + + BinaryFunctor func1_; + UnaryFunctor func2_; +}; + +template +struct UnaryCompoundFunctor { + UnaryCompoundFunctor(const UnaryFunctor func1, const BinaryFunctor func2) + : func1_(func1), func2_(func2) {} + // Z = UnaryFunctor(BinaryFunctor(X, Y)) + + inline HOSTDEVICE T GetOut(T x, T y) { return func1_(func2_(x, y)); } + + inline HOSTDEVICE T GetOutUseIntermediateOut(T x, T intermediat_out) { + return func1_(intermediat_out); + } + + inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return func2_(x, y); } + + UnaryFunctor func1_; + BinaryFunctor func2_; +}; + +// FIXME(zcd): DBinaryFun and DUnaryFun have to method to get +// the dx, one is to use the 'out', and the other is not to use it. 
+// the former method will save the time of recomputing the +// 'out', but it must occupy the memory to store the 'out'. +// While the later method can avoid occupying this memory, +// but it must recompute the 'out'. +template +struct BinaryCompoundGradDxFunctor { + BinaryCompoundGradDxFunctor(const DBinaryFun &d_binary_fun, + const UnaryFun &unary_fun) + : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {} + + inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { + return dout * d_binary_fun_.Dx(x, unary_fun_(y)); + } + + inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { + return dout * d_binary_fun_.Dx(x, intermediate_out); + } + + private: + DBinaryFun d_binary_fun_; + UnaryFun unary_fun_; +}; + +template +struct BinaryCompoundGradDyFunctor { + BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun, + const UnaryFun &unary_fun, + const DUnaryFun &d_unary_fun) + : d_binary_fun_(d_binary_fun), + unary_fun_(unary_fun), + d_unary_fun_(d_unary_fun) {} + + inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { + return dout * d_binary_fun_.Dy(x, unary_fun_(y)) * d_unary_fun_(y); + } + + inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { + return dout * d_binary_fun_.Dy(x, intermediate_out) * + d_unary_fun_(y, intermediate_out); + } + + private: + DBinaryFun d_binary_fun_; + UnaryFun unary_fun_; + DUnaryFun d_unary_fun_; +}; + +template +struct UnaryCompoundGradDxFunctor { + UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun, + const BinaryFun &binary_fun, + const DBinaryFun &d_binary_fun) + : d_unary_fun_(d_unary_fun), + binary_fun_(binary_fun), + d_binary_fun_(d_binary_fun) {} + + inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { + T base; + if (Recomputation) { + base = dout * d_unary_fun_(binary_fun_(x, y)); + } else { + base = dout * d_unary_fun_(binary_fun_(x, y), out); + } + return base * d_binary_fun_.Dx(x, y); + } + + inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, 
T out, T dout) { + T base; + if (Recomputation) { + base = dout * d_unary_fun_(intermediate_out); + } else { + base = dout * d_unary_fun_(intermediate_out, out); + } + return base * d_binary_fun_.Dx(x, y); + } + + private: + DUnaryFun d_unary_fun_; + BinaryFun binary_fun_; + DBinaryFun d_binary_fun_; +}; + +template +struct UnaryCompoundGradDyFunctor { + UnaryCompoundGradDyFunctor(const DUnaryFun &d_unary_fun, + const BinaryFun &binary_fun, + const DBinaryFun &d_binary_fun) + : d_unary_fun_(d_unary_fun), + binary_fun_(binary_fun), + d_binary_fun_(d_binary_fun) {} + + inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { + T base; + if (Recomputation) { + base = dout * d_unary_fun_(binary_fun_(x, y)); + } else { + base = dout * d_unary_fun_(binary_fun_(x, y), out); + } + return base * d_binary_fun_.Dy(x, y); + } + + inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { + T base; + if (Recomputation) { + base = dout * d_unary_fun_(intermediate_out); + } else { + base = dout * d_unary_fun_(intermediate_out, out); + } + return base * d_binary_fun_.Dy(x, y); + } + + private: + DUnaryFun d_unary_fun_; + BinaryFun binary_fun_; + DBinaryFun d_binary_fun_; +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc index fbe7c2978385401b35765101c87387ff727be4e0..c3c5c160db358d39aa3f841a2b1646a21c91440e 100644 --- a/paddle/fluid/operators/math/concat.cc +++ b/paddle/fluid/operators/math/concat.cc @@ -48,16 +48,16 @@ class ConcatFunctor { auto cpu_place = boost::get(context.GetPlace()); // computation - for (int k = 0; k < out_rows; ++k) { - T* dst_ptr = output->data() + k * out_cols; - int col_idx = 0; - for (int j = 0; j < num; ++j) { - int col_len = input_cols[j]; - const T* src_prt = input[j].data() + k * col_len; - memory::Copy(cpu_place, dst_ptr + col_idx, cpu_place, src_prt, - sizeof(T) * col_len); - col_idx += col_len; + auto 
output_data = output->data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = input[j].data(); + for (int k = 0; k < out_rows; ++k) { + memory::Copy(cpu_place, output_data + k * out_cols + col_idx, cpu_place, + input_data + k * col_len, sizeof(T) * col_len); } + col_idx += col_len; } } }; diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu index 820e73e779720e4f76168e0a84a254ef645784ee..342379268be36cc5b532363e664f6e73990333e1 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat.cu @@ -177,6 +177,9 @@ class ConcatFunctor { dev_ins_data, dev_ins_col_data, static_cast(inputs_col.size()), out_row, out_col, output->data()); } + // Wait() must be called because `inputs_data` may be destructed before + // kernel ends + context.Wait(); } }; @@ -252,6 +255,9 @@ class ConcatGradFunctor { input.data(), in_row, in_col, dev_outs_col_data, static_cast(outputs_cols.size()), dev_out_gpu_data); } + // Wait() must be called because `outputs_data` may be destructed before + // kernel ends + context.Wait(); } }; diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 48c0da0e368a0fe6efcd758536e5659eeee26f7e..5693761e9ffd96b40040223b5498b63b0274bf0f 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -13,8 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include +#include #include #include "paddle/fluid/platform/cpu_info.h" +#ifdef __AVX__ +#include +#endif + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif namespace paddle { namespace operators { @@ -22,16 +31,161 @@ namespace math { #define SIGMOID_THRESHOLD_MIN -40.0 #define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 + +#define AVX_FLOAT_BLOCK 8 +#define AVX_DOUBLE_BLOCK 4 +#define AVX2_FLOAT_BLOCK 8 +#define AVX2_DOUBLE_BLOCK 4 +#define AVX512_FLOAT_BLOCK 16 +#define AVX512_DOUBLE_BLOCK 8 template -inline T sigmoid(T x) { - return 1. / (1. + exp(-x)); +inline void vec_exp(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } } template -inline T tanh(T x) { - return 2. * sigmoid(2. * x) - 1.; +inline void vec_scal(const int n, const T a, T* x) { + for (int i = 0; i < n; ++i) { + x[i] = a * x[i]; + } +} + +#ifdef PADDLE_WITH_MKLML +template <> +inline void vec_exp(const int n, const float* x, float* y) { + platform::dynload::vsExp(n, x, y); +} + +template <> +inline void vec_exp(const int n, const double* x, double* y) { + platform::dynload::vdExp(n, x, y); +} + +template <> +inline void vec_scal(const int n, const float a, float* x) { + platform::dynload::cblas_sscal(n, a, x, 1); +} + +template <> +inline void vec_scal(const int n, const double a, double* x) { + platform::dynload::cblas_dscal(n, a, x, 1); +} +#endif + +// MKL scal only support inplace, choose this if src and dst are not equal +template +inline void vec_scal(const int n, const T a, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = a * x[i]; + } +} + +template <> +inline void vec_scal(const int n, const float a, + const float* x, float* y) { +#ifdef __AVX__ + constexpr int block = AVX_FLOAT_BLOCK; + if (n < block) { + vec_scal(n, a, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 scalar = _mm256_set1_ps(a); + __m256 tmp; +#define 
MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_mul_ps(tmp, scalar); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + y[i] = a * x[i]; + } +#else + vec_scal(n, a, x, y); +#endif +} + +template <> +inline void vec_scal(const int n, const float a, + const float* x, float* y) { + vec_scal(n, a, x, y); +} + +template <> +inline void vec_scal(const int n, + const float a, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_scal(n, a, x, y); +} + +template +inline void vec_add_bias(const int n, const T a, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] + a; + } +} + +template <> +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { +#ifdef __AVX__ + constexpr int block = AVX_FLOAT_BLOCK; + if (n < block) { + vec_add_bias(n, a, x, y); + return; + } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 bias = _mm256_set1_ps(a); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_add_ps(tmp, bias); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step if src and dst are inplace + for (i = n - rest; i < n; ++i) { + y[i] = x[i] + a; + } +#else + vec_add_bias(n, a, x, y); +#endif +} + +template <> +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { + vec_add_bias(n, a, x, y); +} + +template <> +inline void vec_add_bias(const int n, + const float a, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_add_bias(n, a, x, y); } template @@ -45,18 +199,97 @@ inline void vec_sigmoid(const int n, const T* x, T* y) { const T min = SIGMOID_THRESHOLD_MIN; const T max = SIGMOID_THRESHOLD_MAX; for 
(int i = 0; i < n; ++i) { - T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); - y[i] = 1.0 / (1.0 + std::exp(-tmp)); + y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(0) - y[i]; + } + vec_exp(n, y, y); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(1) / (static_cast(1) + y[i]); + } +} + +template <> +inline void vec_sigmoid(const int n, const float* x, + float* y) { +#ifdef __AVX__ + constexpr int block = AVX_FLOAT_BLOCK; + if (n < block) { + vec_sigmoid(n, x, y); + return; } + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); + __m256 zeros = _mm256_setzero_ps(); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_max_ps(tmp, min); \ + tmp = _mm256_min_ps(tmp, max); \ + tmp = _mm256_sub_ps(zeros, tmp); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest != 0) { + // can not continue move step since the src and dst address could be equal + const float xmin = SIGMOID_THRESHOLD_MIN; + const float xmax = SIGMOID_THRESHOLD_MAX; + for (i = n - rest; i < n; ++i) { + y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? 
xmax : x[i])); + } + } + + vec_exp(n, y, y); + + __m256 ones = _mm256_set1_ps(1.0f); +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(y + i); \ + tmp = _mm256_add_ps(ones, tmp); \ + tmp = _mm256_div_ps(ones, tmp); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } +#undef MOVE_ONE_STEP + if (rest == 0) { + return; + } + // can not continue move step + for (i = n - rest; i < n; ++i) { + y[i] = 1.f / (1.f + y[i]); + } +#else + vec_sigmoid(n, x, y); +#endif +} + +template <> +inline void vec_sigmoid(const int n, const float* x, + float* y) { + vec_sigmoid(n, x, y); +} + +template <> +inline void vec_sigmoid(const int n, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_sigmoid(n, x, y); } template inline void vec_tanh(const int n, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = tanh(x[i]); - } + vec_scal(n, static_cast(2), x, y); + vec_sigmoid(n, y, y); + vec_scal(n, static_cast(2), y); + vec_add_bias(n, static_cast(-1), y, y); } +// TODO(TJ): make relu clip template inline void vec_relu(const int n, const T* x, T* y) { for (int i = 0; i < n; ++i) { @@ -64,24 +297,56 @@ inline void vec_relu(const int n, const T* x, T* y) { } } +template <> +inline void vec_relu(const int n, const float* x, + float* y) { +#ifdef __AVX__ + constexpr int block = AVX_FLOAT_BLOCK; + if (n < block * 4) { + vec_relu(n, x, y); + return; + } + + const int rest = n % block; + const int end = n - rest; + int i = 0; + __m256 zeros = _mm256_setzero_ps(); + __m256 tmp; +#define MOVE_ONE_STEP \ + tmp = _mm256_loadu_ps(x + i); \ + tmp = _mm256_max_ps(tmp, zeros); \ + _mm256_storeu_ps(y + i, tmp) + for (i = 0; i < end; i += block) { + MOVE_ONE_STEP; + } + if (rest == 0) { + return; + } + i = n - block; + MOVE_ONE_STEP; +#undef MOVE_ONE_STEP + +#else + vec_relu(n, x, y); +#endif +} + template <> inline void vec_relu(const int n, const float* x, float* y) { - // TODO(TJ): complete me - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0 ? 
x[i] : 0; - } + vec_relu(n, x, y); } template <> -inline void vec_relu(const int n, const float* x, - float* y) { - // TODO(TJ): complete me - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0 ? x[i] : 0; - } +inline void vec_relu(const int n, + const float* x, + float* y) { + // TODO(TJ): enable me + vec_relu(n, x, y); } +// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary + template class VecActivations { public: @@ -96,7 +361,7 @@ class VecActivations { } else if (type == "identity" || type == "") { return vec_identity; } - PADDLE_THROW("Not support type %s.", type); + LOG(FATAL) << "Not support type: " << type; } }; diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3ce66f49ed8354c49e8af26ca6eb48fef654a40b --- /dev/null +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -0,0 +1,203 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/operators/math/cpu_vec.h" + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} +constexpr int repeat = 1000; + +template +inline T _sigmoid(T x) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + T tmp = (x < min) ? min : ((x > max) ? max : x); + return static_cast(1) / (static_cast(1) + std::exp(-tmp)); +} + +template +inline T _tanh(T x) { + return static_cast(2) * _sigmoid(static_cast(2) * x) - + static_cast(1); +} + +template +void ref_sigmoid(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = _sigmoid(x[i]); + } +} + +template +void ref_tanh(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = _tanh(x[i]); + } +} +template +void ref_relu(const int n, const T* x, T* y) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? 
x[i] : 0; + } +} + +template +void RandomVec(const int n, T* a) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + const T lower = static_cast(-20.f); + const T upper = static_cast(20.f); + for (int i = 0; i < n; ++i) { + a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + +template +void TestAndBench(const int n, std::function tgt, + std::function ref) { + std::vector x(n); + std::vector ytgt(n), yref(n); + RandomVec(n, x.data()); + + const T* x_data = x.data(); + T* ytgt_data = ytgt.data(); + T* yref_data = yref.data(); + auto st = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + tgt(n, x_data, ytgt_data); + } + auto mt = GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + ref(n, x_data, yref_data); + } + auto et = GetCurrentUS(); + + VLOG(3) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat + << " us, tgt takes: " << (mt - st) / repeat; + for (int i = 0; i < n; ++i) { + EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3); + } +} + +TEST(CpuVecTest, sigmoid) { + namespace jit = paddle::platform::jit; + using namespace paddle::operators::math; // NOLINT + for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { + TestAndBench(sz, vec_sigmoid, ref_sigmoid); + TestAndBench(sz, vec_sigmoid, ref_sigmoid); + TestAndBench(sz, vec_sigmoid, ref_sigmoid); + TestAndBench(sz, vec_sigmoid, + ref_sigmoid); + } + TestAndBench(30, vec_sigmoid, ref_sigmoid); +} + +TEST(CpuVecTest, tanh) { + namespace jit = paddle::platform::jit; + using namespace paddle::operators::math; // NOLINT + for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, + ref_tanh); + } + TestAndBench(30, vec_tanh, ref_tanh); +} + +TEST(CpuVecTest, relu) { + namespace jit = paddle::platform::jit; + using namespace paddle::operators::math; // NOLINT + for (auto sz 
: {1, 2, 15, 16, 30, 32, 128, 200, 512}) { + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, + ref_relu); + } + TestAndBench(30, vec_relu, ref_relu); +} + +template +void TestInplace(const int n, std::function tgt, + std::function ref) { + std::vector x(n); + std::vector ytgt(n), yref(n); + RandomVec(n, x.data()); + + const T* x_data = x.data(); + T* yref_data = yref.data(); + T* ytgt_data = ytgt.data(); + std::memcpy(yref_data, x_data, sizeof(T) * n); + std::memcpy(ytgt_data, x_data, sizeof(T) * n); + + ref(n, yref_data, yref_data); + tgt(n, ytgt_data, ytgt_data); + + for (int i = 0; i < n; ++i) { + EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3); + } +} + +TEST(CpuVecTest, inplace_sigmoid) { + namespace jit = paddle::platform::jit; + using namespace paddle::operators::math; // NOLINT + for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { + TestInplace(sz, vec_sigmoid, ref_sigmoid); + TestInplace(sz, vec_sigmoid, ref_sigmoid); + TestInplace(sz, vec_sigmoid, ref_sigmoid); + TestInplace(sz, vec_sigmoid, + ref_sigmoid); + } + TestInplace(30, vec_sigmoid, ref_sigmoid); +} + +TEST(CpuVecTest, inplace_tanh) { + namespace jit = paddle::platform::jit; + using namespace paddle::operators::math; // NOLINT + for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, + ref_tanh); + } + TestInplace(30, vec_tanh, ref_tanh); +} + +TEST(CpuVecTest, inplace_relu) { + namespace jit = paddle::platform::jit; + using namespace paddle::operators::math; // NOLINT + for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, + ref_relu); + } + TestInplace(30, vec_relu, ref_relu); +} diff --git 
a/paddle/fluid/operators/math/functors.h b/paddle/fluid/operators/math/functors.h index ad2f49ccbf5ff37d33cc9e71c1a683571f4f8137..ddb01cdfc084f5ba2e9e573be461389f46fbe03f 100644 --- a/paddle/fluid/operators/math/functors.h +++ b/paddle/fluid/operators/math/functors.h @@ -18,6 +18,19 @@ namespace paddle { namespace operators { namespace math { +// MulFunctor +template +struct MulFunctor { + // out = x * y; + inline HOSTDEVICE T operator()(T x, T y) { return x * y; } +}; + +template +struct MulGradFunctor { + inline HOSTDEVICE T Dx(T x, T y) { return y; } + inline HOSTDEVICE T Dy(T x, T y) { return x; } +}; + // AddFunctor template struct AddFunctor { @@ -27,9 +40,8 @@ struct AddFunctor { template struct AddGradFunctor { - inline HOSTDEVICE T operator()(T x, T y) { return 1; } - - inline HOSTDEVICE T operator()(T x, T y, T out) const { return 1; } + inline HOSTDEVICE T Dx(T x, T y) { return 1; } + inline HOSTDEVICE T Dy(T x, T y) { return 1; } }; template diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index c3387be6daa3bd34a6e3410ced23fce5d65f2cf7..5923792902a81521256de300f77955f1ea3d16c6 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -41,7 +41,8 @@ template struct SetConstant; template struct Transpose; \ template struct Transpose; \ template struct Transpose; \ - template struct Transpose; + template struct Transpose; \ + template struct Transpose; DEFINE_CPU_TRANS(1); DEFINE_CPU_TRANS(2); @@ -54,7 +55,7 @@ struct TensorSetConstantCPU { TensorSetConstantCPU(framework::Tensor* tensor, float value) : tensor_(tensor), value_(value) {} template - void operator()() const { + void apply() const { auto cpu = platform::CPUPlace(); auto* begin = tensor_->mutable_data(cpu); std::fill(begin, begin + tensor_->numel(), static_cast(value_)); diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 
d5af718723e8d44da0971ea7756b8c36e771cca2..79b7538ad05b0ff348b8264d50b63211b5254e80 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -33,10 +33,11 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; DEFINE_GPU_TRANS(1); DEFINE_GPU_TRANS(2); @@ -51,7 +52,7 @@ struct TensorSetConstantGPU { : context_(context), tensor_(tensor), value_(value) {} template - void operator()() const { + void apply() const { SetConstant functor; functor(reinterpret_cast(context_), tensor_, static_cast(value_)); diff --git a/paddle/fluid/operators/math/padding.h b/paddle/fluid/operators/math/padding.h new file mode 100644 index 0000000000000000000000000000000000000000..3ae25eae98b25bca015ec4383c7126eb81e52b8a --- /dev/null +++ b/paddle/fluid/operators/math/padding.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +template +using EigenTensor = framework::EigenTensor; + +template +void PadFunction(const framework::ExecutionContext& context, + const std::vector& pads, const framework::Tensor& src, + T pad_value, framework::Tensor* out) { + Eigen::array, D> paddings; + + for (size_t i = 0; i < paddings.size(); ++i) { + paddings[i].first = pads[i * 2]; + paddings[i].second = pads[i * 2 + 1]; + } + + auto src_tensor = EigenTensor::From(src); + auto out_tensor = EigenTensor::From(*out); + + auto& place = + *context.template device_context().eigen_device(); + out_tensor.device(place) = src_tensor.pad(paddings, pad_value); +} + +template +void PadGradFunction(const framework::ExecutionContext& context, + const std::vector& pads, const framework::Tensor& src, + framework::Tensor* d_out) { + Eigen::array, D> paddings; + for (size_t i = 0; i < paddings.size(); ++i) { + paddings[i].first = -pads[i * 2]; + paddings[i].second = -pads[i * 2 + 1]; + } + + auto d_out_tensor = EigenTensor::From(*d_out); + auto src_tensor = EigenTensor::From(src); + auto& place = + *context.template device_context().eigen_device(); + d_out_tensor.device(place) = src_tensor.pad(paddings, 0); +} + +template +void PaddingFunctor(int rank, const framework::ExecutionContext& context, + const std::vector& pads, T pad_value, + const framework::Tensor& src, framework::Tensor* out) { + switch (rank) { + case 1: + PadFunction(context, pads, src, pad_value, out); + break; + case 2: + PadFunction(context, pads, src, pad_value, out); + break; + case 3: + PadFunction(context, pads, src, pad_value, out); + break; + case 4: + PadFunction(context, pads, src, pad_value, out); + break; + case 5: + PadFunction(context, pads, src, pad_value, out); + break; + case 6: + PadFunction(context, pads, src, pad_value, out); + break; + default: + PADDLE_THROW( + "PadOp only support tensors 
with no more than 6 dimensions."); + } +} + +template +void PaddingGradFunctor(int rank, const framework::ExecutionContext& context, + const std::vector& pads, + const framework::Tensor& src, framework::Tensor* out) { + switch (rank) { + case 1: + PadGradFunction(context, pads, src, out); + break; + case 2: + PadGradFunction(context, pads, src, out); + break; + case 3: + PadGradFunction(context, pads, src, out); + break; + case 4: + PadGradFunction(context, pads, src, out); + break; + case 5: + PadGradFunction(context, pads, src, out); + break; + case 6: + PadGradFunction(context, pads, src, out); + break; + default: + PADDLE_THROW( + "PadOp only support tensors with no more than 6 dimensions."); + } +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc index b546b8728217ed6013247555dcd5d7180ddeae74..e4ffeedb5a0061dd60ca3a30aa9928ef8b05887c 100644 --- a/paddle/fluid/operators/math/sequence2batch.cc +++ b/paddle/fluid/operators/math/sequence2batch.cc @@ -38,13 +38,14 @@ class CopyMatrixRowsFunctor { auto width = dst_dims[1]; auto* src_data = src.data(); auto* dst_data = dst->data(); - for (int i = 0; i < height; ++i) { - if (is_src_index) { - memcpy(dst_data + i * width, src_data + index[i] * width, - width * sizeof(T)); - } else { - memcpy(dst_data + index[i] * width, src_data + i * width, - width * sizeof(T)); + const int sz = width * sizeof(T); + if (is_src_index) { + for (int i = 0; i < height; ++i) { + memcpy(dst_data + i * width, src_data + index[i] * width, sz); + } + } else { + for (int i = 0; i < height; ++i) { + memcpy(dst_data + index[i] * width, src_data + i * width, sz); } } } diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index d63c6c4ed55331235188c1c750468d4e75b9b7f2..25f06a25a0638cbb394df58d35f88307941d117f 100644 --- 
a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -18,65 +18,86 @@ namespace paddle { namespace operators { namespace math { +template +void CopyValidData(framework::Tensor* dst_tensor, + const framework::Tensor* src_tensor, + const framework::Vector& seq_offsets, + int pad_seq_len, int step_width, bool norm_by_len, + CopyType type, PadLayout layout) { + int seq_num = seq_offsets.size() - 1; + const T* src_data = src_tensor->data(); + T* dst_data = dst_tensor->data(); + + int seq_cpy_gap = step_width; + int pad_cpy_gap = + layout == kBatchLengthWidth ? step_width : seq_num * step_width; + for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) { + int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; + PADDLE_ENFORCE_GE( + pad_seq_len, valid_seq_len, + "The padded sequence length can not be less than its original length."); + int seq_data_offset = seq_offsets[seq_idx] * step_width; + int pad_data_offset = layout == kBatchLengthWidth + ? seq_idx * pad_seq_len * step_width + : seq_idx * step_width; + float scale = 1.0f / static_cast(valid_seq_len); + + for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) { + const T* src = + src_data + (type == kSeqToPad ? seq_data_offset : pad_data_offset); + T* dst = + dst_data + (type == kSeqToPad ? 
pad_data_offset : seq_data_offset); + memcpy(dst, src, step_width * sizeof(T)); + if (norm_by_len) { + for (int i = 0; i < step_width; ++i) { + *(dst + i) *= scale; + } + } + seq_data_offset += seq_cpy_gap; + pad_data_offset += pad_cpy_gap; + } + } +} + template class PaddingLoDTensorFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& seq, framework::Tensor* padding, - bool norm_by_times) { - auto lod = seq.lod(); - PADDLE_ENFORCE_GT(lod.size(), 0UL, - "The LoD of LoDTensor seq should not be null."); - - const size_t level = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - auto seq_dims = seq.dims(); - PADDLE_ENFORCE_EQ(seq_dims[0], - static_cast(abs_offset_lod[level].back()), - "The first dimension of LoDTensor seq should be " - "equal to the sum of all sequences's length."); - - auto padding_dims = padding->dims(); - PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, - "The input padding should be a 3-D Tensor of shape " - "[max_sequence_length, num_sequences, sequence_width]."); - - const int64_t max_sequence_length = MaximumSequenceLength(lod, level); - PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, - "The first dimension of Tensor padding should be the " - "maximum length of all sequences in LoDTensor seq."); - - const int64_t num_sequences = abs_offset_lod[level].size() - 1; - PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, - "The second dimension of Tensor padding should be the " - "number of sequences in LoDTensor seq."); - - const int64_t sequence_width = seq.numel() / seq_dims[0]; - PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, - "The third dimension of Tensor padding should be the " - "width of sequence in LoDTensor seq."); - - const T* seq_data = seq.data(); - T* padding_data = padding->data(); - for (int64_t i = 0; i < max_sequence_length; ++i) { - for (int64_t j = 0; j < num_sequences; ++j) { - int64_t start_pos = abs_offset_lod[level][j]; - int64_t sequence_length = 
abs_offset_lod[level][j + 1] - start_pos; - if (i < sequence_length) { - // i > 0 => sequence_length > 0 - T scale = - norm_by_times ? (1.0f / static_cast(sequence_length)) : 1.0f; - for (int64_t k = 0; k < sequence_width; ++k) { - padding_data[(i * num_sequences + j) * sequence_width + k] = - seq_data[(start_pos + i) * sequence_width + k] * scale; - } - } else { - memset(padding_data + (i * num_sequences + j) * sequence_width, 0, - sequence_width * sizeof(T)); - } + const framework::LoDTensor& seq_tensor, + framework::LoDTensor* pad_tensor, + const framework::LoDTensor& pad_value, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth) { + auto seq_lod = seq_tensor.lod(); + const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; + const auto& seq_tensor_dims = seq_tensor.dims(); + const auto& pad_tensor_dims = pad_tensor->dims(); + if (pad_seq_len == -1) { + pad_seq_len = MaximumSequenceLength(seq_offsets); + } + int step_width = seq_tensor.numel() / seq_tensor_dims[0]; + + CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, + step_width, layout); + PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, + "The numel of 'pad_value' can only be 1 or be equal to the " + "'step_width'."); + + // fill padding value + T* pad_data = pad_tensor->data(); + const T* pad_value_data = pad_value.data(); + if (pad_value.numel() == 1) { + for (int i = 0; i < pad_tensor->numel(); ++i) { + pad_data[i] = *pad_value_data; + } + } else { + for (int i = 0; i < pad_tensor->numel(); i += step_width) { + memcpy(pad_data + i, pad_value_data, step_width * sizeof(T)); } } + + CopyValidData(pad_tensor, &seq_tensor, seq_offsets, pad_seq_len, + step_width, norm_by_times, kSeqToPad, layout); } }; @@ -84,62 +105,35 @@ template class UnpaddingLoDTensorFunctor { public: void operator()(const platform::CPUDeviceContext& context, - framework::LoDTensor* seq, const framework::Tensor& 
padding, - bool norm_by_times) { - auto lod = seq->lod(); - PADDLE_ENFORCE_GT(lod.size(), 0UL, - "The LoD of LoDTensor seq should not be null."); - - const size_t level = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - auto seq_dims = seq->dims(); - PADDLE_ENFORCE_EQ(seq_dims[0], - static_cast(abs_offset_lod[level].back()), - "The first dimension of LoDTensor seq should be " - "equal to the sum of all sequences's length."); - - auto padding_dims = padding.dims(); - PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, - "The input padding should be a 3-D Tensor of shape " - "[max_sequnece_length, num_sequences, sequence_width]."); - - const int64_t max_sequence_length = MaximumSequenceLength(lod, level); - PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, - "The first dimension of Tensor padding should be " - "the maximum length of all sequences in LoDTensor seq."); - - const int64_t num_sequences = abs_offset_lod[level].size() - 1; - PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, - "The second dimension of Tensor padding should be " - "the number of sequences in LoDTensor seq."); - - const int64_t sequence_width = seq->numel() / seq_dims[0]; - PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, - "The third dimension of Tensor padding should be the " - "width of sequence in LoDTensor seq."); - - const T* padding_data = padding.data(); - T* seq_data = seq->data(); - for (int64_t i = 0; i < num_sequences; ++i) { - int64_t start_pos = abs_offset_lod[level][i]; - int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos; - for (int64_t j = 0; j < sequence_length; ++j) { - // sequence_width > j > 0 - T scale = - norm_by_times ? 
(1.0f / static_cast(sequence_length)) : 1.0f; - for (int64_t k = 0; k < sequence_width; ++k) { - seq_data[(start_pos + j) * sequence_width + k] = - padding_data[(j * num_sequences + i) * sequence_width + k] * - scale; - } - } + const framework::LoDTensor& pad_tensor, + framework::LoDTensor* seq_tensor, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth) { + auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level]; + const auto& seq_tensor_dims = seq_tensor->dims(); + const auto& pad_tensor_dims = pad_tensor.dims(); + if (pad_seq_len == -1) { + pad_seq_len = MaximumSequenceLength(seq_offsets); } + int step_width = seq_tensor->numel() / seq_tensor_dims[0]; + + CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, + step_width, layout); + + CopyValidData(seq_tensor, &pad_tensor, seq_offsets, pad_seq_len, + step_width, norm_by_times, kPadToSeq, layout); } }; +template class PaddingLoDTensorFunctor; +template class PaddingLoDTensorFunctor; template class PaddingLoDTensorFunctor; +template class PaddingLoDTensorFunctor; + +template class UnpaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; template class UnpaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 0956a0c17d387f4a174c7ed4e9b1b1f816dcf4ae..035e10dcbe4e2083723e47d7dda75ce267a9f141 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -19,41 +19,32 @@ namespace paddle { namespace operators { namespace math { -template -__global__ void SequencePaddingKernel(T* padding, T* sequence, - const size_t* sequence_start_positions, - const size_t sequence_width, - const size_t max_sequence_length, - const size_t num_sequences) { - size_t padding_idx = blockIdx.y; - size_t 
start_pos = sequence_start_positions[padding_idx]; - size_t sequence_length = - sequence_start_positions[padding_idx + 1] - start_pos; - - size_t sequence_idx = blockIdx.x * blockDim.y + threadIdx.y; - size_t padding_base_idx = - (sequence_idx * num_sequences + padding_idx) * sequence_width; - size_t sequence_base_idx = (start_pos + sequence_idx) * sequence_width; - - if (sequence_idx < sequence_length) { - T scale = NormByTimes ? (1.0f / static_cast(sequence_length)) : 1.0f; - if (Padding) { - /* sequence -> padding */ - for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { - padding[padding_base_idx + i] = scale * sequence[sequence_base_idx + i]; - } - } else { - /* padding -> sequence */ - for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { - sequence[sequence_base_idx + i] = scale * padding[padding_base_idx + i]; - } +template +__global__ void SequencePaddingKernel( + T* dst, const T* src, const T* pad_value, bool is_constant_pad, + const size_t* seq_offsets, const size_t seq_num, const size_t pad_seq_len, + const size_t step_width, bool norm_by_len, const PadLayout layout) { + size_t seq_idx = blockIdx.y; + size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; + + size_t step_idx = blockIdx.x * blockDim.y + threadIdx.y; + size_t seq_data_offset = (seq_offsets[seq_idx] + step_idx) * step_width; + size_t pad_data_offset = layout == kBatchLengthWidth + ? (seq_idx * pad_seq_len + step_idx) * step_width + : (step_idx * seq_num + seq_idx) * step_width; + + T* dst_data = dst + (Type == kSeqToPad ? pad_data_offset : seq_data_offset); + const T* src_data = + src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset); + + if (step_idx < seq_len) { + float scale = norm_by_len ? 
(1.0f / static_cast(seq_len)) : 1.0f; + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { + dst_data[i] = scale * src_data[i]; } - } else if (sequence_idx < max_sequence_length) { - if (Padding) { - /* sequence -> padding */ - for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { - padding[padding_base_idx + i] = 0; - } + } else if (step_idx < pad_seq_len && Type == kSeqToPad) { + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { + dst_data[i] = is_constant_pad ? pad_value[0] : pad_value[i]; } } } @@ -62,74 +53,59 @@ template class PaddingLoDTensorFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::LoDTensor& seq, framework::Tensor* padding, - bool norm_by_times) { - auto lod = seq.lod(); - PADDLE_ENFORCE_GT(lod.size(), 0UL, - "The lod of LoDTensor seq should not be null."); - - const size_t level = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - auto seq_dims = seq.dims(); - PADDLE_ENFORCE_EQ(seq_dims[0], - static_cast(abs_offset_lod[level].back()), - "The first dimension of LoDTensor seq should be " - "equal to the sum of all sequences's length."); - - auto padding_dims = padding->dims(); - PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, - "The input padding should be a 3-D Tensor of shape " - "[max_sequence_length, num_sequences, sequence_width]."); - - int64_t max_sequence_length = MaximumSequenceLength(lod, level); - PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, - "The first dimension of Tensor padding should be the " - "maximum length of all sequences in LoDTensor seq."); - - const int64_t num_sequences = abs_offset_lod[level].size() - 1; - PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, - "The second dimension of Tensor padding should be the " - "number of sequences in LoDTensor seq."); - - const int64_t sequence_width = seq.numel() / seq_dims[0]; - PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, - "The third dimension of Tensor padding 
should be the " - "width of sequence in LoDTensor seq."); - - if (!norm_by_times && num_sequences == 1UL) { - TensorCopy(seq, context.GetPlace(), context, padding); - padding->Resize(padding_dims); + const framework::LoDTensor& seq_tensor, + framework::LoDTensor* pad_tensor, + const framework::LoDTensor& pad_value, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth) { + auto seq_lod = seq_tensor.lod(); + const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; + const auto& seq_tensor_dims = seq_tensor.dims(); + const auto& pad_tensor_dims = pad_tensor->dims(); + int max_seq_len = MaximumSequenceLength(seq_offsets); + if (pad_seq_len == -1) { + pad_seq_len = max_seq_len; + } + PADDLE_ENFORCE_GE(pad_seq_len, max_seq_len, + "The pad_seq_len must be equal to or greater than the " + "original max sequence length."); + int step_width = seq_tensor.numel() / seq_tensor_dims[0]; + int seq_num = seq_offsets.size() - 1; + + CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, + step_width, layout); + PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, + "The numel of 'pad_value' can only be 1 or be equal to the " + "'step_width'."); + + if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) { + TensorCopy(seq_tensor, context.GetPlace(), context, pad_tensor); + pad_tensor->Resize(pad_tensor_dims); return; } - const int64_t kBlockSize = 512; + const int kBlockSize = 512; /* At least use 32 threads to copy sequence_width elements, * and at least 8 elements for each thread. 
*/ size_t block_dim_x = - std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); size_t block_dim_y = kBlockSize / block_dim_x; dim3 threads(block_dim_x, block_dim_y); - size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; - size_t grid_dim_y = num_sequences; + size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = seq_num; dim3 grid(grid_dim_x, grid_dim_y); - const T* seq_data = seq.data(); - T* padding_data = padding->data(); - if (norm_by_times) { - SequencePaddingKernel<<>>( - padding_data, const_cast(seq_data), - abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, - max_sequence_length, num_sequences); - } else { - SequencePaddingKernel<<>>( - padding_data, const_cast(seq_data), - abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, - max_sequence_length, num_sequences); - } + const T* seq_data = seq_tensor.data(); + T* pad_data = pad_tensor->data(); + const T* pad_value_data = pad_value.data(); + + SequencePaddingKernel<<>>( + pad_data, seq_data, pad_value_data, pad_value.numel() == 1, + seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, + step_width, norm_by_times, layout); } }; @@ -137,79 +113,62 @@ template class UnpaddingLoDTensorFunctor { public: void operator()(const platform::CUDADeviceContext& context, - framework::LoDTensor* seq, const framework::Tensor& padding, - bool norm_by_times) { - auto lod = seq->lod(); - PADDLE_ENFORCE_GT(lod.size(), 0UL, - "The lod of LoDTensor seq should not be null."); - - const size_t level = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - auto seq_dims = seq->dims(); - PADDLE_ENFORCE_EQ(seq_dims[0], - static_cast(abs_offset_lod[level].back()), - "The first dimension of LoDTensor seq should be " - "equal to the sum of all sequences's length."); - - auto padding_dims = padding.dims(); - 
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, - "The input padding should be a 3-D Tensor of shape " - "[max_sequnece_length, num_sequences, sequence_width]."); - - int64_t max_sequence_length = MaximumSequenceLength(lod, level); - PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, - "The first dimension of Tensor padding should be " - "the maximum length of all sequences in LoDTensor seq."); - - const int64_t num_sequences = abs_offset_lod[level].size() - 1; - PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, - "The second dimension of Tensor padding should be " - "the number of sequences in LoDTensor seq."); - - const int64_t sequence_width = seq->numel() / seq_dims[0]; - PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, - "The third dimension of Tensor padding should be the " - "width of sequence in LoDTensor seq."); - - if (!norm_by_times && num_sequences == 1UL) { - TensorCopy(padding, context.GetPlace(), context, seq); - seq->Resize(seq_dims); + const framework::LoDTensor& pad_tensor, + framework::LoDTensor* seq_tensor, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth) { + auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level]; + const auto& seq_tensor_dims = seq_tensor->dims(); + const auto& pad_tensor_dims = pad_tensor.dims(); + int max_seq_len = MaximumSequenceLength(seq_offsets); + if (pad_seq_len == -1) { + pad_seq_len = max_seq_len; + } + int step_width = seq_tensor->numel() / seq_tensor_dims[0]; + int seq_num = seq_offsets.size() - 1; + + CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, + step_width, layout); + + if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) { + TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor); + seq_tensor->Resize(seq_tensor_dims); return; } - const int64_t kBlockSize = 512; + const int kBlockSize = 512; /* At least use 32 threads to copy sequence_width elements, * and at least 8 elements 
for each thread. */ size_t block_dim_x = - std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); size_t block_dim_y = kBlockSize / block_dim_x; dim3 threads(block_dim_x, block_dim_y); - size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; - size_t grid_dim_y = num_sequences; + size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = seq_num; dim3 grid(grid_dim_x, grid_dim_y); - const T* padding_data = padding.data(); - T* seq_data = seq->data(); - if (norm_by_times) { - SequencePaddingKernel<<>>( - const_cast(padding_data), seq_data, - abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, - max_sequence_length, num_sequences); - } else { - SequencePaddingKernel<<>>( - const_cast(padding_data), seq_data, - abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width, - max_sequence_length, num_sequences); - } + const T* pad_data = pad_tensor.data(); + T* seq_data = seq_tensor->data(); + + SequencePaddingKernel<<>>( + seq_data, pad_data, nullptr, false, + seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, + step_width, norm_by_times, layout); } }; +template class PaddingLoDTensorFunctor; +template class PaddingLoDTensorFunctor; template class PaddingLoDTensorFunctor; +template class PaddingLoDTensorFunctor; + +template class UnpaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; template class UnpaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h index b56e6db1ebdac1a00561c07845c03bb8fbd8d35a..e752aa58979dddba4d010071d2c4b5dc3e0c6756 100644 --- a/paddle/fluid/operators/math/sequence_padding.h +++ b/paddle/fluid/operators/math/sequence_padding.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -22,17 +23,33 @@ namespace paddle { namespace operators { namespace math { -inline static size_t MaximumSequenceLength(const framework::LoD& lod, - const size_t level) { - const size_t num_sequences = lod[level].size() - 1; - size_t max_sequence_length = 0; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - for (size_t i = 0; i < num_sequences; ++i) { - max_sequence_length = - std::max(max_sequence_length, - abs_offset_lod[level][i + 1] - abs_offset_lod[level][i]); +enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth }; + +enum CopyType { kSeqToPad, kPadToSeq }; + +inline static size_t MaximumSequenceLength( + const framework::Vector& seq_offset) { + size_t seq_num = seq_offset.size() - 1; + size_t max_seq_len = 0; + for (size_t i = 0; i < seq_num; ++i) { + max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]); } - return max_sequence_length; + return max_seq_len; +} + +inline static void CheckDims(const framework::DDim& seq_tensor_dims, + const framework::DDim& pad_tensor_dims, + const framework::Vector& seq_offset, + int64_t padded_seq_len, int64_t step_width, + const PadLayout& layout) { + PADDLE_ENFORCE_EQ(static_cast(seq_tensor_dims[0]), seq_offset.back(), + "Value of 1st dimension of the sequence tensor should be " + "equal to sum of lengths of all sequences."); + + PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || + seq_tensor_dims.size() == pad_tensor_dims.size(), + "pad_tensor's rank should be 1 greater than seq_tensor's " + "rank, or be equal with it."); } /* @@ -64,15 +81,22 @@ inline static size_t MaximumSequenceLength(const framework::LoD& lod, template class PaddingLoDTensorFunctor { public: - void operator()(const DeviceContext& context, const framework::LoDTensor& seq, - framework::Tensor* padding, bool norm_by_times); + void operator()(const DeviceContext& 
context, + const framework::LoDTensor& seq_tensor, + framework::LoDTensor* pad_tensor, + const framework::LoDTensor& pad_value, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth); }; template class UnpaddingLoDTensorFunctor { public: - void operator()(const DeviceContext& context, framework::LoDTensor* seq, - const framework::Tensor& padding, bool norm_by_times); + void operator()(const DeviceContext& context, + const framework::LoDTensor& pad_tensor, + framework::LoDTensor* seq_tensor, int pad_seq_len = -1, + int lod_level = 0, bool norm_by_times = false, + const PadLayout layout = kBatchLengthWidth); }; } // namespace math diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index b0c201db0ccbe81d8f57cd984d2cdfd2f6a48f25..4f61b1029c65aedaf4fce771866964fe1d0d6112 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -23,7 +23,9 @@ void TestSequencePadding(const paddle::framework::LoD& lod, paddle::framework::LoDTensor cpu_seq_back; paddle::framework::LoDTensor seq; paddle::framework::LoDTensor seq_back; - paddle::framework::Tensor padding; + paddle::framework::LoDTensor padding; + paddle::framework::LoDTensor cpu_pad_value; + paddle::framework::LoDTensor pad_value; const size_t level = lod.size() - 1; auto seq_dims = @@ -46,20 +48,33 @@ void TestSequencePadding(const paddle::framework::LoD& lod, } const size_t max_sequence_length = - paddle::operators::math::MaximumSequenceLength(lod, level); + paddle::operators::math::MaximumSequenceLength(lod[level]); const size_t num_sequences = lod[level].size() - 1; auto padding_dims = paddle::framework::make_ddim({static_cast(max_sequence_length), static_cast(num_sequences), static_cast(sequence_width)}); + padding.mutable_data(padding_dims, *place); + + T* pad_value_data = + cpu_pad_value.mutable_data({1}, 
paddle::platform::CPUPlace()); + *pad_value_data = static_cast(0); + if (paddle::platform::is_cpu_place(*place)) { + pad_value = cpu_pad_value; + } else { + TensorCopySync(cpu_pad_value, *place, &pad_value); + } + paddle::operators::math::PaddingLoDTensorFunctor()( - *context, seq, &padding, false); + *context, seq, &padding, pad_value, -1, 0, false, + paddle::operators::math::kLengthBatchWidth); seq_back.set_lod(lod); seq_back.mutable_data(seq_dims, *place); paddle::operators::math::UnpaddingLoDTensorFunctor()( - *context, &seq_back, padding, false); + *context, padding, &seq_back, -1, 0, false, + paddle::operators::math::kLengthBatchWidth); if (paddle::platform::is_cpu_place(*place)) { cpu_seq_back = seq_back; diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index 625065692c1f32c89d9e566d00051e237ac9a3af..59d8b9b8a8d554eb16826712ff634eed5df2d648 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -41,7 +41,7 @@ struct OneHotOpCUDAFunctor { : in_(in), out_(out), depth_(depth), ctx_(ctx) {} template - void operator()() const { + void apply() const { auto* p_in_data = in_->data(); auto numel = in_->numel(); auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h index 7e77f25089c4bd0297b0eb5a0ed7555cc0af5a9f..1ebd2676496940ff8f90caaaded5c8227bd7ae78 100644 --- a/paddle/fluid/operators/one_hot_op.h +++ b/paddle/fluid/operators/one_hot_op.h @@ -31,7 +31,7 @@ struct OneHotOpFunctor { : in_(in), out_(out), depth_(depth), ctx_(ctx) {} template - void operator()() const { + void apply() const { auto* p_in_data = in_->data(); auto numel = in_->numel(); auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a706d05fd7c35ef993f5199f0f893622cb863c5d --- 
/dev/null +++ b/paddle/fluid/operators/pad2d_op.cc @@ -0,0 +1,584 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +void Pad2DConstNCHW(const T* in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, T value, + T* out_data) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + out_data[out_h * out_width + out_w] = + (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) + ? 
value + : in_data[in_h * in_width + in_w]; + } + } + in_data += in_height * in_width; + out_data += out_height * out_width; + } + } +} + +template +void Pad2DConstNHWC(const T* in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, T value, + T* out_data) { /* Constant-mode pad, NHWC layout: output pixels whose source position (out_h - pad_top, out_w - pad_left) lies outside the input get value in every channel; all others copy the channels of the input pixel. Pointers advance by one image per batch item. */ + for (int n = 0; n < num; ++n) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + const int out_index = (out_h * out_width + out_w) * channels; + if (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) { + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = value; + } + } else { + const int in_index = (in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } + } + } + } + in_data += in_height * in_width * channels; + out_data += out_height * out_width * channels; + } +} + +template +void Pad2DReflectNCHW(const T* in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, T* out_data) { /* Reflect-mode pad, NCHW layout: the source index is mirrored once about index 0 and once about the last row or column (2 * size - idx - 2). Pointers advance by one (n, c) plane per inner iteration. */ + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = std::max(in_h, -in_h); // reflect by 0 + in_h = + std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height + in_w = std::max(in_w, -in_w); // reflect by 0 + in_w = + std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width + out_data[out_h * out_width + out_w] = in_data[in_h * in_width + in_w]; + } + } + in_data += in_height * in_width; + out_data += out_height * out_width; + } + } +} + +template +void Pad2DReflectNHWC(const T*
in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, T* out_data) { /* Reflect-mode pad, NHWC layout: same index mirroring as the NCHW variant, then all channels of the selected input pixel are copied. */ + for (int n = 0; n < num; ++n) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + const int out_index = (out_h * out_width + out_w) * channels; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = std::max(in_h, -in_h); + in_h = std::min(in_h, 2 * in_height - in_h - 2); + in_w = std::max(in_w, -in_w); + in_w = std::min(in_w, 2 * in_width - in_w - 2); + const int in_index = (in_h * in_width + in_w) * channels; + + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } + } + } + in_data += in_height * in_width * channels; + out_data += out_height * out_width * channels; + } +} + +template +void Pad2DEdgeNCHW(const T* in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, const int pad_top, + const int pad_left, T* out_data) { /* Edge-mode pad, NCHW layout: the source index is clamped to the range 0 .. size - 1, so border pixels are replicated into the padded area. */ + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + out_data[out_h * out_width + out_w] = in_data[in_h * in_width + in_w]; + } + } + in_data += in_height * in_width; + out_data += out_height * out_width; + } + } +} + +template +void Pad2DEdgeNHWC(const T* in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, const int pad_top, + const int pad_left, T* out_data) { /* Edge-mode pad, NHWC layout: clamps the source index like the NCHW variant and copies every channel of the selected pixel. */ + for (int n = 0; n < num; ++n) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + const int out_index = (out_h *
out_width + out_w) * channels; + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + const int in_index = (in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + out_data[out_index + c] = in_data[in_index + c]; + } + } + } + in_data += in_height * in_width * channels; + out_data += out_height * out_width * channels; + } +} + +template +void Pad2DGradConstNCHW(T* d_in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { /* Gradient of constant-mode pad, NCHW layout: every output-gradient element whose source position lies inside the input is assigned to that input-gradient element; gradients of padded positions are dropped. */ + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + if (!(in_h < 0 || in_w < 0 || in_h >= in_height || + in_w >= in_width)) { + d_in_data[in_h * in_width + in_w] = + d_out_data[out_h * out_width + out_w]; + } + } + } + d_in_data += in_height * in_width; + d_out_data += out_height * out_width; + } + } +} + +template +void Pad2DGradConstNHWC(T* d_in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { /* Gradient of constant-mode pad, NHWC layout: copies all channels of each in-range output-gradient pixel to the matching input-gradient pixel. */ + for (int n = 0; n < num; ++n) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + const int out_index = (out_h * out_width + out_w) * channels; + if (!(in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)) { + const int in_index = (in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] = d_out_data[out_index + c]; + } + } + } + } + d_in_data += in_height * in_width * channels; + d_out_data += out_height * out_width
* channels; + } +} + +template +void Pad2DGradReflectNCHW(T* d_in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { /* Gradient of reflect-mode pad, NCHW layout: accumulates each output-gradient element into the mirrored input position with +=, so it relies on d_in_data being zero on entry (the CPU grad kernel in this file zero-fills it first). */ + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = std::max(in_h, -in_h); // reflect over 0 + in_h = std::min(in_h, + 2 * in_height - in_h - 2); // reflect over in_height + in_w = std::max(in_w, -in_w); // reflect over 0 + in_w = + std::min(in_w, 2 * in_width - in_w - 2); // reflect over in_width + d_in_data[in_h * in_width + in_w] += + d_out_data[out_h * out_width + out_w]; + } + } + d_in_data += in_height * in_width; + d_out_data += out_height * out_width; + } + } +} + +template +void Pad2DGradReflectNHWC(T* d_in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { /* Gradient of reflect-mode pad, NHWC layout: accumulates all channels of each output-gradient pixel into the mirrored input pixel. */ + for (int n = 0; n < num; ++n) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + const int out_index = (out_h * out_width + out_w) * channels; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = std::max(in_h, -in_h); + in_h = std::min(in_h, 2 * in_height - in_h - 2); + in_w = std::max(in_w, -in_w); + in_w = std::min(in_w, 2 * in_width - in_w - 2); + const int in_index = (in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] += d_out_data[out_index + c]; + } + } + } + d_in_data += in_height * in_width * channels; + d_out_data += out_height * out_width * channels; + } +} + +template +void Pad2DGradEdgeNCHW(T* d_in_data, const int num, const int channels, + const int in_height, const int
in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { /* Gradient of edge-mode pad, NCHW layout: accumulates each output-gradient element into the clamped input position, so border elements collect the gradients of all padded cells that replicated them. */ + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + d_in_data[in_h * in_width + in_w] += + d_out_data[out_h * out_width + out_w]; + } + } + d_in_data += in_height * in_width; + d_out_data += out_height * out_width; + } + } +} + +template +void Pad2DGradEdgeNHWC(T* d_in_data, const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { /* Gradient of edge-mode pad, NHWC layout: accumulates all channels of each output-gradient pixel into the clamped input pixel. */ + for (int n = 0; n < num; ++n) { + for (int out_h = 0; out_h < out_height; ++out_h) { + for (int out_w = 0; out_w < out_width; ++out_w) { + const int out_index = (out_h * out_width + out_w) * channels; + int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0)); + int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0)); + const int in_index = (in_h * in_width + in_w) * channels; + for (int c = 0; c < channels; ++c) { + d_in_data[in_index + c] += d_out_data[out_index + c]; + } + } + } + d_in_data += in_height * in_width * channels; + d_out_data += out_height * out_width * channels; + } +} + +template +class Pad2dCPUKernel : public framework::OpKernel { /* CPU forward kernel: reads the paddings, mode, data_format and pad_value attributes and dispatches to the matching Pad2D helper. Any mode other than reflect or edge takes the constant path, and any data_format other than NCHW is handled as NHWC. Only pads[0] and pads[2] are passed on; the output extents come from the dims of Out. */ + public: + void Compute(const framework::ExecutionContext& context) const override { + auto pads = context.Attr>("paddings"); + auto mode = context.Attr("mode"); + auto data_format = context.Attr("data_format"); + T value = context.Attr("pad_value"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto in_dims = x->dims(); + auto out_dims = out->dims(); + const T* in_data = x->data(); + T* out_data = out->mutable_data(context.GetPlace()); +
const int pad_top = pads[0]; + const int pad_left = pads[2]; + const int num = in_dims[0]; + if (data_format == "NCHW") { + const int channels = in_dims[1]; + const int in_height = in_dims[2]; + const int in_width = in_dims[3]; + const int out_height = out_dims[2]; + const int out_width = out_dims[3]; + if (mode == "reflect") { + Pad2DReflectNCHW(in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, out_data); + } else if (mode == "edge") { + Pad2DEdgeNCHW(in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, out_data); + } else { + Pad2DConstNCHW(in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, value, out_data); + } + } else { + const int channels = in_dims[3]; + const int in_height = in_dims[1]; + const int in_width = in_dims[2]; + const int out_height = out_dims[1]; + const int out_width = out_dims[2]; + if (mode == "reflect") { + Pad2DReflectNHWC(in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, out_data); + } else if (mode == "edge") { + Pad2DEdgeNHWC(in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, out_data); + } else { + Pad2DConstNHWC(in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, value, out_data); + } + } + } +}; + +template +class Pad2dGradCPUKernel : public framework::OpKernel { /* CPU backward kernel: zero-fills the gradient of X, then dispatches on data_format and mode in the same way as the forward kernel. */ + public: + void Compute(const framework::ExecutionContext& context) const override { + auto pads = context.Attr>("paddings"); + auto mode = context.Attr("mode"); + auto data_format = context.Attr("data_format"); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_in = context.Output(framework::GradVarName("X")); + auto d_in_dims = d_in->dims(); + auto d_out_dims = d_out->dims(); + const T* d_out_data = d_out->data(); + T* d_in_data = d_in->mutable_data(context.GetPlace()); + math::SetConstant set_zero; + set_zero(context.template
device_context(), + d_in, static_cast(0)); + const int pad_top = pads[0]; + const int pad_left = pads[2]; + const int num = d_in_dims[0]; + if (data_format == "NCHW") { + const int channels = d_in_dims[1]; + const int in_height = d_in_dims[2]; + const int in_width = d_in_dims[3]; + const int out_height = d_out_dims[2]; + const int out_width = d_out_dims[3]; + if (mode == "reflect") { + Pad2DGradReflectNCHW(d_in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, + d_out_data); + } else if (mode == "edge") { + Pad2DGradEdgeNCHW(d_in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, d_out_data); + } else { + Pad2DGradConstNCHW(d_in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, + d_out_data); + } + } else { + const int channels = d_in_dims[3]; + const int in_height = d_in_dims[1]; + const int in_width = d_in_dims[2]; + const int out_height = d_out_dims[1]; + const int out_width = d_out_dims[2]; + if (mode == "reflect") { + Pad2DGradReflectNHWC(d_in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, + d_out_data); + } else if (mode == "edge") { + Pad2DGradEdgeNHWC(d_in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, d_out_data); + } else { + Pad2DGradConstNHWC(d_in_data, num, channels, in_height, in_width, + out_height, out_width, pad_top, pad_left, + d_out_data); + } + } + } +}; + +class Pad2dOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Pad2dOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of Pad2dOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto paddings = ctx->Attrs().Get>("paddings"); + PADDLE_ENFORCE_EQ(x_dim.size(), 4, + "Size of paddings 
should be equal to 4."); + std::vector out_dims(x_dim.size()); + + auto data_format = ctx->Attrs().Get("data_format"); + out_dims[0] = x_dim[0]; + if (data_format == "NCHW") { + out_dims[1] = x_dim[1]; + out_dims[2] = x_dim[2] + paddings[0] + paddings[1]; // height + out_dims[3] = x_dim[3] + paddings[2] + paddings[3]; // width + } else { // NHWC + out_dims[3] = x_dim[3]; + out_dims[1] = x_dim[1] + paddings[0] + paddings[1]; + out_dims[2] = x_dim[2] + paddings[2] + paddings[3]; + } + + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + if (out_dims[0] == x_dim[0]) { + // Only pass LoD when the first dimension is equal between + // output and input. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } +}; + +class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input of pad2d op. " + "The input should be a 4-D tensor with formate NCHW or NHWC."); + AddOutput("Out", + "The output of pad2d op. " + "A tensor with the same shape as X."); + AddAttr>( + "paddings", + "(vector) " + "A list to describe the padding rules." + "paddings=[0, 1, 2, 3] means " + "padding 0 row to top, 1 row to bottom, 2 columns to left " + "and 3 columns to right. Size of paddings must be 4."); + AddAttr("pad_value", + "(float, default 0.0) " + "The value to fill the padded areas in constant mode.") + .SetDefault(0.0f); + AddAttr("mode", + "(float, default constant) " + "Three modes: constant(default), reflect, edge.") + .SetDefault("constant"); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the input data.") + .SetDefault("NCHW"); + AddComment(R"DOC( +Pad2d Operator. +Pad 2-d images accordding to 'paddings' and 'mode'. +If mode is 'reflect', paddings[0] and paddings[1] must be no greater +than height-1. And the width dimension has the same condition. 
+ +Given that X is a channel of image from input: + +X = [[1, 2, 3], + [4, 5, 6]] + +Case 0: + +paddings = [0, 1, 2, 3], +mode = 'constant' +pad_value = 0 + +Out = [[0, 0, 1, 2, 3, 0, 0, 0] + [0, 0, 4, 5, 6, 0, 0, 0] + [0, 0, 0, 0, 0, 0, 0, 0]] + +Case 1: + +paddings = [0, 1, 2, 1], +mode = 'reflect' + +Out = [[3, 2, 1, 2, 3, 2] + [6, 5, 4, 5, 6, 5] + [3, 2, 1, 2, 3, 2]] + +Case 2: + +paddings = [0, 1, 2, 1], +mode = 'edge' + +Out = [[1, 1, 1, 2, 3, 3] + [4, 4, 4, 5, 6, 6] + [4, 4, 4, 5, 6, 6]] +)DOC"); + } +}; + +class Pad2dOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* bind = new framework::OpDesc(); + bind->SetInput("X", Input("X")); + bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); + bind->SetAttrMap(Attrs()); + bind->SetType("pad2d_grad"); + return std::unique_ptr(bind); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(pad2d, ops::Pad2dOp, ops::Pad2dOpMaker, + ops::Pad2dOpGradMaker); +REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad); +REGISTER_OP_CPU_KERNEL(pad2d, ops::Pad2dCPUKernel); +REGISTER_OP_CPU_KERNEL(pad2d_grad, ops::Pad2dGradCPUKernel); diff --git a/paddle/fluid/operators/pad2d_op.cu 
b/paddle/fluid/operators/pad2d_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9ba0ddbd84a43cfd5f028ce072b5c7606fae343d --- /dev/null +++ b/paddle/fluid/operators/pad2d_op.cu @@ -0,0 +1,432 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; +/* Loop helper: each thread starts at its global index and advances by the total number of launched threads until n is reached. */ +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +using framework::Tensor; + +template +__global__ void Pad2DConstNCHW(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, T value, + T* out_data) { /* Constant-mode pad, NCHW layout, one loop iteration per output element: decodes out_w, out_h and the fused n*c index from the flat output index, then writes value for padded positions or the shifted input element otherwise. */ + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + out_data[index] = + (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) + ?
value + : in_data[(nc * in_height + in_h) * in_width + in_w]; + } +} + +template +__global__ void Pad2DConstNHWC(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, T value, + T* out_data) { /* Constant-mode pad, NHWC layout: decodes c, out_w, out_h and n from the flat output index before selecting value or the shifted input element. */ + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int in_h = out_h - pad_top; + const int in_w = out_w - pad_left; + out_data[index] = + (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) + ? value + : in_data[((n * in_height + in_h) * in_width + in_w) * channels + + c]; + } +} + +template +__global__ void Pad2DReflectNCHW(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + T* out_data) { /* Reflect-mode pad, NCHW layout: mirrors the source index once about 0 and once about the last row or column before reading the input. */ + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = max(in_h, -in_h); // reflect by 0 + in_h = min(in_h, 2 * in_height - in_h - 2); // reflect by in_height + in_w = max(in_w, -in_w); // reflect by 0 + in_w = min(in_w, 2 * in_width - in_w - 2); // reflect by in_width + out_data[index] = in_data[(nc * in_height + in_h) * in_width + in_w]; + } +} + +template +__global__ void Pad2DReflectNHWC(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + T* out_data) { /* Reflect-mode pad, NHWC layout: same mirroring as the NCHW kernel with the channel as the fastest-varying index. */ + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n %
out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = max(in_h, -in_h); + in_h = min(in_h, 2 * in_height - in_h - 2); + in_w = max(in_w, -in_w); + in_w = min(in_w, 2 * in_width - in_w - 2); + out_data[index] = + in_data[((n * in_height + in_h) * in_width + in_w) * channels + c]; + } +} + +template +__global__ void Pad2DEdgeNCHW(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + T* out_data) { /* Edge-mode pad, NCHW layout: clamps the source index to 0 .. size - 1 so border elements are replicated. */ + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int nc = index / out_width; + const int out_w = index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + out_data[index] = in_data[(nc * in_height + in_h) * in_width + in_w]; + } +} + +template +__global__ void Pad2DEdgeNHWC(const int nthreads, const T* in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + T* out_data) { /* Edge-mode pad, NHWC layout: same clamping as the NCHW kernel with the channel as the fastest-varying index. */ + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int n = index / channels; + const int c = index % channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + out_data[index] = + in_data[((n * in_height + in_h) * in_width + in_w) * channels + c]; + } +} + +template +__global__ void Pad2DGradConstNCHW(const int in_size, T* d_in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { /* Gradient of constant-mode pad, NCHW layout: iterates over input-gradient elements and reads the output gradient at the position shifted by pad_top and pad_left; each input element is written exactly once, so no atomics are used. */ +
CUDA_1D_KERNEL_LOOP(in_index, in_size) { + int nc = in_index / in_width; + const int out_w = in_index % in_width + pad_left; + const int out_h = nc % in_height + pad_top; + nc /= in_height; + d_in_data[in_index] = + d_out_data[(nc * out_height + out_h) * out_width + out_w]; + } +} + +template +__global__ void Pad2DGradConstNHWC(const int in_size, T* d_in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { /* Gradient of constant-mode pad, NHWC layout: same one-to-one gather as the NCHW kernel with the channel as the fastest-varying index. */ + CUDA_1D_KERNEL_LOOP(in_index, in_size) { + int n = in_index / channels; + const int c = in_index % channels; + const int out_w = n % in_width + pad_left; + n /= in_width; + const int out_h = n % in_height + pad_top; + n /= in_height; + d_in_data[in_index] = + d_out_data[((n * out_height + out_h) * out_width + out_w) * channels + + c]; + } +} + +template +__global__ void Pad2DGradReflectNCHW(const int out_size, T* d_in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { /* Gradient of reflect-mode pad, NCHW layout: iterates over output-gradient elements and adds each one to the mirrored input position with atomicAdd, since several outputs can map to the same input element. */ + CUDA_1D_KERNEL_LOOP(out_index, out_size) { + int nc = out_index / out_width; + const int out_w = out_index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = max(in_h, -in_h); + in_w = max(in_w, -in_w); + in_h = min(in_h, 2 * in_height - in_h - 2); + in_w = min(in_w, 2 * in_width - in_w - 2); + atomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w], + d_out_data[out_index]); + } +} + +template +__global__ void Pad2DGradReflectNHWC(const int out_size, T* d_in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { /* Gradient of reflect-mode pad, NHWC layout: same atomicAdd scatter as the NCHW kernel with the channel as the fastest-varying index. */ + CUDA_1D_KERNEL_LOOP(out_index, out_size) { + const int c =
out_index % channels; + int n = out_index / channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + int in_h = out_h - pad_top; + int in_w = out_w - pad_left; + in_h = max(in_h, -in_h); + in_w = max(in_w, -in_w); + in_h = min(in_h, in_height * 2 - in_h - 2); + in_w = min(in_w, in_width * 2 - in_w - 2); + atomicAdd( + &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c], + d_out_data[out_index]); + } +} + +template +__global__ void Pad2DGradEdgeNCHW(const int out_size, T* d_in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { /* Gradient of edge-mode pad, NCHW layout: adds each output-gradient element to the clamped input position with atomicAdd. */ + CUDA_1D_KERNEL_LOOP(out_index, out_size) { + int nc = out_index / out_width; + const int out_w = out_index % out_width; + const int out_h = nc % out_height; + nc /= out_height; + const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + atomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w], + d_out_data[out_index]); + } +} + +template +__global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data, + const int num, const int channels, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_top, const int pad_left, + const T* d_out_data) { /* Gradient of edge-mode pad, NHWC layout: same atomicAdd scatter as the NCHW kernel with the channel as the fastest-varying index. */ + CUDA_1D_KERNEL_LOOP(out_index, out_size) { + const int c = out_index % channels; + int n = out_index / channels; + const int out_w = n % out_width; + n /= out_width; + const int out_h = n % out_height; + n /= out_height; + const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); + const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); + atomicAdd( + &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c], + d_out_data[out_index]); + } +} + +template +class Pad2dCUDAKernel : public framework::OpKernel { /* CUDA forward kernel: sizes the launch from the number of output elements (PADDLE_CUDA_NUM_THREADS threads per block, rounded-up block count) and dispatches on data_format and mode; unknown modes take the constant path and any data_format other than NCHW is handled as NHWC. The launch arguments are not visible in this extract. */ + public: + void
Compute(const framework::ExecutionContext& context) const override { + auto pads = context.Attr>("paddings"); + auto mode = context.Attr("mode"); + auto data_format = context.Attr("data_format"); + T value = context.Attr("pad_value"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto in_dims = x->dims(); + auto out_dims = out->dims(); + const T* in_data = x->data(); + T* out_data = out->mutable_data(context.GetPlace()); + const int pad_top = pads[0]; + const int pad_left = pads[2]; + const int num = in_dims[0]; + + auto stream = context.cuda_device_context().stream(); + int block = PADDLE_CUDA_NUM_THREADS; + const int out_size = out->numel(); + int grid = (out_size + block - 1) / block; + + if (data_format == "NCHW") { + const int channels = in_dims[1]; + const int in_height = in_dims[2]; + const int in_width = in_dims[3]; + const int out_height = out_dims[2]; + const int out_width = out_dims[3]; + if (mode == "reflect") { + Pad2DReflectNCHW<<>>( + out_size, in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, out_data); + } else if (mode == "edge") { + Pad2DEdgeNCHW<<>>( + out_size, in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, out_data); + } else { + Pad2DConstNCHW<<>>( + out_size, in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, value, out_data); + } + } else { + const int channels = in_dims[3]; + const int in_height = in_dims[1]; + const int in_width = in_dims[2]; + const int out_height = out_dims[1]; + const int out_width = out_dims[2]; + if (mode == "reflect") { + Pad2DReflectNHWC<<>>( + out_size, in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, out_data); + } else if (mode == "edge") { + Pad2DEdgeNHWC<<>>( + out_size, in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, out_data); + } else { + Pad2DConstNHWC<<>>( + out_size, in_data, num,
channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, value, out_data); + } + } + } +}; + +template +class Pad2dGradCUDAKernel : public framework::OpKernel { /* CUDA backward kernel: zero-fills the gradient of X, then dispatches on data_format and mode. Reflect and edge launches are sized by the output-gradient element count; the constant path recomputes grid from the input-gradient element count because its kernels iterate over input elements. */ + public: + void Compute(const framework::ExecutionContext& context) const override { + auto pads = context.Attr>("paddings"); + auto mode = context.Attr("mode"); + auto data_format = context.Attr("data_format"); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_in = context.Output(framework::GradVarName("X")); + auto d_in_dims = d_in->dims(); + auto d_out_dims = d_out->dims(); + const T* d_out_data = d_out->data(); + T* d_in_data = d_in->mutable_data(context.GetPlace()); + + math::SetConstant set_zero; + set_zero(context.template device_context(), + d_in, static_cast(0)); + + const int pad_top = pads[0]; + const int pad_left = pads[2]; + const int num = d_in_dims[0]; + + auto stream = context.cuda_device_context().stream(); + int block = PADDLE_CUDA_NUM_THREADS; + const int out_size = d_out->numel(); + const int in_size = d_in->numel(); + int grid = (out_size + block - 1) / block; + + if (data_format == "NCHW") { + const int channels = d_in_dims[1]; + const int in_height = d_in_dims[2]; + const int in_width = d_in_dims[3]; + const int out_height = d_out_dims[2]; + const int out_width = d_out_dims[3]; + if (mode == "reflect") { + Pad2DGradReflectNCHW<<>>( + out_size, d_in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, d_out_data); + } else if (mode == "edge") { + Pad2DGradEdgeNCHW<<>>( + out_size, d_in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, d_out_data); + } else { + grid = (in_size + block - 1) / block; + Pad2DGradConstNCHW<<>>( + in_size, d_in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, d_out_data); + } + } else { + const int channels = d_in_dims[3]; + const int in_height = d_in_dims[1]; + const int in_width = d_in_dims[2]; + const int
out_height = d_out_dims[1]; + const int out_width = d_out_dims[2]; + if (mode == "reflect") { + Pad2DGradReflectNHWC<<>>( + out_size, d_in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, d_out_data); + } else if (mode == "edge") { + Pad2DGradEdgeNHWC<<>>( + out_size, d_in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, d_out_data); + } else { + grid = (in_size + block - 1) / block; + Pad2DGradConstNHWC<<>>( + in_size, d_in_data, num, channels, in_height, in_width, out_height, + out_width, pad_top, pad_left, d_out_data); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(pad2d, ops::Pad2dCUDAKernel); +REGISTER_OP_CUDA_KERNEL(pad2d_grad, ops::Pad2dGradCUDAKernel); diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..37646c7b4c50fc7409002aca56e5462bde93cc30 --- /dev/null +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -0,0 +1,212 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/pad_constant_like_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class PadConstantLikeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of PadConstantLikeOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of PadConstantLikeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PadConstantLikeOp should not be null."); + + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dim.size(), y_dim.size(), + "The dimention of X and Y should be the same."); + + for (int i = 0; i < x_dim.size(); ++i) { + PADDLE_ENFORCE_GE(x_dim[i], y_dim[i]); + } + ctx->SetOutputDim("Out", x_dim); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Y")->type()), + ctx.device_context()); + } +}; + +class PadConstantLikeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input of pad_constant_like op. " + "The input should be a k-D tensor(k > 0 and k < 7)"); + AddInput("Y", + "The input of pad_constant_like op. " + "The input should be a k-D tensor(k > 0 and k < 7)"); + AddOutput("Out", + "The output of pad_constant_like op. " + "A tensor with the same shape as X."); + AddAttr("pad_value", + "(float, default 0.0) " + "The value to fill the padded areas.") + .SetDefault(0.0f); + AddComment(R"DOC( +PadConstantLikeOp Operator. + +Pad input(Y) with a pad_value, the number of values padded to the edges of each +axis is specified by the difference of the shape of X and Y. 
+((0, shape_x_0 - shape_y_0), … (0, shape_x_n - shape_y_n)) unique pad widths for +each axis. +The input should be a k-D tensor(k > 0 and k < 7). As an example: + +case1: + Given: + X = [[1, 2], + [3, 4], + [1, 2], + [3, 4]]], + X.shape = (4, 2) + + Y = [[5, 6], + [7, 8]], + Y.shape = (2, 2) + + And + pad_value = 0, + + Return: + Out = [[5, 6], + [7, 8], + [0, 0], + [0, 0]] + Out.shape = (4, 2) + +case2: + Given: + X = [[[[ 0, 1, 2], + [ 3, 4, 5]], + [[ 6, 7, 8], + [ 9, 10, 11]], + [[12, 13, 14], + [15, 16, 17]]], + [[[18, 19, 20], + [21, 22, 23]], + [[24, 25, 26], + [27, 28, 29]], + [[30, 31, 32], + [33, 34, 35]]]] + X.shape = (2, 3, 2, 3) + + Y = [[[[35, 36, 37]], + [[38, 39, 40]], + [[41, 42, 43]]]] + Y.shape = (1, 3, 1, 3) + + And + pad_value = -1, + + Return: + + Out = [[[[35, 36, 37], + [-1, -1, -1]], + [[38, 39, 40], + [-1, -1, -1]], + [[41, 42, 43], + [-1, -1, -1]]], + [[[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]]]] + Out.shape = (2, 3, 2, 3) +)DOC"); + } +}; + +class PadConstantLikeOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto y_dim = ctx->GetInputDim("Y"); + auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ(dout_dim.size(), y_dim.size(), + "The dimention of X and Y should be the same."); + + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dim); + ctx->ShareLoD("Y", /*->*/ y_grad_name); + + for (int i = 0; i < y_dim.size(); ++i) { + PADDLE_ENFORCE_GE(dout_dim[i], y_dim[i]); + } + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const 
framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Y")->type()), + ctx.device_context()); + } +}; + +class PadConstantLikeOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *bind = new framework::OpDesc(); + bind->SetType("pad_constant_like_grad"); + bind->SetInput("Y", Input("Y")); + bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + bind->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + bind->SetAttrMap(Attrs()); + return std::unique_ptr(bind); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(pad_constant_like, ops::PadConstantLikeOp, + ops::PadConstantLikeOpMaker, ops::PadConstantLikeOpGradMaker); +REGISTER_OPERATOR(pad_constant_like_grad, ops::PadConstantLikeOpGrad); + +REGISTER_OP_CPU_KERNEL( + pad_constant_like, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel); +REGISTER_OP_CPU_KERNEL( + pad_constant_like_grad, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel); diff --git a/paddle/fluid/operators/pad_constant_like_op.cu b/paddle/fluid/operators/pad_constant_like_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ea69577904577de353b63491973bf74b7724e18e --- /dev/null +++ b/paddle/fluid/operators/pad_constant_like_op.cu @@ -0,0 +1,27 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/fluid/operators/pad_constant_like_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + pad_constant_like, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel); +REGISTER_OP_CUDA_KERNEL( + pad_constant_like_grad, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel); diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h new file mode 100644 index 0000000000000000000000000000000000000000..01d66901afc49a487c344b039b65f547967e95ff --- /dev/null +++ b/paddle/fluid/operators/pad_constant_like_op.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/padding.h" + +namespace paddle { +namespace operators { + +template +class PadConstantLikeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto in_x = context.Input("X"); + auto in_y = context.Input("Y"); + auto* out = context.Output("Out"); + + if (in_x->dims() == in_y->dims()) { + // TensorCopy(in_y, context.GetPlace(), context, out); + out->ShareDataWith(*in_y); + return; + } + + T pad_value = context.Attr("pad_value"); + out->mutable_data(context.GetPlace()); + + int rank = context.Input("X")->dims().size(); + + std::vector pads(rank * 2, 0); + + for (int j = 0; j < rank; ++j) { + pads[j * 2] = 0; + pads[j * 2 + 1] = static_cast(in_x->dims()[j] - in_y->dims()[j]); + } + + math::PaddingFunctor(rank, context, pads, pad_value, + *in_y, out); + } +}; + +template +class PadConstantLikeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto in_y = context.Input("Y"); + auto in_dout = + context.Input(framework::GradVarName("Out")); + auto* d_y = context.Output(framework::GradVarName("Y")); + + if (d_y == nullptr) { + return; + } + + if (in_dout->dims() == in_y->dims()) { + // TensorCopy(in_dout, context.GetPlace(), context, d_y); + d_y->ShareDataWith(*in_dout); + return; + } + + d_y->mutable_data(context.GetPlace()); + int rank = in_dout->dims().size(); + + std::vector pads(static_cast(rank) * 2, 0); + for (int j = 0; j < rank; ++j) { + pads[j * 2] = 0; + pads[j * 2 + 1] = static_cast(in_dout->dims()[j] - in_y->dims()[j]); + } + + math::PaddingGradFunctor(rank, context, pads, *in_dout, + d_y); + } +}; + +} // namespace operators +} // namespace paddle 
diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h index c93c096575a30dd9344894ead4b81acc16930e21..32698dac4917e183cfe36c831787b049985b19b3 100644 --- a/paddle/fluid/operators/pad_op.h +++ b/paddle/fluid/operators/pad_op.h @@ -18,117 +18,44 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/padding.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; - -template -void PadFunction(const framework::ExecutionContext& context) { - auto pads = context.Attr>("paddings"); - Eigen::array, D> paddings; - for (size_t i = 0; i < paddings.size(); ++i) { - paddings[i].first = pads[i * 2]; - paddings[i].second = pads[i * 2 + 1]; - } - T pad_value = context.Attr("pad_value"); - - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - auto x_tensor = EigenTensor::From(*x); - auto out_tensor = EigenTensor::From(*out); - auto& place = - *context.template device_context().eigen_device(); - out_tensor.device(place) = x_tensor.pad(paddings, pad_value); -} - template class PadKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - int rank = context.Input("X")->dims().size(); - switch (rank) { - case 1: - PadFunction(context); - break; - case 2: - PadFunction(context); - break; - case 3: - PadFunction(context); - break; - case 4: - PadFunction(context); - break; - case 5: - PadFunction(context); - break; - case 6: - PadFunction(context); - break; - default: - PADDLE_THROW( - "PadOp only support tensors with no more than 6 dimensions."); - } + auto pads = context.Attr>("paddings"); + T pad_value = context.Attr("pad_value"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + int 
rank = x->dims().size(); + math::PaddingFunctor(rank, context, pads, pad_value, *x, + out); } }; -template -void PadGradFunction(const framework::ExecutionContext& context) { - auto pads = context.Attr>("paddings"); - Eigen::array, D> paddings; - for (size_t i = 0; i < paddings.size(); ++i) { - paddings[i].first = -pads[i * 2]; - paddings[i].second = -pads[i * 2 + 1]; - } - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - if (d_x != nullptr) { - d_x->mutable_data(context.GetPlace()); - auto d_x_tensor = EigenTensor::From(*d_x); - auto d_out_tensor = EigenTensor::From(*d_out); - auto& place = - *context.template device_context().eigen_device(); - d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0); - } -} - template class PadGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - size_t rank = - context.Input(framework::GradVarName("Out"))->dims().size(); - switch (rank) { - case 1: - PadGradFunction(context); - break; - case 2: - PadGradFunction(context); - break; - case 3: - PadGradFunction(context); - break; - case 4: - PadGradFunction(context); - break; - case 5: - PadGradFunction(context); - break; - case 6: - PadGradFunction(context); - break; - default: - PADDLE_THROW( - "PadOp only support tensors with no more than 6 dimensions."); + auto pads = context.Attr>("paddings"); + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + if (d_x == nullptr) { + return; } + + d_x->mutable_data(context.GetPlace()); + int rank = d_out->dims().size(); + math::PaddingGradFunctor(rank, context, pads, *d_out, + d_x); } }; diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index eb09470f37eabb5524f774bc289fc68f5884c540..97c36a83fc5eff421725d05f66fca05f5169d1bb 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ 
b/paddle/fluid/operators/parallel_do_op.cc @@ -355,6 +355,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { grad->SetInput(framework::GradVarName(output_param), og_names); } } + grad->SetInput("Communicator", {"nccl_com__do_not_change_"}); grad->SetAttrMap(this->Attrs()); grad->SetBlockAttr(kParallelBlock, grad_block_[0]); diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index cceac402951ae6bf3fe0b4c96af5b7ce9ca1ba0e..e7f1caf4d3a81dc7633139933c6a4c3d51a4e2a0 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -13,14 +13,12 @@ limitations under the License. */ #include -#include - #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/framework/variable.h" namespace paddle { namespace operators { +using framework::GradVarName; #define CLOG std::cout @@ -35,7 +33,7 @@ struct Formater { std::type_index dtype{typeid(const char)}; framework::LoD lod; int summarize; - void* data{nullptr}; + void *data{nullptr}; void operator()(size_t size) { PrintMessage(); @@ -101,7 +99,7 @@ struct Formater { template void Display(size_t size) { - auto* d = reinterpret_cast(data); + auto *d = reinterpret_cast(data); CLOG << "\tdata: "; if (summarize != -1) { summarize = std::min(size, (size_t)summarize); @@ -120,51 +118,36 @@ struct Formater { // TODO(ChunweiYan) there should be some other printers for TensorArray class TensorPrintOp : public framework::OperatorBase { public: - TensorPrintOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) + TensorPrintOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} - TensorPrintOp(const TensorPrintOp& o) + TensorPrintOp(const 
TensorPrintOp &o) : framework::OperatorBase( - static_cast(o)) { + static_cast(o)) { PADDLE_THROW("Not implemented."); } private: - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - const framework::Variable* in_var_ptr = nullptr; - std::string phase(kForward); + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + const framework::Variable *in_var_ptr = nullptr; std::string printed_var_name = ""; - auto& inputs = Inputs(); - if (inputs.find("In") != inputs.end() && !Inputs("In").empty()) { - in_var_ptr = scope.FindVar(Input("In")); - printed_var_name = Inputs("In").front(); - } else if (inputs.find("In@GRAD") != inputs.end() && - !Inputs("In@GRAD").empty()) { - in_var_ptr = scope.FindVar(Input("In@GRAD")); - printed_var_name = Inputs("In@GRAD").front(); - phase = std::string(kBackward); - } else { - PADDLE_THROW("Unknown phase, should be forward or backward."); - } + in_var_ptr = scope.FindVar(Input("In")); + printed_var_name = Inputs("In").front(); PADDLE_ENFORCE_NOT_NULL(in_var_ptr); - auto& in_tensor = in_var_ptr->Get(); - auto* out_var_ptr = scope.FindVar(Output("Out")); - auto& out_tensor = *out_var_ptr->GetMutable(); - - // Just copy data from input tensor to output tensor - // output tensor share same memory with input tensor - out_tensor.ShareDataWith(in_tensor); - out_tensor.set_lod(in_tensor.lod()); + auto &in_tensor = in_var_ptr->Get(); std::string print_phase = Attr("print_phase"); - if (print_phase != phase && print_phase != std::string(kBoth)) { + bool is_forward = Attr("is_forward"); + + if ((is_forward && print_phase == kBackward) || + (!is_forward && print_phase == kForward)) { return; } @@ -192,7 +175,7 @@ class TensorPrintOp : public framework::OperatorBase { formater.dtype = printed_tensor.type(); } if (Attr("print_tensor_shape")) { - auto& dims = printed_tensor.dims(); + auto &dims = printed_tensor.dims(); formater.dims.resize(dims.size()); for (int i = 0; i 
< dims.size(); ++i) formater.dims[i] = dims[i]; } @@ -200,7 +183,7 @@ class TensorPrintOp : public framework::OperatorBase { formater.lod = printed_tensor.lod(); } formater.summarize = Attr("summarize"); - formater.data = reinterpret_cast(printed_tensor.data()); + formater.data = reinterpret_cast(printed_tensor.data()); formater(printed_tensor.numel()); } @@ -219,14 +202,14 @@ class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker { AddAttr("print_tensor_type", "Whether to print the tensor's dtype."); AddAttr("print_tensor_shape", "Whether to print the tensor's shape."); AddAttr("print_tensor_lod", "Whether to print the tensor's lod."); - AddAttr( - "print_phase", - "(string, default 'BOTH') Which phase to display including 'FORWARD' " - "'BACKWARD' and 'BOTH'.") + AddAttr("print_phase", + "(string, default 'FORWARD') Which phase to display " + "including 'FORWARD' " + "'BACKWARD' and 'BOTH'.") .SetDefault(std::string(kBoth)) .InEnum({std::string(kForward), std::string(kBackward), std::string(kBoth)}); - AddOutput("Out", "Output tensor with same data as input tensor."); + AddAttr("is_forward", "Whether is forward or not").SetDefault(true); AddComment(R"DOC( Creates a print op that will print when a tensor is accessed. 
@@ -238,40 +221,21 @@ tensor `t`.)DOC"); class InferShapeForward : public framework::InferShapeBase { public: - void operator()(framework::InferShapeContext* context) const override { + void operator()(framework::InferShapeContext *context) const override { PADDLE_ENFORCE(context->HasInput("In"), "Input(In) should not be null."); - context->ShareLoD("In", /*->*/ "Out"); - context->SetOutputDim("Out", context->GetInputDim("In")); - } -}; - -class InferShapeBackward : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* context) const override { - PADDLE_ENFORCE(context->HasInput("In@GRAD"), - "Input(In@GRAD) should not be null."); - context->ShareLoD("In@GRAD", /*->*/ "Out"); - context->SetOutputDim("Out", context->GetInputDim("In@GRAD")); } }; -class InferVarType : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override {} -}; - -class PrintOpProtoAndCheckGradOpMaker - : public framework::SingleGradOpDescMaker { +class PrintOpGradientMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; std::unique_ptr Apply() const override { - auto* op_desc_ptr = new framework::OpDesc(); - op_desc_ptr->SetType("print_grad"); - op_desc_ptr->SetInput("In@GRAD", OutputGrad("Out")); - op_desc_ptr->SetOutput("Out", InputGrad("In")); + auto *op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("print"); + op_desc_ptr->SetInput("In", InputGrad("In")); op_desc_ptr->SetAttrMap(Attrs()); + op_desc_ptr->SetAttr("is_forward", false); return std::unique_ptr(op_desc_ptr); } }; @@ -282,6 +246,4 @@ class PrintOpProtoAndCheckGradOpMaker namespace ops = paddle::operators; REGISTER_OPERATOR(print, ops::TensorPrintOp, ops::PrintOpProtoAndCheckMaker, - ops::PrintOpProtoAndCheckGradOpMaker, ops::InferShapeForward, - ops::InferVarType); -REGISTER_OPERATOR(print_grad, ops::TensorPrintOp, 
ops::InferShapeBackward); + ops::PrintOpGradientMaker, ops::InferShapeForward); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 7f8822e40053b5bcd394f446138a2292d80b69bf..c614de2eac143b3a545c60226aefa93dd72dea4f 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -13,8 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/scale_op.h" + #include +#include "paddle/fluid/operators/detail/safe_ref.h" + namespace paddle { namespace operators { @@ -52,6 +55,21 @@ $$Out = scale*X$$ } }; +class ScaleOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto &in_var_name = op_desc.Input("X").front(); + auto &in_var = detail::Ref(block->FindVarRecursive(in_var_name)); + + auto out_var_name = op_desc.Output("Out").front(); + auto *out_var = block->FindVarRecursive(out_var_name); + + out_var->SetType(in_var.GetType()); + out_var->SetDataType(in_var.GetDataType()); + } +}; + class ScaleGradMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; @@ -71,7 +89,8 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { namespace ops = paddle::operators; -REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker); +REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, + ops::ScaleOpVarTypeInference); REGISTER_OP_CPU_KERNEL( scale, ops::ScaleKernel, ops::ScaleKernel, diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index c6a59b76adcd6b4d3e7db5e7c7185f266f46841f..fe035aba81dd74d21539974beed255275be3013b 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -22,17 +22,29 @@ namespace operators { template class 
ScaleKernel : public framework::OpKernel { public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* tensor = context.Output("Out"); - auto* in = context.Input("X"); - tensor->mutable_data(in->place()); + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto* in_var = ctx.InputVar("X"); + auto* in = ctx.Input("X"); - auto scale = static_cast(context.Attr("scale")); + auto* out_var = ctx.OutputVar("Out"); + auto* out = ctx.Output("Out"); + out->mutable_data(in->place()); - auto eigen_out = framework::EigenVector::Flatten(*tensor); + PADDLE_ENFORCE_EQ(in->dims(), out->dims(), + "in and out should have the same dim"); + + auto scale = static_cast(ctx.Attr("scale")); + + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); + out_slr->set_rows(in_slr.rows()); + out_slr->set_height(in_slr.height()); + } + + auto eigen_out = framework::EigenVector::Flatten(*out); auto eigen_in = framework::EigenVector::Flatten(*in); - auto& dev = - *context.template device_context().eigen_device(); + auto& dev = *ctx.template device_context().eigen_device(); eigen_out.device(dev) = scale * eigen_in; } }; diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index 14b07649c416ff1b671fc9b5ee4eb956b44570c5..40404295266899c6ac2f7b1e08fdf7db40958794 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -56,6 +56,10 @@ class SendBarrierOp : public framework::OperatorBase { class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { + AddInput("X", "(Any) Dummy inputs, used for control dependency") + .AsDuplicable(); + AddOutput("Out", "(Any) Dummy outputs, used for control dependency") + .AsDuplicable(); AddComment(R"DOC( SendBarrier operator diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index 
39301e1ac0971dfe0ca7854257f10ddeb60f1000..9228c81310463c3cb1d32fb613dd51d175b99c0e 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -53,25 +53,27 @@ struct SequenceExpandFunctor { const framework::Vector& ref_lod, /*expand referenced lod*/ LoDTensor* out) { int out_offset = 0; - auto& eigen_place = *context.eigen_device(); + int x_item_length = x.numel() / x.dims()[0]; + auto out_data = out->data(); + auto x_data = x.data(); for (size_t i = 1; i < ref_lod.size(); ++i) { int repeat_num = ref_lod[i] - ref_lod[i - 1]; int x_start = x_lod[i - 1]; int x_end = x_lod[i]; int x_seq_len = x_end - x_start; if (repeat_num > 0) { - auto x_sub_tensor = x.Slice(x_start, x_end); - x_sub_tensor.Resize({1, x_sub_tensor.numel()}); int out_start = out_offset; if (out->lod().size() == 1) { out_start = out->lod()[0][out_offset]; } - auto out_sub_tensor = - out->Slice(out_start, out_start + x_seq_len * repeat_num); - out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]}); - EigenMatrix::From(out_sub_tensor).device(eigen_place) = - EigenMatrix::From(x_sub_tensor) - .broadcast(Eigen::array({{repeat_num, 1}})); + for (int j = 0; j < repeat_num; j++) { + for (int k = 0; k < x_seq_len; k++) { + for (int l = 0; l < x_item_length; l++) { + out_data[(out_start + j * x_seq_len + k) * x_item_length + l] = + x_data[(x_start + k) * x_item_length + l]; + } + } + } } out_offset += repeat_num; } diff --git a/paddle/fluid/operators/sequence_mask_op.cc b/paddle/fluid/operators/sequence_mask_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e45c18d6aff65ecac565ef05e36b2d47ad8744b8 --- /dev/null +++ b/paddle/fluid/operators/sequence_mask_op.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/sequence_mask_op.h" + +REGISTER_OPERATOR(sequence_mask, paddle::operators::SequenceMaskOp, + paddle::operators::SequenceMaskOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + sequence_mask, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel); diff --git a/paddle/fluid/operators/sequence_mask_op.cu b/paddle/fluid/operators/sequence_mask_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ff5acf4d9edd5f0f15cbcb22eae212c2d49ccaab --- /dev/null +++ b/paddle/fluid/operators/sequence_mask_op.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/sequence_mask_op.h" + +REGISTER_OP_CUDA_KERNEL( + sequence_mask, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel); diff --git a/paddle/fluid/operators/sequence_mask_op.h b/paddle/fluid/operators/sequence_mask_op.h new file mode 100644 index 0000000000000000000000000000000000000000..18acb735cecabd1e01f7821c880fd8ed5e52971f --- /dev/null +++ b/paddle/fluid/operators/sequence_mask_op.h @@ -0,0 +1,154 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#ifdef __NVCC__ +#include +#include +#include +#else +#include +#endif + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +class SequenceMaskOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist"); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist"); + + auto maxlen = ctx->Attrs().Get("maxlen"); + if (maxlen > 0) { // We can only infershape when maxlen > 0 + auto dim = framework::vectorize2int(ctx->GetInputDim("X")); + dim.push_back(maxlen); + ctx->SetOutputDim("Y", framework::make_ddim(dim)); + } + } +}; + +class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor of sequence_mask op."); + AddOutput("Y", "The output mask of sequence_mask op."); + AddAttr("maxlen", + "The maximum length of the sequence. If maxlen < 0, maxlen " + "= max(Input(X)).") + .SetDefault(-1) + .AddCustomChecker([](int &v) { + PADDLE_ENFORCE(v < 0 || v >= 1, + "Attr(maxlen) must be less than 0 or larger than 1"); + }); + AddAttr("out_dtype", "Output data type"); + AddComment(R"DOC( +SequenceMask Operator + +This operator outputs a Mask according to Input(X) and Attr(maxlen). 
+Supposing Input(X) is a Tensor with shape [d_1, d_2, ..., d_n], the +Output(Y) is a mask with shape [d_1, d_2, ..., d_n, maxlen], where: + +Y(i_1, i_2, ..., i_n, j) = (j < X(i_1, i_2, ..., i_n)) + +If maxlen < 0, maxlen = max(X) + )DOC"); + } +}; + +template +struct SequenceMaskForRangeFunctor { + HOSTDEVICE SequenceMaskForRangeFunctor(const Tx *x, Ty *y, int maxlen) + : x_(x), y_(y), maxlen_(maxlen) {} + + HOSTDEVICE void operator()(int y_idx) const { + int x_idx = y_idx / maxlen_; + int j = y_idx % maxlen_; + y_[y_idx] = static_cast(j < x_[x_idx] ? 1 : 0); + } + + private: + const Tx *x_; + Ty *y_; + int maxlen_; +}; + +template +struct SequenceMaskFunctor { + using Tensor = framework::LoDTensor; + + SequenceMaskFunctor(const DeviceContext &ctx, const Tx *x, Tensor *y, + int limits, int maxlen) + : ctx_(ctx), x_(x), y_(y), limits_(limits), maxlen_(maxlen) {} + + template + void apply() const { + auto *y_data = y_->mutable_data(ctx_.GetPlace()); + platform::ForRange for_range(ctx_, limits_); + for_range(SequenceMaskForRangeFunctor(x_, y_data, maxlen_)); + } + + private: + const DeviceContext &ctx_; + const Tx *x_; + Tensor *y_; + int limits_; + int maxlen_; +}; + +template +class SequenceMaskKernel : public framework::OpKernel { + using Tensor = framework::LoDTensor; + + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *y = ctx.Output("Y"); + auto maxlen = ctx.Attr("maxlen"); + + auto *x_data = x->data(); + auto x_numel = x->numel(); + if (maxlen < 0) { +#ifdef __NVCC__ + VLOG(10) + << "SequenceMaskOp on GPU may be slow when maxlen is not provided."; + maxlen = static_cast( + thrust::reduce(thrust::device_pointer_cast(x_data), + thrust::device_pointer_cast(x_data) + x_numel, + static_cast(0), thrust::maximum())); +#else + maxlen = static_cast(*std::max_element(x_data, x_data + x_numel)); +#endif + auto y_dim = framework::vectorize2int(x->dims()); + y_dim.push_back(maxlen); + 
y->Resize(framework::make_ddim(y_dim)); + } + + auto out_dtype = static_cast( + ctx.Attr("out_dtype")); + auto &dev_ctx = ctx.template device_context(); + framework::VisitDataType(out_dtype, + SequenceMaskFunctor( + dev_ctx, x_data, y, x_numel * maxlen, maxlen)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_pad_op.cc b/paddle/fluid/operators/sequence_pad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..44d73aa4076abfe15c906478702ac7c4a55303d4 --- /dev/null +++ b/paddle/fluid/operators/sequence_pad_op.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_pad_op.h" + +namespace paddle { +namespace operators { + +class SequencePadOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePadOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("PadValue"), + "Input(PadValue) of SequencePadOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequencePadOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "The rank of Input(x) can't be less than 2."); + auto time_step_dims = framework::slice_ddim(x_dims, 1, x_dims.size()); + auto pad_value_dims = ctx->GetInputDim("PadValue"); + PADDLE_ENFORCE(pad_value_dims == framework::make_ddim({1}) || + pad_value_dims == time_step_dims, + "The Input(PadValue) must be a scalar or a tensor whose " + "shape equals to time steps in sequences"); + + int out_dim_0 = -1; + int out_dim_1 = -1; + + if (ctx->IsRuntime()) { + // run time + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + const auto& x_lod_0 = x_lod[0]; + PADDLE_ENFORCE_GE(x_lod_0.size(), 2, + "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod_0.back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + + int seq_num = x_lod_0.size() - 1; + int max_seq_len = math::MaximumSequenceLength(x_lod_0); + int padded_length = ctx->Attrs().Get("padded_length"); + if (padded_length == -1) { + padded_length = max_seq_len; + } + PADDLE_ENFORCE_GE(padded_length, max_seq_len, + "The Attr(padded_length) must be -1 or an int greater " + "than the length of the longest original sequence."); + out_dim_0 = seq_num; + 
out_dim_1 = padded_length; + } else { + // compile time + framework::VarDesc* x_desc = + boost::get(ctx->GetInputVarPtrs("X")[0]); + PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1); + } + + std::vector out_dims_vec{out_dim_0, out_dim_1}; + auto time_step_dims_vec = framework::vectorize2int(time_step_dims); + out_dims_vec.insert(out_dims_vec.end(), time_step_dims_vec.begin(), + time_step_dims_vec.end()); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + } +}; + +class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor, default LoDTensor) Input variable which " + "should contain lod information."); + AddInput("PadValue", + "(LoDTensor), this Tensor holds values that will be fill into " + "padded steps. It can be a scalar or a tensor whose shape equals " + "to time steps in sequences. If it's a scalar, it will be " + "automatically broadcasted to the shape of time step."); + AddOutput( + "Out", + "(LoDTensor) The output vairable, which contains padded sequences."); + AddAttr( + "padded_length", + "The length of padded sequences. It can be setted to -1 or " + "any positive int. When it is -1, all sequences will be padded up to " + "the length of the longest one among them; when it a certain positive " + "value, it must be greater than the length of the longest original " + "sequence.") + .SetDefault(-1); + AddComment(R"DOC( + Sequence Pad Operator + + This operator pads sequences in a same batch to a consistent length. + The length is specified by attribute 'padded_length'. New elements, + whose values are specified by input 'PadValue', will be appended to + the end of each sequence, to make their final lengths consistent. 
+ + Following are cases to better explain how this works: + + Case 1: + + Given a 1-level LoDTensor input(X): + X.lod = [[0, 2, 5]] + X.data = [a, b, c, d, e] + and Input(PadValue): + PadValue.data = [0] + and attribite 'padded_length' = 4, + then we get LoDTensor: + Out.data = [[a, b, 0, 0], + [c, d, e, 0]] + + Case 2: + + Given a 1-level LoDTensor input(X): + X.lod = [[0, 2, 5]] + X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]] + and Input(PadValue): + PadValue.data = [0] + and attribite 'padded_length' = -1, which mean using the length + of longest input sequence(3 in this case), + then we get LoDTensor: + Out.data = [[[a1, a2], [b1, b2], [0, 0]], + [[c1, c2], [d1, d2], [e1, e2]]] + + Case 3: + + Given a 1-level LoDTensor input(X): + X.lod = [[0, 2, 5]] + X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]] + and Input(PadValue): + PadValue.data = [p1, p2] + and attribite 'padded_length' = -1, which mean using the length + of longest input sequence(3 in this case), + then we get LoDTensor: + Out.data = [[[a1, a2], [b1, b2], [p1, p2]], + [[c1, c2], [d1, d2], [e1, e2]]] + + )DOC"); + } +}; + +class SequencePadGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) of SequencePadGradOp should not be null."); + + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_pad, ops::SequencePadOp, ops::SequencePadOpMaker, + paddle::framework::DefaultGradOpDescMaker); 
+REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp); +REGISTER_OP_CPU_KERNEL( + sequence_pad, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel); +REGISTER_OP_CPU_KERNEL( + sequence_pad_grad, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_pad_op.cu b/paddle/fluid/operators/sequence_pad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ff8f81a2f0ec4a72befc3be2a5fc48c3a586c824 --- /dev/null +++ b/paddle/fluid/operators/sequence_pad_op.cu @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_pad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sequence_pad, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel); +REGISTER_OP_CUDA_KERNEL( + sequence_pad_grad, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_pad_op.h b/paddle/fluid/operators/sequence_pad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..5fc9da69d787ff3aeffa716689d44772ad8f7bd2 --- /dev/null +++ b/paddle/fluid/operators/sequence_pad_op.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +class SequencePadOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const auto* pad_value = ctx.Input("PadValue"); + + int padded_length = ctx.Attr("padded_length"); + + math::PaddingLoDTensorFunctor()( + ctx.template device_context(), *x, out, *pad_value, + padded_length, 0, false, math::kBatchLengthWidth); + } +}; + +template +class SequencePadGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_x = ctx.Output(framework::GradVarName("X")); + if (d_x) { + const auto* d_out = ctx.Input(framework::GradVarName("Out")); + d_x->mutable_data(ctx.GetPlace()); + + int padded_length = ctx.Attr("padded_length"); + + math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *d_out, d_x, + padded_length, 0, false, math::kBatchLengthWidth); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc index 8146c5f56104b7dec86b1c4491ed10fc2e94b58b..29d2fb989754f5621222768a279a1c898ea1c355 100644 --- a/paddle/fluid/operators/shrink_rnn_memory_op.cc +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -62,7 +62,10 @@ class ShrinkRNNMemoryOp : public ArrayOp { } if (dst_num_rows != 0) { - out_tensor.ShareDataWith(x_tensor.Slice(0, height)); + out_tensor.mutable_data(place, x_tensor.type()); + auto dev_ctx = 
platform::DeviceContextPool::Instance().Get(place); + framework::TensorCopy(x_tensor.Slice(0, height), place, *dev_ctx, + &out_tensor); } } }; diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index c777d5feaec1c3a6216b01359a250072a674b700..d236c5b943704683c27b9b155c11ca9113edf514 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -150,30 +150,17 @@ class StackKernel : public framework::OpKernel { int total_num = pre * n * post; auto &dev_ctx = ctx.template device_context(); - constexpr auto kMaxThreshold = 16; - if (std::is_same::value || - n > kMaxThreshold) { #ifdef __NVCC__ - VLOG(10) << "Stack more than " << kMaxThreshold - << " tensors on GPU may be slow."; - thrust::device_vector device_x_vec(x_datas); - auto x_data_arr = device_x_vec.data().get(); + thrust::device_vector device_x_vec(x_datas); + auto x_data_arr = device_x_vec.data().get(); #else - auto x_data_arr = x_datas.data(); + auto x_data_arr = x_datas.data(); #endif - StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); + StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); #ifdef __NVCC__ - // Wait() must be called because device_x_vec may be destructed before - // kernel ends - dev_ctx.Wait(); -#endif - } -#ifdef __NVCC__ - else { // NOLINT - framework::Array x_data_arr; - for (int i = 0; i < n; ++i) x_data_arr[i] = x_datas[i]; - StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); - } + // Wait() must be called because device_x_vec may be destructed before + // kernel ends + dev_ctx.Wait(); #endif } }; @@ -244,32 +231,17 @@ class StackGradKernel : public framework::OpKernel { int post = total_num / (n * pre); auto &dev_ctx = ctx.template device_context(); - constexpr auto kMaxThreshold = 16; - if (std::is_same::value || - n > kMaxThreshold) { #ifdef __NVCC__ - VLOG(10) << "Stack more than " << kMaxThreshold - << " tensors on GPU may be slow."; - thrust::device_vector 
device_dx_vec(dx_datas); - auto dx_data_arr = device_dx_vec.data().get(); + thrust::device_vector device_dx_vec(dx_datas); + auto dx_data_arr = device_dx_vec.data().get(); #else - auto dx_data_arr = dx_datas.data(); + auto dx_data_arr = dx_datas.data(); #endif - StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, - post); + StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post); #ifdef __NVCC__ - // Wait() must be called because device_dx_vec may be destructed before - // kernel ends - dev_ctx.Wait(); -#endif - } -#ifdef __NVCC__ - else { // NOLINT - framework::Array dx_data_arr; - for (int i = 0; i < n; ++i) dx_data_arr[i] = dx_datas[i]; - StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, - post); - } + // Wait() must be called because device_dx_vec may be destructed before + // kernel ends + dev_ctx.Wait(); #endif } }; diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 5248767c2eeb9388c26d203e64f8b2c68ffe0865..763bb403588d13c15271d26b09813dddf3a5dd8c 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -37,7 +37,7 @@ class CPUUniformRandomKernel : public framework::OpKernel { } else { PADDLE_THROW( "uniform_random_op's output only" - "supports SelectedRows and Tensor"); + "supports SelectedRows and LoDTensor"); } T* data = tensor->mutable_data(ctx.GetPlace()); unsigned int seed = static_cast(ctx.Attr("seed")); diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index e1c7323a30233f4ec4f60e46aa6088ee6d8601b7..bbb692b0ddfc18e8a62c0d2a6bac88f9932f6704 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -54,7 +54,7 @@ class GPUUniformRandomKernel : public framework::OpKernel { } else { PADDLE_THROW( "uniform_random_op's output only" - "supports SelectedRows and Tensor"); + "supports 
SelectedRows and LoDTensor"); } T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); diff --git a/paddle/fluid/operators/unstack_op.cc b/paddle/fluid/operators/unstack_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ff3249cc333231a0624cd5aab9603a6a75f4480 --- /dev/null +++ b/paddle/fluid/operators/unstack_op.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/unstack_op.h" + +namespace plat = paddle::platform; +namespace ops = paddle::operators; + +USE_OP(stack); + +REGISTER_OPERATOR(unstack, ops::UnStackOp, ops::UnStackOpMaker, + ops::UnStackOpInferShape, ops::UnStackGradOpDescMaker); + +REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp, + ops::UnStackOpGradInferShape); diff --git a/paddle/fluid/operators/unstack_op.h b/paddle/fluid/operators/unstack_op.h new file mode 100644 index 0000000000000000000000000000000000000000..348a1038804ccb2551e5f729cc1a38bcef1511f5 --- /dev/null +++ b/paddle/fluid/operators/unstack_op.h @@ -0,0 +1,135 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class UnStackOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist."); + + int axis = ctx->Attrs().Get("axis"); + int num = ctx->Attrs().Get("num"); + auto x_dim = ctx->GetInputDim("X"); + int rank = x_dim.size(); + PADDLE_ENFORCE(axis >= -rank && axis < rank, + "Attr(axis) must be inside [-rank, rank), where rank = %d", + rank); + if (axis < 0) axis += rank; + + PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast(num), + "Number of Outputs(Y) is wrong"); + if (x_dim[axis] > 0) { + PADDLE_ENFORCE_EQ(num, x_dim[axis], "Number of Outputs(Y) is wrong"); + } + auto vec = framework::vectorize2int(x_dim); + vec.erase(vec.begin() + axis); + ctx->SetOutputsDim("Y", std::vector( // NOLINT + x_dim[axis], framework::make_ddim(vec))); + } +}; + +class UnStackOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input of unstack op."); + AddOutput("Y", "The output of unstack op.").AsDuplicable(); + AddAttr("axis", "The axis along which Input(X) should be unstacked.") + .SetDefault(0); + AddAttr("num", "The number of outputs(Y).").GreaterThan(0); + AddComment(R"DOC( + UnStack Operator. + + UnStack Input(X) into several tensors along Attr(axis). 
+ )DOC"); + } +}; + +class UnStackOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto stack_grad_op = framework::OpRegistry::CreateOp( + "stack_grad", {{framework::GradVarName("Y"), {Input("X")}}}, + {{framework::GradVarName("X"), Outputs("Y")}}, Attrs()); + stack_grad_op->Run(scope, place); + } +}; + +class UnStackOpGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(), 0, + "Number of Inputs(Y@Grad) must be larger than 0"); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@Grad) must exist."); + + auto input_dims = ctx->GetInputsDim(framework::GradVarName("Y")); + for (size_t i = 1; i < input_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0], + "Dims of all Inputs(Y@Grad) must be the same"); + } + + int axis = ctx->Attrs().Get("axis"); + int rank = input_dims[0].size(); + PADDLE_ENFORCE( + axis >= -(rank + 1) && axis < rank + 1, + "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank); + if (axis < 0) axis += (rank + 1); + + auto vec = framework::vectorize2int(input_dims[0]); + vec.insert(vec.begin() + axis, input_dims.size()); + ctx->SetOutputDim(framework::GradVarName("X"), framework::make_ddim(vec)); + } +}; + +class UnStackGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("unstack_grad"); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +class UnStackGradOp : public framework::OperatorBase 
{ + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto stack_op = framework::OpRegistry::CreateOp( + "stack", {{"X", Inputs(framework::GradVarName("Y"))}}, + {{"Y", {Output(framework::GradVarName("X"))}}}, Attrs()); + stack_op->Run(scope, place); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index ab70c1f0592d122ba248a101db487e64c0bdae6f..444265f58de732f07c5db2abd87811a063016866 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -153,17 +153,29 @@ class WarpCTCKernel : public framework::OpKernel { framework::make_ddim({static_cast(num_sequences), 1}); // warpctc needs sequences data stored in transposed padding format - Tensor warpctc_logits; + LoDTensor warpctc_logits; const size_t max_sequence_length = - math::MaximumSequenceLength(logits_lod, level); + math::MaximumSequenceLength(logits_lod[level]); auto warpctc_logits_dims = framework::make_ddim({static_cast(max_sequence_length), static_cast(num_sequences), static_cast(sequence_width)}); warpctc_logits.mutable_data(warpctc_logits_dims, ctx.GetPlace()); + + LoDTensor cpu_pad_value; + T* pad_value_data = + cpu_pad_value.mutable_data({1}, platform::CPUPlace()); + *pad_value_data = static_cast(0); + LoDTensor pad_value; + if (platform::is_cpu_place(ctx.GetPlace())) { + pad_value = cpu_pad_value; + } else { + TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value); + } + math::PaddingLoDTensorFunctor()( ctx.template device_context(), *logits, &warpctc_logits, - false); + pad_value, -1, 0, false /* norm_by_times */, math::kLengthBatchWidth); const T* warpctc_logits_data = warpctc_logits.data(); std::vector warpctc_label_lengths(num_sequences); @@ -209,15 +221,15 @@ template class WarpCTCGradKernel : public framework::OpKernel { public: void Compute(const 
framework::ExecutionContext& ctx) const override { - auto* warpctc_grad = ctx.Input("WarpCTCGrad"); + auto* warpctc_grad = ctx.Input("WarpCTCGrad"); auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); const Tensor* loss_grad = ctx.Input(framework::GradVarName("Loss")); logits_grad->mutable_data(ctx.GetPlace()); bool norm_by_times = ctx.Attr("norm_by_times"); math::UnpaddingLoDTensorFunctor()( - ctx.template device_context(), logits_grad, - *warpctc_grad, norm_by_times); + ctx.template device_context(), *warpctc_grad, + logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth); const T* loss_grad_data = loss_grad->data(); math::ScaleLoDTensorFunctor()( diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 5d17978dd7946596c490dc465dab51e7cf53a044..30c8fbcfce92a8b06a175ddf198cde572f72b2a4 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -51,7 +51,7 @@ typedef enum { } cpu_isa_t; // Instruction set architecture // May I use some instruction -inline bool MayIUse(const cpu_isa_t cpu_isa); +bool MayIUse(const cpu_isa_t cpu_isa); } // namespace jit diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 88e0383146c1adf2752a362091996bad9cfcce5e..b97dad20db0b003b4886b7c7cfd1c8de8bf44ab9 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -24,7 +24,7 @@ limitations under the License. */ #endif #ifdef PADDLE_WITH_MKLDNN -#include +#include "mkldnn.hpp" #endif #include diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 6f1f0c4796f3bae2fb419bf103cb6c0c5489bf65..4c99f4be321160caf0ee2f89a655bdfb933408e3 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" @@ -84,9 +85,6 @@ void InitDevices(bool init_p2p) { } catch (const std::exception &exp) { LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; } -#else - LOG(WARNING) - << "'CUDA' is not supported, Please re-compile with WITH_GPU option"; #endif InitDevices(init_p2p, devices); } @@ -100,9 +98,6 @@ void InitDevices(bool init_p2p, const std::vector devices) { } catch (const std::exception &exp) { LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; } -#else - LOG(WARNING) - << "'CUDA' is not supported, Please re-compile with WITH_GPU option"; #endif for (size_t i = 0; i < devices.size(); ++i) { @@ -120,6 +115,22 @@ void InitDevices(bool init_p2p, const std::vector devices) { #ifndef PADDLE_WITH_MKLDNN platform::SetNumThreads(FLAGS_paddle_num_threads); #endif + + if (platform::jit::MayIUse(platform::jit::avx512_common)) { +#ifndef __AVX512F__ + LOG(WARNING) << "AVX512F is available, Please re-compile on local machine"; +#endif + } + if (platform::jit::MayIUse(platform::jit::avx2)) { +#ifndef __AVX2__ + LOG(WARNING) << "AVX2 is available, Please re-compile on local machine"; +#endif + } + if (platform::jit::MayIUse(platform::jit::avx)) { +#ifndef __AVX__ + LOG(WARNING) << "AVX is available, Please re-compile on local machine"; +#endif + } } void InitGLOG(const std::string &prog_name) { diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 2cf26a1fe05a998c5ae5ffc10340f14347e92b30..38630686f7cf3c669373f941d989adf11ba6cfe6 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -96,8 +96,7 @@ struct RecordBlock { uint64_t start_ns_; }; #else -// Our profiler deeply coupled in many operators. 
-// use fake object to avoid large modifies these files. +// windows do not support profiler temporarily. struct RecordEvent { RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {} }; diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index e4415ed15c791100a5b309e73d7deb5943f71b97..f577068d1f39a3083a54f106d006f9982304411e 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -43,6 +43,9 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpRoleVarAttrName", framework::OpProtoAndCheckerMaker::OpRoleVarAttrName); + op_proto_and_checker_maker.def( + "kOpNameScopeAttrName", + framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); } } // namespace pybind diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index c2137ec6d7df24251432a4dfb8fffc3d3f77194e..f21f8d23f99c27529b2ed1995c92fd4eee4a5807 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -234,6 +234,7 @@ void BindVarDsec(pybind11::module *m) { pybind11::enum_(var_desc, "VarType", "") .value("BOOL", pd::proto::VarType::BOOL) .value("UINT8", pd::proto::VarType::UINT8) + .value("INT8", pd::proto::VarType::INT8) .value("INT16", pd::proto::VarType::INT16) .value("INT32", pd::proto::VarType::INT32) .value("INT64", pd::proto::VarType::INT64) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 67734659233515ca8110f4212a2b1553fe4e9d24..5b20b87174e42f4dfdd22214e8f9dd20c7296374 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -130,6 +130,7 @@ PYBIND11_PLUGIN(core) { .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) + .def("set", PyCPUTensorSetFromArray) #ifdef PADDLE_WITH_CUDA .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) @@ -138,6 +139,7 @@ PYBIND11_PLUGIN(core) { .def("set", 
PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) + .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) @@ -145,6 +147,7 @@ PYBIND11_PLUGIN(core) { .def("set", PyCUDAPinnedTensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) .def("set", PyCUDAPinnedTensorSetFromArray) + .def("set", PyCUDAPinnedTensorSetFromArray) #endif .def("shape", [](Tensor &self) { return vectorize(self.dims()); }) .def("_set_float_element", TensorSetElement) diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 3e2ea1ef88b03f5b2576c1cee2b5d26a439943da..51614a6a3dd2f7f830cf533fc365b56a99d3b918 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -97,7 +97,7 @@ struct CastToPyBufferImpl { inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) { auto buffer_info = details::CastToPyBufferImpl()(tensor); + uint8_t, int8_t, platform::float16>()(tensor); return buffer_info; } diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index f2a9a6b3b9af59e5c4709eb822fa5d9ab1543a0c..7199424b4709fbe9fc962cf98aea6223b9f3e51d 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -328,14 +328,25 @@ function assert_api_not_changed() { source .env/bin/activate pip install ${PADDLE_ROOT}/build/python/dist/*whl python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec + if [ "$1" == "cp35-cp35m" ]; then + # Use sed to make python2 and python3 sepc keeps the same + sed -i 's/arg0: str/arg0: unicode/g' new.spec + sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec + fi python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec deactivate +} + +function assert_api_spec_approvals() { + if [ -z ${BRANCH} ]; then + BRANCH="develop" 
+ fi - API_CHANGE=`git diff --name-only upstream/develop | grep "paddle/fluid/API.spec" || true` + API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/API.spec" || true` echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then - # TODO: curl -H 'Authorization: token ${TOKEN}' - APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews | \ + # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then @@ -617,11 +628,12 @@ function main() { cicheck) cmake_gen ${PYTHON_ABI:-""} build + assert_api_not_changed ${PYTHON_ABI:-""} run_test gen_capi_package gen_fluid_inference_lib test_fluid_inference_lib - assert_api_not_changed + assert_api_spec_approvals ;; *) print_usage diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py index 0a1cdaceaf3be48a06b1c0b5b979e90f50e9000c..0d4e7f1ee46ff97912d010cdb268cc4898d99f58 100644 --- a/python/paddle/dataset/flowers.py +++ b/python/paddle/dataset/flowers.py @@ -42,6 +42,7 @@ from paddle.reader import * import os import numpy as np from multiprocessing import cpu_count +import six from six.moves import cPickle as pickle from six.moves import zip __all__ = ['train', 'test', 'valid'] diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index 1cd50bd1802095db07e5618f37b0d42d11e94760..920dbf3b4ebb0bc3d98c9ea986d7d039deed4a4c 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -36,11 +36,6 @@ import numpy as np try: 
import cv2 except ImportError: - import sys - sys.stderr.write( - '''Warning with paddle image module: opencv-python should be imported, - or paddle image module could NOT work; please install opencv-python first.''' - ) cv2 = None import os import tarfile @@ -53,6 +48,18 @@ __all__ = [ ] +def _check_cv2(): + if cv2 is None: + import sys + sys.stderr.write( + '''Warning with paddle image module: opencv-python should be imported, + or paddle image module could NOT work; please install opencv-python first.''' + ) + return False + else: + return True + + def batch_images_from_tar(data_file, dataset_name, img2label, @@ -134,7 +141,7 @@ def load_image_bytes(bytes, is_color=True): load and return a gray image. :type is_color: bool """ - assert cv2 is not None + assert _check_cv2() is True flag = 1 if is_color else 0 file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8) @@ -159,7 +166,7 @@ def load_image(file, is_color=True): load and return a gray image. :type is_color: bool """ - assert cv2 is not None + assert _check_cv2() is True # cv2.IMAGE_COLOR for OpenCV3 # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version @@ -188,7 +195,7 @@ def resize_short(im, size): :param size: the shorter edge size of image after resizing. 
:type size: int """ - assert cv2 is not None + assert _check_cv2() is True h, w = im.shape[:2] h_new, w_new = size, size @@ -196,7 +203,7 @@ def resize_short(im, size): h_new = size * h // w else: w_new = size * w // h - im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) + im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC) return im @@ -338,7 +345,6 @@ def simple_transform(im, if np.random.randint(2) == 0: im = left_right_flip(im, is_color) else: - im = center_crop(im, crop_size, is_color) im = center_crop(im, crop_size, is_color=is_color) if len(im.shape) == 3: im = to_chw(im) diff --git a/python/paddle/dataset/movielens.py b/python/paddle/dataset/movielens.py index c98e0019f7ab5fc2723e8df919257a59af7c9e5d..64bf7414819ad74365744adbd760b73d4adaff7c 100644 --- a/python/paddle/dataset/movielens.py +++ b/python/paddle/dataset/movielens.py @@ -24,6 +24,7 @@ set and test set into paddle reader creators. from __future__ import print_function +import numpy as np import zipfile import paddle.dataset.common import re @@ -150,12 +151,12 @@ def __initialize_meta_info__(): def __reader__(rand_seed=0, test_ratio=0.1, is_test=False): fn = __initialize_meta_info__() - rand = random.Random(x=rand_seed) + np.random.seed(rand_seed) with zipfile.ZipFile(file=fn) as package: with package.open('ml-1m/ratings.dat') as rating: for line in rating: line = cpt.to_text(line, encoding='latin') - if (rand.random() < test_ratio) == is_test: + if (np.random.random() < test_ratio) == is_test: uid, mov_id, rating, _ = line.strip().split("::") uid = int(uid) mov_id = int(mov_id) diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py index 09721e430b7e5bb6b9891d5272ca54475baf6157..baa14a573fcfdfa943af1e995f687c74e9fb4d07 100644 --- a/python/paddle/fluid/contrib/memory_usage_calc.py +++ b/python/paddle/fluid/contrib/memory_usage_calc.py @@ -70,23 +70,37 @@ def memory_usage(program, batch_size): if not 
isinstance(program, Program): raise TypeError( "Calculating Memory Usage requires Program as its Parameter." - "But you passed in %s" % (type(prgram))) + "But you passed in %s" % (type(program))) if batch_size <= 0: raise ValueError("The batch size need to be positive.") # Get the var_name list of first block and calculate total_memory = 0.0 - for var in six.itervalues(program.global_block().vars): - data_count = 1 - for x in var.shape: - if x == -1: - data_count *= batch_size - else: - data_count *= x - var_memory = data_count * dtype_to_size[var.dtype] - if DEBUG: - print("%s memory usage: %d" % (var.name, var_memory)) - total_memory += var_memory + processed_var_names = set() + for op in program.global_block().ops: + for var_name in op.output_arg_names: + if var_name in processed_var_names: + continue + processed_var_names.add(var_name) + var = program.global_block().vars[var_name] + if var.desc.type() != core.VarDesc.VarType.LOD_TENSOR: + continue + + data_count = 1 + neg_dim_count = 0 + for x in var.shape: + if x < 0: + if neg_dim_count >= 1: + raise ValueError("Var %s has more than one negtive dim." 
+ % (var_name)) + neg_dim_count += 1 + data_count *= batch_size * (-x) + else: + data_count *= x + var_memory = data_count * dtype_to_size[var.dtype] + if DEBUG: + print("%s memory usage: %d" % (var.name, var_memory)) + total_memory += var_memory if DEBUG: print("total memory usage: %.2f" % (total_memory)) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index febb750ee1af26c71e6c1ae1e4c97fb02fb27a04..b0e0d27ff7a0c603523065d34169b1b73eabdac3 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -43,6 +43,7 @@ __all__ = [ 'default_main_program', 'program_guard', 'get_var', + 'name_scope', ] EMPTY_VAR_NAME = core.kEmptyVarName() @@ -52,6 +53,70 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() +class NameScope(object): + def __init__(self, name="", parent=None): + self._children = dict() + self._name = name + self._parent = parent + + def child(self, prefix): + if prefix not in self._children: + new_child = NameScope(prefix, self) + self._children[prefix] = [new_child] + else: + new_child = NameScope(prefix + "_%d" % len(self._children[prefix]), + self) + self._children[prefix].append(new_child) + return new_child + + def parent(self): + return self._parent + + def name(self): + return self._name + + +_name_scope = NameScope() + + +@contextlib.contextmanager +def name_scope(prefix=None): + """ + Generate hierarchical name prefix for the operators. + + Note: This should only used for debugging and visualization purpose. + Don't use it for serious analysis such as graph/program transformations. + + Args: + prefix(str): prefix. + + Examples: + .. code-block:: python + with name_scope("encoder"): + ... + with name_scope("decoder"): + ... + with name_scope("attention"): + ... + """ + # TODO(panyx0718): Only [0-9a-z]. + assert prefix, "namescope prefix cannot be empty." 
+ global _name_scope + _name_scope = _name_scope.child(prefix) + yield + _name_scope = _name_scope.parent() + + +def _full_name_scope(): + global _name_scope + scope = _name_scope + name = "" + while scope: + name = scope.name() + "/" + name + scope = scope.parent() + return name + + def generate_control_dev_var_name(): import random return CONTROL_DEP_VAR_PREFIX + "@" + str(random.random()) @@ -95,6 +160,8 @@ def convert_np_dtype_to_dtype_(np_dtype): return core.VarDesc.VarType.INT16 elif dtype == np.uint8: return core.VarDesc.VarType.UINT8 + elif dtype == np.int8: + return core.VarDesc.VarType.INT8 else: raise ValueError("Not supported numpy dtype %s" % dtype) @@ -513,6 +580,9 @@ class Operator(object): self.desc.set_type(type) proto = OpProtoHolder.instance().get_op_proto(type) + namescope_var_name = op_maker.kOpNameScopeAttrName() + op_attrs[namescope_var_name] = _full_name_scope() + def find_name(var_list, name): for var_name in var_list: if var_list[var_name] is not None and var_name == name: diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index d2954c4c22069627bac5188cf1d485e50b68c8e4..c9a2f8a0abf9c811074e3fbadec0c61cb6dbf681 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -189,7 +189,6 @@ def Print(input, message="The content of some_layer: ") ''' helper = LayerHelper('print', **locals()) - out = helper.create_tmp_variable(dtype=helper.input_dtype()) helper.append_op( type='print', inputs={'In': input}, @@ -202,9 +201,7 @@ def Print(input, 'print_tensor_shape': print_tensor_shape, 'print_tensor_lod': print_tensor_lod, 'print_phase': print_phase.upper() - }, - outputs={'Out': out}) - return out + }) class BlockGuard(object): diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 72071478845df444ce72ce946787b2d0ce5f0d23..5757b2798e43dc70b406462a74b4f74eedcf56fa 100644 --- 
a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -39,6 +39,8 @@ __all__ = [ 'detection_map', 'rpn_target_assign', 'anchor_generator', + 'generate_proposal_labels', + 'generate_proposals', ] __auto__ = [ @@ -56,6 +58,7 @@ for _OP in set(__auto__): def rpn_target_assign(loc, scores, anchor_box, + anchor_var, gt_box, rpn_batch_size_per_im=256, fg_fraction=0.25, @@ -94,6 +97,8 @@ def rpn_target_assign(loc, if the input is image feature map, they are close to the origin of the coordinate system. [xmax, ymax] is the right bottom coordinate of the anchor box. + anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded + variances of anchors. gt_box (Variable): The ground-truth boudding boxes (bboxes) are a 2D LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth bboxes of mini-batch input. @@ -143,30 +148,29 @@ def rpn_target_assign(loc, # 1. Compute the regression target bboxes target_bbox = box_coder( prior_box=anchor_box, + prior_box_var=anchor_var, target_box=gt_box, code_type='encode_center_size', box_normalized=False) - # 2. Compute overlaps between the prior boxes and the gt boxes overlaps iou = iou_similarity(x=gt_box, y=anchor_box) - # 3. Assign target label to anchors loc_index = helper.create_tmp_variable(dtype=anchor_box.dtype) score_index = helper.create_tmp_variable(dtype=anchor_box.dtype) target_label = helper.create_tmp_variable(dtype=anchor_box.dtype) helper.append_op( type="rpn_target_assign", - inputs={'Overlap': iou, }, + inputs={'DistMat': iou}, outputs={ 'LocationIndex': loc_index, 'ScoreIndex': score_index, - 'TargetLabel': target_label, + 'TargetLabel': target_label }, attrs={ 'rpn_batch_size_per_im': rpn_batch_size_per_im, 'rpn_positive_overlap': rpn_positive_overlap, 'rpn_negative_overlap': rpn_negative_overlap, - 'fg_fraction': fg_fraction, + 'fg_fraction': fg_fraction }) # 4. 
Reshape and gather the target entry @@ -179,7 +183,7 @@ def rpn_target_assign(loc, predicted_location = nn.gather(loc, loc_index) target_label = nn.gather(target_label, score_index) target_bbox = nn.gather(target_bbox, loc_index) - return predicted_scores, predicted_loc, target_label, target_bbox + return predicted_scores, predicted_location, target_label, target_bbox def detection_output(loc, @@ -1253,3 +1257,131 @@ def anchor_generator(input, anchor.stop_gradient = True var.stop_gradient = True return anchor, var + + +def generate_proposal_labels(rpn_rois, + gt_classes, + gt_boxes, + im_scales, + batch_size_per_im=256, + fg_fraction=0.25, + fg_thresh=0.25, + bg_thresh_hi=0.5, + bg_thresh_lo=0.0, + bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], + class_nums=None): + """ + ** Generate proposal labels Faster-RCNN ** + TODO(buxingyuan): Add Document + """ + + helper = LayerHelper('generate_proposal_labels', **locals()) + + rois = helper.create_tmp_variable(dtype=rpn_rois.dtype) + labels_int32 = helper.create_tmp_variable(dtype=gt_classes.dtype) + bbox_targets = helper.create_tmp_variable(dtype=rpn_rois.dtype) + bbox_inside_weights = helper.create_tmp_variable(dtype=rpn_rois.dtype) + bbox_outside_weights = helper.create_tmp_variable(dtype=rpn_rois.dtype) + + helper.append_op( + type="generate_proposal_labels", + inputs={ + 'RpnRois': rpn_rois, + 'GtClasses': gt_classes, + 'GtBoxes': gt_boxes, + 'ImScales': im_scales + }, + outputs={ + 'Rois': rois, + 'LabelsInt32': labels_int32, + 'BboxTargets': bbox_targets, + 'BboxInsideWeights': bbox_inside_weights, + 'BboxOutsideWeights': bbox_outside_weights + }, + attrs={ + 'batch_size_per_im': batch_size_per_im, + 'fg_fraction': fg_fraction, + 'fg_thresh': fg_thresh, + 'bg_thresh_hi': bg_thresh_hi, + 'bg_thresh_lo': bg_thresh_lo, + 'bbox_reg_weights': bbox_reg_weights, + 'class_nums': class_nums + }) + + rois.stop_gradient = True + labels_int32.stop_gradient = True + bbox_targets.stop_gradient = True + bbox_inside_weights.stop_gradient 
= True + bbox_outside_weights.stop_gradient = True + + return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights + + +def generate_proposals(scores, + bbox_deltas, + im_info, + anchors, + variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + min_size=0.1, + eta=1.0, + name=None): + """ + ** Generate proposal labels Faster-RCNN ** + + This operation proposes RoIs according to each box with their probability to be a foreground object and + the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals + could be used to train detection net. + + For generating proposals, this operation performs following steps: + + 1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) + 2. Calculate box locations as proposals candidates. + 3. Clip boxes to image + 4. Remove predicted boxes with small area. + 5. Apply NMS to get final proposals as output. + + + Args: + scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. + N is batch size, A is number of anchors, H and W are height and width of the feature map. + bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location. + im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale + between origin image size and the size of feature map. + anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map, + num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. + variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format. 
+ pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default. + post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default. + nms_thresh(float): Threshold in NMS, 0.5 by default. + min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default. + eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration. + """ + helper = LayerHelper('generate_proposals', **locals()) + + rpn_rois = helper.create_tmp_variable(dtype=bbox_deltas.dtype) + rpn_roi_probs = helper.create_tmp_variable(dtype=scores.dtype) + helper.append_op( + type="generate_proposals", + inputs={ + 'Scores': scores, + 'BboxDeltas': bbox_deltas, + 'ImInfo': im_info, + 'Anchors': anchors, + 'Variances': variances + }, + attrs={ + 'pre_nms_topN': pre_nms_top_n, + 'post_nms_topN': post_nms_top_n, + 'nms_thresh': nms_thresh, + 'min_size': min_size, + 'eta': eta + }, + outputs={'RpnRois': rpn_rois, + 'RpnRoiProbs': rpn_roi_probs}) + rpn_rois.stop_gradient = True + rpn_roi_probs.stop_gradient = True + + return rpn_rois, rpn_roi_probs diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index b03ee514f50f9a8c1425bd5b1d409b58ed62351a..0cf7aaef4ab75ca6976465d1b404004a9f2f64c5 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -246,7 +246,11 @@ def Send(endpoints, send_vars, dummy_output=None, sync=True): rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC }) if sync: - helper.append_op(type="send_barrier", attrs={"endpoints": endpoints}) + helper.append_op( + type="send_barrier", + inputs={"X": dummy_output}, + outputs={"Out": []}, + attrs={"endpoints": endpoints}) def Recv(endpoints, get_vars, dummy_input=None, sync=True): @@ -282,7 +286,10 @@ def Recv(endpoints, get_vars, dummy_input=None, sync=True): attrs={"endpoints": endpoints, "epmap": epmap}) if sync: - 
helper.append_op(type="fetch_barrier", attrs={"endpoints": endpoints}) + helper.append_op( + type="fetch_barrier", + outputs={"Out": get_vars}, + attrs={"endpoints": endpoints}) return get_vars diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index 2c3bdd77e1fa1c86baa3a288caab4ad4324e2ef2..0182bbeb637ec7b6a341a4822a1cc5fb5aef077d 100644 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -119,10 +119,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1): helper = LayerHelper("auc", **locals()) auc_out = helper.create_tmp_variable(dtype="float64") # make tp, tn, fp, fn persistable, so that can accumulate all batches. - tp = helper.create_global_variable(persistable=True, dtype='int64') - tn = helper.create_global_variable(persistable=True, dtype='int64') - fp = helper.create_global_variable(persistable=True, dtype='int64') - fn = helper.create_global_variable(persistable=True, dtype='int64') + tp = helper.create_global_variable( + persistable=True, dtype='int64', shape=[num_thresholds]) + tn = helper.create_global_variable( + persistable=True, dtype='int64', shape=[num_thresholds]) + fp = helper.create_global_variable( + persistable=True, dtype='int64', shape=[num_thresholds]) + fn = helper.create_global_variable( + persistable=True, dtype='int64', shape=[num_thresholds]) for var in [tp, tn, fp, fn]: helper.set_variable_initializer( var, Constant( diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4bd260a00503c57b7f67b2706b4c25e43271c3f6..0ecfc958a3b89c85ef00574d630042d410c3fa0a 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -17,6 +17,7 @@ All layers just related to the neural network. 
from __future__ import print_function +import numpy as np from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable @@ -24,7 +25,6 @@ from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc from .tensor import concat from . import utils -import random from .. import unique_name from functools import reduce @@ -54,6 +54,7 @@ __all__ = [ 'conv2d_transpose', 'conv3d_transpose', 'sequence_expand', + 'sequence_pad', 'lstm_unit', 'reduce_sum', 'reduce_mean', @@ -84,9 +85,12 @@ __all__ = [ 'one_hot', 'autoincreased_step_counter', 'reshape', + 'squeeze', + 'unsqueeze', 'lod_reset', 'lrn', 'pad', + 'pad_constant_like', 'label_smooth', 'roi_pool', 'dice_loss', @@ -103,7 +107,10 @@ __all__ = [ 'rank_loss', 'prelu', 'flatten', + 'sequence_mask', 'stack', + 'pad2d', + 'unstack', ] @@ -2654,6 +2661,51 @@ def sequence_expand(x, y, ref_level=-1, name=None): return tmp +@templatedoc() +def sequence_pad(x, pad_value, maxlen=None): + """ + ${comment} + + Args: + x(Variable): Input variable which should contain lod information. + pad_value(Variable): The Variable that holds values that will be fill + into padded steps. It can be a scalar or a tensor whose shape + equals to time steps in sequences. If it's a scalar, it will be + automatically broadcasted to the shape of time step. + maxlen(int, default None): The length of padded sequences. It can be + None or any positive int. When it is None, all sequences will be + padded up to the length of the longest one among them; when it a + certain positive value, it must be greater than the length of the + longest original sequence." + + Returns: + Variable: The padded sequence batch. All sequences has the same length. + + Examples: + .. 
code-block:: python + + import numpy + + x = fluid.layers.data(name='y', shape=[10, 5], + dtype='float32', lod_level=1) + pad_value = fluid.layers.assign(input=numpy.array([0])) + out = fluid.layers.sequence_pad(x=x, pad_value=pad_value) + """ + + helper = LayerHelper('sequence_pad', input=x, **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + if maxlen is None: + maxlen = -1 + helper.append_op( + type='sequence_pad', + inputs={'X': x, + 'PadValue': pad_value}, + outputs={'Out': out}, + attrs={'padded_length': maxlen}) + return out + + def beam_search(pre_ids, pre_scores, ids, @@ -4483,6 +4535,89 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): return helper.append_activation(out) +def squeeze(input, axes, name=None): + """ + Remove single-dimensional entries from the shape of a tensor. Takes a + parameter axes with a list of axes to squeeze. If axes is not provided, all + the single dimensions will be removed from the shape. If an axis is + selected with shape entry not equal to one, an error is raised. + + Examples: + Case 1: + Given + X.shape = (1, 3, 1, 5) + and + axes = [0] + we get: + Out.shape = (3, 1, 5) + Case 2: + Given + X.shape = (1, 3, 1, 5) + and + axes = [] + we get: + Out.shape = (3, 5) + + Args: + input (Variable): The input variable to be squeezed. + axes (list): List of integers, indicating the dimensions to be squeezed. + name (str|None): Name for this layer. + + Returns: + Variable: Output squeezed variable. + + Examples: + .. code-block:: python + + x = layers.data(name='x', shape=[5, 1, 10]) + y = layers.sequeeze(input=x, axes=[1]) + """ + helper = LayerHelper("squeeze", **locals()) + out = helper.create_tmp_variable(dtype=input.dtype) + helper.append_op( + type="squeeze", + inputs={"X": input}, + attrs={"axes": axes}, + outputs={"Out": out}) + + return out + + +def unsqueeze(input, axes, name=None): + """ + Insert single-dimensional entries to the shape of a tensor. 
Takes one + required argument axes, a list of dimensions that will be inserted. + Dimension indices in axes are as seen in the output tensor. + + For example: + Given a tensor such that tensor with shape [3, 4, 5], + then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1]. + + Args: + input (Variable): The input variable to be unsqueezed. + axes (list): List of integers, indicating the dimensions to be inserted. + name (str|None): Name for this layer. + + Returns: + Variable: Output unsqueezed variable. + + Examples: + .. code-block:: python + + x = layers.data(name='x', shape=[5, 10]) + y = layers.unsequeeze(input=x, axes=[1]) + """ + helper = LayerHelper("unsqueeze", **locals()) + out = helper.create_tmp_variable(dtype=input.dtype) + helper.append_op( + type="unsqueeze", + inputs={"X": input}, + attrs={"axes": axes}, + outputs={"Out": out}) + + return out + + def lod_reset(x, y=None, target_lod=None): """ Set LoD of :attr:`x` to a new one specified by :attr:`y` or @@ -4707,6 +4842,86 @@ def pad(x, paddings, pad_value=0., name=None): return out +def pad_constant_like(x, y, pad_value=0., name=None): + """ + Pad input(Y) with :attr:`pad_value`, the number of values padded to + the edges of each axis is specified by the difference of the shape + of X and Y. ((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n)) + unique pad widths for each axis. The input should be a k-D + tensor(k > 0 and k < 7). + + See below for an example. + + .. 
code-block:: text + + Given: + X = [[[[ 0, 1, 2], + [ 3, 4, 5]], + [[ 6, 7, 8], + [ 9, 10, 11]], + [[12, 13, 14], + [15, 16, 17]]], + [[[18, 19, 20], + [21, 22, 23]], + [[24, 25, 26], + [27, 28, 29]], + [[30, 31, 32], + [33, 34, 35]]]] + X.shape = (2, 3, 2, 3) + + Y = [[[[35, 36, 37]], + [[38, 39, 40]], + [[41, 42, 43]]]] + Y.shape = (1, 3, 1, 3) + + And + pad_value = -1, + + Return: + Out = [[[[35, 36, 37], + [-1, -1, -1]], + [[38, 39, 40], + [-1, -1, -1]], + [[41, 42, 43], + [-1, -1, -1]]], + [[[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]]]] + Out.shape = (2, 3, 2, 3) + + Args: + x (Variable): The input tensor variable. + y (Variable): The input tensor variable. + pad_value (float): The constant value used to pad. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The padded tensor variable. + + Examples: + .. code-block:: python + + # x is a rank 4 tensor variable, x.shape = (2, 3, 2, 3) + # y is a rank 4 tensor variable, y.shape = (1, 3, 1, 3) + out = fluid.layers.pad_constant_like(x=x, y=y, pad_value=0.) 
+ # out is a rank 4 tensor variable, and out.shape = [2, 3 ,2 , 3] + """ + helper = LayerHelper('pad_constant_like', input=x, **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + helper.append_op( + type='pad_constant_like', + inputs={'X': x, + 'Y': y}, + outputs={'Out': out}, + attrs={'pad_value': float(pad_value)}) + return out + + def label_smooth(label, prior_dist=None, epsilon=0.1, @@ -5101,7 +5316,7 @@ def random_crop(x, shape, seed=None): dtype = x.dtype out = helper.create_tmp_variable(dtype) if seed is None: - seed = random.randint(-65536, 65535) + seed = np.random.randint(-65536, 65536) op_attrs = {"shape": shape} if isinstance(seed, int): op_attrs["startup_seed"] = seed @@ -5303,7 +5518,7 @@ def crop(x, shape=None, offsets=None, name=None): helper = LayerHelper('crop', **locals()) if not (isinstance(shape, list) or isinstance(shape, tuple) or \ - isinstance(shape, Variable)): + isinstance(shape, Variable)): raise ValueError("The shape should be a list, tuple or Variable.") if offsets is None: @@ -5400,6 +5615,94 @@ def rank_loss(label, left, right, name=None): return out +def pad2d(input, + paddings=[0, 0, 0, 0], + mode='constant', + pad_value=0.0, + data_format="NCHW", + name=None): + """ + Pad 2-d images accordding to 'paddings' and 'mode'. + If mode is 'reflect', paddings[0] and paddings[1] must be no greater + than height-1. And the width dimension has the same condition. 
+ + Example: + + Given that X is a channel of image from input: + + X = [[1, 2, 3], + [4, 5, 6]] + + Case 0: + + paddings = [0, 1, 2, 3], + mode = 'constant' + pad_value = 0 + + Out = [[0, 0, 1, 2, 3, 0, 0, 0] + [0, 0, 4, 5, 6, 0, 0, 0] + [0, 0, 0, 0, 0, 0, 0, 0]] + + Case 1: + + paddings = [0, 1, 2, 1], + mode = 'reflect' + + Out = [[3, 2, 1, 2, 3, 2] + [6, 5, 4, 5, 6, 5] + [3, 2, 1, 2, 3, 2]] + + Case 2: + + paddings = [0, 1, 2, 1], + mode = 'edge' + + Out = [[1, 1, 1, 2, 3, 3] + [4, 4, 4, 5, 6, 6] + [4, 4, 4, 5, 6, 6]] + + + Args: + input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format. + paddings (tuple|list): The padding size. If padding is a tuple, it must + contain four integers, (padding_top, padding_bottom, padding_left, padding_right). + Default: padding = [0, 0, 0, 0]. + mode (str): Three modes: constant(default), reflect, edge. Default: constant + pad_value (float32): The value to fill the padded areas in constant mode. Default: 0 + data_format (str): An optional string from: "NHWC", "NCHW". Specify the data format of + the input data. + Default: "NCHW" + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The tensor variable padded accordding to paddings and mode. + + + Examples: + .. 
code-block:: python + + data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') + result = fluid.layers.pad2d(input=data, padding=[1,2,3,4], mode='reflect') + """ + + helper = LayerHelper('pad2d', **locals()) + dtype = helper.input_dtype(input_param_name='input') + out = helper.create_tmp_variable(dtype) + helper.append_op( + type='pad2d', + inputs={'X': input}, + outputs={"Out": out}, + attrs={ + 'paddings': paddings, + 'mode': mode, + 'pad_value': pad_value, + 'data_frmat': data_format + }) + + return out + + def prelu(x, mode, param_attr=None, name=None): """ Equation: @@ -5414,7 +5717,7 @@ def prelu(x, mode, param_attr=None, name=None): all: all elements share same weight channel:elements in a channel share same weight element:each element has a weight - name(str|None): A name for this layer(optional). If set None, the layer + name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. Returns: @@ -5520,7 +5823,75 @@ def flatten(x, axis=1, name=None): return out +def sequence_mask(x, maxlen=None, dtype='int64', name=None): + """ + **SequenceMask Layer** + + This layer outputs a mask according to the input :code:`x` and + :code:`maxlen` with data type of :code:`dtype`. + + Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the + :code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where: + + .. math:: + + y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n)) + + Args: + x (Variable): Input tensor of sequence_mask layer, + whose elements are integers less than :code:`maxlen`. + maxlen (int|None): Maximum length of the sequence. If :code:`maxlen` + is None, it would be replace with :math:`max(x)`. + dtype (np.dtype|core.VarDesc.VarType|str): Data type of the output. + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The output sequence mask. 
+ + """ + + helper = LayerHelper('sequence_mask', **locals()) + if name is None: + out = helper.create_tmp_variable(dtype=dtype) + else: + out = helper.create_tmp_variable(dtype=dtype, name=name) + + helper.append_op( + type='sequence_mask', + inputs={'X': [x]}, + outputs={'Y': out}, + attrs={ + 'max_len': maxlen if maxlen is not None else -1, + 'out_dtype': out.dtype + }) + return out + + def stack(x, axis=0): + """ + **Stack Layer** + + This layer stacks all of the input :code:`x` along axis. + + Input :code:`x` can be a single variable, a :code:`list` of variables, + or a :code:`tuple` of variables. If :code:`x` is a :code:`list` or + :code:`tuple`, the shapes of all these variables must be the same. + Supposing the shape of each input is :math:`[d_0, d_1, ..., d_{n-1}]`, + the shape of the output variable would be + :math:`[d_0, d_1, ..., d_{axis}=len(x), ..., d_{n-1}]`. + If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x[0])+1`. + If :code:`axis` is None, it would be replaced with 0. + + Args: + x (Variable|list(Variable)|tuple(Variable)): Input variables. + axis (int|None): The axis along which all inputs are stacked. + + Returns: + Variable: The stacked variable. + + """ + helper = LayerHelper('stack', **locals()) axis = 0 if axis is None else axis @@ -5532,3 +5903,44 @@ def stack(x, axis=0): type='stack', inputs={'X': x}, outputs={'Y': out}, attrs={'axis': axis}) return out + + +def unstack(x, axis=0, num=None): + """ + **UnStack Layer** + + This layer unstacks input :code:`x` into several tensors along axis. + + If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`. + If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`, + and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is + raised. + + Args: + x (Variable): Input variable. + axis (int): The axis along which the input is unstacked. + num (int|None): The number of output variables. + + Returns: + list(Variable): The unstacked variables. 
+ + """ + + helper = LayerHelper('unstack', **locals()) + if num is None: + if axis is None or x.shape[axis] <= 0: + raise ValueError('unknown unstack number') + else: + num = x.shape[axis] + + outs = [] + for _ in num: + outs.append(helper.create_tmp_variable(x.dtype)) + + helper.append_op( + type='unstack', + inputs={'X': [x]}, + outputs={'Y': outs}, + attrs={'axis': axis, + 'num': num}) + return outs diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 7cd62efda8900c830f43d882a41ab03184ebe594..129252653dc139b7405626e6fd410704a4ad06d9 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -64,6 +64,7 @@ __all__ = [ 'logical_not', 'uniform_random_batch_size_like', 'gaussian_random', + 'sampling_id', 'gaussian_random_batch_size_like', 'sum', 'slice', diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 031ddd09a0b27b050b6ac651e4d8c46854092b2f..33d6311b9717c66f0d6782eb6b3e348cd4c02a69 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -15,7 +15,7 @@ from __future__ import print_function import re from collections import defaultdict -from paddle.fluid.framework import Program, Variable +from paddle.fluid.framework import Program, Variable, name_scope from . import framework from . 
import layers from .backward import append_backward @@ -46,10 +46,12 @@ class Optimizer(object): def __init__(self, learning_rate, regularization=None, - LARS_weight_decay=0.0): + LARS_weight_decay=0.0, + name=None): if not isinstance(learning_rate, float) and \ not isinstance(learning_rate, framework.Variable): raise TypeError("learning rate should be float or Variable") + self._name = name self.regularization = regularization self._learning_rate = learning_rate # the learning rate type should be inferenced from loss @@ -153,6 +155,8 @@ class Optimizer(object): dtype: data type of the accumulator variable fill_value: value to initialize the accumulator variable """ + if self._name is not None: + name = self._name + "_" + name if (name in self._accumulators and param.name in self._accumulators[name]): raise Exception("Accumulator {} already exists for parameter {}". @@ -181,6 +185,8 @@ class Optimizer(object): Returns: accumulator variable for the parameter """ + if self._name is not None: + name = self._name + "_" + name if (name not in self._accumulators or param.name not in self._accumulators[name]): raise Exception("Accumulator {} does not exist for parameter {}". 
@@ -231,7 +237,7 @@ class Optimizer(object): if param_and_grad[1] is None: continue with param_and_grad[0].block.program.optimized_guard( - param_and_grad): + param_and_grad), name_scope("optimizer"): if param_and_grad[0].trainable is True: optimize_op = self._append_optimize_op(loss.block, param_and_grad) diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py index 3951e7b8ca649b63eea4b311f6205a6c7d761804..a231bbfbc8d5712275c92b4d27580016825ea91b 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py @@ -125,8 +125,8 @@ opts = optimizer.minimize(avg_cost) batch_size = fluid.layers.create_tensor(dtype='int64') batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size) -# fluid.memory_optimize(fluid.default_main_program(), level=0) -fluid.release_memory(fluid.default_main_program()) +fluid.memory_optimize(fluid.default_main_program(), level=0) +# fluid.release_memory(fluid.default_main_program()) BATCH_SIZE = 16 PASS_NUM = 1 diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py index 1ad51936b5b8f7c5149452d6033754a570c72654..e520c8965089263d1ba10a6057acda1a53cc34a9 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py @@ -92,8 +92,8 @@ def main(): optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer.minimize(avg_cost) - # fluid.memory_optimize(fluid.default_main_program()) - fluid.release_memory(fluid.default_main_program()) + fluid.memory_optimize(fluid.default_main_program()) 
+ # fluid.release_memory(fluid.default_main_program()) # fix the order of training data train_data = paddle.batch( diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 1467e72caac26a3ea2a0c770d665141988696630..ec0bf3ff8d64345111537780aaa5367ed0e1f8ff 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -146,6 +146,55 @@ class TestAnchorGenerator(unittest.TestCase): assert anchor.shape[3] == 4 +class TestGenerateProposalLabels(unittest.TestCase): + def test_generate_proposal_labels(self): + rpn_rois = layers.data( + name='rpn_rois', + shape=[4, 4], + dtype='float32', + lod_level=1, + append_batch_size=False) + gt_classes = layers.data( + name='gt_classes', + shape=[6], + dtype='int32', + lod_level=1, + append_batch_size=False) + gt_boxes = layers.data( + name='gt_boxes', + shape=[6, 4], + dtype='float32', + lod_level=1, + append_batch_size=False) + im_scales = layers.data( + name='im_scales', + shape=[1], + dtype='float32', + lod_level=1, + append_batch_size=False) + class_nums = 5 + rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels( + rpn_rois=rpn_rois, + gt_classes=gt_classes, + gt_boxes=gt_boxes, + im_scales=im_scales, + batch_size_per_im=2, + fg_fraction=0.5, + fg_thresh=0.5, + bg_thresh_hi=0.5, + bg_thresh_lo=0.0, + bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], + class_nums=class_nums) + assert rois.shape[1] == 4 + assert rois.shape[0] == labels_int32.shape[0] + assert rois.shape[0] == bbox_targets.shape[0] + assert rois.shape[0] == bbox_inside_weights.shape[0] + assert rois.shape[0] == bbox_outside_weights.shape[0] + assert bbox_targets.shape[1] == 4 * class_nums + assert bbox_inside_weights.shape[1] == 4 * class_nums + assert bbox_outside_weights.shape[1] == 4 * class_nums + + class TestMultiBoxHead(unittest.TestCase): def test_multi_box_head(self): data_shape = [3, 224, 224] @@ -201,5 
+250,97 @@ class TestDetectionMAP(unittest.TestCase): print(str(program)) +class TestRpnTargetAssign(unittest.TestCase): + def test_rpn_target_assign(self): + program = Program() + with program_guard(program): + loc_shape = [10, 50, 4] + score_shape = [10, 50, 2] + anchor_shape = [50, 4] + + loc = layers.data( + name='loc', + shape=loc_shape, + append_batch_size=False, + dtype='float32') + scores = layers.data( + name='scores', + shape=score_shape, + append_batch_size=False, + dtype='float32') + anchor_box = layers.data( + name='anchor_box', + shape=anchor_shape, + append_batch_size=False, + dtype='float32') + anchor_var = layers.data( + name='anchor_var', + shape=anchor_shape, + append_batch_size=False, + dtype='float32') + gt_box = layers.data( + name='gt_box', shape=[4], lod_level=1, dtype='float32') + + predicted_scores, predicted_location, target_label, target_bbox = layers.rpn_target_assign( + loc=loc, + scores=scores, + anchor_box=anchor_box, + anchor_var=anchor_var, + gt_box=gt_box, + rpn_batch_size_per_im=256, + fg_fraction=0.25, + rpn_positive_overlap=0.7, + rpn_negative_overlap=0.3) + + self.assertIsNotNone(predicted_scores) + self.assertIsNotNone(predicted_location) + self.assertIsNotNone(target_label) + self.assertIsNotNone(target_bbox) + assert predicted_scores.shape[1] == 2 + assert predicted_location.shape[1] == 4 + assert predicted_location.shape[1] == target_bbox.shape[1] + + print(str(program)) + + +class TestGenerateProposals(unittest.TestCase): + def test_generate_proposals(self): + data_shape = [20, 64, 64] + images = fluid.layers.data( + name='images', shape=data_shape, dtype='float32') + im_info = fluid.layers.data( + name='im_info', shape=[1, 3], dtype='float32') + anchors, variances = fluid.layers.anchor_generator( + name='anchor_generator', + input=images, + anchor_sizes=[32, 64], + aspect_ratios=[1.0], + variance=[0.1, 0.1, 0.2, 0.2], + stride=[16.0, 16.0], + offset=0.5) + num_anchors = anchors.shape[2] + scores = fluid.layers.data( + 
name='scores', shape=[1, num_anchors, 8, 8], dtype='float32') + bbox_deltas = fluid.layers.data( + name='bbox_deltas', + shape=[1, num_anchors * 4, 8, 8], + dtype='float32') + rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals( + name='generate_proposals', + scores=scores, + bbox_deltas=bbox_deltas, + im_info=im_info, + anchors=anchors, + variances=variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + min_size=0.1, + eta=1.0) + self.assertIsNotNone(rpn_rois) + self.assertIsNotNone(rpn_roi_probs) + print(rpn_rois.shape) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index e7dd85ef5c3641be04261dc5d4166fa8452b4200..8ac1cb164e158cf38d1c0570f5bf37ee6a6badae 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -64,6 +64,7 @@ if(WITH_DISTRIBUTE) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) +set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150) py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index 0387e911880256ea6b8efb6f2311bbf4c4f8c0f2..a4ffe7d40c40501ebd43fec0b664159227ea34bd 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -134,7 +134,7 @@ class SE_ResNeXt(): size=class_dim, act='softmax', param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.2))) + 
initializer=fluid.initializer.Constant(value=0.05))) return out def shortcut(self, input, ch_out, stride): @@ -184,7 +184,7 @@ class SE_ResNeXt(): act=None, # avoid pserver CPU init differs from GPU param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.2)), + initializer=fluid.initializer.Constant(value=0.05)), bias_attr=False) return fluid.layers.batch_norm(input=conv, act=act) @@ -192,13 +192,19 @@ class SE_ResNeXt(): pool = fluid.layers.pool2d( input=input, pool_size=0, pool_type='avg', global_pooling=True) stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) - squeeze = fluid.layers.fc(input=pool, - size=num_channels // reduction_ratio, - act='relu') + squeeze = fluid.layers.fc( + input=pool, + size=num_channels // reduction_ratio, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.05)), + act='relu') stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0) - excitation = fluid.layers.fc(input=squeeze, - size=num_channels, - act='sigmoid') + excitation = fluid.layers.fc( + input=squeeze, + size=num_channels, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.05)), + act='sigmoid') scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) return scale diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 239adcb9d5900d4073a6c07cb189ab7503aea86e..7abfa0a4be0dec9fe251704e22dfef1f932e7c5b 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -18,54 +18,129 @@ import numpy as np import argparse import time import math +import os +import sys +import six +import argparse +import ast +import multiprocessing +import time +from functools import partial +from os.path import expanduser +import glob +import random +import tarfile import paddle import paddle.fluid as fluid +import paddle.fluid.layers as layers from paddle.fluid import core -import os 
-import sys -import six -import transformer_model -import paddle.dataset.wmt16 as wmt16 +from test_dist_base import TestDistRunnerBase, runtime_main +from paddle.compat import long_type + +import hashlib + +from paddle.fluid.transpiler.details import program_to_code + +const_para_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(0.001)) +const_bias_attr = const_para_attr # Fix seed for test fluid.default_startup_program().random_seed = 1 fluid.default_main_program().random_seed = 1 -WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio" +#from transformer_config import ModelHyperParams, TrainTaskConfig, merge_cfg_from_list +class TrainTaskConfig(object): + # only support GPU currently + use_gpu = True + # the epoch number to train. + pass_num = 1 + # the number of sequences contained in a mini-batch. + # deprecated, set batch_size in args. + batch_size = 20 + # the hyper parameters for Adam optimizer. + # This static learning_rate will be multiplied to the LearningRateScheduler + # derived learning rate the to get the final learning rate. + learning_rate = 1 + beta1 = 0.9 + beta2 = 0.98 + eps = 1e-9 + # the parameters for learning rate scheduling. + warmup_steps = 4000 + # the weight used to mix up the ground-truth distribution and the fixed + # uniform distribution in label smoothing when training. + # Set this as zero if label smoothing is not wanted. + label_smooth_eps = 0.1 + # the directory for saving trained models. + model_dir = "trained_models" + # the directory for saving checkpoints. + ckpt_dir = "trained_ckpts" + # the directory for loading checkpoint. + # If provided, continue training from the checkpoint. + ckpt_path = None + # the parameter to initialize the learning rate scheduler. + # It should be provided if use checkpoints, since the checkpoint doesn't + # include the training step counter currently. + start_step = 0 -class ModelHyperParams(object): - # Dictionary size for source and target language. 
This model directly uses - # paddle.dataset.wmt16 in which , and token has - # alreay been added, but the token is not added. Transformer requires - # sequences in a mini-batch are padded to have the same length. A token is - # added into the original dictionary in paddle.dateset.wmt16. + check_acc = True - # size of source word dictionary. - src_vocab_size = 10000 - # index for token in source language. - src_pad_idx = src_vocab_size + data_path = expanduser("~") + ( + "/.cache/paddle/dataset/test_dist_transformer/") + src_vocab_fpath = data_path + "vocab.bpe.32000" + trg_vocab_fpath = data_path + "vocab.bpe.32000" + train_file_pattern = data_path + "train.tok.clean.bpe.32000.en-de" + val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de" + pool_size = 2000 + sort_type = None + local = True + shuffle = False + shuffle_batch = False + special_token = ['', '', ''] + token_delimiter = ' ' + use_token_batch = False - # size of target word dictionay - trg_vocab_size = 10000 - # index for token in target language. - trg_pad_idx = trg_vocab_size - # position value corresponding to the token. - pos_pad_idx = 0 +class InferTaskConfig(object): + use_gpu = True + # the number of examples in one run for sequence generation. + batch_size = 10 + # the parameters for beam search. + beam_size = 5 + max_out_len = 256 + # the number of decoded sentences to output. + n_best = 1 + # the flags indicating whether to output the special tokens. + output_bos = False + output_eos = False + output_unk = True + # the directory for loading the trained model. + model_path = "trained_models/pass_1.infer.model" - # max length of sequences. It should plus 1 to include position - # padding token for position encoding. - max_length = 50 +class ModelHyperParams(object): + # These following five vocabularies related configurations will be set + # automatically according to the passed vocabulary path and special tokens. + # size of source word dictionary. 
+ src_vocab_size = 10000 + # size of target word dictionay + trg_vocab_size = 10000 + # index for token + bos_idx = 0 + # index for token + eos_idx = 1 + # index for token + unk_idx = 2 + # max length of sequences deciding the size of position encoding table. + # Start from 1 and count start and end tokens in. + max_length = 256 # the dimension for word embeddings, which is also the last dimension of # the input and output of multi-head attention, position-wise feed-forward # networks, encoder and decoder. - d_model = 512 # size of the hidden layer in position-wise feed-forward networks. - d_inner_hid = 1024 + d_inner_hid = 2048 # the dimension that keys are projected to for dot-product attention. d_key = 64 # the dimension that values are projected to for dot-product attention. @@ -75,212 +150,1577 @@ class ModelHyperParams(object): # number of sub-layers to be stacked in the encoder and decoder. n_layer = 6 # dropout rate used by all dropout layers. - dropout = 0.1 + dropout = 0.0 # no random + # random seed used in dropout for CE. + dropout_seed = None + # the flag indicating whether to share embedding and softmax weights. + # vocabularies in source and target should be same for weight sharing. + weight_sharing = True + + +def merge_cfg_from_list(cfg_list, g_cfgs): + """ + Set the above global configurations using the cfg_list. + """ + assert len(cfg_list) % 2 == 0 + for key, value in zip(cfg_list[0::2], cfg_list[1::2]): + for g_cfg in g_cfgs: + if hasattr(g_cfg, key): + try: + value = eval(value) + except Exception: # for file path + pass + setattr(g_cfg, key, value) + break + + +# The placeholder for batch_size in compile time. Must be -1 currently to be +# consistent with some ops' infer-shape output in compile time, such as the +# sequence_expand op used in beamsearch decoder. +batch_size = -1 +# The placeholder for squence length in compile time. +seq_len = ModelHyperParams.max_length +# Here list the data shapes and data types of all inputs. 
+# The shapes here act as placeholder and are set to pass the infer-shape in +# compile time. +input_descs = { + # The actual data shape of src_word is: + # [batch_size * max_src_len_in_batch, 1] + "src_word": [(batch_size, seq_len, long_type(1)), "int64", 2], + # The actual data shape of src_pos is: + # [batch_size * max_src_len_in_batch, 1] + "src_pos": [(batch_size, seq_len, long_type(1)), "int64"], + # This input is used to remove attention weights on paddings in the + # encoder. + # The actual data shape of src_slf_attn_bias is: + # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch] + "src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, + seq_len), "float32"], + # The actual data shape of trg_word is: + # [batch_size * max_trg_len_in_batch, 1] + "trg_word": [(batch_size, seq_len, long_type(1)), "int64", + 2], # lod_level is only used in fast decoder. + # The actual data shape of trg_pos is: + # [batch_size * max_trg_len_in_batch, 1] + "trg_pos": [(batch_size, seq_len, long_type(1)), "int64"], + # This input is used to remove attention weights on paddings and + # subsequent words in the decoder. + # The actual data shape of trg_slf_attn_bias is: + # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch] + "trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, + seq_len), "float32"], + # This input is used to remove attention weights on paddings of the source + # input in the encoder-decoder attention. + # The actual data shape of trg_src_attn_bias is: + # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch] + "trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, + seq_len), "float32"], + # This input is used in independent decoder program for inference. 
+ # The actual data shape of enc_output is: + # [batch_size, max_src_len_in_batch, d_model] + "enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"], + # The actual data shape of label_word is: + # [batch_size * max_trg_len_in_batch, 1] + "lbl_word": [(batch_size * seq_len, long_type(1)), "int64"], + # This input is used to mask out the loss of paddding tokens. + # The actual data shape of label_weight is: + # [batch_size * max_trg_len_in_batch, 1] + "lbl_weight": [(batch_size * seq_len, long_type(1)), "float32"], + # These inputs are used to change the shape tensor in beam-search decoder. + "trg_slf_attn_pre_softmax_shape_delta": [(long_type(2), ), "int32"], + "trg_slf_attn_post_softmax_shape_delta": [(long_type(4), ), "int32"], + "init_score": [(batch_size, long_type(1)), "float32"], +} + +# Names of word embedding table which might be reused for weight sharing. +word_emb_param_names = ( + "src_word_emb_table", + "trg_word_emb_table", ) +# Names of position encoding table which will be initialized externally. +pos_enc_param_names = ( + "src_pos_enc_table", + "trg_pos_enc_table", ) +# separated inputs for different usages. +encoder_data_input_fields = ( + "src_word", + "src_pos", + "src_slf_attn_bias", ) +decoder_data_input_fields = ( + "trg_word", + "trg_pos", + "trg_slf_attn_bias", + "trg_src_attn_bias", + "enc_output", ) +label_data_input_fields = ( + "lbl_word", + "lbl_weight", ) +# In fast decoder, trg_pos (only containing the current time step) is generated +# by ops and trg_slf_attn_bias is not needed. +fast_decoder_data_input_fields = ( + "trg_word", + "init_score", + "trg_src_attn_bias", ) + +# fast_decoder_util_input_fields = ( +# "trg_slf_attn_pre_softmax_shape_delta", +# "trg_slf_attn_post_softmax_shape_delta", ) + +#from optim import LearningRateScheduler +class LearningRateScheduler(object): + """ + Wrapper for learning rate scheduling as described in the Transformer paper. 
+ LearningRateScheduler adapts the learning rate externally and the adapted + learning rate will be feeded into the main_program as input data. + """ + + def __init__(self, + d_model, + warmup_steps, + learning_rate=0.001, + current_steps=0, + name="learning_rate"): + self.current_steps = current_steps + self.warmup_steps = warmup_steps + self.d_model = d_model + self.static_lr = learning_rate + self.learning_rate = layers.create_global_var( + name=name, + shape=[1], + value=float(learning_rate), + dtype="float32", + persistable=True) + + def update_learning_rate(self): + self.current_steps += 1 + lr_value = np.power(self.d_model, -0.5) * np.min([ + np.power(self.current_steps, -0.5), + np.power(self.warmup_steps, -1.5) * self.current_steps + ]) * self.static_lr + return np.array([lr_value], dtype="float32") -def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head): + +#from transformer_train import train_loop +def pad_batch_data(insts, + pad_idx, + n_head, + is_target=False, + is_label=False, + return_attn_bias=True, + return_max_len=True, + return_num_token=False): """ Pad the instances to the max sequence length in batch, and generate the - corresponding position data and attention bias. Then, convert the numpy - data to tensors and return a dict mapping names to tensors. + corresponding position data and attention bias. """ + return_list = [] + max_len = max(len(inst) for inst in insts) + num_token = reduce(lambda x, y: x + y, + [len(inst) for inst in insts]) if return_num_token else 0 + # Any token included in dict can be used to pad, since the paddings' loss + # will be masked out by weights and make no effect on parameter gradients. + inst_data = np.array( + [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) + return_list += [inst_data.astype("int64").reshape([-1, 1])] + if is_label: # label weight + inst_weight = np.array( + [[1.] * len(inst) + [0.] 
* (max_len - len(inst)) for inst in insts]) + return_list += [inst_weight.astype("float32").reshape([-1, 1])] + else: # position data + inst_pos = np.array([ + range(1, len(inst) + 1) + [0] * (max_len - len(inst)) + for inst in insts + ]) + return_list += [inst_pos.astype("int64").reshape([-1, 1])] + if return_attn_bias: + if is_target: + # This is used to avoid attention on paddings and subsequent + # words. + slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, max_len)) + slf_attn_bias_data = np.triu(slf_attn_bias_data, + 1).reshape([-1, 1, max_len, max_len]) + slf_attn_bias_data = np.tile(slf_attn_bias_data, + [1, n_head, 1, 1]) * [-1e9] + else: + # This is used to avoid attention on paddings. + slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * + (max_len - len(inst)) + for inst in insts]) + slf_attn_bias_data = np.tile( + slf_attn_bias_data.reshape([-1, 1, 1, max_len]), + [1, n_head, max_len, 1]) + return_list += [slf_attn_bias_data.astype("float32")] + if return_max_len: + return_list += [max_len] + if return_num_token: + return_list += [num_token] + return return_list if len(return_list) > 1 else return_list[0] + + +def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx, + n_head, d_model): + """ + Put all padded data needed by training into a dict. 
+ """ + src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data( + [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False) + src_word = src_word.reshape(-1, src_max_len, 1) + src_pos = src_pos.reshape(-1, src_max_len, 1) + trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data( + [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) + trg_word = trg_word.reshape(-1, trg_max_len, 1) + trg_pos = trg_pos.reshape(-1, trg_max_len, 1) - def __pad_batch_data(insts, - pad_idx, - is_target=False, - return_pos=True, - return_attn_bias=True, - return_max_len=True): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and attention bias. - """ - return_list = [] - max_len = max(len(inst) for inst in insts) - inst_data = np.array( - [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) - return_list += [inst_data.astype("int64").reshape([-1, 1])] - if return_pos: - inst_pos = np.array([[ - pos_i + 1 if w_i != pad_idx else 0 - for pos_i, w_i in enumerate(inst) - ] for inst in inst_data]) - - return_list += [inst_pos.astype("int64").reshape([-1, 1])] - if return_attn_bias: - if is_target: - # This is used to avoid attention on paddings and subsequent - # words. - slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, - max_len)) - slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape( - [-1, 1, max_len, max_len]) - slf_attn_bias_data = np.tile(slf_attn_bias_data, - [1, n_head, 1, 1]) * [-1e9] - else: - # This is used to avoid attention on paddings. 
- slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * - (max_len - len(inst)) - for inst in insts]) - slf_attn_bias_data = np.tile( - slf_attn_bias_data.reshape([-1, 1, 1, max_len]), - [1, n_head, max_len, 1]) - return_list += [slf_attn_bias_data.astype("float32")] - if return_max_len: - return_list += [max_len] - return return_list if len(return_list) > 1 else return_list[0] - - src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data( - [inst[0] for inst in insts], src_pad_idx, is_target=False) - trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data( - [inst[1] for inst in insts], trg_pad_idx, is_target=True) trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], [1, 1, trg_max_len, 1]).astype("float32") - lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False, - False, False, False) - lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) + lbl_word, lbl_weight, num_token = pad_batch_data( + [inst[2] for inst in insts], + trg_pad_idx, + n_head, + is_target=False, + is_label=True, + return_attn_bias=False, + return_max_len=False, + return_num_token=True) + + data_input_dict = dict( + zip(data_input_names, [ + src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + ])) + return data_input_dict, np.asarray([num_token], dtype="float32") + + +def read_multiple(reader, count, clip_last=True): + """ + Stack data from reader for multi-devices. + """ + + def __impl__(): + res = [] + for item in reader(): + res.append(item) + if len(res) == count: + yield res + res = [] + if len(res) == count: + yield res + elif not clip_last: + data = [] + for item in res: + data += item + if len(data) > count: + inst_num_per_part = len(data) // count + yield [ + data[inst_num_per_part * i:inst_num_per_part * (i + 1)] + for i in range(count) + ] + + return __impl__ + + +def split_data(data, num_part): + """ + Split data for each device. 
+ """ + if len(data) == num_part: + return data + data = data[0] + inst_num_per_part = len(data) // num_part return [ - src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, - trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + data[inst_num_per_part * i:inst_num_per_part * (i + 1)] + for i in range(num_part) ] -def transformer(use_feed): - assert not use_feed, "transfomer doesn't support feed yet" - return transformer_model.transformer( - ModelHyperParams.src_vocab_size + 1, - ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, - ModelHyperParams.n_layer, ModelHyperParams.n_head, - ModelHyperParams.d_key, ModelHyperParams.d_value, - ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, - ModelHyperParams.dropout, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx) - - -def get_model(): - avg_cost = transformer(use_feed=False) - optimizer = fluid.optimizer.Adam() - optimizer.minimize(avg_cost) - fluid.memory_optimize(fluid.default_main_program()) - return avg_cost - - -def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): - t = fluid.DistributeTranspiler() - t.transpile( - trainer_id=trainer_id, - program=main_program, - pservers=pserver_endpoints, - trainers=trainers) - return t - - -class DistTransformer2x2(object): - def run_pserver(self, pserver_endpoints, trainers, current_endpoint, - trainer_id): - get_model() - t = get_transpiler(trainer_id, - fluid.default_main_program(), pserver_endpoints, - trainers) - pserver_prog = t.get_pserver_program(current_endpoint) - startup_prog = t.get_startup_program(current_endpoint, pserver_prog) +def test_context(train_progm, avg_cost, train_exe, dev_count, data_input_names, + sum_cost, token_num): + # Context to do validation. 
+ test_program = train_progm.clone() + with fluid.program_guard(test_program): + test_program = fluid.io.get_inference_program([avg_cost]) + + val_data = DataReader( + src_vocab_fpath=TrainTaskConfig.src_vocab_fpath, + trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath, + fpattern=TrainTaskConfig.val_file_pattern, + token_delimiter=TrainTaskConfig.token_delimiter, + use_token_batch=TrainTaskConfig.use_token_batch, + batch_size=TrainTaskConfig.batch_size * + (1 if TrainTaskConfig.use_token_batch else dev_count), + pool_size=TrainTaskConfig.pool_size, + sort_type=TrainTaskConfig.sort_type, + start_mark=TrainTaskConfig.special_token[0], + end_mark=TrainTaskConfig.special_token[1], + unk_mark=TrainTaskConfig.special_token[2], + # count start and end tokens out + max_length=ModelHyperParams.max_length - 2, + clip_last_batch=False, + shuffle=False, + shuffle_batch=False) + + build_strategy = fluid.BuildStrategy() + + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + + test_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + main_program=test_program, + share_vars_from=train_exe, + build_strategy=build_strategy, + exec_strategy=strategy) + + def test(exe=test_exe): + test_total_cost = 0 + test_total_token = 0 + test_data = read_multiple( + reader=val_data.batch_generator, + count=dev_count if TrainTaskConfig.use_token_batch else 1) + for batch_id, data in enumerate(test_data()): + feed_list = [] + for place_id, data_buffer in enumerate( + split_data( + data, num_part=dev_count)): + data_input_dict, _ = prepare_batch_input( + data_buffer, data_input_names, ModelHyperParams.eos_idx, + ModelHyperParams.eos_idx, ModelHyperParams.n_head, + ModelHyperParams.d_model) + feed_list.append(data_input_dict) + + outs = exe.run(feed=feed_list, + fetch_list=[sum_cost.name, token_num.name]) + sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) + test_total_cost += sum_cost_val.sum() + test_total_token += token_num_val.sum() + test_avg_cost = 
test_total_cost / test_total_token + test_ppl = np.exp([min(test_avg_cost, 100)]) + return test_avg_cost, test_ppl + + return test + + +def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, + token_num, predict): + # Initialize the parameters. + if TrainTaskConfig.ckpt_path: + lr_scheduler.current_steps = TrainTaskConfig.start_step + else: + exe.run(fluid.framework.default_startup_program()) + + train_data = DataReader( + src_vocab_fpath=TrainTaskConfig.src_vocab_fpath, + trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath, + fpattern=TrainTaskConfig.train_file_pattern, + token_delimiter=TrainTaskConfig.token_delimiter, + use_token_batch=TrainTaskConfig.use_token_batch, + batch_size=TrainTaskConfig.batch_size * + (1 if TrainTaskConfig.use_token_batch else dev_count), + pool_size=TrainTaskConfig.pool_size, + sort_type=TrainTaskConfig.sort_type, + shuffle=TrainTaskConfig.shuffle, + shuffle_batch=TrainTaskConfig.shuffle_batch, + start_mark=TrainTaskConfig.special_token[0], + end_mark=TrainTaskConfig.special_token[1], + unk_mark=TrainTaskConfig.special_token[2], + # count start and end tokens out + max_length=ModelHyperParams.max_length - 2, + clip_last_batch=False) + train_data = read_multiple( + reader=train_data.batch_generator, + count=dev_count if TrainTaskConfig.use_token_batch else 1) + + build_strategy = fluid.BuildStrategy() + # Since the token number differs among devices, customize gradient scale to + # use token average cost among multi-devices. and the gradient scale is + # `1 / token_number` for average cost. 
+ build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized + + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + + train_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + loss_name=sum_cost.name, + main_program=train_progm, + build_strategy=build_strategy, + exec_strategy=strategy) + + data_input_names = encoder_data_input_fields + decoder_data_input_fields[: + -1] + label_data_input_fields + + if TrainTaskConfig.val_file_pattern is not None: + test = test_context(train_progm, avg_cost, train_exe, dev_count, + data_input_names, sum_cost, token_num) + + # the best cross-entropy value with label smoothing + loss_normalizer = -((1. - TrainTaskConfig.label_smooth_eps) * np.log( + (1. - TrainTaskConfig.label_smooth_eps + )) + TrainTaskConfig.label_smooth_eps * + np.log(TrainTaskConfig.label_smooth_eps / ( + ModelHyperParams.trg_vocab_size - 1) + 1e-20)) + init = False + for pass_id in xrange(TrainTaskConfig.pass_num): + pass_start_time = time.time() + for batch_id, data in enumerate(train_data()): + if batch_id >= 5: + break + + feed_list = [] + total_num_token = 0 + + #if TrainTaskConfig.local: + # lr_rate = lr_scheduler.update_learning_rate() + #for place_id, data_buffer in enumerate( + # split_data( + # data, num_part=dev_count)): + + if TrainTaskConfig.local: + lr_rate = lr_scheduler.update_learning_rate() + + for place_id, data_buffer in enumerate( + split_data( + data, num_part=dev_count)): + data_input_dict, num_token = prepare_batch_input( + data_buffer, data_input_names, ModelHyperParams.eos_idx, + ModelHyperParams.eos_idx, ModelHyperParams.n_head, + ModelHyperParams.d_model) + total_num_token += num_token + feed_kv_pairs = data_input_dict.items() + if TrainTaskConfig.local: + feed_kv_pairs += { + lr_scheduler.learning_rate.name: lr_rate + }.items() + feed_list.append(dict(feed_kv_pairs)) + + if not init: + for pos_enc_param_name in pos_enc_param_names: + pos_enc = position_encoding_init( + 
class TokenBatchCreator(object):
    """
    Group samples into batches bounded by a padded-token budget.

    The cost of a batch is estimated as ``longest sample * number of
    samples``, i.e. the token count once every sample is padded to the
    longest one. A sample that would push this cost above the budget closes
    the pending batch and becomes the first sample of the next one.
    """

    def __init__(self, batch_size):
        """
        :param batch_size: The maximum number of tokens (paddings included)
            allowed in one batch.
        :type batch_size: int
        """
        # Samples collected for the batch currently being built.
        self.batch = []
        # Longest sample of the pending batch; -1 until a sample arrives.
        self.max_len = -1
        self._batch_size = batch_size

    def append(self, info):
        """
        Add a sample to the pending batch, flushing the batch if necessary.

        :param info: The sample to add; only its ``max_len`` attribute is
            read.
        :return: The finished batch (a non-empty list of samples) when
            ``info`` did not fit into it, otherwise None.
        """
        cur_len = info.max_len
        max_len = max(self.max_len, cur_len)
        if max_len * (len(self.batch) + 1) > self._batch_size:
            result = self.batch
            self.batch = [info]
            self.max_len = cur_len
            # A sample that exceeds the budget on its own arrives while the
            # pending batch is still empty. Handing out that empty list would
            # make callers (which only test ``is not None``) emit a batch
            # without any sample, so report "nothing finished" instead.
            return result if result else None
        self.max_len = max_len
        self.batch.append(info)
        return None
+ :type batch_size: int + :param pool_size: The size of pool buffer. + :type pool_size: int + :param sort_type: The grain to sort by length: 'global' for all + instances; 'pool' for instances in pool; 'none' for no sort. + :type sort_type: basestring + :param clip_last_batch: Whether to clip the last uncompleted batch. + :type clip_last_batch: bool + :param tar_fname: The data file in tar if fpattern matches a tar file. + :type tar_fname: basestring + :param min_length: The minimum length used to filt sequences. + :type min_length: int + :param max_length: The maximum length used to filt sequences. + :type max_length: int + :param shuffle: Whether to shuffle all instances. + :type shuffle: bool + :param shuffle_batch: Whether to shuffle the generated batches. + :type shuffle_batch: bool + :param use_token_batch: Whether to produce batch data according to + token number. + :type use_token_batch: bool + :param field_delimiter: The delimiter used to split source and target in + each line of data file. + :type field_delimiter: basestring + :param token_delimiter: The delimiter used to split tokens in source or + target sentences. + :type token_delimiter: basestring + :param start_mark: The token representing for the beginning of + sentences in dictionary. + :type start_mark: basestring + :param end_mark: The token representing for the end of sentences + in dictionary. + :type end_mark: basestring + :param unk_mark: The token representing for unknown word in dictionary. + :type unk_mark: basestring + :param seed: The seed for random. 
+ :type seed: int + """ + + def __init__(self, + src_vocab_fpath, + trg_vocab_fpath, + fpattern, + batch_size, + pool_size, + sort_type=SortType.GLOBAL, + clip_last_batch=True, + tar_fname=None, + min_length=0, + max_length=100, + shuffle=True, + shuffle_batch=False, + use_token_batch=False, + field_delimiter="\t", + token_delimiter=" ", + start_mark="", + end_mark="", + unk_mark="", + seed=0): + self._src_vocab = self.load_dict(src_vocab_fpath) + self._only_src = True + if trg_vocab_fpath is not None: + self._trg_vocab = self.load_dict(trg_vocab_fpath) + self._only_src = False + self._pool_size = pool_size + self._batch_size = batch_size + self._use_token_batch = use_token_batch + self._sort_type = sort_type + self._clip_last_batch = clip_last_batch + self._shuffle = shuffle + self._shuffle_batch = shuffle_batch + self._min_length = min_length + self._max_length = max_length + self._field_delimiter = field_delimiter + self._token_delimiter = token_delimiter + self.load_src_trg_ids(end_mark, fpattern, start_mark, tar_fname, + unk_mark) + self._random = random.Random(x=seed) + + def load_src_trg_ids(self, end_mark, fpattern, start_mark, tar_fname, + unk_mark): + converters = [ + Converter( + vocab=self._src_vocab, + beg=self._src_vocab[start_mark], + end=self._src_vocab[end_mark], + unk=self._src_vocab[unk_mark], + delimiter=self._token_delimiter) + ] + if not self._only_src: + converters.append( + Converter( + vocab=self._trg_vocab, + beg=self._trg_vocab[start_mark], + end=self._trg_vocab[end_mark], + unk=self._trg_vocab[unk_mark], + delimiter=self._token_delimiter)) + + converters = ComposedConverter(converters) + + self._src_seq_ids = [] + self._trg_seq_ids = None if self._only_src else [] + self._sample_infos = [] + + for i, line in enumerate(self._load_lines(fpattern, tar_fname)): + src_trg_ids = converters(line) + self._src_seq_ids.append(src_trg_ids[0]) + lens = [len(src_trg_ids[0])] + if not self._only_src: + self._trg_seq_ids.append(src_trg_ids[1]) + 
lens.append(len(src_trg_ids[1])) + self._sample_infos.append(SampleInfo(i, max(lens), min(lens))) + + def _load_lines(self, fpattern, tar_fname): + fpaths = glob.glob(fpattern) + + if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]): + if tar_fname is None: + raise Exception("If tar file provided, please set tar_fname.") + + f = tarfile.open(fpaths[0], "r") + for line in f.extractfile(tar_fname): + fields = line.strip("\n").split(self._field_delimiter) + if (not self._only_src and len(fields) == 2) or ( + self._only_src and len(fields) == 1): + yield fields + else: + for fpath in fpaths: + if not os.path.isfile(fpath): + raise IOError("Invalid file: %s" % fpath) + + with open(fpath, "r") as f: + for line in f: + fields = line.strip("\n").split(self._field_delimiter) + if (not self._only_src and len(fields) == 2) or ( + self._only_src and len(fields) == 1): + yield fields + + @staticmethod + def load_dict(dict_path, reverse=False): + word_dict = {} + with open(dict_path, "r") as fdict: + for idx, line in enumerate(fdict): + if reverse: + word_dict[idx] = line.strip("\n") + else: + word_dict[line.strip("\n")] = idx + return word_dict + + def batch_generator(self): + # global sort or global shuffle + if self._sort_type == SortType.GLOBAL: + infos = sorted( + self._sample_infos, key=lambda x: x.max_len, reverse=True) + else: + if self._shuffle: + infos = self._sample_infos + self._random.shuffle(infos) + else: + infos = self._sample_infos + + if self._sort_type == SortType.POOL: + for i in range(0, len(infos), self._pool_size): + infos[i:i + self._pool_size] = sorted( + infos[i:i + self._pool_size], key=lambda x: x.max_len) + + # concat batch + batches = [] + batch_creator = TokenBatchCreator( + self._batch_size + ) if self._use_token_batch else SentenceBatchCreator(self._batch_size) + batch_creator = MinMaxFilter(self._max_length, self._min_length, + batch_creator) + + for info in infos: + batch = batch_creator.append(info) + if batch is not None: + 
def position_encoding_init(n_position, d_pos_vec):
    """
    Generate the initial values for the sinusoid position encoding table.

    Entry ``[pos, j]`` is built from ``pos / 10000 ** (2 * (j // 2) /
    d_pos_vec)``; for every row except the first, even columns hold the sine
    and odd columns the cosine of that value. Row 0 is left as all zeros.

    :param n_position: The number of positions (rows) in the table.
    :type n_position: int
    :param d_pos_vec: The size of each encoding vector (columns).
    :type d_pos_vec: int
    :return: A float32 array of shape ``[n_position, d_pos_vec]``.
    """
    # All arithmetic is done on floats explicitly so the table does not
    # depend on the interpreter's integer-division rules.
    positions = np.arange(n_position, dtype="float64").reshape(-1, 1)
    exponents = 2.0 * (np.arange(d_pos_vec) // 2) / float(d_pos_vec)
    # Broadcasting a column of positions against a row of divisors gives the
    # whole [n_position, d_pos_vec] table at once; row 0 is 0 / x == 0.
    position_enc = positions / np.power(10000.0, exponents)
    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
    return position_enc.astype("float32")
+ """ + q = layers.fc(input=queries, + size=d_key * n_head, + num_flatten_dims=2, + param_attr=const_para_attr, + bias_attr=const_bias_attr) + k = layers.fc(input=keys, + size=d_key * n_head, + num_flatten_dims=2, + param_attr=const_para_attr, + bias_attr=const_bias_attr) + v = layers.fc(input=values, + size=d_value * n_head, + num_flatten_dims=2, + param_attr=const_para_attr, + bias_attr=const_bias_attr) + return q, k, v + + def __split_heads(x, n_head): + """ + Reshape the last dimension of inpunt tensor x so that it becomes two + dimensions and then transpose. Specifically, input a tensor with shape + [bs, max_sequence_length, n_head * hidden_dim] then output a tensor + with shape [bs, n_head, max_sequence_length, hidden_dim]. + """ + if n_head == 1: + return x + + hidden_size = x.shape[-1] + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. + reshaped = layers.reshape( + x=x, shape=[0, 0, n_head, hidden_size // n_head]) + + # permuate the dimensions into: + # [batch_size, n_head, max_sequence_len, hidden_size_per_head] + return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) + + def __combine_heads(x): + """ + Transpose and then reshape the last two dimensions of inpunt tensor x + so that it becomes one dimension, which is reverse to __split_heads. + """ + if len(x.shape) == 3: return x + if len(x.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + + trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. 
+ return layers.reshape( + x=trans_x, + shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])) + + def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): + """ + Scaled Dot-Product Attention + """ + scaled_q = layers.scale(x=q, scale=d_model**-0.5) + product = layers.matmul(x=scaled_q, y=k, transpose_y=True) + if attn_bias: + product += attn_bias + weights = layers.softmax(product) + if dropout_rate: + weights = layers.dropout( + weights, + dropout_prob=dropout_rate, + seed=ModelHyperParams.dropout_seed, + is_test=False) + out = layers.matmul(weights, v) + return out + + q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) + + if cache is not None: # use cache and concat time steps + k = cache["k"] = layers.concat([cache["k"], k], axis=1) + v = cache["v"] = layers.concat([cache["v"], v], axis=1) + + q = __split_heads(q, n_head) + k = __split_heads(k, n_head) + v = __split_heads(v, n_head) + + ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model, + dropout_rate) + + out = __combine_heads(ctx_multiheads) + + # Project back to the model size. + proj_out = layers.fc(input=out, + size=d_model, + num_flatten_dims=2, + param_attr=const_para_attr, + bias_attr=const_bias_attr) + return proj_out + + +def positionwise_feed_forward(x, d_inner_hid, d_hid): + """ + Position-wise Feed-Forward Networks. + This module consists of two linear transformations with a ReLU activation + in between, which is applied to each position separately and identically. 
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.):
    """
    Apply a sequence of one-letter processing steps to ``out``.

    The letters of ``process_cmd`` are handled from left to right:

    * ``"a"`` adds ``prev_out`` (residual connection) when ``prev_out`` is
      truthy;
    * ``"n"`` applies layer normalization over the last axis;
    * ``"d"`` applies dropout when ``dropout_rate`` is non-zero.

    Any other letter is ignored. This is used before or after multi-head
    attention and position-wise feed-forward networks.

    :param prev_out: The tensor added by the residual step, or None.
    :param out: The tensor to process.
    :param process_cmd: The steps to apply, one character per step.
    :type process_cmd: basestring
    :param dropout_rate: The dropout probability used by the ``"d"`` step.
    :type dropout_rate: float
    :return: The processed tensor.
    """
    result = out
    for step in process_cmd:
        if step == "a":
            # Residual connection; nothing to add when prev_out is falsy.
            if prev_out:
                result = result + prev_out
            continue
        if step == "n":
            last_axis = len(result.shape) - 1
            result = layers.layer_norm(
                result,
                begin_norm_axis=last_axis,
                param_attr=fluid.initializer.Constant(1.),
                bias_attr=fluid.initializer.Constant(0.))
            continue
        if step == "d" and dropout_rate:
            result = layers.dropout(
                result,
                dropout_prob=dropout_rate,
                seed=ModelHyperParams.dropout_seed,
                is_test=False)
    return result
+ """ + if TrainTaskConfig.check_acc: + src_word_emb = layers.embedding( + src_word, + size=[src_vocab_size, src_emb_dim], + param_attr=fluid.ParamAttr( + name=word_emb_param_name, + initializer=fluid.initializer.ConstantInitializer(0.001))) + else: + src_word_emb = layers.embedding( + src_word, + size=[src_vocab_size, src_emb_dim], + param_attr=fluid.ParamAttr( + name=word_emb_param_name, + initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5))) + + src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) + src_pos_enc = layers.embedding( + src_pos, + size=[src_max_len, src_emb_dim], + param_attr=fluid.ParamAttr( + name=pos_enc_param_name, + trainable=False, + initializer=fluid.initializer.ConstantInitializer(0.001))) + enc_input = src_word_emb + src_pos_enc + return layers.dropout( + enc_input, + dropout_prob=dropout_rate, + seed=ModelHyperParams.dropout_seed, + is_test=False) if dropout_rate else enc_input + + +prepare_encoder = partial( + prepare_encoder, pos_enc_param_name=pos_enc_param_names[0]) +prepare_decoder = partial( + prepare_encoder, pos_enc_param_name=pos_enc_param_names[1]) + + +def encoder_layer(enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0.): + """The encoder layers that can be stacked to form a deep encoder. + This module consits of a multi-head (self) attention followed by + position-wise feed-forward networks and both the two components companied + with the post_process_layer to add residual connection, layer normalization + and droput. 
def encoder(enc_input,
            attn_bias,
            n_layer,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            dropout_rate=0.):
    """
    The encoder is composed of a stack of identical layers returned by calling
    encoder_layer.

    Each layer consumes the output of the previous one; attn_bias and the
    size arguments are forwarded unchanged to every layer.

    :param enc_input: The input of the first layer.
    :param attn_bias: The attention bias passed to every layer.
    :param n_layer: The number of stacked layers.
    :type n_layer: int
    :param dropout_rate: The dropout probability passed to every layer.
    :type dropout_rate: float
    :return: The output of the last layer, or enc_input itself when n_layer
        is 0.
    """
    # Start from the input so that an empty stack is the identity instead of
    # failing on an unbound local.
    enc_output = enc_input
    for _ in range(n_layer):
        enc_output = encoder_layer(enc_output, attn_bias, n_head, d_key,
                                   d_value, d_model, d_inner_hid,
                                   dropout_rate)
    return enc_output
def decoder(dec_input,
            enc_output,
            dec_slf_attn_bias,
            dec_enc_attn_bias,
            n_layer,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            dropout_rate=0.,
            caches=None):
    """
    The decoder is composed of a stack of identical decoder_layer layers.

    Each layer consumes the output of the previous one together with the
    same enc_output and attention biases.

    :param dec_input: The input of the first layer.
    :param enc_output: The encoder output passed to every layer.
    :param dec_slf_attn_bias: The self-attention bias passed to every layer.
    :param dec_enc_attn_bias: The encoder-decoder attention bias passed to
        every layer.
    :param n_layer: The number of stacked layers.
    :type n_layer: int
    :param dropout_rate: The dropout probability passed to every layer.
    :type dropout_rate: float
    :param caches: Optional per-layer caches; layer i receives caches[i].
    :return: The output of the last layer, or dec_input itself when n_layer
        is 0.
    """
    # Start from the input so that an empty stack is the identity instead of
    # failing on an unbound local.
    dec_output = dec_input
    for i in range(n_layer):
        cache = None if caches is None else caches[i]
        dec_output = decoder_layer(
            dec_output,
            enc_output,
            dec_slf_attn_bias,
            dec_enc_attn_bias,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            dropout_rate,
            cache=cache)
    return dec_output
def transformer(
        src_vocab_size,
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        weight_sharing,
        label_smooth_eps, ):
    """
    Build the training network: encoder, decoder and the weighted loss.

    :param src_vocab_size: The size of the source vocabulary.
    :type src_vocab_size: int
    :param trg_vocab_size: The size of the target vocabulary.
    :type trg_vocab_size: int
    :param max_length: Forwarded to wrap_encoder and wrap_decoder.
    :param weight_sharing: Whether embeddings are shared between source and
        target; requires both vocabularies to have the same size.
    :type weight_sharing: bool
    :param label_smooth_eps: The label smoothing epsilon; a falsy value
        disables smoothing and uses hard labels.
    :type label_smooth_eps: float
    :return: A tuple (sum_cost, avg_cost, predict, token_num). avg_cost has
        stop_gradient set.
    :raises ValueError: If weight_sharing is set but the vocabulary sizes
        differ.
    """
    # The check must compare source against target; it is raised explicitly
    # rather than asserted so that it also runs under ``python -O``.
    if weight_sharing and src_vocab_size != trg_vocab_size:
        raise ValueError(
            "Vocabularies in source and target should be same for weight sharing."
        )
    enc_inputs = make_all_inputs(encoder_data_input_fields)

    enc_output = wrap_encoder(
        src_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        weight_sharing,
        enc_inputs, )

    # The last decoder input field is not fed during training.
    dec_inputs = make_all_inputs(decoder_data_input_fields[:-1])

    predict = wrap_decoder(
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        weight_sharing,
        dec_inputs,
        enc_output, )

    # Padding indices do not contribute to the total loss. The weights are
    # used to cancel padding indices when calculating the loss.
    label, weights = make_all_inputs(label_data_input_fields)
    if label_smooth_eps:
        label = layers.label_smooth(
            label=layers.one_hot(
                input=label, depth=trg_vocab_size),
            epsilon=label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(
        logits=layers.reshape(
            predict, shape=[-1, trg_vocab_size]),
        label=label,
        soft_label=bool(label_smooth_eps))
    weighted_cost = cost * weights
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(weights)
    avg_cost = sum_cost / token_num
    # avg_cost is only reported; gradients flow through sum_cost.
    avg_cost.stop_gradient = True
    return sum_cost, avg_cost, predict, token_num
def fast_decode(
        src_vocab_size,
        trg_vocab_size,
        max_in_len,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        weight_sharing,
        beam_size,
        max_out_len,
        eos_idx, ):
    """
    Use beam search to decode. Caches will be used to store states of history
    steps which can make the decoding faster.

    The encoder is built once with wrap_encoder; the decoder is rebuilt
    inside a While block that runs until max_out_len steps were taken or the
    beam search selects no more ids.

    :param src_vocab_size: The size of the source vocabulary.
    :param trg_vocab_size: The size of the target vocabulary.
    :param max_in_len: Forwarded to wrap_encoder and wrap_decoder as the
        maximum length.
    :param beam_size: The beam width used by topk, beam_search and
        beam_search_decode.
    :type beam_size: int
    :param max_out_len: The maximum number of decoding steps.
    :type max_out_len: int
    :param eos_idx: The id passed to the beam search ops as end_id.
    :type eos_idx: int
    :return: A tuple (finished_ids, finished_scores) produced by
        layers.beam_search_decode.
    """
    enc_output = wrap_encoder(src_vocab_size, max_in_len, n_layer, n_head,
                              d_key, d_value, d_model, d_inner_hid,
                              dropout_rate, weight_sharing)
    start_tokens, init_scores, trg_src_attn_bias = \
        make_all_inputs(fast_decoder_data_input_fields )

    def beam_search():
        # Loop bound and step counter, both one-element constants.
        max_len = layers.fill_constant(
            shape=[1], dtype=start_tokens.dtype, value=max_out_len)
        step_idx = layers.fill_constant(
            shape=[1], dtype=start_tokens.dtype, value=0)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)
        # array states will be stored for each step.
        ids = layers.array_write(
            layers.reshape(start_tokens, (-1, 1)), step_idx)
        scores = layers.array_write(init_scores, step_idx)
        # cell states will be overwritten at each step.
        # caches contains states of history steps to reduce redundant
        # computation in decoder.
        # Each cache starts with a zero-length second dimension; presumably
        # this is the time axis that multi_head_attention concatenates onto.
        caches = [{
            "k": layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, 0, d_model],
                dtype=enc_output.dtype,
                value=0),
            "v": layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, 0, d_model],
                dtype=enc_output.dtype,
                value=0)
        } for i in range(n_layer)]
        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1))
            pre_scores = layers.array_read(array=scores, i=step_idx)
            # sequence_expand can gather sequences according to lod thus can be
            # used in beam search to sift states corresponding to selected ids.
            pre_src_attn_bias = layers.sequence_expand(
                x=trg_src_attn_bias, y=pre_scores)
            pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
            pre_caches = [{
                "k": layers.sequence_expand(
                    x=cache["k"], y=pre_scores),
                "v": layers.sequence_expand(
                    x=cache["v"], y=pre_scores),
            } for cache in caches]
            # Position input of this step: a tensor of ones multiplied by
            # step_idx + 1 (computed without modifying step_idx).
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_enc_output,  # can't use pre_ids here since it has lod
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype),
                y=layers.increment(
                    x=step_idx, value=1.0, in_place=False),
                axis=0)
            # No self-attention bias is passed (None) for the decoding step.
            logits = wrap_decoder(
                trg_vocab_size,
                max_in_len,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                dropout_rate,
                weight_sharing,
                dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias),
                enc_output=pre_enc_output,
                caches=pre_caches)
            logits = layers.reshape(logits, (-1, trg_vocab_size))

            # Accumulate log-probabilities of the top beam_size candidates
            # onto the scores of the previous step.
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(logits), k=beam_size)
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores),
                y=layers.reshape(
                    pre_scores, shape=[-1]),
                axis=0)
            # beam_search op uses lod to distinguish branches.
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            selected_ids, selected_scores = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=eos_idx)

            layers.increment(x=step_idx, value=1.0, in_place=True)
            # update states
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.assign(pre_src_attn_bias, trg_src_attn_bias)
            layers.assign(pre_enc_output, enc_output)
            for i in range(n_layer):
                layers.assign(pre_caches[i]["k"], caches[i]["k"])
                layers.assign(pre_caches[i]["v"], caches[i]["v"])
            # Continue only while the step limit is not reached and the beam
            # search still selected some ids; the result is written to cond.
            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=eos_idx)
        return finished_ids, finished_scores

    finished_ids, finished_scores = beam_search()
    return finished_ids, finished_scores
def update_args():
    """
    Derive vocabulary-dependent settings from the dictionary files.

    Loads the source and target dictionaries named in TrainTaskConfig and
    merges the vocabulary sizes and the ids of the three special tokens
    (looked up in the source dictionary) into TrainTaskConfig and
    ModelHyperParams through merge_cfg_from_list.
    """
    source_vocab = DataReader.load_dict(TrainTaskConfig.src_vocab_fpath)
    target_vocab = DataReader.load_dict(TrainTaskConfig.trg_vocab_fpath)

    # Flat list of alternating keys and stringified values.
    overrides = []
    overrides += ["src_vocab_size", str(len(source_vocab))]
    overrides += ["trg_vocab_size", str(len(target_vocab))]
    for position, key in enumerate(("bos_idx", "eos_idx", "unk_idx")):
        mark = TrainTaskConfig.special_token[position]
        overrides += [key, str(source_vocab[mark])]

    merge_cfg_from_list(overrides, [TrainTaskConfig, ModelHyperParams])
- os.stat("/tmp/paddle.%d.port" % pid) - return - except os.error: - retry_times -= 1 - - def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): - avg_cost = get_model() - if is_dist: - t = get_transpiler(trainer_id, - fluid.default_main_program(), endpoints, - trainers) + def run_trainer(self, place, args): + + sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model( + args.is_dist, not args.sync_mode) + + if args.is_dist: + t = self.get_transpiler(args.trainer_id, + fluid.default_main_program(), + args.endpoints, args.trainers, + args.sync_mode) trainer_prog = t.get_trainer_program() + TrainTaskConfig.batch_size = 10 + TrainTaskConfig.train_file_pattern = TrainTaskConfig.data_path + "train.tok.clean.bpe.32000.en-de.train_{}".format( + args.trainer_id) else: + TrainTaskConfig.batch_size = 20 trainer_prog = fluid.default_main_program() startup_exe = fluid.Executor(place) - startup_exe.run(fluid.default_startup_program()) - - strategy = fluid.ExecutionStrategy() - strategy.num_threads = 1 - strategy.allow_op_delay = False - exe = fluid.ParallelExecutor( - True, loss_name=avg_cost.name, exec_strategy=strategy) - - first_loss, = exe.run(fetch_list=[avg_cost.name]) - print(first_loss) - for i in six.moves.xrange(5): - _ = exe.run(fetch_list=[avg_cost.name]) - last_loss, = exe.run(fetch_list=[avg_cost.name]) - print(last_loss) - - -def main(role="pserver", - endpoints="127.0.0.1:9123", - trainer_id=0, - current_endpoint="127.0.0.1:9123", - trainers=1, - is_dist=True): - - reader = paddle.batch( - wmt16.train(ModelHyperParams.src_vocab_size, - ModelHyperParams.trg_vocab_size), - batch_size=transformer_model.batch_size) - - with fluid.recordio_writer.create_recordio_writer( - WMT16_RECORDIO_FILE) as writer: - for batch in reader(): - for tensor in prepare_batch_input( - batch, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head): - t = fluid.LoDTensor() - t.set(tensor, fluid.CPUPlace()) - 
writer.append_tensor(t) - writer.complete_append_tensor() - - model = DistTransformer2x2() - if role == "pserver": - model.run_pserver(endpoints, trainers, current_endpoint, trainer_id) - else: - p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - model.run_trainer(p, endpoints, trainer_id, trainers, is_dist) + + TrainTaskConfig.local = not args.is_dist + + train_loop(startup_exe, trainer_prog, 1, sum_cost, avg_cost, + local_lr_scheduler, token_num, predict) if __name__ == "__main__": - if len(sys.argv) != 8: - print( - "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]" - ) - role = sys.argv[1] - endpoints = sys.argv[2] - trainer_id = int(sys.argv[3]) - current_endpoint = sys.argv[4] - trainers = int(sys.argv[5]) - is_dist = True if sys.argv[6] == "TRUE" else False - # FIXME(typhoonzero): refine this test. - is_async = True if sys.argv[7] == "TRUE" else False - main( - role=role, - endpoints=endpoints, - trainer_id=trainer_id, - current_endpoint=current_endpoint, - trainers=trainers, - is_dist=is_dist) + update_args() + runtime_main(DistTransformer2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py index 0ad994a258c04cabc807823b7d2a8ae8bb62ab2c..f3e740fc7027a4a562b836c3113b87d55062c185 100644 --- a/python/paddle/fluid/tests/unittests/dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py @@ -49,28 +49,32 @@ class TestDistWord2vec2x2(TestDistRunnerBase): dtype='float32', is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( - name='shared_w', initializer=fluid.initializer.Constant())) + name='shared_w', + initializer=fluid.initializer.Constant(value=0.1))) embed_second = fluid.layers.embedding( input=words[1], size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( - name='shared_w', initializer=fluid.initializer.Constant())) + 
name='shared_w', + initializer=fluid.initializer.Constant(value=0.1))) embed_third = fluid.layers.embedding( input=words[2], size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( - name='shared_w', initializer=fluid.initializer.Constant())) + name='shared_w', + initializer=fluid.initializer.Constant(value=0.1))) embed_forth = fluid.layers.embedding( input=words[3], size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( - name='shared_w', initializer=fluid.initializer.Constant())) + name='shared_w', + initializer=fluid.initializer.Constant(value=0.1))) concat_embed = fluid.layers.concat( input=[embed_first, embed_second, embed_third, embed_forth], @@ -80,13 +84,13 @@ class TestDistWord2vec2x2(TestDistRunnerBase): size=HIDDEN_SIZE, act='sigmoid', param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant())) + initializer=fluid.initializer.Constant(value=0.1))) predict_word = fluid.layers.fc( input=hidden1, size=dict_size, act='softmax', param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant())) + initializer=fluid.initializer.Constant(value=0.1))) cost = fluid.layers.cross_entropy( input=predict_word, label=words[4]) avg_cost = fluid.layers.mean(cost) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 44cd073379f293a1114c2c77fa80d35d112d4fb8..20f1a37a426e9697048d636bf738c9056213e5f6 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -47,7 +47,8 @@ def get_numeric_gradient(place, input_to_check, output_names, delta=0.005, - in_place=False): + in_place=False, + sum_outputs=None): # FIXME: change this method by compile time concepts set_input(scope, op, inputs, place) @@ -58,9 +59,11 @@ def get_numeric_gradient(place, sum = [] op.run(scope, place) for output_name in output_names: + if sum_outputs and output_name not in sum_outputs: + 
continue sum.append( np.array(scope.find_var(output_name).get_tensor()).mean()) - return np.array(sum).mean() + return np.array(sum).sum() / len(output_names) tensor_to_check = scope.find_var(input_to_check).get_tensor() tensor_size = product(tensor_to_check.shape()) @@ -396,13 +399,14 @@ class OpTest(unittest.TestCase): numeric_grad_delta=0.005, in_place=False, max_relative_error=0.005, - user_defined_grads=None): + user_defined_grads=None, + sum_outputs=None): places = self._get_places() for place in places: self.check_grad_with_place(place, inputs_to_check, output_names, no_grad_set, numeric_grad_delta, in_place, max_relative_error, - user_defined_grads) + user_defined_grads, sum_outputs) def check_grad_with_place(self, place, @@ -412,7 +416,8 @@ class OpTest(unittest.TestCase): numeric_grad_delta=0.005, in_place=False, max_relative_error=0.005, - user_defined_grads=None): + user_defined_grads=None, + sum_outputs=None): self.scope = core.Scope() op_inputs = self.inputs if hasattr(self, "inputs") else dict() op_outputs = self.outputs if hasattr(self, "outputs") else dict() @@ -435,7 +440,8 @@ class OpTest(unittest.TestCase): input_to_check, output_names, delta=numeric_grad_delta, - in_place=in_place) for input_to_check in inputs_to_check + in_place=in_place, + sum_outputs=sum_outputs) for input_to_check in inputs_to_check ] analytic_grads = self._get_gradient(inputs_to_check, place, output_names, no_grad_set) diff --git a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py index a7382c2244ec3291c4e8f625cc2d15499e0acdac..1b9c3efe0fa9e9f1b8ad09029079898622e7d489 100644 --- a/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_attention_lstm_op.py @@ -37,7 +37,7 @@ def attention_lstm( T = sum(lod[0]) N = len(lod[0]) M = x.shape[1] - D = b.shape[1] / 4 + D = b.shape[1] // 4 assert T == x.shape[0] assert len(fcws) == len(fcbs) hidden = [] 
diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py index fa6b67956259f33b109758c5939ab5729482695a..08579c7dd62ea6aea87b053345211914a6be6237 100644 --- a/python/paddle/fluid/tests/unittests/test_desc_clone.py +++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py @@ -120,8 +120,8 @@ def operator_equal(a, b): raise ValueError("In operator_equal not equal:{0}\n".format(k)) elif isinstance(v, collections.OrderedDict): - v0 = sorted(six.iteritems(v), key=lambda x: x[0]) - v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0]) + v0 = sorted(list(six.iteritems(v)), key=lambda x: x[0]) + v1 = sorted(list(six.iteritems(b.__dict__[k])), key=lambda x: x[0]) if v0 != v1: raise ValueError("In operator_equal not equal:{0}\n".format(k)) @@ -139,17 +139,15 @@ def block_equal(a, b): continue elif k == "ops": + assert (len(a.ops) == len(b.ops)) for i in range(0, len(a.ops)): if not operator_equal(a.ops[i], b.ops[i]): raise ValueError("In block_equal not equal:{0}\n".format(k)) - assert (len(a.ops) == len(b.ops)) elif isinstance(v, collections.OrderedDict): - v0 = sorted(six.iteritems(v), key=lambda x: x[0]) - v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0]) - - if v0 != v1: - raise ValueError("In block_equal not equal:{0}\n".format(k)) + for key, value in six.iteritems(v): + if str(value) != str(b.__dict__[k][key]): + raise ValueError("In block_equal not equal:{0}\n".format(k)) elif (v != b.__dict__[k]): raise ValueError("In block_equal not equal:{0}\n".format(k)) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 0e815c91446b285ba2c2c5aa9ad18d97f51eae65..58875a1dd19fd91f6f2bed928397ee7f73302dff 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -21,7 +21,7 @@ import sys import six import signal import subprocess -import six 
+import argparse class TestDistRunnerBase(object): @@ -43,40 +43,35 @@ class TestDistRunnerBase(object): sync_mode=sync_mode) return t - def run_pserver(self, - pserver_endpoints, - trainers, - current_endpoint, - trainer_id, - sync_mode=True): + def run_pserver(self, args): import paddle import paddle.fluid as fluid self.get_model(batch_size=2) - t = self.get_transpiler(trainer_id, - fluid.default_main_program(), pserver_endpoints, - trainers, sync_mode) - pserver_prog = t.get_pserver_program(current_endpoint) - startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + if args.mem_opt: + fluid.memory_optimize(fluid.default_main_program()) + t = self.get_transpiler(args.trainer_id, + fluid.default_main_program(), args.endpoints, + args.trainers, args.sync_mode) + pserver_prog = t.get_pserver_program(args.current_endpoint) + startup_prog = t.get_startup_program(args.current_endpoint, + pserver_prog) place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, - place, - endpoints, - trainer_id, - trainers, - is_dist=True, - sync_mode=True): + def run_trainer(self, place, args): import paddle import paddle.fluid as fluid test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ - self.get_model(batch_size=2) - if is_dist: - t = self.get_transpiler(trainer_id, - fluid.default_main_program(), endpoints, - trainers, sync_mode) + self.get_model(batch_size=2) + if args.mem_opt: + fluid.memory_optimize(fluid.default_main_program()) + if args.is_dist: + t = self.get_transpiler(args.trainer_id, + fluid.default_main_program(), + args.endpoints, args.trainers, + args.sync_mode) trainer_prog = t.get_trainer_program() else: trainer_prog = fluid.default_main_program() @@ -87,8 +82,18 @@ class TestDistRunnerBase(object): strategy = fluid.ExecutionStrategy() strategy.num_threads = 1 strategy.allow_op_delay = False + build_stra = fluid.BuildStrategy() + + if args.use_reduce: + 
build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + else: + build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + exe = fluid.ParallelExecutor( - True, loss_name=avg_cost.name, exec_strategy=strategy) + True, + loss_name=avg_cost.name, + exec_strategy=strategy, + build_strategy=build_stra) feed_var_list = [ var for var in trainer_prog.global_block().vars.values() @@ -117,27 +122,28 @@ def runtime_main(test_class): import paddle.fluid as fluid import paddle.fluid.core as core - if len(sys.argv) != 8: - print( - "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]" - ) - role = sys.argv[1] - endpoints = sys.argv[2] - trainer_id = int(sys.argv[3]) - current_endpoint = sys.argv[4] - trainers = int(sys.argv[5]) - is_dist = True if sys.argv[6] == "TRUE" else False - sync_mode = True if sys.argv[7] == "TRUE" else False + parser = argparse.ArgumentParser(description='Run dist test.') + parser.add_argument( + '--role', type=str, required=True, choices=['pserver', 'trainer']) + parser.add_argument('--endpoints', type=str, required=False, default="") + parser.add_argument('--is_dist', action='store_true') + parser.add_argument('--trainer_id', type=int, required=False, default=0) + parser.add_argument('--trainers', type=int, required=False, default=1) + parser.add_argument( + '--current_endpoint', type=str, required=False, default="") + parser.add_argument('--sync_mode', action='store_true') + parser.add_argument('--mem_opt', action='store_true') + parser.add_argument('--use_reduce', action='store_true') + + args = parser.parse_args() model = test_class() - if role == "pserver": - model.run_pserver(endpoints, trainers, current_endpoint, trainer_id, - sync_mode) + if args.role == "pserver" and args.is_dist: + model.run_pserver(args) else: p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() - model.run_trainer(p, endpoints, trainer_id, 
trainers, is_dist, - sync_mode) + model.run_trainer(p, args) import paddle.compat as cpt @@ -153,30 +159,39 @@ class TestDistBase(unittest.TestCase): self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" self._python_interp = "python" self._sync_mode = True + self._mem_opt = False + self._use_reduce = False self._setup_config() def start_pserver(self, model_file, check_error_log): - sync_mode_str = "TRUE" if self._sync_mode else "FALSE" ps0_ep, ps1_ep = self._ps_endpoints.split(",") - ps0_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \ + ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" + ps0_cmd = ps_cmd % \ (self._python_interp, model_file, self._ps_endpoints, ps0_ep, - self._trainers, sync_mode_str) - ps1_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \ + self._trainers) + ps1_cmd = ps_cmd % \ (self._python_interp, model_file, self._ps_endpoints, ps1_ep, - self._trainers, sync_mode_str) + self._trainers) + + if self._sync_mode: + ps0_cmd += " --sync_mode" + ps1_cmd += " --sync_mode" + if self._mem_opt: + ps0_cmd += " --mem_opt" + ps1_cmd += " --mem_opt" ps0_pipe = subprocess.PIPE ps1_pipe = subprocess.PIPE if check_error_log: - print("ps0_cmd:", ps0_cmd) - print("ps1_cmd:", ps1_cmd) + print(ps0_cmd) + print(ps1_cmd) ps0_pipe = open("/tmp/ps0_err.log", "wb") ps1_pipe = open("/tmp/ps1_err.log", "wb") ps0_proc = subprocess.Popen( - ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe) + ps0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe) ps1_proc = subprocess.Popen( - ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe) + ps1_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe) if not check_error_log: return ps0_proc, ps1_proc, None, None @@ -199,7 +214,7 @@ class TestDistBase(unittest.TestCase): retry_times -= 1 def check_with_place(self, model_file, delta=1e-3, check_error_log=False): - # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN + # TODO(typhoonzero): should 
auto adapt GPU count on the machine. required_envs = { "PATH": os.getenv("PATH"), "PYTHONPATH": os.getenv("PYTHONPATH"), @@ -215,10 +230,7 @@ class TestDistBase(unittest.TestCase): # Run local to get a base line env_local = {"CUDA_VISIBLE_DEVICES": "0"} env_local.update(required_envs) - sync_mode_str = "TRUE" if self._sync_mode else "FALSE" - local_cmd = "%s %s trainer %s 0 %s %d FLASE %s" % \ - (self._python_interp, model_file, - "127.0.0.1:1234", "127.0.0.1:1234", 1, sync_mode_str) + local_cmd = "%s %s --role trainer" % (self._python_interp, model_file) if not check_error_log: local_proc = subprocess.Popen( local_cmd.split(" "), @@ -226,7 +238,6 @@ class TestDistBase(unittest.TestCase): stderr=subprocess.PIPE, env=env_local) else: - print("trainer cmd:", local_cmd) err_log = open("/tmp/trainer.err.log", "wb") local_proc = subprocess.Popen( local_cmd.split(" "), @@ -247,12 +258,23 @@ class TestDistBase(unittest.TestCase): self._wait_ps_ready(ps1.pid) ps0_ep, ps1_ep = self._ps_endpoints.split(",") - tr0_cmd = "%s %s trainer %s 0 %s %d TRUE %s" % \ - (self._python_interp, model_file, self._ps_endpoints, ps0_ep, - self._trainers, sync_mode_str) - tr1_cmd = "%s %s trainer %s 1 %s %d TRUE %s" % \ - (self._python_interp, model_file, self._ps_endpoints, ps1_ep, - self._trainers, sync_mode_str) + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist" + tr0_cmd = tr_cmd % \ + (self._python_interp, model_file, self._ps_endpoints, + 0, ps0_ep, self._trainers) + tr1_cmd = tr_cmd % \ + (self._python_interp, model_file, self._ps_endpoints, + 1, ps1_ep, self._trainers) + + if self._sync_mode: + tr0_cmd += " --sync_mode" + tr1_cmd += " --sync_mode" + if self._mem_opt: + tr0_cmd += " --mem_opt" + tr1_cmd += " --mem_opt" + if self._use_reduce: + tr0_cmd += " --use_reduce" + tr1_cmd += " --use_reduce" env0 = {"CUDA_VISIBLE_DEVICES": "0"} env1 = {"CUDA_VISIBLE_DEVICES": "1"} @@ -269,12 +291,12 @@ class 
TestDistBase(unittest.TestCase): tr1_pipe = open("/tmp/tr1_err.log", "wb") tr0_proc = subprocess.Popen( - tr0_cmd.split(" "), + tr0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=tr0_pipe, env=env0) tr1_proc = subprocess.Popen( - tr1_cmd.split(" "), + tr1_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=tr1_pipe, env=env1) @@ -303,6 +325,10 @@ class TestDistBase(unittest.TestCase): # FIXME: use terminate() instead of sigkill. os.kill(ps0.pid, signal.SIGKILL) os.kill(ps1.pid, signal.SIGKILL) + ps0.terminate() + ps1.terminate() + ps0.wait() + ps1.wait() FNULL.close() self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta) diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 36bab6f04603b7ad3218603489eead859bfcb5b6..59a137c18c9435ef5c5772d0cc08f197c1d86603 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -20,6 +20,16 @@ from test_dist_base import TestDistBase class TestDistMnist2x2(TestDistBase): def _setup_config(self): self._sync_mode = True + self._use_reduce = False + + def test_se_resnext(self): + self.check_with_place("dist_mnist.py", delta=1e-7) + + +class TestDistMnist2x2WithMemopt(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._mem_opt = True def test_se_resnext(self): self.check_with_place("dist_mnist.py", delta=1e-7) @@ -28,10 +38,30 @@ class TestDistMnist2x2(TestDistBase): class TestDistMnistAsync(TestDistBase): def _setup_config(self): self._sync_mode = False + self._use_reduce = False def test_se_resnext(self): self.check_with_place("dist_mnist.py", delta=200) +# FIXME(typhoonzero): enable these tests once we have 4 +# 4 GPUs on CI machine, and the base class should be updated. 
+# +# class TestDistMnist2x2ReduceMode(TestDistBase): +# def _setup_config(self): +# self._sync_mode = True +# self._use_reduce = True + +# def test_se_resnext(self): +# self.check_with_place("dist_mnist.py", delta=1e-7) + +# class TestDistMnistAsyncReduceMode(TestDistBase): +# def _setup_config(self): +# self._sync_mode = False +# self._use_reduce = True + +# def test_se_resnext(self): +# self.check_with_place("dist_mnist.py", delta=200) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py index 9581abdf394d738470d32ae609838832077ee519..083525ccf54d389b60c4aaa9f8c6223f07c773cd 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_train.py +++ b/python/paddle/fluid/tests/unittests/test_dist_train.py @@ -100,7 +100,7 @@ class TestSendOp(unittest.TestCase): main.global_block().append_op( type="fetch_barrier", inputs={}, - outputs={}, + outputs={"Out": []}, attrs={ "endpoints": ["127.0.0.1:{0}".format(port)], RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py index 62fcf5953f93637a20beed649de21476a8673419..a8e6ce4cfe18384e405f1602429628914d2c2e00 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py @@ -15,17 +15,55 @@ from __future__ import print_function import unittest +import paddle from test_dist_base import TestDistBase -class TestDistTransformer2x2(TestDistBase): +def download_files(): + url_prefix = 'http://paddle-unittest-data.cdn.bcebos.com/dist_transformer/' + vocab_url = url_prefix + 'vocab.bpe.32000' + vocab_md5 = 'a86d345ca6e27f6591d0dccb1b9be853' + paddle.dataset.common.download(vocab_url, 'test_dist_transformer', + vocab_md5) + + local_train_url = url_prefix + 'train.tok.clean.bpe.32000.en-de' + local_train_md5 = 
'033eb02b9449e6dd823f050782ac8914' + paddle.dataset.common.download(local_train_url, 'test_dist_transformer', + local_train_md5) + + train0_url = url_prefix + 'train.tok.clean.bpe.32000.en-de.train_0' + train0_md5 = 'ddce7f602f352a0405267285379a38b1' + paddle.dataset.common.download(train0_url, 'test_dist_transformer', + train0_md5) + + train1_url = url_prefix + 'train.tok.clean.bpe.32000.en-de.train_1' + train1_md5 = '8757798200180285b1a619cd7f408747' + paddle.dataset.common.download(train1_url, 'test_dist_transformer', + train1_md5) + + test_url = url_prefix + 'newstest2013.tok.bpe.32000.en-de' + test_md5 = '9dd74a266dbdb25314183899f269b4a2' + paddle.dataset.common.download(test_url, 'test_dist_transformer', test_md5) + + +class TestDistTransformer2x2Sync(TestDistBase): def _setup_config(self): self._sync_mode = True def test_transformer(self): - # TODO(paddle-dev): check if the delta is OK. - # Usually start around ~8000 and converge to ~5000 - self.check_with_place("dist_transformer.py", delta=400) + download_files() + #Note: loss on test dataset of the first 5 batch are: + # 10.518872, 10.518871, 10.518868, 10.518862, 10.518855 + self.check_with_place("dist_transformer.py", delta=1e-7) + + +class TestDistTransformer2x2Async(TestDistBase): + def _setup_config(self): + self._sync_mode = False + + def test_transformer(self): + download_files() + self.check_with_place("dist_transformer.py", delta=1.0) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 9f04d290f7596a60d5fdfa66cbc4beec1c3fe93d..b85501ef6b80d1f5004aa0dd08c3123d3bda48a5 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -21,6 +21,7 @@ import paddle.fluid as fluid from paddle.fluid.transpiler.distribute_transpiler import delete_ops import traceback import collections +import six class 
TranspilerTest(unittest.TestCase): @@ -437,7 +438,7 @@ class TestLocalLookupTable(TestDistLookupTableBase): # 2 optimize for table adam # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num self.assertEqual([op.type for op in pserver1.blocks[2].ops], - ["sum", "adam", "scale", "scale"]) + ["sum", "scale", "adam", "scale", "scale"]) trainer, _ = self.get_trainer() self.assertEqual(len(trainer.blocks), 1) @@ -644,18 +645,18 @@ class TestLoadSliceVar(TranspilerTest): self.assertTrue(pserver._slice_vars_and_attrs) self.assertTrue(pserver2._slice_vars_and_attrs) - for idx in xrange(len(pserver._slice_vars_and_attrs)): + for idx in six.moves.xrange(len(pserver._slice_vars_and_attrs)): self.assertEqual(pserver._slice_vars_and_attrs[idx][0], pserver2._slice_vars_and_attrs[idx][0]) - total_numel = reduce(lambda x, y: x * y, - pserver._slice_vars_and_attrs[idx][0].shape) + total_numel = six.moves.reduce( + lambda x, y: x * y, pserver._slice_vars_and_attrs[idx][0].shape) self.assertEqual( total_numel, - reduce(lambda x, y: x * y, - pserver._slice_vars_and_attrs[idx][2].shape) + reduce( - lambda x, y: x * y, - pserver2._slice_vars_and_attrs[idx][2].shape)) + six.moves.reduce(lambda x, y: x * y, + pserver._slice_vars_and_attrs[idx][2].shape) + + six.moves.reduce(lambda x, y: x * y, + pserver2._slice_vars_and_attrs[idx][2].shape)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py index 38af149ad336fcb818c3cbc9c686bcbdf00238be..9a3e92e8d775a37e0c24ee1bcc5435628d61bb91 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -22,7 +22,7 @@ class TestDistSeResneXt2x2(TestDistBase): self._sync_mode = True def test_se_resnext(self): - self.check_with_place("dist_word2vec.py", delta=1e-7) + self.check_with_place("dist_word2vec.py", delta=1e-4) class 
TestDistSeResneXt2x2Async(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py index d84ebed3fac67db323392494c701cf2a51b28305..1bb4662e8d83ac0c34b209e4e7a605869fdb59d5 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py @@ -20,41 +20,50 @@ import math from op_test import OpTest -def quantize_max_abs(x, num_bits): - range = math.pow(2, num_bits) - 1 +def quantize_max_abs(x, max_range): scale = np.max(np.abs(x).flatten()) - y = np.round(x / scale * range) + y = np.round(x / scale * max_range) return y, scale -def dequantize_max_abs(x, num_bits, scale): - range = math.pow(2, num_bits) - 1 - y = (scale / range) * x +def dequantize_max_abs(x, scale, max_range): + y = (scale / max_range) * x return y class TestFakeDequantizeMaxAbsOp(OpTest): def set_args(self): self.num_bits = 8 + self.max_range = math.pow(2, self.num_bits - 1) - 1 + self.data_type = "float32" def setUp(self): self.set_args() self.op_type = "fake_dequantize_max_abs" - x = np.random.randn(31, 65).astype("float32") - yq, scale = quantize_max_abs(x, self.num_bits) - ydq = dequantize_max_abs(yq, self.num_bits, scale) + x = np.random.randn(31, 65).astype(self.data_type) + yq, scale = quantize_max_abs(x, self.max_range) + ydq = dequantize_max_abs(yq, scale, self.max_range) - self.inputs = {'X': yq} - self.attrs = {'num_bits': self.num_bits, 'scale': float(scale)} + self.inputs = {'X': yq, 'Scale': np.array(scale).astype(self.data_type)} + self.attrs = {'max_range': self.max_range} self.outputs = {'Out': ydq} def test_check_output(self): self.check_output() -class TestFakeDequantizeMaxAbsOp5Bits(OpTest): +class TestFakeDequantizeMaxAbsOpDouble(TestFakeDequantizeMaxAbsOp): + def set_args(self): + self.num_bits = 8 + self.max_range = math.pow(2, self.num_bits - 1) - 1 + self.data_type = "float64" + + +class 
TestFakeDequantizeMaxAbsOp5Bits(TestFakeDequantizeMaxAbsOp): def set_args(self): self.num_bits = 5 + self.max_range = math.pow(2, self.num_bits - 1) - 1 + self.data_type = "float32" if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py index 44fb1d047dff48d2554c0bf637afbfda725e0a02..fd59c5bb7cff5dd33fae284ba3efe04e667ed75a 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py @@ -18,6 +18,9 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator + class TestFillConstantOp1(OpTest): def setUp(self): @@ -47,5 +50,31 @@ class TestFillConstantOp2(OpTest): self.check_output() +class TestFillConstantOpWithSelectedRows(OpTest): + def check_with_place(self, place): + scope = core.Scope() + # create Out Variable + out = scope.var('Out').get_selected_rows() + + # create and run fill_constant_op operator + fill_constant_op = Operator( + "fill_constant", shape=[123, 92], value=3.8, Out='Out') + fill_constant_op.run(scope, place) + + # get result from Out + result_array = np.array(out.get_tensor()) + full_array = np.full((123, 92), 3.8, 'float32') + + self.assertTrue(np.array_equal(result_array, full_array)) + + def test_fill_constant_with_selected_rows(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for place in places: + self.check_with_place(place) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py index 97e1b9061afb738dd9e5f8b3b6a9c9a123c6aac6..4a213c29113e5e23af2caf7fbcb807be3d0166d2 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py @@ -15,806 +15,327 @@ from __future__ import print_function import unittest import numpy as np +from functools import partial import paddle.fluid.core as core from op_test import OpTest -# scale + add -# TestElementwiseAddOp -# TestFusedOperatorsOp_scalar -# TestFusedOperatorsOp_scalar2 -# TestFusedOperatorsOp_Vector -# TestFusedOperatorsOp_broadcast_0 -# TestFusedOperatorsOp_broadcast_1 -# TestFusedOperatorsOp_broadcast_2 -# TestFusedOperatorsOp_broadcast_3 -# TestFusedOperatorsOp_broadcast_4 -# TestFusedOperatorsOp_rowwise_add_0 -# TestFusedOperatorsOp_rowwise_add_1 -# TestFusedOperatorsOp_channelwise_add - - -class TestElementwiseAddOp(OpTest): - def setUp(self): - self.op_type = "fused_elemwise_activation" - self.dtype = np.float32 - self.axis = -1 - - self.init_axis() - self.init_dtype() - self.init_input() - self.init_output() - self.init_attr() - - self.inputs = { - 'X': OpTest.np_dtype_to_fluid_dtype(self.x), - 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) - } - self.outputs = {'Out': self.out} - - def init_input(self): - self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) - self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) - - def init_output(self): - self.scale = 0.1 - self.out = (self.x + self.y) * self.scale - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'scale': self.scale, - 'functor_list': ["scale", "elementwise_add"] - } - - def init_dtype(self): - pass - - def init_axis(self): - pass - - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005) - - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y')) - - -class 
TestFusedOperatorsOp_scalar(TestElementwiseAddOp): - def init_input(self): - self.x = np.random.rand(2, 3, 4).astype(self.dtype) - self.y = np.random.rand(1).astype(self.dtype) - - def init_output(self): - self.scale = 0.1 - self.out = (self.x + self.y) * self.scale - - -class TestFusedOperatorsOp_scalar2(TestElementwiseAddOp): - def init_input(self): - self.x = np.random.rand(2, 3, 4).astype(self.dtype) - self.y = np.random.rand(1, 1).astype(self.dtype) - - def init_output(self): - self.scale = 0.1 - self.out = (self.x + self.y) * self.scale - - -class TestFusedOperatorsOp_Vector(TestElementwiseAddOp): - def init_input(self): - self.x = np.random.random((32, )).astype(self.dtype) - self.y = np.random.random((32, )).astype(self.dtype) - - def init_output(self): - self.scale = 0.1 - self.out = (self.x + self.y) * self.scale - - -class TestFusedOperatorsOp_broadcast_0(TestElementwiseAddOp): - def init_input(self): - self.x = np.random.rand(2, 3, 4).astype(self.dtype) - self.y = np.random.rand(2).astype(self.dtype) - - def init_axis(self): - self.axis = 0 - - def init_output(self): - self.scale = 0.1 - self.out = (self.x + self.y.reshape(2, 1, 1)) * self.scale - - -class TestFusedOperatorsOp_broadcast_1(TestElementwiseAddOp): - def init_input(self): - self.x = np.random.rand(2, 3, 4).astype(self.dtype) - self.y = np.random.rand(3).astype(self.dtype) - - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.scale = 0.1 - self.out = (self.x + self.y.reshape(1, 3, 1)) * self.scale - - -class TestFusedOperatorsOp_broadcast_2(TestElementwiseAddOp): - def init_input(self): - self.x = np.random.rand(2, 3, 4).astype(self.dtype) - self.y = np.random.rand(4).astype(self.dtype) - - def init_output(self): - self.scale = 0.1 - self.out = (self.x + self.y.reshape(1, 1, 4)) * self.scale - - -class TestFusedOperatorsOp_broadcast_3(TestElementwiseAddOp): - def init_input(self): - self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) - self.y = np.random.rand(3, 
4).astype(self.dtype) - - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.scale = 0.1 - self.out = (self.x + self.y.reshape(1, 3, 4, 1)) * self.scale - - -class TestFusedOperatorsOp_broadcast_4(TestElementwiseAddOp): - def init_input(self): - self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) - self.y = np.random.rand(2, 1).astype(self.dtype) - - def init_axis(self): - self.axis = 0 - - def init_output(self): - self.scale = 0.1 - self.out = (self.x + self.y.reshape(2, 1, 1, 1)) * self.scale - - -class TestFusedOperatorsOp_rowwise_add_0(TestElementwiseAddOp): - def init_input(self): - self.x = np.random.rand(2, 3, 4).astype(self.dtype) - self.y = np.random.rand(3, 4).astype(self.dtype) - - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.scale = 0.1 - self.out = (self.x + self.y.reshape(1, 3, 4)) * self.scale - - -class TestFusedOperatorsOp_rowwise_add_1(TestElementwiseAddOp): - def init_input(self): - self.x = np.random.rand(2, 1).astype(self.dtype) - self.y = np.random.rand(1).astype(self.dtype) - - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.scale = 0.1 - self.out = (self.x + self.y.reshape(1, 1)) * self.scale - - -class TestFusedOperatorsOp_channelwise_add(TestElementwiseAddOp): - def init_input(self): - self.x = np.random.rand(3, 20, 20).astype(self.dtype) - self.y = np.random.rand(3, 1, 1).astype(self.dtype) - - def init_axis(self): - self.axis = -1 - - def init_output(self): - self.scale = 0.1 - self.out = (self.x + self.y) * self.scale - - -# add + scale -# TestElementwiseAddOp_f_add_scale -# TestFusedOperatorsOp_scalar_f_add_scale -# TestFusedOperatorsOp_scalar2_f_add_scale -# TestFusedOperatorsOp_Vector_f_add_scale -# TestFusedOperatorsOp_broadcast_0_f_add_scale -# TestFusedOperatorsOp_broadcast_1_f_add_scale -# TestFusedOperatorsOp_broadcast_2_f_add_scale -# TestFusedOperatorsOp_broadcast_3_f_add_scale -# TestFusedOperatorsOp_broadcast_4_f_add_scale -# 
TestFusedOperatorsOp_rowwise_add_0_f_add_scale -# TestFusedOperatorsOp_rowwise_add_1_f_add_scale -# TestFusedOperatorsOp_channelwise_add_f_add_scale - - -class TestFusedOperatorsOp_f_add_scale(TestElementwiseAddOp): - def init_output(self): - self.scale = 0.1 - self.out = self.x + self.y * self.scale - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'scale': self.scale, - 'functor_list': ["elementwise_add", "scale"] - } - - -class TestFusedOperatorsOp_scalar_f_add_scale(TestFusedOperatorsOp_scalar): - def init_output(self): - self.scale = 0.1 - self.out = self.x + self.y * self.scale - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'scale': self.scale, - 'functor_list': ["elementwise_add", "scale"] - } - - -class TestFusedOperatorsOp_scalar2_f_add_scale(TestFusedOperatorsOp_scalar2): - def init_output(self): - self.scale = 0.1 - self.out = self.x + self.y * self.scale - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'scale': self.scale, - 'functor_list': ["elementwise_add", "scale"] - } - - -class TestFusedOperatorsOp_Vector_f_add_scale(TestFusedOperatorsOp_Vector): - def init_output(self): - self.scale = 0.1 - self.out = self.x + self.y * self.scale - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'scale': self.scale, - 'functor_list': ["elementwise_add", "scale"] - } - - -class TestFusedOperatorsOp_broadcast_0_f_add_scale( - TestFusedOperatorsOp_broadcast_0): - def init_axis(self): - self.axis = 0 - - def init_output(self): - self.scale = 0.1 - self.out = self.x + self.y.reshape(2, 1, 1) * self.scale - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'scale': self.scale, - 'functor_list': ["elementwise_add", "scale"] - } - - -class TestFusedOperatorsOp_broadcast_1_f_add_scale( - TestFusedOperatorsOp_broadcast_1): - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.scale = 0.1 - self.out = self.x + self.y.reshape(1, 3, 1) * self.scale - - def init_attr(self): - 
self.attrs = { - 'axis': self.axis, - 'scale': self.scale, - 'functor_list': ["elementwise_add", "scale"] - } - - -class TestFusedOperatorsOp_broadcast_2_f_add_scale( - TestFusedOperatorsOp_broadcast_2): - def init_output(self): - self.scale = 0.1 - self.out = self.x + self.y.reshape(1, 1, 4) * self.scale - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'scale': self.scale, - 'functor_list': ["elementwise_add", "scale"] - } - - -class TestFusedOperatorsOp_broadcast_3_f_add_scale( - TestFusedOperatorsOp_broadcast_3): - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.scale = 0.1 - self.out = self.x + self.y.reshape(1, 3, 4, 1) * self.scale - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'scale': self.scale, - 'functor_list': ["elementwise_add", "scale"] - } - - -class TestFusedOperatorsOp_broadcast_4_f_add_scale( - TestFusedOperatorsOp_broadcast_4): - def init_axis(self): - self.axis = 0 - - def init_output(self): - self.scale = 0.2 - self.out = self.x + self.y.reshape(2, 1, 1, 1) * self.scale - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'scale': self.scale, - 'functor_list': ["elementwise_add", "scale"] - } - - -class TestFusedOperatorsOp_rowwise_add_0_f_add_scale( - TestFusedOperatorsOp_rowwise_add_0): - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.scale = 0.1 - self.out = self.x + self.y.reshape(1, 3, 4) * self.scale - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'scale': self.scale, - 'functor_list': ["elementwise_add", "scale"] - } - - -class TestFusedOperatorsOp_rowwise_add_1_f_add_scale( - TestFusedOperatorsOp_rowwise_add_1): - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.scale = 0.2 - self.out = self.x + self.y.reshape(1, 1) * self.scale - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'scale': self.scale, - 'functor_list': ["elementwise_add", "scale"] - } - - -class 
TestFusedOperatorsOp_channelwise_add_f_add_scale( - TestFusedOperatorsOp_channelwise_add): - def init_axis(self): - self.axis = -1 - - def init_output(self): - self.scale = 0.2 - self.out = self.x + self.y * self.scale - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'scale': self.scale, - 'functor_list': ["elementwise_add", "scale"] - } - - -# add + relu -# TestElementwiseAddOp_f_add_relu -# TestFusedOperatorsOp_scalar_f_add_relu -# TestFusedOperatorsOp_scalar2_f_add_relu -# TestFusedOperatorsOp_Vector_f_add_relu -# TestFusedOperatorsOp_broadcast_0_f_add_relu -# TestFusedOperatorsOp_broadcast_1_f_add_relu -# TestFusedOperatorsOp_broadcast_2_f_add_relu -# TestFusedOperatorsOp_broadcast_3_f_add_relu -# TestFusedOperatorsOp_broadcast_4_f_add_relu -# TestFusedOperatorsOp_rowwise_add_0_f_add_relu -# TestFusedOperatorsOp_rowwise_add_1_f_add_relu -# TestFusedOperatorsOp_channelwise_add_f_add_relu - - -class TestFusedOperatorsOp_f_add_relu(TestElementwiseAddOp): - def init_output(self): - # Copy from test_activation_op.py - # Because we set delta = 0.005 in calculating numeric gradient, - # if x is too small, such as 0.002, x_neg will be -0.003 - # x_pos will be 0.007, so the numeric gradient is inaccurate. 
- # we should avoid this - self.y[np.abs(self.y) < 0.005] = 0.02 - self.out = self.x + np.maximum(self.y, 0) - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["elementwise_add", "relu"] - } - - -class TestFusedOperatorsOp_scalar_f_add_relu(TestFusedOperatorsOp_scalar): - def init_output(self): - self.y[np.abs(self.y) < 0.005] = 0.02 - self.out = self.x + np.maximum(self.y, 0) - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["elementwise_add", "relu"] - } - - -class TestFusedOperatorsOp_scalar2_f_add_relu(TestFusedOperatorsOp_scalar2): - def init_output(self): - self.y[np.abs(self.y) < 0.005] = 0.02 - self.out = self.x + np.maximum(self.y, 0) - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["elementwise_add", "relu"] - } - - -class TestFusedOperatorsOp_Vector_f_add_relu(TestFusedOperatorsOp_Vector): - def init_output(self): - self.y[np.abs(self.y) < 0.005] = 0.02 - self.out = self.x + np.maximum(self.y, 0) - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["elementwise_add", "relu"] - } - - -class TestFusedOperatorsOp_broadcast_0_f_add_relu( - TestFusedOperatorsOp_broadcast_0): - def init_axis(self): - self.axis = 0 - - def init_output(self): - self.y[np.abs(self.y) < 0.005] = 0.02 - self.out = self.x + np.maximum(self.y.reshape(2, 1, 1), 0) - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["elementwise_add", "relu"] - } - - -class TestFusedOperatorsOp_broadcast_1_f_add_relu( - TestFusedOperatorsOp_broadcast_1): - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.y[np.abs(self.y) < 0.005] = 0.02 - self.out = self.x + np.maximum(self.y.reshape(1, 3, 1), 0) - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["elementwise_add", "relu"] - } - - -class TestFusedOperatorsOp_broadcast_2_f_add_relu( - TestFusedOperatorsOp_broadcast_2): - def init_output(self): - 
self.y[np.abs(self.y) < 0.005] = 0.02 - self.out = self.x + np.maximum(self.y.reshape(1, 1, 4), 0) - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["elementwise_add", "relu"] - } - - -class TestFusedOperatorsOp_broadcast_3_f_add_relu( - TestFusedOperatorsOp_broadcast_3): - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.y[np.abs(self.y) < 0.005] = 0.02 - self.out = self.x + np.maximum(self.y.reshape(1, 3, 4, 1), 0) - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["elementwise_add", "relu"] - } - - -class TestFusedOperatorsOp_broadcast_4_f_add_relu( - TestFusedOperatorsOp_broadcast_4): - def init_axis(self): - self.axis = 0 - - def init_output(self): - self.y[np.abs(self.y) < 0.005] = 0.02 - self.out = self.x + np.maximum(self.y.reshape(2, 1, 1, 1), 0) - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["elementwise_add", "relu"] - } - - -class TestFusedOperatorsOp_rowwise_add_0_f_add_relu( - TestFusedOperatorsOp_rowwise_add_0): - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.y[np.abs(self.y) < 0.005] = 0.02 - self.out = self.x + np.maximum(self.y.reshape(1, 3, 4), 0) - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["elementwise_add", "relu"] - } - - -class TestFusedOperatorsOp_rowwise_add_1_f_add_relu( - TestFusedOperatorsOp_rowwise_add_1): - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.y[np.abs(self.y) < 0.005] = 0.02 - self.out = self.x + np.maximum(self.y.reshape(1, 1), 0) - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["elementwise_add", "relu"] - } - - -class TestFusedOperatorsOp_channelwise_add_f_add_relu( - TestFusedOperatorsOp_channelwise_add): - def init_axis(self): - self.axis = -1 - - def init_output(self): - self.y[np.abs(self.y) < 0.005] = 0.02 - self.out = self.x + np.maximum(self.y, 0) - - def init_attr(self): - 
self.attrs = { - 'axis': self.axis, - 'functor_list': ["elementwise_add", "relu"] - } - - -# relu + add -# TestElementwiseAddOp_f_relu_add -# TestFusedOperatorsOp_scalar_f_relu_add -# TestFusedOperatorsOp_scalar2_f_relu_add -# TestFusedOperatorsOp_Vector_f_relu_add -# TestFusedOperatorsOp_broadcast_0_f_relu_add -# TestFusedOperatorsOp_broadcast_1_f_relu_add -# TestFusedOperatorsOp_broadcast_2_f_relu_add -# TestFusedOperatorsOp_broadcast_3_f_relu_add -# TestFusedOperatorsOp_broadcast_4_f_relu_add -# TestFusedOperatorsOp_rowwise_add_0_f_relu_add -# TestFusedOperatorsOp_rowwise_add_1_f_relu_add -# TestFusedOperatorsOp_channelwise_add_f_relu_add - - -class TestFusedOperatorsOp_f_relu_add(TestElementwiseAddOp): - def init_output(self): - # Copy from test_activation_op.py - # Because we set delta = 0.005 in calculating numeric gradient, - # if x is too small, such as 0.002, x_neg will be -0.003 - # x_pos will be 0.007, so the numeric gradient is inaccurate. - # we should avoid this - self.out = self.x + self.y - self.out = np.maximum(self.out, 0) - self.out[np.abs(self.out) < 0.005] = 0.02 - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["relu", "elementwise_add"] - } - - -class TestFusedOperatorsOp_scalar_f_relu_add(TestFusedOperatorsOp_scalar): - def init_output(self): - self.out = self.x + self.y - self.out = np.maximum(self.out, 0) - self.out[np.abs(self.out) < 0.005] = 0.02 - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["relu", "elementwise_add"] - } - - -class TestFusedOperatorsOp_scalar2_f_relu_add(TestFusedOperatorsOp_scalar2): - def init_output(self): - self.out = self.x + self.y - self.out = np.maximum(self.out, 0) - self.out[np.abs(self.out) < 0.005] = 0.02 - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["relu", "elementwise_add"] - } - - -class TestFusedOperatorsOp_Vector_f_relu_add(TestFusedOperatorsOp_Vector): - def init_output(self): - self.out = self.x + 
self.y - self.out = np.maximum(self.out, 0) - self.out[np.abs(self.out) < 0.005] = 0.02 - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["relu", "elementwise_add"] - } - - -class TestFusedOperatorsOp_broadcast_0_f_relu_add( - TestFusedOperatorsOp_broadcast_0): - def init_axis(self): - self.axis = 0 - - def init_output(self): - self.out = self.x + self.y.reshape(2, 1, 1) - self.out = np.maximum(self.out, 0) - self.out[np.abs(self.out) < 0.005] = 0.02 - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["relu", "elementwise_add"] - } - - -class TestFusedOperatorsOp_broadcast_1_f_relu_add( - TestFusedOperatorsOp_broadcast_1): - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.out = self.x + self.y.reshape(1, 3, 1) - self.out = np.maximum(self.out, 0) - self.out[np.abs(self.out) < 0.005] = 0.02 - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["relu", "elementwise_add"] - } - - -class TestFusedOperatorsOp_broadcast_2_f_relu_add( - TestFusedOperatorsOp_broadcast_2): - def init_output(self): - self.out = self.x + self.y.reshape(1, 1, 4) - self.out = np.maximum(self.out, 0) - self.out[np.abs(self.out) < 0.005] = 0.02 - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["relu", "elementwise_add"] - } - - -class TestFusedOperatorsOp_broadcast_3_f_relu_add( - TestFusedOperatorsOp_broadcast_3): - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.out = self.x + self.y.reshape(1, 3, 4, 1) - self.out = np.maximum(self.out, 0) - self.out[np.abs(self.out) < 0.005] = 0.02 - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["relu", "elementwise_add"] - } - - -class TestFusedOperatorsOp_broadcast_4_f_relu_add( - TestFusedOperatorsOp_broadcast_4): - def init_axis(self): - self.axis = 0 - - def init_output(self): - self.out = self.x + self.y.reshape(2, 1, 1, 1) - self.out = 
np.maximum(self.out, 0) - self.out[np.abs(self.out) < 0.005] = 0.02 - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["relu", "elementwise_add"] - } - - -class TestFusedOperatorsOp_rowwise_add_0_f_relu_add( - TestFusedOperatorsOp_rowwise_add_0): - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.out = self.x + self.y.reshape(1, 3, 4) - self.out = np.maximum(self.out, 0) - self.out[np.abs(self.out) < 0.005] = 0.02 - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["relu", "elementwise_add"] - } - - -class TestFusedOperatorsOp_rowwise_add_1_f_relu_add( - TestFusedOperatorsOp_rowwise_add_1): - def init_axis(self): - self.axis = 1 - - def init_output(self): - self.out = self.x + self.y.reshape(1, 1) - self.out = np.maximum(self.out, 0) - self.out[np.abs(self.out) < 0.005] = 0.02 - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["relu", "elementwise_add"] - } - - -class TestFusedOperatorsOp_channelwise_add_f_relu_add( - TestFusedOperatorsOp_channelwise_add): - def init_axis(self): - self.axis = -1 - - def init_output(self): - self.out = self.x + self.y - self.out = np.maximum(self.out, 0) - self.out[np.abs(self.out) < 0.005] = 0.02 - - def init_attr(self): - self.attrs = { - 'axis': self.axis, - 'functor_list': ["relu", "elementwise_add"] - } - +# TestFusedElementwiseActivationOp +# TestFusedElementwiseActivationOp_scalar +# TestFusedElementwiseActivationOp_scalar2 +# TestFusedElementwiseActivationOp_Vector +# TestFusedElementwiseActivationOp_broadcast_0 +# TestFusedElementwiseActivationOp_broadcast_1 +# TestFusedElementwiseActivationOp_broadcast_2 +# TestFusedElementwiseActivationOp_broadcast_3 +# TestFusedElementwiseActivationOp_broadcast_4 +# TestFusedElementwiseActivationOp_rowwise_add_0 +# TestFusedElementwiseActivationOp_rowwise_add_1 +# TestFusedElementwiseActivationOp_channelwise_add + + +def create_test_class(test_case, callback, attrs): + class 
TestFusedElementwiseActivationOp_base(OpTest): + def setUp(self): + self.op_type = "fused_elemwise_activation" + self.dtype = np.float32 + self.axis = -1 + + self.init_input() + self.init_output() + self.init_attr() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + if self.attrs["keep_intermediate_value"]: + self.outputs = { + 'Out': self.out, + "IntermediateOut": self.intermediate_out + } + else: + self.outputs = {'Out': self.out} + + def init_input(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.axis = -1 + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y) + + def init_attr(self): + self.attrs = {'axis': self.axis, } + for key in attrs.keys(): + self.attrs[key] = attrs[key] + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + if self.attrs["keep_intermediate_value"]: + self.check_grad( + ['X', 'Y'], ['Out', 'IntermediateOut'], + max_relative_error=0.005, + sum_outputs=['Out']) + else: + self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005) + + def test_check_grad_ingore_x(self): + if self.attrs["keep_intermediate_value"]: + self.check_grad( + ['Y'], ['Out', 'IntermediateOut'], + max_relative_error=0.005, + no_grad_set=set("X"), + sum_outputs=['Out']) + else: + self.check_grad( + ['Y'], ['Out'], + max_relative_error=0.005, + no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + if self.attrs["keep_intermediate_value"]: + self.check_grad( + ['X'], ['Out', 'IntermediateOut'], + max_relative_error=0.005, + no_grad_set=set("Y"), + sum_outputs=['Out']) + else: + self.check_grad( + ['X'], ['Out'], + max_relative_error=0.005, + no_grad_set=set("Y")) + + class TestFusedElementwiseActivationOp_scalar( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = 
np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + + class TestFusedElementwiseActivationOp_scalar2( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1, 1).astype(self.dtype) + + class TestFusedElementwiseActivationOp_Vector( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.random((32, )).astype(self.dtype) + self.y = np.random.random((32, )).astype(self.dtype) + + class TestFusedElementwiseActivationOp_broadcast_0( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(2).astype(self.dtype) + self.axis = 0 + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y.reshape(2, 1, 1)) + + class TestFusedElementwiseActivationOp_broadcast_1( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(3).astype(self.dtype) + self.axis = 1 + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y.reshape(1, 3, 1)) + + class TestFusedElementwiseActivationOp_broadcast_2( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(4).astype(self.dtype) + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y.reshape(1, 1, 4)) + + class TestFusedElementwiseActivationOp_broadcast_3( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(3, 4).astype(self.dtype) + self.axis = 1 + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + 
callback(self.x, self.y, self.x, self.y.reshape(1, 3, 4, 1)) + + class TestFusedElementwiseActivationOp_broadcast_4( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(2, 1).astype(self.dtype) + self.axis = 0 + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y.reshape(2, 1, 1, 1)) + + class TestFusedElementwiseActivationOp_rowwise_add_0( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(3, 4).astype(self.dtype) + self.axis = 1 + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y.reshape(1, 3, 4)) + + class TestFusedElementwiseActivationOp_rowwise_add_1( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(2, 1).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.axis = 1 + + def init_output(self): + self.x, self.y, self.intermediate_out, self.out = \ + callback(self.x, self.y, self.x, self.y.reshape(1, 1)) + + class TestFusedElementwiseActivationOp_channelwise_add( + TestFusedElementwiseActivationOp_base): + def init_input(self): + self.x = np.random.rand(3, 20, 20).astype(self.dtype) + self.y = np.random.rand(3, 1, 1).astype(self.dtype) + + TestFusedElementwiseActivationOp_base.__name__ = test_case + "_base" + TestFusedElementwiseActivationOp_scalar.__name__ = test_case + "_scalar" + TestFusedElementwiseActivationOp_scalar2.__name__ = test_case + "_scalar2" + TestFusedElementwiseActivationOp_Vector.__name__ = test_case + "_Vector" + TestFusedElementwiseActivationOp_broadcast_0.__name__ = test_case + "_broadcast_0" + TestFusedElementwiseActivationOp_broadcast_1.__name__ = test_case + "_broadcast_1" + TestFusedElementwiseActivationOp_broadcast_2.__name__ = test_case + 
"_broadcast_2" + TestFusedElementwiseActivationOp_broadcast_3.__name__ = test_case + "_broadcast_3" + TestFusedElementwiseActivationOp_broadcast_4.__name__ = test_case + "_broadcast_4" + TestFusedElementwiseActivationOp_rowwise_add_0.__name__ = test_case + "_rowwise_add_0" + TestFusedElementwiseActivationOp_rowwise_add_1.__name__ = test_case + "_rowwise_add_1" + TestFusedElementwiseActivationOp_channelwise_add.__name__ = test_case + "_channelwise_add" + + globals()[test_case + "_base"] = TestFusedElementwiseActivationOp_base + globals()[test_case + "_scalar"] = TestFusedElementwiseActivationOp_scalar + globals()[test_case + "_scalar2"] = TestFusedElementwiseActivationOp_scalar2 + globals()[test_case + "_Vector"] = TestFusedElementwiseActivationOp_Vector + globals()[test_case + + "_broadcast_0"] = TestFusedElementwiseActivationOp_broadcast_0 + globals()[test_case + + "_broadcast_1"] = TestFusedElementwiseActivationOp_broadcast_1 + globals()[test_case + + "_broadcast_2"] = TestFusedElementwiseActivationOp_broadcast_2 + globals()[test_case + + "_broadcast_3"] = TestFusedElementwiseActivationOp_broadcast_3 + globals()[test_case + + "_broadcast_4"] = TestFusedElementwiseActivationOp_broadcast_4 + globals()[test_case + + "_rowwise_add_0"] = TestFusedElementwiseActivationOp_rowwise_add_0 + globals()[test_case + + "_rowwise_add_1"] = TestFusedElementwiseActivationOp_rowwise_add_1 + globals( + )[test_case + + "_channelwise_add"] = TestFusedElementwiseActivationOp_channelwise_add + + +def scale_add_func(x, y, x_bcast, y_bcast, scale, mode=0): + if mode == 0: + return x, y, (x_bcast + y_bcast), (x_bcast + y_bcast) * scale + else: + return y, x, (x_bcast + y_bcast), (x_bcast + y_bcast) * scale + + +def add_scale_func(x, y, x_bcast, y_bcast, scale, mode=0): + if mode == 0: + return x, y, y * scale, x_bcast + y_bcast * scale + else: + return y, x, x * scale, y_bcast + x_bcast * scale + + +def add_relu_func(x, y, x_bcast, y_bcast, mode=0): + # Copy from test_activation_op.py + # 
Because we set delta = 0.005 in calculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # x_pos will be 0.007, so the numeric gradient is inaccurate. + # we should avoid this + if mode == 0: + y[np.abs(y) < 0.005] = 0.02 + y_bcast[np.abs(y_bcast) < 0.005] = 0.02 + return x, y, np.maximum(y, 0), x_bcast + np.maximum(y_bcast, 0) + else: + x[np.abs(x) < 0.005] = 0.02 + x_bcast[np.abs(x_bcast) < 0.005] = 0.02 + return y, x, np.maximum(x, 0), y_bcast + np.maximum(x_bcast, 0) + + +def relu_add_func(x, y, x_bcast, y_bcast, mode=0): + intermediate_out = x_bcast + y_bcast + out = np.maximum(intermediate_out, 0) + out[np.abs(out) < 0.005] = 0.02 + if mode == 0: + return x, y, intermediate_out, out + else: + return y, x, intermediate_out, out + + +def mul_scale_func(x, y, x_bcast, y_bcast, scale, mode=0): + if mode == 0: + return x, y, y * scale, x_bcast * (y_bcast * scale) + else: + return y, x, x * scale, y_bcast * (x_bcast * scale) + + +scale = 0.1 +scale_add_func = partial(scale_add_func, scale=scale) +add_scale_func = partial(add_scale_func, scale=scale) +mul_scale_func = partial(mul_scale_func, scale=scale) + +for mode in {0, 1}: + scale_add_func = partial(scale_add_func, mode=mode) + add_scale_func = partial(add_scale_func, mode=mode) + mul_scale_func = partial(mul_scale_func, mode=mode) + relu_add_func = partial(relu_add_func, mode=mode) + add_relu_func = partial(add_relu_func, mode=mode) + + for recomputation in {True, False}: + for keep_intermediate_value in {True, False}: + suffix = ("_keep_intermediate_value" if keep_intermediate_value else "") \ + + ("_recomputation" if recomputation else "") \ + + ("_mode_"+ str(mode)) + create_test_class('scale_add' + suffix, scale_add_func, { + 'scale': scale, + 'functor_list': ["scale", "elementwise_add"], + 'keep_intermediate_value': keep_intermediate_value, + 'recomputation': recomputation + }) + create_test_class('add_scale' + suffix, add_scale_func, { + 'scale': scale, + 'functor_list': 
["elementwise_add", "scale"], + 'keep_intermediate_value': keep_intermediate_value, + 'recomputation': recomputation + }) + create_test_class('add_relu' + suffix, add_relu_func, { + 'functor_list': ["elementwise_add", "relu"], + 'keep_intermediate_value': keep_intermediate_value, + 'recomputation': recomputation + }) + create_test_class('relu_add' + suffix, relu_add_func, { + 'functor_list': ["relu", "elementwise_add"], + 'keep_intermediate_value': keep_intermediate_value, + 'recomputation': recomputation + }) + create_test_class('mul_scale' + suffix, mul_scale_func, { + 'scale': scale, + 'functor_list': ["elementwise_mul", "scale"], + 'keep_intermediate_value': keep_intermediate_value, + 'recomputation': recomputation + }) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py new file mode 100644 index 0000000000000000000000000000000000000000..764f83b534c8a183dbf21511f0b05741c13c9528 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py @@ -0,0 +1,133 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import math +from op_test import OpTest +from test_gru_op import gru +from test_fusion_lstm_op import fc, ACTIVATION + + +def fusion_gru( + x, # T x M + lod, # 1 x N + h0, # N x D + wx, # M x 3D + wh, # D x 3D + bias, # 1 x 3D + is_reverse, + act_state, + act_gate): + return gru(fc(x, wx, bias), + lod, + h0, + wh, + np.zeros( + (1, wh.shape[1]), dtype='float64'), + is_reverse, + act_state, + act_gate) + + +class TestFusionGRUOp(OpTest): + def set_confs(self): + pass + + def setUp(self): + self.op_type = "fusion_gru" + self.lod = [[2, 4, 3]] + self.M = 3 + self.D = 5 + self.is_reverse = False + self.with_h0 = True + self.with_bias = True + self.act_state = 'tanh' + self.act_gate = 'sigmoid' + self.set_confs() + + T = sum(self.lod[0]) + N = len(self.lod[0]) + + x = np.random.rand(T, self.M).astype('float64') + wx = np.random.rand(self.M, 3 * self.D).astype('float64') + wh = np.random.rand(self.D, 3 * self.D).astype('float64') + bias = np.random.rand( + 1, 3 * self.D).astype('float64') if self.with_bias else np.zeros( + (1, 3 * self.D), dtype='float64') + h0 = np.random.rand( + N, self.D).astype('float64') if self.with_h0 else np.zeros( + (N, self.D), dtype='float64') + + _, _, _, hidden = fusion_gru( + x, self.lod, h0, wx, wh, bias, self.is_reverse, + ACTIVATION[self.act_state], ACTIVATION[self.act_gate]) + + self.inputs = {'X': (x, self.lod), 'WeightX': wx, 'WeightH': wh} + + if self.with_bias: + self.inputs['Bias'] = bias + + if self.with_h0: + self.inputs['H0'] = h0 + + self.outputs = {'Hidden': (hidden, self.lod)} + + self.attrs = { + 'activation': self.act_state, + 'gate_activation': self.act_gate, + 'is_reverse': self.is_reverse + } + + def test_check_output(self): + self.check_output(atol=1e-8) + + +class TestFusionGRUOpNoInitial(TestFusionGRUOp): + def set_confs(self): + self.with_h0 = False + + +class TestFusionGRUOpNoBias(TestFusionGRUOp): + def set_confs(self): + 
self.with_bias = False + + +class TestFusionGRUOpReverse(TestFusionGRUOp): + def set_confs(self): + self.is_reverse = True + + +class TestFusionGRUOpMD1(TestFusionGRUOp): + def set_confs(self): + self.M = 36 + self.D = 8 + + +class TestFusionGRUOpMD2(TestFusionGRUOp): + def set_confs(self): + self.M = 8 + self.D = 8 + + +class TestFusionGRUOpBS1(TestFusionGRUOp): + def set_confs(self): + self.lod = [[3]] + self.D = 16 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py index 9d8bef677fd16fb6bdc20b929137b4d885f4efd1..5805bdf461998e90611dec05b079cd55feda520d 100644 --- a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py @@ -43,13 +43,13 @@ def fusion_lstm( act_cell, act_cand) -class TestLstmOp(OpTest): - def set_argument(self): - self.lod = [[2, 3, 2]] +class TestFusionLSTMOp(OpTest): + def set_conf(self): + pass def setUp(self): self.op_type = 'fusion_lstm' - self.lod = [[2, 3, 2]] + self.lod = [[2, 3, 5, 4]] self.M = 8 self.D = 16 self.has_initial_state = False @@ -58,33 +58,33 @@ class TestLstmOp(OpTest): self.act_cell = 'tanh' self.act_cand = 'tanh' self.use_peepholes = False - self.set_argument() + self.set_conf() T = sum(self.lod[0]) bs = len(self.lod[0]) - x = np.random.normal(size=(T, self.M)).astype('float64') + x = np.random.normal(size=(T, self.M)).astype('float32') if self.has_initial_state: - h0 = np.random.normal(size=(bs, self.D)).astype('float64') - c0 = np.random.normal(size=(bs, self.D)).astype('float64') + h0 = np.random.normal(size=(bs, self.D)).astype('float32') + c0 = np.random.normal(size=(bs, self.D)).astype('float32') else: - h0 = np.zeros((bs, self.D)).astype('float64') - c0 = np.zeros((bs, self.D)).astype('float64') + h0 = np.zeros((bs, self.D)).astype('float32') + c0 = np.zeros((bs, self.D)).astype('float32') - wh = 
np.random.normal(size=(self.D, 4 * self.D)).astype('float64') + wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float32') if self.use_peepholes: - b = np.random.normal(size=(1, 7 * self.D)).astype('float64') + b = np.random.normal(size=(1, 7 * self.D)).astype('float32') else: - b = np.random.normal(size=(1, 4 * self.D)).astype('float64') + b = np.random.normal(size=(1, 4 * self.D)).astype('float32') w_b = np.copy(b[:, 0:4 * self.D]) w_c = b[:, 4 * self.D:] if self.use_peepholes else None # this is the weight of fc - wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float64') + wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float32') # this is the bias of fc # and it should be manually added into the bias of this fusion LSTM - bx = np.random.normal(size=(1, 4 * self.D)).astype('float64') + bx = np.random.normal(size=(1, 4 * self.D)).astype('float32') b[0, 0:4 * self.D] += bx[0, :] h, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c, self.is_reverse, ACTIVATION[self.act_gate], @@ -114,35 +114,45 @@ class TestLstmOp(OpTest): } def test_check_output(self): - self.check_output(atol=1e-8) + self.check_output() -class TestLstmOpInitReverse(TestLstmOp): - def set_argument(self): +class TestFusionLSTMOpInit(TestFusionLSTMOp): + def set_conf(self): + self.has_initial_state = True + + +class TestFusionLSTMOpReverse(TestFusionLSTMOp): + def set_conf(self): + self.is_reverse = True + + +class TestFusionLSTMOpInitReverse(TestFusionLSTMOp): + def set_conf(self): self.has_initial_state = True self.is_reverse = True -class TestLstmOpMD1(TestLstmOp): - def set_argument(self): +class TestFusionLSTMOpMD1(TestFusionLSTMOp): + def set_conf(self): self.M = 36 self.D = 8 -class TestLstmOpMD2(TestLstmOp): - def set_argument(self): +class TestFusionLSTMOpMD2(TestFusionLSTMOp): + def set_conf(self): self.M = 8 self.D = 8 -class TestLstmOpMD3(TestLstmOp): - def set_argument(self): +class TestFusionLSTMOpMD3(TestFusionLSTMOp): + def set_conf(self): self.M = 
15 self.D = 3 -class TestLstmOpBS1(TestLstmOp): - def set_argument(self): +class TestFusionLSTMOpBS1(TestFusionLSTMOp): + def set_conf(self): self.lod = [[3]] self.D = 16 diff --git a/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py b/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py new file mode 100644 index 0000000000000000000000000000000000000000..aeee3a9999a94b4979fc3793150101352e50be85 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py @@ -0,0 +1,139 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +from test_fusion_lstm_op import fc, ACTIVATION + + +def fusion_seqexpand_concat_fc(xs, lod, w, b, fc_act): + + T = sum(lod[0]) + N = len(lod[0]) + num_inputs = len(xs) + D = w.shape[1] + + expanded_inputs = [xs[0]] + for i in range(num_inputs - 1): + x = xs[i + 1] + assert x.shape[0] == N + expanded = np.repeat(x, lod[0], axis=0) + assert expanded.shape[0] == T + assert expanded.shape[1] == x.shape[1] + expanded_inputs.append(expanded) + + fc_input = np.concatenate(expanded_inputs, axis=1) + assert fc_input.shape[0] == T + assert fc_input.shape[1] == w.shape[0] + fc_out = fc(fc_input, w, b) + fc_out = fc_act(fc_out) + assert fc_out.shape[0] == T + assert fc_out.shape[1] == D + return fc_out + + +class TestFusionSeqExpandConcatFCOp(OpTest): + def set_conf(self): + pass + + def setUp(self): + self.op_type = 'fusion_seqexpand_concat_fc' + self.lod = [[3, 5, 8, 2]] + self.inputs_M = [15, 10, 10] + self.D = 20 + self.with_bias = True + self.fc_act = 'relu' + self.set_conf() + + T = sum(self.lod[0]) + bs = len(self.lod[0]) + num_inputs = len(self.inputs_M) + + x0 = np.random.normal(size=(T, self.inputs_M[0])).astype('float32') + xs = [x0] + for i in range(num_inputs - 1): + xi = np.random.normal(size=(bs, + self.inputs_M[i + 1])).astype('float32') + xs.append(xi) + + # fc weight and bias + w = np.random.normal(size=(sum(self.inputs_M), + self.D)).astype('float32') + b = np.random.normal(size=( + 1, self.D)).astype('float32') if self.with_bias else np.zeros( + (1, self.D)).astype('float32') + + out = fusion_seqexpand_concat_fc(xs, self.lod, w, b, + ACTIVATION[self.fc_act]) + + self.inputs = {'X': [('x0', (x0, self.lod))], 'FCWeight': w} + normal_lod = [[1] * bs] + for i in range(num_inputs - 1): + self.inputs['X'].append(('x%d' % (i + 1), (xs[i + 1], normal_lod))) + + if self.with_bias: + self.inputs['FCBias'] = b + + self.outputs = {'Out': (out, self.lod)} + 
self.attrs = {'fc_activation': self.fc_act} + + def test_check_output(self): + self.check_output() + + +class TestFusionSECFCOpNonBias(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.with_bias = False + + +class TestFusionSECFCOpNonAct(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.fc_act = 'identity' + + +class TestFusionSECFCOpMD1(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.inputs_M = [3, 4, 2, 1, 5] + self.D = 8 + + +class TestFusionSECFCOpMD2(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.lod = [[5, 6]] + self.inputs_M = [1, 1] + + +class TestFusionSECFCOpBS1_1(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.lod = [[1]] + self.inputs_M = [3, 4, 2] + + +class TestFusionSECFCOpBS1_2(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.lod = [[1]] + self.inputs_M = [3, 4] + + +class TestFusionSECFCOpBS1_3(TestFusionSeqExpandConcatFCOp): + def set_conf(self): + self.lod = [[5]] + self.inputs_M = [6, 3] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py new file mode 100644 index 0000000000000000000000000000000000000000..ce766fffbce98a6a2cee4c508d6db85ee0163401 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py @@ -0,0 +1,317 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys +import math +import paddle.fluid as fluid +from op_test import OpTest + + +def generate_proposal_labels_in_python( + rpn_rois, gt_classes, gt_boxes, im_scales, batch_size_per_im, + fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, + class_nums): + rois = [] + labels_int32 = [] + bbox_targets = [] + bbox_inside_weights = [] + bbox_outside_weights = [] + lod = [] + assert len(rpn_rois) == len( + im_scales), 'batch size of rpn_rois and ground_truth is not matched' + + for im_i in range(len(im_scales)): + frcn_blobs = _sample_rois( + rpn_rois[im_i], gt_classes[im_i], gt_boxes[im_i], im_scales[im_i], + batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, + bg_thresh_lo, bbox_reg_weights, class_nums) + + lod.append(frcn_blobs['rois'].shape[0]) + + rois.append(frcn_blobs['rois']) + labels_int32.append(frcn_blobs['labels_int32']) + bbox_targets.append(frcn_blobs['bbox_targets']) + bbox_inside_weights.append(frcn_blobs['bbox_inside_weights']) + bbox_outside_weights.append(frcn_blobs['bbox_outside_weights']) + + return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, lod + + +def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, + fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, + bbox_reg_weights, class_nums): + rois_per_image = int(batch_size_per_im) + fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) + + # Roidb + inv_im_scale = 1. 
/ im_scale + rpn_rois = rpn_rois * inv_im_scale + + boxes = np.vstack([gt_boxes, rpn_rois]) + gt_overlaps = np.zeros((boxes.shape[0], class_nums)) + box_to_gt_ind_map = np.zeros((boxes.shape[0]), dtype=np.int32) + if len(gt_boxes) > 0: + proposal_to_gt_overlaps = _bbox_overlaps(boxes, gt_boxes) + + overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1) + overlaps_max = proposal_to_gt_overlaps.max(axis=1) + # Boxes which with non-zero overlap with gt boxes + overlapped_boxes_ind = np.where(overlaps_max > 0)[0] + overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[ + overlapped_boxes_ind]] + gt_overlaps[overlapped_boxes_ind, + overlapped_boxes_gt_classes] = overlaps_max[ + overlapped_boxes_ind] + box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[ + overlapped_boxes_ind] + + max_overlaps = gt_overlaps.max(axis=1) + max_classes = gt_overlaps.argmax(axis=1) + + # Foreground + fg_inds = np.where(max_overlaps >= fg_thresh)[0] + fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0]) + # Sample foreground if there are too many + if fg_inds.shape[0] > fg_rois_per_this_image: + fg_inds = np.random.choice( + fg_inds, size=fg_rois_per_this_image, replace=False) + + # Background + bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >= + bg_thresh_lo))[0] + bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image + bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, + bg_inds.shape[0]) + # Sample background if there are too many + if bg_inds.shape[0] > bg_rois_per_this_image: + bg_inds = np.random.choice( + bg_inds, size=bg_rois_per_this_image, replace=False) + + keep_inds = np.append(fg_inds, bg_inds) + sampled_labels = max_classes[keep_inds] + sampled_labels[fg_rois_per_this_image:] = 0 + sampled_boxes = boxes[keep_inds] + sampled_gts = gt_boxes[box_to_gt_ind_map[keep_inds]] + sampled_gts[fg_rois_per_this_image:, :] = gt_boxes[0] + + bbox_label_targets = _compute_targets(sampled_boxes, sampled_gts, + sampled_labels, 
bbox_reg_weights) + bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_label_targets, + class_nums) + bbox_outside_weights = np.array( + bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype) + + # Scale rois + sampled_rois = sampled_boxes * im_scale + + # Faster RCNN blobs + frcn_blobs = dict( + rois=sampled_rois, + labels_int32=sampled_labels, + bbox_targets=bbox_targets, + bbox_inside_weights=bbox_inside_weights, + bbox_outside_weights=bbox_outside_weights) + return frcn_blobs + + +def _bbox_overlaps(roi_boxes, gt_boxes): + w1 = np.maximum(roi_boxes[:, 2] - roi_boxes[:, 0] + 1, 0) + h1 = np.maximum(roi_boxes[:, 3] - roi_boxes[:, 1] + 1, 0) + w2 = np.maximum(gt_boxes[:, 2] - gt_boxes[:, 0] + 1, 0) + h2 = np.maximum(gt_boxes[:, 3] - gt_boxes[:, 1] + 1, 0) + area1 = w1 * h1 + area2 = w2 * h2 + + overlaps = np.zeros((roi_boxes.shape[0], gt_boxes.shape[0])) + for ind1 in range(roi_boxes.shape[0]): + for ind2 in range(gt_boxes.shape[0]): + inter_x1 = np.maximum(roi_boxes[ind1, 0], gt_boxes[ind2, 0]) + inter_y1 = np.maximum(roi_boxes[ind1, 1], gt_boxes[ind2, 1]) + inter_x2 = np.minimum(roi_boxes[ind1, 2], gt_boxes[ind2, 2]) + inter_y2 = np.minimum(roi_boxes[ind1, 3], gt_boxes[ind2, 3]) + inter_w = np.maximum(inter_x2 - inter_x1 + 1, 0) + inter_h = np.maximum(inter_y2 - inter_y1 + 1, 0) + inter_area = inter_w * inter_h + iou = inter_area / (area1[ind1] + area2[ind2] - inter_area) + overlaps[ind1, ind2] = iou + return overlaps + + +def _compute_targets(roi_boxes, gt_boxes, labels, bbox_reg_weights): + assert roi_boxes.shape[0] == gt_boxes.shape[0] + assert roi_boxes.shape[1] == 4 + assert gt_boxes.shape[1] == 4 + + targets = np.zeros(roi_boxes.shape) + bbox_reg_weights = np.asarray(bbox_reg_weights) + targets = _box_to_delta( + ex_boxes=roi_boxes, gt_boxes=gt_boxes, weights=bbox_reg_weights) + + return np.hstack([labels[:, np.newaxis], targets]).astype( + np.float32, copy=False) + + +def _box_to_delta(ex_boxes, gt_boxes, weights): + ex_w = ex_boxes[:, 2] - 
ex_boxes[:, 0] + 1 + ex_h = ex_boxes[:, 3] - ex_boxes[:, 1] + 1 + ex_ctr_x = ex_boxes[:, 0] + 0.5 * ex_w + ex_ctr_y = ex_boxes[:, 1] + 0.5 * ex_h + + gt_w = gt_boxes[:, 2] - gt_boxes[:, 0] + 1 + gt_h = gt_boxes[:, 3] - gt_boxes[:, 1] + 1 + gt_ctr_x = gt_boxes[:, 0] + 0.5 * gt_w + gt_ctr_y = gt_boxes[:, 1] + 0.5 * gt_h + + dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0] + dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1] + dw = (np.log(gt_w / ex_w)) / ex_w / weights[2] + dh = (np.log(gt_h / ex_h)) / ex_h / weights[3] + + targets = np.vstack([dx, dy, dw, dh]).transpose() + return targets + + +def _expand_bbox_targets(bbox_targets_input, class_nums): + class_labels = bbox_targets_input[:, 0] + fg_inds = np.where(class_labels > 0)[0] + + bbox_targets = np.zeros((class_labels.shape[0], 4 * class_nums)) + bbox_inside_weights = np.zeros(bbox_targets.shape) + for ind in fg_inds: + class_label = int(class_labels[ind]) + start_ind = class_label * 4 + end_ind = class_label * 4 + 4 + bbox_targets[ind, start_ind:end_ind] = bbox_targets_input[ind, 1:] + bbox_inside_weights[ind, start_ind:end_ind] = (1.0, 1.0, 1.0, 1.0) + + return bbox_targets, bbox_inside_weights + + +class TestGenerateProposalLabelsOp(OpTest): + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = { + 'RpnRois': (self.rpn_rois[0], self.rpn_rois_lod), + 'GtClasses': (self.gt_classes[0], self.gts_lod), + 'GtBoxes': (self.gt_boxes[0], self.gts_lod), + 'ImScales': self.im_scales[0] + } + self.attrs = { + 'batch_size_per_im': self.batch_size_per_im, + 'fg_fraction': self.fg_fraction, + 'fg_thresh': self.fg_thresh, + 'bg_thresh_hi': self.bg_thresh_hi, + 'bg_thresh_lo': self.bg_thresh_lo, + 'bbox_reg_weights': self.bbox_reg_weights, + 'class_nums': self.class_nums + } + self.outputs = { + 'Rois': (self.rois[0], [self.lod]), + 'LabelsInt32': (self.labels_int32[0], [self.lod]), + 'BboxTargets': (self.bbox_targets[0], [self.lod]), + 'BboxInsideWeights': 
(self.bbox_inside_weights[0], [self.lod]), + 'BboxOutsideWeights': (self.bbox_outside_weights[0], [self.lod]), + } + + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = 'generate_proposal_labels' + self.set_data() + + def init_test_params(self): + self.batch_size_per_im = 10 + self.fg_fraction = 1.0 + self.fg_thresh = 0.5 + self.bg_thresh_hi = 0.5 + self.bg_thresh_lo = 0.0 + self.bbox_reg_weights = [0.1, 0.1, 0.2, 0.2] + self.class_nums = 81 + + def init_test_input(self): + np.random.seed(0) + image_nums = 1 + gt_nums = 6 # Keep same with batch_size_per_im for unittest + proposal_nums = self.batch_size_per_im - gt_nums + images_shape = [] + self.im_scales = [] + for i in range(image_nums): + images_shape.append(np.random.randint(200, size=2)) + self.im_scales.append(np.ones((1)).astype(np.float32)) + + self.rpn_rois, self.rpn_rois_lod = _generate_proposals(images_shape, + proposal_nums) + ground_truth, self.gts_lod = _generate_groundtruth( + images_shape, self.class_nums, gt_nums) + self.gt_classes = [gt['gt_classes'] for gt in ground_truth] + self.gt_boxes = [gt['boxes'] for gt in ground_truth] + + def init_test_output(self): + self.rois, self.labels_int32, self.bbox_targets, \ + self.bbox_inside_weights, self.bbox_outside_weights, \ + self.lod = generate_proposal_labels_in_python( + self.rpn_rois, self.gt_classes, self.gt_boxes, self.im_scales, + self.batch_size_per_im, self.fg_fraction, + self.fg_thresh, self.bg_thresh_hi, self.bg_thresh_lo, + self.bbox_reg_weights, self.class_nums + ) + + +def _generate_proposals(images_shape, proposal_nums): + rpn_rois = [] + rpn_rois_lod = [] + num_proposals = 0 + for i, image_shape in enumerate(images_shape): + proposals = _generate_boxes(image_shape, proposal_nums) + rpn_rois.append(proposals) + num_proposals += len(proposals) + rpn_rois_lod.append(num_proposals) + return rpn_rois, [rpn_rois_lod] + + +def _generate_groundtruth(images_shape, class_nums, gt_nums): + ground_truth = [] + 
gts_lod = [] + num_gts = 0 + for i, image_shape in enumerate(images_shape): + # Avoid background + gt_classes = np.random.randint( + low=1, high=class_nums, size=gt_nums).astype(np.int32) + gt_boxes = _generate_boxes(image_shape, gt_nums) + ground_truth.append(dict(gt_classes=gt_classes, boxes=gt_boxes)) + num_gts += len(gt_classes) + gts_lod.append(num_gts) + return ground_truth, [gts_lod] + + +def _generate_boxes(image_size, box_nums): + width = image_size[0] + height = image_size[1] + xywh = np.random.rand(box_nums, 4) + xy1 = xywh[:, [0, 1]] * image_size + wh = xywh[:, [2, 3]] * (image_size - xy1) + xy2 = xy1 + wh + boxes = np.hstack([xy1, xy2]) + boxes[:, [0, 2]] = np.minimum(width - 1., np.maximum(0., boxes[:, [0, 2]])) + boxes[:, [1, 3]] = np.minimum(height - 1., np.maximum(0., boxes[:, [1, 3]])) + return boxes.astype(np.float32) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals.py b/python/paddle/fluid/tests/unittests/test_generate_proposals.py new file mode 100644 index 0000000000000000000000000000000000000000..3fbd2ce95a4f22b91cd4955f914e12f422b0ee83 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals.py @@ -0,0 +1,320 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +import sys +import math +import paddle.fluid as fluid +from op_test import OpTest +from test_multiclass_nms_op import nms +from test_anchor_generator_op import anchor_generator_in_python +import copy + + +def generate_proposals_in_python(scores, bbox_deltas, im_info, anchors, + variances, pre_nms_topN, post_nms_topN, + nms_thresh, min_size, eta): + all_anchors = anchors.reshape(-1, 4) + rois = np.empty((0, 5), dtype=np.float32) + roi_probs = np.empty((0, 1), dtype=np.float32) + + rpn_rois = [] + rpn_roi_probs = [] + lod = [] + num_images = scores.shape[0] + for img_idx in range(num_images): + img_i_boxes, img_i_probs = proposal_for_one_image( + im_info[img_idx, :], all_anchors, variances, + bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :], + pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta) + lod.append(img_i_probs.shape[0]) + rpn_rois.append(img_i_boxes) + rpn_roi_probs.append(img_i_probs) + + return rpn_rois, rpn_roi_probs, lod + + +def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores, + pre_nms_topN, post_nms_topN, nms_thresh, min_size, + eta): + # Transpose and reshape predicted bbox transformations to get them + # into the same order as the anchors: + # - bbox deltas will be (4 * A, H, W) format from conv output + # - transpose to (H, W, 4 * A) + # - reshape to (H * W * A, 4) where rows are ordered by (H, W, A) + # in slowest to fastest order to match the enumerated anchors + bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape(-1, 4) + all_anchors = all_anchors.reshape(-1, 4) + variances = variances.reshape(-1, 4) + # Same story for the scores: + # - scores are (A, H, W) format from conv output + # - transpose to (H, W, A) + # - reshape to (H * W * A, 1) where rows are ordered by (H, W, A) + # to match the order of anchors and bbox_deltas + scores = scores.transpose((1, 2, 0)).reshape(-1, 1) + + # sort all (proposal, score) pairs by score from highest to lowest + # take top 
pre_nms_topN (e.g. 6000) + if pre_nms_topN <= 0 or pre_nms_topN >= len(scores): + order = np.argsort(-scores.squeeze()) + else: + # Avoid sorting possibly large arrays; + # First partition to get top K unsorted + # and then sort just those + inds = np.argpartition(-scores.squeeze(), pre_nms_topN)[:pre_nms_topN] + order = np.argsort(-scores[inds].squeeze()) + order = inds[order] + scores = scores[order, :] + bbox_deltas = bbox_deltas[order, :] + all_anchors = all_anchors[order, :] + proposals = box_coder(all_anchors, bbox_deltas, variances) + # clip proposals to image (may result in proposals with zero area + # that will be removed in the next step) + proposals = clip_tiled_boxes(proposals, im_info[:2]) + # remove predicted boxes with height or width < min_size + keep = filter_boxes(proposals, min_size, im_info) + proposals = proposals[keep, :] + scores = scores[keep, :] + + # apply loose nms (e.g. threshold = 0.7) + # take post_nms_topN (e.g. 1000) + # return the top proposals + if nms_thresh > 0: + keep = nms(boxes=proposals, + scores=scores, + nms_threshold=nms_thresh, + eta=eta) + if post_nms_topN > 0 and post_nms_topN < len(keep): + keep = keep[:post_nms_topN] + proposals = proposals[keep, :] + scores = scores[keep, :] + + return proposals, scores + + +def box_coder(all_anchors, bbox_deltas, variances): + """ + Decode proposals by anchors and bbox_deltas from RPN + """ + #proposals: xmin, ymin, xmax, ymax + proposals = np.zeros_like(bbox_deltas, dtype=np.float32) + + #anchor_loc: width, height, center_x, center_y + anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32) + + anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + anchor_loc[:, 2] = (all_anchors[:, 2] + all_anchors[:, 0]) / 2 + anchor_loc[:, 3] = (all_anchors[:, 3] + all_anchors[:, 1]) / 2 + + #predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height + pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32) + if 
variances is not None: + for i in range(bbox_deltas.shape[0]): + pred_bbox[i, 0] = variances[i, 0] * bbox_deltas[i, 0] * anchor_loc[ + i, 0] + anchor_loc[i, 2] + pred_bbox[i, 1] = variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[ + i, 1] + anchor_loc[i, 3] + pred_bbox[i, 2] = math.exp(variances[i, 2] * + bbox_deltas[i, 2]) * anchor_loc[i, 0] + pred_bbox[i, 3] = math.exp(variances[i, 3] * + bbox_deltas[i, 3]) * anchor_loc[i, 1] + else: + for i in range(bbox_deltas.shape[0]): + pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[ + i, 2] + pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[ + i, 3] + pred_bbox[i, 2] = math.exp(bbox_deltas[i, 2]) * anchor_loc[i, 0] + pred_bbox[i, 3] = math.exp(bbox_deltas[i, 3]) * anchor_loc[i, 1] + + proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2 + proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2 + proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 + proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 + + return proposals + + +def clip_tiled_boxes(boxes, im_shape): + """Clip boxes to image boundaries. im_shape is [height, width] and boxes + has shape (N, 4 * num_tiled_boxes).""" + assert boxes.shape[1] % 4 == 0, \ + 'boxes.shape[1] is {:d}, but must be divisible by 4.'.format( + boxes.shape[1] + ) + # x1 >= 0 + boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) + # y1 >= 0 + boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) + # x2 < im_shape[1] + boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) + # y2 < im_shape[0] + boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) + return boxes + + +def filter_boxes(boxes, min_size, im_info): + """Only keep boxes with both sides >= min_size and center within the image. 
+ """ + # Scale min_size to match image scale + min_size *= im_info[2] + ws = boxes[:, 2] - boxes[:, 0] + 1 + hs = boxes[:, 3] - boxes[:, 1] + 1 + x_ctr = boxes[:, 0] + ws / 2. + y_ctr = boxes[:, 1] + hs / 2. + keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_info[1]) & + (y_ctr < im_info[0]))[0] + return keep + + +def iou(box_a, box_b): + """ + Apply intersection-over-union overlap between box_a and box_b + """ + xmin_a = min(box_a[0], box_a[2]) + ymin_a = min(box_a[1], box_a[3]) + xmax_a = max(box_a[0], box_a[2]) + ymax_a = max(box_a[1], box_a[3]) + + xmin_b = min(box_b[0], box_b[2]) + ymin_b = min(box_b[1], box_b[3]) + xmax_b = max(box_b[0], box_b[2]) + ymax_b = max(box_b[1], box_b[3]) + + area_a = (ymax_a - ymin_a + 1) * (xmax_a - xmin_a + 1) + area_b = (ymax_b - ymin_b + 1) * (xmax_b - xmin_b + 1) + if area_a <= 0 and area_b <= 0: + return 0.0 + + xa = max(xmin_a, xmin_b) + ya = max(ymin_a, ymin_b) + xb = min(xmax_a, xmax_b) + yb = min(ymax_a, ymax_b) + + inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0) + + iou_ratio = inter_area / (area_a + area_b - inter_area) + + return iou_ratio + + +def nms(boxes, scores, nms_threshold, eta=1.0): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + nms_threshold: (float) The overlap thresh for suppressing unnecessary + boxes. + eta: (float) The parameter for adaptive NMS. + Return: + The indices of the kept boxes with respect to num_priors. 
+ """ + all_scores = copy.deepcopy(scores) + all_scores = all_scores.flatten() + + sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort') + sorted_scores = all_scores[sorted_indices] + selected_indices = [] + adaptive_threshold = nms_threshold + for i in range(sorted_scores.shape[0]): + idx = sorted_indices[i] + keep = True + for k in range(len(selected_indices)): + if keep: + kept_idx = selected_indices[k] + overlap = iou(boxes[idx], boxes[kept_idx]) + keep = True if overlap <= adaptive_threshold else False + else: + break + if keep: + selected_indices.append(idx) + if keep and eta < 1 and adaptive_threshold > 0.5: + adaptive_threshold *= eta + return selected_indices + + +class TestGenerateProposalsOp(OpTest): + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = { + 'Scores': self.scores, + 'BboxDeltas': self.bbox_deltas, + 'ImInfo': self.im_info.astype(np.float32), + 'Anchors': self.anchors, + 'Variances': self.variances + } + + self.attrs = { + 'pre_nms_topN': self.pre_nms_topN, + 'post_nms_topN': self.post_nms_topN, + 'nms_thresh': self.nms_thresh, + 'min_size': self.min_size, + 'eta': self.eta + } + + print("lod = ", self.lod) + self.outputs = { + 'RpnRois': (self.rpn_rois[0], [self.lod]), + 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]) + } + + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "generate_proposals" + self.set_data() + + def init_test_params(self): + self.pre_nms_topN = 12000 # train 12000, test 2000 + self.post_nms_topN = 5000 # train 6000, test 1000 + self.nms_thresh = 0.7 + self.min_size = 3.0 + self.eta = 0.8 + + def init_test_input(self): + batch_size = 1 + input_channels = 20 + layer_h = 16 + layer_w = 16 + input_feat = np.random.random( + (batch_size, input_channels, layer_h, layer_w)).astype('float32') + self.anchors, self.variances = anchor_generator_in_python( + input_feat=input_feat, + anchor_sizes=[16., 32.], + 
aspect_ratios=[0.5, 1.0], + variances=[1.0, 1.0, 1.0, 1.0], + stride=[16.0, 16.0], + offset=0.5) + self.im_info = np.array([[64., 64., 8.]]) #im_height, im_width, scale + num_anchors = self.anchors.shape[2] + self.scores = np.random.random( + (batch_size, num_anchors, layer_h, layer_w)).astype('float32') + self.bbox_deltas = np.random.random( + (batch_size, num_anchors * 4, layer_h, layer_w)).astype('float32') + + def init_test_output(self): + self.rpn_rois, self.rpn_roi_probs, self.lod = generate_proposals_in_python( + self.scores, self.bbox_deltas, self.im_info, self.anchors, + self.variances, self.pre_nms_topN, self.post_nms_topN, + self.nms_thresh, self.min_size, self.eta) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py index 001fd7efb159e60bdf3cd0698d85dea90ad71616..9f6f03f9cfe3c505a7b1227e2b20db3c3c84c745 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_op.py @@ -19,22 +19,19 @@ import numpy as np import math import functools from op_test import OpTest -from test_lstm_op import identity, sigmoid, tanh, relu - - -class TestGRUOp(OpTest): - lod = [[2, 4, 3]] - batch_size = sum(lod[0]) - frame_size = 5 - activate = { - 'identity': identity, - 'sigmoid': sigmoid, - 'tanh': tanh, - 'relu': relu - } - - @staticmethod - def seq_to_batch(lod, is_reverse): +from test_lstm_op import ACTIVATION + + +def gru( + input, # T x 3D + lod, # 1 x N + h0, # N x D + weight, # D x 3D + bias, # 1 x 3D + is_reverse, + act_state, + act_gate): + def _seq_to_batch(lod, is_reverse): idx_in_seq_list = [] seq_lens = lod[0] seq_starts = [0] @@ -56,121 +53,125 @@ class TestGRUOp(OpTest): idx_in_seq_list.append(idx_in_seq) return idx_in_seq_list, sorted_seqs - def gru_step(self, x, h_p, w, b): - batch_size = x.shape[0] - frame_size = w.shape[0] - g = x + np.tile(b, (batch_size, 1)) - w_u_r = 
w.flatten()[:frame_size * frame_size * 2].reshape( - (frame_size, frame_size * 2)) - u_r = self.activate[self.attrs['gate_activation']](np.dot( - h_p, w_u_r) + g[:, :frame_size * 2]) - u = u_r[:, :frame_size] - r = u_r[:, frame_size:frame_size * 2] + def _step(x, h_p, w, b, act_state, act_gate): + T = x.shape[0] + D = w.shape[0] + g = x + np.tile(b, (T, 1)) + w_u_r = w.flatten()[:D * D * 2].reshape((D, D * 2)) + u_r = act_gate(np.dot(h_p, w_u_r) + g[:, :D * 2]) + u = u_r[:, :D] + r = u_r[:, D:D * 2] r_h_p = r * h_p - w_c = w.flatten()[frame_size * frame_size * 2:].reshape( - (frame_size, frame_size)) - c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) + - g[:, frame_size * 2:]) + w_c = w.flatten()[D * D * 2:].reshape((D, D)) + c = act_state(np.dot(r_h_p, w_c) + g[:, D * 2:]) g = np.hstack((u_r, c)) h = u * c + (1 - u) * h_p return g, r_h_p, h - def gru(self): - input, lod = self.inputs['Input'] - w = self.inputs['Weight'] - b = self.inputs['Bias'] if 'Bias' in self.inputs else np.zeros( - (1, self.frame_size * 3)) - batch_gate = self.outputs['BatchGate'] - batch_reset_hidden_prev = self.outputs['BatchResetHiddenPrev'] - batch_hidden = self.outputs['BatchHidden'] - hidden = self.outputs['Hidden'] - idx_in_seq_list = self.idx_in_seq_list - h_p = self.inputs['H0'][ - self.sorted_seqs] if 'H0' in self.inputs else np.zeros( - (len(idx_in_seq_list[0]), self.frame_size)) - num_batch = len(idx_in_seq_list) - end_idx = 0 - for batch_idx in range(num_batch): - x = input[idx_in_seq_list[batch_idx]] - g, r_h_p, h = self.gru_step(x, h_p, w, b) - if batch_idx < (num_batch - 1): - h_p = h[:len(idx_in_seq_list[batch_idx + 1])] - start_idx = end_idx - end_idx = start_idx + len(idx_in_seq_list[batch_idx]) - batch_gate[start_idx:end_idx] = g - batch_reset_hidden_prev[start_idx:end_idx] = r_h_p - batch_hidden[start_idx:end_idx] = h - hidden[idx_in_seq_list[batch_idx]] = h - return batch_gate, batch_reset_hidden_prev, hidden - - def set_data(self): - lod = self.lod - 
self.idx_in_seq_list, self.sorted_seqs = self.seq_to_batch( - lod, self.is_reverse) - batch_size = self.batch_size - frame_size = self.frame_size - input = np.random.rand(batch_size, frame_size * 3).astype('float64') - h0 = np.random.rand(len(self.idx_in_seq_list[0]), - frame_size).astype('float64') - weight = np.random.rand(frame_size, frame_size * 3).astype('float64') - bias = np.random.rand(1, frame_size * 3).astype('float64') - - self.inputs = { - 'Input': (input, lod), - 'H0': h0, - 'Weight': weight, - 'Bias': bias - } + T = sum(lod[0]) + N = len(lod[0]) + D = weight.shape[0] + batch_gate = np.zeros((T, 3 * D), dtype='float64') + batch_reset_hidden_prev = np.zeros((T, D), dtype='float64') + batch_hidden = np.zeros((T, D), dtype='float64') + hidden = np.zeros((T, D), dtype='float64') + + idx_in_seq_list, sorted_seqs = _seq_to_batch(lod, is_reverse) + h_p = h0[sorted_seqs] + max_seq_len = len(idx_in_seq_list) + assert len(idx_in_seq_list[0]) == N + end_idx = 0 + for batch_idx in range(max_seq_len): + x = input[idx_in_seq_list[batch_idx]] + g, r_h_p, h = _step(x, h_p, weight, bias, act_state, act_gate) + if batch_idx < (max_seq_len - 1): + h_p = h[:len(idx_in_seq_list[batch_idx + 1])] + start_idx = end_idx + end_idx = start_idx + len(idx_in_seq_list[batch_idx]) + batch_gate[start_idx:end_idx] = g + batch_reset_hidden_prev[start_idx:end_idx] = r_h_p + batch_hidden[start_idx:end_idx] = h + hidden[idx_in_seq_list[batch_idx]] = h + return batch_gate, batch_reset_hidden_prev, batch_hidden, hidden - self.outputs = { - 'BatchGate': np.zeros( - (batch_size, frame_size * 3), dtype='float64'), - 'BatchResetHiddenPrev': np.zeros( - (batch_size, frame_size), dtype='float64'), - 'BatchHidden': np.zeros( - (batch_size, frame_size), dtype='float64'), - 'Hidden': np.zeros( - (batch_size, frame_size), dtype='float64') - } +class TestGRUOp(OpTest): def set_confs(self): - self.is_reverse = False - self.attrs = { - 'activation': 'tanh', - 'gate_activation': 'sigmoid', - 
'is_reverse': self.is_reverse - } + pass def setUp(self): self.op_type = "gru" + self.lod = [[2, 4, 3]] + self.D = 5 + self.is_reverse = False + self.with_h0 = True + self.with_bias = True + self.act_state = 'tanh' + self.act_gate = 'sigmoid' self.set_confs() - self.set_data() - self.gru() + + T = sum(self.lod[0]) + N = len(self.lod[0]) + + input = np.random.rand(T, 3 * self.D).astype('float64') + weight = np.random.rand(self.D, 3 * self.D).astype('float64') + bias = np.random.rand( + 1, 3 * self.D).astype('float64') if self.with_bias else np.zeros( + (1, 3 * self.D), dtype='float64') + h0 = np.random.rand( + N, self.D).astype('float64') if self.with_h0 else np.zeros( + (N, self.D), dtype='float64') + + batch_gate, batch_reset_hidden_prev, batch_hidden, hidden = gru( + input, self.lod, h0, weight, bias, self.is_reverse, + ACTIVATION[self.act_state], ACTIVATION[self.act_gate]) + self.inputs = {'Input': (input, self.lod), 'Weight': weight} + + if self.with_bias: + self.inputs['Bias'] = bias + + if self.with_h0: + self.inputs['H0'] = h0 + + self.outputs = { + 'Hidden': (hidden, self.lod), + 'BatchGate': batch_gate, + 'BatchResetHiddenPrev': batch_reset_hidden_prev, + 'BatchHidden': batch_hidden, + } + + self.attrs = { + 'activation': self.act_state, + 'gate_activation': self.act_gate, + 'is_reverse': self.is_reverse + } def test_check_output(self): - self.check_output() + self.check_output(atol=1e-8) def test_check_grad(self): self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden']) class TestGRUOpNoInitial(TestGRUOp): - def set_data(self): - super(TestGRUOpNoInitial, self).set_data() - self.inputs.pop('H0') + def set_confs(self): + self.with_h0 = False def test_check_grad(self): self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden']) +class TestGRUOpNoBias(TestGRUOp): + def set_confs(self): + self.with_bias = False + + def test_check_grad(self): + self.check_grad(['Input', 'H0', 'Weight'], ['Hidden']) + + class TestGRUOpReverse(TestGRUOp): def set_confs(self): 
self.is_reverse = True - self.attrs = { - 'activation': 'tanh', - 'gate_activation': 'sigmoid', - 'is_reverse': self.is_reverse - } if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 8e707c8b00b7bf3c5ea77c18c18135e89ffab9c7..ecdf32524afb1357b192ce14674b7073972dee9f 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -240,6 +240,22 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.softmax(hid)) print(str(program)) + def test_sequence_unsqueeze(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[8, 2], dtype='float32') + out = layers.unsqueeze(input=x, axes=[1]) + self.assertIsNotNone(out) + print(str(program)) + + def test_squeeze(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[1, 1, 4], dtype='float32') + out = layers.squeeze(input=x, axes=[2]) + self.assertIsNotNone(out) + print(str(program)) + def test_lrn(self): program = Program() with program_guard(program): @@ -505,6 +521,20 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(out) print(str(program)) + def test_pad2d(self): + program = Program() + with program_guard(program): + input = layers.data( + name="input", shape=[3, 100, 100], dtype="float32") + out = layers.pad2d( + input, + paddings=[1, 2, 3, 4], + mode='reflect', + data_format='NCHW', + name="shape") + self.assertIsNotNone(out) + print(str(program)) + def test_prelu(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_name_scope.py b/python/paddle/fluid/tests/unittests/test_name_scope.py new file mode 100644 index 0000000000000000000000000000000000000000..08c802e20d2bb364ef7f116ee0042a2ad21a9b2b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_name_scope.py @@ -0,0 +1,45 @@ +# Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + + +class TestNameScope(unittest.TestCase): + def test_name_scope(self): + with fluid.name_scope("s1"): + a = fluid.layers.data(name='data', shape=[1], dtype='int32') + b = a + 1 + with fluid.name_scope("s2"): + c = b * 1 + with fluid.name_scope("s3"): + d = c / 1 + with fluid.name_scope("s1"): + f = fluid.layers.pow(d, 2.0) + with fluid.name_scope("s4"): + g = f - 1 + + for op in fluid.default_main_program().block(0).ops: + if op.type == 'elementwise_add': + self.assertEqual(op.desc.attr("op_namescope"), '/s1/') + elif op.type == 'elementwise_mul': + self.assertEqual(op.desc.attr("op_namescope"), '/s1/s2/') + elif op.type == 'elementwise_div': + self.assertEqual(op.desc.attr("op_namescope"), '/s1/s3/') + elif op.type == 'elementwise_sub': + self.assertEqual(op.desc.attr("op_namescope"), '/s4/') + elif op.type == 'pow': + self.assertEqual(op.desc.attr("op_namescope"), '/s1_1/') diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 6d01955993324498de42462b7f85ef6f8e444505..cac132e6e08a8a9ec595236b1a990c0900ea4f0f 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -67,7 +67,10 @@ class TestOperator(unittest.TestCase): 
self.assertEqual(mul_op.output("Out"), ["mul.out"]) self.assertEqual( set(mul_op.attr_names), - set(["x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var"])) + set([ + "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", + "op_namescope" + ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) self.assertEqual(mul_op.attr("x_num_col_dims"), 1) diff --git a/python/paddle/fluid/tests/unittests/test_pad2d_op.py b/python/paddle/fluid/tests/unittests/test_pad2d_op.py new file mode 100644 index 0000000000000000000000000000000000000000..728b8c181a4410d7df7f304bcc8d2816e91ea6d8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pad2d_op.py @@ -0,0 +1,102 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from op_test import OpTest + + +class TestPad2dOp(OpTest): + def setUp(self): + self.pad_value = 0.0 + self.initTestCase() + self.op_type = "pad2d" + self.inputs = {'X': np.random.random(self.shape).astype("float32"), } + self.attrs = {} + self.attrs['paddings'] = np.array(self.paddings).flatten() + self.attrs['pad_value'] = self.pad_value + self.attrs['mode'] = self.mode + self.attrs['data_format'] = self.data_format + if self.data_format == "NCHW": + paddings = [(0, 0), (0, 0), (self.paddings[0], self.paddings[1]), + (self.paddings[2], self.paddings[3])] + else: + paddings = [(0, 0), (self.paddings[0], self.paddings[1]), + (self.paddings[2], self.paddings[3]), (0, 0)] + if self.mode == "constant": + out = np.pad(self.inputs['X'], + paddings, + mode=self.mode, + constant_values=self.pad_value) + else: + out = np.pad(self.inputs['X'], paddings, mode=self.mode) + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out', max_relative_error=0.006) + + def initTestCase(self): + self.shape = (2, 3, 4, 4) + self.paddings = [0, 1, 2, 3] + self.mode = "constant" + self.data_format = "NCHW" + self.pad_value = 0.0 + + +class TestCase1(TestPad2dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 4) + self.paddings = [0, 1, 2, 3] + self.mode = "reflect" + self.data_format = "NCHW" + + +class TestCase2(TestPad2dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 4) + self.paddings = [0, 1, 2, 3] + self.mode = "edge" + self.data_format = "NCHW" + + +class TestCase3(TestPad2dOp): + def initTestCase(self): + self.shape = (2, 4, 4, 2) + self.paddings = [0, 1, 2, 3] + self.mode = "reflect" + self.data_format = "NHWC" + + +class TestCase4(TestPad2dOp): + def initTestCase(self): + self.shape = (2, 4, 4, 2) + self.paddings = [0, 1, 2, 3] + self.mode = "edge" + self.data_format = "NHWC" + + +class TestCase5(TestPad2dOp): + def initTestCase(self): 
+ self.shape = (2, 4, 4, 2) + self.paddings = [0, 1, 2, 3] + self.mode = "constant" + self.pad_value = 1.2 + self.data_format = "NHWC" + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pad_constant_like.py b/python/paddle/fluid/tests/unittests/test_pad_constant_like.py new file mode 100644 index 0000000000000000000000000000000000000000..6b733fd8fa023f07013909502dbbd5371297216e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pad_constant_like.py @@ -0,0 +1,69 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + + +class TestPadOp(OpTest): + def setUp(self): + self.initTestCase() + self.op_type = "pad_constant_like" + self.inputs = { + 'X': np.random.random(self.x_shape).astype("float32"), + 'Y': np.random.random(self.y_shape).astype("float32") + } + self.attrs = {} + self.attrs['pad_value'] = self.pad_value + self.outputs = { + 'Out': np.pad(self.inputs['Y'], + self.paddings, + mode='constant', + constant_values=self.pad_value) + } + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['Y'], 'Out', max_relative_error=0.006) + + def initTestCase(self): + self.x_shape = (16, 16) + self.y_shape = (3, 16) + self.pad_value = 0.1 + self.paddings = [(0, 13), (0, 0)] + + +class TestCase1(TestPadOp): + def initTestCase(self): + self.x_shape = (4, 3, 4, 4) + self.y_shape = (2, 3, 4, 4) + self.paddings = [(0, 2), (0, 0), (0, 0), (0, 0)] + self.pad_value = 0.5 + + +class TestCase2(TestPadOp): + def initTestCase(self): + self.x_shape = (4, 3, 4, 4) + self.y_shape = (2, 3, 2, 4) + self.paddings = [(0, 2), (0, 0), (0, 2), (0, 0)] + self.pad_value = 0.5 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 5b96d641d667eee1aa0c7c6019bf92494f777259..af3745987aa3eae96968bdc6b5c9cd951e9ca6fa 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -67,18 +67,20 @@ def fc_with_batchnorm(use_feed): hidden = img for _ in range(1): - hidden = fluid.layers.fc( - hidden, - size=200, - act='tanh', - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0))) - - hidden = fluid.layers.batch_norm(input=hidden) - - prediction = fluid.layers.fc(hidden, size=10, 
act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + with fluid.name_scope("hidden"): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + hidden = fluid.layers.batch_norm(input=hidden) + with fluid.name_scope("fc_layer"): + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + with fluid.name_scope("loss"): + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) return loss diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py index 979be5af3bdc24b1a2fc115198eeab53469a91c0..1e3e40d54a78045c8d8fdd9a3a3715107d1e7a80 100644 --- a/python/paddle/fluid/tests/unittests/test_prelu_op.py +++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py @@ -51,30 +51,28 @@ class PReluTest(OpTest): def test_check_output(self): self.check_output() - def test_check_grad(self): - self.check_grad(['X', 'Alpha'], 'Out') - - def test_check_grad_ignore_x(self): + def test_check_grad_1_ignore_x(self): self.check_grad(['Alpha'], 'Out', no_grad_set=set('X')) - def test_check_grad_ignore_alpha(self): - self.check_grad(['X'], 'Out', no_grad_set=set('Alpha')) - - -class TestCase1(PReluTest): - def initTestCase(self): - self.attrs = {'mode': "all"} + def test_check_grad_2(self): + self.check_grad(['X', 'Alpha'], 'Out') + def test_check_grad_3_ignore_alpha(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Alpha')) -class TestCase2(PReluTest): - def initTestCase(self): - self.attrs = {'mode': "channel"} +# TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues +# class TestCase1(PReluTest): +# def initTestCase(self): +# self.attrs = {'mode': "all"} -class TestCase3(PReluTest): - def initTestCase(self): - self.attrs = {'mode': "element"} +# class TestCase2(PReluTest): +# def initTestCase(self): 
+# self.attrs = {'mode': "channel"} +# class TestCase3(PReluTest): +# def initTestCase(self): +# self.attrs = {'mode': "element"} if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py index ac682d6181cfcc5a064a51a736b03d493c37b780..8097b5f734343ca97c131474338ed1cd60eefc85 100644 --- a/python/paddle/fluid/tests/unittests/test_print_op.py +++ b/python/paddle/fluid/tests/unittests/test_print_op.py @@ -35,9 +35,8 @@ class TestPrintOpCPU(unittest.TestCase): def build_network(self, only_forward, **kargs): x = layers.data('x', shape=[3], dtype='float32', lod_level=1) x.stop_gradient = False - printed = layers.Print(input=x, **kargs) - if only_forward: return printed - loss = layers.mean(printed) + layers.Print(input=x, **kargs) + loss = layers.mean(x) append_backward(loss=loss) return loss diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py index 0a8a43253d79ba21c7333dd19af05d8adf410289..032af6ed5ce9e1007d6775306ef4c0aefb9dcc41 100644 --- a/python/paddle/fluid/tests/unittests/test_scale_op.py +++ b/python/paddle/fluid/tests/unittests/test_scale_op.py @@ -17,6 +17,8 @@ from __future__ import print_function import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator class TestScaleOp(OpTest): @@ -33,5 +35,57 @@ class TestScaleOp(OpTest): self.check_grad(['X'], 'Out') +class TestScaleOpSelectedRows(unittest.TestCase): + def check_with_place(self, place, in_name, out_name): + scope = core.Scope() + + # create and initialize Grad Variable + in_height = 10 + in_rows = [0, 4, 7] + in_row_numel = 12 + scale = 2.0 + + in_selected_rows = scope.var(in_name).get_selected_rows() + in_selected_rows.set_height(in_height) + in_selected_rows.set_rows(in_rows) + in_array = np.random.random( + (len(in_rows), in_row_numel)).astype("float32") + 
+ in_tensor = in_selected_rows.get_tensor() + in_tensor.set(in_array, place) + + # create and initialize Param Variable + out_selected_rows = scope.var(out_name).get_selected_rows() + out_tensor = out_selected_rows.get_tensor() + out_tensor._set_dims(in_tensor._get_dims()) + + # create and run sgd operator + scale_op = Operator("scale", X=in_name, Out=out_name, scale=scale) + scale_op.run(scope, place) + + # get and compare result + out_height = out_selected_rows.height() + out_rows = out_selected_rows.rows() + result_array = np.array(out_tensor) + + assert (in_array * scale == result_array).all() + assert in_height == out_height + assert in_rows == out_rows + + def test_scale_selected_rows(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place, 'in', 'out') + + def test_scale_selected_rows_inplace(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place, 'in', 'in') + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_mask.py b/python/paddle/fluid/tests/unittests/test_sequence_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..02c5b204082ece0d98d014c952293c5be39520ca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sequence_mask.py @@ -0,0 +1,94 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from op_test import OpTest +import paddle.fluid as fluid +from paddle.fluid.framework import convert_np_dtype_to_dtype_ +import paddle.fluid.core as core +import numpy as np +import copy +import unittest + + +class SequenceMaskTestBase(OpTest): + def initDefaultParameters(self): + self.op_type = 'sequence_mask' + self.maxlen = 10 + self.mask_dtype = 'int64' + self.x = [[0, 3, 4], [5, 7, 9]] + + def initParameters(self): + pass + + def setUp(self): + self.initDefaultParameters() + self.initParameters() + if not isinstance(self.x, np.ndarray): + self.x = np.array(self.x) + + self.inputs = {'X': self.x} + self.outputs = {'Y': self.calc_ground_truth_mask()} + self.attrs = { + 'maxlen': self.maxlen, + 'out_dtype': convert_np_dtype_to_dtype_(self.mask_dtype) + } + + def calc_ground_truth_mask(self): + maxlen = np.max(self.x) if self.maxlen < 0 else self.maxlen + shape = self.x.shape + (maxlen, ) + index_broadcast = np.broadcast_to( + np.reshape( + range(maxlen), newshape=[1] * self.x.ndim + [-1]), + shape=shape) + x_broadcast = np.broadcast_to( + np.reshape( + self.x, newshape=self.x.shape + (-1, )), shape=shape) + return (index_broadcast < x_broadcast).astype(self.mask_dtype) + + def test_check_output(self): + self.check_output() + + +class SequenceMaskTest1(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'bool' + + +class SequenceMaskTest2(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'uint8' + + +class SequenceMaskTest3(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'int32' + + +class SequenceMaskTest4(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'float32' + + +class SequenceMaskTest5(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'float64' + + +class SequenceMaskTest6(SequenceMaskTestBase): + def initParameters(self): + self.maxlen = -1 + + 
+if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py b/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py new file mode 100644 index 0000000000000000000000000000000000000000..471515c817541976a06eb024fa3d4f77b78f920d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sequence_pad_op.py @@ -0,0 +1,131 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from op_test import OpTest + + +class TestSequencePadOp(OpTest): + def set_attr(self): + self.x_shape = [12, 4] + self.x_len_lod = [[2, 3, 4, 3]] + self.pad_value = [1.0] + self.padded_length = -1 + self.dtype = 'float32' + + def set_data(self): + x_data = np.random.uniform(0.1, 0.5, self.x_shape).astype(self.dtype) + pad_value_data = np.array(self.pad_value).astype(self.dtype) + self.inputs = { + 'X': (x_data, self.x_len_lod), + 'PadValue': pad_value_data + } + self.attrs = {'padded_length': self.padded_length} + + def compute(self): + # get padded length + padded_length = self.padded_length + x_len_lod_0 = self.x_len_lod[0] + if padded_length == -1: + max_seq_len = 0 + for l in x_len_lod_0: + max_seq_len = max(max_seq_len, l) + padded_length = max_seq_len + + # do padding + x_data = self.inputs['X'][0] + pad_value_data = self.inputs['PadValue'] + if pad_value_data.shape == (1, ): + pad_value_data = np.broadcast_to( + pad_value_data, shape=x_data.shape[1:]) + padded_sequences = [] + start_idx = 0 + for l in x_len_lod_0: + end_idx = start_idx + l + seq = x_data[start_idx:end_idx] + to_pad_len = padded_length - l + for _ in range(to_pad_len): + seq = np.append(seq, pad_value_data[np.newaxis, :], axis=0) + padded_sequences.append(seq) + start_idx = end_idx + + out_data = np.array(padded_sequences) + self.outputs = {'Out': out_data} + + def setUp(self): + self.op_type = 'sequence_pad' + self.set_attr() + self.set_data() + self.compute() + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestSequencePadOp2(TestSequencePadOp): + def set_attr(self): + self.x_shape = [12, 4] + self.x_len_lod = [[2, 3, 4, 3]] + self.pad_value = [1.0, 2.0, 3.0, 4.0] + self.padded_length = -1 + self.dtype = 'float32' + + +class TestSequencePadOp3(TestSequencePadOp): + def set_attr(self): + self.x_shape = [12, 4] + self.x_len_lod = [[2, 3, 4, 3]] + self.pad_value = [1.0] + 
self.padded_length = 7 + self.dtype = 'float32' + + +class TestSequencePadOp4(TestSequencePadOp): + def set_attr(self): + self.x_shape = [12, 4] + self.x_len_lod = [[2, 3, 4, 3]] + self.pad_value = [1.0, 2.0, 3.0, 4.0] + self.padded_length = 7 + self.dtype = 'float32' + + +class TestSequencePadOp5(TestSequencePadOp): + def set_attr(self): + self.x_shape = [12, 2, 2] + self.x_len_lod = [[2, 3, 4, 3]] + self.pad_value = [1.0] + self.padded_length = -1 + self.dtype = 'float32' + + +class TestSequencePadOp6(TestSequencePadOp): + def set_attr(self): + self.x_shape = [12, 2, 2] + self.x_len_lod = [[2, 3, 4, 3]] + self.pad_value = [[1.0, 2.0], [3.0, 4.0]] + self.padded_length = -1 + self.dtype = 'float32' + + +class TestSequencePadOp7(TestSequencePadOp): + def set_attr(self): + self.x_shape = [12, 2, 2] + self.x_len_lod = [[2, 3, 4, 3]] + self.pad_value = [1.0] + self.padded_length = 7 + self.dtype = 'float32' diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py index e9d0f8a0193c77da33a8cf128dbf8a1c5087782b..1822957c23d0bb1e4821373515d4faef2b76950e 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor.py @@ -59,6 +59,27 @@ class TestTensor(unittest.TestCase): self.assertAlmostEqual(1.0, tensor_array_2[3, 9]) self.assertAlmostEqual(2.0, tensor_array_2[19, 11]) + def test_int8_tensor(self): + scope = core.Scope() + var = scope.var("int8_tensor") + cpu_tensor = var.get_tensor() + tensor_array = numpy.random.randint( + -127, high=128, size=[100, 200], dtype=numpy.int8) + place = core.CPUPlace() + cpu_tensor.set(tensor_array, place) + cpu_tensor_array_2 = numpy.array(cpu_tensor) + self.assertAlmostEqual(cpu_tensor_array_2.all(), tensor_array.all()) + + if core.is_compiled_with_cuda(): + cuda_tensor = var.get_tensor() + tensor_array = numpy.random.randint( + -127, high=128, size=[100, 200], dtype=numpy.int8) + place = core.CUDAPlace(0) + 
cuda_tensor.set(tensor_array, place) + cuda_tensor_array_2 = numpy.array(cuda_tensor) + self.assertAlmostEqual(cuda_tensor_array_2.all(), + tensor_array.all()) + def test_int_lod_tensor(self): place = core.CPUPlace() scope = core.Scope() diff --git a/python/paddle/fluid/tests/unittests/test_unstack_op.py b/python/paddle/fluid/tests/unittests/test_unstack_op.py new file mode 100644 index 0000000000000000000000000000000000000000..7cbac8928ec40dc3e1c0e91e7779ec9ec978d884 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_unstack_op.py @@ -0,0 +1,81 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from op_test import OpTest +import numpy as np +import unittest + + +class TestUnStackOpBase(OpTest): + def initDefaultParameters(self): + self.input_dim = (5, 6, 7) + self.axis = 0 + self.dtype = 'float32' + + def initParameters(self): + pass + + def get_y_names(self): + y_names = [] + for i in range(self.input_dim[self.axis]): + y_names.append('y{}'.format(i)) + return y_names + + def setUp(self): + self.initDefaultParameters() + self.initParameters() + self.op_type = 'unstack' + self.x = np.random.random(size=self.input_dim).astype(self.dtype) + + outs = np.split(self.x, self.input_dim[self.axis], self.axis) + new_shape = list(self.input_dim) + del new_shape[self.axis] + y_names = self.get_y_names() + tmp = [] + for i in range(self.input_dim[self.axis]): + tmp.append((y_names[i], np.reshape(outs[i], new_shape))) + + self.inputs = {'X': self.x} + self.outputs = {'Y': tmp} + self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad('X', self.get_y_names()) + + +class TestStackOp3(TestUnStackOpBase): + def initParameters(self): + self.axis = -1 + + +class TestStackOp4(TestUnStackOpBase): + def initParameters(self): + self.axis = -3 + + +class TestStackOp5(TestUnStackOpBase): + def initParameters(self): + self.axis = 1 + + +class TestStackOp6(TestUnStackOpBase): + def initParameters(self): + self.axis = 2 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index b0830e130dd9a9037f8dd900a256eea3d05f64b8..4f3c26ca7bdf4d807952b413c8b0dc8b211c06f6 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -31,7 +31,8 @@ class TestVariable(unittest.TestCase): self.assertEqual(DT.INT16, convert("int16")) self.assertEqual(DT.INT64, convert("int64")) self.assertEqual(DT.BOOL, 
convert("bool")) - self.assertRaises(ValueError, lambda: convert("int8")) + self.assertEqual(DT.INT8, convert("int8")) + self.assertEqual(DT.UINT8, convert("uint8")) def test_var(self): b = default_main_program().current_block() diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py index 420ae6dfd4b75b507dd01bb947fa707bca5cdb08..f0fafaa84a73d641ff6ceb74def6addaea759516 100644 --- a/python/paddle/fluid/transpiler/details/program_utils.py +++ b/python/paddle/fluid/transpiler/details/program_utils.py @@ -62,9 +62,12 @@ def variable_to_code(var): Returns: string: The formatted string. """ - - var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})".\ - format(i="{", e="}", name=var.name, type=var.type, shape=var.shape, dtype=var.dtype) + if var.type == core.VarDesc.VarType.SELECTED_ROWS or var.type == core.VarDesc.VarType.LOD_TENSOR: + var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})".\ + format(i="{", e="}", name=var.name, type=var.type, shape=var.shape, dtype=var.dtype) + else: + var_str = "{name} : fluid.{type})".\ + format(i="{", e="}", name=var.name, type=var.type) if type(var) == paddle.fluid.framework.Parameter: if var.trainable: @@ -142,6 +145,28 @@ def op_to_code(op): return op_str +def block_to_code(block, block_idx): + indent = 0 + + print("{0}{1} // block {2}".format( + get_indent_space(indent), '{', block_idx)) + + indent += 1 + # sort all vars + all_vars = sorted(block.vars.iteritems(), key=lambda x: x[0]) + for var in all_vars: + print("{}{}".format(get_indent_space(indent), variable_to_code(var[1]))) + + if len(all_vars) > 0: + print("") + + for op in block.ops: + print("{}{}".format(get_indent_space(indent), op_to_code(op))) + indent -= 1 + + print("{0}{1}".format(get_indent_space(indent), '}')) + + def program_to_code(prog): """ Print readable codes of fluid program. 
@@ -152,23 +177,7 @@ def program_to_code(prog): An example result like below: https://github.com/PaddlePaddle/Paddle/pull/12673 """ - indent = 0 block_idx = 0 for block in prog.blocks: - print("{0}{1} // block {2}".format( - get_indent_space(indent), '{', block_idx)) - indent += 1 - # sort all vars - all_vars = sorted(block.vars.iteritems(), key=lambda x: x[0]) - for var in all_vars: - print("{}{}".format( - get_indent_space(indent), variable_to_code(var[1]))) - - if len(all_vars) > 0: - print("") - - for op in block.ops: - print("{}{}".format(get_indent_space(indent), op_to_code(op))) - indent -= 1 - print("{0}{1}".format(get_indent_space(indent), '}')) + block_to_code(block, block_idx) block_idx += 1 diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 80d9758b3dd0d6c57adc888c492cac26da3939d0..bddeb6617c1743de946b3c5b4b0a465d85f35ce3 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -31,7 +31,7 @@ Steps to transpile pserver: """ import math -import random +import sys import numpy as np import collections import six @@ -182,7 +182,8 @@ class DistributeTranspiler(object): program=None, pservers="127.0.0.1:6174", trainers=1, - sync_mode=True): + sync_mode=True, + startup_program=None): """ Run the transpiler. @@ -195,13 +196,17 @@ class DistributeTranspiler(object): list. trainers (int): number of trainers in the distributed job. sync_mode (bool): Do sync training or not, default is True. + startup_program (Program|None): startup_program to transpile, + default is fluid.default_startup_program(). 
""" if program is None: program = default_main_program() + if startup_program is None: + startup_program = default_startup_program() self.origin_program = program - self.origin_startup_program = default_startup_program().clone() + self.startup_program = startup_program + self.origin_startup_program = self.startup_program.clone() - self.startup_program = default_startup_program() self.trainer_num = trainers self.sync_mode = sync_mode self.trainer_id = trainer_id @@ -239,8 +244,8 @@ class DistributeTranspiler(object): grad_var_mapping_items = list(six.iteritems(self.grad_var_mapping)) if not self.config.slice_var_up: - random.seed(self.origin_program.random_seed) - random.shuffle(grad_var_mapping_items) + np.random.seed(self.origin_program.random_seed) + np.random.shuffle(grad_var_mapping_items) grad_name_to_send_dummy_out = dict() for grad_varname, splited_vars in grad_var_mapping_items: @@ -268,6 +273,10 @@ class DistributeTranspiler(object): name=framework.generate_control_dev_var_name()) grad_name_to_send_dummy_out[grad_varname] = dummy_output + # get send op_role_var, if not splited, the grad should have .trainer suffix + # if splited, grad should be the original grad var name (split_by_ref and send + # will be on the same place). ParallelExecutor + # will use op_role_var to get expected device place to run this op. 
program.global_block()._insert_op( index=index + 1, type="send", @@ -276,18 +285,23 @@ class DistributeTranspiler(object): attrs={ "epmap": eplist, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, - OP_ROLE_VAR_ATTR_NAME: - [self.grad_name_to_param_name[grad_varname], grad_varname], + OP_ROLE_VAR_ATTR_NAME: [ + self.grad_name_to_param_name[grad_varname], + splited_grad_varname + ], "sync_mode": not self.sync_mode, }) for _, var in enumerate(splited_vars): send_vars.append(var) if self.sync_mode: + send_barrier_out = program.global_block().create_var( + name=framework.generate_control_dev_var_name()) + input_deps = grad_name_to_send_dummy_out.values() program.global_block().append_op( type="send_barrier", - inputs={}, - outputs={}, + inputs={"X": input_deps}, + outputs={"Out": send_barrier_out}, attrs={ "endpoints": pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE @@ -305,32 +319,46 @@ class DistributeTranspiler(object): self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i]) # step4: Concat the parameters splits together after recv. + all_recv_outputs = [] for param_varname, splited_var in six.iteritems(self.param_var_mapping): eps = [] for var in splited_var: index = [v.name for v in recv_vars].index(var.name) eps.append(eplist[index]) - grad_send_dummy_out = grad_name_to_send_dummy_out[ - self.param_name_to_grad_name[param_varname]] + if self.sync_mode: + recv_dep_in = send_barrier_out + else: + # connect deps to send op in async mode + recv_dep_in = grad_name_to_send_dummy_out[ + self.param_name_to_grad_name[param_varname]] + all_recv_outputs.extend(splited_var) + # get recv op_role_var, if not splited, the grad should have .trainer suffix + # if splited, grad should be the original grad var name. ParallelExecutor + # will use op_role_var to get expected device place to run this op. 
+ orig_grad_name = self.param_name_to_grad_name[param_varname] + recv_op_role_var_name = orig_grad_name + splited_trainer_grad = self.grad_var_mapping[orig_grad_name] + if len(splited_trainer_grad) == 1: + recv_op_role_var_name = splited_trainer_grad[0].name + program.global_block().append_op( type="recv", - inputs={"X": [grad_send_dummy_out]}, + inputs={"X": [recv_dep_in]}, outputs={"Out": splited_var}, attrs={ "epmap": eps, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, - OP_ROLE_VAR_ATTR_NAME: [ - param_varname, - self.param_name_to_grad_name[param_varname] - ], + OP_ROLE_VAR_ATTR_NAME: + [param_varname, recv_op_role_var_name], "sync_mode": not self.sync_mode }) if self.sync_mode: + # form a WAW dependency program.global_block().append_op( type="fetch_barrier", inputs={}, - outputs={}, + outputs={"Out": all_recv_outputs}, attrs={ "endpoints": pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE @@ -367,21 +395,18 @@ class DistributeTranspiler(object): return self.origin_program - def _get_trainer_startup_program(self, - recv_vars, - eplist, - startup_program=None): + def _get_trainer_startup_program(self, recv_vars, eplist): """ Get transpiled trainer side startup program. Args: - startup_program(Program): Startup program. + recv_vars (list): Variable list to recv for current trainer_id + eplist (list): A list of strings indicating Returns: Program: trainer side startup program. """ - if startup_program is None: - startup_program = self.startup_program + startup_program = self.startup_program # FIXME(gongwb): delete not need ops. # note that: some parameter is not trainable and those ops can't be deleted. 
@@ -414,10 +439,12 @@ class DistributeTranspiler(object): RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) + fetch_barrier_out = startup_program.global_block().create_var( + name=framework.generate_control_dev_var_name()) startup_program.global_block().append_op( type="fetch_barrier", inputs={}, - outputs={}, + outputs={"Out": fetch_barrier_out}, attrs={ "endpoints": self.pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE @@ -427,7 +454,18 @@ class DistributeTranspiler(object): #add concat ops to merge splited parameters received from parameter servers. if len(splited_var) <= 1: continue - orig_param = startup_program.global_block().vars[varname] + # NOTE: if enable memory optimization, origin vars maybe removed. + if startup_program.global_block().vars.has_key(varname): + orig_param = startup_program.global_block().vars[varname] + else: + origin_param_var = self.origin_program.global_block().vars[ + varname] + orig_param = startup_program.global_block().create_var( + name=varname, + persistable=origin_param_var.persistable, + type=origin_param_var.type, + dtype=origin_param_var.dtype, + shape=origin_param_var.shape) startup_program.global_block().append_op( type="concat", inputs={"X": splited_var}, @@ -450,7 +488,9 @@ class DistributeTranspiler(object): # NOTE: assume blocks of the same variable is not distributed # on the same pserver, only change param/grad varnames for # trainers to fetch. - + sys.stderr.write("get_pserver_program() is deprecated, call\ + get_pserver_programs() to get pserver main and startup\ + in a single call.") # step1 pserver_program = Program() pserver_program.random_seed = self.origin_program.random_seed @@ -640,32 +680,58 @@ class DistributeTranspiler(object): endpoint) pserver_program._sync_with_cpp() + # save pserver program to generate pserver side startup relatively. 
+ self.pserver_program = pserver_program return pserver_program + def get_pserver_programs(self, endpoint): + """ + Get pserver side main program and startup program for distributed training. + + Args: + endpoint (str): current pserver endpoint. + + Returns: + tuple: (main_program, startup_program), of type "Program" + """ + pserver_prog = self.get_pserver_program(endpoint) + pserver_startup = self.get_startup_program(endpoint) + return pserver_prog, pserver_startup + def get_startup_program(self, endpoint, - pserver_program, + pserver_program=None, startup_program=None): """ + **Deprecated** + Get startup program for current parameter server. Modify operator input variables if there are variables that were split to several blocks. Args: endpoint (str): current pserver endpoint. - pserver_program (Program): call get_pserver_program first and - pass the result here. - startup_program (Program): if pass None, will use - default_startup_program + pserver_program (Program): deprecated, call get_pserver_program first. + startup_program (Program): deprecated, should pass startup_program + when initializing Returns: Program: parameter server side startup program. 
""" + sys.stderr.write("get_startup_program() is deprecated, call\ + get_pserver_programs() to get pserver main and startup\ + in a single call.") + if pserver_program != None: + sys.stderr.write("passing pserver_program to get_startup_program()\ + is deprecated, you can use new API get_pserver_programs() to\ + get both pserver main program and startup program.") + if startup_program != None: + sys.stderr.write("passing startup_program to get_startup_program()\ + is deprecated, use fluid.program_guard() or pass this argument\ + to transpile() call.") + s_prog = Program() - if not startup_program: - orig_s_prog = default_startup_program() - else: - orig_s_prog = startup_program + orig_s_prog = self.startup_program s_prog.random_seed = orig_s_prog.random_seed params = self.param_grad_ep_mapping[endpoint]["params"] @@ -1324,13 +1390,11 @@ class DistributeTranspiler(object): inputs={"X": vars2merge}, outputs={"Out": merged_var}, attrs={"use_mkldnn": False}) - # TODO(panyx0718): What if it's SELECTED_ROWS. - if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS: - optimize_block.append_op( - type="scale", - inputs={"X": merged_var}, - outputs={"Out": merged_var}, - attrs={"scale": 1.0 / float(self.trainer_num)}) + optimize_block.append_op( + type="scale", + inputs={"X": merged_var}, + outputs={"Out": merged_var}, + attrs={"scale": 1.0 / float(self.trainer_num)}) return merged_var def _append_pserver_ops(self, optimize_block, opt_op, endpoint, diff --git a/tools/check_ctest_hung.py b/tools/check_ctest_hung.py index 7de76c381b29a1ff8dcf2167f0e861dc261aa47b..c44690a93ac3c1f1833ee62b4e13d1ae8220fb55 100644 --- a/tools/check_ctest_hung.py +++ b/tools/check_ctest_hung.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import sys import re @@ -46,7 +48,7 @@ Diff: set(['test_parallel_executor_crf']) start_parts = escape(l).split(" ") m = re.search("Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l)) started.add(m.group(1)) - print "Diff: ", started - passed + print("Diff: ", started - passed) if __name__ == "__main__": diff --git a/tools/print_signatures.py b/tools/print_signatures.py index 5e7ffd44c7b0ba2270069bc4467dc377a58b2417..e2805c4e7e6aa26a5865b64a874feef672bf9b36 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -17,6 +17,8 @@ Print all signature of a python module in alphabet order. Usage: ./print_signature "paddle.fluid" > signature.txt """ +from __future__ import print_function + import importlib import inspect import collections @@ -64,4 +66,4 @@ def visit_all_module(mod): visit_all_module(importlib.import_module(sys.argv[1])) for name in member_dict: - print name, member_dict[name] + print(name, member_dict[name]) diff --git a/tools/timeline.py b/tools/timeline.py index b413bb6fe0505df8fb09fa0759fefb6509b95bc9..f850476831d84787bf5cc7c7f7c91ff9dd6a2d5b 100644 --- a/tools/timeline.py +++ b/tools/timeline.py @@ -14,6 +14,7 @@ import argparse import json +import six import sys import unittest @@ -124,7 +125,7 @@ class Timeline(object): return cur_pid def _allocate_pids(self): - for k, profile_pb in self._profile_dict.iteritems(): + for k, profile_pb in six.iteritems(self._profile_dict): for event in profile_pb.events: if event.type == profiler_pb2.Event.CPU: if (k, event.device_id, "CPU") not in self._devices: @@ -140,7 +141,7 @@ class Timeline(object): (k, event.device_id), pid) def _allocate_events(self): - for k, profile_pb in self._profile_dict.iteritems(): + for k, profile_pb in six.iteritems(self._profile_dict): for event in profile_pb.events: if event.type == profiler_pb2.Event.CPU: type = "CPU"