diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py
index b8429975f5c83df6996e71478fe276b246e8b77b..420884ed8e1ae36a3f1772bfbe8323f3d0ea71e6 100644
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@@ -13,7 +13,7 @@ define_py_data_sources2(
settings(
batch_size=batch_size,
- learning_rate=0.01 / batch_size,
+ learning_rate=0.001 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 8fdc382f0c1c453a01dba884a3dad216e1c3092c..b21fc43904d9aafe9f7d019dfbe5b1c0d3f9e2d6 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -1,17 +1,12 @@
# Find the CBlas and lapack libraries
#
-# It will search MKL, atlas, OpenBlas, reference-cblas in order.
+# It will search MKLML, atlas, OpenBlas, reference-cblas in order.
#
# If any cblas implementation found, the following variable will be set.
-# CBLAS_PROVIDER # one of MKL, ATLAS, OPENBLAS, REFERENCE
+# CBLAS_PROVIDER # one of MKLML, ATLAS, OPENBLAS, REFERENCE
# CBLAS_INC_DIR # the include directory for cblas.
# CBLAS_LIBS # a list of libraries should be linked by paddle.
# # Each library should be full path to object file.
-#
-# User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT
-# during cmake. If none of them set, it will try to find cblas implementation in
-# system paths.
-#
set(CBLAS_FOUND OFF)
@@ -30,44 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
return()
endif()
-## Then find MKL.
-set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs")
-set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL")
-
-set(MKL_INCLUDE_SEARCH_PATHS
- ${MKL_ROOT}/include
- ${INTEL_MKL_ROOT}/include)
-set(MKL_LIB_SEARCH_PATHS
- ${MKL_ROOT}/lib
- ${MKL_ROOT}/lib/intel64
- ${INTEL_MKL_ROOT}/lib
- ${INTEL_MKL_ROOT}/lib/intel64)
-
-find_path(MKL_INC_DIR mkl.h PATHS
- ${MKL_INCLUDE_SEARCH_PATHS})
-find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
- ${MKL_INCLUDE_SEARCH_PATHS})
-find_library(MKL_CORE_LIB NAMES mkl_core PATHS
- ${MKL_LIB_SEARCH_PATHS})
-find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
- ${MKL_LIB_SEARCH_PATHS})
-find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
- ${MKL_LIB_SEARCH_PATHS})
-
-if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
- set(CBLAS_FOUND ON)
- set(CBLAS_PROVIDER MKL)
- set(CBLAS_INC_DIR ${MKL_INC_DIR} ${MKL_LAPACK_INC_DIR})
- set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} ${MKL_SEQUENTIAL_LIB} ${MKL_CORE_LIB})
-
- add_definitions(-DPADDLE_USE_MKL)
- add_definitions(-DLAPACK_FOUND)
-
- message(STATUS "Found MKL (include: ${MKL_INC_DIR}, library: ${CBLAS_LIBRARIES})")
- message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})")
- return()
-endif()
-
## Then find atlas.
set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
set(ATLAS_INCLUDE_SEARCH_PATHS
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 9686df00219001769d074ee815d9cc8db0258496..5a06825beb73e85d8a55b7b578b187bee2c4340c 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -46,16 +46,20 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
ENDIF()
+SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
+SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow")
ExternalProject_Add(
${MKLDNN_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
- GIT_TAG "v0.10"
+ GIT_TAG "v0.11"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT}
+ CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
+ CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-DMKLROOT:PATH=${MKLDNN_MKLROOT}
)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 74f3279831357c21038df133df0f5a432a6dfd20..20dbc32a738d982df2d3f035206279c82c8de264 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -27,8 +27,8 @@ ENDIF()
INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
-SET(MKLML_VER "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
+SET(MKLML_VER "mklml_lnx_2018.0.1.20171007")
+SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 3f86e456cfbe55fe47e5b18e755e34829ebe9930..05d83ad58ef8485d36829e7aeede79f625cfdc43 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -86,7 +86,7 @@ IF(NOT ${CBLAS_FOUND})
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
)
-
+ SET(CBLAS_PROVIDER openblas)
IF(WITH_C_API)
INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
# Because libopenblas.a is a symbolic link of another library, thus need to
@@ -115,7 +115,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-IF(${CBLAS_PROVIDER} MATCHES MKL)
+IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
ADD_LIBRARY(cblas SHARED ${dummyfile})
ELSE()
ADD_LIBRARY(cblas STATIC ${dummyfile})
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index c311783aa3187678c31c27ddbbd074790ca444f3..b9c1dde97bc444d793d67ff622fd6b13c6435a9a 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -93,7 +93,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
if(NOT APPLE AND NOT ANDROID)
find_package(Threads REQUIRED)
link_libraries(${CMAKE_THREAD_LIBS_INIT})
- set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt")
+ set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
endif(NOT APPLE AND NOT ANDROID)
function(merge_static_libs TARGET_NAME)
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index d4e9d53e5c0955912a594fe8cd9cd41a4080a2d2..203506d7ab84e5a5be2232b077eac2d433a99766 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -82,6 +82,11 @@ maxout
.. autoclass:: paddle.v2.layer.maxout
:noindex:
+roi_pool
+--------
+.. autoclass:: paddle.v2.layer.roi_pool
+ :noindex:
+
Norm Layer
==========
diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst
index fef87c4fbdb452771ecdb361c6eeae5b32bcee14..b56c7332cc284649c7e04328e51a7faa78593a39 100644
--- a/doc/api/v2/data.rst
+++ b/doc/api/v2/data.rst
@@ -2,112 +2,9 @@
Data Reader Interface and DataSets
==================================
+.. toctree::
+ :maxdepth: 1
-DataTypes
-=========
-
-.. automodule:: paddle.v2.data_type
- :members:
- :noindex:
-
-DataFeeder
-==========
-
-.. automodule:: paddle.v2.data_feeder
- :members:
- :noindex:
-
-Reader
-======
-
-.. automodule:: paddle.v2.reader
- :members:
- :noindex:
-
-.. automodule:: paddle.v2.reader.creator
- :members:
- :noindex:
-
-minibatch
-=========
-
-.. automodule:: paddle.v2.minibatch
- :members:
- :noindex:
-
-Dataset
-=======
-
-.. automodule:: paddle.v2.dataset
- :members:
- :noindex:
-
-mnist
-+++++
-
-.. automodule:: paddle.v2.dataset.mnist
- :members:
- :noindex:
-
-cifar
-+++++
-
-.. automodule:: paddle.v2.dataset.cifar
- :members:
- :noindex:
-
-conll05
-+++++++
-
-.. automodule:: paddle.v2.dataset.conll05
- :members: get_dict,get_embedding,test
- :noindex:
-
-imdb
-++++
-
-.. automodule:: paddle.v2.dataset.imdb
- :members:
- :noindex:
-
-imikolov
-++++++++
-
-.. automodule:: paddle.v2.dataset.imikolov
- :members:
- :noindex:
-
-movielens
-+++++++++
-
-.. automodule:: paddle.v2.dataset.movielens
- :members:
- :noindex:
-
-.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
- :noindex:
-
-.. autoclass:: paddle.v2.dataset.movielens.UserInfo
- :noindex:
-
-sentiment
-+++++++++
-
-.. automodule:: paddle.v2.dataset.sentiment
- :members:
- :noindex:
-
-uci_housing
-+++++++++++
-
-.. automodule:: paddle.v2.dataset.uci_housing
- :members:
- :noindex:
-
-wmt14
-+++++
-
-.. automodule:: paddle.v2.dataset.wmt14
- :members:
- :noindex:
-
+ data/data_reader.rst
+ data/image.rst
+ data/dataset.rst
diff --git a/doc/api/v2/data/data_reader.rst b/doc/api/v2/data/data_reader.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2ccfec9c284877a7576e9751526b169a4ac78d8e
--- /dev/null
+++ b/doc/api/v2/data/data_reader.rst
@@ -0,0 +1,36 @@
+=====================
+Data Reader Interface
+=====================
+
+
+DataTypes
+=========
+
+.. automodule:: paddle.v2.data_type
+ :members:
+ :noindex:
+
+DataFeeder
+==========
+
+.. automodule:: paddle.v2.data_feeder
+ :members:
+ :noindex:
+
+Reader
+======
+
+.. automodule:: paddle.v2.reader
+ :members:
+ :noindex:
+
+.. automodule:: paddle.v2.reader.creator
+ :members:
+ :noindex:
+
+minibatch
+=========
+
+.. automodule:: paddle.v2.minibatch
+ :members:
+ :noindex:
diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6a8ecc5bb1d855e0ded3719943ab3adb810de365
--- /dev/null
+++ b/doc/api/v2/data/dataset.rst
@@ -0,0 +1,75 @@
+Dataset
+=======
+
+.. automodule:: paddle.v2.dataset
+ :members:
+ :noindex:
+
+mnist
++++++
+
+.. automodule:: paddle.v2.dataset.mnist
+ :members:
+ :noindex:
+
+cifar
++++++
+
+.. automodule:: paddle.v2.dataset.cifar
+ :members:
+ :noindex:
+
+conll05
++++++++
+
+.. automodule:: paddle.v2.dataset.conll05
+ :members: get_dict,get_embedding,test
+ :noindex:
+
+imdb
+++++
+
+.. automodule:: paddle.v2.dataset.imdb
+ :members:
+ :noindex:
+
+imikolov
+++++++++
+
+.. automodule:: paddle.v2.dataset.imikolov
+ :members:
+ :noindex:
+
+movielens
++++++++++
+
+.. automodule:: paddle.v2.dataset.movielens
+ :members:
+ :noindex:
+
+.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
+ :noindex:
+
+.. autoclass:: paddle.v2.dataset.movielens.UserInfo
+ :noindex:
+
+sentiment
++++++++++
+
+.. automodule:: paddle.v2.dataset.sentiment
+ :members:
+ :noindex:
+
+uci_housing
++++++++++++
+
+.. automodule:: paddle.v2.dataset.uci_housing
+ :members:
+ :noindex:
+
+wmt14
++++++
+
+.. automodule:: paddle.v2.dataset.wmt14
+ :members:
+ :noindex:
diff --git a/doc/api/v2/data/image.rst b/doc/api/v2/data/image.rst
new file mode 100644
index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49
--- /dev/null
+++ b/doc/api/v2/data/image.rst
@@ -0,0 +1,5 @@
+Image Interface
+===============
+
+.. automodule:: paddle.v2.image
+ :members:
diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD
index fe8da907d9d45a2164031430ac5b7a3d5523967a..16236763a73770f3fe5eadf67645765d0456f875 100644
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -15,6 +15,7 @@
- [CMake](#cmake)
- [Layers](#layers)
- [Activations](#activations)
+ - [Weights](#weights)
- [Unit Tests](#unit-tests)
- [Protobuf Messages](#protobuf-messages)
- [Python API](#python-api)
@@ -45,17 +46,23 @@ Figure 1. PaddlePaddle on IA.
### Layers
所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在
-`paddle/gserver/layers`中,并且文件名都会一以*Mkldnn*开头。
+`paddle/gserver/layers`中,并且文件名都会以*MKLDNN*开头。
-所有MKL-DNN的layers都会继承于一个叫做`MkldnnLayer`的父类,该父类继承于PaddlePaddle的基类`Layer`。
+所有MKL-DNN的layers都会继承于一个叫做`MKLDNNLayer`的父类,该父类继承于PaddlePaddle的基类`Layer`。
+
+在`MKLDNNLayer`中会提供一些必要的接口和函数,并且会写好`forward`和`backward`的基本逻辑。部分函数定义为纯虚函数,子类只需要实现这些函数即可。
### Activations
-由于在PaddlePaddle中,激活函数是独立于layer概念的,所以会在`paddle/gserver/activations`目录下添加一个`MkldnnActivation.h`文件定义一些用于MKL-DNN的接口,实现方法还是会在`ActivationFunction.cpp`文件。
+由于在PaddlePaddle中,激活函数是独立于layer概念的,所以会在`paddle/gserver/activations`目录下添加`MKLDNNActivation.h`和`MKLDNNActivation.cpp`文件用于定义和使用MKL-DNN的接口。
-### Unit Tests
-会在`paddle/gserver/test`目录下添加`test_Mkldnn.cpp`和`MkldnnTester.*`用于MKL-DNN的测试。
+### Weights
+由于有些layer是含有参数的,我们会尽量让MKL-DNN的参数与PaddlePaddle中`parameter`共享一块内存。
+同时,由于MKL-DNN在训练时使用的参数layout可能与PaddlePaddle默认的`nchw`不一致,我们会在网络训练的开始和结束时分别转换这个layout,使得最终保存的参数格式与PaddlePaddle一致。
-Activation的测试,计划在PaddlePaddle原有的测试文件上直接添加新的测试type。
+### Unit Tests
+会在`paddle/gserver/test`目录下添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。
+测试分为每个layer(或activation)的单元测试和简单网络的整体测试。
+每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果,小于某个比较小的阈值认为通过。
### Protobuf Messages
根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。
@@ -82,7 +89,7 @@ if use_mkldnn
会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹,里面放入一些用于MKL-DNN测试的demo脚本。
### Benchmarking
-会考虑添加部分逻辑在`benchmark/paddle/image/run.sh`,添加使用MKL-DNN的测试。
+会添加`benchmark/paddle/image/run_mkldnn.sh`,用于测试使用MKL-DNN之后的性能。
### Others
1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为64。
@@ -94,14 +101,16 @@ if use_mkldnn
我们总结出一些特别需要注意的点:
-1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MkldnnLayer`特有的设备ID。
+1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MKLDNNLayer`特有的设备ID。
2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。
-3. 创建`MkldnnMatrix`,用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。
-4. 创建`MkldnnBase`,定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MkldnnStream`和`CpuEngine`,和未来可能还会用到`FPGAEngine`等。
-5. 在**Argument**里添加两个`MkldnnMatrixPtr`,取名为`mkldnnValue`和`mkldnnGrad`,用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名),用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。
-6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑,用于判断`deviceId`,并针对device在MKL-DNN和CPU之间不统一的情况,做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。
-7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。
-8. 关于MKLDNN参数的保存。由于MKLDNN参数的格式与PaddlePaddle原有的格式存在不一样的情况,所以需要在保存参数时同时保存该格式信息。目前准备扩展[Header](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Parameter.h#L247)里面的`int32_t version`。这个值不管是在v1还是在v2里面,一直保存的是0,所以可以充分利用这个信息,定义一个枚举处理所有MKLDNN的参数格式,从而`MKLDNNLayer`就可以从输入的参数中获取需要的格式信息。
+3. 创建`MKLDNNMatrix`,同时继承`CpuMatrix`和`mkldnn::memory`。用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。
+4. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。
+5. 每个`MKLDNNlayer`都会有`inVal_`,`inGrad_`,`outVal_`和`outGrad_`,分别代表input value, input gradient,output value和output gradient。他们会存放MKL-DNN用到的internal memory。同时还会定义以*ext*开头的`MKLDNNMatrix`(表示external的memory),主要是在格式与PaddlePaddle默认的`nchw`格式不匹配时,用于转换内存的工作。必要的转换函数也会在`MKLDNNLayer`中提前定义好,每个子类只需要调用定义好的reset buffer函数即可。
+6. 每个`MKLDNNlayer`的resetbuffer相关的函数(包括reset input、output的Value和grad),他们会根据输入参数reset internal和external的memory,当然这两者也可以相等,即表示不需要转换。只需要把握一个原则,每个`MKLDNNlayer`的子类,只需要使用internal的memory就可以了,所有external的转换工作在父类的reset函数中都提前准备好了。
+7. 一般来说,external的memory会尽量与PaddlePaddle中的`value`和`grad`共享内存。同时每个`MKLDNNLayer`中的external output value和gradient(也就是`extOutVal_`和`extOutGrad_`)必须分别与`output_.value`和`output_.grad`共享内存,因为PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`。如果不需要external的buffer用于转换,那么internal的buffer也会与他们共享内存。
+8. 如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value`与`extOutVal_`共享内存,同时数据格式就是`nchw`,这样下一个cpu device就能拿到正确的数据。在有cpu device的时候,external的memory的格式始终是`nchw`或者`nc`。
+9. 由于MKL-DNN的输出操作都是覆盖data的,不是在原来的数据上累加,所以当网络出现分支时,在`backward`时会需要merge不同layer的梯度。`MKLDNNlayer`中会实现merge的方法,此时每个小分支的input gradient会先临时保存在一个`MKLDNNMatrix`中,由分支处的layer负责求和,并把结果放到这个layer的`output_.grad`中。所以整体上,每个子类并不会需要关心分支的事情,也是在父类都实现好了。
+10. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。
## References
diff --git a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455
Binary files /dev/null and b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg differ
diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md
new file mode 100644
index 0000000000000000000000000000000000000000..9007aae7a8355ed06c6720a921351f81b859c1fe
--- /dev/null
+++ b/doc/design/ops/sequence_decoder.md
@@ -0,0 +1,245 @@
+# Design: Sequence Decoder Generating LoDTensors
+In tasks such as machine translation and image to text,
+a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences.
+
+This documentation describes how to implement the sequence decoder as an operator.
+
+## Beam Search based Decoder
+The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences,
+it is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
+
+In the old version of PaddlePaddle, a C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search,
+due to the complexity, the implementation relies on a lot of special data structures,
+which makes it quite complicated and hard for users to customize.
+
+There are a lot of heuristic tricks in the sequence generation tasks,
+so the flexibility of sequence decoder is very important to users.
+
+During PaddlePaddle's refactoring work,
+some new concept is proposed such as [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support sequence usage,
+and they can help to make the implementation of beam search based sequence decoder **more transparent and modular** .
+
+For example, the RNN states, candidate IDs and probabilities of beam search can be represented as `LoDTensors`;
+the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.
+
+## Changing LoD's absolute offset to relative offsets
+The current `LoDTensor` is designed to store levels of variable-length sequences,
+it stores several arrays of integers each represents a level.
+
+The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
+let's call this format the **absolute-offset LoD** for clarity.
+
+The absolute-offset LoD can quickly retrieve any sequence but fails to represent empty sequences, for example, a two-level LoD is as follows
+```python
+[[0, 3, 9]
+ [0, 2, 3, 3, 3, 9]]
+```
+The first level tells that there are two sequences:
+- the first's offset is `[0, 3)`
+- the second's offset is `[3, 9)`
+
+while on the second level, there are several empty sequences that both begin and end at `3`.
+It is impossible to tell how many empty second-level sequences exist in the first-level sequences.
+
+There are many scenarios that rely on empty sequence representation,
+such as machine translation or image to text, one instance has no translations or the empty candidate set for a prefix.
+
+So let's introduce another format of LoD,
+it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD.
+
+For example, to represent the same sequences of the above data
+
+```python
+[[0, 3, 6]
+ [0, 2, 3, 3, 3, 9]]
+```
+
+the first level represents that there are two sequences,
+their offsets in the second-level LoD are `[0, 3)` and `[3, 6)`.
+
+The second level is the same as the absolute-offset example because the lower level is a tensor.
+It is easy to find out the second sequence in the first-level LoD has two empty sequences.
+
+The following demos are based on relative-offset LoD.
+
+## Usage in a simple machine translation model
+Let's start from a simple machine translation model that is simplified from [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a simple blueprint of what a sequence decoder can do and how to use it.
+
+The model has an encoder that learns the semantic vector from a sequence,
+and a decoder which uses the sequence decoder to generate new sentences.
+
+**Encoder**
+```python
+import paddle as pd
+
+dict_size = 8000
+source_dict_size = dict_size
+target_dict_size = dict_size
+word_vector_dim = 128
+encoder_dim = 128
+decoder_dim = 128
+beam_size = 5
+max_length = 120
+
+# encoder
+src_word_id = pd.data(
+ name='source_language_word',
+ type=pd.data.integer_value_sequence(source_dict_dim))
+src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim)
+
+src_word_vec = pd.lookup(src_embedding, src_word_id)
+
+encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim)
+
+encoder_ctx = pd.last_seq(encoder_out_seq)
+# encoder_ctx_proj is the learned semantic vector
+encoder_ctx_proj = pd.fc(
+ encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None)
+```
+
+**Decoder**
+
+```python
+def generate():
+ decoder = pd.while_loop()
+ with decoder.step():
+ decoder_mem = decoder.memory(init=encoder_ctx) # mark the memory
+ generated_ids = decoder.memory() # TODO init to batch_size s
+ generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s
+
+        target_word = pd.lookup(trg_embedding, generated_ids)
+ # expand encoder_ctx's batch to fit target_word's lod
+ # for example
+ # decoder_mem.lod is
+ # [[0 1 3],
+ # [0 1 3 6]]
+ # its tensor content is [a1 a2 a3 a4 a5]
+ # which means there are 2 sentences to translate
+ # - the first sentence has 1 translation prefixes, the offsets are [0, 1)
+ # - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
+ # the target_word.lod is
+ # [[0, 1, 6]
+ # [0, 2, 4, 7, 9 12]]
+ # which means 2 sentences to translate, each has 1 and 5 prefixes
+ # the first prefix has 2 candidates
+ # the following has 2, 3, 2, 3 candidates
+ # the encoder_ctx_expanded's content will be
+ # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5]
+ encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
+ decoder_input = pd.fc(
+ act=pd.activation.Linear(),
+ input=[target_word, encoder_ctx],
+ size=3 * decoder_dim)
+ gru_out, cur_mem = pd.gru_step(
+ decoder_input, mem=decoder_mem, size=decoder_dim)
+ scores = pd.fc(
+ gru_out,
+ size=trg_dic_size,
+ bias=None,
+ act=pd.activation.Softmax())
+ # K is an config
+ topk_scores, topk_ids = pd.top_k(scores, K)
+ topk_generated_scores = pd.add_scalar(topk_scores, generated_scores)
+
+ selected_ids, selected_generation_scores = decoder.beam_search(
+ topk_ids, topk_generated_scores)
+
+ # update the states
+ decoder_mem.update(cur_mem) # tells how to update state
+ generated_ids.update(selected_ids)
+ generated_scores.update(selected_generation_scores)
+
+ decoder.output(selected_ids)
+ decoder.output(selected_generation_scores)
+
+translation_ids, translation_scores = decoder()
+```
+The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates,
+returns the result of the beam search algorithm.
+
+In this way, users can customize anything on the inputs or outputs of beam search, for example, two ways to prune some translation prefixes
+
+1. make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate.
+2. remove some specific candidate in `selected_ids`
+3. get the final `translation_ids`, remove the translation sequence in it.
+
+The implementation of sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
+so the python syntax is quite similar to a [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
+
+Both of them are two-level `LoDTensors`
+
+- the first level represents `batch_size` of (source) sentences;
+- the second level represents the candidate ID sets for translation prefix.
+
+for example, 3 source sentences to translate, and has 2, 3, 1 candidates.
+
+Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape,
+a `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
+
+For example, the previous state
+
+* LoD is `[0, 1, 3][0, 2, 5, 6]`
+* content of tensor is `a1 a2 b1 b2 b3 c1`
+
+the current state stored in `encoder_ctx_expanded`
+
+* LoD is `[0, 2, 7][0 3 5 8 9 11 11]`
+* the content is
+ - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates)
+ - a2 a2
+ - b1 b1 b1
+ - b2
+ - b3 b3
+ - None (c1 has 0 candidates, so c1 is dropped)
+
+Benefit from the relative offset LoD, empty candidate set can be represented naturally.
+
+the status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor, the corresponding syntax is
+
+```python
+decoder.output(selected_ids)
+decoder.output(selected_generation_scores)
+```
+
+the `selected_ids` is the candidate ids for the prefixes,
+it will be `Packed` by `TensorArray` to a two-level `LoDTensor`,
+the first level represents the source sequences,
+the second level represents generated sequences.
+
+Pack the `selected_scores` will get a `LoDTensor` that stores scores of each candidate of translations.
+
+Pack the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
+
+## LoD and shape changes during decoding
+
+
+
+
+According to the image above, the only phase to change LoD is beam search.
+
+## Beam search design
+The beam search algorithm will be implemented as one method of the sequence decoder; it has 3 inputs
+
+1. `topk_ids`, top K candidate ids for each prefix.
+2. `topk_scores`, the corresponding scores for `topk_ids`
+3. `generated_scores`, the score of the prefixes.
+
+All of them are LoDTensors, so that the sequence affiliation is clear.
+Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
+
+It will return three variables
+
+1. `selected_ids`, the final candidate beam search function selected for the next step.
+2. `selected_scores`, the scores for the candidates.
+3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
+
+## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
+The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors,
+and they exist in each time step,
+so it is natural to store them in arrays.
+
+Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors,
+the results of beam search are better to store in a `TensorArray`.
+
+The `Pack` and `UnPack` in `TensorArray` are used to package tensors in the array to a `LoDTensor` or split the `LoDTensor` to an array of tensors.
+It needs some extensions to support pack or unpack an array of `LoDTensors`.
diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst
index 0e939a2671ace8682c90cdc1c1bb2da1dda0d568..b331d9d36e6a279881c3b1a5586835e7186957fb 100644
--- a/doc/faq/local/index_cn.rst
+++ b/doc/faq/local/index_cn.rst
@@ -99,7 +99,7 @@ PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`spa
利用更多的计算资源
++++++++++++++++++
-利用更多的计算资源可以分为一下几个方式来进行\:
+利用更多的计算资源可以分为以下几个方式来进行\:
* 单机CPU训练
diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
index c823d7e9fcd63dd7719ac1403952b03c2d2f03c0..6cfc9536f20e88571a9845a50be0341fe4d9f78b 100644
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -214,7 +214,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
```cpp
// if use Eigen unsupported module before include head files
- #define EIGEN_USE_GPU
+ // #define EIGEN_USE_GPU
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel);
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index 4547afaf1dc9af8bc7909a684db766fdd7b159c0..53a36f8f20d1143470928f57eda6f575d9048236 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -54,6 +54,46 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
return kPD_NO_ERROR;
}
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+ paddle_real* value) {
+ if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
+ auto ptr = cast(mat);
+ if (ptr->mat == nullptr) return kPD_NULLPTR;
+ paddle::real* buf = ptr->mat->getRowBuf(0);
+ size_t width = ptr->mat->getWidth();
+ size_t height = ptr->mat->getHeight();
+ if (ptr->mat->useGpu()) {
+#ifdef PADDLE_WITH_CUDA
+ hl_memcpy(buf, value, sizeof(paddle::real) * width * height);
+#else
+ return kPD_NOT_SUPPORTED;
+#endif
+ } else {
+ std::copy(value, value + width * height, buf);
+ }
+ return kPD_NO_ERROR;
+}
+
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+ paddle_real* result) {
+ if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
+ auto ptr = cast(mat);
+ if (ptr->mat == nullptr) return kPD_NULLPTR;
+ paddle::real* buf = ptr->mat->getRowBuf(0);
+ size_t width = ptr->mat->getWidth();
+ size_t height = ptr->mat->getHeight();
+ if (ptr->mat->useGpu()) {
+#ifdef PADDLE_WITH_CUDA
+ hl_memcpy(result, buf, width * height * sizeof(paddle::real));
+#else
+ return kPD_NOT_SUPPORTED;
+#endif
+ } else {
+ std::copy(buf, buf + width * height, result);
+ }
+ return kPD_NO_ERROR;
+}
+
paddle_error paddle_matrix_get_row(paddle_matrix mat,
uint64_t rowID,
paddle_real** rawRowBuffer) {
diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c
index 3e6bd5285058a297c4574631e2a5c033b83936e8..876af2aa7615c098d225b56ce2ea0b1529a6e3c6 100644
--- a/paddle/capi/examples/model_inference/dense/main.c
+++ b/paddle/capi/examples/model_inference/dense/main.c
@@ -27,18 +27,20 @@ int main() {
CHECK(paddle_arguments_resize(in_args, 1));
// Create input matrix.
- paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+ paddle_matrix mat = paddle_matrix_create(/* sample_num */ 10,
/* size */ 784,
/* useGPU */ false);
srand(time(0));
- paddle_real* array;
- // Get First row.
- CHECK(paddle_matrix_get_row(mat, 0, &array));
+ std::vector input;
+ input.resize(784 * 10);
- for (int i = 0; i < 784; ++i) {
- array[i] = rand() / ((float)RAND_MAX);
+ for (int i = 0; i < input.size(); ++i) {
+ input[i] = rand() / ((float)RAND_MAX);
}
+
+ // Set value for the input matrix
+ CHECK(paddle_matrix_set_value(mat, input.data()));
CHECK(paddle_arguments_set_value(in_args, 0, mat));
@@ -51,11 +53,17 @@ int main() {
CHECK(paddle_arguments_get_value(out_args, 0, prob));
- CHECK(paddle_matrix_get_row(prob, 0, &array));
+ std::std::vector result;
+ int height;
+ int width;
+
+ CHECK(paddle_matrix_get_shape(prob, &height, &width);
+ result.resize(height * width);
+ CHECK(paddle_matrix_get_value(prob, result.data()));
printf("Prob: ");
- for (int i = 0; i < 10; ++i) {
- printf("%.2f ", array[i]);
+ for (int i = 0; i < height * width; ++i) {
+ printf("%.2f ", result[i]);
}
printf("\n");
diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h
index f15f7f3bbbd1457617111f827d2182ae6b7d9fdb..bb5223f8a275fa2550bf8b7e94a9c4333de4c8c9 100644
--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -70,6 +70,16 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
uint64_t rowID,
paddle_real* rowArray);
+/**
+ * @brief paddle_matrix_set_value Set value to matrix.
+ * @param mat Target Matrix
+ * @param value Row data.
+ * @return paddle_error
+ * @note value should contain enough element of data to init the mat
+ */
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+ paddle_real* value);
+
/**
* @brief PDMatGetRow Get raw row buffer from matrix
* @param [in] mat Target matrix
@@ -81,6 +91,15 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
uint64_t rowID,
paddle_real** rawRowBuffer);
+/**
+ * @brief copy data from the matrix
+ * @param [in] mat Target matrix
+ * @param [out] result pointer to store the matrix data
+ * @return paddle_error
+ * @note the space of the result should be allocated before invoking this API
+ */
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+ paddle_real* result);
/**
* @brief PDMatCreateNone Create None Matrix
* @return
diff --git a/paddle/capi/tests/test_Matrix.cpp b/paddle/capi/tests/test_Matrix.cpp
index 4bf9a9d6a9f9161561e9e5612edd2c93cab7ac5b..6940c28448a897cecd78b718fe720441086a5a99 100644
--- a/paddle/capi/tests/test_Matrix.cpp
+++ b/paddle/capi/tests/test_Matrix.cpp
@@ -45,3 +45,49 @@ TEST(CAPIMatrix, createNone) {
paddle_matrix mat = paddle_matrix_create_none();
ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
}
+
+TEST(CAPIMatrix, cpu_get_set_value) {
+ paddle_matrix mat = paddle_matrix_create(128, 32, false);
+ std::vector<paddle_real> sample;
+ std::vector<paddle_real> result;
+ sample.resize(128 * 32);
+ result.resize(128 * 32);
+ for (size_t i = 0; i < sample.size(); ++i) {
+ sample[i] = 1.0 / (i + 1.0);
+ }
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
+ for (size_t i = 0; i < sample.size(); ++i) {
+ ASSERT_NEAR(sample[i], result[i], 1e-5);
+ }
+
+ uint64_t height, width;
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+ ASSERT_EQ(128UL, height);
+ ASSERT_EQ(32UL, width);
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(CAPIMatrix, gpu_get_set_value) {
+ paddle_matrix mat = paddle_matrix_create(128, 32, true);
+ std::vector<paddle_real> sample;
+ std::vector<paddle_real> result;
+ sample.resize(128 * 32);
+ result.resize(128 * 32);
+ for (size_t i = 0; i < sample.size(); ++i) {
+ sample[i] = 1.0 / (i + 1.0);
+ }
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
+ for (size_t i = 0; i < sample.size(); ++i) {
+ ASSERT_NEAR(sample[i], result[i], 1e-5);
+ }
+
+ uint64_t height, width;
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+ ASSERT_EQ(128UL, height);
+ ASSERT_EQ(32UL, width);
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+#endif
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index ed94540c268e5ed990c1d92859c6a2093c052868..b3b9c45ded95ce2e735b8898d47760956dcacdce 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -321,8 +321,6 @@ static void CreateGradVarInBlock(
auto* param = block_desc->FindVarRecursive(pname);
auto* grad = block_desc->FindVar(arg);
if (param == nullptr) {
- LOG(WARNING) << "Cannot find forward variable of " << arg
- << ". Set its gradient to FP32";
grad->SetDataType(DataType::FP32);
} else {
grad->SetDataType(param->GetDataType());
@@ -379,6 +377,12 @@ std::vector> MakeOpGrad(
return grad_op_descs;
}
+static BlockDescBind* CreateStepBlock(
+ ProgramDescBind& program_desc,
+ std::unordered_set* no_grad_vars,
+ std::unordered_map* grad_to_var,
+ int step_block_idx);
+
std::vector> MakeBlockBackward(
ProgramDescBind& program_desc, int block_idx,
std::unordered_set* no_grad_vars,
@@ -394,13 +398,13 @@ std::vector> MakeBlockBackward(
if ((*it)->Type() == "recurrent") {
int step_block_idx = (*it)->GetBlockAttr("step_block");
- auto backward_block_op_descs = MakeBlockBackward(
- program_desc, step_block_idx, no_grad_vars, grad_to_var);
+ BlockDescBind* backward_block = CreateStepBlock(
+ program_desc, no_grad_vars, grad_to_var, step_block_idx);
+ op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
+ } else if ((*it)->Type() == "conditional_block") {
BlockDescBind* backward_block =
- program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
- for (auto& ptr : backward_block_op_descs) {
- backward_block->AppendAllocatedOp(std::move(ptr));
- }
+ CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
+ (*it)->GetBlockAttr("block"));
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
} else {
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
@@ -408,6 +412,11 @@ std::vector> MakeBlockBackward(
for (const auto& desc : op_grads) {
for (const std::string& out_name : desc->OutputArgumentNames()) {
+ if (out_name.find("@GRAD") == std::string::npos) {
+ // Not all outputs of a backward operator are gradients. Only gradients
+ // need to be summed; skip variables that are not gradients.
+ continue;
+ }
dup_out_ops[out_name].emplace_back(grad_desc_idx);
}
++grad_desc_idx;
@@ -446,6 +455,21 @@ std::vector> MakeBlockBackward(
return backward_descs;
}
+static BlockDescBind* CreateStepBlock(
+ ProgramDescBind& program_desc,
+ std::unordered_set* no_grad_vars,
+ std::unordered_map* grad_to_var,
+ int step_block_idx) {
+ auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
+ no_grad_vars, grad_to_var);
+ BlockDescBind* backward_block =
+ program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
+ for (auto& ptr : backward_block_op_descs) {
+ backward_block->AppendAllocatedOp(std::move(ptr));
+ }
+ return backward_block;
+}
+
ParamGradInfoMap AppendBackward(
ProgramDescBind& program_desc, const VarDescBind& target,
const std::unordered_set& no_grad_vars) {
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 4e8d630c2634682ff63b38182108eadebb5c7ff9..d485cdf6109274377ad0057223bdd8401e964aa7 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -21,7 +21,7 @@
#include "paddle/framework/var_desc.h"
#include "paddle/operators/net_op.h"
-USE_OP(fill_constant);
+USE_NO_KERNEL_OP(fill_constant);
namespace paddle {
namespace framework {
diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
index 9e3d597f3a2c84623a1ce9e4b6f4b956cffde211..11764810e1d40e5e6eb3cd0d8e9b4b63a79855b4 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -50,6 +50,15 @@ VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
return it->second.get();
}
+VarDescBind *BlockDescBind::FindRecursiveOrCreateVar(
+ const std::string &name_bytes) {
+ VarDescBind *res = FindVarRecursive(name_bytes);
+ if (res == nullptr) {
+ res = Var(name_bytes);
+ }
+ return res;
+}
+
bool BlockDescBind::HasVarRecursive(const std::string &name) const {
return FindVarRecursive(name) != nullptr;
}
diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h
index 26adf6a20ff09483b84f479db08efcf402135053..8e967e5378eb47a7869efb59cc96a271f1cbb9a1 100644
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -58,6 +58,8 @@ class BlockDescBind {
VarDescBind *FindVarRecursive(const std::string &name_bytes) const;
+ VarDescBind *FindRecursiveOrCreateVar(const std::string &name_bytes);
+
bool HasVarRecursive(const std::string &var_name) const;
std::set LocalVarNames() const {
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index c5ae7b185460c8b0d68ba38bb9db9bd3d3fb14ea..3ec88d7a72c3339bf5e7d0ca3957a3f608f039b7 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -34,6 +34,21 @@ inline DataType ToDataType(std::type_index type) {
}
}
+inline std::type_index ToTypeIndex(DataType type) {
+ switch (type) {
+ case DataType::FP32:
+ return typeid(float);
+ case DataType::FP64:
+ return typeid(double);
+ case DataType::INT32:
+ return typeid(int);
+ case DataType::INT64:
+ return typeid(int64_t);
+ default:
+ PADDLE_THROW("Not support type %d", type);
+ }
+}
+
template
inline void VisitDataType(DataType type, Visitor visitor) {
switch (type) {
diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc
index 10c785e04c4fa2192f9c95513009cf7d8c123868..53b899a23997b71e723a298ec360a4e018d89878 100644
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -79,6 +79,13 @@ DDim make_ddim(const std::vector& dims) {
return result;
}
+DDim make_ddim(const std::vector& dims) {
+ std::vector res(dims.size());
+ std::transform(dims.begin(), dims.end(), res.begin(),
+ [](int d) { return static_cast(d); });
+ return make_ddim(res);
+}
+
/// @cond HIDDEN
// XXX For some reason, putting this in an anonymous namespace causes errors
class DynamicMutableIndexer : public boost::static_visitor {
diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h
index aa773868ab4b68acbc46dfa2cd2569d8b8b7789d..4ca5e49566b7ec006eba80f3f9808bacb1ff2615 100644
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -81,6 +81,8 @@ struct DDim {
*/
DDim make_ddim(const std::vector& dims);
+DDim make_ddim(const std::vector& dims);
+
/**
* \brief Make a DDim from an initializer list
*
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index e7cba9e702ce0f96a9680169f0593130df2fd096..39c8def82e1ebb10a0e357a648af760099020c32 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -357,7 +357,8 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
"LOD_TENSOR";
for (auto &out_pair : this->outputs_) {
for (auto &out_var_name : out_pair.second) {
- block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR);
+ block->FindRecursiveOrCreateVar(out_var_name)
+ ->SetType(VarDesc::LOD_TENSOR);
}
}
}
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index fb2c69105627f663ddcce07d31526c9e4278e863..9428b8a07ea0af005f6e960ddaa02da624ad9d97 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -98,5 +98,23 @@ void Scope::DeleteScope(Scope* scope) {
delete scope;
}
+void Scope::Rename(const std::string& origin_name,
+ const std::string& new_name) const {
+ auto origin_it = vars_.find(origin_name);
+ PADDLE_ENFORCE(origin_it != vars_.end(),
+ "Cannot find original variable with name %s", origin_name);
+ auto new_it = vars_.find(new_name);
+ PADDLE_ENFORCE(new_it == vars_.end(),
+ "The variable with name %s is already in the scope", new_name);
+ vars_[new_name] = origin_it->second;
+ vars_.erase(origin_it);
+}
+
+std::string Scope::Rename(const std::string& origin_name) const {
+ auto var_name = string::Sprintf("%p.%d", this, vars_.size());
+ Rename(origin_name, var_name);
+ return var_name;
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index fb660949394149ebf2c6172a0ac3f4c7594f4286..c2aafb6ad825f9bd9ffef754923a15afdeaa8e5c 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -68,11 +68,18 @@ class Scope {
// enumerate all the variables current contains.
std::vector GetAllNames(bool recursive = false) const;
+ // Rename variable to a new name
+ void Rename(const std::string& origin_name,
+ const std::string& new_name) const;
+
+ // Rename variable to a new name and return the new name
+ std::string Rename(const std::string& origin_name) const;
+
private:
// Call Scope::NewScope for a sub-scope.
explicit Scope(Scope const* parent) : parent_(parent) {}
- std::unordered_map vars_;
+ mutable std::unordered_map vars_;
mutable std::list kids_;
Scope const* parent_{nullptr};
diff --git a/paddle/framework/var_type.h b/paddle/framework/var_type.h
index d060196bb2c478b776851288cb71a1880d60660d..0f19870bec3e69d07278507cc556a86bbd25d12d 100644
--- a/paddle/framework/var_type.h
+++ b/paddle/framework/var_type.h
@@ -27,10 +27,32 @@ inline VarDesc::VarType ToVarType(std::type_index type) {
return VarDesc_VarType_LOD_RANK_TABLE;
} else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
return VarDesc_VarType_LOD_TENSOR_ARRAY;
+ } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
+ return VarDesc_VarType_SELECTED_ROWS;
} else {
PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
}
}
+template
+inline void VisitVarType(const Variable& var, Visitor visitor) {
+ switch (ToVarType(var.Type())) {
+ case VarDesc_VarType_LOD_TENSOR:
+ visitor(var.Get());
+ return;
+ case VarDesc_VarType_LOD_RANK_TABLE:
+ visitor(var.Get());
+ return;
+ case VarDesc_VarType_LOD_TENSOR_ARRAY:
+ visitor(var.Get());
+ return;
+ case VarDesc_VarType_SELECTED_ROWS:
+ visitor(var.Get());
+ return;
+ default:
+ PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
+ }
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 4fd72d64a90ae6f16dd1499ceb7fba6e40fe4cea..9b2779b42cad324253dadf27dbff20fd8e8c8e16 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -45,6 +45,7 @@ if(WITH_GPU)
add_simple_unittest(BlockExpandOpTest)
add_simple_unittest(CropOpTest)
add_simple_unittest(SwitchOpTest)
+ add_simple_unittest(ScaleSubRegionOpTest)
endif()
add_simple_unittest(Im2ColTest)
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index ba446bf92da264fafa1fb47a2c30da9cb13176ce..370940532ef40335be54a3e6467de0409e923ec4 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -110,6 +110,7 @@ public:
function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
function1_->init(config);
function2_->init(config);
+ initArgsCallback_ = nullptr;
}
~Compare2Function() {}
@@ -170,6 +171,10 @@ public:
*seq2_));
}
+ void registerInitCallback(std::function callback) {
+ initArgsCallback_ = callback;
+ }
+
// output need only contains shape, do not contains data.
void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
size_t size =
@@ -340,6 +345,10 @@ protected:
initArg(*func1Inputs_[i]);
}
+ if (initArgsCallback_ != nullptr) {
+ initArgsCallback_(*func1Inputs_[i], i);
+ }
+
copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
}
}
@@ -386,6 +395,7 @@ protected:
std::shared_ptr seq1_;
std::shared_ptr seq2_;
test::CopyArgument copyArg_;
+ std::function initArgsCallback_;
};
class CpuGpuFuncCompare
diff --git a/paddle/function/ScaleSubRegionOp.cpp b/paddle/function/ScaleSubRegionOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a080505d7df83a6c0a9d88fbcb7863fc0e1f7b21
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOp.cpp
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "paddle/function/TensorShape.h"
+
+namespace paddle {
+
+template <>
+void ScaleSubRegion(real* outputs,
+ const real* inputs,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+ real value = conf.get("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ memcpy(outputs, inputs, number * channel * height * width * sizeof(real));
+
+ for (int n = 0; n < number; ++n) {
+ // indices start from 1
+ int offset = n * 6;
+ for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) {
+ for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) {
+ for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) {
+ int idx = ((n * channel + c) * height + h) * width + w;
+ outputs[idx] *= value;
+ }
+ }
+ }
+ }
+}
+
+template <>
+void ScaleSubRegionGrad(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+ real value = conf.get("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ for (int n = 0; n < number; ++n) {
+ for (int c = 0; c < channel; ++c) {
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ int idx = ((n * channel + c) * height + h) * width + w;
+ int offset = n * 6;
+ if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+ h >= (indices[offset + 2] - 1) &&
+ h <= (indices[offset + 3] - 1) &&
+ w >= (indices[offset + 4] - 1) &&
+ w <= (indices[offset + 5] - 1)) {
+ outGrad[idx] += inGrad[idx] * value;
+ } else {
+ outGrad[idx] += inGrad[idx];
+ }
+ }
+ }
+ }
+ }
+}
+
+/**
+ * \brief For each instance, ScaleSubRegion can be used to multiply a value to
+ * a specified sub continuous region. By providing start index and end
+ * index for C/H/W, you can specify the location and shape of the region.
+ *
+ * Argument in this Function:
+ * \param inputs A 4-D tensor with shape [N, C, H, W], only one input.
+ * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs A 4-D tensor with same shape as inputs, output value.
+ */
+template
+class ScaleSubRegionFunc : public FunctionBase {
+public:
+ void init(const FuncConfig& config) override { conf_ = config; }
+
+ void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+ CHECK_EQ(2UL, inputs.size());
+ CHECK_EQ(1UL, outputs.size());
+ CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+ TensorShape shape = inputs[0].shape();
+
+ ScaleSubRegion(outputs[0].data(),
+ inputs[0].data(),
+ inputs[1].data(),
+ shape,
+ conf_);
+ }
+
+private:
+ FuncConfig conf_;
+};
+
+/**
+ * \brief The backward propagation of ScaleSubRegion Function.
+ *
+ * Argument in this Function:
+ * \param inputs A 4-D tensor with shape [N, C, H, W], output gradient.
+ * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value.
+ */
+
+template
+class ScaleSubRegionGradFunc : public FunctionBase {
+public:
+ void init(const FuncConfig& config) override { conf_ = config; }
+
+ void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+ CHECK_EQ(2UL, inputs.size());
+ CHECK_EQ(1UL, outputs.size());
+ CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+ TensorShape shape = inputs[0].shape();
+
+ ScaleSubRegionGrad(inputs[0].data(),
+ outputs[0].data(),
+ inputs[1].data(),
+ shape,
+ conf_);
+ }
+
+private:
+ FuncConfig conf_;
+};
+
+REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
+#endif
+
+} // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOp.h b/paddle/function/ScaleSubRegionOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..0480c8577f3fbf3bc9e94b635df96a31b103e9e3
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOp.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief Function to multiply a value to values in specified sub continuous
+ * region. Indices must be provided to indicate the location and shape of
+ * the region and the multiplied value is passed by configure variable.
+ *
+ *
+ * \param[out] outputs Output value.
+ * \param[in] inputs Input data which contains NCHW information.
+ * \param[in] indices Indices data to indicate the sub region.
+ * \param[in] shape Tensor shape of input value.
+ * \param[in] conf Configure variable which contains the multiplied value.
+ */
+template
+void ScaleSubRegion(real* outputs,
+ const real* inputs,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf);
+
+/**
+ * \brief Backward propagation function of ScaleSubRegion.
+ *
+ * \param[out] inGrad Gradients of previous layer.
+ * \param[in] outGrad Output gradient.
+ * \param[in] indices Indices data.
+ * \param[in] shape The Shape of input tensor.
+ * \param[in] conf Configure variable.
+ */
+template
+void ScaleSubRegionGrad(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf);
+} // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOpGpu.cu b/paddle/function/ScaleSubRegionOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8aae2e44c3fdc8b516e66ecfd2e04f466a17dde9
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOpGpu.cu
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "hl_base.h"
+
+namespace paddle {
+
+__global__ void KeScaleSubRegion(real* outputs,
+ const real* inputs,
+ const real* indices,
+ real value,
+ int channel,
+ int height,
+ int width,
+ int nthreads) {
+ const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (idx < nthreads) {
+ const int w = idx % width;
+ const int h = (idx / width) % height;
+ const int c = (idx / width / height) % channel;
+ const int n = idx / width / height / channel;
+
+ const int offset = n * 6;
+ if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+ h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+ w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+ outputs[idx] = inputs[idx] * value;
+ } else {
+ outputs[idx] = inputs[idx];
+ }
+ }
+}
+
+template <>
+void ScaleSubRegion(real* outputs,
+ const real* inputs,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+ real value = conf.get("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ size_t nth = number * channel * height * width;
+ int blockSize = 1024;
+ int gridSize = (nth + blockSize - 1) / blockSize;
+
+ KeScaleSubRegion<<>>(
+ outputs, inputs, indices, value, channel, height, width, nth);
+ CHECK_SYNC("ScaleSubRegion");
+}
+
+__global__ void KeScaleSubRegionDiff(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ real value,
+ int channel,
+ int height,
+ int width,
+ int nthreads) {
+ const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (idx < nthreads) {
+ const int w = idx % width;
+ const int h = (idx / width) % height;
+ const int c = (idx / width / height) % channel;
+ const int n = idx / width / height / channel;
+
+ const int offset = n * 6;
+ if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+ h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+ w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+ outGrad[idx] += inGrad[idx] * value;
+ } else {
+ outGrad[idx] += inGrad[idx];
+ }
+ }
+}
+
+template <>
+void ScaleSubRegionGrad(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+ real value = conf.get("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ size_t nth = number * channel * height * width;
+ int blockSize = 1024;
+ int gridSize = (nth + blockSize - 1) / blockSize;
+
+ KeScaleSubRegionDiff<<>>(
+ inGrad, outGrad, indices, value, channel, height, width, nth);
+ CHECK_SYNC("ScaleSubRegionGrad");
+}
+
+} // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOpTest.cpp b/paddle/function/ScaleSubRegionOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..43331f258dddaa43cbc8cc77519e299de7e98290
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOpTest.cpp
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(ScaleSubRegion, real) {
+ for (size_t numSamples : {5, 32}) {
+ for (size_t channels : {5, 32}) {
+ for (size_t imgSizeH : {5, 33}) {
+ for (size_t imgSizeW : {5, 32}) {
+ for (real value : {-0.5, 0.0, 0.5}) {
+ for (bool firstHalf : {false, true}) {
+ VLOG(3) << " numSamples=" << numSamples
+ << " channels=" << channels << " imgSizeH=" << imgSizeH
+ << " imgSizeW=" << imgSizeW;
+
+ for (bool testGrad : {false, true}) {
+ CpuGpuFuncCompare compare(
+ testGrad ? "ScaleSubRegionGrad" : "ScaleSubRegion",
+ FuncConfig().set("value", value));
+
+ TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+ TensorShape indicesShape{numSamples, 6};
+
+ compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+ compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, indicesShape));
+
+ compare.registerInitCallback([=](BufferArg& arg, size_t index) {
+ if (index == 1) {
+ real* data = (real*)arg.data();
+
+ for (size_t i = 0; i < numSamples; ++i) {
+ size_t offset = i * 6;
+ data[offset] = firstHalf ? 1 : channels / 2;
+ data[offset + 1] = firstHalf ? channels / 2 : channels;
+ data[offset + 2] = firstHalf ? 1 : imgSizeH / 2;
+ data[offset + 3] = firstHalf ? imgSizeH / 2 : imgSizeH;
+ data[offset + 4] = firstHalf ? 1 : imgSizeW / 2;
+ data[offset + 5] = firstHalf ? imgSizeW / 2 : imgSizeW;
+ }
+ }
+ });
+
+ compare.addOutputs(
+ BufferArg(
+ VALUE_TYPE_FLOAT, shape, testGrad ? ADD_TO : ASSIGN_TO),
+ testGrad ? ADD_TO : ASSIGN_TO);
+ compare.run();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
index 8eb700723f2cf7dda969739bb5e3d48358d278a0..0f2b67fd758ec1513f42c4cb1a36f2f3915f4740 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@@ -54,7 +54,6 @@ void MKLDNNAddtoLayer::reshape(
ow = iw;
reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow);
- printSizeInfo();
}
void MKLDNNAddtoLayer::resetFwd(std::vector& pipeline,
@@ -62,16 +61,14 @@ void MKLDNNAddtoLayer::resetFwd(std::vector& pipeline,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
- if (biases_) {
- LOG(FATAL) << "not implemented yet";
- }
- resetFwdBuffers(inVals_, out);
+ resetFwdBuffers(inVals_, bias, out);
in = inVals_[0];
std::shared_ptr fwdPD;
- resetFwdPD(fwdPD, inVals_, out);
+ std::shared_ptr biasPD;
+ resetFwdPD(fwdPD, biasPD, inVals_, bias, out);
- resetFwdPipeline(pipeline, fwdPD, inVals_, out);
+ resetFwdPipeline(pipeline, fwdPD, biasPD, inVals_, bias, out);
}
void MKLDNNAddtoLayer::resetBwd(std::vector& pipeline,
@@ -79,7 +76,7 @@ void MKLDNNAddtoLayer::resetBwd(std::vector& pipeline,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
- resetBwdBuffers(inGrads_, out);
+ resetBwdBuffers(inGrads_, bias, out);
in = inGrads_[0];
// backward only need share output grad to input grad
@@ -89,6 +86,20 @@ void MKLDNNAddtoLayer::resetBwd(std::vector& pipeline,
inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
}
}
+
+ // backward bias
+ bwdBias_ = nullptr;
+ if (bias) {
+ std::vector scales(bs_, 1.0);
+ std::vector srcPDs(bs_, bias->getPrimitiveDesc());
+ auto biasPD = sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs);
+ std::vector srcs;
+ for (size_t i = 0; i < grads_.size(); ++i) {
+ srcs.push_back(*(grads_[i]));
+ }
+ bwdBias_.reset(new sum(biasPD, srcs, *bias));
+ pipeline.push_back(*bwdBias_);
+ }
}
void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
@@ -97,7 +108,25 @@ void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
}
}
+void MKLDNNAddtoLayer::prepareBias(MKLDNNMatrixPtr& bias,
+ const MatrixPtr& biasMat,
+ const MKLDNNMatrixPtr& out,
+ std::vector& outs) {
+ auto pd = MKLDNNMatrix::createPrimitiveDesc(
+ {(int)layerSize_}, memory::format::x, engine_);
+ bias = MKLDNNMatrix::create(pd, biasMat);
+ outs.clear();
+ real* data = out->getData();
+ CHECK_EQ(bs_ * layerSize_, out->getElementCnt());
+ for (int i = 0; i < bs_; ++i) {
+ MatrixPtr tmp =
+ Matrix::create(data + i * layerSize_, 1, layerSize_, false, false);
+ outs.push_back(MKLDNNMatrix::create(bias->getPrimitiveDesc(), tmp));
+ }
+}
+
void MKLDNNAddtoLayer::resetFwdBuffers(std::vector& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
@@ -110,12 +139,20 @@ void MKLDNNAddtoLayer::resetFwdBuffers(std::vector& inputs,
}
resetOutValue(out, inputs[0]->getPrimitiveDesc());
+
+ if (biases_ && biases_->getW()) {
+ prepareBias(bias, biases_->getW(), out, vals_);
+ } else {
+ bias = nullptr;
+ }
}
void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr& pd,
+ std::shared_ptr& biasPD,
std::vector& inputs,
+ MKLDNNMatrixPtr bias,
MKLDNNMatrixPtr out) {
- std::vector scales(inputs.size(), 1.0);
+ std::vector scales(inputs.size(), 1.0);
std::vector srcPDs;
for (size_t i = 0; i < inputs.size(); i++) {
srcPDs.push_back(inputs[i]->getPrimitiveDesc());
@@ -123,12 +160,23 @@ void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr& pd,
CHECK(out);
pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+
+ biasPD = nullptr;
+ if (bias) {
+ std::vector scales(2, 1.0);
+ std::vector srcPDs(2, bias->getPrimitiveDesc());
+ biasPD.reset(
+ new sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs));
+ CHECK_PRIMITIVE_DESC_EQ(bias, biasPD->dst_primitive_desc());
+ }
}
void MKLDNNAddtoLayer::resetFwdPipeline(
std::vector& pipeline,
std::shared_ptr& pd,
+ std::shared_ptr& biasPD,
std::vector& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
std::vector srcs;
for (size_t i = 0; i < inputs.size(); i++) {
@@ -136,9 +184,23 @@ void MKLDNNAddtoLayer::resetFwdPipeline(
}
fwd_.reset(new sum(*pd, srcs, *out));
pipeline.push_back(*fwd_);
+
+ fwdBias_.clear();
+ if (biasPD == nullptr || bias == nullptr) {
+ return;
+ }
+ fwdBias_.resize(vals_.size());
+ for (size_t i = 0; i < vals_.size(); ++i) {
+ std::vector srcs;
+ srcs.push_back(*(vals_[i]));
+ srcs.push_back(*bias);
+ fwdBias_[i].reset(new sum(*biasPD, srcs, *vals_[i]));
+ pipeline.push_back(*fwdBias_[i]);
+ }
}
void MKLDNNAddtoLayer::resetBwdBuffers(std::vector& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
CHECK(outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
@@ -149,6 +211,12 @@ void MKLDNNAddtoLayer::resetBwdBuffers(std::vector& inputs,
resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
}
+
+ if (biases_ && biases_->getWGrad()) {
+ prepareBias(bias, biases_->getWGrad(), out, grads_);
+ } else {
+ bias = nullptr;
+ }
}
} // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.h b/paddle/gserver/layers/MKLDNNAddtoLayer.h
index 15f74ec5bdf3d1e4ae5e09051be6be418590a67a..24504b7b4f50726e2b2757ca3029461cdc27b411 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.h
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@@ -32,9 +32,15 @@ protected:
// layer size == ic * ih * iw == oc * oh *ow, and can not be changed
size_t layerSize_;
- // TODO(TJ): this part has not been optimized by MKL-DNN
std::unique_ptr biases_;
+ // buffers for adding bias
+ std::vector vals_;
+ std::vector grads_;
+ // primitives for adding bias
+ std::vector> fwdBias_;
+ std::shared_ptr bwdBias_;
+
public:
explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
@@ -91,20 +97,34 @@ protected:
* reset pipeline.
*/
void resetFwdBuffers(std::vector& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr& pd,
+ std::shared_ptr& biasPD,
std::vector& inputs,
+ MKLDNNMatrixPtr bias,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector& pipeline,
std::shared_ptr& pd,
+ std::shared_ptr& biasPD,
std::vector& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(inputs, output, bias)
*/
void resetBwdBuffers(std::vector& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
+
+ /**
+ * prepare for bias
+ */
+ void prepareBias(MKLDNNMatrixPtr& bias,
+ const MatrixPtr& biasMat,
+ const MKLDNNMatrixPtr& out,
+ std::vector& outs);
};
} // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
index 9b0ae20f089e34a719883bc65e88e33ab9334e39..071bdf54d5dc9538d5ced580a73b9c0fbcea41fb 100644
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@@ -119,13 +119,12 @@ void MKLDNNBatchNormLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw);
oh = ih;
- ow = ow;
+ ow = iw;
// ic_ and oc can not be changed
CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
<< "Input channel can not be changed";
reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow);
- printSizeInfo();
}
void MKLDNNBatchNormLayer::resetFwd(std::vector& pipeline,
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index b8120eda1e2dadab943869a05546351a369af6fd..8aa54e0a9efa7adb766cbb6009f6a29410c6ae7d 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -102,8 +102,6 @@ void MKLDNNConvLayer::reshape(
reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow);
-
- printSizeInfo();
}
void MKLDNNConvLayer::resetFwd(std::vector& pipeline,
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h
index 1fed0e1c6565b763a3ee73a0853f560ddfbd44c6..9c69136684e5f9005860b476ec6ed1bbc9ceff6c 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
@@ -92,7 +92,7 @@ public:
void printSizeInfo() override {
MKLDNNLayer::printSizeInfo();
VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
- << ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+ << ", ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
<< ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
}
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index 3429c53d2396e051d62fe0ae405934758e89f9c2..350ec65fffbc73c3a6e4245f763f4c6aa868f574 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -84,8 +84,6 @@ void MKLDNNFcLayer::reshape(
reshapeOutput(oh, ow);
resizeOutput(bs, oc);
-
- printSizeInfo();
}
void MKLDNNFcLayer::resetFwd(std::vector& pipeline,
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
index 82ef344c7b2aa0093a5f0a28780592dea5d51efe..e75ac5ba4647a8267b7bc189893bd7adb5c3053f 100644
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -287,7 +287,7 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
return;
}
CHECK(out) << "should have reset internal ouput grad";
- std::vector scales(outputMap_.size(), 1.0);
+ std::vector scales(outputMap_.size(), 1.0);
std::vector srcPDs;
std::vector srcs;
for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
index 6e89260f49979d4edb4da138507a73dc2bf120de..a18c455beab96ef25b5545281bae4d48cec98d9e 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -71,8 +71,6 @@ void MKLDNNPoolLayer::reshape(
reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow);
-
- printSizeInfo();
}
void MKLDNNPoolLayer::resetFwd(std::vector& pipeline,
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..35d4b12d3d357800fe72899069b5377c252fac5f
--- /dev/null
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -0,0 +1,220 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ROIPoolLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(roi_pool, ROIPoolLayer);
+
+bool ROIPoolLayer::init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) {
+ Layer::init(layerMap, parameterMap);
+
+ const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
+ pooledWidth_ = layerConf.pooled_width();
+ pooledHeight_ = layerConf.pooled_height();
+ spatialScale_ = layerConf.spatial_scale();
+
+ return true;
+}
+
+void ROIPoolLayer::forward(PassType passType) {
+ Layer::forward(passType);
+
+ const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
+ height_ = getInput(0).getFrameHeight();
+ if (!height_) height_ = layerConf.height();
+ width_ = getInput(0).getFrameWidth();
+ if (!width_) width_ = layerConf.width();
+ channels_ = getInputValue(0)->getWidth() / width_ / height_;
+
+ size_t batchSize = getInput(0).getBatchSize();
+ size_t numROIs = getInput(1).getBatchSize();
+
+ MatrixPtr dataValue = getInputValue(0);
+ MatrixPtr roiValue = getInputValue(1);
+ resetOutput(numROIs, channels_ * pooledHeight_ * pooledWidth_);
+ MatrixPtr outputValue = getOutputValue();
+
+ if (useGpu_) { // TODO(guosheng): implement on GPU later
+ MatrixPtr dataCpuBuffer;
+ Matrix::resizeOrCreate(dataCpuBuffer,
+ dataValue->getHeight(),
+ dataValue->getWidth(),
+ false,
+ false);
+ MatrixPtr roiCpuBuffer;
+ Matrix::resizeOrCreate(roiCpuBuffer,
+ roiValue->getHeight(),
+ roiValue->getWidth(),
+ false,
+ false);
+ dataCpuBuffer->copyFrom(*dataValue);
+ roiCpuBuffer->copyFrom(*roiValue);
+ dataValue = dataCpuBuffer;
+ roiValue = roiCpuBuffer;
+ MatrixPtr outputCpuBuffer;
+ Matrix::resizeOrCreate(outputCpuBuffer,
+ outputValue->getHeight(),
+ outputValue->getWidth(),
+ false,
+ false);
+ outputCpuBuffer->copyFrom(*outputValue);
+ outputValue = outputCpuBuffer;
+ }
+
+ real* bottomData = dataValue->getData();
+ size_t batchOffset = dataValue->getWidth();
+ size_t channelOffset = height_ * width_;
+ real* bottomROIs = roiValue->getData();
+ size_t roiOffset = roiValue->getWidth();
+ size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
+
+ real* outputData = outputValue->getData();
+ Matrix::resizeOrCreate(maxIdxs_,
+ numROIs,
+ channels_ * pooledHeight_ * pooledWidth_,
+ false,
+ false);
+ real* argmaxData = maxIdxs_->getData();
+
+ for (size_t n = 0; n < numROIs; ++n) {
+ // the first five elements of each RoI should be:
+ // batch_idx, roi_x_start, roi_y_start, roi_x_end, roi_y_end
+ size_t roiBatchIdx = bottomROIs[0];
+ size_t roiStartW = round(bottomROIs[1] * spatialScale_);
+ size_t roiStartH = round(bottomROIs[2] * spatialScale_);
+ size_t roiEndW = round(bottomROIs[3] * spatialScale_);
+ size_t roiEndH = round(bottomROIs[4] * spatialScale_);
+ CHECK_GE(roiBatchIdx, 0UL);
+ CHECK_LT(roiBatchIdx, batchSize);
+ size_t roiHeight = std::max(roiEndH - roiStartH + 1, 1UL);
+ size_t roiWidth = std::max(roiEndW - roiStartW + 1, 1UL);
+ real binSizeH =
+ static_cast(roiHeight) / static_cast(pooledHeight_);
+ real binSizeW =
+ static_cast(roiWidth) / static_cast(pooledWidth_);
+ real* batchData = bottomData + batchOffset * roiBatchIdx;
+ for (size_t c = 0; c < channels_; ++c) {
+ for (size_t ph = 0; ph < pooledHeight_; ++ph) {
+ for (size_t pw = 0; pw < pooledWidth_; ++pw) {
+ size_t hstart = static_cast(std::floor(ph * binSizeH));
+ size_t wstart = static_cast(std::floor(pw * binSizeW));
+ size_t hend = static_cast(std::ceil((ph + 1) * binSizeH));
+ size_t wend = static_cast(std::ceil((pw + 1) * binSizeW));
+ hstart = std::min(std::max(hstart + roiStartH, 0UL), height_);
+ wstart = std::min(std::max(wstart + roiStartW, 0UL), width_);
+ hend = std::min(std::max(hend + roiStartH, 0UL), height_);
+ wend = std::min(std::max(wend + roiStartW, 0UL), width_);
+
+ bool isEmpty = (hend <= hstart) || (wend <= wstart);
+ size_t poolIndex = ph * pooledWidth_ + pw;
+ if (isEmpty) {
+ outputData[poolIndex] = 0;
+ argmaxData[poolIndex] = -1;
+ }
+
+ for (size_t h = hstart; h < hend; ++h) {
+ for (size_t w = wstart; w < wend; ++w) {
+ size_t index = h * width_ + w;
+ if (batchData[index] > outputData[poolIndex]) {
+ outputData[poolIndex] = batchData[index];
+ argmaxData[poolIndex] = index;
+ }
+ }
+ }
+ }
+ }
+ batchData += channelOffset;
+ outputData += poolChannelOffset;
+ argmaxData += poolChannelOffset;
+ }
+ bottomROIs += roiOffset;
+ }
+ if (useGpu_) {
+ getOutputValue()->copyFrom(*outputValue);
+ }
+}
+
+void ROIPoolLayer::backward(const UpdateCallback& callback) {
+ MatrixPtr inGradValue = getInputGrad(0);
+ MatrixPtr outGradValue = getOutputGrad();
+ MatrixPtr roiValue = getInputValue(1);
+
+ if (useGpu_) {
+ MatrixPtr inGradCpuBuffer;
+ Matrix::resizeOrCreate(inGradCpuBuffer,
+ inGradValue->getHeight(),
+ inGradValue->getWidth(),
+ false,
+ false);
+ MatrixPtr outGradCpuBuffer;
+ Matrix::resizeOrCreate(outGradCpuBuffer,
+ outGradValue->getHeight(),
+ outGradValue->getWidth(),
+ false,
+ false);
+ MatrixPtr roiCpuBuffer;
+ Matrix::resizeOrCreate(roiCpuBuffer,
+ roiValue->getHeight(),
+ roiValue->getWidth(),
+ false,
+ false);
+ inGradCpuBuffer->copyFrom(*inGradValue);
+ outGradCpuBuffer->copyFrom(*outGradValue);
+ roiCpuBuffer->copyFrom(*roiValue);
+ inGradValue = inGradCpuBuffer;
+ outGradValue = outGradCpuBuffer;
+ roiValue = roiCpuBuffer;
+ }
+
+ real* bottomROIs = roiValue->getData();
+ size_t numROIs = getInput(1).getBatchSize();
+ size_t roiOffset = getInputValue(1)->getWidth();
+
+ real* inDiffData = inGradValue->getData();
+ size_t batchOffset = getInputValue(0)->getWidth();
+ size_t channelOffset = height_ * width_;
+
+ real* outDiffData = outGradValue->getData();
+ size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
+ real* argmaxData = maxIdxs_->getData();
+
+ for (size_t n = 0; n < numROIs; ++n) {
+ size_t roiBatchIdx = bottomROIs[0];
+ real* batchDiffData = inDiffData + batchOffset * roiBatchIdx;
+ for (size_t c = 0; c < channels_; ++c) {
+ for (size_t ph = 0; ph < pooledHeight_; ++ph) {
+ for (size_t pw = 0; pw < pooledWidth_; ++pw) {
+ size_t poolIndex = ph * pooledWidth_ + pw;
+ if (argmaxData[poolIndex] > 0) {
+ size_t index = static_cast(argmaxData[poolIndex]);
+ batchDiffData[index] += outDiffData[poolIndex];
+ }
+ }
+ }
+ batchDiffData += channelOffset;
+ outDiffData += poolChannelOffset;
+ argmaxData += poolChannelOffset;
+ }
+ bottomROIs += roiOffset;
+ }
+
+ if (useGpu_) {
+ getInputGrad(0)->copyFrom(*inGradValue);
+ }
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/ROIPoolLayer.h b/paddle/gserver/layers/ROIPoolLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f07e49d6fd1eda9fa7bd46e4cec771a75f571be
--- /dev/null
+++ b/paddle/gserver/layers/ROIPoolLayer.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer used by Fast R-CNN to extract feature maps of ROIs from the last
+ * feature map.
+ * - Input: This layer needs two input layers: The first input layer is a
+ * convolution layer; The second input layer contains the ROI data
+ * which is the output of ProposalLayer in Faster R-CNN. The extracted
+ * features are later used for bbox regression and classification.
+ * - Output: The ROIs' feature map.
+ * Reference:
+ * Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
+ * Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
+ * Networks
+ */
+
+class ROIPoolLayer : public Layer {
+protected:
+ size_t channels_;
+ size_t width_;
+ size_t height_;
+ size_t pooledWidth_;
+ size_t pooledHeight_;
+ real spatialScale_;
+
+ // Since there is no int matrix, use real matrix instead.
+ MatrixPtr maxIdxs_;
+
+public:
+ explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {}
+
+ bool init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) override;
+
+ void forward(PassType passType) override;
+ void backward(const UpdateCallback& callback = nullptr) override;
+};
+} // namespace paddle
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.cpp b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aa6778aef4e893208fd064ca22e217c6c4d960f9
--- /dev/null
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionLayer.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+
+REGISTER_LAYER(scale_sub_region, ScaleSubRegionLayer);
+
+bool ScaleSubRegionLayer::init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) {
+ Layer::init(layerMap, parameterMap);
+ CHECK_EQ(static_cast(inputLayers_.size()), 2);
+ auto& conf = config_.inputs(0).scale_sub_region_conf();
+ value_ = conf.value();
+
+ createFunction(forward_, "ScaleSubRegion", FuncConfig().set("value", value_));
+ createFunction(
+ backward_, "ScaleSubRegionGrad", FuncConfig().set("value", value_));
+
+ return true;
+}
+
+void ScaleSubRegionLayer::forward(PassType passType) {
+ Layer::forward(passType);
+ auto in0 = getInput(0);
+ imgH_ = in0.getFrameHeight();
+ imgW_ = in0.getFrameWidth();
+ if (imgH_ == 0 || imgW_ == 0) {
+ auto& conf = config_.inputs(0).scale_sub_region_conf();
+ imgH_ = conf.image_conf().img_size_y();
+ imgW_ = conf.image_conf().img_size();
+ }
+ MatrixPtr imgV = in0.value;
+ size_t batchSize = imgV->getHeight();
+ size_t spatialSize = imgH_ * imgW_;
+ channelsNum_ = imgV->getWidth() / spatialSize;
+ shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_});
+
+ resetOutput(batchSize, imgV->getWidth());
+ auto& out = getOutput();
+ out.setFrameHeight(imgH_);
+ out.setFrameWidth(imgW_);
+
+ MatrixPtr indicesV = getInputValue(1);
+ indicesShape_ = TensorShape({batchSize, 6});
+
+ REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str());
+ BufferArgs inArgs;
+ BufferArgs outArgs;
+ inArgs.addArg(*imgV, shape_);
+ inArgs.addArg(*indicesV, indicesShape_);
+ outArgs.addArg(*out.value, shape_, ASSIGN_TO);
+ forward_[0]->calc(inArgs, outArgs);
+}
+
+void ScaleSubRegionLayer::backward(const UpdateCallback& callback) {
+ REGISTER_TIMER_INFO("ScaleSubRegionBackward", getName().c_str());
+ BufferArgs inArgs;
+ BufferArgs outArgs;
+ inArgs.addArg(*getOutputGrad(), shape_);
+ inArgs.addArg(*getInputValue(1), indicesShape_);
+ outArgs.addArg(*getInputGrad(0), shape_, ADD_TO);
+ backward_[0]->calc(inArgs, outArgs);
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.h b/paddle/gserver/layers/ScaleSubRegionLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a27c56de93bb6fdde0f95cd4c5abe5dfabe4e858
--- /dev/null
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief For each instance, this layer can be used to multiply a value to a
+ * specified continuous sub-region. By providing start index and end
+ * index for C/H/W, you can specify the location and shape of the
+ * region.
+ *
+ * input_0: Input value.
+ * input_1: Indices value to specify the location and shape of the
+ * region.
+ */
+class ScaleSubRegionLayer : public Layer {
+public:
+ explicit ScaleSubRegionLayer(const LayerConfig& config) : Layer(config) {}
+
+ ~ScaleSubRegionLayer() {}
+
+ bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+ void forward(PassType passType);
+
+ void backward(const UpdateCallback& callback = nullptr);
+
+protected:
+ TensorShape shape_;
+ TensorShape indicesShape_;
+ size_t imgH_;
+ size_t imgW_;
+ size_t channelsNum_;
+ real value_;
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 1a46fb49153a0aa4228f58db481b950bc2d6de83..fcbcb5b0f1f4cb07066363c9fa93fb1726459f30 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -53,7 +53,7 @@ TEST(Operator, dot_mul) {
TEST(Projection, context) {
for (auto contextStart : {-5, -3, -1, 0, 3}) {
for (auto contextLength : {1, 2, 5, 7}) {
- for (auto batchSize : {1, 2, 5, 20, 50}) {
+ for (auto batchSize : {1, 2, 5, 20}) {
for (auto trainablePadding : {false, true}) {
LOG(INFO) << " contextStart=" << contextStart
<< " contextLength=" << contextLength
@@ -585,14 +585,14 @@ TEST(Layer, maxoutLayer) {
}
void testFcLayer(string format, size_t nnz) {
TestConfig config;
- config.biasSize = 4096;
+ config.biasSize = 1024;
config.layerConfig.set_type("fc");
- config.layerConfig.set_size(4096);
+ config.layerConfig.set_size(1024);
config.layerConfig.set_active_type("sigmoid");
config.layerConfig.set_drop_rate(0.1);
config.inputDefs.push_back(
- {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
+ {INPUT_DATA, "layer_0", 2048, nnz, ParaSparse(format)});
config.layerConfig.add_inputs();
LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
@@ -609,9 +609,9 @@ void testFcLayer(string format, size_t nnz) {
}
TEST(Layer, fcLayer) {
- testFcLayer("", 4096 * 4096 * 2);
- testFcLayer("csc", 4096 * 40);
- testFcLayer("csr", 4096 * 40);
+ testFcLayer("", 1024 * 1024 * 2);
+ testFcLayer("csc", 1024 * 10);
+ testFcLayer("csr", 1024 * 10);
}
TEST(Layer, SelectiveFullyConnectedLayer) {
@@ -1995,7 +1995,7 @@ TEST(Layer, multibox_loss) {
TEST(Layer, TransLayer) {
TestConfig config;
const int height = 128;
- const int width = 1028;
+ const int width = 256;
config.layerConfig.set_type("trans");
config.layerConfig.set_size(width);
@@ -2056,6 +2056,43 @@ TEST(Layer, CropLayer) {
}
}
+TEST(Layer, roi_pool) {
+ TestConfig config;
+ config.layerConfig.set_type("roi_pool");
+ config.biasSize = 0;
+ LayerInputConfig* input = config.layerConfig.add_inputs();
+ ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf();
+ roiPoolConf->set_pooled_width(7);
+ roiPoolConf->set_pooled_height(7);
+ roiPoolConf->set_spatial_scale(1. / 16);
+ roiPoolConf->set_width(14);
+ roiPoolConf->set_height(14);
+
+ const size_t roiNum = 10;
+ const size_t roiDim = 10;
+ const size_t batchSize = 5;
+ MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false);
+ roiValue->zeroMem();
+ real* roiData = roiValue->getData();
+ for (size_t i = 0; i < roiNum; ++i) {
+ roiData[i * roiDim + 0] = std::rand() % batchSize;
+ roiData[i * roiDim + 1] = std::rand() % 224; // xMin
+ roiData[i * roiDim + 2] = std::rand() % 224; // yMin
+ size_t xMin = static_cast(roiData[i * roiDim + 1]);
+ size_t yMin = static_cast(roiData[i * roiDim + 2]);
+ roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin); // xMax
+ roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin); // yMax
+ }
+
+ config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
+ config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
+ config.layerConfig.add_inputs();
+
+ for (auto useGpu : {false, true}) {
+ testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false);
+ }
+}
+
TEST(Layer, SwitchOrderLayer) {
TestConfig config;
// config input_0
@@ -2358,6 +2395,38 @@ TEST(Layer, ScaleShiftLayer) {
}
}
+TEST(Layer, ScaleSubRegionLayer) {
+ const size_t batchSize = 64;
+ const size_t size = 4096;
+ TestConfig config;
+ config.layerConfig.set_type("scale_sub_region");
+ config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+ MatrixPtr indicesV = Matrix::create(batchSize, 6, false, false);
+ auto* data = indicesV->getData();
+ for (size_t i = 0; i < batchSize; ++i) {
+ data[i * 2] = 2;
+ data[i * 2 + 1] = 4;
+ data[i * 2 + 2] = 16;
+ data[i * 2 + 3] = 32;
+ data[i * 2 + 4] = 16;
+ data[i * 2 + 5] = 32;
+ }
+ config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}});
+ LayerInputConfig* input = config.layerConfig.add_inputs();
+ ScaleSubRegionConfig* scaleSubRegionConf =
+ input->mutable_scale_sub_region_conf();
+ ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf();
+ imgConf->set_img_size(32);
+ imgConf->set_img_size_y(32);
+ imgConf->set_channels(4);
+ scaleSubRegionConf->set_value(2.0);
+ config.layerConfig.add_inputs();
+
+ for (auto useGpu : {false, true}) {
+ testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false);
+ }
+}
+
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index 2e8d9f3333b36005c9b3b28449c76a4a44c74cc6..a859e34c8996d81f14bf1edcb6e23d5a4f687e6b 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -269,6 +269,7 @@ void testBatchNormLayer(const testBatchNormDesc& pm) {
TEST(MKLDNNLayer, BatchNormLayer) {
testBatchNormLayer({4, 10, 6, 6});
testBatchNormLayer({16, 32, 16, 16});
+ testBatchNormLayer({4, 16, 8, 10});
}
struct testImageDesc {
@@ -296,17 +297,12 @@ static void getAddtoConfig(TestConfig& cfg,
}
void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
- CHECK_GE(nInputs, 1);
+ CHECK_GE(nInputs, 1UL);
TestConfig dnnConfig;
getAddtoConfig(dnnConfig, pm, nInputs);
dnnConfig.layerConfig.set_type("mkldnn_addto");
- // TODO(TJ): test with bias
- for (auto withBias : {false}) {
- if (withBias) {
- dnnConfig.biasSize = pm.ic * pm.ih * pm.iw;
- } else {
- dnnConfig.biasSize = 0;
- }
+ for (auto withBias : {false, true}) {
+ dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0;
RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
}
}
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
index 21a8f73c3e650d4b3c3b86247594cd965f4ead35..a710479bab82ed52122cf59bb14a05ccbd4aa05c 100644
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -152,12 +152,7 @@ void MKLDNNMatrix::downSpatial() {
}
memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
- mkldnn_primitive_t result;
- mkldnn::error::wrap_c_api(
- mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
- "could not create a memory primitive");
- reset(result);
- set_data_handle(data_);
+ resetMKLDNNMemory(pd, data_);
}
} // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index 54cfefe23b3dc70fd12fd2ca8886c941047b59f7..39d40a1f61609a649d3341c170d24b0604921ac2 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -145,6 +145,27 @@ public:
m_.reset();
}
+ /**
+ * override the CpuMatrix::resize
+ */
+ void resize(size_t newHeight, size_t newWidth) override {
+ m_->resize(newHeight, newWidth);
+ if (data_ == m_->getData() && elementCnt_ == newHeight * newWidth) {
+ return;
+ }
+ CpuMatrix::setData(data_);
+ height_ = newHeight;
+ width_ = newWidth;
+ elementCnt_ = newHeight * newWidth;
+ stride_ = width_;
+ auto pd = mkldnn::memory::primitive_desc(
+ mkldnn::memory::desc({(int)newHeight, (int)newWidth},
+ getDtype(),
+ mkldnn::memory::format::nc),
+ getEngine());
+ resetMKLDNNMemory(pd, data_);
+ }
+
/**
* override Matrix::getData
* check data before return
@@ -215,6 +236,17 @@ protected:
memory::format srcFmt,
memory::format dstFmt,
memory::dims dm);
+ /**
+ * reset this MKLDNN Memory from primitive desc
+ */
+ void resetMKLDNNMemory(memory::primitive_desc pd, real* data) {
+ mkldnn_primitive_t result;
+ mkldnn::error::wrap_c_api(
+ mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
+ "could not create a memory primitive");
+ reset(result);
+ set_data_handle(data);
+ }
private:
// save the CpuMatrixPtr in case the buffer released outside
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index c2f17beeb87942ea681f5d388659c0d280157b26..ba86eacbb5d53ee43a60d2cd1dd922333a5d48f0 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -206,7 +206,7 @@ double dotProduct(const int n, const double* x, const double* y) {
}
#endif
-#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)
+#if defined(PADDLE_USE_MKLML)
template <>
void vExp(const int n, const float* a, float* r) {
@@ -295,38 +295,6 @@ template void vAdd(const int n, const double* a, const double* b, double* r);
#endif
-#ifdef PADDLE_USE_MKL
-template <>
-void vInvSqrt(const int n, const float* a, float* r) {
- vsInvSqrt(n, a, r);
-}
-
-template <>
-void vInvSqrt(const int n, const double* a, double* r) {
- vdInvSqrt(n, a, r);
-}
-
-template <>
-void vLog1p(const int n, const float* a, float* r) {
- vsLog1p(n, a, r);
-}
-
-template <>
-void vLog1p(const int n, const double* a, double* r) {
- vdLog1p(n, a, r);
-}
-
-template <>
-void vTanh(const int n, const float* a, float* r) {
- vsTanh(n, a, r);
-}
-
-template <>
-void vTanh(const int n, const double* a, double* r) {
- vdTanh(n, a, r);
-}
-#else
-
DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
template
void vInvSqrt(const int n, const T* a, T* r) {
@@ -357,6 +325,4 @@ template void vLog1p(const int n, const double* a, double* r);
template void vTanh(const int n, const float* a, float* r);
template void vTanh(const int n, const double* a, double* r);
-#endif
-
} // namespace paddle
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index 8193aa4adffc0409d8ea68417c68fa153a2942d8..f6e77029bdd75a602f88b688ca810f47ba4ee615 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -21,11 +21,6 @@ limitations under the License. */
#include
#endif
-#ifdef PADDLE_USE_MKL
-#include
-#include
-#endif
-
#if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB)
extern "C" {
#include
diff --git a/paddle/math/tests/TensorCheck.h b/paddle/math/tests/TensorCheck.h
index 5bc4a03067a75527fa30e5bb5526f93dc7b9fdcc..b998e5772e70d0a0ec79dc4064dcbaa2c302efd2 100644
--- a/paddle/math/tests/TensorCheck.h
+++ b/paddle/math/tests/TensorCheck.h
@@ -169,7 +169,7 @@ void TensorCheck(AssertEq compare,
count++;
}
}
- EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+ EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
}
template
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index eae87a5141ef1284630170b07d22a0cf9cd977b0..709f7de2e43093114d096cbfca5b5d49293a6d3e 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -195,8 +195,13 @@ op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute)
-op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
- DEPS net_op tensor_array)
+if(WITH_TESTING)
+ op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
+ DEPS net_op tensor_array gtest)
+else()
+ op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
+ DEPS net_op tensor_array)
+endif()
op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
@@ -209,6 +214,7 @@ set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
+cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc
rnn/recurrent_op_utils.cc
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
index d0c4c0d25d6f4e3ab7acd72d62a8a17fa102637b..1776f33105367447759aa91c25263dfc53bd2f99 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -65,7 +65,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel {
size_t num_samples = inference->dims()[0];
size_t infer_width = inference->dims()[1];
- cudaMemset((void**)&accuracy_data, 0, sizeof(float));
+ PADDLE_ENFORCE(cudaMemset(accuracy_data, 0, sizeof(float)));
if (num_samples == 0) {
return;
diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h
index 1968b53d19acfddaa514eca6e24d98a298d8d311..28dbc77f64842a62e88ae8df4ead7adc3b03764b 100644
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
@@ -14,7 +14,6 @@ limitations under the License. */
#pragma once
#include
-#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
@@ -22,18 +21,6 @@ namespace operators {
using Tensor = framework::Tensor;
-template
-using EigenMatrix = framework::EigenMatrix;
-
-template
-using EigenVector = framework::EigenVector;
-
-template
-using EigenScalar = framework::EigenScalar;
-
template
class AccuracyKernel : public framework::OpKernel {
public:
diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..666043e824f885e9c0e79e319d0a38ba108c209a
--- /dev/null
+++ b/paddle/operators/array_operator.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+class ArrayOp : public framework::OperatorBase {
+ public:
+ ArrayOp(const std::string &type, const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+ size_t GetOffset(const framework::Scope &scope,
+ const platform::DeviceContext &dev_ctx) const {
+ auto *i = scope.FindVar(Input("I"));
+ PADDLE_ENFORCE(i != nullptr, "I must be set");
+ auto &i_tensor = i->Get();
+ PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
+ size_t offset;
+ if (platform::is_gpu_place(i_tensor.place())) {
+ // FIXME: Avoid copy from GPU to CPU
+ framework::Tensor t;
+ t.CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx);
+ dev_ctx.Wait();
+ offset = static_cast(*t.data());
+ } else {
+ offset = static_cast(*i_tensor.data());
+ }
+ return offset;
+ }
+};
+
+} // namespace operators
+} // namespace paddle
diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc
index 6cd9c06b8ae3d3b17be83268c2f5d4002705b111..c0903bb4e5ca7f160e19eefab99af7e3e4a8ed76 100644
--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -140,6 +140,23 @@ class ArrayToLoDTensorInferShape : public framework::InferShapeBase {
"ArrayToLoDTensorOp must has input X.");
PADDLE_ENFORCE(context->HasInput("RankTable"),
"ArrayToLoDTensorOp must has input RankTable.");
+ context->SetOutputDim("Out", context->GetInputDim("X"));
+ }
+};
+
+class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+ using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+ std::unique_ptr Apply() const override {
+ auto *grad_op = new framework::OpDescBind();
+ grad_op->SetType("lod_tensor_to_array");
+ grad_op->SetInput("X", OutputGrad("Out"));
+ grad_op->SetInput("RankTable", Input("RankTable"));
+ grad_op->SetOutput("Out", InputGrad("X"));
+ grad_op->SetAttrMap(Attrs());
+ return std::unique_ptr(grad_op);
}
};
@@ -149,4 +166,5 @@ class ArrayToLoDTensorInferShape : public framework::InferShapeBase {
namespace ops = paddle::operators;
REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp,
ops::ArrayToLoDTensorOpProtoMaker,
- ops::ArrayToLoDTensorInferShape);
+ ops::ArrayToLoDTensorInferShape,
+ ops::ArrayToLoDTensorGradMaker);
diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..609e915b932e2bc4d5abee1e5f868cc07a7619d3
--- /dev/null
+++ b/paddle/operators/assign_op.cc
@@ -0,0 +1,138 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/var_type.h"
+
+namespace paddle {
+namespace operators {
+class AssignFunctor {
+ public:
+ AssignFunctor(framework::Variable *out,
+ const platform::DeviceContext &dev_ctx)
+ : out_(out), dev_ctx_(dev_ctx) {}
+
+ void operator()(const framework::LoDTensor &lod_tensor) const {
+ auto &out_tensor = *out_->GetMutable();
+ copy_tensor(lod_tensor, &out_tensor);
+ }
+
+ void operator()(const framework::LoDTensorArray &array) const {
+ auto &out_array = *out_->GetMutable();
+ out_array.resize(array.size());
+ for (size_t i = 0; i < array.size(); ++i) {
+ copy_tensor(array[i], &out_array[i]);
+ }
+ }
+
+ void operator()(const framework::SelectedRows &rows) const {
+ framework::SelectedRows &out_rows =
+ *out_->GetMutable();
+ out_rows.set_rows(rows.rows());
+ out_rows.set_height(rows.height());
+ auto &t = rows.value();
+ out_rows.mutable_value()->CopyFrom(t, t.place(), dev_ctx_);
+ }
+
+ template
+ void operator()(const T &v) const {
+ PADDLE_THROW("Not support type for assign op %s", typeid(T).name());
+ }
+
+ private:
+ void copy_tensor(const framework::LoDTensor &lod_tensor,
+ framework::LoDTensor *out) const {
+ auto &out_tensor = *out;
+ out_tensor.CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_);
+ out_tensor.set_lod(lod_tensor.lod());
+ }
+
+ framework::Variable *out_;
+ const platform::DeviceContext &dev_ctx_;
+};
+
+class AssignOp : public framework::OperatorBase {
+ public:
+ AssignOp(const std::string &type, const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : OperatorBase(type, inputs, outputs, attrs) {}
+ void Run(const framework::Scope &scope,
+ const platform::DeviceContext &dev_ctx) const override {
+ auto *x = scope.FindVar(Input("X"));
+ if (x == nullptr) {
+ return;
+ }
+ auto *out = scope.FindVar(Output("Out"));
+ PADDLE_ENFORCE(
+ out != nullptr,
+ "The Output(Out) should not be null if the Input(X) is set.");
+ framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
+ }
+};
+
+class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ AssignOpProtoMaker(framework::OpProto *proto,
+ framework::OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("X",
+ "(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
+ "could be LoDTensor, SelectedRows or LoDTensorArray.")
+ .AsDispensable();
+ AddOutput("Out",
+ "(LoDTensor, SelectedRows or LoDTensorArray) The type of output "
+ "is the same as input X.");
+ AddComment(R"DOC(Assign Operator
+
+Out = X, when type in [LoDTensor/SelectedRows/LoDTensorArray]
+raise error if the type is not listed above.
+)DOC");
+ }
+};
+
+class AssignInferShape : public framework::InferShapeBase {
+ public:
+ void operator()(framework::InferShapeContext *context) const override {
+ if (context->HasInput("X")) {
+ auto type = context->GetInputsVarType("X")[0];
+ if (type == framework::VarDesc_VarType_SELECTED_ROWS ||
+ type == framework::VarDesc_VarType_LOD_TENSOR) {
+ context->SetOutputDim("Out", context->GetInputDim("X"));
+ }
+ }
+ }
+};
+
+class AssignGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+ using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+ std::unique_ptr Apply() const override {
+ auto *op = new framework::OpDescBind();
+ op->SetType("assign");
+ op->SetInput("X", OutputGrad("Out"));
+ op->SetOutput("Out", InputGrad("X"));
+ return std::unique_ptr(op);
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker,
+ ops::AssignInferShape, ops::AssignOpProtoMaker);
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index 8721ca352848fc4d69b206d4ea0ab7c581c8d055..f884e6efa917ce3f8554dce0e248f2b29273e3f3 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -19,9 +19,6 @@ namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
-template
-using EigenMatrix = framework::EigenMatrix;
template
using EigenArrayMap =
diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1ba4dfcdaba498bfef98258f03664afebe14ec18
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -0,0 +1,110 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/beam_search_decode_op.h"
+
+namespace paddle {
+namespace operators {
+
+class BeamSearchDecodeOp : public framework::OperatorBase {
+ public:
+ BeamSearchDecodeOp(const std::string& type,
+ const framework::VariableNameMap& inputs,
+ const framework::VariableNameMap& outputs,
+ const framework::AttributeMap& attrs)
+ : OperatorBase(type, inputs, outputs, attrs) {}
+ void Run(const framework::Scope& scope,
+ const platform::DeviceContext& dev_ctx) const override {
+ framework::ExecutionContext ctx(*this, scope, dev_ctx);
+ const LoDTensorArray* ids = ctx.Input("Ids");
+ const LoDTensorArray* scores = ctx.Input("Scores");
+ const size_t step_num = ids->size();
+ PADDLE_ENFORCE_GT(step_num, 0UL,
+ "beam search steps should be larger than 0");
+ const size_t source_num = ids->at(0).lod().at(0).size() - 1;
+ PADDLE_ENFORCE_GT(source_num, 0UL, "source num should be larger than 0");
+
+ for (size_t i = 0; i < step_num; ++i) {
+ PADDLE_ENFORCE_EQ(ids->at(i).lod().size(), 2UL,
+ "Level of LodTensor should be 2");
+ }
+
+ // prepare output
+ LoDTensor* sentenceIds = ctx.Output("SentenceIds");
+ LoDTensor* sentenceScores = ctx.Output("SentenceScores");
+
+ BeamSearchDecoder beam_search_decoder;
+ beam_search_decoder.PackAllSteps(*ids, *scores, sentenceIds,
+ sentenceScores);
+ }
+};
+
+class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ BeamSearchDecodeOpProtoMaker(framework::OpProto* proto,
+ framework::OpAttrChecker* op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("Ids",
+ "(LodTensorArray)"
+ "score of the candidate words in each step");
+ AddInput("Scores",
+ "(LodTensorArray)"
+ "score of the candidate words in each step");
+ AddOutput("SentenceIds",
+ "(LodTensor)"
+ "All possible result sentences of word ids");
+ AddOutput("SentenceScores",
+ "(LodTensor)"
+ "All possible result sentences of word scores");
+ AddComment(R"DOC(
+Pack the result of Beam search op into SentenceIds and SentenceScores.
+)DOC");
+ }
+};
+
+class BeamSearchDecodeInferShape : public framework::InferShapeBase {
+ public:
+ void operator()(framework::InferShapeContext* context) const override {
+ PADDLE_ENFORCE(context->HasInput("Ids"),
+ "BeamSearchDecodeOp must has input Ids");
+ PADDLE_ENFORCE(context->HasInput("Scores"),
+ "BeamSearchDecodeOp must has input Scores");
+ PADDLE_ENFORCE(context->HasOutput("SentenceIds"),
+ "BeamSearchDecodeOp must has output SentenceIds");
+ PADDLE_ENFORCE(context->HasOutput("SentenceScores"),
+ "BeamSearchDecodeOp must has output SentenceScores");
+ }
+};
+
+class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
+ public:
+ void operator()(const framework::OpDescBind& op_desc,
+ framework::BlockDescBind* block) const override {
+ for (auto& o : op_desc.Output("SentenceIds")) {
+ block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR);
+ }
+ for (auto& o : op_desc.Output("SentenceScores")) {
+ block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR);
+ }
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+REGISTER_OPERATOR(beam_search_decode, paddle::operators::BeamSearchDecodeOp,
+ paddle::operators::BeamSearchDecodeOpProtoMaker,
+ paddle::operators::BeamSearchDecodeInferShape,
+ paddle::operators::BeamSearchDecodeInferVarType,
+ paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/beam_search_decode_op.h b/paddle/operators/beam_search_decode_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f007ec22f9a66572971516a711317f348e1ec5a
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op.h
@@ -0,0 +1,280 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using LoDTensorArray = framework::LoDTensorArray;
+
+// All the LoDs have two levels.
+// The first is the source level, the second is the sentence level.
+// The source level describes how many candidate words each source has.
+// The sentence level describes which prefix these candidates belong to.
+const size_t kSourceLevel = 0;
+const size_t kSentenceLevel = 1;
+
+template
+struct BeamNode {
+ BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {}
+
+ ~BeamNode() {
+ if (parent_) {
+ parent_->DropKid(this);
+ if (parent_->kids_.size() == 0UL) {
+ delete parent_;
+ }
+ }
+ VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_;
+ }
+
+ void AppendTo(BeamNode* parent) {
+ parent_ = parent;
+ parent->kids_.insert(this);
+ }
+
+ void DropKid(BeamNode* kid) { kids_.erase(kid); }
+
+ BeamNode* parent_ = nullptr;
+ std::unordered_set kids_;
+ int64_t word_id_;
+ T score_;
+};
+
+template
+using BeamNodeVector = std::vector>>;
+
+template
+struct Sentence {
+ std::vector word_ids;
+ std::vector scores;
+};
+
+template
+using SentenceVector = std::vector>;
+
+template
+struct BeamSearchDecoder {
+ /**
+   * Make a BeamNode and all of its related prefix BeamNodes into a Sentence.
+ */
+ Sentence MakeSentence(const BeamNode* node) const;
+
+ /**
+ * Param:
+ * cur_ids: LoDTensor of One step for word ID
+ * cur_scores: LoDTensor of One Step for word score
+ * prefixes_list: prefixes for each source sentence.
+ * sentence_vector_list: result sentence_vector for each source sentence.
+ * Return:
+ * a new prefixes list for each source of current step
+ */
+ std::vector> PackTwoSteps(
+ const LoDTensor& cur_ids, const LoDTensor& cur_scores,
+ std::vector>& prefixes_list,
+ std::vector>* sentence_vector_list) const;
+
+ /**
+   * Convert the result sentence_vector for each source sentence into two
+   * LoDTensors.
+ * One is all candidate sentences with word id, one is all candidate sentences
+ * with word score.
+ * Param:
+ * sentence_vector_list: sentence_vector for each source sentence.
+ * id_tensor: result LoDTensor for sentences of id.
+ * score_tensor: result LoDTensor for sentences of score.
+ */
+ void ConvertSentenceVectorToLodTensor(
+ std::vector> sentence_vector_list, LoDTensor* id_tensor,
+ LoDTensor* score_tensor) const;
+
+ /**
+   * Pack all steps of the id/score LoDTensors into sentence LoDTensors.
+   * Its main logic is:
+ * ```python
+ * prefix
+ * result_sentence
+ * result_lod_tensor
+ *
+ * for (step in steps):
+ * prefix = PackTwoSteps(prefix, step, &result_sentence)
+ * ConvertSentenceVectorToLodTensor(result_sentence, &result_lod_tensor)
+ * ```
+ */
+ void PackAllSteps(const LoDTensorArray& step_ids,
+ const LoDTensorArray& step_scores, LoDTensor* id_tensor,
+ LoDTensor* score_tensor) const;
+};
+
+template
+Sentence BeamSearchDecoder::MakeSentence(const BeamNode* node) const {
+ Sentence sentence;
+ while (node != nullptr) {
+ sentence.word_ids.emplace_back(node->word_id_);
+ sentence.scores.emplace_back(node->score_);
+ node = node->parent_;
+ }
+
+ std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids));
+ std::reverse(std::begin(sentence.scores), std::end(sentence.scores));
+
+ return sentence;
+}
+
+template
+std::vector> BeamSearchDecoder::PackTwoSteps(
+ const LoDTensor& cur_ids, const LoDTensor& cur_scores,
+ std::vector>& prefixes_list,
+ std::vector>* sentence_vector_list) const {
+ std::vector> result;
+
+ for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1;
+ ++src_idx) {
+ size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx];
+ size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
+
+ BeamNodeVector beam_nodes;
+
+    // If the prefixes list is empty, this is the first step. In this step,
+    // every candidate id is the start of a candidate sentence.
+ if (prefixes_list.empty()) {
+ PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(),
+ cur_ids.lod().at(kSentenceLevel).back(),
+ "in the first step");
+ for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) {
+ beam_nodes.push_back(std::unique_ptr>(new BeamNode(
+ cur_ids.data()[id_idx], cur_scores.data()[id_idx])));
+ }
+ } else {
+ BeamNodeVector& prefixes = prefixes_list[src_idx];
+ SentenceVector& sentence_vector = (*sentence_vector_list)[src_idx];
+
+ PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(),
+ "prefix and candidate set number should be the same");
+
+ auto candidate_offset = cur_ids.lod()[kSentenceLevel];
+ for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) {
+ std::unique_ptr>& prefix = prefixes[prefix_idx];
+ size_t candidate_start = candidate_offset[src_start + prefix_idx];
+ size_t candidate_end = candidate_offset[src_start + prefix_idx + 1];
+ if (candidate_start == candidate_end) {
+ VLOG(3) << "this sentence has no more candidate, "
+ "add to result sentence and rm it from beam tree";
+ sentence_vector.push_back(MakeSentence(prefix.get()));
+ prefix.reset();
+ } else {
+ for (size_t candidate_idx = candidate_start;
+ candidate_idx < candidate_end; ++candidate_idx) {
+ auto* candidate =
+ new BeamNode(cur_ids.data()[candidate_idx],
+ cur_scores.data()[candidate_idx]);
+ candidate->AppendTo(prefix.get());
+ beam_nodes.push_back(std::unique_ptr>(candidate));
+ }
+ prefix.release();
+ }
+ }
+ }
+ result.push_back(std::move(beam_nodes));
+ }
+ return result;
+}
+
+template
+void BeamSearchDecoder::ConvertSentenceVectorToLodTensor(
+ std::vector> sentence_vector_list, LoDTensor* id_tensor,
+ LoDTensor* score_tensor) const {
+ size_t src_num = sentence_vector_list.size();
+
+ PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0");
+
+ std::vector source_level_lod = {0};
+ std::vector sentence_level_lod = {0};
+ std::vector id_data;
+ std::vector score_data;
+
+ for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
+ for (Sentence& sentence : sentence_vector_list[src_idx]) {
+ id_data.insert(id_data.end(), sentence.word_ids.begin(),
+ sentence.word_ids.end());
+ score_data.insert(score_data.end(), sentence.scores.begin(),
+ sentence.scores.end());
+ sentence_level_lod.push_back(sentence_level_lod.back() +
+ sentence.word_ids.size());
+ }
+ source_level_lod.push_back(source_level_lod.back() +
+ sentence_vector_list[src_idx].size());
+ }
+
+ auto cpu_place = new paddle::platform::CPUPlace();
+ paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
+
+ framework::LoD lod;
+ lod.push_back(source_level_lod);
+ lod.push_back(sentence_level_lod);
+
+ id_tensor->set_lod(lod);
+ id_tensor->Resize({static_cast(id_data.size())});
+ id_tensor->mutable_data(paddle::platform::CPUPlace());
+ id_tensor->CopyFromVector(id_data, cpu_ctx);
+
+ score_tensor->set_lod(lod);
+ score_tensor->Resize({static_cast(score_data.size())});
+ score_tensor->mutable_data(paddle::platform::CPUPlace());
+ score_tensor->CopyFromVector(score_data, cpu_ctx);
+}
+
+template
+void BeamSearchDecoder::PackAllSteps(const LoDTensorArray& step_ids,
+ const LoDTensorArray& step_scores,
+ LoDTensor* id_tensor,
+ LoDTensor* score_tensor) const {
+ PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0");
+ PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(),
+ "step_ids and step_scores should be the same");
+ const size_t step_num = step_ids.size();
+ const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
+
+ PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0");
+
+  // Previous prefixes for each step; the initial length is 0,
+  // which means this is the first step.
+ std::vector> beamnode_vector_list(0);
+ std::vector> sentence_vector_list(src_num);
+
+ // pack all steps for one batch first, then another batch
+ for (size_t step_id = 0; step_id < step_num; ++step_id) {
+ beamnode_vector_list =
+ PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id),
+ beamnode_vector_list, &sentence_vector_list);
+ }
+ // append last beam_node to result
+ for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
+ for (auto& beam_node : beamnode_vector_list.at(src_idx)) {
+ sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get()));
+ beam_node.reset();
+ }
+ }
+
+ ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor,
+ score_tensor);
+}
+
+} // namespace operators
+} // namespace paddle
diff --git a/paddle/operators/beam_search_decode_op_test.cc b/paddle/operators/beam_search_decode_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ac23991f3c7768abaf94f3a4b750697de0ef114
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op_test.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/beam_search_decode_op.h"
+#include "gtest/gtest.h"
+
+using CPUPlace = paddle::platform::CPUPlace;
+using LoD = paddle::framework::LoD;
+using LoDTensor = paddle::framework::LoDTensor;
+using LoDTensorArray = paddle::framework::LoDTensorArray;
+
+template
+using BeamNode = paddle::operators::BeamNode;
+template
+using BeamSearchDecoder = paddle::operators::BeamSearchDecoder;
+template
+using Sentence = paddle::operators::Sentence;
+template
+using BeamNodeVector = paddle::operators::BeamNodeVector;
+template
+using SentenceVector = paddle::operators::SentenceVector;
+
+namespace paddle {
+namespace test {
+
+void GenerateExample(const std::vector& level_0,
+ const std::vector& level_1,
+ const std::vector& data, LoDTensorArray* ids,
+ LoDTensorArray* scores) {
+ PADDLE_ENFORCE_EQ(level_0.back(), level_1.size() - 1,
+ "source level is used to describe candidate set");
+ PADDLE_ENFORCE_EQ(level_1.back(), data.size(),
+ "the lowest level is used to describe data"
+ ", so it's last element should be data length");
+
+ CPUPlace place;
+
+ LoD lod;
+ lod.push_back(level_0);
+ lod.push_back(level_1);
+
+ // Ids
+ LoDTensor tensor_id;
+ tensor_id.set_lod(lod);
+ tensor_id.Resize({static_cast(data.size())});
+ // malloc memory
+ int64_t* id_ptr = tensor_id.mutable_data(place);
+ for (size_t i = 0; i < data.size(); ++i) {
+ id_ptr[i] = static_cast(data.at(i));
+ }
+
+ // Scores
+ LoDTensor tensor_score;
+ tensor_score.set_lod(lod);
+ tensor_score.Resize({static_cast(data.size())});
+ // malloc memory
+ float* score_ptr = tensor_score.mutable_data(place);
+ for (size_t i = 0; i < data.size(); ++i) {
+ score_ptr[i] = static_cast(data.at(i));
+ }
+
+ ids->push_back(tensor_id);
+ scores->push_back(tensor_score);
+}
+
+} // namespace test
+} // namespace paddle
+
+TEST(BeamSearchDecodeOp, DeleteBeamNode) {
+ auto* root = new BeamNode(0, 0);
+ auto* b1 = new BeamNode