diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 8fdc382f0c1c453a01dba884a3dad216e1c3092c..b21fc43904d9aafe9f7d019dfbe5b1c0d3f9e2d6 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -1,17 +1,12 @@
# Find the CBlas and lapack libraries
#
-# It will search MKL, atlas, OpenBlas, reference-cblas in order.
+# It will search MKLML, atlas, OpenBlas, reference-cblas in order.
#
# If any cblas implementation is found, the following variables will be set.
-# CBLAS_PROVIDER # one of MKL, ATLAS, OPENBLAS, REFERENCE
+# CBLAS_PROVIDER # one of MKLML, ATLAS, OPENBLAS, REFERENCE
# CBLAS_INC_DIR # the include directory for cblas.
# CBLAS_LIBS # a list of libraries that should be linked by paddle.
# # Each library should be a full path to an object file.
-#
-# User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT
-# during cmake. If none of them set, it will try to find cblas implementation in
-# system paths.
-#
set(CBLAS_FOUND OFF)
@@ -30,44 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
return()
endif()
-## Then find MKL.
-set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs")
-set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL")
-
-set(MKL_INCLUDE_SEARCH_PATHS
- ${MKL_ROOT}/include
- ${INTEL_MKL_ROOT}/include)
-set(MKL_LIB_SEARCH_PATHS
- ${MKL_ROOT}/lib
- ${MKL_ROOT}/lib/intel64
- ${INTEL_MKL_ROOT}/lib
- ${INTEL_MKL_ROOT}/lib/intel64)
-
-find_path(MKL_INC_DIR mkl.h PATHS
- ${MKL_INCLUDE_SEARCH_PATHS})
-find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
- ${MKL_INCLUDE_SEARCH_PATHS})
-find_library(MKL_CORE_LIB NAMES mkl_core PATHS
- ${MKL_LIB_SEARCH_PATHS})
-find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
- ${MKL_LIB_SEARCH_PATHS})
-find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
- ${MKL_LIB_SEARCH_PATHS})
-
-if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
- set(CBLAS_FOUND ON)
- set(CBLAS_PROVIDER MKL)
- set(CBLAS_INC_DIR ${MKL_INC_DIR} ${MKL_LAPACK_INC_DIR})
- set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} ${MKL_SEQUENTIAL_LIB} ${MKL_CORE_LIB})
-
- add_definitions(-DPADDLE_USE_MKL)
- add_definitions(-DLAPACK_FOUND)
-
- message(STATUS "Found MKL (include: ${MKL_INC_DIR}, library: ${CBLAS_LIBRARIES})")
- message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})")
- return()
-endif()
-
## Then find atlas.
set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
set(ATLAS_INCLUDE_SEARCH_PATHS
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 9686df00219001769d074ee815d9cc8db0258496..5a06825beb73e85d8a55b7b578b187bee2c4340c 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -46,16 +46,20 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
ENDIF()
+SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
+SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow")
ExternalProject_Add(
${MKLDNN_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
- GIT_TAG "v0.10"
+ GIT_TAG "v0.11"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT}
+ CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
+ CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-DMKLROOT:PATH=${MKLDNN_MKLROOT}
)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 74f3279831357c21038df133df0f5a432a6dfd20..20dbc32a738d982df2d3f035206279c82c8de264 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -27,8 +27,8 @@ ENDIF()
INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
-SET(MKLML_VER "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
+SET(MKLML_VER "mklml_lnx_2018.0.1.20171007")
+SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 3f86e456cfbe55fe47e5b18e755e34829ebe9930..05d83ad58ef8485d36829e7aeede79f625cfdc43 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -86,7 +86,7 @@ IF(NOT ${CBLAS_FOUND})
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
)
-
+ SET(CBLAS_PROVIDER OPENBLAS)
IF(WITH_C_API)
INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
# Because libopenblas.a is a symbolic link of another library, thus need to
@@ -115,7 +115,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-IF(${CBLAS_PROVIDER} MATCHES MKL)
+IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
ADD_LIBRARY(cblas SHARED ${dummyfile})
ELSE()
ADD_LIBRARY(cblas STATIC ${dummyfile})
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index c311783aa3187678c31c27ddbbd074790ca444f3..b9c1dde97bc444d793d67ff622fd6b13c6435a9a 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -93,7 +93,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
if(NOT APPLE AND NOT ANDROID)
find_package(Threads REQUIRED)
link_libraries(${CMAKE_THREAD_LIBS_INIT})
- set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt")
+ set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
endif(NOT APPLE AND NOT ANDROID)
function(merge_static_libs TARGET_NAME)
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index d4e9d53e5c0955912a594fe8cd9cd41a4080a2d2..203506d7ab84e5a5be2232b077eac2d433a99766 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -82,6 +82,11 @@ maxout
.. autoclass:: paddle.v2.layer.maxout
:noindex:
+roi_pool
+--------
+.. autoclass:: paddle.v2.layer.roi_pool
+ :noindex:
+
Norm Layer
==========
diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst
index fef87c4fbdb452771ecdb361c6eeae5b32bcee14..b56c7332cc284649c7e04328e51a7faa78593a39 100644
--- a/doc/api/v2/data.rst
+++ b/doc/api/v2/data.rst
@@ -2,112 +2,9 @@
Data Reader Interface and DataSets
==================================
+.. toctree::
+ :maxdepth: 1
-DataTypes
-=========
-
-.. automodule:: paddle.v2.data_type
- :members:
- :noindex:
-
-DataFeeder
-==========
-
-.. automodule:: paddle.v2.data_feeder
- :members:
- :noindex:
-
-Reader
-======
-
-.. automodule:: paddle.v2.reader
- :members:
- :noindex:
-
-.. automodule:: paddle.v2.reader.creator
- :members:
- :noindex:
-
-minibatch
-=========
-
-.. automodule:: paddle.v2.minibatch
- :members:
- :noindex:
-
-Dataset
-=======
-
-.. automodule:: paddle.v2.dataset
- :members:
- :noindex:
-
-mnist
-+++++
-
-.. automodule:: paddle.v2.dataset.mnist
- :members:
- :noindex:
-
-cifar
-+++++
-
-.. automodule:: paddle.v2.dataset.cifar
- :members:
- :noindex:
-
-conll05
-+++++++
-
-.. automodule:: paddle.v2.dataset.conll05
- :members: get_dict,get_embedding,test
- :noindex:
-
-imdb
-++++
-
-.. automodule:: paddle.v2.dataset.imdb
- :members:
- :noindex:
-
-imikolov
-++++++++
-
-.. automodule:: paddle.v2.dataset.imikolov
- :members:
- :noindex:
-
-movielens
-+++++++++
-
-.. automodule:: paddle.v2.dataset.movielens
- :members:
- :noindex:
-
-.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
- :noindex:
-
-.. autoclass:: paddle.v2.dataset.movielens.UserInfo
- :noindex:
-
-sentiment
-+++++++++
-
-.. automodule:: paddle.v2.dataset.sentiment
- :members:
- :noindex:
-
-uci_housing
-+++++++++++
-
-.. automodule:: paddle.v2.dataset.uci_housing
- :members:
- :noindex:
-
-wmt14
-+++++
-
-.. automodule:: paddle.v2.dataset.wmt14
- :members:
- :noindex:
-
+ data/data_reader.rst
+ data/image.rst
+ data/dataset.rst
diff --git a/doc/api/v2/data/data_reader.rst b/doc/api/v2/data/data_reader.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2ccfec9c284877a7576e9751526b169a4ac78d8e
--- /dev/null
+++ b/doc/api/v2/data/data_reader.rst
@@ -0,0 +1,36 @@
+=====================
+Data Reader Interface
+=====================
+
+
+DataTypes
+=========
+
+.. automodule:: paddle.v2.data_type
+ :members:
+ :noindex:
+
+DataFeeder
+==========
+
+.. automodule:: paddle.v2.data_feeder
+ :members:
+ :noindex:
+
+Reader
+======
+
+.. automodule:: paddle.v2.reader
+ :members:
+ :noindex:
+
+.. automodule:: paddle.v2.reader.creator
+ :members:
+ :noindex:
+
+minibatch
+=========
+
+.. automodule:: paddle.v2.minibatch
+ :members:
+ :noindex:
diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6a8ecc5bb1d855e0ded3719943ab3adb810de365
--- /dev/null
+++ b/doc/api/v2/data/dataset.rst
@@ -0,0 +1,75 @@
+Dataset
+=======
+
+.. automodule:: paddle.v2.dataset
+ :members:
+ :noindex:
+
+mnist
++++++
+
+.. automodule:: paddle.v2.dataset.mnist
+ :members:
+ :noindex:
+
+cifar
++++++
+
+.. automodule:: paddle.v2.dataset.cifar
+ :members:
+ :noindex:
+
+conll05
++++++++
+
+.. automodule:: paddle.v2.dataset.conll05
+ :members: get_dict,get_embedding,test
+ :noindex:
+
+imdb
+++++
+
+.. automodule:: paddle.v2.dataset.imdb
+ :members:
+ :noindex:
+
+imikolov
+++++++++
+
+.. automodule:: paddle.v2.dataset.imikolov
+ :members:
+ :noindex:
+
+movielens
++++++++++
+
+.. automodule:: paddle.v2.dataset.movielens
+ :members:
+ :noindex:
+
+.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
+ :noindex:
+
+.. autoclass:: paddle.v2.dataset.movielens.UserInfo
+ :noindex:
+
+sentiment
++++++++++
+
+.. automodule:: paddle.v2.dataset.sentiment
+ :members:
+ :noindex:
+
+uci_housing
++++++++++++
+
+.. automodule:: paddle.v2.dataset.uci_housing
+ :members:
+ :noindex:
+
+wmt14
++++++
+
+.. automodule:: paddle.v2.dataset.wmt14
+ :members:
+ :noindex:
diff --git a/doc/api/v2/data/image.rst b/doc/api/v2/data/image.rst
new file mode 100644
index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49
--- /dev/null
+++ b/doc/api/v2/data/image.rst
@@ -0,0 +1,5 @@
+Image Interface
+===============
+
+.. automodule:: paddle.v2.image
+ :members:
diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD
index fe8da907d9d45a2164031430ac5b7a3d5523967a..16236763a73770f3fe5eadf67645765d0456f875 100644
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -15,6 +15,7 @@
- [CMake](#cmake)
- [Layers](#layers)
- [Activations](#activations)
+ - [Weights](#weights)
- [Unit Tests](#unit-tests)
- [Protobuf Messages](#protobuf-messages)
- [Python API](#python-api)
@@ -45,17 +46,23 @@ Figure 1. PaddlePaddle on IA.
### Layers
All MKL-DNN related C++ layers will be placed, following PaddlePaddle's directory structure, in
-`paddle/gserver/layers`, and every file name will start with *Mkldnn*.
+`paddle/gserver/layers`, and every file name will start with *MKLDNN*.
-All MKL-DNN layers will inherit from a parent class called `MkldnnLayer`, which itself inherits from PaddlePaddle's base class `Layer`.
+All MKL-DNN layers will inherit from a parent class called `MKLDNNLayer`, which itself inherits from PaddlePaddle's base class `Layer`.
+
+`MKLDNNLayer` provides the necessary interfaces and functions and implements the basic logic of `forward` and `backward`. Some functions are defined as pure virtual functions, so a subclass only needs to implement them.
### Activations
-Since activation functions are independent of the layer concept in PaddlePaddle, a `MkldnnActivation.h` file will be added under the `paddle/gserver/activations` directory to define some interfaces for MKL-DNN, while the implementations remain in `ActivationFunction.cpp`.
+Since activation functions are independent of the layer concept in PaddlePaddle, `MKLDNNActivation.h` and `MKLDNNActivation.cpp` will be added under the `paddle/gserver/activations` directory to define and use the MKL-DNN interfaces.
-### Unit Tests
-`test_Mkldnn.cpp` and `MkldnnTester.*` will be added under the `paddle/gserver/test` directory for testing MKL-DNN.
+### Weights
+Since some layers contain parameters, we will try to let the MKL-DNN parameters share memory with the `parameter` in PaddlePaddle.
+Meanwhile, since the parameter layout that MKL-DNN uses during training may differ from PaddlePaddle's default `nchw`, we will convert this layout at the beginning and the end of training, so that the finally saved parameter format stays consistent with PaddlePaddle.
-For the activation tests, we plan to add new test types directly to PaddlePaddle's existing test files.
+### Unit Tests
+`test_MKLDNN.cpp` and `MKLDNNTester.*` will be added under the `paddle/gserver/test` directory for testing MKL-DNN.
+The tests consist of unit tests for each layer (or activation) and overall tests on simple networks.
+Each test compares the result computed on CPU by PaddlePaddle with the MKL-DNN result; a test passes when the difference is below a fairly small threshold.
### Protobuf Messages
Necessary options may be added to `proto/ModelConfig.proto` according to the needs of specific layers.
@@ -82,7 +89,7 @@ if use_mkldnn
An `mkldnn` folder will be added under the `v1_api_demo` directory, containing some demo scripts for testing MKL-DNN.
### Benchmarking
-We will consider adding some logic to `benchmark/paddle/image/run.sh` to add tests that use MKL-DNN.
+`benchmark/paddle/image/run_mkldnn.sh` will be added to benchmark the performance after enabling MKL-DNN.
### Others
1. When MKL-DNN is used, CPU buffers will be aligned to 64 bytes.
@@ -94,14 +101,16 @@ if use_mkldnn
We have summarized some points that need special attention:
-1. Use **deviceId_**. To add as few variables or functions as possible to the parent class Layer, we decided to reuse the existing `deviceId_` variable to distinguish layer attributes, defining `-2` as the device ID specific to `MkldnnLayer`.
+1. Use **deviceId_**. To add as few variables or functions as possible to the parent class Layer, we decided to reuse the existing `deviceId_` variable to distinguish layer attributes, defining `-2` as the device ID specific to `MKLDNNLayer`.
2. Override the **init** function of the parent class Layer and set `deviceId_` to `-2`, indicating that this layer runs in the MKL-DNN environment.
-3. Create `MkldnnMatrix` to manage the memory functions, interfaces, and format information that MKL-DNN will use.
-4. Create `MkldnnBase` to define classes and functions other than those related to layers and memory, including `MkldnnStream` and `CpuEngine` that MKL-DNN will use, and possibly `FPGAEngine` etc. in the future.
-5. Add two `MkldnnMatrixPtr`s to **Argument**, named `mkldnnValue` and `mkldnnGrad`, to hold the memory buffers that `MkldnnLayer` will use, and add a cvt function (to be renamed to something more suitable) that handles converting memory between the "CPU device" and the "MKL-DNN device".
-6. Add some logic to the `getOutput` function of the parent class `Layer` to check `deviceId` and, when the devices are inconsistent between MKL-DNN and CPU, do an early conversion, i.e. call the cvt function of `Argument` to unify the output onto the required device.
-7. Add a `use_mkldnn` flag to the existing `FLAGS` to choose whether to use the MKL-DNN related features.
-8. On saving MKLDNN parameters: since the MKLDNN parameter format may differ from PaddlePaddle's original one, the format information needs to be saved along with the parameters. We plan to extend the `int32_t version` field in [Header](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Parameter.h#L247). This value has always been saved as 0 in both v1 and v2, so we can make full use of it and define an enum covering all MKLDNN parameter formats, so that `MKLDNNLayer` can obtain the required format information from the input parameters.
+3. Create `MKLDNNMatrix`, inheriting from both `CpuMatrix` and `mkldnn::memory`, to manage the memory functions, interfaces, and format information that MKL-DNN will use.
+4. Create `MKLDNNBase` to define classes and functions other than those related to layers and memory, including `MKLDNNStream` and `CPUEngine` that MKL-DNN will use, and possibly `FPGAEngine` etc. in the future.
+5. Every `MKLDNNLayer` has `inVal_`, `inGrad_`, `outVal_`, and `outGrad_`, which stand for input value, input gradient, output value, and output gradient respectively. They hold the internal memory used by MKL-DNN. In addition, `MKLDNNMatrix` members prefixed with *ext* (for external memory) are defined, used to convert memory when the format does not match PaddlePaddle's default `nchw`. The necessary conversion functions are defined in advance in `MKLDNNLayer`, and each subclass only needs to call the predefined reset-buffer functions.
+6. The reset-buffer functions of every `MKLDNNLayer` (including those that reset the input and output values and gradients) reset the internal and external memory according to their input arguments; of course the two can be identical, which means no conversion is needed. The guiding principle is that every `MKLDNNLayer` subclass only needs to use the internal memory, while all external conversion work is prepared in advance in the reset functions of the parent class.
+7. In general, the external memory shares memory with the `value` and `grad` in PaddlePaddle whenever possible. Moreover, the external output value and gradient of every `MKLDNNLayer` (i.e. `extOutVal_` and `extOutGrad_`) must share memory with `output_.value` and `output_.grad` respectively, because PaddlePaddle's activations use `output_.value` and `output_.grad` directly. If no external buffer is needed for conversion, the internal buffers share memory with them as well.
+8. If an MKL-DNN layer is followed by a CPU device, `output_.value` is made to share memory with `extOutVal_` and the data format is `nchw`, so that the next CPU device gets the correct data. Whenever a CPU device is involved, the format of the external memory is always `nchw` or `nc`.
+9. Since MKL-DNN output operations overwrite the data instead of accumulating onto the existing data, when the network contains branches the gradients of different layers need to be merged during `backward`. `MKLDNNLayer` implements the merge method: the input gradient of each branch is first stored temporarily in an `MKLDNNMatrix`, and the layer at the branch point sums them up and puts the result into its own `output_.grad`. So overall, no subclass needs to care about branching; this is also implemented in the parent class.
+10. Add a `use_mkldnn` flag to the existing `FLAGS` to choose whether to use the MKL-DNN related features.
## References
diff --git a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455
Binary files /dev/null and b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg differ
diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md
new file mode 100644
index 0000000000000000000000000000000000000000..9007aae7a8355ed06c6720a921351f81b859c1fe
--- /dev/null
+++ b/doc/design/ops/sequence_decoder.md
@@ -0,0 +1,245 @@
+# Design: Sequence Decoder Generating LoDTensors
+In tasks such as machine translation and image to text,
+a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences.
+
+This documentation describes how to implement the sequence decoder as an operator.
+
+## Beam Search based Decoder
+The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences;
+it is a heuristic search algorithm that explores the paths by expanding the most promising nodes in a limited set.
+
+In the old version of PaddlePaddle, a C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search.
+Due to its complexity, the implementation relies on a lot of special data structures
+and is quite intricate and hard for users to customize.
+
+There are a lot of heuristic tricks in sequence generation tasks,
+so the flexibility of the sequence decoder is very important to users.
+
+During PaddlePaddle's refactoring work,
+some new concepts were proposed, such as [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md), that can better support sequence usage,
+and they can help make the implementation of the beam search based sequence decoder **more transparent and modular**.
+
+For example, the RNN states, candidate IDs and probabilities of beam search can be represented as `LoDTensors`;
+the selected candidate IDs in each time step can be stored in a `TensorArray` and `Pack`ed into the translated sentences.
+
+## Changing LoD's absolute offsets to relative offsets
+The current `LoDTensor` is designed to store levels of variable-length sequences;
+it stores several arrays of integers, each of which represents a level.
+
+The integers in each level represent the begin and end (exclusive) offsets of a sequence **in the underlying tensor**;
+let's call this format the **absolute-offset LoD** for clarity.
+
+The absolute-offset LoD can retrieve any sequence quickly but fails to represent empty sequences; for example, a two-level LoD is as follows
+```python
+[[0, 3, 9]
+ [0, 2, 3, 3, 3, 9]]
+```
+The first level tells that there are two sequences:
+- the first's offset is `[0, 3)`
+- the second's offset is `[3, 9)`
+
+while on the second level, there are several empty sequences that both begin and end at `3`.
+It is impossible to tell how many empty second-level sequences belong to each of the first-level sequences.
+
+There are many scenarios that rely on empty sequence representation,
+such as machine translation or image to text, where one instance has no translation or the candidate set of a prefix is empty.
+
+So let's introduce another format of LoD;
+it stores **the offsets of the lower-level sequences** and is called the **relative-offset** LoD.
+
+For example, to represent the same sequences of the above data
+
+```python
+[[0, 2, 5]
+ [0, 2, 3, 3, 3, 9]]
+```
+
+the first level represents that there are two sequences;
+their offsets into the second-level LoD are `[0, 2)` and `[2, 5)`.
+
+The second level is the same as in the absolute-offset example, because the lowest level always indexes the underlying tensor directly.
+It is easy to see that the second sequence in the first-level LoD has two empty sequences.
+
+The following demos are based on relative-offset LoD.
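+
+As a sanity check, the two formats can be related by a few lines of plain Python (a sketch; the helper name is hypothetical): a relative-offset LoD maps back to absolute offsets by indexing each upper level into the level below it.
+
+```python
+def relative_to_absolute(lod):
+    # the lowest level already indexes the underlying tensor
+    result = [lod[-1]]
+    for level in reversed(lod[:-1]):
+        lower = result[0]
+        # each upper-level offset is an index into the level below
+        result.insert(0, [lower[i] for i in level])
+    return result
+
+# the relative-offset example above maps back to the absolute-offset one
+assert relative_to_absolute([[0, 2, 5], [0, 2, 3, 3, 3, 9]]) == [
+    [0, 3, 9], [0, 2, 3, 3, 3, 9]]
+```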
+
+## Usage in a simple machine translation model
+Let's start with a simple machine translation model, simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation), to draw a blueprint of what a sequence decoder can do and how to use it.
+
+The model has an encoder that learns the semantic vector from a sequence,
+and a decoder which uses the sequence decoder to generate new sentences.
+
+**Encoder**
+```python
+import paddle as pd
+
+dict_size = 8000
+source_dict_size = dict_size
+target_dict_size = dict_size
+word_vector_dim = 128
+encoder_dim = 128
+decoder_dim = 128
+beam_size = 5
+max_length = 120
+
+# encoder
+src_word_id = pd.data(
+    name='source_language_word',
+    type=pd.data.integer_value_sequence(source_dict_size))
+src_embedding = pd.embedding(size=source_dict_size, dim=word_vector_dim)
+
+src_word_vec = pd.lookup(src_embedding, src_word_id)
+
+encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim)
+
+encoder_ctx = pd.last_seq(encoder_out_seq)
+# encoder_ctx_proj is the learned semantic vector
+encoder_ctx_proj = pd.fc(
+    encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None)
+```
+
+**Decoder**
+
+```python
+def generate():
+    decoder = pd.while_loop()
+    with decoder.step():
+        decoder_mem = decoder.memory(init=encoder_ctx)  # mark the memory
+        generated_ids = decoder.memory()  # TODO init to batch_size <s>s
+        generated_scores = decoder.memory()  # TODO init to batch_size 1s or 0s
+
+        target_word = pd.lookup(trg_embedding, generated_ids)
+        # expand encoder_ctx's batch to fit target_word's lod
+        # for example
+        # decoder_mem.lod is
+        # [[0 1 3],
+        #  [0 1 3 6]]
+        # its tensor content is [a1 a2 a3 a4 a5]
+        # which means there are 2 sentences to translate
+        #   - the first sentence has 1 translation prefix, the offset is [0, 1)
+        #   - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
+        # the target_word.lod is
+        # [[0, 1, 6]
+        #  [0, 2, 4, 7, 9, 12]]
+        # which means 2 sentences to translate, with 1 and 5 prefixes respectively
+        # the first prefix has 2 candidates
+        # the following ones have 2, 3, 2 and 3 candidates
+        # the encoder_ctx_expanded's content will be
+        # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5]
+        encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
+        decoder_input = pd.fc(
+            act=pd.activation.Linear(),
+            input=[target_word, encoder_ctx_expanded],
+            size=3 * decoder_dim)
+        gru_out, cur_mem = pd.gru_step(
+            decoder_input, mem=decoder_mem, size=decoder_dim)
+        scores = pd.fc(
+            gru_out,
+            size=target_dict_size,
+            bias=None,
+            act=pd.activation.Softmax())
+        # K is a config parameter
+        topk_scores, topk_ids = pd.top_k(scores, K)
+        topk_generated_scores = pd.add_scalar(topk_scores, generated_scores)
+
+        selected_ids, selected_generation_scores = decoder.beam_search(
+            topk_ids, topk_generated_scores)
+
+        # update the states
+        decoder_mem.update(cur_mem)  # tells how to update state
+        generated_ids.update(selected_ids)
+        generated_scores.update(selected_generation_scores)
+
+        decoder.output(selected_ids)
+        decoder.output(selected_generation_scores)
+
+translation_ids, translation_scores = decoder()
+```
+`decoder.beam_search` is an operator that, given the candidates and the scores of the translations including those candidates,
+returns the result of the beam search algorithm.
+
+In this way, users can customize anything on the inputs or outputs of beam search, for example, several ways to prune some translation prefixes:
+
+1. make the corresponding elements in `topk_generated_scores` zero or some small values, and beam_search will discard those candidates.
+2. remove some specific candidates in `selected_ids`.
+3. get the final `translation_ids` and remove unwanted translation sequences from it.
+
+The implementation of sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
+so the Python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
+
+Both of them are two-level `LoDTensors`:
+
+- the first level represents `batch_size` of (source) sentences;
+- the second level represents the candidate ID sets for each translation prefix.
+
+For example, there are 3 source sentences to translate, and they have 2, 3 and 1 candidates respectively.
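+
+Written as plain lists, a hypothetical instance of such a two-level relative-offset LoD (assuming each source sentence currently owns one candidate set):
+
+```python
+# 3 source sentences, whose candidate sets hold 2, 3 and 1 candidates
+lod = [[0, 1, 2, 3],  # level 0: one candidate set per source sentence
+       [0, 2, 5, 6]]  # level 1: the sets contain 2, 3 and 1 candidates
+```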
+
+Unlike an RNN, in a sequence decoder the previous state and the current state have different LoDs and shapes;
+a `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
+
+For example, the previous state
+
+* LoD is `[0, 1, 3][0, 2, 5, 6]`
+* content of tensor is `a1 a2 b1 b2 b3 c1`
+
+the current state stored in `encoder_ctx_expanded`
+
+* LoD is `[0, 2, 6][0, 3, 5, 8, 9, 11, 11]`
+* the content is
+ - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times, once for each candidate)
+ - a2 a2
+ - b1 b1 b1
+ - b2
+ - b3 b3
+ - None (c1 has 0 candidates, so c1 is dropped)
+
+Benefiting from the relative-offset LoD, the empty candidate set can be represented naturally.
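+
+A toy sketch of what `lod_expand` does to the state contents (plain Python over plain lists; the function body is illustrative, not the actual operator):
+
+```python
+def lod_expand(states, counts):
+    # states: one entry per prefix; counts: candidates per prefix
+    out = []
+    for s, c in zip(states, counts):
+        out.extend([s] * c)  # a prefix with 0 candidates is dropped
+    return out
+
+expanded = lod_expand(["a1", "a2", "b1", "b2", "b3", "c1"],
+                      [3, 2, 3, 1, 2, 0])
+# ['a1', 'a1', 'a1', 'a2', 'a2', 'b1', 'b1', 'b1', 'b2', 'b3', 'b3']
+```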
+
+The states in each time step can be stored in a `TensorArray` and `Pack`ed into a final `LoDTensor`; the corresponding syntax is
+
+```python
+decoder.output(selected_ids)
+decoder.output(selected_generation_scores)
+```
+
+the `selected_ids` are the candidate ids for the prefixes;
+they will be `Pack`ed by `TensorArray` into a two-level `LoDTensor`,
+whose first level represents the source sequences
+and whose second level represents the generated sequences.
+
+Packing the `selected_scores` yields a `LoDTensor` that stores the score of each translation candidate.
+
+Packing the `selected_generation_scores` yields a `LoDTensor` whose tail elements are the probabilities of the translations.
+
+## LoD and shape changes during decoding
+
+![LoD and shape changes during decoding](./images/LOD-and-shape-changes-during-decoding.jpg)
+
+According to the image above, the only phase that changes the LoD is beam search.
+
+## Beam search design
+The beam search algorithm will be implemented as one method of the sequence decoder; it has 3 inputs:
+
+1. `topk_ids`, top K candidate ids for each prefix.
+2. `topk_scores`, the corresponding scores for `topk_ids`
+3. `generated_scores`, the score of the prefixes.
+
+All of them are LoDTensors, so that the sequence affiliation is clear.
+Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix, as sketched after the list below.
+
+It will return three variables:
+
+1. `selected_ids`, the final candidates that the beam search function selected for the next step.
+2. `selected_scores`, the scores for the candidates.
+3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
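+
+A toy sketch of the per-prefix pruning that `beam_search` performs (plain Python; the real operator works on LoDTensors and processes all prefixes at once):
+
+```python
+def prune_prefix(topk_ids, topk_generated_scores, beam_size):
+    # keep the beam_size highest-scoring candidates of one prefix
+    ranked = sorted(zip(topk_generated_scores, topk_ids), reverse=True)
+    kept = ranked[:beam_size]
+    selected_ids = [i for _, i in kept]
+    selected_scores = [s for s, _ in kept]
+    return selected_ids, selected_scores
+
+prune_prefix([11, 42, 7], [0.2, 0.5, 0.1], beam_size=2)
+# -> ([42, 11], [0.5, 0.2])
+```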
+
+## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
+The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors,
+and they exist in each time step,
+so it is natural to store them in arrays.
+
+Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors;
+the results of beam search are best stored in a `TensorArray`.
+
+The `Pack` and `UnPack` in `TensorArray` are used to pack the tensors in the array into a `LoDTensor` or to split a `LoDTensor` into an array of tensors.
+They need some extensions to support packing or unpacking an array of `LoDTensors`.
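+
+A minimal sketch of the LoD-based `Pack`, collapsed to a single LoD level for brevity (plain Python; names hypothetical, the real method produces the two-level layout described above):
+
+```python
+def pack(sequences):
+    # one generated id list per source sequence -> flat data + LoD
+    data, lod = [], [0]
+    for seq in sequences:
+        data.extend(seq)
+        lod.append(len(data))
+    return data, [lod]
+
+pack([[2, 7, 9], [4, 5]])
+# -> ([2, 7, 9, 4, 5], [[0, 3, 5]])
+```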
diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst
index 0e939a2671ace8682c90cdc1c1bb2da1dda0d568..b331d9d36e6a279881c3b1a5586835e7186957fb 100644
--- a/doc/faq/local/index_cn.rst
+++ b/doc/faq/local/index_cn.rst
@@ -99,7 +99,7 @@ PaddlePaddle supports sparse training; sparse training requires the training features to be :code:`spa
Using more computing resources
++++++++++++++++++
-Making use of more computing resources can be divided into one the following approaches\:
+Making use of more computing resources can be divided into the following approaches\:
* Single-machine CPU training
diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
index c823d7e9fcd63dd7719ac1403952b03c2d2f03c0..6cfc9536f20e88571a9845a50be0341fe4d9f78b 100644
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -214,7 +214,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
```cpp
// if use Eigen unsupported module before include head files
- #define EIGEN_USE_GPU
+ // #define EIGEN_USE_GPU
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel);
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index 4547afaf1dc9af8bc7909a684db766fdd7b159c0..53a36f8f20d1143470928f57eda6f575d9048236 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -54,6 +54,46 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
return kPD_NO_ERROR;
}
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+ paddle_real* value) {
+ if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
+ auto ptr = cast(mat);
+ if (ptr->mat == nullptr) return kPD_NULLPTR;
+ paddle::real* buf = ptr->mat->getRowBuf(0);
+ size_t width = ptr->mat->getWidth();
+ size_t height = ptr->mat->getHeight();
+ if (ptr->mat->useGpu()) {
+#ifdef PADDLE_WITH_CUDA
+ hl_memcpy(buf, value, sizeof(paddle::real) * width * height);
+#else
+ return kPD_NOT_SUPPORTED;
+#endif
+ } else {
+ std::copy(value, value + width * height, buf);
+ }
+ return kPD_NO_ERROR;
+}
+
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+ paddle_real* result) {
+ if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
+ auto ptr = cast(mat);
+ if (ptr->mat == nullptr) return kPD_NULLPTR;
+ paddle::real* buf = ptr->mat->getRowBuf(0);
+ size_t width = ptr->mat->getWidth();
+ size_t height = ptr->mat->getHeight();
+ if (ptr->mat->useGpu()) {
+#ifdef PADDLE_WITH_CUDA
+ hl_memcpy(result, buf, width * height * sizeof(paddle::real));
+#else
+ return kPD_NOT_SUPPORTED;
+#endif
+ } else {
+ std::copy(buf, buf + width * height, result);
+ }
+ return kPD_NO_ERROR;
+}
+
paddle_error paddle_matrix_get_row(paddle_matrix mat,
uint64_t rowID,
paddle_real** rawRowBuffer) {
diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c
index 3e6bd5285058a297c4574631e2a5c033b83936e8..876af2aa7615c098d225b56ce2ea0b1529a6e3c6 100644
--- a/paddle/capi/examples/model_inference/dense/main.c
+++ b/paddle/capi/examples/model_inference/dense/main.c
@@ -27,18 +27,20 @@ int main() {
CHECK(paddle_arguments_resize(in_args, 1));
// Create input matrix.
- paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+ paddle_matrix mat = paddle_matrix_create(/* sample_num */ 10,
/* size */ 784,
/* useGPU */ false);
srand(time(0));
- paddle_real* array;
- // Get First row.
- CHECK(paddle_matrix_get_row(mat, 0, &array));
+ paddle_real input[784 * 10];
- for (int i = 0; i < 784; ++i) {
- array[i] = rand() / ((float)RAND_MAX);
+ for (int i = 0; i < 784 * 10; ++i) {
+ input[i] = rand() / ((float)RAND_MAX);
}
+
+ // Set value for the input matrix
+ CHECK(paddle_matrix_set_value(mat, input));
CHECK(paddle_arguments_set_value(in_args, 0, mat));
@@ -51,11 +53,17 @@ int main() {
CHECK(paddle_arguments_get_value(out_args, 0, prob));
- CHECK(paddle_matrix_get_row(prob, 0, &array));
+ uint64_t height;
+ uint64_t width;
+
+ CHECK(paddle_matrix_get_shape(prob, &height, &width));
+ paddle_real* result =
+     (paddle_real*)malloc(height * width * sizeof(paddle_real));
+ CHECK(paddle_matrix_get_value(prob, result));
printf("Prob: ");
- for (int i = 0; i < 10; ++i) {
- printf("%.2f ", array[i]);
+ for (uint64_t i = 0; i < height * width; ++i) {
+ printf("%.2f ", result[i]);
}
printf("\n");
diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h
index f15f7f3bbbd1457617111f827d2182ae6b7d9fdb..bb5223f8a275fa2550bf8b7e94a9c4333de4c8c9 100644
--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -70,6 +70,16 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
uint64_t rowID,
paddle_real* rowArray);
+/**
+ * @brief paddle_matrix_set_value Set values of the matrix.
+ * @param mat Target Matrix
+ * @param value Matrix data, stored row by row.
+ * @return paddle_error
+ * @note value should contain enough elements of data to initialize the mat
+ */
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+ paddle_real* value);
+
/**
* @brief PDMatGetRow Get raw row buffer from matrix
* @param [in] mat Target matrix
@@ -81,6 +91,15 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
uint64_t rowID,
paddle_real** rawRowBuffer);
+/**
+ * @brief copy data from the matrix
+ * @param [in] mat Target matrix
+ * @param [out] result pointer to store the matrix data
+ * @return paddle_error
+ * @note the space for the result should be allocated before invoking this API
+ */
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+ paddle_real* result);
/**
* @brief PDMatCreateNone Create None Matrix
* @return
diff --git a/paddle/capi/tests/test_Matrix.cpp b/paddle/capi/tests/test_Matrix.cpp
index 4bf9a9d6a9f9161561e9e5612edd2c93cab7ac5b..6940c28448a897cecd78b718fe720441086a5a99 100644
--- a/paddle/capi/tests/test_Matrix.cpp
+++ b/paddle/capi/tests/test_Matrix.cpp
@@ -45,3 +45,49 @@ TEST(CAPIMatrix, createNone) {
paddle_matrix mat = paddle_matrix_create_none();
ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
}
+
+TEST(CAPIMatrix, cpu_get_set_value) {
+ paddle_matrix mat = paddle_matrix_create(128, 32, false);
+  std::vector<paddle_real> sample;
+  std::vector<paddle_real> result;
+ sample.resize(128 * 32);
+ result.resize(128 * 32);
+ for (size_t i = 0; i < sample.size(); ++i) {
+ sample[i] = 1.0 / (i + 1.0);
+ }
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
+ for (size_t i = 0; i < sample.size(); ++i) {
+ ASSERT_NEAR(sample[i], result[i], 1e-5);
+ }
+
+ uint64_t height, width;
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+ ASSERT_EQ(128UL, height);
+ ASSERT_EQ(32UL, width);
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(CAPIMatrix, gpu_get_set_value) {
+ paddle_matrix mat = paddle_matrix_create(128, 32, true);
+  std::vector<paddle_real> sample;
+  std::vector<paddle_real> result;
+ sample.resize(128 * 32);
+ result.resize(128 * 32);
+ for (size_t i = 0; i < sample.size(); ++i) {
+ sample[i] = 1.0 / (i + 1.0);
+ }
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
+ for (size_t i = 0; i < sample.size(); ++i) {
+ ASSERT_NEAR(sample[i], result[i], 1e-5);
+ }
+
+ uint64_t height, width;
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+ ASSERT_EQ(128UL, height);
+ ASSERT_EQ(32UL, width);
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+#endif
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index ed94540c268e5ed990c1d92859c6a2093c052868..913cd0f81eaef37014f38c71e7c3d23bfeec1466 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -321,8 +321,6 @@ static void CreateGradVarInBlock(
auto* param = block_desc->FindVarRecursive(pname);
auto* grad = block_desc->FindVar(arg);
if (param == nullptr) {
- LOG(WARNING) << "Cannot find forward variable of " << arg
- << ". Set its gradient to FP32";
grad->SetDataType(DataType::FP32);
} else {
grad->SetDataType(param->GetDataType());
@@ -408,6 +406,11 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
for (const auto& desc : op_grads) {
for (const std::string& out_name : desc->OutputArgumentNames()) {
+ if (out_name.find("@GRAD") == std::string::npos) {
+        // Not all outputs of a backward operator are gradients. Only
+        // gradients need to be summed; skip variables that are not gradients.
+ continue;
+ }
dup_out_ops[out_name].emplace_back(grad_desc_idx);
}
++grad_desc_idx;
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 4e8d630c2634682ff63b38182108eadebb5c7ff9..d485cdf6109274377ad0057223bdd8401e964aa7 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -21,7 +21,7 @@
#include "paddle/framework/var_desc.h"
#include "paddle/operators/net_op.h"
-USE_OP(fill_constant);
+USE_NO_KERNEL_OP(fill_constant);
namespace paddle {
namespace framework {
diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
index 9e3d597f3a2c84623a1ce9e4b6f4b956cffde211..11764810e1d40e5e6eb3cd0d8e9b4b63a79855b4 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -50,6 +50,15 @@ VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
return it->second.get();
}
+VarDescBind *BlockDescBind::FindRecursiveOrCreateVar(
+ const std::string &name_bytes) {
+ VarDescBind *res = FindVarRecursive(name_bytes);
+ if (res == nullptr) {
+ res = Var(name_bytes);
+ }
+ return res;
+}
+
bool BlockDescBind::HasVarRecursive(const std::string &name) const {
return FindVarRecursive(name) != nullptr;
}
diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h
index 26adf6a20ff09483b84f479db08efcf402135053..8e967e5378eb47a7869efb59cc96a271f1cbb9a1 100644
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -58,6 +58,8 @@ class BlockDescBind {
VarDescBind *FindVarRecursive(const std::string &name_bytes) const;
+ VarDescBind *FindRecursiveOrCreateVar(const std::string &name_bytes);
+
bool HasVarRecursive(const std::string &var_name) const;
std::set<std::string> LocalVarNames() const {
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index c5ae7b185460c8b0d68ba38bb9db9bd3d3fb14ea..3ec88d7a72c3339bf5e7d0ca3957a3f608f039b7 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -34,6 +34,21 @@ inline DataType ToDataType(std::type_index type) {
}
}
+inline std::type_index ToTypeIndex(DataType type) {
+ switch (type) {
+ case DataType::FP32:
+ return typeid(float);
+ case DataType::FP64:
+ return typeid(double);
+ case DataType::INT32:
+ return typeid(int);
+ case DataType::INT64:
+ return typeid(int64_t);
+ default:
+ PADDLE_THROW("Not support type %d", type);
+ }
+}
+
template <typename Visitor>
inline void VisitDataType(DataType type, Visitor visitor) {
switch (type) {
diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc
index 10c785e04c4fa2192f9c95513009cf7d8c123868..53b899a23997b71e723a298ec360a4e018d89878 100644
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -79,6 +79,13 @@ DDim make_ddim(const std::vector<int64_t>& dims) {
return result;
}
+DDim make_ddim(const std::vector<int>& dims) {
+  std::vector<int64_t> res(dims.size());
+  std::transform(dims.begin(), dims.end(), res.begin(),
+                 [](int d) { return static_cast<int64_t>(d); });
+ return make_ddim(res);
+}
+
/// @cond HIDDEN
// XXX For some reason, putting this in an anonymous namespace causes errors
class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h
index aa773868ab4b68acbc46dfa2cd2569d8b8b7789d..4ca5e49566b7ec006eba80f3f9808bacb1ff2615 100644
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -81,6 +81,8 @@ struct DDim {
*/
DDim make_ddim(const std::vector<int64_t>& dims);
+DDim make_ddim(const std::vector<int>& dims);
+
/**
* \brief Make a DDim from an initializer list
*
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index e7cba9e702ce0f96a9680169f0593130df2fd096..39c8def82e1ebb10a0e357a648af760099020c32 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -357,7 +357,8 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
"LOD_TENSOR";
for (auto &out_pair : this->outputs_) {
for (auto &out_var_name : out_pair.second) {
- block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR);
+ block->FindRecursiveOrCreateVar(out_var_name)
+ ->SetType(VarDesc::LOD_TENSOR);
}
}
}
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index fb2c69105627f663ddcce07d31526c9e4278e863..9428b8a07ea0af005f6e960ddaa02da624ad9d97 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -98,5 +98,23 @@ void Scope::DeleteScope(Scope* scope) {
delete scope;
}
+void Scope::Rename(const std::string& origin_name,
+ const std::string& new_name) const {
+ auto origin_it = vars_.find(origin_name);
+ PADDLE_ENFORCE(origin_it != vars_.end(),
+ "Cannot find original variable with name %s", origin_name);
+ auto new_it = vars_.find(new_name);
+ PADDLE_ENFORCE(new_it == vars_.end(),
+ "The variable with name %s is already in the scope", new_name);
+ vars_[new_name] = origin_it->second;
+ vars_.erase(origin_it);
+}
+
+std::string Scope::Rename(const std::string& origin_name) const {
+ auto var_name = string::Sprintf("%p.%d", this, vars_.size());
+ Rename(origin_name, var_name);
+ return var_name;
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index fb660949394149ebf2c6172a0ac3f4c7594f4286..c2aafb6ad825f9bd9ffef754923a15afdeaa8e5c 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -68,11 +68,18 @@ class Scope {
// enumerate all the variables current contains.
std::vector<std::string> GetAllNames(bool recursive = false) const;
+ // Rename variable to a new name
+ void Rename(const std::string& origin_name,
+ const std::string& new_name) const;
+
+ // Rename variable to a new name and return the new name
+ std::string Rename(const std::string& origin_name) const;
+
private:
// Call Scope::NewScope for a sub-scope.
explicit Scope(Scope const* parent) : parent_(parent) {}
-  std::unordered_map<std::string, Variable*> vars_;
+  mutable std::unordered_map<std::string, Variable*> vars_;
mutable std::list<Scope*> kids_;
Scope const* parent_{nullptr};
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 4fd72d64a90ae6f16dd1499ceb7fba6e40fe4cea..9b2779b42cad324253dadf27dbff20fd8e8c8e16 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -45,6 +45,7 @@ if(WITH_GPU)
add_simple_unittest(BlockExpandOpTest)
add_simple_unittest(CropOpTest)
add_simple_unittest(SwitchOpTest)
+ add_simple_unittest(ScaleSubRegionOpTest)
endif()
add_simple_unittest(Im2ColTest)
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index ba446bf92da264fafa1fb47a2c30da9cb13176ce..370940532ef40335be54a3e6467de0409e923ec4 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -110,6 +110,7 @@ public:
function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
function1_->init(config);
function2_->init(config);
+ initArgsCallback_ = nullptr;
}
~Compare2Function() {}
@@ -170,6 +171,10 @@ public:
*seq2_));
}
+  void registerInitCallback(std::function<void(BufferArg&, size_t)> callback) {
+ initArgsCallback_ = callback;
+ }
+
// output need only contains shape, do not contains data.
void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
size_t size =
@@ -340,6 +345,10 @@ protected:
initArg(*func1Inputs_[i]);
}
+ if (initArgsCallback_ != nullptr) {
+ initArgsCallback_(*func1Inputs_[i], i);
+ }
+
copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
}
}
@@ -386,6 +395,7 @@ protected:
std::shared_ptr<SequenceIdArg> seq1_;
std::shared_ptr<SequenceIdArg> seq2_;
test::CopyArgument copyArg_;
+  std::function<void(BufferArg&, size_t)> initArgsCallback_;
};
class CpuGpuFuncCompare
diff --git a/paddle/function/ScaleSubRegionOp.cpp b/paddle/function/ScaleSubRegionOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a080505d7df83a6c0a9d88fbcb7863fc0e1f7b21
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOp.cpp
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "paddle/function/TensorShape.h"
+
+namespace paddle {
+
+template <>
+void ScaleSubRegion<DEVICE_TYPE_CPU>(real* outputs,
+ const real* inputs,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ memcpy(outputs, inputs, number * channel * height * width * sizeof(real));
+
+ for (int n = 0; n < number; ++n) {
+ // indices start from 1
+ int offset = n * 6;
+ for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) {
+ for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) {
+ for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) {
+ int idx = ((n * channel + c) * height + h) * width + w;
+ outputs[idx] *= value;
+ }
+ }
+ }
+ }
+}
+
+template <>
+void ScaleSubRegionGrad<DEVICE_TYPE_CPU>(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ for (int n = 0; n < number; ++n) {
+ for (int c = 0; c < channel; ++c) {
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ int idx = ((n * channel + c) * height + h) * width + w;
+ int offset = n * 6;
+ if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+ h >= (indices[offset + 2] - 1) &&
+ h <= (indices[offset + 3] - 1) &&
+ w >= (indices[offset + 4] - 1) &&
+ w <= (indices[offset + 5] - 1)) {
+ outGrad[idx] += inGrad[idx] * value;
+ } else {
+ outGrad[idx] += inGrad[idx];
+ }
+ }
+ }
+ }
+ }
+}
+
+/**
+ * \brief For each instance, ScaleSubRegion can be used to multiply the values
+ *        in a specified continuous sub-region by a scalar. By providing the
+ *        start and end indices for C/H/W, you can specify the location and
+ *        shape of the region.
+ *
+ * Argument in this Function:
+ * \param inputs A 4-D tensor with shape [N, C, H, W], only one input.
+ * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs A 4-D tensor with same shape as inputs, output value.
+ */
+template <DeviceType Device>
+class ScaleSubRegionFunc : public FunctionBase {
+public:
+ void init(const FuncConfig& config) override { conf_ = config; }
+
+ void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+ CHECK_EQ(2UL, inputs.size());
+ CHECK_EQ(1UL, outputs.size());
+ CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+ TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegion<Device>(outputs[0].data<real>(),
+                           inputs[0].data<real>(),
+                           inputs[1].data<real>(),
+ shape,
+ conf_);
+ }
+
+private:
+ FuncConfig conf_;
+};
+
+/**
+ * \brief The backward propagation of ScaleSubRegion Function.
+ *
+ * Argument in this Function:
+ * \param inputs A 4-D tensor with shape [N, C, H, W], output gradient.
+ * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value.
+ */
+
+template <DeviceType Device>
+class ScaleSubRegionGradFunc : public FunctionBase {
+public:
+ void init(const FuncConfig& config) override { conf_ = config; }
+
+ void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+ CHECK_EQ(2UL, inputs.size());
+ CHECK_EQ(1UL, outputs.size());
+ CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+ TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegionGrad<Device>(inputs[0].data<real>(),
+                               outputs[0].data<real>(),
+                               inputs[1].data<real>(),
+ shape,
+ conf_);
+ }
+
+private:
+ FuncConfig conf_;
+};
+
+REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
+#endif
+
+} // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOp.h b/paddle/function/ScaleSubRegionOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..0480c8577f3fbf3bc9e94b635df96a31b103e9e3
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOp.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief Function to multiply a scalar to the values in a specified continuous
+ *        sub-region. Indices must be provided to indicate the location and
+ *        shape of the region, and the multiplied value is passed by the
+ *        configure variable.
+ *
+ *
+ * \param[out] outputs Output value.
+ * \param[in] inputs Input data which contains NCHW information.
+ * \param[in] indices Indices data to indicate the sub region.
+ * \param[in] shape Tensor shape of input value.
+ * \param[in] conf Configure variable which contains the multiplied value.
+ */
+template <DeviceType Device>
+void ScaleSubRegion(real* outputs,
+ const real* inputs,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf);
+
+/**
+ * \brief Backward propagation function of ScaleSubRegion.
+ *
+ * \param[in]   inGrad   Output gradient flowing in from the next layer.
+ * \param[out]  outGrad  Gradient of the input value, propagated to the previous layer.
+ * \param[in] indices Indices data.
+ * \param[in] shape The Shape of input tensor.
+ * \param[in] conf Configure variable.
+ */
+template <DeviceType Device>
+void ScaleSubRegionGrad(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf);
+} // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOpGpu.cu b/paddle/function/ScaleSubRegionOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8aae2e44c3fdc8b516e66ecfd2e04f466a17dde9
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOpGpu.cu
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "hl_base.h"
+
+namespace paddle {
+
+__global__ void KeScaleSubRegion(real* outputs,
+ const real* inputs,
+ const real* indices,
+ real value,
+ int channel,
+ int height,
+ int width,
+ int nthreads) {
+ const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (idx < nthreads) {
+ const int w = idx % width;
+ const int h = (idx / width) % height;
+ const int c = (idx / width / height) % channel;
+ const int n = idx / width / height / channel;
+
+ const int offset = n * 6;
+ if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+ h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+ w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+ outputs[idx] = inputs[idx] * value;
+ } else {
+ outputs[idx] = inputs[idx];
+ }
+ }
+}
+
+template <>
+void ScaleSubRegion<DEVICE_TYPE_GPU>(real* outputs,
+ const real* inputs,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ size_t nth = number * channel * height * width;
+ int blockSize = 1024;
+ int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeScaleSubRegion<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+ outputs, inputs, indices, value, channel, height, width, nth);
+ CHECK_SYNC("ScaleSubRegion");
+}
+
+__global__ void KeScaleSubRegionDiff(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ real value,
+ int channel,
+ int height,
+ int width,
+ int nthreads) {
+ const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (idx < nthreads) {
+ const int w = idx % width;
+ const int h = (idx / width) % height;
+ const int c = (idx / width / height) % channel;
+ const int n = idx / width / height / channel;
+
+ const int offset = n * 6;
+ if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+ h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+ w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+ outGrad[idx] += inGrad[idx] * value;
+ } else {
+ outGrad[idx] += inGrad[idx];
+ }
+ }
+}
+
+template <>
+void ScaleSubRegionGrad<DEVICE_TYPE_GPU>(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ size_t nth = number * channel * height * width;
+ int blockSize = 1024;
+ int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeScaleSubRegionDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+ inGrad, outGrad, indices, value, channel, height, width, nth);
+ CHECK_SYNC("ScaleSubRegionGrad");
+}
+
+} // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOpTest.cpp b/paddle/function/ScaleSubRegionOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..43331f258dddaa43cbc8cc77519e299de7e98290
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOpTest.cpp
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(ScaleSubRegion, real) {
+ for (size_t numSamples : {5, 32}) {
+ for (size_t channels : {5, 32}) {
+ for (size_t imgSizeH : {5, 33}) {
+ for (size_t imgSizeW : {5, 32}) {
+ for (real value : {-0.5, 0.0, 0.5}) {
+ for (bool firstHalf : {false, true}) {
+ VLOG(3) << " numSamples=" << numSamples
+ << " channels=" << channels << " imgSizeH=" << imgSizeH
+ << " imgSizeW=" << imgSizeW;
+
+ for (bool testGrad : {false, true}) {
+ CpuGpuFuncCompare compare(
+ testGrad ? "ScaleSubRegionGrad" : "ScaleSubRegion",
+ FuncConfig().set("value", value));
+
+ TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+ TensorShape indicesShape{numSamples, 6};
+
+ compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+ compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, indicesShape));
+
+ compare.registerInitCallback([=](BufferArg& arg, size_t index) {
+ if (index == 1) {
+ real* data = (real*)arg.data();
+
+ for (size_t i = 0; i < numSamples; ++i) {
+ size_t offset = i * 6;
+ data[offset] = firstHalf ? 1 : channels / 2;
+ data[offset + 1] = firstHalf ? channels / 2 : channels;
+ data[offset + 2] = firstHalf ? 1 : imgSizeH / 2;
+ data[offset + 3] = firstHalf ? imgSizeH / 2 : imgSizeH;
+ data[offset + 4] = firstHalf ? 1 : imgSizeW / 2;
+ data[offset + 5] = firstHalf ? imgSizeW / 2 : imgSizeW;
+ }
+ }
+ });
+
+ compare.addOutputs(
+ BufferArg(
+ VALUE_TYPE_FLOAT, shape, testGrad ? ADD_TO : ASSIGN_TO),
+ testGrad ? ADD_TO : ASSIGN_TO);
+ compare.run();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
index 8eb700723f2cf7dda969739bb5e3d48358d278a0..6ffe4fbec643e50d27924a989875454d307f5b9b 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@@ -62,16 +62,14 @@ void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
- if (biases_) {
- LOG(FATAL) << "not implemented yet";
- }
- resetFwdBuffers(inVals_, out);
+ resetFwdBuffers(inVals_, bias, out);
in = inVals_[0];
std::shared_ptr<sum::primitive_desc> fwdPD;
- resetFwdPD(fwdPD, inVals_, out);
+  std::shared_ptr<sum::primitive_desc> biasPD;
+ resetFwdPD(fwdPD, biasPD, inVals_, bias, out);
- resetFwdPipeline(pipeline, fwdPD, inVals_, out);
+ resetFwdPipeline(pipeline, fwdPD, biasPD, inVals_, bias, out);
}
void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -79,7 +77,7 @@ void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
- resetBwdBuffers(inGrads_, out);
+ resetBwdBuffers(inGrads_, bias, out);
in = inGrads_[0];
// backward only need share output grad to input grad
@@ -89,6 +87,20 @@ void MKLDNNAddtoLayer::resetBwd(std::vector& pipeline,
inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
}
}
+
+ // backward bias
+ bwdBias_ = nullptr;
+ if (bias) {
+    std::vector<float> scales(bs_, 1.0);
+    std::vector<memory::primitive_desc> srcPDs(bs_, bias->getPrimitiveDesc());
+    auto biasPD = sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs);
+    std::vector<primitive::at> srcs;
+ for (size_t i = 0; i < grads_.size(); ++i) {
+ srcs.push_back(*(grads_[i]));
+ }
+ bwdBias_.reset(new sum(biasPD, srcs, *bias));
+ pipeline.push_back(*bwdBias_);
+ }
}
void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
@@ -97,7 +109,25 @@ void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
}
}
+void MKLDNNAddtoLayer::prepareBias(MKLDNNMatrixPtr& bias,
+ const MatrixPtr& biasMat,
+ const MKLDNNMatrixPtr& out,
+ std::vector<MKLDNNMatrixPtr>& outs) {
+ auto pd = MKLDNNMatrix::createPrimitiveDesc(
+ {(int)layerSize_}, memory::format::x, engine_);
+ bias = MKLDNNMatrix::create(pd, biasMat);
+ outs.clear();
+ real* data = out->getData();
+ CHECK_EQ(bs_ * layerSize_, out->getElementCnt());
+ for (int i = 0; i < bs_; ++i) {
+ MatrixPtr tmp =
+ Matrix::create(data + i * layerSize_, 1, layerSize_, false, false);
+ outs.push_back(MKLDNNMatrix::create(bias->getPrimitiveDesc(), tmp));
+ }
+}
+
void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
@@ -110,12 +140,20 @@ void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
}
resetOutValue(out, inputs[0]->getPrimitiveDesc());
+
+ if (biases_ && biases_->getW()) {
+ prepareBias(bias, biases_->getW(), out, vals_);
+ } else {
+ bias = nullptr;
+ }
}
void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
+ std::shared_ptr<sum::primitive_desc>& biasPD,
std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr bias,
MKLDNNMatrixPtr out) {
- std::vector<double> scales(inputs.size(), 1.0);
+ std::vector<float> scales(inputs.size(), 1.0);
std::vector<memory::primitive_desc> srcPDs;
for (size_t i = 0; i < inputs.size(); i++) {
srcPDs.push_back(inputs[i]->getPrimitiveDesc());
@@ -123,12 +161,23 @@ void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
CHECK(out);
pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+
+ biasPD = nullptr;
+ if (bias) {
+ std::vector<float> scales(2, 1.0);
+ std::vector<memory::primitive_desc> srcPDs(2, bias->getPrimitiveDesc());
+ biasPD.reset(
+ new sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs));
+ CHECK_PRIMITIVE_DESC_EQ(bias, biasPD->dst_primitive_desc());
+ }
}
void MKLDNNAddtoLayer::resetFwdPipeline(
std::vector<primitive>& pipeline,
std::shared_ptr<sum::primitive_desc>& pd,
+ std::shared_ptr<sum::primitive_desc>& biasPD,
std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
std::vector<primitive::at> srcs;
for (size_t i = 0; i < inputs.size(); i++) {
@@ -136,9 +185,23 @@ void MKLDNNAddtoLayer::resetFwdPipeline(
}
fwd_.reset(new sum(*pd, srcs, *out));
pipeline.push_back(*fwd_);
+
+ fwdBias_.clear();
+ if (biasPD == nullptr || bias == nullptr) {
+ return;
+ }
+ fwdBias_.resize(vals_.size());
+ for (size_t i = 0; i < vals_.size(); ++i) {
+ std::vector<primitive::at> srcs;
+ srcs.push_back(*(vals_[i]));
+ srcs.push_back(*bias);
+ fwdBias_[i].reset(new sum(*biasPD, srcs, *vals_[i]));
+ pipeline.push_back(*fwdBias_[i]);
+ }
}
void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
CHECK(outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
@@ -149,6 +212,12 @@ void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
}
+
+ if (biases_ && biases_->getWGrad()) {
+ prepareBias(bias, biases_->getWGrad(), out, grads_);
+ } else {
+ bias = nullptr;
+ }
}
} // namespace paddle
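For reference, the bias handling added above reduces to per-row arithmetic. The
following plain-C++ sketch (illustrative only, not part of the patch; all names
are local to it) shows what the fwdBias_ and bwdBias_ sum primitives compute,
assuming row-major storage with out of shape bs x layerSize and bias of shape
1 x layerSize:

    #include <cstddef>
    #include <vector>

    // Forward: add the shared bias row to every sample's output row. The
    // MKL-DNN code expresses this with one sum primitive per one-row view.
    void addBiasPerRow(std::vector<float>& out, const std::vector<float>& bias,
                       std::size_t bs, std::size_t layerSize) {
      for (std::size_t i = 0; i < bs; ++i)
        for (std::size_t j = 0; j < layerSize; ++j)
          out[i * layerSize + j] += bias[j];
    }

    // Backward: the bias gradient accumulates every sample's output-grad row.
    void accumulateBiasGrad(std::vector<float>& biasGrad,
                            const std::vector<float>& outGrad, std::size_t bs,
                            std::size_t layerSize) {
      for (std::size_t i = 0; i < bs; ++i)
        for (std::size_t j = 0; j < layerSize; ++j)
          biasGrad[j] += outGrad[i * layerSize + j];
    }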
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.h b/paddle/gserver/layers/MKLDNNAddtoLayer.h
index 15f74ec5bdf3d1e4ae5e09051be6be418590a67a..24504b7b4f50726e2b2757ca3029461cdc27b411 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.h
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@@ -32,9 +32,15 @@ protected:
// layer size == ic * ih * iw == oc * oh *ow, and can not be changed
size_t layerSize_;
- // TODO(TJ): this part has not been optimized by MKL-DNN
std::unique_ptr<Weight> biases_;
+ // buffers for adding bias
+ std::vector<MKLDNNMatrixPtr> vals_;
+ std::vector<MKLDNNMatrixPtr> grads_;
+ // primitives for adding bias
+ std::vector<std::shared_ptr<mkldnn::primitive>> fwdBias_;
+ std::shared_ptr<mkldnn::primitive> bwdBias_;
+
public:
explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
@@ -91,20 +97,34 @@ protected:
* reset pipeline.
*/
void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+ std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr bias,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+ std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(inputs, output, bias)
*/
void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
+
+ /**
+ * prepare the bias buffer and the per-sample output views used to add it
+ */
+ void prepareBias(MKLDNNMatrixPtr& bias,
+ const MatrixPtr& biasMat,
+ const MKLDNNMatrixPtr& out,
+ std::vector<MKLDNNMatrixPtr>& outs);
};
} // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
index 9b0ae20f089e34a719883bc65e88e33ab9334e39..ed3887cbf653878623764a310c9f364f4d8be27f 100644
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@@ -119,7 +119,7 @@ void MKLDNNBatchNormLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw);
oh = ih;
- ow = ow;
+ ow = iw;
// ic_ and oc can not be changed
CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
<< "Input channel can not be changed";
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
index 82ef344c7b2aa0093a5f0a28780592dea5d51efe..e75ac5ba4647a8267b7bc189893bd7adb5c3053f 100644
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -287,7 +287,7 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
return;
}
CHECK(out) << "should have reset internal ouput grad";
- std::vector<double> scales(outputMap_.size(), 1.0);
+ std::vector<float> scales(outputMap_.size(), 1.0);
std::vector<memory::primitive_desc> srcPDs;
std::vector<primitive::at> srcs;
for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..99cfddb0cf3337745a716a8c329713c18b99eda3
--- /dev/null
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -0,0 +1,220 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ROIPoolLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(roi_pool, ROIPoolLayer);
+
+bool ROIPoolLayer::init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) {
+ Layer::init(layerMap, parameterMap);
+
+ const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
+ pooledWidth_ = layerConf.pooled_width();
+ pooledHeight_ = layerConf.pooled_height();
+ spatialScale_ = layerConf.spatial_scale();
+
+ return true;
+}
+
+void ROIPoolLayer::forward(PassType passType) {
+ Layer::forward(passType);
+
+ const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
+ height_ = getInput(0).getFrameHeight();
+ if (!height_) height_ = layerConf.height();
+ width_ = getInput(0).getFrameWidth();
+ if (!width_) width_ = layerConf.width();
+ channels_ = getInputValue(0)->getWidth() / width_ / height_;
+
+ size_t batchSize = getInput(0).getBatchSize();
+ size_t numROIs = getInput(1).getBatchSize();
+
+ MatrixPtr dataValue = getInputValue(0);
+ MatrixPtr roiValue = getInputValue(1);
+ resetOutput(numROIs, channels_ * pooledHeight_ * pooledWidth_);
+ MatrixPtr outputValue = getOutputValue();
+
+ if (useGpu_) { // TODO(guosheng): implement on GPU later
+ MatrixPtr dataCpuBuffer;
+ Matrix::resizeOrCreate(dataCpuBuffer,
+ dataValue->getHeight(),
+ dataValue->getWidth(),
+ false,
+ false);
+ MatrixPtr roiCpuBuffer;
+ Matrix::resizeOrCreate(roiCpuBuffer,
+ roiValue->getHeight(),
+ roiValue->getWidth(),
+ false,
+ false);
+ dataCpuBuffer->copyFrom(*dataValue);
+ roiCpuBuffer->copyFrom(*roiValue);
+ dataValue = dataCpuBuffer;
+ roiValue = roiCpuBuffer;
+ MatrixPtr outputCpuBuffer;
+ Matrix::resizeOrCreate(outputCpuBuffer,
+ outputValue->getHeight(),
+ outputValue->getWidth(),
+ false,
+ false);
+ outputCpuBuffer->copyFrom(*outputValue);
+ outputValue = outputCpuBuffer;
+ }
+
+ real* bottomData = dataValue->getData();
+ size_t batchOffset = dataValue->getWidth();
+ size_t channelOffset = height_ * width_;
+ real* bottomROIs = roiValue->getData();
+ size_t roiOffset = roiValue->getWidth();
+ size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
+
+ real* outputData = outputValue->getData();
+ Matrix::resizeOrCreate(maxIdxs_,
+ numROIs,
+ channels_ * pooledHeight_ * pooledWidth_,
+ false,
+ false);
+ real* argmaxData = maxIdxs_->getData();
+
+ for (size_t n = 0; n < numROIs; ++n) {
+ // the first five elements of each RoI should be:
+ // batch_idx, roi_x_start, roi_y_start, roi_x_end, roi_y_end
+ size_t roiBatchIdx = bottomROIs[0];
+ size_t roiStartW = round(bottomROIs[1] * spatialScale_);
+ size_t roiStartH = round(bottomROIs[2] * spatialScale_);
+ size_t roiEndW = round(bottomROIs[3] * spatialScale_);
+ size_t roiEndH = round(bottomROIs[4] * spatialScale_);
+ CHECK_GE(roiBatchIdx, 0);
+ CHECK_LT(roiBatchIdx, batchSize);
+ size_t roiHeight = std::max(roiEndH - roiStartH + 1, 1UL);
+ size_t roiWidth = std::max(roiEndW - roiStartW + 1, 1UL);
+ real binSizeH =
+ static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_);
+ real binSizeW =
+ static_cast<real>(roiWidth) / static_cast<real>(pooledWidth_);
+ real* batchData = bottomData + batchOffset * roiBatchIdx;
+ for (size_t c = 0; c < channels_; ++c) {
+ for (size_t ph = 0; ph < pooledHeight_; ++ph) {
+ for (size_t pw = 0; pw < pooledWidth_; ++pw) {
+ size_t hstart = static_cast<size_t>(std::floor(ph * binSizeH));
+ size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW));
+ size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH));
+ size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW));
+ hstart = std::min(std::max(hstart + roiStartH, 0UL), height_);
+ wstart = std::min(std::max(wstart + roiStartW, 0UL), width_);
+ hend = std::min(std::max(hend + roiStartH, 0UL), height_);
+ wend = std::min(std::max(wend + roiStartW, 0UL), width_);
+
+ bool isEmpty = (hend <= hstart) || (wend <= wstart);
+ size_t poolIndex = ph * pooledWidth_ + pw;
+ if (isEmpty) {
+ outputData[poolIndex] = 0;
+ argmaxData[poolIndex] = -1;
+ }
+
+ for (size_t h = hstart; h < hend; ++h) {
+ for (size_t w = wstart; w < wend; ++w) {
+ size_t index = h * width_ + w;
+ if (batchData[index] > outputData[poolIndex]) {
+ outputData[poolIndex] = batchData[index];
+ argmaxData[poolIndex] = index;
+ }
+ }
+ }
+ }
+ }
+ batchData += channelOffset;
+ outputData += poolChannelOffset;
+ argmaxData += poolChannelOffset;
+ }
+ bottomROIs += roiOffset;
+ }
+ if (useGpu_) {
+ getOutputValue()->copyFrom(*outputValue);
+ }
+}
+
+void ROIPoolLayer::backward(const UpdateCallback& callback) {
+ MatrixPtr inGradValue = getInputGrad(0);
+ MatrixPtr outGradValue = getOutputGrad();
+ MatrixPtr roiValue = getInputValue(1);
+
+ if (useGpu_) {
+ MatrixPtr inGradCpuBuffer;
+ Matrix::resizeOrCreate(inGradCpuBuffer,
+ inGradValue->getHeight(),
+ inGradValue->getWidth(),
+ false,
+ false);
+ MatrixPtr outGradCpuBuffer;
+ Matrix::resizeOrCreate(outGradCpuBuffer,
+ outGradValue->getHeight(),
+ outGradValue->getWidth(),
+ false,
+ false);
+ MatrixPtr roiCpuBuffer;
+ Matrix::resizeOrCreate(roiCpuBuffer,
+ roiValue->getHeight(),
+ roiValue->getWidth(),
+ false,
+ false);
+ inGradCpuBuffer->copyFrom(*inGradValue);
+ outGradCpuBuffer->copyFrom(*outGradValue);
+ roiCpuBuffer->copyFrom(*roiValue);
+ inGradValue = inGradCpuBuffer;
+ outGradValue = outGradCpuBuffer;
+ roiValue = roiCpuBuffer;
+ }
+
+ real* bottomROIs = roiValue->getData();
+ size_t numROIs = getInput(1).getBatchSize();
+ size_t roiOffset = getInputValue(1)->getWidth();
+
+ real* inDiffData = inGradValue->getData();
+ size_t batchOffset = getInputValue(0)->getWidth();
+ size_t channelOffset = height_ * width_;
+
+ real* outDiffData = outGradValue->getData();
+ size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
+ real* argmaxData = maxIdxs_->getData();
+
+ for (size_t n = 0; n < numROIs; ++n) {
+ size_t roiBatchIdx = bottomROIs[0];
+ real* batchDiffData = inDiffData + batchOffset * roiBatchIdx;
+ for (size_t c = 0; c < channels_; ++c) {
+ for (size_t ph = 0; ph < pooledHeight_; ++ph) {
+ for (size_t pw = 0; pw < pooledWidth_; ++pw) {
+ size_t poolIndex = ph * pooledWidth_ + pw;
+ if (argmaxData[poolIndex] >= 0) {
+ size_t index = static_cast<size_t>(argmaxData[poolIndex]);
+ batchDiffData[index] += outDiffData[poolIndex];
+ }
+ }
+ }
+ batchDiffData += channelOffset;
+ outDiffData += poolChannelOffset;
+ argmaxData += poolChannelOffset;
+ }
+ bottomROIs += roiOffset;
+ }
+
+ if (useGpu_) {
+ getInputGrad(0)->copyFrom(*inGradValue);
+ }
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/ROIPoolLayer.h b/paddle/gserver/layers/ROIPoolLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f07e49d6fd1eda9fa7bd46e4cec771a75f571be
--- /dev/null
+++ b/paddle/gserver/layers/ROIPoolLayer.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer used by Fast R-CNN to extract feature maps of ROIs from the last
+ * feature map.
+ * - Input: This layer needs two input layers: The first input layer is a
+ * convolution layer; The second input layer contains the ROI data,
+ * which is the output of the proposal layers in Faster R-CNN used for
+ * generating bbox location offsets and classification confidence.
+ * - Output: The ROIs' feature map.
+ * Reference:
+ * Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
+ * Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
+ * Networks
+ */
+
+class ROIPoolLayer : public Layer {
+protected:
+ size_t channels_;
+ size_t width_;
+ size_t height_;
+ size_t pooledWidth_;
+ size_t pooledHeight_;
+ real spatialScale_;
+
+ // Since there is no int matrix, use a real matrix instead.
+ MatrixPtr maxIdxs_;
+
+public:
+ explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {}
+
+ bool init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) override;
+
+ void forward(PassType passType) override;
+ void backward(const UpdateCallback& callback = nullptr) override;
+};
+} // namespace paddle
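To make the bin arithmetic in ROIPoolLayer::forward concrete, here is a small
standalone sketch (illustrative only; all names are local to this sketch) of
how one pooled cell maps back onto the feature map and takes the max over its
bin:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // Computes one pooled cell (ph, pw) for one channel, mirroring the
    // hstart/hend/wstart/wend arithmetic in the forward loop above.
    float poolOneBin(const float* channel, std::size_t height, std::size_t width,
                     std::size_t roiStartH, std::size_t roiStartW,
                     float binSizeH, float binSizeW, std::size_t ph, std::size_t pw) {
      std::size_t hstart = std::min(height, roiStartH + (std::size_t)std::floor(ph * binSizeH));
      std::size_t wstart = std::min(width, roiStartW + (std::size_t)std::floor(pw * binSizeW));
      std::size_t hend = std::min(height, roiStartH + (std::size_t)std::ceil((ph + 1) * binSizeH));
      std::size_t wend = std::min(width, roiStartW + (std::size_t)std::ceil((pw + 1) * binSizeW));
      if (hend <= hstart || wend <= wstart) return 0.0f;  // empty bin
      float best = channel[hstart * width + wstart];
      for (std::size_t h = hstart; h < hend; ++h)
        for (std::size_t w = wstart; w < wend; ++w)
          best = std::max(best, channel[h * width + w]);
      return best;
    }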
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.cpp b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aa6778aef4e893208fd064ca22e217c6c4d960f9
--- /dev/null
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionLayer.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+
+REGISTER_LAYER(scale_sub_region, ScaleSubRegionLayer);
+
+bool ScaleSubRegionLayer::init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) {
+ Layer::init(layerMap, parameterMap);
+ CHECK_EQ(static_cast<int>(inputLayers_.size()), 2);
+ auto& conf = config_.inputs(0).scale_sub_region_conf();
+ value_ = conf.value();
+
+ createFunction(forward_, "ScaleSubRegion", FuncConfig().set("value", value_));
+ createFunction(
+ backward_, "ScaleSubRegionGrad", FuncConfig().set("value", value_));
+
+ return true;
+}
+
+void ScaleSubRegionLayer::forward(PassType passType) {
+ Layer::forward(passType);
+ auto in0 = getInput(0);
+ imgH_ = in0.getFrameHeight();
+ imgW_ = in0.getFrameWidth();
+ if (imgH_ == 0 || imgW_ == 0) {
+ auto& conf = config_.inputs(0).scale_sub_region_conf();
+ imgH_ = conf.image_conf().img_size_y();
+ imgW_ = conf.image_conf().img_size();
+ }
+ MatrixPtr imgV = in0.value;
+ size_t batchSize = imgV->getHeight();
+ size_t spatialSize = imgH_ * imgW_;
+ channelsNum_ = imgV->getWidth() / spatialSize;
+ shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_});
+
+ resetOutput(batchSize, imgV->getWidth());
+ auto& out = getOutput();
+ out.setFrameHeight(imgH_);
+ out.setFrameWidth(imgW_);
+
+ MatrixPtr indicesV = getInputValue(1);
+ indicesShape_ = TensorShape({batchSize, 6});
+
+ REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str());
+ BufferArgs inArgs;
+ BufferArgs outArgs;
+ inArgs.addArg(*imgV, shape_);
+ inArgs.addArg(*indicesV, indicesShape_);
+ outArgs.addArg(*out.value, shape_, ASSIGN_TO);
+ forward_[0]->calc(inArgs, outArgs);
+}
+
+void ScaleSubRegionLayer::backward(const UpdateCallback& callback) {
+ REGISTER_TIMER_INFO("ScaleSubRegionBackward", getName().c_str());
+ BufferArgs inArgs;
+ BufferArgs outArgs;
+ inArgs.addArg(*getOutputGrad(), shape_);
+ inArgs.addArg(*getInputValue(1), indicesShape_);
+ outArgs.addArg(*getInputGrad(0), shape_, ADD_TO);
+ backward_[0]->calc(inArgs, outArgs);
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.h b/paddle/gserver/layers/ScaleSubRegionLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a27c56de93bb6fdde0f95cd4c5abe5dfabe4e858
--- /dev/null
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief For each instance, this layer can be used to multiply the values
+ * in a specified continuous sub-region by a scalar. By providing the
+ * start and end indices for C/H/W, you can specify the location and
+ * shape of the region.
+ *
+ * input_0: Input value.
+ * input_1: Indices value to specify the location and shape of the
+ * region.
+ */
+class ScaleSubRegionLayer : public Layer {
+public:
+ explicit ScaleSubRegionLayer(const LayerConfig& config) : Layer(config) {}
+
+ ~ScaleSubRegionLayer() {}
+
+ bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+ void forward(PassType passType);
+
+ void backward(const UpdateCallback& callback = nullptr);
+
+protected:
+ TensorShape shape_;
+ TensorShape indicesShape_;
+ size_t imgH_;
+ size_t imgW_;
+ size_t channelsNum_;
+ real value_;
+};
+
+} // namespace paddle
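A plain-C++ reference for one sample's forward pass, assuming the 1-based,
inclusive [start, end] index convention suggested by the ScaleSubRegion unit
test earlier in this patch (illustrative only; names are local to this sketch):

    #include <cstddef>
    #include <vector>

    // Multiply a C/H/W sub-region of one sample by value; everything outside
    // the region passes through unchanged.
    void scaleSubRegion(std::vector<float>& out, const std::vector<float>& in,
                        const float* indices,  // {cs, ce, hs, he, ws, we}
                        std::size_t height, std::size_t width, float value) {
      out = in;
      for (std::size_t c = (std::size_t)indices[0] - 1; c < (std::size_t)indices[1]; ++c)
        for (std::size_t h = (std::size_t)indices[2] - 1; h < (std::size_t)indices[3]; ++h)
          for (std::size_t w = (std::size_t)indices[4] - 1; w < (std::size_t)indices[5]; ++w)
            out[(c * height + h) * width + w] *= value;
    }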
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 1a46fb49153a0aa4228f58db481b950bc2d6de83..fcbcb5b0f1f4cb07066363c9fa93fb1726459f30 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -53,7 +53,7 @@ TEST(Operator, dot_mul) {
TEST(Projection, context) {
for (auto contextStart : {-5, -3, -1, 0, 3}) {
for (auto contextLength : {1, 2, 5, 7}) {
- for (auto batchSize : {1, 2, 5, 20, 50}) {
+ for (auto batchSize : {1, 2, 5, 20}) {
for (auto trainablePadding : {false, true}) {
LOG(INFO) << " contextStart=" << contextStart
<< " contextLength=" << contextLength
@@ -585,14 +585,14 @@ TEST(Layer, maxoutLayer) {
}
void testFcLayer(string format, size_t nnz) {
TestConfig config;
- config.biasSize = 4096;
+ config.biasSize = 1024;
config.layerConfig.set_type("fc");
- config.layerConfig.set_size(4096);
+ config.layerConfig.set_size(1024);
config.layerConfig.set_active_type("sigmoid");
config.layerConfig.set_drop_rate(0.1);
config.inputDefs.push_back(
- {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
+ {INPUT_DATA, "layer_0", 2048, nnz, ParaSparse(format)});
config.layerConfig.add_inputs();
LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
@@ -609,9 +609,9 @@ void testFcLayer(string format, size_t nnz) {
}
TEST(Layer, fcLayer) {
- testFcLayer("", 4096 * 4096 * 2);
- testFcLayer("csc", 4096 * 40);
- testFcLayer("csr", 4096 * 40);
+ testFcLayer("", 1024 * 1024 * 2);
+ testFcLayer("csc", 1024 * 10);
+ testFcLayer("csr", 1024 * 10);
}
TEST(Layer, SelectiveFullyConnectedLayer) {
@@ -1995,7 +1995,7 @@ TEST(Layer, multibox_loss) {
TEST(Layer, TransLayer) {
TestConfig config;
const int height = 128;
- const int width = 1028;
+ const int width = 256;
config.layerConfig.set_type("trans");
config.layerConfig.set_size(width);
@@ -2056,6 +2056,43 @@ TEST(Layer, CropLayer) {
}
}
+TEST(Layer, roi_pool) {
+ TestConfig config;
+ config.layerConfig.set_type("roi_pool");
+ config.biasSize = 0;
+ LayerInputConfig* input = config.layerConfig.add_inputs();
+ ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf();
+ roiPoolConf->set_pooled_width(7);
+ roiPoolConf->set_pooled_height(7);
+ roiPoolConf->set_spatial_scale(1. / 16);
+ roiPoolConf->set_width(14);
+ roiPoolConf->set_height(14);
+
+ const size_t roiNum = 10;
+ const size_t roiDim = 10;
+ const size_t batchSize = 5;
+ MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false);
+ roiValue->zeroMem();
+ real* roiData = roiValue->getData();
+ for (size_t i = 0; i < roiNum; ++i) {
+ roiData[i * roiDim + 0] = std::rand() % batchSize;
+ roiData[i * roiDim + 1] = std::rand() % 224; // xMin
+ roiData[i * roiDim + 2] = std::rand() % 224; // yMin
+ size_t xMin = static_cast<size_t>(roiData[i * roiDim + 1]);
+ size_t yMin = static_cast<size_t>(roiData[i * roiDim + 2]);
+ roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin); // xMax
+ roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin); // yMax
+ }
+
+ config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
+ config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
+ config.layerConfig.add_inputs();
+
+ for (auto useGpu : {false, true}) {
+ testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false);
+ }
+}
+
TEST(Layer, SwitchOrderLayer) {
TestConfig config;
// config input_0
@@ -2358,6 +2395,38 @@ TEST(Layer, ScaleShiftLayer) {
}
}
+TEST(Layer, ScaleSubRegionLayer) {
+ const size_t batchSize = 64;
+ const size_t size = 4096;
+ TestConfig config;
+ config.layerConfig.set_type("scale_sub_region");
+ config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+ MatrixPtr indicesV = Matrix::create(batchSize, 6, false, false);
+ auto* data = indicesV->getData();
+ for (size_t i = 0; i < batchSize; ++i) {
+ data[i * 6] = 2;
+ data[i * 6 + 1] = 4;
+ data[i * 6 + 2] = 16;
+ data[i * 6 + 3] = 32;
+ data[i * 6 + 4] = 16;
+ data[i * 6 + 5] = 32;
+ }
+ config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}});
+ LayerInputConfig* input = config.layerConfig.add_inputs();
+ ScaleSubRegionConfig* scaleSubRegionConf =
+ input->mutable_scale_sub_region_conf();
+ ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf();
+ imgConf->set_img_size(32);
+ imgConf->set_img_size_y(32);
+ imgConf->set_channels(4);
+ scaleSubRegionConf->set_value(2.0);
+ config.layerConfig.add_inputs();
+
+ for (auto useGpu : {false, true}) {
+ testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false);
+ }
+}
+
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index 2e8d9f3333b36005c9b3b28449c76a4a44c74cc6..a0e039c2a33b586e21775ad06c1278a10804d654 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -269,6 +269,7 @@ void testBatchNormLayer(const testBatchNormDesc& pm) {
TEST(MKLDNNLayer, BatchNormLayer) {
testBatchNormLayer({4, 10, 6, 6});
testBatchNormLayer({16, 32, 16, 16});
+ testBatchNormLayer({4, 16, 8, 10});
}
struct testImageDesc {
@@ -300,13 +301,8 @@ void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
TestConfig dnnConfig;
getAddtoConfig(dnnConfig, pm, nInputs);
dnnConfig.layerConfig.set_type("mkldnn_addto");
- // TODO(TJ): test with bias
- for (auto withBias : {false}) {
- if (withBias) {
- dnnConfig.biasSize = pm.ic * pm.ih * pm.iw;
- } else {
- dnnConfig.biasSize = 0;
- }
+ for (auto withBias : {false, true}) {
+ dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0;
RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
}
}
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
index 21a8f73c3e650d4b3c3b86247594cd965f4ead35..a710479bab82ed52122cf59bb14a05ccbd4aa05c 100644
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -152,12 +152,7 @@ void MKLDNNMatrix::downSpatial() {
}
memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
- mkldnn_primitive_t result;
- mkldnn::error::wrap_c_api(
- mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
- "could not create a memory primitive");
- reset(result);
- set_data_handle(data_);
+ resetMKLDNNMemory(pd, data_);
}
} // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index 54cfefe23b3dc70fd12fd2ca8886c941047b59f7..39d40a1f61609a649d3341c170d24b0604921ac2 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -145,6 +145,27 @@ public:
m_.reset();
}
+ /**
+ * override the CpuMatrix::resize
+ */
+ void resize(size_t newHeight, size_t newWidth) override {
+ m_->resize(newHeight, newWidth);
+ if (data_ == m_->getData() && elementCnt_ == newHeight * newWidth) {
+ return;
+ }
+ CpuMatrix::setData(data_);
+ height_ = newHeight;
+ width_ = newWidth;
+ elementCnt_ = newHeight * newWidth;
+ stride_ = width_;
+ auto pd = mkldnn::memory::primitive_desc(
+ mkldnn::memory::desc({(int)newHeight, (int)newWidth},
+ getDtype(),
+ mkldnn::memory::format::nc),
+ getEngine());
+ resetMKLDNNMemory(pd, data_);
+ }
+
/**
* override Matrix::getData
* check data before return
@@ -215,6 +236,17 @@ protected:
memory::format srcFmt,
memory::format dstFmt,
memory::dims dm);
+ /**
* reset this MKLDNN Memory from a primitive desc
+ */
+ void resetMKLDNNMemory(memory::primitive_desc pd, real* data) {
+ mkldnn_primitive_t result;
+ mkldnn::error::wrap_c_api(
+ mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
+ "could not create a memory primitive");
+ reset(result);
+ set_data_handle(data);
+ }
private:
// save the CpuMatrixPtr in case the buffer released outside
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index c2f17beeb87942ea681f5d388659c0d280157b26..ba86eacbb5d53ee43a60d2cd1dd922333a5d48f0 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -206,7 +206,7 @@ double dotProduct(const int n, const double* x, const double* y) {
}
#endif
-#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)
+#if defined(PADDLE_USE_MKLML)
template <>
void vExp(const int n, const float* a, float* r) {
@@ -295,38 +295,6 @@ template void vAdd(const int n, const double* a, const double* b, double* r);
#endif
-#ifdef PADDLE_USE_MKL
-template <>
-void vInvSqrt(const int n, const float* a, float* r) {
- vsInvSqrt(n, a, r);
-}
-
-template <>
-void vInvSqrt(const int n, const double* a, double* r) {
- vdInvSqrt(n, a, r);
-}
-
-template <>
-void vLog1p(const int n, const float* a, float* r) {
- vsLog1p(n, a, r);
-}
-
-template <>
-void vLog1p(const int n, const double* a, double* r) {
- vdLog1p(n, a, r);
-}
-
-template <>
-void vTanh(const int n, const float* a, float* r) {
- vsTanh(n, a, r);
-}
-
-template <>
-void vTanh(const int n, const double* a, double* r) {
- vdTanh(n, a, r);
-}
-#else
-
DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
template <class T>
void vInvSqrt(const int n, const T* a, T* r) {
@@ -357,6 +325,4 @@ template void vLog1p(const int n, const double* a, double* r);
template void vTanh(const int n, const float* a, float* r);
template void vTanh(const int n, const double* a, double* r);
-#endif
-
} // namespace paddle
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index 8193aa4adffc0409d8ea68417c68fa153a2942d8..f6e77029bdd75a602f88b688ca810f47ba4ee615 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -21,11 +21,6 @@ limitations under the License. */
#include
#endif
-#ifdef PADDLE_USE_MKL
-#include <mkl.h>
-#include <mkl_lapacke.h>
-#endif
-
#if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB)
extern "C" {
#include <cblas.h>
diff --git a/paddle/math/tests/TensorCheck.h b/paddle/math/tests/TensorCheck.h
index 5bc4a03067a75527fa30e5bb5526f93dc7b9fdcc..b998e5772e70d0a0ec79dc4064dcbaa2c302efd2 100644
--- a/paddle/math/tests/TensorCheck.h
+++ b/paddle/math/tests/TensorCheck.h
@@ -169,7 +169,7 @@ void TensorCheck(AssertEq compare,
count++;
}
}
- EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+ EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
}
template
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
index d0c4c0d25d6f4e3ab7acd72d62a8a17fa102637b..1776f33105367447759aa91c25263dfc53bd2f99 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -65,7 +65,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel {
size_t num_samples = inference->dims()[0];
size_t infer_width = inference->dims()[1];
- cudaMemset((void**)&accuracy_data, 0, sizeof(float));
+ PADDLE_ENFORCE(cudaMemset(accuracy_data, 0, sizeof(float)));
if (num_samples == 0) {
return;
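The removed cudaMemset call passed the address of the host-side pointer
variable where the device pointer itself is expected, so the accuracy buffer
was never actually zeroed, and the returned error code went unchecked. A
hypothetical, self-contained illustration of the corrected pattern:

    #include <cuda_runtime.h>

    // Illustrative only, not part of the patch.
    void zeroDeviceFloat() {
      float* dev_buf = nullptr;
      cudaMalloc(&dev_buf, sizeof(float));    // cudaMalloc takes the address of the pointer
      cudaMemset(dev_buf, 0, sizeof(float));  // cudaMemset takes the device pointer itself
      cudaFree(dev_buf);
    }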
diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h
index 1968b53d19acfddaa514eca6e24d98a298d8d311..28dbc77f64842a62e88ae8df4ead7adc3b03764b 100644
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
@@ -14,7 +14,6 @@ limitations under the License. */
#pragma once
#include
-#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
@@ -22,18 +21,6 @@ namespace operators {
using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
-
template <typename Place, typename T>
class AccuracyKernel : public framework::OpKernel<T> {
public:
diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..666043e824f885e9c0e79e319d0a38ba108c209a
--- /dev/null
+++ b/paddle/operators/array_operator.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+class ArrayOp : public framework::OperatorBase {
+ public:
+ ArrayOp(const std::string &type, const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+ size_t GetOffset(const framework::Scope &scope,
+ const platform::DeviceContext &dev_ctx) const {
+ auto *i = scope.FindVar(Input("I"));
+ PADDLE_ENFORCE(i != nullptr, "I must be set");
+ auto &i_tensor = i->Get<framework::LoDTensor>();
+ PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
+ size_t offset;
+ if (platform::is_gpu_place(i_tensor.place())) {
+ // FIXME: Avoid copy from GPU to CPU
+ framework::Tensor t;
+ t.CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx);
+ dev_ctx.Wait();
+ offset = static_cast<size_t>(*t.data<int64_t>());
+ } else {
+ offset = static_cast<size_t>(*i_tensor.data<int64_t>());
+ }
+ return offset;
+ }
+};
+
+} // namespace operators
+} // namespace paddle
diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc
index 6cd9c06b8ae3d3b17be83268c2f5d4002705b111..c0903bb4e5ca7f160e19eefab99af7e3e4a8ed76 100644
--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -140,6 +140,23 @@ class ArrayToLoDTensorInferShape : public framework::InferShapeBase {
"ArrayToLoDTensorOp must has input X.");
PADDLE_ENFORCE(context->HasInput("RankTable"),
"ArrayToLoDTensorOp must has input RankTable.");
+ context->SetOutputDim("Out", context->GetInputDim("X"));
+ }
+};
+
+class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+ using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+ std::unique_ptr<framework::OpDescBind> Apply() const override {
+ auto *grad_op = new framework::OpDescBind();
+ grad_op->SetType("lod_tensor_to_array");
+ grad_op->SetInput("X", OutputGrad("Out"));
+ grad_op->SetInput("RankTable", Input("RankTable"));
+ grad_op->SetOutput("Out", InputGrad("X"));
+ grad_op->SetAttrMap(Attrs());
+ return std::unique_ptr<framework::OpDescBind>(grad_op);
}
};
@@ -149,4 +166,5 @@ class ArrayToLoDTensorInferShape : public framework::InferShapeBase {
namespace ops = paddle::operators;
REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp,
ops::ArrayToLoDTensorOpProtoMaker,
- ops::ArrayToLoDTensorInferShape);
+ ops::ArrayToLoDTensorInferShape,
+ ops::ArrayToLoDTensorGradMaker);
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index 8721ca352848fc4d69b206d4ea0ab7c581c8d055..f884e6efa917ce3f8554dce0e248f2b29273e3f3 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -19,9 +19,6 @@ namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
using EigenArrayMap =
diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..309660b01fe7052de2f9300acdf00779d0228221
--- /dev/null
+++ b/paddle/operators/chunk_eval_op.cc
@@ -0,0 +1,145 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/chunk_eval_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ChunkEvalOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ void InferShape(framework::InferShapeContext *ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("Inference"),
+ "Input(Inference) of ChunkEvalOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("Label"),
+ "Input(Label) of ChunkEvalOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Precision"),
+ "Output(Precision) of ChunkEvalOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Recall"),
+ "Output(Recall) of ChunkEvalOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("F1-Score"),
+ "Output(F1-Score) of ChunkEvalOp should not be null.");
+
+ auto inference_dim = ctx->GetInputDim("Inference");
+ auto label_dim = ctx->GetInputDim("Label");
+
+ PADDLE_ENFORCE(inference_dim == label_dim,
+ "Inference's shape must be the same as Label's shape.");
+
+ ctx->SetOutputDim("Precision", {1});
+ ctx->SetOutputDim("Recall", {1});
+ ctx->SetOutputDim("F1-Score", {1});
+ }
+
+ protected:
+ framework::OpKernelType GetKernelType(
+ const framework::ExecutionContext &ctx) const override {
+ return framework::OpKernelType(framework::DataType::FP32,
+ ctx.device_context());
+ }
+};
+
+class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ChunkEvalOpMaker(framework::OpProto *proto,
+ framework::OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("Inference",
+ "(Tensor, default: Tensor). Predictions from the network.");
+ AddInput("Label",
+ "(Tensor, default: Tensor). The true tag sequences.");
+ AddOutput("Precision",
+ "(float). The evaluated precision (called positive predictive "
+ "value) of chunks on the given mini-batch.");
+ AddOutput("Recall",
+ "(float). The evaluated recall (true positive rate or "
+ "sensitivity) of chunks on the given mini-batch.");
+ AddOutput("F1-Score",
+ "(float). The evaluated F1-Score on the given mini-batch.");
+ AddAttr("num_chunk_types",
+ "(int). The number of chunk type. See below for details.");
+ AddAttr(
+ "chunk_scheme",
+ "(string, default IOB). The labeling scheme indicating "
+ "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below "
+ "for details.")
+ .SetDefault("IOB");
+ AddAttr>("excluded_chunk_types",
+ "(list) A list including chunk type ids "
+ "indicating chunk types that are not counted. "
+ "See below for details.")
+ .SetDefault(std::vector{});
+ AddComment(R"DOC(
+For some basics of chunking, please refer to
+‘Chunking with Support Vector Machines’.
+
+
+ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
+and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
+Here is a NER example of labeling for these tagging schemes:
+
+ Li Ming works at Agricultural Bank of China in Beijing.
+ IO: I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC
+ IOB: B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC
+ IOE: I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC
+ IOBES: B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC
+
+There are three chunk types (named entity types) including PER (person), ORG (organization)
+and LOC (location), and we can see that the labels have the form <tag type>-<chunk type>.
+
+Since the calculations actually use label ids rather than labels, extra attention
+should be paid when mapping labels to ids to make ChunkEvalOp work. The key point
+is that the listed equations are satisfied by the ids.
+
+ tag_type = label % num_tag_type
+ chunk_type = label / num_tag_type
+
+where `num_tag_type` is the number of tag types in the tagging scheme, `num_chunk_type`
+is the number of chunk types, and `tag_type` gets its value from the following table.
+
+ Scheme Begin Inside End Single
+ plain 0 - - -
+ IOB 0 1 - -
+ IOE - 0 1 -
+ IOBES 0 1 2 3
+
+Still using NER as an example, assume the tagging scheme is IOB and the chunk types are ORG,
+PER and LOC. To satisfy the above equations, the label map can be like this:
+
+ B-ORG 0
+ I-ORG 1
+ B-PER 2
+ I-PER 3
+ B-LOC 4
+ I-LOC 5
+ O 6
+
+It’s not hard to verify the equations, noting that the number of chunk types
+is 3 and the number of tag types in the IOB scheme is 2. For example, the label
+id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
+I-LOC is 2, which is consistent with the results from the equations.
+)DOC");
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(chunk_eval, ops::ChunkEvalOp,
+ ops::ChunkEvalOpMaker);
+REGISTER_OP_CPU_KERNEL(chunk_eval,
+ ops::ChunkEvalKernel<platform::CPUPlace, float>);
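A worked example of the label-to-id equations from the DOC comment above, for
the IOB scheme (num_tag_type == 2) and the seven-label map it lists (sketch,
not part of the patch):

    #include <cstdio>

    int main() {
      const char* names[] = {"B-ORG", "I-ORG", "B-PER", "I-PER",
                             "B-LOC", "I-LOC", "O"};
      for (int label = 0; label <= 6; ++label) {
        int tag_type = label % 2;    // 0 = Begin, 1 = Inside
        int chunk_type = label / 2;  // 0 = ORG, 1 = PER, 2 = LOC, 3 = other ("O")
        std::printf("%-6s tag_type=%d chunk_type=%d\n", names[label], tag_type,
                    chunk_type);
      }
      return 0;
    }

Note that "O" (label 6) maps to chunk type 3, which equals num_chunk_types,
matching the `other_chunk_type = num_chunk_types` assignment in the kernel.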
diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..81aa07817b673b2ff85a35a51cc43742b7ad7fed
--- /dev/null
+++ b/paddle/operators/chunk_eval_op.h
@@ -0,0 +1,219 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <set>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename Place, typename T>
+class ChunkEvalKernel : public framework::OpKernel<T> {
+ public:
+ struct Segment {
+ int begin;
+ int end;
+ int type;
+ bool operator==(const Segment& y) const {
+ return begin == y.begin && end == y.end && type == y.type;
+ }
+ };
+
+ void GetSegments(const int* label, int length, std::vector<Segment>& segments,
+ int num_chunk_types, int num_tag_types, int other_chunk_type,
+ int tag_begin, int tag_inside, int tag_end,
+ int tag_single) const {
+ segments.clear();
+ segments.reserve(length);
+ int chunk_start = 0;
+ bool in_chunk = false;
+ int tag = -1;
+ int type = other_chunk_type;
+ for (int i = 0; i < length; ++i) {
+ int prev_tag = tag;
+ int prev_type = type;
+ PADDLE_ENFORCE_LE(label[i], num_chunk_types * num_tag_types);
+ tag = label[i] % num_tag_types;
+ type = label[i] / num_tag_types;
+ if (in_chunk && ChunkEnd(prev_tag, prev_type, tag, type, other_chunk_type,
+ tag_begin, tag_inside, tag_end, tag_single)) {
+ Segment segment{
+ chunk_start, // begin
+ i - 1, // end
+ prev_type,
+ };
+ segments.push_back(segment);
+ in_chunk = false;
+ }
+ if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type,
+ tag_begin, tag_inside, tag_end, tag_single)) {
+ chunk_start = i;
+ in_chunk = true;
+ }
+ }
+ if (in_chunk) {
+ Segment segment{
+ chunk_start, // begin
+ length - 1, // end
+ type,
+ };
+ segments.push_back(segment);
+ }
+ }
+
+ bool ChunkEnd(int prev_tag, int prev_type, int tag, int type,
+ int other_chunk_type, int tag_begin, int tag_inside,
+ int tag_end, int tag_single) const {
+ if (prev_type == other_chunk_type) return false;
+ if (type == other_chunk_type) return true;
+ if (type != prev_type) return true;
+ if (prev_tag == tag_begin) return tag == tag_begin || tag == tag_single;
+ if (prev_tag == tag_inside) return tag == tag_begin || tag == tag_single;
+ if (prev_tag == tag_end) return true;
+ if (prev_tag == tag_single) return true;
+ return false;
+ }
+
+ bool ChunkBegin(int prev_tag, int prev_type, int tag, int type,
+ int other_chunk_type, int tag_begin, int tag_inside,
+ int tag_end, int tag_single) const {
+ if (prev_type == other_chunk_type) return type != other_chunk_type;
+ if (type == other_chunk_type) return false;
+ if (type != prev_type) return true;
+ if (tag == tag_begin) return true;
+ if (tag == tag_inside) return prev_tag == tag_end || prev_tag == tag_single;
+ if (tag == tag_end) return prev_tag == tag_end || prev_tag == tag_single;
+ if (tag == tag_single) return true;
+ return false;
+ }
+
+ void Compute(const framework::ExecutionContext& context) const override {
+ // initialize to parse configurations
+ int num_chunk_types, num_tag_types;
+ int other_chunk_type;
+ int tag_begin, tag_inside, tag_end, tag_single;
+ std::vector<Segment> label_segments;
+ std::vector<Segment> output_segments;
+ std::set<int> excluded_chunk_types;
+ int64_t num_output_segments = 0;
+ int64_t num_label_segments = 0;
+ int64_t num_correct = 0;
+ if (context.Attr("chunk_scheme") == "IOB") {
+ num_tag_types = 2;
+ tag_begin = 0;
+ tag_inside = 1;
+ tag_end = -1;
+ tag_single = -1;
+ } else if (context.Attr("chunk_scheme") == "IOE") {
+ num_tag_types = 2;
+ tag_begin = -1;
+ tag_inside = 0;
+ tag_end = 1;
+ tag_single = -1;
+ } else if (context.Attr("chunk_scheme") == "IOBES") {
+ num_tag_types = 4;
+ tag_begin = 0;
+ tag_inside = 1;
+ tag_end = 2;
+ tag_single = 3;
+ } else if (context.Attr("chunk_scheme") == "plain") {
+ num_tag_types = 1;
+ tag_begin = -1;
+ tag_inside = -1;
+ tag_end = -1;
+ tag_single = -1;
+ } else {
+ PADDLE_THROW("Unknown chunk scheme.");
+ }
+ other_chunk_type = num_chunk_types = context.Attr<int>("num_chunk_types");
+ excluded_chunk_types.insert(
+ context.Attr>("excluded_chunk_types").begin(),
+ context.Attr>("excluded_chunk_types").end());
+
+ auto* inference = context.Input<LoDTensor>("Inference");
+ auto* label = context.Input<LoDTensor>("Label");
+ auto* precision = context.Output<Tensor>("Precision");
+ auto* recall = context.Output<Tensor>("Recall");
+ auto* f1 = context.Output<Tensor>("F1-Score");
+
+ const int* inference_data = inference->data<int>();
+ const int* label_data = label->data<int>();
+ T* precision_data = precision->mutable_data<T>(context.GetPlace());
+ T* recall_data = recall->mutable_data<T>(context.GetPlace());
+ T* f1_data = f1->mutable_data<T>(context.GetPlace());
+
+ auto lod = label->lod();
+ PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+ PADDLE_ENFORCE(lod == inference->lod(),
+ "LoD must be same between Inference and Label.");
+ int num_sequences = lod[0].size() - 1;
+ for (int i = 0; i < num_sequences; ++i) {
+ int seq_length = lod[0][i + 1] - lod[0][i];
+ EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length,
+ output_segments, label_segments, num_output_segments,
+ num_label_segments, num_correct, num_chunk_types,
+ num_tag_types, other_chunk_type, tag_begin, tag_inside,
+ tag_end, tag_single, excluded_chunk_types);
+ }
+ *precision_data = !num_output_segments ? 0 : static_cast<T>(num_correct) /
+ num_output_segments;
+ *recall_data = !num_label_segments ? 0 : static_cast<T>(num_correct) /
+ num_label_segments;
+ *f1_data = !num_correct ? 0 : 2 * (*precision_data) * (*recall_data) /
+ ((*precision_data) + (*recall_data));
+ }
+
+ void EvalOneSeq(const int* output, const int* label, int length,
+ std::vector<Segment>& output_segments,
+ std::vector<Segment>& label_segments,
+ int64_t& num_output_segments, int64_t& num_label_segments,
+ int64_t& num_correct, int num_chunk_types, int num_tag_types,
+ int other_chunk_type, int tag_begin, int tag_inside,
+ int tag_end, int tag_single,
+ const std::set<int>& excluded_chunk_types) const {
+ GetSegments(output, length, output_segments, num_chunk_types, num_tag_types,
+ other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
+ GetSegments(label, length, label_segments, num_chunk_types, num_tag_types,
+ other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
+ size_t i = 0, j = 0;
+ while (i < output_segments.size() && j < label_segments.size()) {
+ if (output_segments[i] == label_segments[j] &&
+ excluded_chunk_types.count(output_segments[i].type) != 1) {
+ ++num_correct;
+ }
+ if (output_segments[i].end < label_segments[j].end) {
+ ++i;
+ } else if (output_segments[i].end > label_segments[j].end) {
+ ++j;
+ } else {
+ ++i;
+ ++j;
+ }
+ }
+ for (auto& segment : label_segments) {
+ if (excluded_chunk_types.count(segment.type) != 1) ++num_label_segments;
+ }
+ for (auto& segment : output_segments) {
+ if (excluded_chunk_types.count(segment.type) != 1) ++num_output_segments;
+ }
+ }
+};
+
+} // namespace operators
+} // namespace paddle
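For clarity, the metric computation in Compute() reduces to the usual
precision = correct / predicted, recall = correct / labeled, and
F1 = 2PR / (P + R). A tiny numeric check with made-up counts (sketch, not part
of the patch):

    #include <cstdio>

    int main() {
      double num_correct = 6, num_output_segments = 8, num_label_segments = 10;
      double p = num_correct / num_output_segments;  // 0.75
      double r = num_correct / num_label_segments;   // 0.60
      double f1 = 2 * p * r / (p + r);               // about 0.667
      std::printf("P=%.3f R=%.3f F1=%.3f\n", p, r, f1);
      return 0;
    }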
diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9fc532e39500fa397be80396b075e866bad9362
--- /dev/null
+++ b/paddle/operators/clip_by_norm_op.cc
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include "paddle/operators/clip_by_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ClipByNormOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ void InferShape(framework::InferShapeContext* ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("X"),
+ "Input(X) of ClipByNormOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Out"),
+ "Output(Out) of ClipByNormOp should not be null.");
+ auto max_norm = ctx->Attrs().Get<float>("max_norm");
+ PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0.");
+ auto x_dims = ctx->GetInputDim("X");
+ ctx->SetOutputDim("Out", x_dims);
+ ctx->ShareLoD("X", /*->*/ "Out");
+ }
+};
+
+class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ClipByNormOpMaker(framework::OpProto* proto,
+ framework::OpAttrChecker* op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("X",
+ "(Tensor) The input of clip_by_norm op."
+ "The number of dimensions must be between [1, 9].");
+ AddOutput("Out",
+ "(Tensor) The output of clip_by_norm op with shape as input(X)");
+ AddAttr("max_norm", "(float) The maximum norm value.");
+ AddComment(R"DOC(
+ClipByNorm operator limits the L2 norm of the input 'X' within 'max_norm'.
+If the L2 norm of 'X' is less than or equal to 'max_norm', 'Out' will be
+the same as 'X'. If the L2 norm of 'X' is greater than 'max_norm', 'X' will
+be linearly scaled to make the L2 norm of 'Out' equal to 'max_norm', as
+shown in the following formula:
+
+'Out' = 'max_norm' * 'X' / norm('X'),
+
+where norm('X') represents the L2 norm of 'X'.
+)DOC");
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp,
+ ops::ClipByNormOpMaker);
+REGISTER_OP_CPU_KERNEL(
+ clip_by_norm, ops::ClipByNormKernel<paddle::platform::CPUPlace, float>);
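A quick numeric illustration of the formula in the DOC comment above,
Out = max_norm * X / norm(X) when norm(X) > max_norm (sketch, not part of the
patch):

    #include <cmath>
    #include <cstdio>

    int main() {
      float x[2] = {3.0f, 4.0f};  // L2 norm = 5
      float max_norm = 1.0f;
      float norm = std::sqrt(x[0] * x[0] + x[1] * x[1]);
      float scale = norm > max_norm ? max_norm / norm : 1.0f;
      std::printf("out = {%.2f, %.2f}\n", x[0] * scale, x[1] * scale);  // {0.60, 0.80}
      return 0;
    }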
diff --git a/paddle/operators/increment_op.cu b/paddle/operators/clip_by_norm_op.cu
similarity index 64%
rename from paddle/operators/increment_op.cu
rename to paddle/operators/clip_by_norm_op.cu
index f97a6c468522f033687bd83ae5b1a1bc7d86fa80..2593a24ebbf56ecd286a726e527d2414247576e8 100644
--- a/paddle/operators/increment_op.cu
+++ b/paddle/operators/clip_by_norm_op.cu
@@ -12,11 +12,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/operators/increment_op.h"
+#include "paddle/operators/clip_by_norm_op.h"
+namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
- increment,
- paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>,
- paddle::operators::IncrementKernel<paddle::platform::GPUPlace, double>,
- paddle::operators::IncrementKernel<paddle::platform::GPUPlace, int>,
- paddle::operators::IncrementKernel<paddle::platform::GPUPlace, int64_t>);
+ clip_by_norm, ops::ClipByNormKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/clip_by_norm_op.h b/paddle/operators/clip_by_norm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..b26476cae9b5b2fa290bc9186b9a64c48ba703d6
--- /dev/null
+++ b/paddle/operators/clip_by_norm_op.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class ClipByNormKernel : public framework::OpKernel<T> {
+ public:
+ void Compute(const framework::ExecutionContext& context) const override {
+ auto max_norm = context.Attr<float>("max_norm");
+ auto* input = context.Input<Tensor>("X");
+ auto* output = context.Output<Tensor>("Out");
+ output->mutable_data<T>(context.GetPlace());
+
+ auto x = EigenVector<T>::Flatten(*input);
+ auto out = EigenVector<T>::Flatten(*output);
+ auto x_norm = x.square().sum().sqrt();
+ auto place = context.GetEigenDevice<Place>();
+
+ // temp is 1 when the norm is within max_norm and 0 otherwise, so the
+ // expression below selects a scaling of 1 or max_norm / norm(X) branchlessly.
+ auto temp = (x_norm <= max_norm).template cast<T>().eval();
+ auto scaling = temp + (static_cast<T>(1) - temp) * max_norm / x_norm;
+ Eigen::array<int, 1> one_dim{{1}};
+ Eigen::DSizes<int, 1> m_dsize(input->numel());
+ out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize);
+ }
+};
+
+} // namespace operators
+} // namespace paddle
diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc
index 8b425d14df3bc484437dc72f29abf13b887006bd..716b5ee92d0d8737d2069460f53989f691ff7c77 100644
--- a/paddle/operators/compare_op.cc
+++ b/paddle/operators/compare_op.cc
@@ -14,6 +14,7 @@
#include "paddle/operators/compare_op.h"
#include "paddle/framework/op_registry.h"
+
namespace paddle {
namespace operators {
template <typename OpComment>
@@ -61,19 +62,34 @@ class CompareOpInferShape : public framework::InferShapeBase {
}
};
+class CompareOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ framework::OpKernelType GetKernelType(
+ const framework::ExecutionContext &ctx) const override {
+ framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx);
+ // CompareOp kernel's device type is decided by input tensor place
+ kt.place_ = ctx.Input("X")->place();
+ return kt;
+ }
+};
+
} // namespace operators
} // namespace paddle
-#define REGISTER_LOGICAL_OP(op_type, _equation) \
- struct _##op_type##Comment { \
- static char type[]; \
- static char equation[]; \
- }; \
- char _##op_type##Comment::type[]{#op_type}; \
- char _##op_type##Comment::equation[]{_equation}; \
- REGISTER_OP_WITH_KERNEL( \
- op_type, ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \
- ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \
+#define REGISTER_LOGICAL_OP(op_type, _equation) \
+ struct _##op_type##Comment { \
+ static char type[]; \
+ static char equation[]; \
+ }; \
+ char _##op_type##Comment::type[]{#op_type}; \
+ char _##op_type##Comment::equation[]{_equation}; \
+ REGISTER_OPERATOR( \
+ op_type, ::paddle::operators::CompareOp, \
+ ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \
+ ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \
::paddle::framework::EmptyGradOpMaker);
REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..282775fcda45fe3bbd72bf04a7ae828f2c840ab7
--- /dev/null
+++ b/paddle/operators/expand_op.cc
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/expand_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class ExpandOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ void InferShape(framework::InferShapeContext* ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
+
+    std::vector<int> expand_times =
+        ctx->Attrs().Get<std::vector<int>>("expand_times");
+ auto x_dims = ctx->GetInputDim("X");
+
+    PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims.size()), expand_times.size(),
+                      "The number of values in Attr(expand_times) must be "
+                      "equal to the rank of Input(X).");
+ PADDLE_ENFORCE_LE(x_dims.size(), 6,
+ "The rank of Input(X) must not be greater than 6.");
+
+    std::vector<int64_t> out_shape(x_dims.size());
+ for (size_t i = 0; i < expand_times.size(); ++i) {
+ PADDLE_ENFORCE_GE(expand_times[i], 1,
+ "Each value of Attr(expand_times) should not be "
+ "less than 1.");
+ out_shape[i] = x_dims[i] * expand_times[i];
+ }
+
+ ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+ if (out_shape[0] == x_dims[0]) {
+ ctx->ShareLoD("X", "Out");
+ }
+ }
+};
+
+class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]. "
+             "X is the input tensor to be expanded.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]. "
+              "The rank of Output(Out) is the same as that of Input(X); each "
+              "dimension size of Output(Out) equals the corresponding "
+              "dimension size of Input(X) multiplied by the corresponding "
+              "value of Attr(expand_times).");
+    AddAttr<std::vector<int>>("expand_times",
+                              "Expand times number for each dimension.");
+ AddComment(R"DOC(
+Expand operator tiles the input by the given number of times. You set the
+number of repetitions for each dimension through the attribute 'expand_times'.
+The rank of X should be in [1, 6], and the size of 'expand_times' must match
+the rank of X. A usage example:
+
+Input(X) is a 3-D tensor with shape [2, 3, 1]:
+
+ [
+ [[1], [2], [3]],
+ [[4], [5], [6]]
+ ]
+
+Attr(expand_times): [1, 2, 2]
+
+Output(Out) is a 3-D tensor with shape [2, 6, 2]:
+
+ [
+ [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
+ [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
+ ]
+
+)DOC");
+ }
+};
+
+class ExpandGradOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ void InferShape(framework::InferShapeContext* ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+ "Input(Out@GRAD) should not be null.");
+
+ auto x_dims = ctx->GetInputDim("X");
+    std::vector<int> expand_times =
+        ctx->Attrs().Get<std::vector<int>>("expand_times");
+ auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+ for (size_t i = 0; i < expand_times.size(); ++i) {
+      PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i],
+                        "Each dimension size of Input(Out@GRAD) should be "
+                        "equal to the product of the corresponding dimension "
+                        "size of Input(X) and the Attr(expand_times) value.");
+ }
+
+ auto x_grad_name = framework::GradVarName("X");
+
+ if (ctx->HasOutput(x_grad_name)) {
+ ctx->SetOutputDim(x_grad_name, x_dims);
+ }
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad,
+ ops::ExpandGradOp);
+REGISTER_OP_CPU_KERNEL(expand,
+                       ops::ExpandKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    expand_grad, ops::ExpandGradKernel<paddle::platform::CPUPlace, float>);
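The shape rule ExpandOp::InferShape enforces is out_shape[i] = x_dims[i] *
expand_times[i], with the two ranks equal and every repeat count at least 1.
A self-contained sketch of the same rule (expand_shape is a hypothetical
helper for illustration):

    #include <cassert>
    #include <vector>

    std::vector<int> expand_shape(const std::vector<int>& x_dims,
                                  const std::vector<int>& expand_times) {
      assert(x_dims.size() == expand_times.size());  // ranks must match
      std::vector<int> out(x_dims.size());
      for (size_t i = 0; i < x_dims.size(); ++i) {
        assert(expand_times[i] >= 1);  // each repeat count is at least 1
        out[i] = x_dims[i] * expand_times[i];
      }
      return out;
    }

    int main() {
      // Matches the DOC example: [2, 3, 1] tiled by [1, 2, 2] -> [2, 6, 2].
      assert(expand_shape({2, 3, 1}, {1, 2, 2}) == (std::vector<int>{2, 6, 2}));
    }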
diff --git a/paddle/operators/fill_constant_op.cu b/paddle/operators/expand_op.cu
similarity index 65%
rename from paddle/operators/fill_constant_op.cu
rename to paddle/operators/expand_op.cu
index bca402a8b988b570a083e9ce253342304f4b8946..6744562b6c21dd8bfeb7e4cb6b809dc7913aa3a5 100644
--- a/paddle/operators/fill_constant_op.cu
+++ b/paddle/operators/expand_op.cu
@@ -13,12 +13,11 @@
limitations under the License. */
#define EIGEN_USE_GPU
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/fill_constant_op.h"
+
+#include "paddle/operators/expand_op.h"
namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(expand,
+                       ops::ExpandKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
-    fill_constant, ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, double>,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int64_t>);
+    expand_grad, ops::ExpandGradKernel<paddle::platform::GPUPlace, float>);
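The ExpandKernel defined in expand_op.h below ultimately reduces to one Eigen
broadcast per rank; the BOOST_PP macros only generate the switch cases that
instantiate the right compile-time rank. A minimal sketch of that broadcast
using Eigen's unsupported Tensor module directly (assumes a standalone Eigen
installation; this is not Paddle code):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 2, Eigen::RowMajor> x(2, 3);
      x.setValues({{1, 2, 3}, {4, 5, 6}});
      // bcast_dims holds the per-dimension repeat counts (expand_times).
      Eigen::DSizes<Eigen::DenseIndex, 2> bcast_dims(1, 2);
      Eigen::Tensor<float, 2, Eigen::RowMajor> y = x.broadcast(bcast_dims);
      std::cout << y.dimension(0) << " x " << y.dimension(1) << "\n";  // 2 x 6
    }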
diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ae2c11a5d31dafc1b90d129054ebfabfb761bfe
--- /dev/null
+++ b/paddle/operators/expand_op.h
@@ -0,0 +1,172 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ You may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#include <boost/preprocessor/arithmetic/div.hpp>
+#include <boost/preprocessor/arithmetic/mod.hpp>
+#include <boost/preprocessor/comparison/greater.hpp>
+#include <boost/preprocessor/comparison/greater_equal.hpp>
+#include <boost/preprocessor/control/if.hpp>
+#include <boost/preprocessor/repetition/repeat.hpp>
+#include <iostream>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+#define EXPAND_TEMPLATE(z, n, data) \
+ case n + 1: { \
+    Expand<n + 1>(context); \
+ break; \
+ }
+#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~)
+#define COND(n) \
+ BOOST_PP_GREATER_EQUAL(BOOST_PP_DIV(n, MAX_RANK_SUPPORTED), \
+ BOOST_PP_MOD(n, MAX_RANK_SUPPORTED))
+#define EXPAND_GRAD_CASE(n) \
+ case n: { \
+    ExpandBackward<n>(context, reshape_dims_vec, reduce_dims_vec); \
+ break; \
+ }
+#define EXPAND_GRAD_TEMPLATE(z, n, data) \
+ BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), )
+#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~)
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class ExpandKernel : public framework::OpKernel<T> {
+ public:
+ void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+ switch (rank) {
+ REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED)
+ default:
+ PADDLE_ENFORCE(false,
+ "Only support tensor with rank being between 1 and 6.");
+ }
+ }
+
+ protected:
+  template <int Rank>
+ void Expand(const framework::ExecutionContext& context) const {
+    auto* in0 = context.Input<Tensor>("X");
+    auto& expand_times = context.Attr<std::vector<int>>("expand_times");
+    auto* out0 = context.Output<Tensor>("Out");
+    Eigen::DSizes<int, Rank> bcast_dims;
+ auto x_dims = in0->dims();
+ for (size_t i = 0; i < expand_times.size(); ++i) {
+ bcast_dims[i] = expand_times[i];
+ }
+    auto x = EigenTensor<T, Rank>::From(*in0);
+    out0->mutable_data<T>(context.GetPlace());
+    auto y = EigenTensor<T, Rank>::From(*out0);
+    auto place = context.GetEigenDevice<Place>();
+ y.device(place) = x.broadcast(bcast_dims);
+ }
+};
+
+template <typename Place, typename T>
+class ExpandGradKernel : public framework::OpKernel<T> {
+ public:
+ void Compute(const framework::ExecutionContext& context) const override {
+ auto* in0 = context.Input