Commit 945ad8d9 authored by tensor-tang

Merge remote-tracking branch 'upstream/develop' into resnet

@@ -126,7 +126,7 @@ include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/nccl)
include(cudnn) # set cudnn libraries, must before configure
......
@@ -79,9 +79,8 @@ if(NOT DEFINED IOS_ARCH)
# FIXME(liuyiqun): support "armv7;armv7s;arm64" future
set(IOS_ARCH "arm64")
elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
- set(IOS_ARCH "i386;x86_64")
- elseif(IOS_PLATFORM STREQUAL "WATCHOS")
- set(IOS_ARCH armv7k)
+ # FIXME(liuyiqun): support "i386;x86_64" future
+ set(IOS_ARCH "x86_64")
endif()
endif()
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT WITH_GPU)
return()
endif()
include(ExternalProject)
set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
- INCLUDE(ExternalProject)
- SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
- INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
+ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ if(NOT WITH_PYTHON)
+ return()
+ endif()
+ include(ExternalProject)
+ set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+ include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
ExternalProject_Add(
extern_pybind
@@ -17,14 +35,12 @@ ExternalProject_Add(
TEST_COMMAND ""
)
- if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+ if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
- file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
+ file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";")
add_library(pybind STATIC ${dummyfile})
else()
add_library(pybind INTERFACE)
endif()
add_dependencies(pybind extern_pybind)
LIST(APPEND external_project_dependencies pybind)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
# This file is use to check all support level of AVX on your machine
# so that PaddlePaddle can unleash the vectorization power of muticore.
- INCLUDE(CheckCXXSourceRuns)
- INCLUDE(CheckCXXSourceCompiles)
+ include(CheckCXXSourceRuns)
+ include(CheckCXXSourceCompiles)
- IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(MMX_FLAG "-mmmx")
set(SSE2_FLAG "-msse2")
set(SSE3_FLAG "-msse3")
- SET(AVX_FLAG "-mavx")
- SET(AVX2_FLAG "-mavx2")
+ set(AVX_FLAG "-mavx")
+ set(AVX2_FLAG "-mavx2")
- ELSEIF(MSVC)
+ elseif(MSVC)
set(MMX_FLAG "/arch:MMX")
set(SSE2_FLAG "/arch:SSE2")
set(SSE3_FLAG "/arch:SSE3")
SET(AVX_FLAG "/arch:AVX")
SET(AVX2_FLAG "/arch:AVX2")
- ENDIF()
+ endif()
set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
# Check MMX
set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
+ set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <mmintrin.h>
int main()
@@ -32,6 +33,7 @@ int main()
# Check SSE2
set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
+ set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <emmintrin.h>
int main()
@@ -42,6 +44,7 @@ int main()
# Check SSE3
set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
+ set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <pmmintrin.h>
int main()
@@ -55,6 +58,7 @@ int main()
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+ set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
@@ -67,6 +71,7 @@ int main()
# Check AVX 2
set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+ set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
......
@@ -145,7 +145,7 @@ Whenever a new version of PaddlePaddle is released, the corresponding production image is also published
Jupyter Notebook is an open-source web application for creating and sharing interactive documents that contain code, equations, charts, and text. Users can read these documents in a web browser.
- PaddlePaddle Book is an interactive Jupyter Nodebook made for users and developers.
+ PaddlePaddle Book is an interactive Jupyter Notebook made for users and developers.
If you want to learn deep learning in more depth, PaddlePaddle Book is definitely your best choice.
We provide a Docker image that runs PaddlePaddle Book directly; just run:
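The exact command is elided above. As a hedged sketch, assuming the book image is published as `paddlepaddle/book` and that Jupyter listens on port 8888 inside the container (both assumptions, not taken from this page), the invocation would look like:

```bash
# Hypothetical image name and port; adjust to the actual release.
docker run -p 8888:8888 paddlepaddle/book
```

After the container starts, opening http://localhost:8888 in a browser shows the notebooks.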
......
@@ -63,7 +63,7 @@
</tr>
<tr>
- <td class="left" rowspan="15">训练</td><td class="left">dot_period</td>
+ <td class="left" rowspan="14">训练</td><td class="left">dot_period</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
......
@@ -8,3 +8,4 @@ PaddlePaddle 文档
howto/index_cn.rst
api/index_cn.rst
faq/index_cn.rst
mobile/index_cn.rst
@@ -7,3 +7,4 @@ PaddlePaddle Documentation
getstarted/index_en.rst
howto/index_en.rst
api/index_en.rst
mobile/index_en.rst
@@ -20,10 +20,32 @@ $ docker build -t username/paddle-android:dev . -f Dockerfile.android
Once the development image is built, it can be used to build the Android version of the PaddlePaddle C-API library.
The Android Docker development image exposes two configurable arguments:
- | Argument | Optional Values | Default |
- |-----------------|-------------------------|---------|
- |`ANDROID_ABI` |`armeabi-v7a, arm64-v8a` | `armeabi-v7a` |
- |`ANDROID_API` |`>= 21` | `21` |
+ <table class="docutils">
+ <colgroup>
+ <col width="25%" />
+ <col width="50%" />
+ <col width="25%" />
+ </colgroup>
+ <thead valign="bottom">
+ <tr class="row-odd">
+ <th class="head">Argument</th>
+ <th class="head">Optional Values</th>
+ <th class="head">Default</th>
+ </tr>
+ </thead>
+ <tbody valign="top">
+ <tr class="row-even">
+ <td>ANDROID_ABI</td>
+ <td>armeabi-v7a, arm64-v8a</td>
+ <td>armeabi-v7a</td>
+ </tr>
+ <tr class="row-odd">
+ <td>ANDROID_API</td>
+ <td>>= 21</td>
+ <td>21</td>
+ </tr>
+ </tbody>
+ </table>
- Build the PaddlePaddle library for `armeabi-v7a` and `Android API 21`
```bash
......
@@ -26,10 +26,32 @@ $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_A
The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
- | Argument | Optional Values | Default |
- |-----------------|-------------------------|---------|
- |`ANDROID_ABI` |`armeabi-v7a, arm64-v8a` | `armeabi-v7a` |
- |`ANDROID_API` |`>= 21` | `21` |
+ <table class="docutils">
+ <colgroup>
+ <col width="25%" />
+ <col width="50%" />
+ <col width="25%" />
+ </colgroup>
+ <thead valign="bottom">
+ <tr class="row-odd">
+ <th class="head">Argument</th>
+ <th class="head">Optional Values</th>
+ <th class="head">Default</th>
+ </tr>
+ </thead>
+ <tbody valign="top">
+ <tr class="row-even">
+ <td>ANDROID_ABI</td>
+ <td>armeabi-v7a, arm64-v8a</td>
+ <td>armeabi-v7a</td>
+ </tr>
+ <tr class="row-odd">
+ <td>ANDROID_API</td>
+ <td>>= 21</td>
+ <td>21</td>
+ </tr>
+ </tbody>
+ </table>
The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
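As a minimal sketch of passing both arguments to the development image built above (`username/paddle-android:dev`), assuming `ANDROID_API=21` completes the truncated command shown in the hunk header:

```bash
# Sketch only: the ANDROID_API value is an assumption; pick the ABI/API for your target.
docker run -it --rm -v $PWD:/paddle \
  -e "ANDROID_ABI=arm64-v8a" \
  -e "ANDROID_API=21" \
  username/paddle-android:dev
```

Because `arm64-v8a` requires API level 21 or higher, lowering `ANDROID_API` below 21 only makes sense together with `armeabi-v7a`.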
......
@@ -27,10 +27,28 @@ Optional configuration arguments for the iOS platform:
- `SIMULATOR`: build for the simulator platform with the `x86` architecture.
- `IOS_ARCH`: the target architecture. The architectures that can be set for each `IOS_PLATFORM` are listed in the table below:
- | IOS_PLATFORM | IOS_ARCH |
- |--------------|----------------------|
- | OS | armv7, armv7s, arm64 (default) |
- | SIMULATOR | i386, x86_64 (default) |
+ <table class="docutils">
+ <colgroup>
+ <col width="35%" />
+ <col width="65%" />
+ </colgroup>
+ <thead valign="bottom">
+ <tr class="row-odd">
+ <th class="head">IOS_PLATFORM</th>
+ <th class="head">IOS_ARCH</th>
+ </tr>
+ </thead>
+ <tbody valign="top">
+ <tr class="row-even">
+ <td>OS</td>
+ <td>armv7, armv7s, arm64 (default)</td>
+ </tr>
+ <tr class="row-odd">
+ <td>SIMULATOR</td>
+ <td>i386, x86_64 (default)</td>
+ </tr>
+ </tbody>
+ </table>
- `IOS_DEPLOYMENT_TARGET`: the minimum iOS deployment version; the default is `7.0`.
- `IOS_ENABLE_BITCODE`: whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3); can be set to `ON/OFF`, and the default is `ON`.
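Putting these options together, a hedged sketch of a configure step follows; only the `IOS_*` options documented above are taken from this page, while `-DCMAKE_SYSTEM_NAME=iOS` and the out-of-source build layout are assumptions:

```bash
# Sketch only: toolchain selection and directory layout are assumed.
mkdir -p build-ios && cd build-ios
cmake .. -DCMAKE_SYSTEM_NAME=iOS \
         -DIOS_PLATFORM=OS \
         -DIOS_ARCH="arm64" \
         -DIOS_DEPLOYMENT_TARGET=7.0 \
         -DIOS_ENABLE_BITCODE=ON
```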
......
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_cn.md
cross_compiling_for_ios_cn.md
cross_compiling_for_raspberry_cn.md
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_en.md
cross_compiling_for_raspberry_en.md
@@ -29,32 +29,32 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
add_dependencies(paddle_capi paddle_proto)
# TODO: paddle_capi_whole will be removed.
set(PADDLE_CAPI_LAYERS_LIBS
paddle_function
paddle_gserver)
if(MOBILE_INFERENCE)
- set(PADDLE_CAPI_INFER_LIBS
+ set(PADDLE_CAPI_ENGINE_LIBS
paddle_utils
paddle_parameter
paddle_math
paddle_cuda
- paddle_function
- paddle_gserver
- paddle_proto)
+ paddle_proto)
else()
- set(PADDLE_CAPI_INFER_LIBS
+ set(PADDLE_CAPI_ENGINE_LIBS
paddle_utils
paddle_parameter
paddle_math
paddle_cuda
- paddle_function
- paddle_gserver
- paddle_proto
- paddle_pserver
- paddle_network)
+ paddle_proto
+ paddle_pserver
+ paddle_network)
endif()
set(PADDLE_CAPI_INFER_LIBS ${PADDLE_CAPI_LAYERS_LIBS} ${PADDLE_CAPI_ENGINE_LIBS})
cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
# Link the static library for inference
- cc_library(paddle_capi_engine DEPS paddle_capi paddle_utils paddle_parameter paddle_math paddle_cuda paddle_proto)
+ cc_library(paddle_capi_engine DEPS paddle_capi ${PADDLE_CAPI_ENGINE_LIBS})
- cc_library(paddle_capi_layers DEPS paddle_function paddle_gserver)
+ cc_library(paddle_capi_layers DEPS ${PADDLE_CAPI_LAYERS_LIBS})
# Link the shared library for inference
if(NOT IOS)
......
@@ -140,19 +140,9 @@ Similarly, the lengths in the top level LoD
are transformed into offsets of elements/words as follows:
```
- 0 9 10 15
- = = =
- 3+2+4 1+9 2+3+10
+ 0 3 4 6
+ = = =
+ 3 3+1 4+2
- ```
- so we can tell that the first article is from word 0 to word 9, and the second article is from word 9 to word 10.
- The complete offset representation is as follows:
- ```
- 0 9 10 15
- 0 3 5 9 10 12 15
- ||| || |||| | || |||
```
## Slicing of LoD Tensors
......
@@ -67,8 +67,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
out);
in_var->SetLoDLevel(out_var->GetLodLevel());
}
bool IsRuntime() const override;
protected:
VarDesc::VarType GetVarType(const std::string &name) const override;
private:
DDim GetDim(const std::string &name) const override;
void SetDim(const std::string &name, const DDim &dim) override;
@@ -349,6 +352,9 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
info.infer_var_type_(*this, block);
} else {
// all output type is LoDTensor by default
VLOG(10) << this->Type()
<< " has not registered InferVarType. Set output variables to "
"LOD_TENSOR";
for (auto &out_pair : this->outputs_) {
for (auto &out_var_name : out_pair.second) {
block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR);
@@ -448,6 +454,12 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
const DDim &dim) {
block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
}
bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
const std::string &name) const {
return block_.FindVarRecursive(name)->GetType();
}
} // namespace framework
} // namespace paddle
@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/framework/operator.h"
#include <algorithm>
#include <atomic>
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h"
namespace paddle {
namespace framework {
@@ -365,7 +367,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
out_tensor->set_lod(in_tensor.lod());
}
- private:
+ bool IsRuntime() const override { return true; }
protected:
DDim GetDim(const std::string& name) const override {
Variable* var = scope_.FindVar(name);
if (var->IsType<LoDTensor>()) {
@@ -388,6 +392,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
}
}
VarDesc::VarType GetVarType(const std::string& name) const override {
auto* var = scope_.FindVar(name);
return ToVarType(var->Type());
}
private:
const OperatorBase& op_;
const Scope& scope_;
};
......
@@ -46,6 +46,23 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
SetDim(names[i], dims[i]);
}
}
std::vector<VarDesc::VarType> InferShapeContext::GetInputsVarType(
const std::string &name) const {
return GetVarTypes(Inputs(name));
}
std::vector<VarDesc::VarType> InferShapeContext::GetOutputsVarType(
const std::string &name) const {
return GetVarTypes(Outputs(name));
}
std::vector<VarDesc::VarType> InferShapeContext::GetVarTypes(
const std::vector<std::string> &names) const {
std::vector<VarDesc::VarType> retv;
retv.resize(names.size());
std::transform(names.begin(), names.end(), retv.begin(),
std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,
std::placeholders::_1));
return retv;
}
} // namespace framework
} // namespace paddle
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/framework/attribute.h"
#include "paddle/framework/ddim.h"
#include "paddle/framework/framework.pb.h"
namespace paddle {
namespace framework {
@@ -26,6 +27,10 @@ class InferShapeContext {
virtual bool HasInput(const std::string &name) const = 0;
virtual bool HasOutput(const std::string &name) const = 0;
std::vector<VarDesc::VarType> GetInputsVarType(const std::string &name) const;
std::vector<VarDesc::VarType> GetOutputsVarType(
const std::string &name) const;
virtual bool HasInputs(const std::string &name) const = 0;
virtual bool HasOutputs(const std::string &name) const = 0;
@@ -46,6 +51,8 @@ class InferShapeContext {
virtual void ShareLoD(const std::string &in, const std::string &out,
size_t i = 0, size_t j = 0) const = 0;
virtual bool IsRuntime() const = 0;
protected:
virtual framework::DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
@@ -55,6 +62,11 @@ class InferShapeContext {
void SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims);
std::vector<VarDesc::VarType> GetVarTypes(
const std::vector<std::string> &names) const;
virtual VarDesc::VarType GetVarType(const std::string &name) const = 0;
};
} // namespace framework
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
namespace paddle {
namespace framework {
inline VarDesc::VarType ToVarType(std::type_index type) {
if (type.hash_code() == typeid(LoDTensor).hash_code()) {
return VarDesc_VarType_LOD_TENSOR;
} else if (type.hash_code() == typeid(LoDRankTable).hash_code()) {
return VarDesc_VarType_LOD_RANK_TABLE;
} else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
return VarDesc_VarType_LOD_TENSOR_ARRAY;
} else {
PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
}
}
} // namespace framework
} // namespace paddle
@@ -48,6 +48,11 @@ class Variable {
void Clear() { holder_.reset(); }
std::type_index Type() const {
PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory");
return holder_->Type();
}
private:
struct Placeholder {
virtual ~Placeholder() {}
......
@@ -17,7 +17,7 @@ limitations under the License. */
namespace paddle {
- ThreadLocalD<std::vector<MemoryHandle *>> ConvBaseProjection::convMem_;
+ ThreadLocalD<std::vector<MemoryHandlePtr>> ConvBaseProjection::convMem_;
ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
ParameterPtr parameter,
@@ -175,18 +175,18 @@ void ConvBaseProjection::reshape(int batchSize) {
}
void *ConvBaseProjection::getSpaceBytes(size_t size) {
- std::vector<MemoryHandle *> &convMem = *convMem_;
+ std::vector<MemoryHandlePtr> &convMem = *convMem_;
if (convMem.empty()) {
int numDevices = hl_get_device_count();
convMem.resize(numDevices);
}
int devId = hl_get_device();
- MemoryHandle **localMem = &(convMem[devId]);
- if (NULL == *localMem || size > (*localMem)->getAllocSize()) {
- *localMem = new GpuMemoryHandle(size);
+ MemoryHandlePtr localMem = convMem[devId];
+ if (NULL == localMem || size > localMem->getAllocSize()) {
+ localMem = std::make_shared<GpuMemoryHandle>(size);
}
- return (*localMem)->getBuf();
+ return localMem->getBuf();
}
ConvBaseProjection::~ConvBaseProjection() {
......
@@ -105,7 +105,7 @@ protected:
bool bias_;
std::unique_ptr<Weight> weight_;
- static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
+ static ThreadLocalD<std::vector<MemoryHandlePtr>> convMem_;
};
} // namespace paddle
@@ -98,8 +98,19 @@ void SubSequenceLayer::forward(PassType passType) {
CHECK_EQ(numSequences2, numSequences3);
MatrixPtr inputValue = input.value;
- IVectorPtr offsetValue = offsetSeq.ids;
- IVectorPtr sizeValue = sizeSeq.ids;
+ IVectorPtr offsetValue;
+ IVectorPtr sizeValue;
+ if (useGpu_) {
+ // copy to cpu
+ IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+ IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+ offsetValue->copyFrom(*offsetSeq.ids);
+ sizeValue->copyFrom(*sizeSeq.ids);
+ } else {
+ offsetValue = offsetSeq.ids;
+ sizeValue = sizeSeq.ids;
+ }
CHECK_EQ(offsetValue->getSize(), numSequences1);
CHECK_EQ(sizeValue->getSize(), numSequences1);
@@ -176,8 +187,21 @@ void SubSequenceLayer::backward(const UpdateCallback& callback) {
size_t numSequences1 = startPositions1->getSize() - 1;
const int* starts1 = startPositions1->getData();
- IVectorPtr offsetValue = getInput(1).ids;
- IVectorPtr sizeValue = getInput(2).ids;
+ const Argument& offsetSeq = getInput(1);
+ const Argument& sizeSeq = getInput(2);
+ IVectorPtr offsetValue;
+ IVectorPtr sizeValue;
+ if (useGpu_) {
+ // copy to cpu
+ IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+ IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+ offsetValue->copyFrom(*offsetSeq.ids);
+ sizeValue->copyFrom(*sizeSeq.ids);
+ } else {
+ offsetValue = offsetSeq.ids;
+ sizeValue = sizeSeq.ids;
+ }
int* offsets = offsetValue->getData();
int* sizes = sizeValue->getData();
......
@@ -69,6 +69,20 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
endif()
# conv_op contains several operators
if ("${TARGET}" STREQUAL "conv_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
endif()
# conv_transpose_op contains several operators
if ("${TARGET}" STREQUAL "conv_transpose_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
endif()
# pool_cudnn_op contains several operators
if ("${TARGET}" STREQUAL "pool_cudnn_op")
set(pybind_flag 1)
@@ -96,7 +110,7 @@ function(op_library TARGET)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
endif()
# reduce_op contains several operators
if ("${TARGET}" STREQUAL "reduce_op")
set(pybind_flag 1)
@@ -104,6 +118,11 @@
file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
endif()
if ("${TARGET}" STREQUAL "tensor_array_read_write_op")
set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n")
endif()
# pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
file(READ ${TARGET}.cc TARGET_CONTENT)
@@ -139,26 +158,38 @@ set(DEPS_OPS
sum_op
pool_op
pool_with_index_op
conv_op
lstm_op
conv_transpose_op
nccl_op
sequence_conv_op
sequence_pool_op
lod_rank_table_op
- lstm_op)
+ lstm_op
tensor_array_read_write_op
gru_op)
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(conv_op DEPS vol2col)
op_library(sum_op DEPS net_op selected_rows_functor)
op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
if(WITH_GPU)
op_library(nccl_op DEPS nccl_common)
endif()
op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
DEPS net_op tensor_array)
op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
op_library(${src})
......
@@ -12,7 +12,7 @@
See the License for the specific language governing permissions and
limitations under the License. */
- #include "paddle/operators/conv2d_transpose_op.h"
+ #include "paddle/operators/conv_transpose_op.h"
namespace paddle {
namespace operators {
@@ -38,13 +38,13 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
- REGISTER_OP(conv2d_transpose_cudnn, ops::Conv2DTransposeOp,
+ REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp,
ops::CudnnConv2DTransposeOpMaker, conv2d_transpose_cudnn_grad,
- ops::Conv2DTransposeOpGrad);
+ ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
conv2d_transpose_cudnn,
- ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
+ ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv2d_transpose_cudnn_grad,
- ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
+ ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
@@ -15,7 +15,7 @@
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/memory/memory.h"
- #include "paddle/operators/conv2d_transpose_op.h"
+ #include "paddle/operators/conv_transpose_op.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/cudnn_helper.h"
......
@@ -12,7 +12,7 @@
See the License for the specific language governing permissions and
limitations under the License. */
- #include "paddle/operators/conv2d_op.h"
+ #include "paddle/operators/conv_op.h"
namespace paddle {
namespace operators {
@@ -38,10 +38,11 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
- REGISTER_OP(conv_cudnn, ops::Conv2DOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
- ops::Conv2DOpGrad);
- REGISTER_OP_CPU_KERNEL(
- conv_cudnn, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
+ REGISTER_OP(conv_cudnn, ops::ConvOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
+ ops::ConvOpGrad);
+ REGISTER_OP_CPU_KERNEL(conv_cudnn,
+ ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv_cudnn_grad,
- ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
+ ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
@@ -15,7 +15,7 @@
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/memory/memory.h"
- #include "paddle/operators/conv2d_op.h"
+ #include "paddle/operators/conv_op.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/cudnn_helper.h"
......
@@ -12,18 +12,18 @@
See the License for the specific language governing permissions and
limitations under the License. */
- #include "paddle/operators/conv2d_op.h"
+ #include "paddle/operators/conv_op.h"
namespace paddle {
namespace operators {
- void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
+ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("Input"),
- "Input(Input) of Conv2DOp should not be null.");
+ "Input(Input) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Filter"),
- "Input(Filter) of Conv2DOp should not be null.");
+ "Input(Filter) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"),
- "Output(Output) of Conv2DOp should not be null.");
+ "Output(Output) of ConvOp should not be null.");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
@@ -33,8 +33,17 @@ void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
int input_channels = in_dims[1];
int output_channels = filter_dims[0];
- PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D.");
- PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D.");
+ PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
+ "Conv intput should be 4-D or 5-D tensor.");
PADDLE_ENFORCE_EQ(
in_dims.size(), filter_dims.size(),
"Conv input dimension and filter dimension should be the same.");
PADDLE_ENFORCE(
in_dims.size() - strides.size() == 2U,
"Conv input dimension and strides dimension should be consistent.");
PADDLE_ENFORCE_EQ(
paddings.size(), strides.size(),
"Conv paddings dimension and Conv strides dimension should be the same.");
PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
"The number of input channels should be equal to filter "
"channels * groups.");
@@ -42,12 +51,12 @@ void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
output_channels % groups, 0,
"The number of output channels should be divided by groups.");
- auto output_height =
- OutputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]);
- auto output_width =
- OutputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]);
- ctx->SetOutputDim("Output",
- {in_dims[0], filter_dims[0], output_height, output_width});
+ std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+ for (size_t i = 0; i < paddings.size(); ++i) {
+ output_shape.push_back(OutputSize(in_dims[i + 2], filter_dims[i + 2],
+ paddings[i], strides[i]));
+ }
+ ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
}
Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
@@ -55,19 +64,19 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"Input",
- "The input tensor of convolution operator. "
+ "(Tensor) The input tensor of convolution operator. "
"The format of input tensor is NCHW, where N is batch size, C is the "
- "number of channels, H is the height of the image, "
- "and W is the width of the image.");
+ "number of channels, H is the height of the feature, "
+ "and W is the width of the feature.");
AddInput("Filter",
- "The filter tensor of convolution operator. "
+ "(Tensor) The filter tensor of convolution operator. "
"The format of the filter tensor is MCHW, where M is the number of "
"output image channels, C is the number of input image channels, "
"H is the height of the filter, and W is the width of the filter. "
"If the groups attribute is greater than 1, C equals the number of "
"input image channels divided by the groups.");
AddOutput("Output",
- "The output tensor of convolution operator. "
+ "(Tensor) The output tensor of convolution operator. "
"The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>("strides", "strides of convolution operator.")
.SetDefault({1, 1});
@@ -75,7 +84,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
.SetDefault({0, 0});
AddAttr<int>(
"groups",
- "Group size of convolution operator. "
+ "(int default:1), the group size of convolution operator. "
"According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
"when group=2, the first half of the filters is only connected to the "
"first half of the input channels, while the second half of the filters "
@@ -84,14 +93,91 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
AddComment(R"DOC(
Convolution Operator.
- The convolution operation calculates the output based on the input, filter,
- strides, paddings, and groups parameters. The size of each dimension of the
- parameters is checked in the infer-shape method.
+ The convolution operation calculates the output based on the input, filter
+ and strides, paddings, groups parameters. The size of each dimension of the
+ parameters is checked in the infer-shape.
Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
size, C is the number of channels, H is the height of the feature, and W is
the width of the feature. Parameters(ksize, strides, paddings) are two elements.
These two elements represent height and width, respectively.
The input(X) size and output(Out) size may be different.
Example:
Input:
Input shape: (N, C_in, H_in, W_in)
Filter shape: (C_out, C_in, H_f, W_f)
Output:
Output shape: (N, C_out, H_out, W_out)
where
H_out = (H_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
W_out = (W_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
)DOC");
}
Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"Input",
"(Tensor) The input tensor of convolution operator. "
"The format of input tensor is NCDHW. Where N is batch size, C is the "
"number of channels, D is the depth of the feature, H is the height of "
"the feature, "
"and W is the width of the feature.");
AddInput("Filter",
"(Tensor) The filter tensor of convolution operator. "
"The format of the filter tensor is MCDHW, where M is the number of "
"output image channels, C is the number of input image channels, "
"D is the depth of the filter, H is the height of the filter, and W "
"is the width of the filter."
"If the groups attribute is greater than 1, C equals the number of "
"input image channels divided by the groups.");
AddOutput("Output",
"(Tensor) The output tensor of convolution operator."
"The format of output tensor is also NCDHW.");
AddAttr<std::vector<int>>(
"strides",
"(vector, default:{0, 0, 0}), the strides of convolution operator.")
.SetDefault({1, 1, 1});
AddAttr<std::vector<int>>(
"paddings",
"(vector, default:{0, 0, 0}), the paddings of convolution operator.")
.SetDefault({0, 0, 0});
AddAttr<int>(
"groups",
"(int default:1), the group size of convolution operator. "
"According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
"when group=2, the first half of the filters is only connected to the "
"first half of the input channels, while the second half of the filters "
"is only connected to the second half of the input channels.")
.SetDefault(1);
AddComment(R"DOC(
Convolution3D Operator.
The convolution operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
size, C is the number of channels,D is the depth of the feature, H is the height of
the feature, and W is the width of the feature. Parameters(ksize, strides, paddings)
are three elements. These three elements represent depth, height and width, respectively.
The input(X) size and output(Out) size may be different.
Example:
Input:
Input shape: (N, C_in, D_in, H_in, W_in)
Filter shape: (C_out, C_in, D_f, H_f, W_f)
Output:
Output shape: (N, C_out, D_out, H_out, W_out)
where
D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1;
)DOC"); )DOC");
} }
void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const { void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
auto in_dims = ctx->GetInputDim("Input"); auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter"); auto filter_dims = ctx->GetInputDim("Filter");
if (ctx->HasOutput(framework::GradVarName("Input"))) { if (ctx->HasOutput(framework::GradVarName("Input"))) {
...@@ -106,10 +192,18 @@ void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const { ...@@ -106,10 +192,18 @@ void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(conv2d, ops::Conv2DOp, ops::Conv2DOpMaker, conv2d_grad, REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
ops::Conv2DOpGrad); ops::ConvOpGrad);
namespace ops = paddle::operators;
REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
ops::ConvOpGrad);
REGISTER_OP_CPU_KERNEL(conv2d,
ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
- conv2d, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
+ conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(conv3d,
ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
- conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
+ conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
@@ -12,11 +12,16 @@
See the License for the specific language governing permissions and
limitations under the License. */
- #include "paddle/operators/conv2d_op.h"
+ #include "paddle/operators/conv_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(conv2d,
ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
- conv2d, ops::GemmConv2DKernel<paddle::platform::GPUPlace, float>);
+ conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(conv3d,
ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
- conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::GPUPlace, float>);
+ conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/im2col.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/vol2col.h"
namespace paddle {
namespace operators {
@@ -40,14 +41,20 @@ class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
framework::OpAttrChecker* op_checker);
};
- class Conv2DOp : public framework::OperatorWithKernel {
+ class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
public:
Conv3DOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker);
};
class ConvOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
};
- class Conv2DOpGrad : public framework::OperatorWithKernel {
+ class ConvOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
@@ -55,7 +62,7 @@ class Conv2DOpGrad : public framework::OperatorWithKernel {
};
template <typename Place, typename T>
- class GemmConv2DKernel : public framework::OpKernel<T> {
+ class GemmConvKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
@@ -70,51 +77,78 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
int groups = context.Attr<int>("groups");
- int batch_size = input->dims()[0];
- int input_channels = input->dims()[1];
- int filter_height = filter.dims()[filter.dims().size() - 2];
- int filter_width = filter.dims()[filter.dims().size() - 1];
- int output_channels = output->dims()[1];
- int output_height = output->dims()[2];
- int output_width = output->dims()[3];
+ const int batch_size = static_cast<int>(input->dims()[0]);
+ // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+ std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+ filter_shape_vec.erase(filter_shape_vec.begin(),
+ filter_shape_vec.begin() + 2);
+ // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
+ std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+ output_shape_vec.erase(output_shape_vec.begin(),
+ output_shape_vec.begin() + 2);
- paddle::operators::math::Im2ColFunctor<
- paddle::operators::math::ColFormat::kCFO, Place, T>
- im2col;
// use col_shape in the im2col calculation
- framework::DDim col_shape = {input_channels / groups, filter_height,
- filter_width, output_height, output_width};
+ // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
+ // o_h, o_w}
std::vector<int64_t> col_shape_vec;
col_shape_vec.push_back(input->dims()[1] / groups);
col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
filter_shape_vec.end());
col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
output_shape_vec.end());
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
- framework::DDim col_matrix_shape = {
- input_channels / groups * filter_height * filter_width,
- output_height * output_width};
+ // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d *
+ // o_h * o_w)
+ framework::DDim col_matrix_shape =
+ framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace());
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
- Tensor col_matrix = col;
+ Tensor col_matrix;
+ col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
- framework::DDim input_shape = {input->dims()[1], input->dims()[2],
- input->dims()[3]};
+ framework::DDim input_shape = framework::slice_ddim(
+ input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
- framework::DDim output_matrix_shape = {output_channels,
- output_height * output_width};
- // convolution operator: im2col + gemm
- int in_step = input_channels / groups;
- int out_step = output_channels / groups;
+ framework::DDim output_matrix_shape = {
+ output->dims()[1],
+ output->numel() / (output->dims()[0] * output->dims()[1])};
+ // convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
- // im2col
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
- im2col(context.device_context(), in_slice, col, strides[0], strides[1],
- paddings[0], paddings[0], paddings[1], paddings[1]);
+ if (filter_shape_vec.size() == 2) {
// im2col
math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
im2col(context.device_context(), in_slice, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1],
paddings[1]);
} else if (filter_shape_vec.size() == 3) {
// vol2col
math::Vol2ColFunctor<Place, T> vol2col;
vol2col(context.device_context(), in_slice, col, strides[0],
strides[1], strides[2], paddings[0], paddings[1],
paddings[2]);
}
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
...@@ -127,7 +161,7 @@ class GemmConv2DKernel : public framework::OpKernel<T> { ...@@ -127,7 +161,7 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
}; };
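The forward kernel above lowers convolution to im2col (or vol2col for 3-D input) followed by one GEMM per group: the col buffer of shape {i_c/g, k_h, k_w, o_h, o_w} is viewed as a matrix with i_c/g * k_h * k_w rows and o_h * o_w columns, and each per-group filter slice of shape (o_c/g, i_c/g * k_h * k_w) is multiplied against it. The standalone sketch below only reproduces that shape bookkeeping; the concrete sizes are made-up examples, not values taken from the operator.

// Hedged sketch of the im2col + gemm shape bookkeeping used above.
// All sizes below are illustrative and unrelated to any real model.
#include <cstdio>

int main() {
  const int i_c = 8, groups = 2;        // input channels and groups
  const int o_c = 4;                    // output channels
  const int k_h = 3, k_w = 3;           // filter spatial size
  const int o_h = 16, o_w = 16;         // output spatial size

  // col buffer {i_c/g, k_h, k_w, o_h, o_w} flattened to two dimensions,
  // mirroring framework::flatten_to_2d(col_shape, filter rank + 1).
  const int col_rows = (i_c / groups) * k_h * k_w;
  const int col_cols = o_h * o_w;

  // per-group filter slice: (o_c/g, i_c/g * k_h * k_w)
  const int f_rows = o_c / groups;

  // gemm: (f_rows x col_rows) * (col_rows x col_cols) -> (f_rows x col_cols)
  std::printf("col matrix: %d x %d\n", col_rows, col_cols);
  std::printf("per-group output slice: %d x %d\n", f_rows, col_cols);
  return 0;
}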
template <typename Place, typename T> template <typename Place, typename T>
class GemmConvGrad2DKernel : public framework::OpKernel<T> { class GemmConvGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input"); const Tensor* input = context.Input<Tensor>("Input");
...@@ -137,64 +171,79 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> { ...@@ -137,64 +171,79 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
context.Output<Tensor>(framework::GradVarName("Input")); context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad = Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter")); context.Output<Tensor>(framework::GradVarName("Filter"));
// The filter and filter_grad will be reshaped in the calculations, // The filter and filter_grad will be reshaped in the calculations,
// so here use an assignment operation, // so here use an assignment operation,
// that avoids modifying the variable in the Scope. // that avoids modifying the variable in the Scope.
Tensor filter = *context.Input<Tensor>("Filter"); Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
int groups = context.Attr<int>("groups"); int groups = context.Attr<int>("groups");
int batch_size = input->dims()[0]; const int batch_size = static_cast<int>(input->dims()[0]);
int input_channels = input->dims()[1];
int filter_height = filter.dims()[filter.dims().size() - 2]; // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
int filter_width = filter.dims()[filter.dims().size() - 1]; std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
int output_channels = output_grad->dims()[1]; filter_shape_vec.erase(filter_shape_vec.begin(),
int output_height = output_grad->dims()[2]; filter_shape_vec.begin() + 2);
int output_width = output_grad->dims()[3];
// output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
paddle::operators::math::Col2ImFunctor< std::vector<int64_t> output_shape_vec(
paddle::operators::math::ColFormat::kCFO, Place, T> framework::vectorize(output_grad->dims()));
col2im; output_shape_vec.erase(output_shape_vec.begin(),
paddle::operators::math::Im2ColFunctor< output_shape_vec.begin() + 2);
paddle::operators::math::ColFormat::kCFO, Place, T>
im2col; // use col_shape in the im2col calculation
// use col_shape in the im2col and col2im calculation // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
framework::DDim col_shape = {input_channels / groups, filter_height, // o_h, o_w}
filter_width, output_height, output_width}; std::vector<int64_t> col_shape_vec;
col_shape_vec.push_back(input->dims()[1] / groups);
col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
filter_shape_vec.end());
col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
output_shape_vec.end());
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation // use col_matrix_shape in the gemm calculation
framework::DDim col_matrix_shape = { // size: (i_c/g * k_h * k_w, o_h * o_w)
input_channels / groups * filter_height * filter_width, // or
output_height * output_width}; // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
Tensor col; framework::DDim col_matrix_shape =
col.mutable_data<T>(col_shape, context.GetPlace()); framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix = col;
col_matrix.Resize(col_matrix_shape);
framework::DDim input_shape = {input->dims()[1], input->dims()[2], framework::DDim input_shape = framework::slice_ddim(
input->dims()[3]}; input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim output_matrix_shape = {
output_grad->dims()[1],
output_grad->dims()[2] * output_grad->dims()[3]};
framework::DDim filter_matrix_shape = {filter.dims()[0], framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]}; filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape); filter.Resize(filter_matrix_shape);
// convolution backward input operator: gemm + col2im framework::DDim output_matrix_shape = {
// convolution backward weight operator: im2col + gemm output_grad->dims()[1],
int in_step = input_channels / groups; output_grad->numel() /
int out_step = output_channels / groups; (output_grad->dims()[0] * output_grad->dims()[1])};
// convolution backward input operator: gemm + col2im(or col2vol)
// convolution backward weight operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output_grad->dims()[1]) / groups;
Tensor col;
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix;
col.mutable_data<T>(col_shape, context.GetPlace());
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
math::SetConstant<Place, T> set_zero;
if (input_grad) { if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace()); input_grad->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*input_grad); set_zero(context.device_context(), input_grad, static_cast<T>(0));
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch = Tensor out_grad_batch =
...@@ -208,13 +257,22 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> { ...@@ -208,13 +257,22 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
math::matmul<Place, T>(context.device_context(), filter_slice, true, math::matmul<Place, T>(context.device_context(), filter_slice, true,
out_grad_slice, false, T(1.0), &col_matrix, out_grad_slice, false, T(1.0), &col_matrix,
T(0.0)); T(0.0));
// col2im // col2im
Tensor in_grad_slice = Tensor in_grad_slice =
in_grad_batch.Slice(g * in_step, (g + 1) * in_step); in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
col2im(context.device_context(), in_grad_slice, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1], if (filter_shape_vec.size() == 2) {
paddings[1]); math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
col2im(context.device_context(), in_grad_slice, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1],
paddings[1]);
} else if (filter_shape_vec.size() == 3) {
math::Col2VolFunctor<Place, T> col2vol;
col2vol(context.device_context(), in_grad_slice, col, strides[0],
strides[1], strides[2], paddings[0], paddings[1],
paddings[2]);
}
} }
} }
} }
...@@ -223,8 +281,7 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> { ...@@ -223,8 +281,7 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
filter_grad->mutable_data<T>(context.GetPlace()); filter_grad->mutable_data<T>(context.GetPlace());
Tensor filter_grad_ = *filter_grad; Tensor filter_grad_ = *filter_grad;
filter_grad_.Resize(filter_matrix_shape); filter_grad_.Resize(filter_matrix_shape);
auto t = framework::EigenVector<T>::Flatten(filter_grad_); set_zero(context.device_context(), filter_grad, static_cast<T>(0));
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch = Tensor out_grad_batch =
...@@ -235,9 +292,18 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> { ...@@ -235,9 +292,18 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
Tensor out_grad_slice = Tensor out_grad_slice =
out_grad_batch.Slice(g * out_step, (g + 1) * out_step); out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
im2col(context.device_context(), in_slice, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1], if (filter_shape_vec.size() == 2) {
paddings[1]); math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
im2col(context.device_context(), in_slice, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1],
paddings[1]);
} else if (filter_shape_vec.size() == 3) {
math::Vol2ColFunctor<Place, T> vol2col;
vol2col(context.device_context(), in_slice, col, strides[0],
strides[1], strides[2], paddings[0], paddings[1],
paddings[2]);
}
// gemm // gemm
Tensor filter_grad_slice = Tensor filter_grad_slice =
...@@ -250,6 +316,5 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> { ...@@ -250,6 +316,5 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
} }
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,18 +12,18 @@ ...@@ -12,18 +12,18 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/conv2d_transpose_op.h" #include "paddle/operators/conv_transpose_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const { void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("Input"), PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of Conv2DTransposeOp should not be null."); "Input(Input) of ConvTransposeOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Filter"), PADDLE_ENFORCE(ctx->HasInput("Filter"),
"Input(Filter) of Conv2DTransposeOp should not be null."); "Input(Filter) of ConvTransposeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"), PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of Conv2DTransposeOp should not be null."); "Output(Output) of ConvTransposeOp should not be null.");
auto in_dims = ctx->GetInputDim("Input"); auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter"); auto filter_dims = ctx->GetInputDim("Filter");
...@@ -35,17 +35,27 @@ void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -35,17 +35,27 @@ void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
"No Padding allowed in conv transpose op."); "No Padding allowed in conv transpose op.");
} }
PADDLE_ENFORCE_EQ(in_dims.size(), 4, PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
"Conv2DTransposeOp input should be 4-D tensor."); "ConvTransposeOp intput should be 4-D or 5-D tensor.");
PADDLE_ENFORCE_EQ(filter_dims.size(), 4, PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
"Conv2DTransposeOp filter should be 4-D tensor."); "ConvTransposeOp input dimension and filter dimension "
"should be the same.");
PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U,
"ConvTransposeOp input dimension and strides dimension should "
"be consistent.");
PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
"ConvTransposeOp paddings dimension and Conv strides "
"dimension should be the same.");
PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
"input and kernel input dimension should be equal."); "In ConvTransposeOp, The input channel should be the same "
"as the number of filters.");
auto output_height = (in_dims[2] - 1) * strides[0] + filter_dims[2]; std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
auto output_width = (in_dims[3] - 1) * strides[1] + filter_dims[3]; for (size_t i = 0; i < paddings.size(); ++i) {
ctx->SetOutputDim("Output", output_shape.push_back((in_dims[i + 2] - 1) * strides[i] +
{in_dims[0], filter_dims[1], output_height, output_width}); filter_dims[i + 2]);
}
ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
} }
Conv2DTransposeOpMaker::Conv2DTransposeOpMaker( Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
...@@ -54,37 +64,109 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker( ...@@ -54,37 +64,109 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
AddInput( AddInput(
"Input", "Input",
"(Tensor) The input tensor of convolution transpose operator. " "(Tensor) The input tensor of convolution transpose operator. "
"The format of input tensor is NCHW, where N is batch size, C is the " "The format of input tensor is NCHW. Where N is batch size, C is the "
"number of input channels, H is the height of the image, and " "number of input channels, H is the height of the feature, and "
"W is the width of the image."); "W is the width of the feature.");
AddInput("Filter", AddInput("Filter",
"(Tensor) The filter tensor of convolution transpose operator." "(Tensor) The filter tensor of convolution transpose operator. "
"The format of the filter tensor is CMHW, where C is the number of " "The format of the filter tensor is CMHW, where C is the number of "
"output image channels, M is the number of input image channels, " "output image channels, M is the number of input image channels, "
"H is the height of the filter, and W is the width of the filter. " "H is the height of the filter, and W is the width of the filter. "
"We enforce groups number == 1 and padding == 0 in " "We enforce groups number == 1 and padding == 0 in "
"the convolution transpose scenario."); "the convolution transpose scenario.");
AddOutput("Output", AddOutput("Output",
"(Tensor) The output tensor of convolution transpose operator." "(Tensor) The output tensor of convolution transpose operator. "
"The format of output tensor is also NCHW."); "The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>("strides", AddAttr<std::vector<int>>(
"strides of convolution transpose operator.") "strides",
"(vector defalut:{1, 1}), strides of convolution transpose operator.")
.SetDefault({1, 1}); .SetDefault({1, 1});
AddAttr<std::vector<int>>("paddings", AddAttr<std::vector<int>>(
"paddings of convolution transpose operator.") "paddings",
"(vector defalut:{0, 0}), paddings of convolution transpose operator.")
.SetDefault({0, 0}); .SetDefault({0, 0});
AddComment(R"DOC( AddComment(R"DOC(
Convolution Transpose Operator. Convolution2D Transpose Operator.
The convolution transpose operation calculates the output based on the input, The convolution transpose operation calculates the output based on the input, filter
filter, strides, paddings, and groups parameters. The size of each dimension and strides, paddings, groups parameters. The size of each dimension of the
of the parameters is checked in the infer-shape method. parameters is checked in the infer-shape.
Input(Input, Filter) and output(Output) are in NCHW format, where N is batch
size, C is the number of channels, H is the height of the feature, and
W is the width of the feature. Parameters(ksize, strides, paddings) each
contain two elements, which represent height and width, respectively.
The input(X) size and output(Out) size may be different.
Example:
Input:
Input shape: (N, C_in, H_in, W_in)
Filter shape: (C_in, C_out, H_f, W_f)
Output:
Output shape: (N, C_out, H_out, W_out)
where
H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
)DOC");
}
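A minimal sketch of the output-size rule stated in the comment above; the numbers are invented for illustration, and the padding term is kept in the formula even though this operator currently enforces zero padding.

// Hedged sketch of H_out/W_out for a transposed convolution, following the
// formula from the operator comment above; the sizes are made up.
#include <cstdio>

int main() {
  const int h_in = 7, w_in = 7;       // input feature map
  const int k_h = 4, k_w = 4;         // filter size
  const int stride_h = 2, stride_w = 2;
  const int pad_h = 0, pad_w = 0;     // this op enforces zero padding

  const int h_out = (h_in - 1) * stride_h - 2 * pad_h + k_h;  // 16
  const int w_out = (w_in - 1) * stride_w - 2 * pad_w + k_w;  // 16
  std::printf("output: %d x %d\n", h_out, w_out);
  return 0;
}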
Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
framework::OpProto* proto, framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Input",
"(Tensor) The input tensor of convolution transpose operator."
"The format of input tensor is NCDHW. Where N is batch size, C is "
"the number of channels, D is the depth of the feature, H is the "
"height of the feature, and "
"W is the width of the feature.");
AddInput("Filter",
"(Tensor) The filter tensor of convolution transpose operator."
"The format of the filter tensor is CMDHW, where C is the number of "
"output image channels, M is the number of input image channels, D "
"is the depth of the filter, H is the height of the filter, and "
"W is the width of the filter."
"We enforce groups number == 1 and padding == 0 in "
"the convolution3d transpose scenario.");
AddOutput("Output",
"(Tensor) The output tensor of convolution transpose operator."
"The format of output tensor is also NCDHW."
"Where N is batch size, C is "
"the number of channels, D is the depth of the feature, H is the "
"height of the feature, and W is the width of the feature.");
AddAttr<std::vector<int>>(
"strides",
"(vector defalut:{1, 1, 1}), strides of convolution transpose operator.")
.SetDefault({1, 1, 1});
AddAttr<std::vector<int>>(
"paddings",
"(vector defalut:{0, 0, 0}), paddings of convolution transpose operator.")
.SetDefault({0, 0, 0});
AddComment(R"DOC(
Convolution3D Transpose Operator.
The convolution transpose operation calculates the output based on the input,
filter, strides, paddings, and groups parameters. The size of each dimension
of the parameters is checked in the infer-shape method.
Input(Input, Filter) and output(Output) are in NCDHW format, where N is batch
size, C is the number of channels, D is the depth of the feature,
H is the height of the feature, and W is the width of the feature.
Parameters(ksize, strides, paddings) each contain three elements, which
represent depth, height and width, respectively.
The input(X) size and output(Out) size may be different.
Example:
Input:
Input shape: (N, C_in, D_in, H_in, W_in)
Filter shape: (C_in, C_out, D_f, H_f, W_f)
Output:
Output shape: (N, C_out, D_out, H_out, W_out)
where
D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2];
)DOC"); )DOC");
} }
void Conv2DTransposeOpGrad::InferShape( void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::InferShapeContext* ctx) const {
auto in_dims = ctx->GetInputDim("Input"); auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter"); auto filter_dims = ctx->GetInputDim("Filter");
if (ctx->HasOutput(framework::GradVarName("Input"))) { if (ctx->HasOutput(framework::GradVarName("Input"))) {
...@@ -99,13 +181,23 @@ void Conv2DTransposeOpGrad::InferShape( ...@@ -99,13 +181,23 @@ void Conv2DTransposeOpGrad::InferShape(
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(conv2d_transpose, ops::Conv2DTransposeOp,
ops::Conv2DTransposeOpMaker, conv2d_transpose_grad, REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
ops::Conv2DTransposeOpGrad); conv2d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
conv2d_transpose, conv2d_transpose,
ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>); ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
conv2d_transpose_grad, conv2d_transpose_grad,
ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>); ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
conv3d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
conv3d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv3d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
...@@ -12,13 +12,20 @@ ...@@ -12,13 +12,20 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/conv2d_transpose_op.h" #include "paddle/operators/conv_transpose_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
conv2d_transpose, conv2d_transpose,
ops::GemmConv2DTransposeKernel<paddle::platform::GPUPlace, float>); ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
conv2d_transpose_grad, conv2d_transpose_grad,
ops::GemmConv2DTransposeGradKernel<paddle::platform::GPUPlace, float>); ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
conv3d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
conv3d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/math/im2col.h" #include "paddle/operators/math/im2col.h"
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/vol2col.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -33,7 +34,13 @@ class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -33,7 +34,13 @@ class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
framework::OpAttrChecker* op_checker); framework::OpAttrChecker* op_checker);
}; };
class Conv2DTransposeOp : public framework::OperatorWithKernel { class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
Conv3DTransposeOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker);
};
class ConvTransposeOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -41,7 +48,7 @@ class Conv2DTransposeOp : public framework::OperatorWithKernel { ...@@ -41,7 +48,7 @@ class Conv2DTransposeOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override; void InferShape(framework::InferShapeContext* ctx) const override;
}; };
class Conv2DTransposeOpGrad : public framework::OperatorWithKernel { class ConvTransposeOpGrad : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -50,41 +57,44 @@ class Conv2DTransposeOpGrad : public framework::OperatorWithKernel { ...@@ -50,41 +57,44 @@ class Conv2DTransposeOpGrad : public framework::OperatorWithKernel {
}; };
template <typename Place, typename T> template <typename Place, typename T>
class GemmConv2DTransposeKernel : public framework::OpKernel<T> { class GemmConvTransposeKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input"); const Tensor* input = context.Input<Tensor>("Input");
// The filter will be reshaped, so it should not be constant pointer // The filter will be reshaped, so it should not be constant pointer
Tensor filter = *context.Input<Tensor>("Filter"); Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output"); Tensor* output = context.Output<Tensor>("Output");
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
// TODO(Zhuoyuan): Paddings can be added in future. // TODO(Zhuoyuan): Paddings can be added in future.
// groups will always be disabled in conv2d_transpose.
const int batch_size = input->dims()[0]; const int batch_size = static_cast<int>(input->dims()[0]);
const int m = input->dims()[1];
const int h = input->dims()[2]; // input_shape_vec: {h, w} or {d, h, w}
const int w = input->dims()[3]; std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
input_shape_vec.erase(input_shape_vec.begin(), input_shape_vec.begin() + 2);
const int k_h = filter.dims()[2];
const int k_w = filter.dims()[3]; // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
const int c = output->dims()[1]; // output channels filter_shape_vec.erase(filter_shape_vec.begin(),
const int o_h = output->dims()[2]; filter_shape_vec.begin() + 2);
const int o_w = output->dims()[3];
// use col_shape in the im2col and col2im (or vol2col and col2vol)
paddle::operators::math::Col2ImFunctor< // calculation
paddle::operators::math::ColFormat::kCFO, Place, T> // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
col2im; std::vector<int64_t> col_shape_vec;
col_shape_vec.push_back(output->dims()[1]);
// use col_shape in the im2col and col2im calculation col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
DDim col_shape = {c, k_h, k_w, h, w}; filter_shape_vec.end());
col_shape_vec.insert(col_shape_vec.end(), input_shape_vec.begin(),
input_shape_vec.end());
DDim col_shape(framework::make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation // use col_matrix_shape in the gemm calculation
DDim col_matrix_shape = {c * k_h * k_w, h * w}; // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
Tensor col; Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace()); col.mutable_data<T>(col_shape, context.GetPlace());
...@@ -95,160 +105,189 @@ class GemmConv2DTransposeKernel : public framework::OpKernel<T> { ...@@ -95,160 +105,189 @@ class GemmConv2DTransposeKernel : public framework::OpKernel<T> {
col_matrix.ShareDataWith(col); col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape); col_matrix.Resize(col_matrix_shape);
DDim output_shape = {c, o_h, o_w}; // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
DDim input_matrix_shape = {m, h * w}; DDim output_shape =
framework::slice_ddim(output->dims(), 1, output->dims().size());
DDim filter_matrix_shape = {m, c * k_h * k_w}; // input matrix size: (m, h * w) or (m, d * h * w)
filter.Resize(filter_matrix_shape); DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
// convolution transpose: gemm + col2im (similar to conv-backward on input) // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
filter.Resize(filter_matrix_shape);
output->mutable_data<T>(context.GetPlace()); output->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*output); math::SetConstant<Place, T> set_zero;
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0)); set_zero(context.device_context(), output, static_cast<T>(0));
// convolution transpose: gemm + col2im or col2vol (similar to conv-backward
// on input)
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
// batch with size (M, h * w) // batch with size (m, h * w) or (m, d * h * w)
Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
// filter size: (M, c * k_h * k_w)
// output size: (c, o_h, o_w) // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape); Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
// col_matrix = filter * input_batch // col_matrix = filter * input_batch
// of shape (c * k_h * k_w, h * w) // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
math::matmul<Place, T>(context.device_context(), filter, true, math::matmul<Place, T>(context.device_context(), filter, true,
input_batch, false, T(1.0), &col_matrix, T(0.0)); input_batch, false, static_cast<T>(1.0),
col2im(context.device_context(), output_batch, col, strides[0], &col_matrix, static_cast<T>(0.0));
strides[1], 0, 0, 0, 0);
if (filter_shape_vec.size() == 2) {
// col2im: col_matrix -> dy
// from (c * k_h * k_w, h * w) to (c, o_h, o_w)
math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
col2im(context.device_context(), output_batch, col, strides[0],
strides[1], 0, 0, 0, 0);
} else if (filter_shape_vec.size() == 3) {
// col2vol: col_matrix -> dy
// from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
math::Col2VolFunctor<Place, T> col2vol;
col2vol(context.device_context(), output_batch, col, strides[0],
strides[1], strides[2], 0, 0, 0);
}
} }
} }
}; };
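The kernel above computes col_matrix = filter^T * input_batch and then lets col2im (or col2vol) scatter the overlapping patches into the output, which is why transposed convolution mirrors the input-gradient path of an ordinary convolution. Below is a hedged, single-channel 1-D illustration of that scatter step with made-up values; it is not the Paddle functor, only the index arithmetic.

// Hedged 1-D col2im scatter: col is (k, h), the output has length
// (h - 1) * stride + k, and overlapping taps are accumulated.
#include <cstdio>

int main() {
  const int k = 3, stride = 2, h = 3;
  const int o_h = (h - 1) * stride + k;                  // 7
  const double col[3][3] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
  double out[7] = {0};

  for (int r = 0; r < k; ++r)          // kernel tap index
    for (int j = 0; j < h; ++j)        // input position
      out[j * stride + r] += col[r][j];

  for (int i = 0; i < o_h; ++i) std::printf("%g ", out[i]);
  std::printf("\n");
  return 0;
}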
template <typename Place, typename T> template <typename Place, typename T>
class GemmConv2DTransposeGradKernel : public framework::OpKernel<T> { class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input"); const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad = const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output")); context.Input<Tensor>(framework::GradVarName("Output"));
// For filter, we do not use const pointer b/c we will do reshape, // For filter, we do not use const pointer b/c we will do reshape,
// but we should avoid modifying its value. // but we should avoid modifying its value.
Tensor filter = *context.Input<Tensor>("Filter"); Tensor filter = *context.Input<Tensor>("Filter");
Tensor* input_grad = Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input")); context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad = Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter")); context.Output<Tensor>(framework::GradVarName("Filter"));
if ((!input_grad) && (!filter_grad)) return;
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
// Actually, no paddings and groups allowed in conv transpose. // Actually, no paddings and groups allowed in conv transpose.
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
const int batch_size = input->dims()[0]; const int batch_size = static_cast<int>(input->dims()[0]);
const int m = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int k_h = filter.dims()[2]; // input_shape_vec: {h, w} or {d, h, w}
const int k_w = filter.dims()[3]; std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
input_shape_vec.erase(input_shape_vec.begin(), input_shape_vec.begin() + 2);
const int c = output_grad->dims()[1]; // output channels // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
const int o_h = output_grad->dims()[2]; std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
const int o_w = output_grad->dims()[3]; filter_shape_vec.erase(filter_shape_vec.begin(),
filter_shape_vec.begin() + 2);
// Only im2col functor required for bp to get to the right shape // use col_shape in the im2col and col2im (or vol2col and col2vol)
paddle::operators::math::Im2ColFunctor< // calculation
paddle::operators::math::ColFormat::kCFO, Place, T> // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
im2col; std::vector<int64_t> col_shape_vec;
col_shape_vec.push_back(output_grad->dims()[1]);
// use col_shape in the im2col and col2im calculation col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
DDim col_shape = {c, k_h, k_w, h, w}; filter_shape_vec.end());
col_shape_vec.insert(col_shape_vec.end(), input_shape_vec.begin(),
input_shape_vec.end());
DDim col_shape(framework::make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation // use col_matrix_shape in the gemm calculation
DDim col_matrix_shape_f = {c * h * w, k_h * k_w}; // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
Tensor col; // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
col.mutable_data<T>(col_shape, context.GetPlace()); DDim output_shape = framework::slice_ddim(output_grad->dims(), 1,
// col_matrix shares the same piece of data with col, output_grad->dims().size());
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
DDim output_shape = {c, o_h, o_w}; // input matrix size: (m, h * w) or (m, d * h * w)
DDim input_matrix_shape = {m, h * w}; DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
DDim filter_matrix_shape = {m, c * k_h * k_w}; // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
filter.Resize(filter_matrix_shape); filter.Resize(filter_matrix_shape);
// convolution transpose grad on input: // convolution transpose grad on input:
// im2col + gemm (similar to conv-forward) // im2col + gemm (similar to conv-forward)
// input need to compute gradient // input need to compute gradient
if (input_grad) { if (input_grad || filter_grad) {
Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace());
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix; Tensor col_matrix;
col_matrix.ShareDataWith(col); col_matrix.ShareDataWith(col);
DDim col_matrix_shape = {c * k_h * k_w, h * w};
col_matrix.Resize(col_matrix_shape); col_matrix.Resize(col_matrix_shape);
input_grad->mutable_data<T>(context.GetPlace()); Tensor filter_grad_;
auto t = framework::EigenVector<T>::Flatten(*input_grad); math::SetConstant<Place, T> set_zero;
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
set_zero(context.device_context(), input_grad, static_cast<T>(0));
}
if (filter_grad) { // filter size (m, c, k_h, k_w)
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(context.device_context(), filter_grad, static_cast<T>(0));
filter_grad_ = *filter_grad;
filter_grad_.Resize(filter_matrix_shape);
}
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
// batch with size (c, o_h * o_w) // batch with size (c, o_h * o_w)
Tensor output_grad_batch = Tensor output_grad_batch =
output_grad->Slice(i, i + 1).Resize(output_shape); output_grad->Slice(i, i + 1).Resize(output_shape);
// filter of size (m, c * k_h * k_w)
// batch with size (m, h, w) if (filter_shape_vec.size() == 2) {
Tensor input_grad_batch = // im2col: dy -> col matrix
input_grad->Slice(i, i + 1).Resize(input_matrix_shape); // from (c, o_h, o_w) to (c * k_h * k_w, h * w)
math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
// im2col: dy from (c, o_h, o_w) -> (c * k_h * k_w, h * w) im2col(context.device_context(), output_grad_batch, col, strides[0],
im2col(context.device_context(), output_grad_batch, col, strides[0], strides[1], paddings[0], paddings[0], paddings[1],
strides[1], paddings[0], paddings[0], paddings[1], paddings[1]); paddings[1]);
} else if (filter_shape_vec.size() == 3) {
// gemm: dx = filter * dy // vol2col: dy -> col_matrix
// (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, c, h) // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w)
math::matmul<Place, T>(context.device_context(), filter, false, math::Vol2ColFunctor<Place, T> vol2col;
col_matrix, false, T(1.0), &input_grad_batch, vol2col(context.device_context(), output_grad_batch, col, strides[0],
T(0.0)); strides[1], strides[2], paddings[0], paddings[1],
} paddings[2]);
} }
// filter gradient required if (input_grad) {
if (filter_grad) { // batch with size (m, h, w)
Tensor col_matrix_f; Tensor input_grad_batch =
col_matrix_f.ShareDataWith(col); input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
DDim col_matrix_shape_f = {c * h * w, k_h * k_w}; // gemm: dx = filter * dy
col_matrix_f.Resize(col_matrix_shape_f); // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, h * w)
// or
filter_grad->mutable_data<T>(context.GetPlace()); // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
Tensor filter_grad_ = *filter_grad; // d, h, w)
filter_grad_.Resize(filter_matrix_shape); math::matmul<Place, T>(context.device_context(), filter, false,
auto t = framework::EigenVector<T>::Flatten(filter_grad_); col_matrix, false, static_cast<T>(1.0),
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0)); &input_grad_batch, static_cast<T>(0.0));
}
for (int i = 0; i < batch_size; ++i) { if (filter_grad) {
// batch with size (c, o_h, o_w) // input batch
Tensor output_grad_batch = Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
output_grad->Slice(i, i + 1).Resize(output_shape); // gemm: d_filter = x * dy^T
// input batch // (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, k_h * k_w)
Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); // or
// (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
// im2col: (c * h * w, k_h * k_w) // k_h * k_w)
im2col(context.device_context(), output_grad_batch, col, strides[0], math::matmul<Place, T>(context.device_context(), in_batch, false,
strides[1], paddings[0], paddings[0], paddings[1], paddings[1]); col_matrix, true, static_cast<T>(1.0),
&filter_grad_, static_cast<T>(1.0));
// gemm: d_filter = x * y_grad^T }
// (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, c, h)
math::matmul<Place, T>(context.device_context(), in_batch, false,
col_matrix_f, true, T(1.0), &filter_grad_,
T(1.0));
} }
} }
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -114,21 +114,17 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -114,21 +114,17 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
"where N is the batch size and D is the number of classes. " "where N is the batch size and D is the number of classes. "
"This input is a probability computed by the previous operator, " "This input is a probability computed by the previous operator, "
"which is almost always the result of a softmax operator."); "which is almost always the result of a softmax operator.");
AddInput( AddInput("Label",
"Label", "(Tensor), the ground truth which is a 2-D tensor. When "
"(Tensor, default Tensor<int>), the ground truth which is " "soft_label is set to false, Label is a Tensor<int64> with shape "
"a 2-D tensor. " "[N x 1]. When soft_label is set to true, Label is a "
"When soft_label is set to false, Label is a Tensor<int> with shape " "Tensor<float/double> with shape [N x K].");
"[N x 1]. "
"When soft_label is set to true, Label is a Tensor<float/double> "
"with shape [N x K].");
AddOutput("Y", AddOutput("Y",
"(Tensor, default Tensor<float>), a 2-D tensor " "(Tensor, default Tensor<float>), a 2-D tensor with shape "
"with shape [N x 1]. The cross entropy loss."); "[N x 1]. The cross entropy loss.");
AddAttr<bool>( AddAttr<bool>("soft_label",
"soft_label", "(bool, default false), a flag indicating whether to "
"(bool, default false), a flag to indicate whether to interpretate " "interpretate the given labels as soft labels.")
"the given labels as soft labels.")
.SetDefault(false); .SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
CrossEntropy Operator. CrossEntropy Operator.
......
...@@ -34,15 +34,18 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel { ...@@ -34,15 +34,18 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
std::vector<int64_t> shape_int64(shape.size(), 0); std::vector<int64_t> shape_int64(shape.size(), 0);
std::transform(shape.begin(), shape.end(), shape_int64.begin(), std::transform(shape.begin(), shape.end(), shape_int64.begin(),
[](int a) { return static_cast<int64_t>(a); }); [](int a) { return static_cast<int64_t>(a); });
auto dims = framework::make_ddim(shape_int64); auto output_dim = framework::make_ddim(shape_int64);
int dim_idx = ctx->Attrs().Get<int>("dim_idx"); int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
PADDLE_ENFORCE_GE(dim_idx, 0); PADDLE_ENFORCE_GE(input_dim_idx, 0);
PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), dim_idx); PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);
PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), dim_idx);
dims[dim_idx] = ctx->GetInputDim("Input")[dim_idx]; int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
ctx->SetOutputDim("Out", dims); PADDLE_ENFORCE_GE(output_dim_idx, 0);
PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);
output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
ctx->SetOutputDim("Out", output_dim);
} }
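The InferShape above builds the output shape from the "shape" attribute and then overwrites the entry at output_dim_idx with the batch dimension of Input found at input_dim_idx. A small hedged sketch of that rule with invented shapes:

// Hedged sketch of the fill_constant_batch_size_like shape rule; the
// concrete shapes below are examples only.
#include <cstdio>
#include <vector>

int main() {
  std::vector<long> input_dim = {32, 100};   // Input: (batch, features)
  std::vector<long> output_dim = {1, 200};   // the "shape" attribute
  const int input_dim_idx = 0;
  const int output_dim_idx = 0;

  // Out takes its batch size from Input; other dims come from "shape".
  output_dim[output_dim_idx] = input_dim[input_dim_idx];
  std::printf("Out shape: (%ld, %ld)\n", output_dim[0], output_dim[1]);  // (32, 200)
  return 0;
}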
protected: protected:
...@@ -69,8 +72,11 @@ class FillConstantBatchSizeLikeOpMaker ...@@ -69,8 +72,11 @@ class FillConstantBatchSizeLikeOpMaker
"(Tensor) Tensor of specified shape will be filled " "(Tensor) Tensor of specified shape will be filled "
"with the specified value"); "with the specified value");
AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output"); AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
AddAttr<int>("dim_idx", AddAttr<int>("input_dim_idx",
"(int, default 0) The index of batch size dimension") "(int, default 0) the index of input's batch size dimension")
.SetDefault(0);
AddAttr<int>("output_dim_idx",
"(int, default 0) the index of output's batch size dimension")
.SetDefault(0); .SetDefault(0);
AddAttr<float>("value", "(float, default 0) The value to be filled") AddAttr<float>("value", "(float, default 0) The value to be filled")
.SetDefault(0.0f); .SetDefault(0.0f);
...@@ -86,9 +92,10 @@ Fill up a variable with specified constant value. ...@@ -86,9 +92,10 @@ Fill up a variable with specified constant value.
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(fill_constant_batch_size_like, REGISTER_OPERATOR(fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOp, ops::FillConstantBatchSizeLikeOp,
ops::FillConstantBatchSizeLikeOpMaker); paddle::framework::EmptyGradOpMaker,
ops::FillConstantBatchSizeLikeOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
fill_constant_batch_size_like, fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, float>, ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, float>,
......
...@@ -35,7 +35,9 @@ class FillConstantOp : public framework::OperatorWithKernel { ...@@ -35,7 +35,9 @@ class FillConstantOp : public framework::OperatorWithKernel {
protected: protected:
framework::DataType IndicateDataType( framework::DataType IndicateDataType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return static_cast<framework::DataType>(ctx.Attr<int>("data_type")); int data_type = ctx.Attr<int>("data_type");
VLOG(10) << " FillConstant data_type = " << data_type;
return static_cast<framework::DataType>(data_type);
} }
}; };
...@@ -71,4 +73,5 @@ REGISTER_OP_WITHOUT_GRADIENT(fill_constant, ops::FillConstantOp, ...@@ -71,4 +73,5 @@ REGISTER_OP_WITHOUT_GRADIENT(fill_constant, ops::FillConstantOp,
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
fill_constant, ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>, fill_constant, ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>,
ops::FillConstantOpKernel<paddle::platform::CPUPlace, double>, ops::FillConstantOpKernel<paddle::platform::CPUPlace, double>,
ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>); ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>,
ops::FillConstantOpKernel<paddle::platform::CPUPlace, int64_t>);
...@@ -20,4 +20,5 @@ namespace ops = paddle::operators; ...@@ -20,4 +20,5 @@ namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
fill_constant, ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>, fill_constant, ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>,
ops::FillConstantOpKernel<paddle::platform::GPUPlace, double>, ops::FillConstantOpKernel<paddle::platform::GPUPlace, double>,
ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>); ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>,
ops::FillConstantOpKernel<paddle::platform::GPUPlace, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/gru_op.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class GRUOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(%s) of GRUOp should not be null.", "Input");
PADDLE_ENFORCE(ctx->HasInput("Weight"),
"Input(%s) of GRUOp should not be null.", "Weight");
PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
"Output(%s) of GRUOp should not be null.", "BatchGate");
PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
"Output(%s) of GRUOp should not be null.",
"BatchResetHiddenPrev");
PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
"Output(%s) of GRUOp should not be null.", "BatchHidden");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(%s) of GRUOp should not be null.", "Hidden");
auto input_dims = ctx->GetInputDim("Input");
auto weight_dims = ctx->GetInputDim("Weight");
int input_size = input_dims[1];
int frame_size = weight_dims[0];
PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
"The input_size must be 3 times of frame_size in GRUOp.");
PADDLE_ENFORCE_EQ(
weight_dims[1], frame_size * 3,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
if (ctx->HasInput("H0")) {
auto h0_dims = ctx->GetInputDim("H0");
PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
"The width of H0 must be equal to frame_size.");
}
if (ctx->HasInput("Bias")) {
auto bias_dims = ctx->GetInputDim("Bias");
int bias_height = bias_dims[0];
int bias_width = bias_dims[1];
PADDLE_ENFORCE_EQ(bias_height, 1,
"The shape of Bias must be [1, frame_size * 3].");
PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
"The shape of Bias must be [1, frame_size * 3].");
}
ctx->SetOutputDim("BatchGate", input_dims);
ctx->SetOutputDim("BatchResetHiddenPrev", {input_dims[0], frame_size});
ctx->SetOutputDim("BatchHidden", {input_dims[0], frame_size});
ctx->SetOutputDim("Hidden", {input_dims[0], frame_size});
ctx->ShareLoD("Input", "Hidden");
}
};
class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
public:
GRUOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Input",
"(LoDTensor) The first input is a LodTensor, which supports "
"variable-time length input sequence. The underlying tensor in "
"this LoDTenosr is a matrix with shape (T X 3D), where, T is the "
"total time steps in this mini-batch, D is the hidden size.");
AddInput("H0",
"(Tensor, optional) The initial hidden state is an optional "
"input. This is a tensor with shape (N x D), where N is the "
"batch size, D is the hidden size.")
.AsDispensable();
AddInput(
"Weight",
"(Tensor) The learnable hidden-hidden weight matrix with shape "
"(D x 3D), where D is the hidden size. The elements continuous in "
"memory can be divided into two parts. The first part are weights of "
"the update gate and reset gate with shape (D x 2D), and the second "
"part are weights of output candidate with shape (D x D).");
AddInput("Bias",
"(Tensor, optional) Bias vector with shape (1 x 3D) concating "
"bias of the update gate, reset gate and output candidate.")
.AsDispensable();
AddOutput("BatchGate",
"(LoDTensor) To compute with batches, sequence data will be "
"reorganized into several successive batches each containing "
"data from the same time step. The LoDTensor BatchGate contains "
"the update gate, reset gate and output candidate values "
"organized in batches. The LoD size is 2. The first LoD contains "
"the batch offsets and the second LoD contains the indexes in "
"the raw sequence data.")
.AsIntermediate();
AddOutput(
"BatchResetHiddenPrev",
"(LoDTensor) The reseted hidden state LoDTensor organized in batches. "
"This LoDTensor is a matrix with shape (T X D) and has the same LoD "
"with `BatchGate`.")
.AsIntermediate();
AddOutput(
"BatchHidden",
"(LoDTensor) The hidden state LoDTensor organized in batches. "
"This LoDTensor is a matrix with shape (T X D) and has the same LoD "
"with `BatchGate`.")
.AsIntermediate();
AddOutput(
"Hidden",
"(LoDTensor) the hidden state LoDTensor organized in sequences. "
"This LoDTensor is a matrix with shape (T X D) and has the same LoD "
"with `BatchGate`.");
AddAttr<std::string>("activation",
"(string, default tanh) "
"The activation type used for output candidate {h}_t.")
.SetDefault("tanh");
AddAttr<std::string>(
"gate_activation",
"(string, default sigmoid) "
"The activation type used in update gate and reset gate.")
.SetDefault("sigmoid");
AddAttr<bool>("is_reverse",
"(bool, defalut: False) "
"whether to compute reversed GRU.")
.SetDefault(false);
AddComment(R"DOC(
The GRU Operator implements part of the calculations of the complete GRU unit as follows:
\f[
update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\
output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
\f]
@note To implement the complete GRU, a fully-connected operator must be applied
beforehand to compute xu, xr and xc, which are fed as the Input of the GRU operator.
)DOC");
}
};
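The formulas in the comment above can be traced with a scalar toy example. The sketch below assumes sigmoid for the gate activation and tanh for the candidate activation, and all values (xu, xr, xc, the recurrent weights and biases) are invented for illustration only.

// Hedged single-unit GRU step following the equations in the comment above.
#include <cmath>
#include <cstdio>

static double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

int main() {
  const double xu = 0.2, xr = -0.1, xc = 0.5;    // pre-projected inputs
  const double h_prev = 0.3;                     // previous hidden state
  const double w_u = 0.4, w_r = 0.6, w_c = 0.8;  // scalar stand-ins for W_u, W_r, W_c
  const double b_u = 0.0, b_r = 0.0, b_c = 0.0;

  const double u = sigmoid(xu + w_u * h_prev + b_u);          // update gate
  const double r = sigmoid(xr + w_r * h_prev + b_r);          // reset gate
  const double c = std::tanh(xc + w_c * (r * h_prev) + b_c);  // output candidate
  const double h = (1.0 - u) * h_prev + u * c;                // new hidden state

  std::printf("u=%.4f r=%.4f c=%.4f h=%.4f\n", u, r, c, h);
  return 0;
}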
class GRUGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(%s) of GRUGradOp should not be null.", "Input");
PADDLE_ENFORCE(ctx->HasInput("Weight"),
"Input(%s) of GRUGradOp should not be null.", "Weight");
PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
"Input(%s) of GRUGradOp should not be null.", "BatchGate");
PADDLE_ENFORCE(ctx->HasInput("BatchResetHiddenPrev"),
"Input(%s) of GRUGradOp should not be null.",
"BatchResetHiddenPrev");
PADDLE_ENFORCE(ctx->HasInput("BatchHidden"),
"Input(%s) of GRUOp should not be null.", "BatchHidden");
PADDLE_ENFORCE(ctx->HasInput("Hidden"),
"Input(%s) of GRUGradOp should not be null.", "Hidden");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
"Input(%s@GRAD) of GRUGradOp should not be null.", "Hidden");
auto input_dims = ctx->GetInputDim("Input");
auto weight_dims = ctx->GetInputDim("Weight");
int input_size = input_dims[1];
int frame_size = weight_dims[0];
int weight_height = weight_dims[0];
int weight_width = weight_dims[1];
PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
"The input_size must be 3 times of frame_size in GRUOp.");
PADDLE_ENFORCE_EQ(
weight_height, frame_size,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
PADDLE_ENFORCE_EQ(
weight_width, frame_size * 3,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
if (ctx->HasInput("H0")) {
auto h0_dims = ctx->GetInputDim("H0");
PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
"The width of H0 must be equal to frame_size.");
auto h0_grad_name = framework::GradVarName("H0");
if (ctx->HasOutput(h0_grad_name))
ctx->SetOutputDim(h0_grad_name, h0_dims);
}
if (ctx->HasInput("Bias")) {
auto bias_dims = ctx->GetInputDim("Bias");
int bias_height = bias_dims[0];
int bias_width = bias_dims[1];
PADDLE_ENFORCE_EQ(bias_height, 1,
"The shape of Bias must be [1, frame_size * 3].");
PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
"The shape of Bias must be [1, frame_size * 3].");
auto bias_grad_name = framework::GradVarName("Bias");
if (ctx->HasOutput(bias_grad_name))
ctx->SetOutputDim(bias_grad_name, bias_dims);
}
auto input_grad_name = framework::GradVarName("Input");
if (ctx->HasOutput(input_grad_name))
ctx->SetOutputDim(input_grad_name, input_dims);
auto weight_grad_name = framework::GradVarName("Weight");
if (ctx->HasOutput(weight_grad_name))
ctx->SetOutputDim(weight_grad_name, weight_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp);
REGISTER_OP_CPU_KERNEL(gru, ops::GRUKernel<paddle::platform::CPUPlace, float>,
ops::GRUKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(gru_grad,
ops::GRUGradKernel<paddle::platform::CPUPlace, float>,
ops::GRUGradKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/gru_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(gru, ops::GRUKernel<paddle::platform::GPUPlace, float>,
ops::GRUKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(gru_grad,
ops::GRUGradKernel<paddle::platform::GPUPlace, float>,
ops::GRUGradKernel<paddle::platform::GPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/operators/math/gru_compute.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence2batch.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename Place, typename T>
class GRUKernel : public framework::OpKernel<T> {
public:
void BatchCompute(const framework::ExecutionContext& context) const {
auto* input = context.Input<LoDTensor>("Input");
auto* h0 = context.Input<Tensor>("H0");
const T* h0_data = h0 ? h0->data<T>() : nullptr;
auto* weight = context.Input<Tensor>("Weight");
const T* weight_data = weight->data<T>();
auto* bias = context.Input<Tensor>("Bias");
auto* batch_gate = context.Output<LoDTensor>("BatchGate");
batch_gate->mutable_data<T>(context.GetPlace());
auto* batch_reset_hidden_prev =
context.Output<LoDTensor>("BatchResetHiddenPrev");
batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
batch_hidden->mutable_data<T>(context.GetPlace());
auto* hidden = context.Output<LoDTensor>("Hidden");
hidden->mutable_data<T>(context.GetPlace());
context.ShareLoD("Input", "Hidden");
auto hidden_dims = hidden->dims();
bool is_reverse = context.Attr<bool>("is_reverse");
math::LoDTensor2BatchFunctor<Place, T> to_batch;
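    // Re-order the variable-length (LoD) input into time-step batches so that
    // each step can be processed for all active sequences at once; batch_gate
    // also receives the re-ordered LoD that the step loop below relies on.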
to_batch(context.device_context(), *input, *batch_gate, true, is_reverse);
int frame_size = hidden_dims[1];
int batch_size = hidden_dims[0];
auto g = EigenMatrix<T>::From(*batch_gate);
auto place = context.GetEigenDevice<Place>();
if (bias) {
auto b = EigenMatrix<T>::From(*bias);
g.device(place) = g +
b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
.broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
}
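    // Weight packs two matrices back to back: the first
    // 2 * frame_size * frame_size elements form the update/reset gate weight
    // (used as a [frame_size, frame_size * 2] matrix), and the remaining
    // frame_size * frame_size elements form the frame-state weight.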
math::hl_gru_value<T> gru_value;
gru_value.gateWeight = const_cast<T*>(weight_data);
gru_value.stateWeight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size);
gru_value.prevOutValue = const_cast<T*>(h0_data);
auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1;
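    // batch_starts[n] .. batch_starts[n + 1] delimit the rows of time step n;
    // prevOutValue carries the hidden state of the previous step (H0, if
    // provided, for the first step) and is advanced at the end of each
    // iteration.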
for (size_t n = 0; n < num_batch; n++) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
int cur_batch_size = bend - bstart;
Tensor gate_t = batch_gate->Slice(bstart, bend);
Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
Tensor hidden_t = batch_hidden->Slice(bstart, bend);
gru_value.outputValue = hidden_t.data<T>();
gru_value.gateValue = gate_t.data<T>();
gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
math::GRUUnitFunctor<Place, T>::compute(
context.device_context(), gru_value, frame_size, cur_batch_size,
math::ActiveType(context.Attr<std::string>("activation")),
math::ActiveType(context.Attr<std::string>("gate_activation")));
gru_value.prevOutValue = gru_value.outputValue;
}
math::Batch2LoDTensorFunctor<Place, T> to_seq;
batch_hidden->set_lod(batch_gate->lod());
to_seq(context.device_context(), *batch_hidden, *hidden);
}
void Compute(const framework::ExecutionContext& context) const override {
BatchCompute(context);
}
};
template <typename Place, typename T>
class GRUGradKernel : public framework::OpKernel<T> {
public:
void BatchCompute(const framework::ExecutionContext& context) const {
auto* h0 = context.Input<Tensor>("H0");
const T* h0_data = h0 ? h0->data<T>() : nullptr;
auto* weight = context.Input<Tensor>("Weight");
const T* weight_data = weight->data<T>();
auto* batch_gate = context.Input<LoDTensor>("BatchGate");
auto* batch_reset_hidden_prev =
context.Input<LoDTensor>("BatchResetHiddenPrev");
auto* batch_hidden = context.Input<LoDTensor>("BatchHidden");
auto* hidden = context.Input<LoDTensor>("Hidden");
auto* hidden_grad =
context.Input<LoDTensor>(framework::GradVarName("Hidden"));
auto* input_grad =
context.Output<LoDTensor>(framework::GradVarName("Input"));
auto* h0_grad = context.Output<Tensor>(framework::GradVarName("H0"));
auto* weight_grad =
context.Output<Tensor>(framework::GradVarName("Weight"));
auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
auto gate_dims = batch_gate->dims();
auto hidden_dims = hidden->dims();
int frame_size = hidden_dims[1];
math::LoDTensor2BatchFunctor<Place, T> to_batch;
LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad;
batch_hidden_grad.mutable_data<T>(hidden_dims, context.GetPlace());
batch_gate_grad.mutable_data<T>(gate_dims, context.GetPlace());
batch_reset_hidden_prev_grad.mutable_data<T>(hidden_dims,
context.GetPlace());
math::SetConstant<Place, T> zero;
zero(context.device_context(), &batch_hidden_grad, static_cast<T>(0.0));
zero(context.device_context(), &batch_gate_grad, static_cast<T>(0.0));
zero(context.device_context(), &batch_reset_hidden_prev_grad,
static_cast<T>(0.0));
bool is_reverse = context.Attr<bool>("is_reverse");
batch_hidden_grad.set_lod(batch_hidden->lod());
to_batch(context.device_context(), *hidden_grad, batch_hidden_grad, false,
is_reverse);
math::hl_gru_value<T> gru_value;
gru_value.gateWeight = const_cast<T*>(weight_data);
gru_value.stateWeight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size);
math::hl_gru_grad<T> gru_grad;
if (weight_grad) {
gru_grad.gateWeightGrad =
weight_grad->mutable_data<T>(context.GetPlace());
zero(context.device_context(), weight_grad, static_cast<T>(0.0));
gru_grad.stateWeightGrad =
weight_grad->data<T>() + 2 * frame_size * frame_size;
} else {
gru_grad.gateWeightGrad = nullptr;
gru_grad.stateWeightGrad = nullptr;
}
auto batch_starts = batch_hidden_grad.lod()[0];
size_t num_batch = batch_starts.size() - 1;
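    // Walk the time-step batches in reverse order (backpropagation through
    // time); at step 0 the gradient w.r.t. the initial state is written to
    // the H0 gradient output when it is requested.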
for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
int cur_batch_size = bend - bstart;
Tensor gate_t = batch_gate->Slice(bstart, bend);
gru_value.gateValue = gate_t.data<T>();
Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
gru_grad.outputGrad = hidden_grad_t.data<T>();
Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
gru_grad.gateGrad = gate_grad_t.data<T>();
Tensor reset_hidden_prev_grad_t =
batch_reset_hidden_prev_grad.Slice(bstart, bend);
gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>();
if (n == 0) {
gru_value.prevOutValue = const_cast<T*>(h0_data);
if (h0_grad) {
T* h0_grad_data = h0_grad->mutable_data<T>(context.GetPlace());
zero(context.device_context(), h0_grad, static_cast<T>(0.0));
gru_grad.prevOutGrad = h0_grad_data;
} else {
gru_grad.prevOutGrad = nullptr;
}
} else {
int bstart_pre = static_cast<int>(batch_starts[n - 1]);
Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
gru_value.prevOutValue = hidden_prev_t.data<T>();
Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
gru_grad.prevOutGrad = hidden_prev_grad_t.data<T>();
}
math::GRUUnitGradFunctor<Place, T>::compute(
context.device_context(), gru_value, gru_grad, frame_size,
cur_batch_size,
math::ActiveType(context.Attr<std::string>("activation")),
math::ActiveType(context.Attr<std::string>("gate_activation")));
}
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
math::Batch2LoDTensorFunctor<Place, T> to_seq;
batch_gate_grad.set_lod(batch_gate->lod());
to_seq(context.device_context(), batch_gate_grad, *input_grad);
}
if (bias_grad) {
bias_grad->mutable_data<T>(context.GetPlace());
auto d_b = EigenMatrix<T>::From(*bias_grad);
auto d_g = EigenMatrix<T>::From(batch_gate_grad);
auto place = context.GetEigenDevice<Place>();
d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
}
}
void Compute(const framework::ExecutionContext& context) const override {
BatchCompute(context);
}
};
} // namespace operators
} // namespace paddle
...@@ -31,7 +31,6 @@ class IncrementOp : public framework::OperatorWithKernel { ...@@ -31,7 +31,6 @@ class IncrementOp : public framework::OperatorWithKernel {
} }
}; };
template <typename AttrType>
class IncrementOpMaker : public framework::OpProtoAndCheckerMaker { class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
IncrementOpMaker(framework::OpProto *proto, IncrementOpMaker(framework::OpProto *proto,
...@@ -39,10 +38,10 @@ class IncrementOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -39,10 +38,10 @@ class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor) The input tensor of increment operator"); AddInput("X", "(Tensor) The input tensor of increment operator");
AddOutput("Out", "(Tensor) The output tensor of increment operator."); AddOutput("Out", "(Tensor) The output tensor of increment operator.");
AddAttr<AttrType>("step", AddAttr<float>("step",
"(float, default 1.0) " "(float, default 1.0) "
"The step size by which the " "The step size by which the "
"input tensor will be incremented.") "input tensor will be incremented.")
.SetDefault(1.0); .SetDefault(1.0);
AddComment(R"DOC( AddComment(R"DOC(
Increment Operator. Increment Operator.
...@@ -73,7 +72,10 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker { ...@@ -73,7 +72,10 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker<float>, REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker,
ops::IncrementGradOpMaker); ops::IncrementGradOpMaker);
REGISTER_OP_CPU_KERNEL(increment, REGISTER_OP_CPU_KERNEL(
ops::IncrementKernel<paddle::platform::CPUPlace, float>); increment, ops::IncrementKernel<paddle::platform::CPUPlace, float>,
ops::IncrementKernel<paddle::platform::CPUPlace, double>,
ops::IncrementKernel<paddle::platform::CPUPlace, int>,
ops::IncrementKernel<paddle::platform::CPUPlace, int64_t>);
...@@ -16,4 +16,7 @@ ...@@ -16,4 +16,7 @@
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
increment, increment,
paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>); paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>,
paddle::operators::IncrementKernel<paddle::platform::GPUPlace, double>,
paddle::operators::IncrementKernel<paddle::platform::GPUPlace, int>,
paddle::operators::IncrementKernel<paddle::platform::GPUPlace, int64_t>);
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename Place, typename T, typename AttrType = T> template <typename Place, typename T>
class IncrementKernel : public framework::OpKernel<T> { class IncrementKernel : public framework::OpKernel<T> {
public: public:
virtual void Compute(const framework::ExecutionContext& context) const { virtual void Compute(const framework::ExecutionContext& context) const {
...@@ -27,7 +27,7 @@ class IncrementKernel : public framework::OpKernel<T> { ...@@ -27,7 +27,7 @@ class IncrementKernel : public framework::OpKernel<T> {
auto* in = context.Input<framework::Tensor>("X"); auto* in = context.Input<framework::Tensor>("X");
tensor->mutable_data<T>(in->place()); tensor->mutable_data<T>(in->place());
auto step = static_cast<T>(context.Attr<AttrType>("step")); auto step = static_cast<T>(context.Attr<float>("step"));
auto eigen_out = framework::EigenVector<T>::Flatten(*tensor); auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
auto eigen_in = framework::EigenVector<T>::Flatten(*in); auto eigen_in = framework::EigenVector<T>::Flatten(*in);
......
...@@ -8,20 +8,24 @@ if(WITH_GPU) ...@@ -8,20 +8,24 @@ if(WITH_GPU)
nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator) nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions)
else() else()
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
cc_library(softmax SRCS softmax.cc DEPS operator) cc_library(softmax SRCS softmax.cc DEPS operator)
cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
cc_library(pooling SRCS pooling.cc DEPS device_context) cc_library(pooling SRCS pooling.cc DEPS device_context)
cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
cc_library(vol2col SRCS vol2col.cc DEPS device_context) cc_library(vol2col SRCS vol2col.cc DEPS device_context)
cc_library(context_project SRCS context_project.cc DEPS device_context) cc_library(context_project SRCS context_project.cc DEPS device_context)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
endif() endif()
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/operators/math/detail/activation_functions.h"
#include "paddle/operators/math/gru_compute.h"
namespace paddle {
namespace operators {
namespace math {
namespace detail {
#ifndef __NVCC__
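// Per-sample layout of gateValue: [update gate | reset gate | frame state],
// each segment frameSize elements wide.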
template <class OpResetOutput, typename T>
void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
T *gateValue, T *resetOutputValue,
T *prevOutputValue, int frameSize,
activation_mode_t active_gate) {
T rValueUpdateGate;
T rValueResetGate;
T rValueResetOutput;
T rPrevOut = 0;
T *updateGate = gateValue;
T *resetGate = gateValue + frameSize;
for (int i = 0; i < frameSize; i++) {
rValueUpdateGate = updateGate[i];
rValueResetGate = resetGate[i];
if (prevOutputValue) {
rPrevOut = prevOutputValue[i];
}
opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
rValueResetOutput, active_gate);
updateGate[i] = rValueUpdateGate;
resetGate[i] = rValueResetGate;
resetOutputValue[i] = rValueResetOutput;
}
}
template <class OpFinalOutput, typename T>
void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput,
T *gateValue, T *prevOutputValue,
T *outputValue, int frameSize,
activation_mode_t active_node) {
T rValueUpdateGate;
T rValueFrameState;
T rPrevOut = 0;
T rOutput;
T *updateGate = gateValue;
T *frameState = gateValue + frameSize * 2;
for (int i = 0; i < frameSize; i++) {
rValueUpdateGate = updateGate[i];
rValueFrameState = frameState[i];
if (prevOutputValue) {
rPrevOut = prevOutputValue[i];
}
opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
active_node);
frameState[i] = rValueFrameState;
outputValue[i] = rOutput;
}
}
template <class OpResetOutput, typename T>
void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput, T *gateValue,
T *resetOutputValue, T *prevOutputValue,
int frameSize,
activation_mode_t active_gate) {
#ifdef __AVX__
__m256 rValueUpdateGate;
__m256 rValueResetGate;
__m256 rValueResetOutput;
__m256 rPrevOut = _mm256_set1_ps(0.0f);
__m256 *updateGate = (__m256 *)gateValue;
__m256 *resetGate = (__m256 *)(gateValue + frameSize);
for (int i = 0; i < frameSize / 8; i++) {
rValueUpdateGate = updateGate[i];
rValueResetGate = resetGate[i];
if (prevOutputValue) {
rPrevOut = ((__m256 *)prevOutputValue)[i];
}
opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
rValueResetOutput, active_gate);
updateGate[i] = rValueUpdateGate;
resetGate[i] = rValueResetGate;
((__m256 *)resetOutputValue)[i] = rValueResetOutput;
}
#endif
}
template <class OpFinalOutput, typename T>
void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput, T *gateValue,
T *prevOutputValue, T *outputValue,
int frameSize,
activation_mode_t active_node) {
#ifdef __AVX__
__m256 rValueUpdateGate;
__m256 rValueFrameState;
__m256 rPrevOut = _mm256_set1_ps(0.0f);
__m256 rOutput;
__m256 *updateGate = (__m256 *)gateValue;
__m256 *frameState = (__m256 *)(gateValue + frameSize * 2);
for (int i = 0; i < frameSize / 8; i++) {
rValueUpdateGate = updateGate[i];
rValueFrameState = frameState[i];
if (prevOutputValue) {
rPrevOut = ((__m256 *)prevOutputValue)[i];
}
opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
active_node);
frameState[i] = rValueFrameState;
((__m256 *)outputValue)[i] = rOutput;
}
#endif
}
template <class OpResetOutput, typename T>
inline void forward_reset_output(OpResetOutput opResetOutput,
hl_gru_value<T> value, int frameSize,
int batchSize, activation_mode_t active_gate) {
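  // Process the batch one sample at a time; the AVX kernel is used only for
  // float data (sizeof(T) == 4) whose frameSize is a multiple of 8, so each
  // row fills whole __m256 registers.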
for (int b = 0; b < batchSize; b++) {
if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_forward_reset_output(
opResetOutput, value.gateValue, value.resetOutputValue,
value.prevOutValue, frameSize, active_gate);
} else {
hl_naive_gru_forward_reset_output(
opResetOutput, value.gateValue, value.resetOutputValue,
value.prevOutValue, frameSize, active_gate);
}
value.gateValue += frameSize * 3;
value.resetOutputValue += frameSize;
if (value.prevOutValue) {
value.prevOutValue += frameSize;
}
}
}
template <class OpFinalOutput, typename T>
inline void forward_final_output(OpFinalOutput opFinalOutput,
hl_gru_value<T> value, int frameSize,
int batchSize, activation_mode_t active_node) {
for (int b = 0; b < batchSize; b++) {
if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_forward_final_output(opFinalOutput, value.gateValue,
value.prevOutValue, value.outputValue,
frameSize, active_node);
} else {
hl_naive_gru_forward_final_output(opFinalOutput, value.gateValue,
value.prevOutValue, value.outputValue,
frameSize, active_node);
}
value.gateValue += frameSize * 3;
value.outputValue += frameSize;
if (value.prevOutValue) {
value.prevOutValue += frameSize;
}
}
}
template <class OpStateGrad, typename T>
void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
T *gateGrad, T *prevOutValue,
T *prevOutGrad, T *outputGrad,
int frameSize,
activation_mode_t active_node) {
T rUpdateGateValue;
T rUpdateGateGrad;
T rFrameStateValue;
T rFrameStateGrad;
T rOutGrad;
T rPrevOutValue = 0;
T rPrevOutGrad = 0;
T *updateGateValue = gateValue;
T *updateGateGrad = gateGrad;
T *frameStateValue = gateValue + frameSize * 2;
T *frameStateGrad = gateGrad + frameSize * 2;
for (int i = 0; i < frameSize; i++) {
rUpdateGateValue = updateGateValue[i];
rFrameStateValue = frameStateValue[i];
rOutGrad = outputGrad[i];
if (prevOutValue) {
rPrevOutValue = prevOutValue[i];
}
if (prevOutGrad) {
rPrevOutGrad = prevOutGrad[i];
}
opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
active_node);
updateGateGrad[i] = rUpdateGateGrad;
frameStateGrad[i] = rFrameStateGrad;
if (prevOutGrad) {
prevOutGrad[i] = rPrevOutGrad;
}
}
}
template <class OpResetGrad, typename T>
void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
T *gateGrad, T *prevOutValue,
T *prevOutGrad, T *resetOutputGrad,
int frameSize,
activation_mode_t active_gate) {
T rUpdateGateValue;
T rUpdateGateGrad;
T rResetGateValue;
T rResetGateGrad;
T rResetOutputGrad = 0;
T rPrevOutValue = 0;
T rPrevOutGrad = 0;
T *updateGateValue = gateValue;
T *updateGateGrad = gateGrad;
T *resetGateValue = gateValue + frameSize;
T *resetGateGrad = gateGrad + frameSize;
for (int i = 0; i < frameSize; i++) {
rUpdateGateValue = updateGateValue[i];
rUpdateGateGrad = updateGateGrad[i];
rResetGateValue = resetGateValue[i];
if (prevOutValue && prevOutGrad) {
rResetOutputGrad = resetOutputGrad[i];
}
if (prevOutValue) {
rPrevOutValue = prevOutValue[i];
}
if (prevOutGrad) {
rPrevOutGrad = prevOutGrad[i];
}
opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
active_gate);
updateGateGrad[i] = rUpdateGateGrad;
resetGateGrad[i] = rResetGateGrad;
if (prevOutGrad) {
prevOutGrad[i] = rPrevOutGrad;
}
}
}
template <class OpStateGrad, typename T>
void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
T *gateGrad, T *prevOutValue,
T *prevOutGrad, T *outputGrad,
int frameSize,
activation_mode_t active_node) {
#ifdef __AVX__
__m256 rUpdateGateValue;
__m256 rUpdateGateGrad;
__m256 rFrameStateValue;
__m256 rFrameStateGrad;
__m256 rOutGrad;
__m256 rPrevOutValue = _mm256_set1_ps(0.0f);
__m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
__m256 *updateGateValue = (__m256 *)gateValue;
__m256 *updateGateGrad = (__m256 *)gateGrad;
__m256 *frameStateValue = (__m256 *)(gateValue + frameSize * 2);
__m256 *frameStateGrad = (__m256 *)(gateGrad + frameSize * 2);
for (int i = 0; i < frameSize / 8; i++) {
rUpdateGateValue = updateGateValue[i];
rFrameStateValue = frameStateValue[i];
rOutGrad = ((__m256 *)outputGrad)[i];
if (prevOutValue) {
rPrevOutValue = ((__m256 *)prevOutValue)[i];
}
if (prevOutGrad) {
rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
}
opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
active_node);
updateGateGrad[i] = rUpdateGateGrad;
frameStateGrad[i] = rFrameStateGrad;
if (prevOutGrad) {
((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
}
}
#endif
}
template <class OpResetGrad, typename T>
void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
T *gateGrad, T *prevOutValue,
T *prevOutGrad, T *resetOutputGrad,
int frameSize,
activation_mode_t active_gate) {
#ifdef __AVX__
__m256 rUpdateGateValue;
__m256 rUpdateGateGrad;
__m256 rResetGateValue;
__m256 rResetGateGrad;
__m256 rResetOutputGrad = _mm256_set1_ps(0.0f);
__m256 rPrevOutValue = _mm256_set1_ps(0.0f);
__m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
__m256 *updateGateValue = (__m256 *)gateValue;
__m256 *updateGateGrad = (__m256 *)gateGrad;
__m256 *resetGateValue = (__m256 *)(gateValue + frameSize);
__m256 *resetGateGrad = (__m256 *)(gateGrad + frameSize);
for (int i = 0; i < frameSize / 8; i++) {
rUpdateGateValue = updateGateValue[i];
rUpdateGateGrad = updateGateGrad[i];
rResetGateValue = resetGateValue[i];
if (prevOutValue && prevOutGrad) {
rResetOutputGrad = ((__m256 *)resetOutputGrad)[i];
}
if (prevOutValue) {
rPrevOutValue = ((__m256 *)prevOutValue)[i];
}
if (prevOutGrad) {
rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
}
opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
active_gate);
updateGateGrad[i] = rUpdateGateGrad;
resetGateGrad[i] = rResetGateGrad;
if (prevOutGrad) {
((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
}
}
#endif
}
template <class OpStateGrad, typename T>
inline void backward_state_grad(OpStateGrad opStateGrad, hl_gru_value<T> value,
hl_gru_grad<T> grad, int frameSize,
int batchSize, activation_mode_t active_node) {
for (int b = 0; b < batchSize; b++) {
if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_backward_state_grad(
opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
} else {
hl_naive_gru_backward_state_grad(
opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
}
value.gateValue += frameSize * 3;
if (value.prevOutValue) {
value.prevOutValue += frameSize;
}
grad.gateGrad += frameSize * 3;
grad.outputGrad += frameSize;
if (grad.prevOutGrad) {
grad.prevOutGrad += frameSize;
}
}
}
template <class OpResetGrad, typename T>
inline void backward_reset_grad(OpResetGrad opResetGrad, hl_gru_value<T> value,
hl_gru_grad<T> grad, int frameSize,
int batchSize, activation_mode_t active_gate) {
for (int b = 0; b < batchSize; b++) {
if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_backward_reset_grad(
opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
} else {
hl_naive_gru_backward_reset_grad(
opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
}
value.gateValue += frameSize * 3;
if (value.prevOutValue) {
value.prevOutValue += frameSize;
}
grad.gateGrad += frameSize * 3;
grad.resetOutputGrad += frameSize;
if (grad.prevOutGrad) {
grad.prevOutGrad += frameSize;
}
}
}
#endif
} // namespace detail
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/operators/math/detail/activation_functions.h"
#include "paddle/operators/math/gru_compute.h"
#include "paddle/platform/cuda_helper.h"
#include "paddle/platform/device_context.h"
#include <glog/logging.h>
namespace paddle {
namespace operators {
namespace math {
namespace detail {
/*
* threads(framePerBlock, batchPerBlock)
* grid(frameBlocks, batchBlocks)
*/
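// When isBatch is true, the y dimension of the launch indexes the sample
// within the batch and the per-sample pointers are offset accordingly; with
// isBatch false, a single sample is handled by a 1-D launch.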
template <class OpResetOutput, bool isBatch, typename T>
__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput,
T *gateValue, T *resetOutputValue,
T *prevOutputValue, int frameSize,
int batchSize,
activation_mode_t active_gate) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return;
int batchIdx = 0;
if (isBatch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return;
gateValue += batchIdx * 3 * frameSize;
resetOutputValue += batchIdx * frameSize;
}
T rPrevOut = 0;
T rValueResetOutput;
T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
T rValueResetGate = gateValue[frameIdx + frameSize * 1];
if (prevOutputValue) {
if (isBatch) prevOutputValue += batchIdx * frameSize;
rPrevOut = prevOutputValue[frameIdx];
}
opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput,
active_gate);
gateValue[frameIdx + frameSize * 0] = rValueUpdateGate;
gateValue[frameIdx + frameSize * 1] = rValueResetGate;
resetOutputValue[frameIdx] = rValueResetOutput;
}
/*
* threads(framePerBlock, batchPerBlock)
* grid(frameBlocks, batchBlocks)
*/
template <class OpFinalOutput, bool isBatch, typename T>
__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput,
T *gateValue, T *prevOutputValue,
T *outputValue, int frameSize,
int batchSize,
activation_mode_t active_node) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return;
int batchIdx = 0;
if (isBatch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return;
gateValue += batchIdx * 3 * frameSize;
outputValue += batchIdx * frameSize;
}
T rOutput;
T rPrevOut = 0;
T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
T rValueFrameState = gateValue[frameIdx + frameSize * 2];
if (prevOutputValue) {
if (isBatch) prevOutputValue += batchIdx * frameSize;
rPrevOut = prevOutputValue[frameIdx];
}
opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
active_node);
gateValue[frameIdx + frameSize * 2] = rValueFrameState;
outputValue[frameIdx] = rOutput;
}
/*
* threads(framePerBlock, batchPerBlock)
* grid(frameBlocks, batchBlocks)
*/
template <class OpStateGrad, bool isBatch, typename T>
__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue,
T *gateGrad, T *prevOutValue,
T *prevOutGrad, T *outputGrad,
int frameSize, int batchSize,
activation_mode_t active_node) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return;
int batchIdx = 0;
if (isBatch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return;
gateValue += batchIdx * 3 * frameSize;
gateGrad += batchIdx * 3 * frameSize;
outputGrad += batchIdx * frameSize;
}
T rUpdateGateGrad;
T rFrameStateGrad;
T rPrevOutValue = 0;
T rPrevOutGrad = 0;
T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
T rFrameStateValue = gateValue[frameIdx + frameSize * 2];
T rOutGrad = outputGrad[frameIdx];
if (prevOutValue && prevOutGrad) {
if (isBatch) prevOutValue += batchIdx * frameSize;
rPrevOutValue = prevOutValue[frameIdx];
if (isBatch) prevOutGrad += batchIdx * frameSize;
rPrevOutGrad = prevOutGrad[frameIdx];
}
opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
active_node);
gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad;
if (prevOutGrad) {
prevOutGrad[frameIdx] = rPrevOutGrad;
}
}
/*
* threads(framePerBlock, batchPerBlock)
* grid(frameBlocks, batchBlocks)
*/
template <class OpResetGrad, bool isBatch, typename T>
__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue,
T *gateGrad, T *prevOutValue,
T *prevOutGrad, T *resetOutputGrad,
int frameSize, int batchSize,
activation_mode_t active_gate) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return;
int batchIdx = 0;
if (isBatch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return;
gateValue += batchIdx * 3 * frameSize;
gateGrad += batchIdx * 3 * frameSize;
resetOutputGrad += batchIdx * frameSize;
}
T rResetGateGrad;
T rPrevOutValue = 0;
T rPrevOutGrad = 0;
T rResetOutputGrad = 0;
T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0];
T rResetGateValue = gateValue[frameIdx + frameSize * 1];
if (prevOutValue && prevOutGrad) {
if (isBatch) prevOutValue += batchIdx * frameSize;
if (isBatch) prevOutGrad += batchIdx * frameSize;
rPrevOutValue = prevOutValue[frameIdx];
rPrevOutGrad = prevOutGrad[frameIdx];
rResetOutputGrad = resetOutputGrad[frameIdx];
}
opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
active_gate);
gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
gateGrad[frameIdx + frameSize * 1] = rResetGateGrad;
if (prevOutGrad) {
prevOutGrad[frameIdx] = rPrevOutGrad;
}
}
} // namespace detail
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/detail/activation_functions.h"
#include "paddle/platform/hostdevice.h"
#include <type_traits>
// TODO(guosheng): refine code style in gru_kernel
namespace paddle {
namespace operators {
namespace math {
namespace detail {
namespace forward {
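// gru_resetOutput applies the gate activation to the update and reset gates
// and produces resetOutput = prevOut * resetGate, which feeds the
// frame-state GEMM.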
template <typename T>
class gru_resetOutput {
public:
HOSTDEVICE void operator()(T &valueUpdateGate, T &valueResetGate, T &prevOut,
T &valueResetOutput, activation_mode_t actGate) {
valueUpdateGate = activation(valueUpdateGate, actGate);
valueResetGate = activation(valueResetGate, actGate);
valueResetOutput = prevOut * valueResetGate;
}
#ifndef __NVCC__
#ifndef __AVX__
static const bool avx = false;
#else
static const bool avx = true;
HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueResetGate,
__m256 &prevOut, __m256 &valueResetOutput,
activation_mode_t actGate) {
valueUpdateGate = activation(valueUpdateGate, actGate);
valueResetGate = activation(valueResetGate, actGate);
valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate);
}
#endif
#endif
};
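// gru_finalOutput activates the frame state and blends it with the previous
// output: output = (1 - u) * prevOut + u * frameState, written below as
// prevOut - u * prevOut + u * frameState.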
template <typename T>
class gru_finalOutput {
public:
HOSTDEVICE void operator()(T &valueUpdateGate, T &valueFrameState, T &prevOut,
T &valueOutput, activation_mode_t actInput) {
valueFrameState = activation(valueFrameState, actInput);
valueOutput = prevOut - (valueUpdateGate * prevOut) +
(valueUpdateGate * valueFrameState);
}
#ifndef __NVCC__
#ifndef __AVX__
static const bool avx = false;
#else
static const bool avx = true;
HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueFrameState,
__m256 &prevOut, __m256 &valueOutput,
activation_mode_t actInput) {
valueFrameState = activation(valueFrameState, actInput);
valueOutput = _mm256_add_ps(
_mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)),
_mm256_mul_ps(valueUpdateGate, valueFrameState));
}
#endif
#endif
};
} // namespace forward
namespace backward {
template <typename T>
class gru_stateGrad {
public:
HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
T &valueFrameState, T &gradFrameState,
T &valuePrevOut, T &gradPrevOut, T &gradOutput,
activation_mode_t actInput) {
gradUpdateGate = (gradOutput * valueFrameState);
gradUpdateGate -= (gradOutput * valuePrevOut);
gradPrevOut -= (gradOutput * valueUpdateGate);
gradPrevOut += gradOutput;
gradFrameState =
activation(gradOutput * valueUpdateGate, valueFrameState, actInput);
}
#ifndef __NVCC__
#ifndef __AVX__
static const bool avx = false;
#else
static const bool avx = true;
HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
__m256 &valueFrameState, __m256 &gradFrameState,
__m256 &valuePrevOut, __m256 &gradPrevOut,
__m256 &gradOutput, activation_mode_t actInput) {
gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState);
gradUpdateGate =
_mm256_sub_ps(gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut));
gradPrevOut = _mm256_add_ps(
_mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)),
gradOutput);
gradFrameState = activation(_mm256_mul_ps(gradOutput, valueUpdateGate),
valueFrameState, actInput);
}
#endif
#endif
};
template <typename T>
class gru_resetGrad {
public:
HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
T &valueResetGate, T &gradResetGate,
T &valuePrevOut, T &gradPrevOut,
T &gradResetOutput, activation_mode_t actGate) {
gradResetGate = (gradResetOutput * valuePrevOut);
gradPrevOut += (gradResetOutput * valueResetGate);
gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
gradResetGate = activation(gradResetGate, valueResetGate, actGate);
}
#ifndef __NVCC__
#ifndef __AVX__
static const bool avx = false;
#else
static const bool avx = true;
HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
__m256 &valueResetGate, __m256 &gradResetGate,
__m256 &valuePrevOut, __m256 &gradPrevOut,
__m256 &gradResetOutput,
activation_mode_t actGate) {
gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut);
gradPrevOut = _mm256_add_ps(gradPrevOut,
_mm256_mul_ps(gradResetOutput, valueResetGate));
gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
gradResetGate = activation(gradResetGate, valueResetGate, actGate);
}
#endif
#endif
};
} // namespace backward
} // namespace detail
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/gru_compute.h"
#include "paddle/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/operators/math/detail/gru_kernel.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
struct GRUUnitFunctor<platform::CPUPlace, T> {
static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, int frameSize, int batchSize,
activation_mode_t active_node,
activation_mode_t active_gate) {
#ifndef __NVCC__
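    // Forward for one time-step batch (the two GEMMs are skipped when there
    // is no previous output):
    //   1) gate pre-activations [update|reset] += prevOut * gateWeight
    //   2) activate the gates and form resetOutput = prevOut .* resetGate
    //   3) frame-state pre-activation += resetOutput * stateWeight
    //   4) activate the frame state and blend it with prevOut into outputValue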
if (value.prevOutValue) {
math::gemm<platform::CPUPlace, T>(
context, false, false, batchSize, frameSize * 2, frameSize, 1,
value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
value.gateValue, frameSize * 3);
}
detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
frameSize, batchSize, active_gate);
if (value.prevOutValue) {
math::gemm<platform::CPUPlace, T>(
context, false, false, batchSize, frameSize, frameSize, 1,
value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
value.gateValue + frameSize * 2, frameSize * 3);
}
detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
frameSize, batchSize, active_node);
#endif
}
};
template <typename T>
struct GRUUnitGradFunctor<platform::CPUPlace, T> {
static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
int batchSize, activation_mode_t active_node,
activation_mode_t active_gate) {
#ifndef __NVCC__
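    // Backward for one time-step batch: propagate gradients through the
    // output blend and frame state, then the state-weight GEMMs
    // (resetOutputGrad, stateWeightGrad), then the gate gradients, and
    // finally the gate-weight GEMMs that also accumulate into prevOutGrad.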
detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
grad, frameSize, batchSize, active_node);
if (value.prevOutValue && grad.prevOutGrad) {
math::gemm<platform::CPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize, 1,
grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
frameSize, 0, grad.resetOutputGrad, frameSize);
if (grad.stateWeightGrad) {
math::gemm<platform::CPUPlace, T>(
context, true, false, frameSize, frameSize, batchSize, 1,
value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
frameSize * 3, 1, grad.stateWeightGrad, frameSize);
}
}
detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
grad, frameSize, batchSize, active_gate);
if (grad.prevOutGrad && value.prevOutValue) {
math::gemm<platform::CPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize * 2, 1,
grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
grad.prevOutGrad, frameSize);
if (grad.gateWeightGrad) {
math::gemm<platform::CPUPlace, T>(
context, true, false, frameSize, frameSize * 2, batchSize, 1,
value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
grad.gateWeightGrad, frameSize * 2);
}
}
#endif
}
};
template struct GRUUnitFunctor<platform::CPUPlace, float>;
template struct GRUUnitFunctor<platform::CPUPlace, double>;
template struct GRUUnitGradFunctor<platform::CPUPlace, float>;
template struct GRUUnitGradFunctor<platform::CPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/detail/gru_gpu_kernel.h"
#include "paddle/operators/math/detail/gru_kernel.h"
#include "paddle/operators/math/gru_compute.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
struct GRUUnitFunctor<platform::GPUPlace, T> {
static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, int frameSize, int batchSize,
activation_mode_t active_node,
activation_mode_t active_gate) {
auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
dim3 threads;
dim3 grid;
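    // Launch configuration: a single sample uses a 1-D launch over the frame
    // dimension (up to 1024 threads per block); larger batches tile
    // (frame, batch) with 32x32 thread blocks.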
if (batchSize == 1) {
int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
int frameBlocks = (frameSize + 1024 - 1) / 1024;
threads = dim3(framePerBlock, 1);
grid = dim3(frameBlocks, 1);
} else {
threads = dim3(32, 32);
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
}
if (value.prevOutValue) {
math::gemm<platform::GPUPlace, T>(
context, false, false, batchSize, frameSize * 2, frameSize, 1,
value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
value.gateValue, frameSize * 3);
}
if (batchSize == 1) {
detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
/* isBatch= */ false,
T><<<grid, threads, 0, stream>>>(
detail::forward::gru_resetOutput<T>(), value.gateValue,
value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
active_gate);
} else {
detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
/* isBatch= */ true,
T><<<grid, threads, 0, stream>>>(
detail::forward::gru_resetOutput<T>(), value.gateValue,
value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
active_gate);
}
if (value.prevOutValue) {
math::gemm<platform::GPUPlace, T>(
context, false, false, batchSize, frameSize, frameSize, 1,
value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
value.gateValue + frameSize * 2, frameSize * 3);
}
if (batchSize == 1) {
detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
/* isBatch= */ false,
T><<<grid, threads, 0, stream>>>(
detail::forward::gru_finalOutput<T>(), value.gateValue,
value.prevOutValue, value.outputValue, frameSize, batchSize,
active_node);
} else {
detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
/* isBatch= */ true,
T><<<grid, threads, 0, stream>>>(
detail::forward::gru_finalOutput<T>(), value.gateValue,
value.prevOutValue, value.outputValue, frameSize, batchSize,
active_node);
}
}
};
template <typename T>
struct GRUUnitGradFunctor<platform::GPUPlace, T> {
static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
int batchSize, activation_mode_t active_node,
activation_mode_t active_gate) {
auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
dim3 threads;
dim3 grid;
if (batchSize == 1) {
int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
int frameBlocks = (frameSize + 1024 - 1) / 1024;
threads = dim3(framePerBlock, 1);
grid = dim3(frameBlocks, 1);
} else {
threads = dim3(32, 32);
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
}
if (batchSize == 1) {
detail::KeGruBackwardStateGrad<
detail::backward::gru_stateGrad<T>,
/* isBatch= */ false><<<grid, threads, 0, stream>>>(
detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
batchSize, active_node);
} else {
detail::KeGruBackwardStateGrad<
detail::backward::gru_stateGrad<T>,
/* isBatch= */ true><<<grid, threads, 0, stream>>>(
detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
batchSize, active_node);
}
if (value.prevOutValue && grad.prevOutGrad) {
math::gemm<platform::GPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize, 1,
grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
frameSize, 0, grad.resetOutputGrad, frameSize);
if (grad.stateWeightGrad) {
math::gemm<platform::GPUPlace, T>(
context, true, false, frameSize, frameSize, batchSize, 1,
value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
frameSize * 3, 1, grad.stateWeightGrad, frameSize);
}
}
if (batchSize == 1) {
detail::KeGruBackwardResetGrad<
detail::backward::gru_resetGrad<T>,
/* isBatch= */ false><<<grid, threads, 0, stream>>>(
detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
batchSize, active_gate);
} else {
detail::KeGruBackwardResetGrad<
detail::backward::gru_resetGrad<T>,
/* isBatch= */ true><<<grid, threads, 0, stream>>>(
detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
batchSize, active_gate);
}
if (grad.prevOutGrad && value.prevOutValue) {
math::gemm<platform::GPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize * 2, 1,
grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
grad.prevOutGrad, frameSize);
if (grad.gateWeightGrad) {
math::gemm<platform::GPUPlace, T>(
context, true, false, frameSize, frameSize * 2, batchSize, 1,
value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
grad.gateWeightGrad, frameSize * 2);
}
}
}
};
template struct GRUUnitFunctor<platform::GPUPlace, float>;
template struct GRUUnitFunctor<platform::GPUPlace, double>;
template struct GRUUnitGradFunctor<platform::GPUPlace, float>;
template struct GRUUnitGradFunctor<platform::GPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/operators/math/lstm_compute.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"
namespace paddle {
namespace operators {
namespace math {
// TODO(guosheng): refine code style in gru_compute
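// hl_gru_value bundles the pointers one GRU step reads and writes (weights,
// gate buffer, reset output, output, previous output); hl_gru_grad holds the
// matching gradient buffers. prevOutValue/prevOutGrad may be null when no
// initial hidden state (or its gradient) is involved.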
template <typename T>
struct hl_gru_value {
T *gateWeight;
T *stateWeight;
T *gateValue;
T *resetOutputValue;
T *outputValue;
T *prevOutValue;
};
template <typename T>
struct hl_gru_grad {
T *gateWeightGrad;
T *stateWeightGrad;
T *gateGrad;
T *resetOutputGrad;
T *outputGrad;
T *prevOutGrad;
};
template <typename Place, typename T>
struct GRUUnitFunctor {
static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, int frameSize, int batchSize,
activation_mode_t active_node,
activation_mode_t active_gate);
};
template <typename Place, typename T>
struct GRUUnitGradFunctor {
static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
int batchSize, activation_mode_t active_node,
activation_mode_t active_gate);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/sequence_pooling.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class MaxSeqPoolFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::LoDTensor& input, framework::Tensor* output,
framework::Tensor* index) {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto idx_dims = index->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1);
PADDLE_ENFORCE_GT(out_dims.size(), 1);
for (int64_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
}
PADDLE_ENFORCE_EQ(idx_dims, out_dims);
auto starts = input.lod()[0];
const T* in_data = input.data<T>();
T* out_data = output->data<T>();
int* max_index = index->data<int>();
int64_t num_seq = out_dims[0];
int64_t dim = output->numel() / num_seq;
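    // For each sequence, take the element-wise maximum over its time steps
    // and record the winning step in max_index so the backward pass can
    // route the gradient to it.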
for (int64_t i = 0; i < num_seq; ++i) {
for (int64_t k = 0; k < dim; ++k) {
out_data[i * dim + k] = in_data[starts[i] * dim + k];
max_index[i * dim + k] = starts[i];
}
for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
for (int64_t k = 0; k < dim; ++k) {
if (in_data[j * dim + k] > out_data[i * dim + k]) {
out_data[i * dim + k] = in_data[j * dim + k];
max_index[i * dim + k] = j;
}
}
}
}
}
};
template <typename T>
class MaxSeqPoolGradFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& out_grad,
const framework::Tensor& index,
framework::LoDTensor* in_grad) {
auto og_dims = out_grad.dims();
auto ig_dims = in_grad->dims();
auto idx_dims = index.dims();
PADDLE_ENFORCE_GT(og_dims.size(), 1);
PADDLE_ENFORCE_GT(ig_dims.size(), 1);
for (int64_t i = 1; i < og_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
}
PADDLE_ENFORCE_EQ(idx_dims, og_dims);
const T* og_data = out_grad.data<T>();
const int* max_index = index.data<int>();
T* ig_data = in_grad->data<T>();
SetConstant<platform::CPUPlace, T> set_zero;
set_zero(context, in_grad, static_cast<T>(0.0));
int64_t num_seq = og_dims[0];
int64_t dim = out_grad.numel() / num_seq;
for (int64_t i = 0; i < num_seq; ++i) {
for (int64_t j = 0; j < dim; ++j) {
int step_id = max_index[i * dim + j];
ig_data[step_id * dim + j] = og_data[i * dim + j];
}
}
}
};
template class MaxSeqPoolFunctor<platform::CPUPlace, float>;
template class MaxSeqPoolFunctor<platform::CPUPlace, double>;
template class MaxSeqPoolGradFunctor<platform::CPUPlace, float>;
template class MaxSeqPoolGradFunctor<platform::CPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence_pooling.h"
namespace paddle {
namespace operators {
namespace math {
#define FLT_MAX __FLT_MAX__
template <typename T>
__global__ void KeMaxSequencePool(const T* input, const size_t* starts,
T* output, int* index, int64_t num_seq,
int64_t dim) {
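  // One thread block handles one sequence; threads stride across the feature
  // dimension and scan the sequence's time steps for the maximum value and
  // its index.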
int dim_idx = threadIdx.x;
int seq_id = blockIdx.x;
if (seq_id >= num_seq) return;
size_t start = starts[seq_id];
size_t end = starts[seq_id + 1];
for (int64_t i = dim_idx; i < dim; i += blockDim.x) {
T max_val = static_cast<T>(-FLT_MAX);
int max_id = -1;
for (size_t step_id = start; step_id < end; step_id++) {
if (max_val < input[step_id * dim + i]) {
max_val = input[step_id * dim + i];
max_id = step_id;
}
}
output[seq_id * dim + i] = max_val;
index[seq_id * dim + i] = max_id;
}
}
template <typename T>
class MaxSeqPoolFunctor<platform::GPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::LoDTensor& input, framework::Tensor* output,
framework::Tensor* index) {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto idx_dims = index->dims();
PADDLE_ENFORCE_GT(in_dims.size(), static_cast<int64_t>(1));
PADDLE_ENFORCE_GT(out_dims.size(), 1);
for (int64_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
}
PADDLE_ENFORCE_EQ(idx_dims, out_dims);
auto starts = input.lod()[0];
const T* in_data = input.data<T>();
T* out_data = output->data<T>();
int* max_index = index->data<int>();
int64_t num_seq = out_dims[0];
int64_t dim = output->numel() / num_seq;
dim3 threads(256, 1);
dim3 grid(num_seq, 1);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
in_data, starts.data(), out_data, max_index, num_seq, dim);
}
};
template <typename T>
__global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index,
T* in_grad, int64_t num_seq,
int64_t dim) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int col_idx = idx % dim;
if (idx < num_seq * dim) {
int step_id = max_index[idx];
in_grad[step_id * dim + col_idx] = out_grad[idx];
}
}
template <typename T>
class MaxSeqPoolGradFunctor<platform::GPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& out_grad,
const framework::Tensor& index,
framework::LoDTensor* in_grad) {
auto og_dims = out_grad.dims();
auto idx_dims = index.dims();
auto ig_dims = in_grad->dims();
PADDLE_ENFORCE_GT(og_dims.size(), static_cast<int64_t>(1));
PADDLE_ENFORCE_GT(ig_dims.size(), static_cast<int64_t>(1));
for (int64_t i = 1; i < og_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
}
PADDLE_ENFORCE_EQ(idx_dims, og_dims);
const T* og_data = out_grad.data<T>();
const int* max_index = index.data<int>();
T* ig_data = in_grad->data<T>();
SetConstant<platform::GPUPlace, T> set_zero;
set_zero(context, in_grad, static_cast<T>(0.0));
int64_t num_seq = og_dims[0];
int64_t dim = out_grad.numel() / num_seq;
unsigned int blocks = (num_seq * dim + 128 - 1) / 128;
dim3 threads(128, 1);
dim3 grid(blocks, 1);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
KeMaxSequencePoolGrad<T><<<grid, threads, 0, stream>>>(
og_data, max_index, ig_data, num_seq, dim);
}
};
template class MaxSeqPoolFunctor<platform::GPUPlace, float>;
template class MaxSeqPoolFunctor<platform::GPUPlace, double>;
template class MaxSeqPoolGradFunctor<platform::GPUPlace, float>;
template class MaxSeqPoolGradFunctor<platform::GPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
#define FLT_MAX __FLT_MAX__
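// MaxSeqPoolFunctor computes, for every sequence in the LoDTensor `input`,
// the column-wise maximum over its time steps, writing the pooled values to
// `output` and the step index of each maximum to `index`.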
template <typename Place, typename T>
class MaxSeqPoolFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::LoDTensor& input, framework::Tensor* output,
framework::Tensor* index);
};
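// MaxSeqPoolGradFunctor scatters `out_grad` back to `in_grad`: only the
// positions recorded in `index` (the argmax of the forward pass) receive
// gradient; all other entries are set to zero.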
template <typename Place, class T>
class MaxSeqPoolGradFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& out_grad,
const framework::Tensor& index,
framework::LoDTensor* in_grad);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/positive_negative_pair_op.h"
namespace paddle {
namespace operators {
class PositiveNegativePairOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("Score"),
"Input(Score) of PositiveNegativePairOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("Label"),
"Input(Label) of PositiveNegativePairOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("QueryID"),
"Input(QueryID) of PositiveNegativePairOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("PositivePair"),
"Output(PositivePair) of PositiveNegativePairOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("NegativePair"),
"Output(NegativePair) of PositiveNegativePairOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("NeutralPair"),
"Output(NeutralPair) of PositiveNegativePairOp should not be null.");
auto scalar_dim = framework::make_ddim({1});
if (ctx->HasInput("AccumulatePositivePair") ||
ctx->HasInput("AccumulateNegativePair") ||
ctx->HasInput("AccumulateNeutralPair")) {
PADDLE_ENFORCE(ctx->HasInput("AccumulatePositivePair") &&
ctx->HasInput("AccumulateNegativePair") &&
ctx->HasInput("AccumulateNeutralPair"),
"All optional inputs(AccumulatePositivePair, "
"AccumulateNegativePair, AccumulateNeutralPair) of "
"PositiveNegativePairOp are required if one of them is "
"specified.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulatePositivePair"), scalar_dim,
"Shape of AccumulatePositivePair should be {1}.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNegativePair"), scalar_dim,
"Shape of AccumulateNegativePair should be {1}.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNeutralPair"), scalar_dim,
"Shape of AccumulateNeutralPair should be {1}.");
}
auto score_dim = ctx->GetInputDim("Score");
auto label_dim = ctx->GetInputDim("Label");
auto query_dim = ctx->GetInputDim("QueryID");
PADDLE_ENFORCE_EQ(score_dim.size(), 2, "Score should be a 2-D tensor.");
PADDLE_ENFORCE_EQ(label_dim.size(), 2, "Label should be a 2-D tensor.");
PADDLE_ENFORCE_EQ(
label_dim[0], score_dim[0],
"Tensor Score and Label should have the same height (batch size).");
PADDLE_ENFORCE_EQ(label_dim[1], 1,
"The width of Label should be 1, i.e. each item should "
"have a scalar label.");
PADDLE_ENFORCE(query_dim == label_dim,
"QueryID should have the same shape as Label.");
if (ctx->HasInput("Weight")) {
PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim,
"Weight should have the same shape as Label.");
}
int column = ctx->Attrs().Get<int>("column");
auto depth = score_dim[1];
PADDLE_ENFORCE(column < depth && column >= -depth,
"Attribute column should be in the range of [-%l, %l)",
depth, depth);
ctx->SetOutputDim("PositivePair", scalar_dim);
ctx->SetOutputDim("NegativePair", scalar_dim);
ctx->SetOutputDim("NeutralPair", scalar_dim);
}
protected:
framework::DataType IndicateDataType(
const framework::ExecutionContext &ctx) const override {
return framework::ToDataType(ctx.Input<Tensor>("Score")->type());
}
};
class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker {
public:
PositiveNegativePairOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Score",
"(Tensor, float) Model Score on an item (with "
"respect to QueryID). It's a 2-D tensor with shape [batch_size, "
"depth], where the column specified by the attribute \"column\" "
"is used as item score.");
AddInput("Label",
"(Tensor, float) Label of an item (with repsect to "
"QueryId). It's a 2-D tensor with shape [batch_size, 1].");
AddInput("QueryID",
"(Tensor, int64) Query ID that indicates the context. Its shape "
"should be the same as Label.");
AddInput(
"AccumulatePositivePair",
"(float) Optional. The accumulated number of positive pairs over a "
"stream of data. If provided, the output PositivePair will be "
"initialized with this number rather than 0. it won't be modified "
"in place.")
.AsDispensable();
AddInput(
"AccumulateNegativePair",
"(float) Optional. The accumulated number of negative pairs over a "
"stream of data. If provided, the output NegativePair will be "
"initialized with this number rather than 0. it won't be modified "
"in place.")
.AsDispensable();
AddInput("AccumulateNeutralPair",
"(float) Optional. The accumulated number of neutral pairs over a "
"stream of data. If provided, the output NeutralPair will be "
"initialized with this number rather than 0. it won't be modified "
"in place.")
.AsDispensable();
AddInput("Weight",
"(float) Optional. Weight of current item. If specified, its "
"shape should be the same as Label, and the meaning of the output "
"changes from numbers of pairs to the total sum of pairs' "
"weights. Weight of a pair of items is the average of their "
"weights.")
.AsDispensable();
AddOutput("PositivePair",
"(float) Number of positive pairs, i.e. the pairs of "
"items that are ranked correctly.");
AddOutput("NegativePair",
"(float) Number of negative pairs, i.e. the pairs of "
"items that are ranked incorrectly.");
AddOutput("NeutralPair",
"(float) Number of neutral pairs, i.e. the pairs of items "
"that have the same score.")
.AsDispensable();
AddAttr<int>(
"column",
"(int, default -1) The column position of Score used to rank items in "
"descending order. It must be in the range of [-rank(Score), "
"rank(Score)). "
"If `dim < 0`, the dim to reduce is `rank + dim`. "
"Noting that reducing on the first dim will make the LoD info lost.")
.SetDefault(0);
AddComment(R"DOC(
PositiveNegativePairOp can be used to evaluate the performance of a
Learning To Rank (LTR) model.
Within some context, e.g. the "query", an LTR model generates scores for
a list of items, which gives a partial order of the items.
PositiveNegativePairOp takes the reference rank order (Input("Label")) and
the model-generated scores (Input("Score")) as inputs and counts the pairs
that are ranked correctly and incorrectly.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(positive_negative_pair,
ops::PositiveNegativePairOp,
ops::PositiveNegativePairOpMaker);
REGISTER_OP_CPU_KERNEL(
positive_negative_pair,
ops::PositiveNegativePairKernel<paddle::platform::CPUPlace, float>,
ops::PositiveNegativePairKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <unordered_map>
#include <vector>
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/utils/Logging.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename Place, typename T>
class PositiveNegativePairKernel : public framework::OpKernel<T> {
public:
struct PredictionResult {
PredictionResult(T score, T label, T weight)
: score(score), label(label), weight(weight) {}
T score;
T label;
T weight;
};
void Compute(const framework::ExecutionContext& context) const override {
auto score_t = context.Input<Tensor>("Score");
auto label_t = context.Input<Tensor>("Label");
auto query_t = context.Input<Tensor>("QueryID");
auto acc_positive_t = context.Input<Tensor>("AccumulatePositivePair");
auto acc_negative_t = context.Input<Tensor>("AccumulateNegativePair");
auto acc_neutral_t = context.Input<Tensor>("AccumulateNeutralPair");
auto positive_t = context.Output<Tensor>("PositivePair");
auto negative_t = context.Output<Tensor>("NegativePair");
auto neutral_t = context.Output<Tensor>("NeutralPair");
auto weight_t = context.Input<Tensor>("Weight");
auto score = score_t->data<T>();
auto label = label_t->data<T>();
auto query = query_t->data<int64_t>();
const T* weight = nullptr;
if (weight_t != nullptr) {
weight = weight_t->data<T>();
}
T* positive = positive_t->mutable_data<T>(context.GetPlace());
T* negative = negative_t->mutable_data<T>(context.GetPlace());
T* neutral = neutral_t->mutable_data<T>(context.GetPlace());
auto score_dim = score_t->dims();
auto batch_size = score_dim[0];
auto width = score_dim[1];
auto column = context.Attr<int32_t>("column");
if (column < 0) {
column += width;
}
// construct document instances for each query: Query => List[<score#0,
// label#0, weight#0>, ...]
std::unordered_map<int64_t, std::vector<PredictionResult>> predictions;
for (auto i = 0; i < batch_size; ++i) {
if (predictions.find(query[i]) == predictions.end()) {
predictions.emplace(
std::make_pair(query[i], std::vector<PredictionResult>()));
}
predictions[query[i]].emplace_back(score[i * width + column], label[i],
weight_t != nullptr ? weight[i] : 1.0);
}
// for each query, accumulate pair counts
T pos = 0, neg = 0, neu = 0;
if (acc_positive_t != nullptr && acc_negative_t != nullptr &&
acc_neutral_t != nullptr) {
pos = acc_positive_t->data<T>()[0];
neg = acc_negative_t->data<T>()[0];
neu = acc_neutral_t->data<T>()[0];
}
auto evaluate_one_list = [&pos, &neg,
&neu](std::vector<PredictionResult> vec) {
for (auto ite1 = vec.begin(); ite1 != vec.end(); ++ite1) {
for (auto ite2 = ite1 + 1; ite2 != vec.end(); ++ite2) {
if (ite1->label == ite2->label) { // labels are equal, ignore.
continue;
}
T w = (ite1->weight + ite2->weight) * 0.5;
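// A pair is positive when the item with the larger score also has the
// larger label, and negative otherwise. Ties in score are counted as
// neutral; note that, as written, a tie also falls into the negative
// branch below.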
if (ite1->score == ite2->score) {
neu += w;
}
(ite1->score - ite2->score) * (ite1->label - ite2->label) > 0.0
? pos += w
: neg += w;
}
}
};
for (const auto& prediction : predictions) {
evaluate_one_list(prediction.second);
}
*positive = pos;
*negative = neg;
*neutral = neu;
}
};
} // namespace operators
} // namespace paddle
...@@ -27,6 +27,11 @@ class SequencePoolOp : public framework::OperatorWithKernel { ...@@ -27,6 +27,11 @@ class SequencePoolOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SequencePoolOp should not be null."); "Output(Out) of SequencePoolOp should not be null.");
ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
if (ctx->Attrs().Get<std::string>("pooltype") == "MAX") {
PADDLE_ENFORCE(ctx->HasOutput("MaxIndex"),
"Output(MaxIndex) of SequencePoolOp should not be null.");
ctx->SetOutputDim("MaxIndex", ctx->GetInputDim("X"));
}
} }
}; };
...@@ -35,10 +40,14 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -35,10 +40,14 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
SequencePoolOpMaker(framework::OpProto* proto, SequencePoolOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker) framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(LoDTensor), the variable-length input of SequencePoolOp"); AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp");
AddOutput("Out", AddOutput("Out",
"(Tensor), output of SequencePoolOp, which does not contain LoD " "(Tensor) The output of SequencePoolOp does not contain LoD "
"infomation."); "infomation.");
AddOutput("MaxIndex",
"(Tensor<int>) This tensor is used for the sequence max-pooling "
"to record the max indexes.")
.AsIntermediate();
AddAttr<std::string>( AddAttr<std::string>(
"pooltype", "pooltype",
"(int, default AVERAGE) the pooling pooltype of SequencePoolOp.") "(int, default AVERAGE) the pooling pooltype of SequencePoolOp.")
...@@ -96,6 +105,12 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { ...@@ -96,6 +105,12 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
} }
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
} }
protected:
framework::DataType IndicateDataType(
const framework::ExecutionContext& ctx) const override {
return framework::ToDataType(ctx.Input<Tensor>("X")->type());
}
}; };
} // namespace operators } // namespace operators
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/framework/eigen.h" #include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence_pooling.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -34,7 +35,7 @@ class SequencePoolKernel : public framework::OpKernel<T> { ...@@ -34,7 +35,7 @@ class SequencePoolKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X"); auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<LoDTensor>("Out"); auto* out = context.Output<Tensor>("Out");
std::string pooltype = context.Attr<std::string>("pooltype"); std::string pooltype = context.Attr<std::string>("pooltype");
auto dims = in->dims(); auto dims = in->dims();
...@@ -53,6 +54,16 @@ class SequencePoolKernel : public framework::OpKernel<T> { ...@@ -53,6 +54,16 @@ class SequencePoolKernel : public framework::OpKernel<T> {
auto lod_level_0 = lod[0]; auto lod_level_0 = lod[0];
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
if (pooltype == "MAX") {
math::MaxSeqPoolFunctor<Place, T> max_pool;
auto* index = context.Output<Tensor>("MaxIndex");
index->Resize({dims});
index->mutable_data<int>(context.GetPlace());
max_pool(context.device_context(), *in, out, index);
return;
}
auto place = context.GetEigenDevice<Place>(); auto place = context.GetEigenDevice<Place>();
for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) { for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
Tensor in_t = in->Slice(static_cast<int>(lod_level_0[i]), Tensor in_t = in->Slice(static_cast<int>(lod_level_0[i]),
...@@ -69,8 +80,6 @@ class SequencePoolKernel : public framework::OpKernel<T> { ...@@ -69,8 +80,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
} else if (pooltype == "SQRT") { } else if (pooltype == "SQRT") {
out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) / out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
std::sqrt(static_cast<T>(h)); std::sqrt(static_cast<T>(h));
} else if (pooltype == "MAX") {
out_e.device(place) = in_e.maximum(Eigen::array<int, 1>({{0}}));
} else if (pooltype == "LAST") { } else if (pooltype == "LAST") {
out_e.device(place) = in_e.chip(h - 1, 0); out_e.device(place) = in_e.chip(h - 1, 0);
} else if (pooltype == "FIRST") { } else if (pooltype == "FIRST") {
...@@ -87,8 +96,8 @@ class SequencePoolGradKernel : public framework::OpKernel<T> { ...@@ -87,8 +96,8 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X"); auto* in = context.Input<LoDTensor>("X");
auto* out_g = context.Input<Tensor>(framework::GradVarName("Out"));
auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X")); auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
std::string pooltype = context.Attr<std::string>("pooltype"); std::string pooltype = context.Attr<std::string>("pooltype");
auto dims = in->dims(); auto dims = in->dims();
...@@ -96,6 +105,14 @@ class SequencePoolGradKernel : public framework::OpKernel<T> { ...@@ -96,6 +105,14 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
int64_t w = in->numel() / dims[0]; int64_t w = in->numel() / dims[0];
in_g->mutable_data<T>(context.GetPlace()); in_g->mutable_data<T>(context.GetPlace());
if (pooltype == "MAX") {
math::MaxSeqPoolGradFunctor<Place, T> max_pool_grad;
auto* index = context.Input<Tensor>("MaxIndex");
max_pool_grad(context.device_context(), *out_g, *index, in_g);
return;
}
if (pooltype == "LAST" || pooltype == "FIRST") { if (pooltype == "LAST" || pooltype == "FIRST") {
// set X@Grad be zero at first when pooltype is LAST/FIRST // set X@Grad be zero at first when pooltype is LAST/FIRST
math::SetConstant<Place, T> functor; math::SetConstant<Place, T> functor;
...@@ -118,20 +135,6 @@ class SequencePoolGradKernel : public framework::OpKernel<T> { ...@@ -118,20 +135,6 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
} else if (pooltype == "SQRT") { } else if (pooltype == "SQRT") {
in_g_e.device(place) = in_g_e.device(place) =
(out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast); (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
} else if (pooltype == "MAX") {
auto in_t =
in->Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
in_t_map(in_t.data<T>(), h, w);
int row_id;
Eigen::array<int, 2> extents{{1, 1}};
for (int col_id = 0; col_id < w; col_id++) {
in_t_map.col(col_id).maxCoeff(&row_id);
Eigen::array<int, 2> in_offsets{{row_id, col_id}};
Eigen::array<int, 2> out_offsets{{0, col_id}};
in_g_e.slice(in_offsets, extents).device(place) =
out_g_e.slice(out_offsets, extents);
}
} else if (pooltype == "LAST") { } else if (pooltype == "LAST") {
in_g_e.chip(h - 1, 0).device(place) = out_g_e; in_g_e.chip(h - 1, 0).device(place) = out_g_e;
} else if (pooltype == "FIRST") { } else if (pooltype == "FIRST") {
......
...@@ -4,13 +4,13 @@ ...@@ -4,13 +4,13 @@
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/softmax_with_cross_entropy_op.h" #include "paddle/operators/softmax_with_cross_entropy_op.h"
#include <paddle/function/TensorType.h> #include <paddle/function/TensorType.h>
...@@ -30,12 +30,10 @@ class SoftmaxWithCrossEntropyOpMaker ...@@ -30,12 +30,10 @@ class SoftmaxWithCrossEntropyOpMaker
"which is a 2-D tensor with shape [N x K]. N is the batch_size, " "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
"and K is the class number."); "and K is the class number.");
AddInput("Label", AddInput("Label",
"(Tensor, default: Tensor<int>), The ground truth which is a 2-D " "(Tensor) The ground truth which is a 2-D tensor. If soft_label "
"tensor. " "is set to false, Label is a Tensor<int64> with shape [N x 1]. If "
"If softLabel is set to false, Label is a Tensor<int> with shape " "soft_label is set to true, Label is a Tensor<float/double> with "
"[N x 1]." "shape [N x K].");
"If softLabel is set to true, Label is a Tensor<float/double> "
"with shape [N x K].");
AddOutput( AddOutput(
"Softmax", "Softmax",
"(Tensor, default: Tensor<float>), A 2-D tensor with shape [N x K]. " "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N x K]. "
...@@ -62,7 +60,7 @@ Because this operator performs a softmax on logits internally, it expects ...@@ -62,7 +60,7 @@ Because this operator performs a softmax on logits internally, it expects
unscaled logits. This operator should not be used with the output of unscaled logits. This operator should not be used with the output of
softmax operator since that would produce incorrect results. softmax operator since that would produce incorrect results.
When the attribute softLabel is set false, this operator expects mutually When the attribute soft_label is set false, this operator expects mutually
exclusive hard labels, each sample in a batch is in exactly one class with a exclusive hard labels, each sample in a batch is in exactly one class with a
probability of 1.0. Each sample in the batch will have a single label. probability of 1.0. Each sample in the batch will have a single label.
...@@ -198,6 +196,8 @@ REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp, ...@@ -198,6 +196,8 @@ REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
REGISTER_OPERATOR(softmax_with_cross_entropy_grad, REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyOpGrad); ops::SoftmaxWithCrossEntropyOpGrad);
REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy, REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy,
ops::SoftmaxWithCrossEntropyKernel<float>); ops::SoftmaxWithCrossEntropyKernel<float>,
ops::SoftmaxWithCrossEntropyKernel<double>);
REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad, REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradKernel<float>); ops::SoftmaxWithCrossEntropyGradKernel<float>,
ops::SoftmaxWithCrossEntropyGradKernel<double>);
...@@ -4,13 +4,13 @@ ...@@ -4,13 +4,13 @@
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
...@@ -24,7 +24,7 @@ using Tensor = framework::Tensor; ...@@ -24,7 +24,7 @@ using Tensor = framework::Tensor;
namespace { namespace {
template <typename T> template <typename T>
__global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad, __global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad,
const int* labels, const int batch_size, const int64_t* labels, const int batch_size,
const int class_num) { const int class_num) {
int tid = blockIdx.x * blockDim.x + threadIdx.x; int tid = blockIdx.x * blockDim.x + threadIdx.x;
int sample_idx = tid / class_num; int sample_idx = tid / class_num;
...@@ -50,7 +50,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad, ...@@ -50,7 +50,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
int ids = blockIdx.x * blockDim.x + threadIdx.x; int ids = blockIdx.x * blockDim.x + threadIdx.x;
if (ids < batch_size * class_num) { if (ids < batch_size * class_num) {
int row_ids = ids / class_num; int row_ids = ids / class_num;
logit_grad[ids] = logit_grad[ids] * (loss_grad[row_ids] - labels[ids]); logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]);
} }
} }
} // namespace } // namespace
...@@ -104,7 +104,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> { ...@@ -104,7 +104,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
.stream()>>>(logit_grad_data, loss_grad_data, .stream()>>>(logit_grad_data, loss_grad_data,
label_data, batch_size, class_num); label_data, batch_size, class_num);
} else { } else {
const int* label_data = labels->data<int>(); const int64_t* label_data = labels->data<int64_t>();
CrossEntropyGrad<T><<< CrossEntropyGrad<T><<<
grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>( grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
context.device_context()) context.device_context())
...@@ -119,6 +119,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> { ...@@ -119,6 +119,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy, REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy,
ops::SoftmaxWithCrossEntropyCUDAKernel<float>); ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad, REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>); ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
...@@ -4,13 +4,13 @@ ...@@ -4,13 +4,13 @@
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/framework/eigen.h" #include "paddle/framework/eigen.h"
...@@ -60,25 +60,25 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> { ...@@ -60,25 +60,25 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax")); logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
const int class_num = logit_grad->dims()[1]; const int class_num = logit_grad->dims()[1];
auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
if (context.Attr<bool>("soft_label")) { if (context.Attr<bool>("soft_label")) {
auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
auto lbl_mat = EigenMatrix<T>::From(*labels); auto lbl_mat = EigenMatrix<T>::From(*labels);
logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) = logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
logit_grad_mat * out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) *
(out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) - (logit_grad_mat - lbl_mat);
lbl_mat);
} else { } else {
logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
logit_grad_mat *
out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num));
const int batch_size = logit_grad->dims()[0]; const int batch_size = logit_grad->dims()[0];
const int* label_data = labels->data<int>(); const int64_t* label_data = labels->data<int64_t>();
const T* out_grad_data = out_grad->data<T>();
T* logit_grad_data = logit_grad->data<T>(); T* logit_grad_data = logit_grad->data<T>();
const T* out_grad_data = out_grad->data<T>();
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
int index = i * class_num + label_data[i]; logit_grad_data[i * class_num + label_data[i]] -= out_grad_data[i];
logit_grad_data[index] =
out_grad_data[i] * (logit_grad_data[index] - 1.);
} }
} }
} }
......
...@@ -24,10 +24,16 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -24,10 +24,16 @@ class SumOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null"); PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null");
auto x_dims = ctx->GetInputsDim("X");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SumOp should not be null."); "Output(Out) of SumOp should not be null.");
if (ctx->IsRuntime() &&
ctx->GetOutputsVarType("Out")[0] ==
framework::VarDesc::LOD_TENSOR_ARRAY) {
return; // skip runtime infershape when is tensor array;
}
auto x_dims = ctx->GetInputsDim("X");
size_t N = x_dims.size(); size_t N = x_dims.size();
PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1."); PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
...@@ -39,6 +45,28 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -39,6 +45,28 @@ class SumOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Out", in_dim); ctx->SetOutputDim("Out", in_dim);
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
protected:
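// Infer the kernel data type from the first input, which may be a
// LoDTensor, a SelectedRows, or a LoDTensorArray (in the last case, the
// first non-empty element determines the type).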
framework::DataType IndicateDataType(
const framework::ExecutionContext& ctx) const override {
auto x_vars = ctx.MultiInputVar("X");
if (x_vars[0]->IsType<framework::LoDTensor>()) {
return framework::ToDataType(
x_vars[0]->Get<framework::LoDTensor>().type());
} else if (x_vars[0]->IsType<framework::SelectedRows>()) {
return framework::ToDataType(
x_vars[0]->Get<framework::SelectedRows>().value().type());
} else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
auto& array = x_vars[0]->Get<framework::LoDTensorArray>();
for (auto& each : array) {
if (each.numel() != 0) {
return framework::ToDataType(each.type());
}
}
}
PADDLE_THROW("Unexpected branch. Input type is %s",
x_vars[0]->Type().name());
}
}; };
class SumOpMaker : public framework::OpProtoAndCheckerMaker { class SumOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -63,18 +91,32 @@ class SumOpVarTypeInference : public framework::VarTypeInference { ...@@ -63,18 +91,32 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
void operator()(const framework::OpDescBind& op_desc, void operator()(const framework::OpDescBind& op_desc,
framework::BlockDescBind* block) const override { framework::BlockDescBind* block) const override {
auto& inputs = op_desc.Input("X"); auto& inputs = op_desc.Input("X");
auto default_var_type = framework::VarDesc::SELECTED_ROWS; auto var_type = framework::VarDesc::SELECTED_ROWS;
bool any_input_is_lod_tensor = std::any_of( bool any_input_is_lod_tensor = std::any_of(
inputs.begin(), inputs.end(), [block](const std::string& name) { inputs.begin(), inputs.end(), [block](const std::string& name) {
return block->Var(name)->GetType() == framework::VarDesc::LOD_TENSOR; return block->Var(name)->GetType() == framework::VarDesc::LOD_TENSOR;
}); });
if (any_input_is_lod_tensor) {
default_var_type = framework::VarDesc::LOD_TENSOR; auto is_tensor_array = [block](const std::string& name) {
return block->Var(name)->GetType() ==
framework::VarDesc::LOD_TENSOR_ARRAY;
};
bool any_input_is_tensor_array =
std::any_of(inputs.begin(), inputs.end(), is_tensor_array);
bool all_inputs_are_tensor_array =
std::all_of(inputs.begin(), inputs.end(), is_tensor_array);
if (any_input_is_tensor_array) {
PADDLE_ENFORCE(all_inputs_are_tensor_array);
var_type = framework::VarDesc::LOD_TENSOR_ARRAY;
} else if (any_input_is_lod_tensor) {
var_type = framework::VarDesc::LOD_TENSOR;
} }
auto out_var_name = op_desc.Output("Out").front(); auto out_var_name = op_desc.Output("Out").front();
block->Var(out_var_name)->SetType(default_var_type); block->Var(out_var_name)->SetType(var_type);
} }
}; };
......
...@@ -11,6 +11,7 @@ limitations under the License. */ ...@@ -11,6 +11,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/framework/eigen.h" #include "paddle/framework/eigen.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/selected_rows_functor.h" #include "paddle/operators/math/selected_rows_functor.h"
...@@ -28,7 +29,7 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>; ...@@ -28,7 +29,7 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename Place, typename T> template <typename Place, typename T>
class SumKernel : public framework::OpKernel<T> { class SumKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext &context) const override {
auto in_vars = context.MultiInputVar("X"); auto in_vars = context.MultiInputVar("X");
int N = in_vars.size(); int N = in_vars.size();
auto out_var = context.OutputVar("Out"); auto out_var = context.OutputVar("Out");
...@@ -36,7 +37,7 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -36,7 +37,7 @@ class SumKernel : public framework::OpKernel<T> {
bool in_place = out_var == in_vars[0]; bool in_place = out_var == in_vars[0];
if (out_var->IsType<framework::LoDTensor>()) { if (out_var->IsType<framework::LoDTensor>()) {
auto* out = context.Output<Tensor>("Out"); auto *out = context.Output<Tensor>("Out");
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
auto result = EigenVector<T>::Flatten(*out); auto result = EigenVector<T>::Flatten(*out);
...@@ -51,11 +52,11 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -51,11 +52,11 @@ class SumKernel : public framework::OpKernel<T> {
// If in_place, just skip the first tensor // If in_place, just skip the first tensor
for (int i = in_place ? 1 : 0; i < N; i++) { for (int i = in_place ? 1 : 0; i < N; i++) {
if (in_vars[i]->IsType<framework::LoDTensor>()) { if (in_vars[i]->IsType<framework::LoDTensor>()) {
auto& in_t = in_vars[i]->Get<framework::LoDTensor>(); auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
auto in = EigenVector<T>::Flatten(in_t); auto in = EigenVector<T>::Flatten(in_t);
result.device(place) = result + in; result.device(place) = result + in;
} else if (in_vars[i]->IsType<framework::SelectedRows>()) { } else if (in_vars[i]->IsType<framework::SelectedRows>()) {
auto& in_t = in_vars[i]->Get<framework::SelectedRows>(); auto &in_t = in_vars[i]->Get<framework::SelectedRows>();
functor(context.device_context(), in_t, out); functor(context.device_context(), in_t, out);
} else { } else {
PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
...@@ -63,8 +64,8 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -63,8 +64,8 @@ class SumKernel : public framework::OpKernel<T> {
} }
} else if (out_var->IsType<framework::SelectedRows>()) { } else if (out_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now"); PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now");
auto* out = context.Output<SelectedRows>("Out"); auto *out = context.Output<SelectedRows>("Out");
auto* out_value = out->mutable_value(); auto *out_value = out->mutable_value();
// Runtime InferShape // Runtime InferShape
size_t first_dim = 0; size_t first_dim = 0;
...@@ -88,9 +89,36 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -88,9 +89,36 @@ class SumKernel : public framework::OpKernel<T> {
offset, out); offset, out);
offset += in_vars[i]->Get<SelectedRows>().value().numel(); offset += in_vars[i]->Get<SelectedRows>().value().numel();
} }
} else if (out_var->IsType<framework::LoDTensorArray>()) {
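// Sum LoDTensorArrays element-wise: the first non-empty tensor at each
// position is copied (together with its LoD); later tensors are required
// to share that LoD and are accumulated in place.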
auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
"Only support all inputs are TensorArray");
auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
for (size_t i = 0; i < in_array.size(); ++i) {
if (in_array[i].numel() != 0) {
if (i >= out_array.size()) {
out_array.resize(i + 1);
}
if (out_array[i].numel() == 0) {
out_array[i].CopyFrom(in_array[i], in_array[i].place(),
context.device_context());
out_array[i].set_lod(in_array[i].lod());
} else {
PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
auto in = EigenVector<T>::Flatten(in_array[i]);
auto result = EigenVector<T>::Flatten(out_array[i]);
result.device(context.GetEigenDevice<Place>()) = result + in;
}
}
}
}
} else {
PADDLE_THROW("Unexpected branch, output variable type is %s",
out_var->Type().name());
} }
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
This diff is collapsed.
include_directories(${CMAKE_CURRENT_BINARY_DIR})
set(OPITMIZER_SRCS set(OPITMIZER_SRCS
adadelta_optimizer.cc adadelta_optimizer.cc
adagrad_optimizer.cc adagrad_optimizer.cc
...@@ -9,11 +7,6 @@ set(OPITMIZER_SRCS ...@@ -9,11 +7,6 @@ set(OPITMIZER_SRCS
sgd_optimizer.cc sgd_optimizer.cc
) )
add_library(paddle_optimizer STATIC ${OPITMIZER_SRCS}) cc_library(paddle_optimizer STATIC SRCS ${OPITMIZER_SRCS} DEPS paddle_proto glog)
add_dependencies(paddle_optimizer paddle_proto ${external_project_dependencies}) cc_test(serialization_test SRCS serialization_test.cc DEPS paddle_proto)
cc_test(parameter_optimizer_test SRCS parameter_optimizer_test.cc DEPS paddle_optimizer)
if(WITH_TESTING)
add_simple_unittest(serialization_test)
add_simple_unittest(parameter_optimizer_test)
endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "adadelta_optimizer.h" #include "adadelta_optimizer.h"
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once #pragma once
#include "parameter_optimizer.h" #include "parameter_optimizer.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath> #include <cmath>
#include "adagrad_optimizer.h" #include "adagrad_optimizer.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once #pragma once
#include "parameter_optimizer.h" #include "parameter_optimizer.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "adam_optimizer.h" #include "adam_optimizer.h"
#include <cmath> #include <cmath>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once #pragma once
#include "parameter_optimizer.h" #include "parameter_optimizer.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "optimizer.h" #include "optimizer.h"
#include <glog/logging.h> #include <glog/logging.h>
#include <cstdlib> #include <cstdlib>
...@@ -6,8 +20,8 @@ ...@@ -6,8 +20,8 @@
#include "parameter_optimizer.h" #include "parameter_optimizer.h"
using namespace paddle; using paddle::optimizer::ParameterOptimizer;
using namespace paddle::optimizer; using paddle::optimizer::Tensor;
template <paddle_element_type VALUE> template <paddle_element_type VALUE>
struct EnumToType {}; struct EnumToType {};
...@@ -15,22 +29,21 @@ struct EnumToType {}; ...@@ -15,22 +29,21 @@ struct EnumToType {};
template <class T> template <class T>
struct TypeToEnum {}; struct TypeToEnum {};
#define MATCH_ENUM_TYPE(TYPE, ENUM) \ #define MATCH_ENUM_TYPE(TYPE, ENUM) \
template <> \ template <> \
struct TypeToEnum<TYPE> { \ struct TypeToEnum<TYPE> { \
static paddle_element_type v() { return ENUM; }; \ static paddle_element_type v() { return ENUM; } \
static constexpr TYPE value = ENUM; \ static constexpr TYPE value = ENUM; \
}; \ }; \
template <> \ template <> \
struct EnumToType<ENUM> { \ struct EnumToType<ENUM> { \
typedef TYPE Type; \ typedef TYPE Type; \
} }
MATCH_ENUM_TYPE(int32_t, PADDLE_ELEMENT_TYPE_INT32); MATCH_ENUM_TYPE(int32_t, PADDLE_ELEMENT_TYPE_INT32);
MATCH_ENUM_TYPE(uint32_t, PADDLE_ELEMENT_TYPE_UINT32); MATCH_ENUM_TYPE(uint32_t, PADDLE_ELEMENT_TYPE_UINT32);
MATCH_ENUM_TYPE(int64_t, PADDLE_ELEMENT_TYPE_INT64); MATCH_ENUM_TYPE(int64_t, PADDLE_ELEMENT_TYPE_INT64);
MATCH_ENUM_TYPE(uint64_t, PADDLE_ELEMENT_TYPE_UINT64); MATCH_ENUM_TYPE(uint64_t, PADDLE_ELEMENT_TYPE_UINT64);
// TODO(zhihong): only implement below type, need to fix
MATCH_ENUM_TYPE(float, PADDLE_ELEMENT_TYPE_FLOAT32); MATCH_ENUM_TYPE(float, PADDLE_ELEMENT_TYPE_FLOAT32);
MATCH_ENUM_TYPE(double, PADDLE_ELEMENT_TYPE_FLOAT64); MATCH_ENUM_TYPE(double, PADDLE_ELEMENT_TYPE_FLOAT64);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once #pragma once
#include <stdbool.h> #include <stdbool.h>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include "adadelta_optimizer.h" #include "adadelta_optimizer.h"
#include "adagrad_optimizer.h" #include "adagrad_optimizer.h"
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.