提交 945ad8d9 编写于 作者: T tensor-tang

Merge remote-tracking branch 'upstream/develop' into resnet

......@@ -126,7 +126,7 @@ include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/pybind11) # download pybind11
include(external/nccl)
include(cudnn) # set cudnn libraries, must before configure
......
......@@ -79,9 +79,8 @@ if(NOT DEFINED IOS_ARCH)
# FIXME(liuyiqun): support "armv7;armv7s;arm64" future
set(IOS_ARCH "arm64")
elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
set(IOS_ARCH "i386;x86_64")
elseif(IOS_PLATFORM STREQUAL "WATCHOS")
set(IOS_ARCH armv7k)
# FIXME(liuyiqun): support "i386;x86_64" future
set(IOS_ARCH "x86_64")
endif()
endif()
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT WITH_GPU)
return()
endif()
include(ExternalProject)
set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
INCLUDE(ExternalProject)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
if(NOT WITH_PYTHON)
return()
endif()
include(ExternalProject)
INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
ExternalProject_Add(
extern_pybind
......@@ -17,14 +35,12 @@ ExternalProject_Add(
TEST_COMMAND ""
)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";")
add_library(pybind STATIC ${dummyfile})
else()
add_library(pybind INTERFACE)
endif()
add_dependencies(pybind extern_pybind)
LIST(APPEND external_project_dependencies pybind)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
# This file is use to check all support level of AVX on your machine
# so that PaddlePaddle can unleash the vectorization power of muticore.
INCLUDE(CheckCXXSourceRuns)
INCLUDE(CheckCXXSourceCompiles)
include(CheckCXXSourceRuns)
include(CheckCXXSourceCompiles)
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(MMX_FLAG "-mmmx")
set(SSE2_FLAG "-msse2")
set(SSE3_FLAG "-msse3")
SET(AVX_FLAG "-mavx")
SET(AVX2_FLAG "-mavx2")
ELSEIF(MSVC)
set(AVX_FLAG "-mavx")
set(AVX2_FLAG "-mavx2")
elseif(MSVC)
set(MMX_FLAG "/arch:MMX")
set(SSE2_FLAG "/arch:SSE2")
set(SSE3_FLAG "/arch:SSE3")
SET(AVX_FLAG "/arch:AVX")
SET(AVX2_FLAG "/arch:AVX2")
ENDIF()
endif()
set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
# Check MMX
set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <mmintrin.h>
int main()
......@@ -32,6 +33,7 @@ int main()
# Check SSE2
set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <emmintrin.h>
int main()
......@@ -42,6 +44,7 @@ int main()
# Check SSE3
set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <pmmintrin.h>
int main()
......@@ -55,6 +58,7 @@ int main()
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
......@@ -67,6 +71,7 @@ int main()
# Check AVX 2
set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
......
......@@ -145,7 +145,7 @@ PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以
Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。
PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Nodebook。
PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。
我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行:
......
......@@ -63,7 +63,7 @@
</tr>
<tr>
<td class="left" rowspan="15">训练</td><td class="left">dot_period</td>
<td class="left" rowspan="14">训练</td><td class="left">dot_period</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
......
......@@ -8,3 +8,4 @@ PaddlePaddle 文档
howto/index_cn.rst
api/index_cn.rst
faq/index_cn.rst
mobile/index_cn.rst
......@@ -7,3 +7,4 @@ PaddlePaddle Documentation
getstarted/index_en.rst
howto/index_en.rst
api/index_en.rst
mobile/index_en.rst
......@@ -20,10 +20,32 @@ $ docker build -t username/paddle-android:dev . -f Dockerfile.android
构建好开发镜像后,即可使用开发镜像来编译Android版PaddlePaddle C-API库。
Android的Docker开发镜像向用户提供两个可配置的参数:
| Argument | Optional Values | Default |
|-----------------|-------------------------|---------|
|`ANDROID_ABI` |`armeabi-v7a, arm64-v8a` | `armeabi-v7a` |
|`ANDROID_API` |`>= 21` | `21` |
<table class="docutils">
<colgroup>
<col width="25%" />
<col width="50%" />
<col width="25%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd">
<th class="head">Argument</th>
<th class="head">Optional Values</th>
<th class="head">Default</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even">
<td>ANDROID_ABI</td>
<td>armeabi-v7a, arm64-v8a</td>
<td>armeabi-v7a</td>
</tr>
<tr class="row-odd">
<td>ANDROID_API</td>
<td>>= 21</td>
<td>21</td>
</tr>
</tbody>
</table>
- 编译`armeabi-v7a``Android API 21`的PaddlePaddle库
```bash
......
......@@ -26,10 +26,32 @@ $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_A
The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
| Argument | Optional Values | Default |
|-----------------|-------------------------|---------|
|`ANDROID_ABI` |`armeabi-v7a, arm64-v8a` | `armeabi-v7a` |
|`ANDROID_API` |`>= 21` | `21` |
<table class="docutils">
<colgroup>
<col width="25%" />
<col width="50%" />
<col width="25%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd">
<th class="head">Argument</th>
<th class="head">Optional Values</th>
<th class="head">Default</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even">
<td>ANDROID_ABI</td>
<td>armeabi-v7a, arm64-v8a</td>
<td>armeabi-v7a</td>
</tr>
<tr class="row-odd">
<td>ANDROID_API</td>
<td>>= 21</td>
<td>21</td>
</tr>
</tbody>
</table>
The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
......
......@@ -27,10 +27,28 @@ iOS平台可选配置参数:
- `SIMULATOR`,构建目标为`x86`架构的模拟器平台。
- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示:
| IOS_PLATFORM | IOS_ARCH |
|--------------|----------------------|
| OS | armv7, armv7s, arm64 (默认) |
| SIMULATOR | i386, x86_64 (默认) |
<table class="docutils">
<colgroup>
<col width="35%" />
<col width="65%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd">
<th class="head">IOS_PLATFORM</th>
<th class="head">IOS_ARCH</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even">
<td>OS</td>
<td>armv7, armv7s, arm64 (默认)</td>
</tr>
<tr class="row-odd">
<td>SIMULATOR</td>
<td>i386, x86_64 (默认)</td>
</tr>
</tbody>
</table>
- `IOS_DEPLOYMENT_TARGET`,最小的iOS部署版本,默认值为`7.0`
- `IOS_ENABLE_BITCODE`,是否使能[Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3),可设置`ON/OFF`,默认值为`ON`
......
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_cn.md
cross_compiling_for_ios_cn.md
cross_compiling_for_raspberry_cn.md
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_en.md
cross_compiling_for_raspberry_en.md
......@@ -29,32 +29,32 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
add_dependencies(paddle_capi paddle_proto)
# TODO: paddle_capi_whole will be removed.
set(PADDLE_CAPI_LAYERS_LIBS
paddle_function
paddle_gserver)
if(MOBILE_INFERENCE)
set(PADDLE_CAPI_INFER_LIBS
paddle_utils
paddle_parameter
paddle_math
paddle_cuda
paddle_function
paddle_gserver
paddle_proto)
set(PADDLE_CAPI_ENGINE_LIBS
paddle_utils
paddle_parameter
paddle_math
paddle_cuda
paddle_proto)
else()
set(PADDLE_CAPI_INFER_LIBS
paddle_utils
paddle_parameter
paddle_math
paddle_cuda
paddle_function
paddle_gserver
paddle_proto
paddle_pserver
paddle_network)
set(PADDLE_CAPI_ENGINE_LIBS
paddle_utils
paddle_parameter
paddle_math
paddle_cuda
paddle_proto
paddle_pserver
paddle_network)
endif()
set(PADDLE_CAPI_INFER_LIBS ${PADDLE_CAPI_LAYERS_LIBS} ${PADDLE_CAPI_ENGINE_LIBS})
cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
# Link the static library for inference
cc_library(paddle_capi_engine DEPS paddle_capi paddle_utils paddle_parameter paddle_math paddle_cuda paddle_proto)
cc_library(paddle_capi_layers DEPS paddle_function paddle_gserver)
cc_library(paddle_capi_engine DEPS paddle_capi ${PADDLE_CAPI_ENGINE_LIBS})
cc_library(paddle_capi_layers DEPS ${PADDLE_CAPI_LAYERS_LIBS})
# Link the shared library for inference
if(NOT IOS)
......
......@@ -140,19 +140,9 @@ Similarly, the lengths in the top level LoD
are transformed into offsets of elements/words as follows:
```
0 9 10 15
= = =
3+2+4 1+9 2+3+10
```
so we can tell that the first article is from word 0 to word 9, and the second article is from word 9 to word 10.
The complete offset representation is as follows:
```
0 9 10 15
0 3 5 9 10 12 15
||| || |||| | || |||
0 3 4 6
= = =
3 3+1 4+2
```
## Slicing of LoD Tensors
......
......@@ -67,8 +67,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
out);
in_var->SetLoDLevel(out_var->GetLodLevel());
}
bool IsRuntime() const override;
protected:
VarDesc::VarType GetVarType(const std::string &name) const override;
private:
DDim GetDim(const std::string &name) const override;
void SetDim(const std::string &name, const DDim &dim) override;
......@@ -349,6 +352,9 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
info.infer_var_type_(*this, block);
} else {
// all output type is LoDTensor by default
VLOG(10) << this->Type()
<< " has not registered InferVarType. Set output variables to "
"LOD_TENSOR";
for (auto &out_pair : this->outputs_) {
for (auto &out_var_name : out_pair.second) {
block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR);
......@@ -448,6 +454,12 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
const DDim &dim) {
block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
}
bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
const std::string &name) const {
return block_.FindVarRecursive(name)->GetType();
}
} // namespace framework
} // namespace paddle
......@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/framework/operator.h"
#include <algorithm>
#include <atomic>
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h"
namespace paddle {
namespace framework {
......@@ -365,7 +367,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
out_tensor->set_lod(in_tensor.lod());
}
private:
bool IsRuntime() const override { return true; }
protected:
DDim GetDim(const std::string& name) const override {
Variable* var = scope_.FindVar(name);
if (var->IsType<LoDTensor>()) {
......@@ -388,6 +392,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
}
}
VarDesc::VarType GetVarType(const std::string& name) const override {
auto* var = scope_.FindVar(name);
return ToVarType(var->Type());
}
private:
const OperatorBase& op_;
const Scope& scope_;
};
......
......@@ -46,6 +46,23 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
SetDim(names[i], dims[i]);
}
}
std::vector<VarDesc::VarType> InferShapeContext::GetInputsVarType(
const std::string &name) const {
return GetVarTypes(Inputs(name));
}
std::vector<VarDesc::VarType> InferShapeContext::GetOutputsVarType(
const std::string &name) const {
return GetVarTypes(Outputs(name));
}
std::vector<VarDesc::VarType> InferShapeContext::GetVarTypes(
const std::vector<std::string> &names) const {
std::vector<VarDesc::VarType> retv;
retv.resize(names.size());
std::transform(names.begin(), names.end(), retv.begin(),
std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,
std::placeholders::_1));
return retv;
}
} // namespace framework
} // namespace paddle
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/framework/attribute.h"
#include "paddle/framework/ddim.h"
#include "paddle/framework/framework.pb.h"
namespace paddle {
namespace framework {
......@@ -26,6 +27,10 @@ class InferShapeContext {
virtual bool HasInput(const std::string &name) const = 0;
virtual bool HasOutput(const std::string &name) const = 0;
std::vector<VarDesc::VarType> GetInputsVarType(const std::string &name) const;
std::vector<VarDesc::VarType> GetOutputsVarType(
const std::string &name) const;
virtual bool HasInputs(const std::string &name) const = 0;
virtual bool HasOutputs(const std::string &name) const = 0;
......@@ -46,6 +51,8 @@ class InferShapeContext {
virtual void ShareLoD(const std::string &in, const std::string &out,
size_t i = 0, size_t j = 0) const = 0;
virtual bool IsRuntime() const = 0;
protected:
virtual framework::DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
......@@ -55,6 +62,11 @@ class InferShapeContext {
void SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims);
std::vector<VarDesc::VarType> GetVarTypes(
const std::vector<std::string> &names) const;
virtual VarDesc::VarType GetVarType(const std::string &name) const = 0;
};
} // namespace framework
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
namespace paddle {
namespace framework {
inline VarDesc::VarType ToVarType(std::type_index type) {
if (type.hash_code() == typeid(LoDTensor).hash_code()) {
return VarDesc_VarType_LOD_TENSOR;
} else if (type.hash_code() == typeid(LoDRankTable).hash_code()) {
return VarDesc_VarType_LOD_RANK_TABLE;
} else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
return VarDesc_VarType_LOD_TENSOR_ARRAY;
} else {
PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
}
}
} // namespace framework
} // namespace paddle
......@@ -48,6 +48,11 @@ class Variable {
void Clear() { holder_.reset(); }
std::type_index Type() const {
PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory");
return holder_->Type();
}
private:
struct Placeholder {
virtual ~Placeholder() {}
......
......@@ -17,7 +17,7 @@ limitations under the License. */
namespace paddle {
ThreadLocalD<std::vector<MemoryHandle *>> ConvBaseProjection::convMem_;
ThreadLocalD<std::vector<MemoryHandlePtr>> ConvBaseProjection::convMem_;
ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
ParameterPtr parameter,
......@@ -175,18 +175,18 @@ void ConvBaseProjection::reshape(int batchSize) {
}
void *ConvBaseProjection::getSpaceBytes(size_t size) {
std::vector<MemoryHandle *> &convMem = *convMem_;
std::vector<MemoryHandlePtr> &convMem = *convMem_;
if (convMem.empty()) {
int numDevices = hl_get_device_count();
convMem.resize(numDevices);
}
int devId = hl_get_device();
MemoryHandle **localMem = &(convMem[devId]);
if (NULL == *localMem || size > (*localMem)->getAllocSize()) {
*localMem = new GpuMemoryHandle(size);
MemoryHandlePtr localMem = convMem[devId];
if (NULL == localMem || size > localMem->getAllocSize()) {
localMem = std::make_shared<GpuMemoryHandle>(size);
}
return (*localMem)->getBuf();
return localMem->getBuf();
}
ConvBaseProjection::~ConvBaseProjection() {
......
......@@ -105,7 +105,7 @@ protected:
bool bias_;
std::unique_ptr<Weight> weight_;
static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
static ThreadLocalD<std::vector<MemoryHandlePtr>> convMem_;
};
} // namespace paddle
......@@ -98,8 +98,19 @@ void SubSequenceLayer::forward(PassType passType) {
CHECK_EQ(numSequences2, numSequences3);
MatrixPtr inputValue = input.value;
IVectorPtr offsetValue = offsetSeq.ids;
IVectorPtr sizeValue = sizeSeq.ids;
IVectorPtr offsetValue;
IVectorPtr sizeValue;
if (useGpu_) {
// copy to cpu
IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
offsetValue->copyFrom(*offsetSeq.ids);
sizeValue->copyFrom(*sizeSeq.ids);
} else {
offsetValue = offsetSeq.ids;
sizeValue = sizeSeq.ids;
}
CHECK_EQ(offsetValue->getSize(), numSequences1);
CHECK_EQ(sizeValue->getSize(), numSequences1);
......@@ -176,8 +187,21 @@ void SubSequenceLayer::backward(const UpdateCallback& callback) {
size_t numSequences1 = startPositions1->getSize() - 1;
const int* starts1 = startPositions1->getData();
IVectorPtr offsetValue = getInput(1).ids;
IVectorPtr sizeValue = getInput(2).ids;
const Argument& offsetSeq = getInput(1);
const Argument& sizeSeq = getInput(2);
IVectorPtr offsetValue;
IVectorPtr sizeValue;
if (useGpu_) {
// copy to cpu
IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
offsetValue->copyFrom(*offsetSeq.ids);
sizeValue->copyFrom(*sizeSeq.ids);
} else {
offsetValue = offsetSeq.ids;
sizeValue = sizeSeq.ids;
}
int* offsets = offsetValue->getData();
int* sizes = sizeValue->getData();
......
......@@ -69,6 +69,20 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
endif()
# conv_op contains several operators
if ("${TARGET}" STREQUAL "conv_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
endif()
# conv_transpose_op contains several operators
if ("${TARGET}" STREQUAL "conv_transpose_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
endif()
# pool_cudnn_op contains several operators
if ("${TARGET}" STREQUAL "pool_cudnn_op")
set(pybind_flag 1)
......@@ -96,7 +110,7 @@ function(op_library TARGET)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
endif()
# reduce_op contains several operators
if ("${TARGET}" STREQUAL "reduce_op")
set(pybind_flag 1)
......@@ -104,6 +118,11 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
endif()
if ("${TARGET}" STREQUAL "tensor_array_read_write_op")
set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n")
endif()
# pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
file(READ ${TARGET}.cc TARGET_CONTENT)
......@@ -139,26 +158,38 @@ set(DEPS_OPS
sum_op
pool_op
pool_with_index_op
conv_op
lstm_op
conv_transpose_op
nccl_op
sequence_conv_op
sequence_pool_op
lod_rank_table_op
lstm_op)
lstm_op
tensor_array_read_write_op
gru_op)
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(conv_op DEPS vol2col)
op_library(sum_op DEPS net_op selected_rows_functor)
op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
if(WITH_GPU)
op_library(nccl_op DEPS nccl_common)
endif()
op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
DEPS net_op tensor_array)
op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
op_library(${src})
......
......@@ -12,7 +12,7 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/conv2d_transpose_op.h"
#include "paddle/operators/conv_transpose_op.h"
namespace paddle {
namespace operators {
......@@ -38,13 +38,13 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(conv2d_transpose_cudnn, ops::Conv2DTransposeOp,
REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp,
ops::CudnnConv2DTransposeOpMaker, conv2d_transpose_cudnn_grad,
ops::Conv2DTransposeOpGrad);
ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
conv2d_transpose_cudnn,
ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv2d_transpose_cudnn_grad,
ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
......@@ -15,7 +15,7 @@
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/memory/memory.h"
#include "paddle/operators/conv2d_transpose_op.h"
#include "paddle/operators/conv_transpose_op.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/cudnn_helper.h"
......
......@@ -12,7 +12,7 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/conv2d_op.h"
#include "paddle/operators/conv_op.h"
namespace paddle {
namespace operators {
......@@ -38,10 +38,11 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(conv_cudnn, ops::Conv2DOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
ops::Conv2DOpGrad);
REGISTER_OP_CPU_KERNEL(
conv_cudnn, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP(conv_cudnn, ops::ConvOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
ops::ConvOpGrad);
REGISTER_OP_CPU_KERNEL(conv_cudnn,
ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv_cudnn_grad,
ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
......@@ -15,7 +15,7 @@
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/memory/memory.h"
#include "paddle/operators/conv2d_op.h"
#include "paddle/operators/conv_op.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/cudnn_helper.h"
......
......@@ -12,18 +12,18 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/conv2d_op.h"
#include "paddle/operators/conv_op.h"
namespace paddle {
namespace operators {
void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of Conv2DOp should not be null.");
"Input(Input) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Filter"),
"Input(Filter) of Conv2DOp should not be null.");
"Input(Filter) of ConvOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of Conv2DOp should not be null.");
"Output(Output) of ConvOp should not be null.");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
......@@ -33,8 +33,17 @@ void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
int input_channels = in_dims[1];
int output_channels = filter_dims[0];
PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D.");
PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D.");
PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
"Conv intput should be 4-D or 5-D tensor.");
PADDLE_ENFORCE_EQ(
in_dims.size(), filter_dims.size(),
"Conv input dimension and filter dimension should be the same.");
PADDLE_ENFORCE(
in_dims.size() - strides.size() == 2U,
"Conv input dimension and strides dimension should be consistent.");
PADDLE_ENFORCE_EQ(
paddings.size(), strides.size(),
"Conv paddings dimension and Conv strides dimension should be the same.");
PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
"The number of input channels should be equal to filter "
"channels * groups.");
......@@ -42,12 +51,12 @@ void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
output_channels % groups, 0,
"The number of output channels should be divided by groups.");
auto output_height =
OutputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]);
auto output_width =
OutputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]);
ctx->SetOutputDim("Output",
{in_dims[0], filter_dims[0], output_height, output_width});
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < paddings.size(); ++i) {
output_shape.push_back(OutputSize(in_dims[i + 2], filter_dims[i + 2],
paddings[i], strides[i]));
}
ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
}
Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
......@@ -55,19 +64,19 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"Input",
"The input tensor of convolution operator. "
"(Tensor) The input tensor of convolution operator. "
"The format of input tensor is NCHW, where N is batch size, C is the "
"number of channels, H is the height of the image, "
"and W is the width of the image.");
"number of channels, H is the height of the feature, "
"and W is the width of the feature.");
AddInput("Filter",
"The filter tensor of convolution operator. "
"(Tensor) The filter tensor of convolution operator. "
"The format of the filter tensor is MCHW, where M is the number of "
"output image channels, C is the number of input image channels, "
"H is the height of the filter, and W is the width of the filter. "
"If the groups attribute is greater than 1, C equals the number of "
"input image channels divided by the groups.");
AddOutput("Output",
"The output tensor of convolution operator. "
"(Tensor) The output tensor of convolution operator. "
"The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>("strides", "strides of convolution operator.")
.SetDefault({1, 1});
......@@ -75,7 +84,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
.SetDefault({0, 0});
AddAttr<int>(
"groups",
"Group size of convolution operator. "
"(int default:1), the group size of convolution operator. "
"According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
"when group=2, the first half of the filters is only connected to the "
"first half of the input channels, while the second half of the filters "
......@@ -84,14 +93,91 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
AddComment(R"DOC(
Convolution Operator.
The convolution operation calculates the output based on the input, filter,
strides, paddings, and groups parameters. The size of each dimension of the
parameters is checked in the infer-shape method.
The convolution operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
size, C is the number of channels, H is the height of the feature, and W is
the width of the feature. Parameters(ksize, strides, paddings) are two elements.
These two elements represent height and width, respectively.
The input(X) size and output(Out) size may be different.
Example:
Input:
Input shape: (N, C_in, H_in, W_in)
Filter shape: (C_out, C_in, H_f, W_f)
Output:
Output shape: (N, C_out, H_out, W_out)
where
H_out = (H_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
W_out = (W_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
)DOC");
}
Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"Input",
"(Tensor) The input tensor of convolution operator. "
"The format of input tensor is NCDHW. Where N is batch size, C is the "
"number of channels, D is the depth of the feature, H is the height of "
"the feature, "
"and W is the width of the feature.");
AddInput("Filter",
"(Tensor) The filter tensor of convolution operator. "
"The format of the filter tensor is MCDHW, where M is the number of "
"output image channels, C is the number of input image channels, "
"D is the depth of the filter, H is the height of the filter, and W "
"is the width of the filter."
"If the groups attribute is greater than 1, C equals the number of "
"input image channels divided by the groups.");
AddOutput("Output",
"(Tensor) The output tensor of convolution operator."
"The format of output tensor is also NCDHW.");
AddAttr<std::vector<int>>(
"strides",
"(vector, default:{0, 0, 0}), the strides of convolution operator.")
.SetDefault({1, 1, 1});
AddAttr<std::vector<int>>(
"paddings",
"(vector, default:{0, 0, 0}), the paddings of convolution operator.")
.SetDefault({0, 0, 0});
AddAttr<int>(
"groups",
"(int default:1), the group size of convolution operator. "
"According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
"when group=2, the first half of the filters is only connected to the "
"first half of the input channels, while the second half of the filters "
"is only connected to the second half of the input channels.")
.SetDefault(1);
AddComment(R"DOC(
Convolution3D Operator.
The convolution operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
size, C is the number of channels,D is the depth of the feature, H is the height of
the feature, and W is the width of the feature. Parameters(ksize, strides, paddings)
are three elements. These three elements represent depth, height and width, respectively.
The input(X) size and output(Out) size may be different.
Example:
Input:
Input shape: (N, C_in, D_in, H_in, W_in)
Filter shape: (C_out, C_in, D_f, H_f, W_f)
Output:
Output shape: (N, C_out, D_out, H_out, W_out)
where
D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1;
)DOC");
}
void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
if (ctx->HasOutput(framework::GradVarName("Input"))) {
......@@ -106,10 +192,18 @@ void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(conv2d, ops::Conv2DOp, ops::Conv2DOpMaker, conv2d_grad,
ops::Conv2DOpGrad);
REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
ops::ConvOpGrad);
namespace ops = paddle::operators;
REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
ops::ConvOpGrad);
REGISTER_OP_CPU_KERNEL(conv2d,
ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv2d, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(conv3d,
ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
......@@ -12,11 +12,16 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/conv2d_op.h"
#include "paddle/operators/conv_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(conv2d,
ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
conv2d, ops::GemmConv2DKernel<paddle::platform::GPUPlace, float>);
conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(conv3d,
ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::GPUPlace, float>);
conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
......@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/im2col.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/vol2col.h"
namespace paddle {
namespace operators {
......@@ -40,14 +41,20 @@ class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
framework::OpAttrChecker* op_checker);
};
class Conv2DOp : public framework::OperatorWithKernel {
class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
public:
Conv3DOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker);
};
class ConvOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
};
class Conv2DOpGrad : public framework::OperatorWithKernel {
class ConvOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -55,7 +62,7 @@ class Conv2DOpGrad : public framework::OperatorWithKernel {
};
template <typename Place, typename T>
class GemmConv2DKernel : public framework::OpKernel<T> {
class GemmConvKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
......@@ -70,51 +77,78 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
int groups = context.Attr<int>("groups");
int batch_size = input->dims()[0];
int input_channels = input->dims()[1];
int filter_height = filter.dims()[filter.dims().size() - 2];
int filter_width = filter.dims()[filter.dims().size() - 1];
int output_channels = output->dims()[1];
int output_height = output->dims()[2];
int output_width = output->dims()[3];
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO, Place, T>
im2col;
const int batch_size = static_cast<int>(input->dims()[0]);
// filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
filter_shape_vec.erase(filter_shape_vec.begin(),
filter_shape_vec.begin() + 2);
// output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
output_shape_vec.erase(output_shape_vec.begin(),
output_shape_vec.begin() + 2);
// use col_shape in the im2col calculation
framework::DDim col_shape = {input_channels / groups, filter_height,
filter_width, output_height, output_width};
// col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
// o_h, o_w}
std::vector<int64_t> col_shape_vec;
col_shape_vec.push_back(input->dims()[1] / groups);
col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
filter_shape_vec.end());
col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
output_shape_vec.end());
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
framework::DDim col_matrix_shape = {
input_channels / groups * filter_height * filter_width,
output_height * output_width};
// size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d *
// o_h * o_w)
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace());
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix = col;
Tensor col_matrix;
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
framework::DDim input_shape = {input->dims()[1], input->dims()[2],
input->dims()[3]};
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {output_channels,
output_height * output_width};
// convolution operator: im2col + gemm
int in_step = input_channels / groups;
int out_step = output_channels / groups;
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
// im2col
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
im2col(context.device_context(), in_slice, col, strides[0], strides[1],
paddings[0], paddings[0], paddings[1], paddings[1]);
if (filter_shape_vec.size() == 2) {
// im2col
math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
im2col(context.device_context(), in_slice, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1],
paddings[1]);
} else if (filter_shape_vec.size() == 3) {
// vol2col
math::Vol2ColFunctor<Place, T> vol2col;
vol2col(context.device_context(), in_slice, col, strides[0],
strides[1], strides[2], paddings[0], paddings[1],
paddings[2]);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
......@@ -127,7 +161,7 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
};
template <typename Place, typename T>
class GemmConvGrad2DKernel : public framework::OpKernel<T> {
class GemmConvGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
......@@ -137,64 +171,79 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
// The filter and filter_grad will be reshaped in the calculations,
// so here use an assignment operation,
// that avoids modifying the variable in the Scope.
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
int groups = context.Attr<int>("groups");
int batch_size = input->dims()[0];
int input_channels = input->dims()[1];
int filter_height = filter.dims()[filter.dims().size() - 2];
int filter_width = filter.dims()[filter.dims().size() - 1];
int output_channels = output_grad->dims()[1];
int output_height = output_grad->dims()[2];
int output_width = output_grad->dims()[3];
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kCFO, Place, T>
col2im;
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO, Place, T>
im2col;
// use col_shape in the im2col and col2im calculation
framework::DDim col_shape = {input_channels / groups, filter_height,
filter_width, output_height, output_width};
const int batch_size = static_cast<int>(input->dims()[0]);
// filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
filter_shape_vec.erase(filter_shape_vec.begin(),
filter_shape_vec.begin() + 2);
// output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
std::vector<int64_t> output_shape_vec(
framework::vectorize(output_grad->dims()));
output_shape_vec.erase(output_shape_vec.begin(),
output_shape_vec.begin() + 2);
// use col_shape in the im2col calculation
// col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
// o_h, o_w}
std::vector<int64_t> col_shape_vec;
col_shape_vec.push_back(input->dims()[1] / groups);
col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
filter_shape_vec.end());
col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
output_shape_vec.end());
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
framework::DDim col_matrix_shape = {
input_channels / groups * filter_height * filter_width,
output_height * output_width};
Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace());
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix = col;
col_matrix.Resize(col_matrix_shape);
// size: (i_c/g * k_h * k_w, o_h * o_w)
// or
// (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
framework::DDim input_shape = {input->dims()[1], input->dims()[2],
input->dims()[3]};
framework::DDim output_matrix_shape = {
output_grad->dims()[1],
output_grad->dims()[2] * output_grad->dims()[3]};
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
// convolution backward input operator: gemm + col2im
// convolution backward weight operator: im2col + gemm
int in_step = input_channels / groups;
int out_step = output_channels / groups;
framework::DDim output_matrix_shape = {
output_grad->dims()[1],
output_grad->numel() /
(output_grad->dims()[0] * output_grad->dims()[1])};
// convolution backward input operator: gemm + col2im(or col2vol)
// convolution backward weight operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output_grad->dims()[1]) / groups;
Tensor col;
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix;
col.mutable_data<T>(col_shape, context.GetPlace());
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
math::SetConstant<Place, T> set_zero;
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*input_grad);
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
set_zero(context.device_context(), input_grad, static_cast<T>(0));
for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch =
......@@ -208,13 +257,22 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
math::matmul<Place, T>(context.device_context(), filter_slice, true,
out_grad_slice, false, T(1.0), &col_matrix,
T(0.0));
// col2im
Tensor in_grad_slice =
in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
col2im(context.device_context(), in_grad_slice, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1],
paddings[1]);
if (filter_shape_vec.size() == 2) {
math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
col2im(context.device_context(), in_grad_slice, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1],
paddings[1]);
} else if (filter_shape_vec.size() == 3) {
math::Col2VolFunctor<Place, T> col2vol;
col2vol(context.device_context(), in_grad_slice, col, strides[0],
strides[1], strides[2], paddings[0], paddings[1],
paddings[2]);
}
}
}
}
......@@ -223,8 +281,7 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
filter_grad->mutable_data<T>(context.GetPlace());
Tensor filter_grad_ = *filter_grad;
filter_grad_.Resize(filter_matrix_shape);
auto t = framework::EigenVector<T>::Flatten(filter_grad_);
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
set_zero(context.device_context(), filter_grad, static_cast<T>(0));
for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch =
......@@ -235,9 +292,18 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
Tensor out_grad_slice =
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
im2col(context.device_context(), in_slice, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1],
paddings[1]);
if (filter_shape_vec.size() == 2) {
math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
im2col(context.device_context(), in_slice, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1],
paddings[1]);
} else if (filter_shape_vec.size() == 3) {
math::Vol2ColFunctor<Place, T> vol2col;
vol2col(context.device_context(), in_slice, col, strides[0],
strides[1], strides[2], paddings[0], paddings[1],
paddings[2]);
}
// gemm
Tensor filter_grad_slice =
......@@ -250,6 +316,5 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
}
}
};
} // namespace operators
} // namespace paddle
......@@ -12,18 +12,18 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/conv2d_transpose_op.h"
#include "paddle/operators/conv_transpose_op.h"
namespace paddle {
namespace operators {
void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of Conv2DTransposeOp should not be null.");
"Input(Input) of ConvTransposeOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Filter"),
"Input(Filter) of Conv2DTransposeOp should not be null.");
"Input(Filter) of ConvTransposeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of Conv2DTransposeOp should not be null.");
"Output(Output) of ConvTransposeOp should not be null.");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
......@@ -35,17 +35,27 @@ void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
"No Padding allowed in conv transpose op.");
}
PADDLE_ENFORCE_EQ(in_dims.size(), 4,
"Conv2DTransposeOp input should be 4-D tensor.");
PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
"Conv2DTransposeOp filter should be 4-D tensor.");
PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
"ConvTransposeOp intput should be 4-D or 5-D tensor.");
PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
"ConvTransposeOp input dimension and filter dimension "
"should be the same.");
PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U,
"ConvTransposeOp input dimension and strides dimension should "
"be consistent.");
PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
"ConvTransposeOp paddings dimension and Conv strides "
"dimension should be the same.");
PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
"input and kernel input dimension should be equal.");
"In ConvTransposeOp, The input channel should be the same "
"as the number of filters.");
auto output_height = (in_dims[2] - 1) * strides[0] + filter_dims[2];
auto output_width = (in_dims[3] - 1) * strides[1] + filter_dims[3];
ctx->SetOutputDim("Output",
{in_dims[0], filter_dims[1], output_height, output_width});
std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
for (size_t i = 0; i < paddings.size(); ++i) {
output_shape.push_back((in_dims[i + 2] - 1) * strides[i] +
filter_dims[i + 2]);
}
ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
}
Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
......@@ -54,37 +64,109 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
AddInput(
"Input",
"(Tensor) The input tensor of convolution transpose operator. "
"The format of input tensor is NCHW, where N is batch size, C is the "
"number of input channels, H is the height of the image, and "
"W is the width of the image.");
"The format of input tensor is NCHW. Where N is batch size, C is the "
"number of input channels, H is the height of the feature, and "
"W is the width of the feature.");
AddInput("Filter",
"(Tensor) The filter tensor of convolution transpose operator."
"(Tensor) The filter tensor of convolution transpose operator. "
"The format of the filter tensor is CMHW, where C is the number of "
"output image channels, M is the number of input image channels, "
"H is the height of the filter, and W is the width of the filter. "
"We enforce groups number == 1 and padding == 0 in "
"the convolution transpose scenario.");
AddOutput("Output",
"(Tensor) The output tensor of convolution transpose operator."
"(Tensor) The output tensor of convolution transpose operator. "
"The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>("strides",
"strides of convolution transpose operator.")
AddAttr<std::vector<int>>(
"strides",
"(vector defalut:{1, 1}), strides of convolution transpose operator.")
.SetDefault({1, 1});
AddAttr<std::vector<int>>("paddings",
"paddings of convolution transpose operator.")
AddAttr<std::vector<int>>(
"paddings",
"(vector defalut:{0, 0}), paddings of convolution transpose operator.")
.SetDefault({0, 0});
AddComment(R"DOC(
Convolution Transpose Operator.
The convolution transpose operation calculates the output based on the input,
filter, strides, paddings, and groups parameters. The size of each dimension
of the parameters is checked in the infer-shape method.
Convolution2D Transpose Operator.
The convolution transpose operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
size, C is the number of channels, H is the height of the feature, and
W is the width of the feature. Parameters(ksize, strides, paddings) are two elements.
These two elements represent height and width, respectively.
The input(X) size and output(Out) size may be different.
Example:
Input:
Input shape: (N, C_in, H_in, W_in)
Filter shape: (C_in, C_out, H_f, W_f)
Output:
Output shape: (N, C_out, H_out, W_out)
where
H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
)DOC");
}
Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
framework::OpProto* proto, framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Input",
"(Tensor) The input tensor of convolution transpose operator."
"The format of input tensor is NCDHW. Where N is batch size, C is "
"the number of channels, D is the depth of the feature, H is the "
"height of the feature, and "
"W is the width of the feature.");
AddInput("Filter",
"(Tensor) The filter tensor of convolution transpose operator."
"The format of the filter tensor is CMDHW, where C is the number of "
"output image channels, M is the number of input image channels, D "
"is the depth of the filter, H is the height of the filter, and "
"W is the width of the filter."
"We enforce groups number == 1 and padding == 0 in "
"the convolution3d transpose scenario.");
AddOutput("Output",
"(Tensor) The output tensor of convolution transpose operator."
"The format of output tensor is also NCDHW."
"Where N is batch size, C is "
"the number of channels, D is the depth of the feature, H is the "
"height of the feature, and W is the width of the feature.");
AddAttr<std::vector<int>>(
"strides",
"(vector defalut:{1, 1, 1}), strides of convolution transpose operator.")
.SetDefault({1, 1, 1});
AddAttr<std::vector<int>>(
"paddings",
"(vector defalut:{0, 0, 0}), paddings of convolution transpose operator.")
.SetDefault({0, 0, 0});
AddComment(R"DOC(
Convolution3D Transpose Operator.
The convolution transpose operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
size, C is the number of channels, D is the depth of the feature,
H is the height of the feature, and W is the width of the feature.
Parameters(ksize, strides, paddings) are three elements.
These three elements represent depth, height and width, respectively.
The input(X) size and output(Out) size may be different.
Example:
Input:
Input shape: (N, C_in, D_in, H_in, W_in)
Filter shape: (C_in, C_out, D_f, H_f, W_f)
Output:
Output shape: (N, C_out, D_out, H_out, W_out)
where
D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2];
)DOC");
}
void Conv2DTransposeOpGrad::InferShape(
framework::InferShapeContext* ctx) const {
void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const {
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
if (ctx->HasOutput(framework::GradVarName("Input"))) {
......@@ -99,13 +181,23 @@ void Conv2DTransposeOpGrad::InferShape(
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(conv2d_transpose, ops::Conv2DTransposeOp,
ops::Conv2DTransposeOpMaker, conv2d_transpose_grad,
ops::Conv2DTransposeOpGrad);
REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
conv2d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
conv2d_transpose,
ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv2d_transpose_grad,
ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
conv3d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
conv3d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv3d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
......@@ -12,13 +12,20 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/conv2d_transpose_op.h"
#include "paddle/operators/conv_transpose_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
conv2d_transpose,
ops::GemmConv2DTransposeKernel<paddle::platform::GPUPlace, float>);
ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
conv2d_transpose_grad,
ops::GemmConv2DTransposeGradKernel<paddle::platform::GPUPlace, float>);
ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
conv3d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
conv3d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
......@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/im2col.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/vol2col.h"
namespace paddle {
namespace operators {
......@@ -33,7 +34,13 @@ class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
framework::OpAttrChecker* op_checker);
};
class Conv2DTransposeOp : public framework::OperatorWithKernel {
class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
Conv3DTransposeOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker);
};
class ConvTransposeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -41,7 +48,7 @@ class Conv2DTransposeOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override;
};
class Conv2DTransposeOpGrad : public framework::OperatorWithKernel {
class ConvTransposeOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -50,41 +57,44 @@ class Conv2DTransposeOpGrad : public framework::OperatorWithKernel {
};
template <typename Place, typename T>
class GemmConv2DTransposeKernel : public framework::OpKernel<T> {
class GemmConvTransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
// The filter will be reshaped, so it should not be constant pointer
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
// TODO(Zhuoyuan): Paddings can be added in future.
// groups will alway be disabled in conv2d_transpose.
const int batch_size = input->dims()[0];
const int m = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int k_h = filter.dims()[2];
const int k_w = filter.dims()[3];
const int c = output->dims()[1]; // output channels
const int o_h = output->dims()[2];
const int o_w = output->dims()[3];
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kCFO, Place, T>
col2im;
// use col_shape in the im2col and col2im calculation
DDim col_shape = {c, k_h, k_w, h, w};
// groups will alway be disabled in conv2dtranspose.
const int batch_size = static_cast<int>(input->dims()[0]);
// input_shape_vec: {h, w} or {d, h, w}
std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
input_shape_vec.erase(input_shape_vec.begin(), input_shape_vec.begin() + 2);
// filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
filter_shape_vec.erase(filter_shape_vec.begin(),
filter_shape_vec.begin() + 2);
// use col_shape in the im2col and col2im (or vol2col and col2vol)
// calculation
// col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
std::vector<int64_t> col_shape_vec;
col_shape_vec.push_back(output->dims()[1]);
col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
filter_shape_vec.end());
col_shape_vec.insert(col_shape_vec.end(), input_shape_vec.begin(),
input_shape_vec.end());
DDim col_shape(framework::make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
DDim col_matrix_shape = {c * k_h * k_w, h * w};
// size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace());
......@@ -95,160 +105,189 @@ class GemmConv2DTransposeKernel : public framework::OpKernel<T> {
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
DDim output_shape = {c, o_h, o_w};
DDim input_matrix_shape = {m, h * w};
// output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
DDim output_shape =
framework::slice_ddim(output->dims(), 1, output->dims().size());
DDim filter_matrix_shape = {m, c * k_h * k_w};
filter.Resize(filter_matrix_shape);
// input matrix size: (m, h * w) or (m, d * h * w)
DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
// convolution transpose: gemm + col2im (similar to conv-backward on input)
// filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
filter.Resize(filter_matrix_shape);
output->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*output);
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
math::SetConstant<Place, T> set_zero;
set_zero(context.device_context(), output, static_cast<T>(0));
// convolution transpose: gemm + col2im or col2vol (similar to conv-backward
// on input)
for (int i = 0; i < batch_size; i++) {
// batch with size (M, h * w)
// batch with size (m, h * w) or (m, d * h * w)
Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
// filter size: (M, c * k_h * k_w)
// output size: (c, o_h, o_w)
// output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
// col_matrix = filter * input_batch
// of shape (c * k_h * k_w, h * w)
// of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
math::matmul<Place, T>(context.device_context(), filter, true,
input_batch, false, T(1.0), &col_matrix, T(0.0));
col2im(context.device_context(), output_batch, col, strides[0],
strides[1], 0, 0, 0, 0);
input_batch, false, static_cast<T>(1.0),
&col_matrix, static_cast<T>(0.0));
if (filter_shape_vec.size() == 2) {
// col2im: col_matrix -> dy
// from (c * k_h * k_w, h * w) to (c, o_h, o_w)
math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
col2im(context.device_context(), output_batch, col, strides[0],
strides[1], 0, 0, 0, 0);
} else if (filter_shape_vec.size() == 3) {
// col2vol: col_matrix -> dy
// from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
math::Col2VolFunctor<Place, T> col2vol;
col2vol(context.device_context(), output_batch, col, strides[0],
strides[1], strides[2], 0, 0, 0);
}
}
}
};
template <typename Place, typename T>
class GemmConv2DTransposeGradKernel : public framework::OpKernel<T> {
class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
// For filter, we do not use const pointer b/c we will do reshape,
// but we should avoid modifying its value.
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
if ((!input_grad) && (!filter_grad)) return;
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
// Actually, no paddings and groups allowed in conv transpose.
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
const int batch_size = input->dims()[0];
const int m = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int batch_size = static_cast<int>(input->dims()[0]);
const int k_h = filter.dims()[2];
const int k_w = filter.dims()[3];
// input_shape_vec: {h, w} or {d, h, w}
std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
input_shape_vec.erase(input_shape_vec.begin(), input_shape_vec.begin() + 2);
const int c = output_grad->dims()[1]; // output channels
const int o_h = output_grad->dims()[2];
const int o_w = output_grad->dims()[3];
// filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
filter_shape_vec.erase(filter_shape_vec.begin(),
filter_shape_vec.begin() + 2);
// Only im2col functor required for bp to get to the right shape
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO, Place, T>
im2col;
// use col_shape in the im2col and col2im calculation
DDim col_shape = {c, k_h, k_w, h, w};
// use col_shape in the im2col and col2im (or vol2col and col2vol)
// calculation
// col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
std::vector<int64_t> col_shape_vec;
col_shape_vec.push_back(output_grad->dims()[1]);
col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
filter_shape_vec.end());
col_shape_vec.insert(col_shape_vec.end(), input_shape_vec.begin(),
input_shape_vec.end());
DDim col_shape(framework::make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
DDim col_matrix_shape_f = {c * h * w, k_h * k_w};
// size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace());
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
// output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
DDim output_shape = framework::slice_ddim(output_grad->dims(), 1,
output_grad->dims().size());
DDim output_shape = {c, o_h, o_w};
DDim input_matrix_shape = {m, h * w};
// input matrix size: (m, h * w) or (m, d * h * w)
DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
DDim filter_matrix_shape = {m, c * k_h * k_w};
// filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
filter.Resize(filter_matrix_shape);
// convolution transpose grad on input:
// im2col + gemm (similar to conv-forward)
// input need to compute gradient
if (input_grad) {
if (input_grad || filter_grad) {
Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace());
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix;
col_matrix.ShareDataWith(col);
DDim col_matrix_shape = {c * k_h * k_w, h * w};
col_matrix.Resize(col_matrix_shape);
input_grad->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*input_grad);
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
Tensor filter_grad_;
math::SetConstant<Place, T> set_zero;
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
set_zero(context.device_context(), input_grad, static_cast<T>(0));
}
if (filter_grad) { // filter size (m, c, k_h, k_w)
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(context.device_context(), filter_grad, static_cast<T>(0));
filter_grad_ = *filter_grad;
filter_grad_.Resize(filter_matrix_shape);
}
for (int i = 0; i < batch_size; i++) {
// batch with size (c, o_h * o_w)
Tensor output_grad_batch =
output_grad->Slice(i, i + 1).Resize(output_shape);
// filter of size (m, c * k_h * k_w)
// batch with size (m, h, w)
Tensor input_grad_batch =
input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
// im2col: dy from (c, o_h, o_w) -> (c * k_h * k_w, h * w)
im2col(context.device_context(), output_grad_batch, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1], paddings[1]);
// gemm: dx = filter * dy
// (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, c, h)
math::matmul<Place, T>(context.device_context(), filter, false,
col_matrix, false, T(1.0), &input_grad_batch,
T(0.0));
}
}
// filter gradient required
if (filter_grad) {
Tensor col_matrix_f;
col_matrix_f.ShareDataWith(col);
DDim col_matrix_shape_f = {c * h * w, k_h * k_w};
col_matrix_f.Resize(col_matrix_shape_f);
filter_grad->mutable_data<T>(context.GetPlace());
Tensor filter_grad_ = *filter_grad;
filter_grad_.Resize(filter_matrix_shape);
auto t = framework::EigenVector<T>::Flatten(filter_grad_);
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
for (int i = 0; i < batch_size; ++i) {
// batch with size (c, o_h, o_w)
Tensor output_grad_batch =
output_grad->Slice(i, i + 1).Resize(output_shape);
// input batch
Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
// im2col: (c * h * w, k_h * k_w)
im2col(context.device_context(), output_grad_batch, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1], paddings[1]);
// gemm: d_filter = x * y_grad^T
// (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, c, h)
math::matmul<Place, T>(context.device_context(), in_batch, false,
col_matrix_f, true, T(1.0), &filter_grad_,
T(1.0));
if (filter_shape_vec.size() == 2) {
// im2col: dy -> col matrix
// from (c, o_h, o_w) to (c * k_h * k_w, h * w)
math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
im2col(context.device_context(), output_grad_batch, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1],
paddings[1]);
} else if (filter_shape_vec.size() == 3) {
// vol2col: dy -> col_matrix
// from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w)
math::Vol2ColFunctor<Place, T> vol2col;
vol2col(context.device_context(), output_grad_batch, col, strides[0],
strides[1], strides[2], paddings[0], paddings[1],
paddings[2]);
}
if (input_grad) {
// batch with size (m, h, w)
Tensor input_grad_batch =
input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
// gemm: dx = filter * dy
// (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, h * w)
// or
// (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
// d, h, w)
math::matmul<Place, T>(context.device_context(), filter, false,
col_matrix, false, static_cast<T>(1.0),
&input_grad_batch, static_cast<T>(0.0));
}
if (filter_grad) {
// input batch
Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
// gemm: d_filter = x * dy^T
// (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, k_h * k_w)
// or
// (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
// k_h * k_w)
math::matmul<Place, T>(context.device_context(), in_batch, false,
col_matrix, true, static_cast<T>(1.0),
&filter_grad_, static_cast<T>(1.0));
}
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -114,21 +114,17 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
"where N is the batch size and D is the number of classes. "
"This input is a probability computed by the previous operator, "
"which is almost always the result of a softmax operator.");
AddInput(
"Label",
"(Tensor, default Tensor<int>), the ground truth which is "
"a 2-D tensor. "
"When soft_label is set to false, Label is a Tensor<int> with shape "
"[N x 1]. "
"When soft_label is set to true, Label is a Tensor<float/double> "
"with shape [N x K].");
AddInput("Label",
"(Tensor), the ground truth which is a 2-D tensor. When "
"soft_label is set to false, Label is a Tensor<int64> with shape "
"[N x 1]. When soft_label is set to true, Label is a "
"Tensor<float/double> with shape [N x K].");
AddOutput("Y",
"(Tensor, default Tensor<float>), a 2-D tensor "
"with shape [N x 1]. The cross entropy loss.");
AddAttr<bool>(
"soft_label",
"(bool, default false), a flag to indicate whether to interpretate "
"the given labels as soft labels.")
"(Tensor, default Tensor<float>), a 2-D tensor with shape "
"[N x 1]. The cross entropy loss.");
AddAttr<bool>("soft_label",
"(bool, default false), a flag indicating whether to "
"interpretate the given labels as soft labels.")
.SetDefault(false);
AddComment(R"DOC(
CrossEntropy Operator.
......
......@@ -34,15 +34,18 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
std::vector<int64_t> shape_int64(shape.size(), 0);
std::transform(shape.begin(), shape.end(), shape_int64.begin(),
[](int a) { return static_cast<int64_t>(a); });
auto dims = framework::make_ddim(shape_int64);
auto output_dim = framework::make_ddim(shape_int64);
int dim_idx = ctx->Attrs().Get<int>("dim_idx");
PADDLE_ENFORCE_GE(dim_idx, 0);
PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), dim_idx);
PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), dim_idx);
int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
PADDLE_ENFORCE_GE(input_dim_idx, 0);
PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);
dims[dim_idx] = ctx->GetInputDim("Input")[dim_idx];
ctx->SetOutputDim("Out", dims);
int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
PADDLE_ENFORCE_GE(output_dim_idx, 0);
PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);
output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
ctx->SetOutputDim("Out", output_dim);
}
protected:
......@@ -69,8 +72,11 @@ class FillConstantBatchSizeLikeOpMaker
"(Tensor) Tensor of specified shape will be filled "
"with the specified value");
AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
AddAttr<int>("dim_idx",
"(int, default 0) The index of batch size dimension")
AddAttr<int>("input_dim_idx",
"(int, default 0) the index of input's batch size dimension")
.SetDefault(0);
AddAttr<int>("output_dim_idx",
"(int, default 0) the index of output's batch size dimension")
.SetDefault(0);
AddAttr<float>("value", "(float, default 0) The value to be filled")
.SetDefault(0.0f);
......@@ -86,9 +92,10 @@ Fill up a variable with specified constant value.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOp,
ops::FillConstantBatchSizeLikeOpMaker);
REGISTER_OPERATOR(fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOp,
paddle::framework::EmptyGradOpMaker,
ops::FillConstantBatchSizeLikeOpMaker);
REGISTER_OP_CPU_KERNEL(
fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, float>,
......
......@@ -35,7 +35,9 @@ class FillConstantOp : public framework::OperatorWithKernel {
protected:
framework::DataType IndicateDataType(
const framework::ExecutionContext &ctx) const override {
return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
int data_type = ctx.Attr<int>("data_type");
VLOG(10) << " FillConstant data_type = " << data_type;
return static_cast<framework::DataType>(data_type);
}
};
......@@ -71,4 +73,5 @@ REGISTER_OP_WITHOUT_GRADIENT(fill_constant, ops::FillConstantOp,
REGISTER_OP_CPU_KERNEL(
fill_constant, ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>,
ops::FillConstantOpKernel<paddle::platform::CPUPlace, double>,
ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>);
ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>,
ops::FillConstantOpKernel<paddle::platform::CPUPlace, int64_t>);
......@@ -20,4 +20,5 @@ namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
fill_constant, ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>,
ops::FillConstantOpKernel<paddle::platform::GPUPlace, double>,
ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>);
ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>,
ops::FillConstantOpKernel<paddle::platform::GPUPlace, int64_t>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/gru_op.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class GRUOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(%s) of GRUOp should not be null.", "Input");
PADDLE_ENFORCE(ctx->HasInput("Weight"),
"Input(%s) of GRUOp should not be null.", "Weight");
PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
"Output(%s) of GRUOp should not be null.", "BatchGate");
PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
"Output(%s) of GRUOp should not be null.",
"BatchResetHiddenPrev");
PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
"Output(%s) of GRUOp should not be null.", "BatchHidden");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(%s) of GRUOp should not be null.", "Hidden");
auto input_dims = ctx->GetInputDim("Input");
auto weight_dims = ctx->GetInputDim("Weight");
int input_size = input_dims[1];
int frame_size = weight_dims[0];
PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
"The input_size must be 3 times of frame_size in GRUOp.");
PADDLE_ENFORCE_EQ(
weight_dims[1], frame_size * 3,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
if (ctx->HasInput("H0")) {
auto h0_dims = ctx->GetInputDim("H0");
PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
"The width of H0 must be equal to frame_size.");
}
if (ctx->HasInput("Bias")) {
auto bias_dims = ctx->GetInputDim("Bias");
int bias_height = bias_dims[0];
int bias_width = bias_dims[1];
PADDLE_ENFORCE_EQ(bias_height, 1,
"The shape of Bias must be [1, frame_size * 3].");
PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
"The shape of Bias must be [1, frame_size * 3].");
}
ctx->SetOutputDim("BatchGate", input_dims);
ctx->SetOutputDim("BatchResetHiddenPrev", {input_dims[0], frame_size});
ctx->SetOutputDim("BatchHidden", {input_dims[0], frame_size});
ctx->SetOutputDim("Hidden", {input_dims[0], frame_size});
ctx->ShareLoD("Input", "Hidden");
}
};
class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
public:
GRUOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Input",
"(LoDTensor) The first input is a LodTensor, which supports "
"variable-time length input sequence. The underlying tensor in "
"this LoDTenosr is a matrix with shape (T X 3D), where, T is the "
"total time steps in this mini-batch, D is the hidden size.");
AddInput("H0",
"(Tensor, optional) The initial hidden state is an optional "
"input. This is a tensor with shape (N x D), where N is the "
"batch size, D is the hidden size.")
.AsDispensable();
AddInput(
"Weight",
"(Tensor) The learnable hidden-hidden weight matrix with shape "
"(D x 3D), where D is the hidden size. The elements continuous in "
"memory can be divided into two parts. The first part are weights of "
"the update gate and reset gate with shape (D x 2D), and the second "
"part are weights of output candidate with shape (D x D).");
AddInput("Bias",
"(Tensor, optional) Bias vector with shape (1 x 3D) concating "
"bias of the update gate, reset gate and output candidate.")
.AsDispensable();
AddOutput("BatchGate",
"(LoDTensor) To compute with batches, sequence data will be "
"reorganized into several successive batches each containing "
"data from the same time step. The LoDTensor BatchGate contains "
"the update gate, reset gate and output candidate values "
"organized in batches. The LoD size is 2. The first LoD contains "
"the batch offsets and the second LoD contains the indexes in "
"the raw sequence data.")
.AsIntermediate();
AddOutput(
"BatchResetHiddenPrev",
"(LoDTensor) The reseted hidden state LoDTensor organized in batches. "
"This LoDTensor is a matrix with shape (T X D) and has the same LoD "
"with `BatchGate`.")
.AsIntermediate();
AddOutput(
"BatchHidden",
"(LoDTensor) The hidden state LoDTensor organized in batches. "
"This LoDTensor is a matrix with shape (T X D) and has the same LoD "
"with `BatchGate`.")
.AsIntermediate();
AddOutput(
"Hidden",
"(LoDTensor) the hidden state LoDTensor organized in sequences. "
"This LoDTensor is a matrix with shape (T X D) and has the same LoD "
"with `BatchGate`.");
AddAttr<std::string>("activation",
"(string, default tanh) "
"The activation type used for output candidate {h}_t.")
.SetDefault("tanh");
AddAttr<std::string>(
"gate_activation",
"(string, default sigmoid) "
"The activation type used in update gate and reset gate.")
.SetDefault("sigmoid");
AddAttr<bool>("is_reverse",
"(bool, defalut: False) "
"whether to compute reversed GRU.")
.SetDefault(false);
AddComment(R"DOC(
GRU Operator implements part calculations of the complete GRU as following:
\f[
update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\
output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
\f]
@note To implement the complete GRU, fully-connected operator must be used
before to feed xu, xr and xc as the Input of GRU operator.
)DOC");
}
};
class GRUGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(%s) of GRUGradOp should not be null.", "Input");
PADDLE_ENFORCE(ctx->HasInput("Weight"),
"Input(%s) of GRUGradOp should not be null.", "Weight");
PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
"Input(%s) of GRUGradOp should not be null.", "BatchGate");
PADDLE_ENFORCE(ctx->HasInput("BatchResetHiddenPrev"),
"Input(%s) of GRUGradOp should not be null.",
"BatchResetHiddenPrev");
PADDLE_ENFORCE(ctx->HasInput("BatchHidden"),
"Input(%s) of GRUOp should not be null.", "BatchHidden");
PADDLE_ENFORCE(ctx->HasInput("Hidden"),
"Input(%s) of GRUGradOp should not be null.", "Hidden");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
"Input(%s@GRAD) of GRUGradOp should not be null.", "Hidden");
auto input_dims = ctx->GetInputDim("Input");
auto weight_dims = ctx->GetInputDim("Weight");
int input_size = input_dims[1];
int frame_size = weight_dims[0];
int weight_height = weight_dims[0];
int weight_width = weight_dims[1];
PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
"The input_size must be 3 times of frame_size in GRUOp.");
PADDLE_ENFORCE_EQ(
weight_height, frame_size,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
PADDLE_ENFORCE_EQ(
weight_width, frame_size * 3,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
if (ctx->HasInput("H0")) {
auto h0_dims = ctx->GetInputDim("H0");
PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
"The width of H0 must be equal to frame_size.");
auto h0_grad_name = framework::GradVarName("H0");
if (ctx->HasOutput(h0_grad_name))
ctx->SetOutputDim(h0_grad_name, h0_dims);
}
if (ctx->HasInput("Bias")) {
auto bias_dims = ctx->GetInputDim("Bias");
int bias_height = bias_dims[0];
int bias_width = bias_dims[1];
PADDLE_ENFORCE_EQ(bias_height, 1,
"The shape of Bias must be [1, frame_size * 3].");
PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
"The shape of Bias must be [1, frame_size * 3].");
auto bias_grad_name = framework::GradVarName("Bias");
if (ctx->HasOutput(bias_grad_name))
ctx->SetOutputDim(bias_grad_name, bias_dims);
}
auto input_grad_name = framework::GradVarName("Input");
if (ctx->HasOutput(input_grad_name))
ctx->SetOutputDim(input_grad_name, input_dims);
auto weight_grad_name = framework::GradVarName("Weight");
if (ctx->HasOutput(weight_grad_name))
ctx->SetOutputDim(weight_grad_name, weight_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp);
REGISTER_OP_CPU_KERNEL(gru, ops::GRUKernel<paddle::platform::CPUPlace, float>,
ops::GRUKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(gru_grad,
ops::GRUGradKernel<paddle::platform::CPUPlace, float>,
ops::GRUGradKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/gru_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(gru, ops::GRUKernel<paddle::platform::GPUPlace, float>,
ops::GRUKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(gru_grad,
ops::GRUGradKernel<paddle::platform::GPUPlace, float>,
ops::GRUGradKernel<paddle::platform::GPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/operators/math/gru_compute.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence2batch.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename Place, typename T>
class GRUKernel : public framework::OpKernel<T> {
public:
void BatchCompute(const framework::ExecutionContext& context) const {
auto* input = context.Input<LoDTensor>("Input");
auto* h0 = context.Input<Tensor>("H0");
const T* h0_data = h0 ? h0->data<T>() : nullptr;
auto* weight = context.Input<Tensor>("Weight");
const T* weight_data = weight->data<T>();
auto* bias = context.Input<Tensor>("Bias");
auto* batch_gate = context.Output<LoDTensor>("BatchGate");
batch_gate->mutable_data<T>(context.GetPlace());
auto* batch_reset_hidden_prev =
context.Output<LoDTensor>("BatchResetHiddenPrev");
batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
batch_hidden->mutable_data<T>(context.GetPlace());
auto* hidden = context.Output<LoDTensor>("Hidden");
hidden->mutable_data<T>(context.GetPlace());
context.ShareLoD("Input", "Hidden");
auto hidden_dims = hidden->dims();
bool is_reverse = context.Attr<bool>("is_reverse");
math::LoDTensor2BatchFunctor<Place, T> to_batch;
to_batch(context.device_context(), *input, *batch_gate, true, is_reverse);
int frame_size = hidden_dims[1];
int batch_size = hidden_dims[0];
auto g = EigenMatrix<T>::From(*batch_gate);
auto place = context.GetEigenDevice<Place>();
if (bias) {
auto b = EigenMatrix<T>::From(*bias);
g.device(place) = g +
b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
.broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
}
math::hl_gru_value<T> gru_value;
gru_value.gateWeight = const_cast<T*>(weight_data);
gru_value.stateWeight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size);
gru_value.prevOutValue = const_cast<T*>(h0_data);
auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1;
for (size_t n = 0; n < num_batch; n++) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
int cur_batch_size = bend - bstart;
Tensor gate_t = batch_gate->Slice(bstart, bend);
Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
Tensor hidden_t = batch_hidden->Slice(bstart, bend);
gru_value.outputValue = hidden_t.data<T>();
gru_value.gateValue = gate_t.data<T>();
gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
math::GRUUnitFunctor<Place, T>::compute(
context.device_context(), gru_value, frame_size, cur_batch_size,
math::ActiveType(context.Attr<std::string>("activation")),
math::ActiveType(context.Attr<std::string>("gate_activation")));
gru_value.prevOutValue = gru_value.outputValue;
}
math::Batch2LoDTensorFunctor<Place, T> to_seq;
batch_hidden->set_lod(batch_gate->lod());
to_seq(context.device_context(), *batch_hidden, *hidden);
}
void Compute(const framework::ExecutionContext& context) const override {
BatchCompute(context);
}
};
template <typename Place, typename T>
class GRUGradKernel : public framework::OpKernel<T> {
public:
void BatchCompute(const framework::ExecutionContext& context) const {
auto* h0 = context.Input<Tensor>("H0");
const T* h0_data = h0 ? h0->data<T>() : nullptr;
auto* weight = context.Input<Tensor>("Weight");
const T* weight_data = weight->data<T>();
auto* batch_gate = context.Input<LoDTensor>("BatchGate");
auto* batch_reset_hidden_prev =
context.Input<LoDTensor>("BatchResetHiddenPrev");
auto* batch_hidden = context.Input<LoDTensor>("BatchHidden");
auto* hidden = context.Input<LoDTensor>("Hidden");
auto* hidden_grad =
context.Input<LoDTensor>(framework::GradVarName("Hidden"));
auto* input_grad =
context.Output<LoDTensor>(framework::GradVarName("Input"));
auto* h0_grad = context.Output<Tensor>(framework::GradVarName("H0"));
auto* weight_grad =
context.Output<Tensor>(framework::GradVarName("Weight"));
auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
auto gate_dims = batch_gate->dims();
auto hidden_dims = hidden->dims();
int frame_size = hidden_dims[1];
math::LoDTensor2BatchFunctor<Place, T> to_batch;
LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad;
batch_hidden_grad.mutable_data<T>(hidden_dims, context.GetPlace());
batch_gate_grad.mutable_data<T>(gate_dims, context.GetPlace());
batch_reset_hidden_prev_grad.mutable_data<T>(hidden_dims,
context.GetPlace());
math::SetConstant<Place, T> zero;
zero(context.device_context(), &batch_hidden_grad, static_cast<T>(0.0));
zero(context.device_context(), &batch_gate_grad, static_cast<T>(0.0));
zero(context.device_context(), &batch_reset_hidden_prev_grad,
static_cast<T>(0.0));
bool is_reverse = context.Attr<bool>("is_reverse");
batch_hidden_grad.set_lod(batch_hidden->lod());
to_batch(context.device_context(), *hidden_grad, batch_hidden_grad, false,
is_reverse);
math::hl_gru_value<T> gru_value;
gru_value.gateWeight = const_cast<T*>(weight_data);
gru_value.stateWeight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size);
math::hl_gru_grad<T> gru_grad;
if (weight_grad) {
gru_grad.gateWeightGrad =
weight_grad->mutable_data<T>(context.GetPlace());
zero(context.device_context(), weight_grad, static_cast<T>(0.0));
gru_grad.stateWeightGrad =
weight_grad->data<T>() + 2 * frame_size * frame_size;
} else {
gru_grad.gateWeightGrad = nullptr;
gru_grad.stateWeightGrad = nullptr;
}
auto batch_starts = batch_hidden_grad.lod()[0];
size_t num_batch = batch_starts.size() - 1;
for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
int cur_batch_size = bend - bstart;
Tensor gate_t = batch_gate->Slice(bstart, bend);
gru_value.gateValue = gate_t.data<T>();
Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
gru_grad.outputGrad = hidden_grad_t.data<T>();
Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
gru_grad.gateGrad = gate_grad_t.data<T>();
Tensor reset_hidden_prev_grad_t =
batch_reset_hidden_prev_grad.Slice(bstart, bend);
gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>();
if (n == 0) {
gru_value.prevOutValue = const_cast<T*>(h0_data);
if (h0_grad) {
T* h0_grad_data = h0_grad->mutable_data<T>(context.GetPlace());
zero(context.device_context(), h0_grad, static_cast<T>(0.0));
gru_grad.prevOutGrad = h0_grad_data;
} else {
gru_grad.prevOutGrad = nullptr;
}
} else {
int bstart_pre = static_cast<int>(batch_starts[n - 1]);
Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
gru_value.prevOutValue = hidden_prev_t.data<T>();
Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
gru_grad.prevOutGrad = hidden_prev_grad_t.data<T>();
}
math::GRUUnitGradFunctor<Place, T>::compute(
context.device_context(), gru_value, gru_grad, frame_size,
cur_batch_size,
math::ActiveType(context.Attr<std::string>("activation")),
math::ActiveType(context.Attr<std::string>("gate_activation")));
}
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
math::Batch2LoDTensorFunctor<Place, T> to_seq;
batch_gate_grad.set_lod(batch_gate->lod());
to_seq(context.device_context(), batch_gate_grad, *input_grad);
}
if (bias_grad) {
bias_grad->mutable_data<T>(context.GetPlace());
auto d_b = EigenMatrix<T>::From(*bias_grad);
auto d_g = EigenMatrix<T>::From(batch_gate_grad);
auto place = context.GetEigenDevice<Place>();
d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
}
}
void Compute(const framework::ExecutionContext& context) const override {
BatchCompute(context);
}
};
} // namespace operators
} // namespace paddle
......@@ -31,7 +31,6 @@ class IncrementOp : public framework::OperatorWithKernel {
}
};
template <typename AttrType>
class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
public:
IncrementOpMaker(framework::OpProto *proto,
......@@ -39,10 +38,10 @@ class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor) The input tensor of increment operator");
AddOutput("Out", "(Tensor) The output tensor of increment operator.");
AddAttr<AttrType>("step",
"(float, default 1.0) "
"The step size by which the "
"input tensor will be incremented.")
AddAttr<float>("step",
"(float, default 1.0) "
"The step size by which the "
"input tensor will be incremented.")
.SetDefault(1.0);
AddComment(R"DOC(
Increment Operator.
......@@ -73,7 +72,10 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators;
REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker<float>,
REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker,
ops::IncrementGradOpMaker);
REGISTER_OP_CPU_KERNEL(increment,
ops::IncrementKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
increment, ops::IncrementKernel<paddle::platform::CPUPlace, float>,
ops::IncrementKernel<paddle::platform::CPUPlace, double>,
ops::IncrementKernel<paddle::platform::CPUPlace, int>,
ops::IncrementKernel<paddle::platform::CPUPlace, int64_t>);
......@@ -16,4 +16,7 @@
REGISTER_OP_GPU_KERNEL(
increment,
paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>);
paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>,
paddle::operators::IncrementKernel<paddle::platform::GPUPlace, double>,
paddle::operators::IncrementKernel<paddle::platform::GPUPlace, int>,
paddle::operators::IncrementKernel<paddle::platform::GPUPlace, int64_t>);
......@@ -19,7 +19,7 @@
namespace paddle {
namespace operators {
template <typename Place, typename T, typename AttrType = T>
template <typename Place, typename T>
class IncrementKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& context) const {
......@@ -27,7 +27,7 @@ class IncrementKernel : public framework::OpKernel<T> {
auto* in = context.Input<framework::Tensor>("X");
tensor->mutable_data<T>(in->place());
auto step = static_cast<T>(context.Attr<AttrType>("step"));
auto step = static_cast<T>(context.Attr<float>("step"));
auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);
......
......@@ -8,20 +8,24 @@ if(WITH_GPU)
nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions)
else()
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
cc_library(softmax SRCS softmax.cc DEPS operator)
cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
cc_library(pooling SRCS pooling.cc DEPS device_context)
cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
cc_library(vol2col SRCS vol2col.cc DEPS device_context)
cc_library(context_project SRCS context_project.cc DEPS device_context)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
endif()
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/operators/math/detail/activation_functions.h"
#include "paddle/operators/math/gru_compute.h"
namespace paddle {
namespace operators {
namespace math {
namespace detail {
#ifndef __NVCC__
template <class OpResetOutput, typename T>
void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
T *gateValue, T *resetOutputValue,
T *prevOutputValue, int frameSize,
activation_mode_t active_gate) {
T rValueUpdateGate;
T rValueResetGate;
T rValueResetOutput;
T rPrevOut = 0;
T *updateGate = gateValue;
T *resetGate = gateValue + frameSize;
for (int i = 0; i < frameSize; i++) {
rValueUpdateGate = updateGate[i];
rValueResetGate = resetGate[i];
if (prevOutputValue) {
rPrevOut = prevOutputValue[i];
}
opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
rValueResetOutput, active_gate);
updateGate[i] = rValueUpdateGate;
resetGate[i] = rValueResetGate;
resetOutputValue[i] = rValueResetOutput;
}
}
template <class OpFinalOutput, typename T>
void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput,
T *gateValue, T *prevOutputValue,
T *outputValue, int frameSize,
activation_mode_t active_node) {
T rValueUpdateGate;
T rValueFrameState;
T rPrevOut = 0;
T rOutput;
T *updateGate = gateValue;
T *frameState = gateValue + frameSize * 2;
for (int i = 0; i < frameSize; i++) {
rValueUpdateGate = updateGate[i];
rValueFrameState = frameState[i];
if (prevOutputValue) {
rPrevOut = prevOutputValue[i];
}
opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
active_node);
frameState[i] = rValueFrameState;
outputValue[i] = rOutput;
}
}
template <class OpResetOutput, typename T>
void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput, T *gateValue,
T *resetOutputValue, T *prevOutputValue,
int frameSize,
activation_mode_t active_gate) {
#ifdef __AVX__
__m256 rValueUpdateGate;
__m256 rValueResetGate;
__m256 rValueResetOutput;
__m256 rPrevOut = _mm256_set1_ps(0.0f);
__m256 *updateGate = (__m256 *)gateValue;
__m256 *resetGate = (__m256 *)(gateValue + frameSize);
for (int i = 0; i < frameSize / 8; i++) {
rValueUpdateGate = updateGate[i];
rValueResetGate = resetGate[i];
if (prevOutputValue) {
rPrevOut = ((__m256 *)prevOutputValue)[i];
}
opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
rValueResetOutput, active_gate);
updateGate[i] = rValueUpdateGate;
resetGate[i] = rValueResetGate;
((__m256 *)resetOutputValue)[i] = rValueResetOutput;
}
#endif
}
template <class OpFinalOutput, typename T>
void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput, T *gateValue,
T *prevOutputValue, T *outputValue,
int frameSize,
activation_mode_t active_node) {
#ifdef __AVX__
__m256 rValueUpdateGate;
__m256 rValueFrameState;
__m256 rPrevOut = _mm256_set1_ps(0.0f);
__m256 rOutput;
__m256 *updateGate = (__m256 *)gateValue;
__m256 *frameState = (__m256 *)(gateValue + frameSize * 2);
for (int i = 0; i < frameSize / 8; i++) {
rValueUpdateGate = updateGate[i];
rValueFrameState = frameState[i];
if (prevOutputValue) {
rPrevOut = ((__m256 *)prevOutputValue)[i];
}
opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
active_node);
frameState[i] = rValueFrameState;
((__m256 *)outputValue)[i] = rOutput;
}
#endif
}
template <class OpResetOutput, typename T>
inline void forward_reset_output(OpResetOutput opResetOutput,
hl_gru_value<T> value, int frameSize,
int batchSize, activation_mode_t active_gate) {
for (int b = 0; b < batchSize; b++) {
if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_forward_reset_output(
opResetOutput, value.gateValue, value.resetOutputValue,
value.prevOutValue, frameSize, active_gate);
} else {
hl_naive_gru_forward_reset_output(
opResetOutput, value.gateValue, value.resetOutputValue,
value.prevOutValue, frameSize, active_gate);
}
value.gateValue += frameSize * 3;
value.resetOutputValue += frameSize;
if (value.prevOutValue) {
value.prevOutValue += frameSize;
}
}
}
template <class OpFinalOutput, typename T>
inline void forward_final_output(OpFinalOutput opFinalOutput,
hl_gru_value<T> value, int frameSize,
int batchSize, activation_mode_t active_node) {
for (int b = 0; b < batchSize; b++) {
if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_forward_final_output(opFinalOutput, value.gateValue,
value.prevOutValue, value.outputValue,
frameSize, active_node);
} else {
hl_naive_gru_forward_final_output(opFinalOutput, value.gateValue,
value.prevOutValue, value.outputValue,
frameSize, active_node);
}
value.gateValue += frameSize * 3;
value.outputValue += frameSize;
if (value.prevOutValue) {
value.prevOutValue += frameSize;
}
}
}
template <class OpStateGrad, typename T>
void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
T *gateGrad, T *prevOutValue,
T *prevOutGrad, T *outputGrad,
int frameSize,
activation_mode_t active_node) {
T rUpdateGateValue;
T rUpdateGateGrad;
T rFrameStateValue;
T rFrameStateGrad;
T rOutGrad;
T rPrevOutValue = 0;
T rPrevOutGrad = 0;
T *updateGateValue = gateValue;
T *updateGateGrad = gateGrad;
T *frameStateValue = gateValue + frameSize * 2;
T *frameStateGrad = gateGrad + frameSize * 2;
for (int i = 0; i < frameSize; i++) {
rUpdateGateValue = updateGateValue[i];
rFrameStateValue = frameStateValue[i];
rOutGrad = outputGrad[i];
if (prevOutValue) {
rPrevOutValue = prevOutValue[i];
}
if (prevOutGrad) {
rPrevOutGrad = prevOutGrad[i];
}
opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
active_node);
updateGateGrad[i] = rUpdateGateGrad;
frameStateGrad[i] = rFrameStateGrad;
if (prevOutGrad) {
prevOutGrad[i] = rPrevOutGrad;
}
}
}
template <class OpResetGrad, typename T>
void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
T *gateGrad, T *prevOutValue,
T *prevOutGrad, T *resetOutputGrad,
int frameSize,
activation_mode_t active_gate) {
T rUpdateGateValue;
T rUpdateGateGrad;
T rResetGateValue;
T rResetGateGrad;
T rResetOutputGrad = 0;
T rPrevOutValue = 0;
T rPrevOutGrad = 0;
T *updateGateValue = gateValue;
T *updateGateGrad = gateGrad;
T *resetGateValue = gateValue + frameSize;
T *resetGateGrad = gateGrad + frameSize;
for (int i = 0; i < frameSize; i++) {
rUpdateGateValue = updateGateValue[i];
rUpdateGateGrad = updateGateGrad[i];
rResetGateValue = resetGateValue[i];
if (prevOutValue && prevOutGrad) {
rResetOutputGrad = resetOutputGrad[i];
}
if (prevOutValue) {
rPrevOutValue = prevOutValue[i];
}
if (prevOutGrad) {
rPrevOutGrad = prevOutGrad[i];
}
opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
active_gate);
updateGateGrad[i] = rUpdateGateGrad;
resetGateGrad[i] = rResetGateGrad;
if (prevOutGrad) {
prevOutGrad[i] = rPrevOutGrad;
}
}
}
template <class OpStateGrad, typename T>
void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
T *gateGrad, T *prevOutValue,
T *prevOutGrad, T *outputGrad,
int frameSize,
activation_mode_t active_node) {
#ifdef __AVX__
__m256 rUpdateGateValue;
__m256 rUpdateGateGrad;
__m256 rFrameStateValue;
__m256 rFrameStateGrad;
__m256 rOutGrad;
__m256 rPrevOutValue = _mm256_set1_ps(0.0f);
__m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
__m256 *updateGateValue = (__m256 *)gateValue;
__m256 *updateGateGrad = (__m256 *)gateGrad;
__m256 *frameStateValue = (__m256 *)(gateValue + frameSize * 2);
__m256 *frameStateGrad = (__m256 *)(gateGrad + frameSize * 2);
for (int i = 0; i < frameSize / 8; i++) {
rUpdateGateValue = updateGateValue[i];
rFrameStateValue = frameStateValue[i];
rOutGrad = ((__m256 *)outputGrad)[i];
if (prevOutValue) {
rPrevOutValue = ((__m256 *)prevOutValue)[i];
}
if (prevOutGrad) {
rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
}
opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
active_node);
updateGateGrad[i] = rUpdateGateGrad;
frameStateGrad[i] = rFrameStateGrad;
if (prevOutGrad) {
((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
}
}
#endif
}
template <class OpResetGrad, typename T>
void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
T *gateGrad, T *prevOutValue,
T *prevOutGrad, T *resetOutputGrad,
int frameSize,
activation_mode_t active_gate) {
#ifdef __AVX__
__m256 rUpdateGateValue;
__m256 rUpdateGateGrad;
__m256 rResetGateValue;
__m256 rResetGateGrad;
__m256 rResetOutputGrad = _mm256_set1_ps(0.0f);
__m256 rPrevOutValue = _mm256_set1_ps(0.0f);
__m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
__m256 *updateGateValue = (__m256 *)gateValue;
__m256 *updateGateGrad = (__m256 *)gateGrad;
__m256 *resetGateValue = (__m256 *)(gateValue + frameSize);
__m256 *resetGateGrad = (__m256 *)(gateGrad + frameSize);
for (int i = 0; i < frameSize / 8; i++) {
rUpdateGateValue = updateGateValue[i];
rUpdateGateGrad = updateGateGrad[i];
rResetGateValue = resetGateValue[i];
if (prevOutValue && prevOutGrad) {
rResetOutputGrad = ((__m256 *)resetOutputGrad)[i];
}
if (prevOutValue) {
rPrevOutValue = ((__m256 *)prevOutValue)[i];
}
if (prevOutGrad) {
rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
}
opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
active_gate);
updateGateGrad[i] = rUpdateGateGrad;
resetGateGrad[i] = rResetGateGrad;
if (prevOutGrad) {
((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
}
}
#endif
}
template <class OpStateGrad, typename T>
inline void backward_state_grad(OpStateGrad opStateGrad, hl_gru_value<T> value,
hl_gru_grad<T> grad, int frameSize,
int batchSize, activation_mode_t active_node) {
for (int b = 0; b < batchSize; b++) {
if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_backward_state_grad(
opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
} else {
hl_naive_gru_backward_state_grad(
opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
}
value.gateValue += frameSize * 3;
if (value.prevOutValue) {
value.prevOutValue += frameSize;
}
grad.gateGrad += frameSize * 3;
grad.outputGrad += frameSize;
if (grad.prevOutGrad) {
grad.prevOutGrad += frameSize;
}
}
}
template <class OpResetGrad, typename T>
inline void backward_reset_grad(OpResetGrad opResetGrad, hl_gru_value<T> value,
hl_gru_grad<T> grad, int frameSize,
int batchSize, activation_mode_t active_gate) {
for (int b = 0; b < batchSize; b++) {
if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_backward_reset_grad(
opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
} else {
hl_naive_gru_backward_reset_grad(
opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
}
value.gateValue += frameSize * 3;
if (value.prevOutValue) {
value.prevOutValue += frameSize;
}
grad.gateGrad += frameSize * 3;
grad.resetOutputGrad += frameSize;
if (grad.prevOutGrad) {
grad.prevOutGrad += frameSize;
}
}
}
#endif
} // namespace detail
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/operators/math/detail/activation_functions.h"
#include "paddle/operators/math/gru_compute.h"
#include "paddle/platform/cuda_helper.h"
#include "paddle/platform/device_context.h"
#include <glog/logging.h>
namespace paddle {
namespace operators {
namespace math {
namespace detail {
/*
* threads(framePerBlock, batchPerBlock)
* grid(frameBlocks, batchBlocks)
*/
template <class OpResetOutput, bool isBatch, typename T>
__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput,
T *gateValue, T *resetOutputValue,
T *prevOutputValue, int frameSize,
int batchSize,
activation_mode_t active_gate) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return;
int batchIdx = 0;
if (isBatch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return;
gateValue += batchIdx * 3 * frameSize;
resetOutputValue += batchIdx * frameSize;
}
T rPrevOut = 0;
T rValueResetOutput;
T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
T rValueResetGate = gateValue[frameIdx + frameSize * 1];
if (prevOutputValue) {
if (isBatch) prevOutputValue += batchIdx * frameSize;
rPrevOut = prevOutputValue[frameIdx];
}
opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput,
active_gate);
gateValue[frameIdx + frameSize * 0] = rValueUpdateGate;
gateValue[frameIdx + frameSize * 1] = rValueResetGate;
resetOutputValue[frameIdx] = rValueResetOutput;
}
/*
* threads(framePerBlock, batchPerBlock)
* grid(frameBlocks, batchBlocks)
*/
template <class OpFinalOutput, bool isBatch, typename T>
__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput,
T *gateValue, T *prevOutputValue,
T *outputValue, int frameSize,
int batchSize,
activation_mode_t active_node) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return;
int batchIdx = 0;
if (isBatch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return;
gateValue += batchIdx * 3 * frameSize;
outputValue += batchIdx * frameSize;
}
T rOutput;
T rPrevOut = 0;
T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
T rValueFrameState = gateValue[frameIdx + frameSize * 2];
if (prevOutputValue) {
if (isBatch) prevOutputValue += batchIdx * frameSize;
rPrevOut = prevOutputValue[frameIdx];
}
opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
active_node);
gateValue[frameIdx + frameSize * 2] = rValueFrameState;
outputValue[frameIdx] = rOutput;
}
/*
* threads(framePerBlock, batchPerBlock)
* grid(frameBlocks, batchBlocks)
*/
template <class OpStateGrad, bool isBatch, typename T>
__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue,
T *gateGrad, T *prevOutValue,
T *prevOutGrad, T *outputGrad,
int frameSize, int batchSize,
activation_mode_t active_node) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return;
int batchIdx = 0;
if (isBatch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return;
gateValue += batchIdx * 3 * frameSize;
gateGrad += batchIdx * 3 * frameSize;
outputGrad += batchIdx * frameSize;
}
T rUpdateGateGrad;
T rFrameStateGrad;
T rPrevOutValue = 0;
T rPrevOutGrad = 0;
T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
T rFrameStateValue = gateValue[frameIdx + frameSize * 2];
T rOutGrad = outputGrad[frameIdx];
if (prevOutValue && prevOutGrad) {
if (isBatch) prevOutValue += batchIdx * frameSize;
rPrevOutValue = prevOutValue[frameIdx];
if (isBatch) prevOutGrad += batchIdx * frameSize;
rPrevOutGrad = prevOutGrad[frameIdx];
}
opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
active_node);
gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad;
if (prevOutGrad) {
prevOutGrad[frameIdx] = rPrevOutGrad;
}
}
/*
* threads(framePerBlock, batchPerBlock)
* grid(frameBlocks, batchBlocks)
*/
template <class OpResetGrad, bool isBatch, typename T>
__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue,
T *gateGrad, T *prevOutValue,
T *prevOutGrad, T *resetOutputGrad,
int frameSize, int batchSize,
activation_mode_t active_gate) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return;
int batchIdx = 0;
if (isBatch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return;
gateValue += batchIdx * 3 * frameSize;
gateGrad += batchIdx * 3 * frameSize;
resetOutputGrad += batchIdx * frameSize;
}
T rResetGateGrad;
T rPrevOutValue = 0;
T rPrevOutGrad = 0;
T rResetOutputGrad = 0;
T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0];
T rResetGateValue = gateValue[frameIdx + frameSize * 1];
if (prevOutValue && prevOutGrad) {
if (isBatch) prevOutValue += batchIdx * frameSize;
if (isBatch) prevOutGrad += batchIdx * frameSize;
rPrevOutValue = prevOutValue[frameIdx];
rPrevOutGrad = prevOutGrad[frameIdx];
rResetOutputGrad = resetOutputGrad[frameIdx];
}
opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
active_gate);
gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
gateGrad[frameIdx + frameSize * 1] = rResetGateGrad;
if (prevOutGrad) {
prevOutGrad[frameIdx] = rPrevOutGrad;
}
}
} // namespace detail
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/detail/activation_functions.h"
#include "paddle/platform/hostdevice.h"
#include <type_traits>
// TODO(guosheng): refine code style in gru_kernel
namespace paddle {
namespace operators {
namespace math {
namespace detail {
namespace forward {
template <typename T>
class gru_resetOutput {
public:
HOSTDEVICE void operator()(T &valueUpdateGate, T &valueResetGate, T &prevOut,
T &valueResetOutput, activation_mode_t actGate) {
valueUpdateGate = activation(valueUpdateGate, actGate);
valueResetGate = activation(valueResetGate, actGate);
valueResetOutput = prevOut * valueResetGate;
}
#ifndef __NVCC__
#ifndef __AVX__
static const bool avx = false;
#else
static const bool avx = true;
HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueResetGate,
__m256 &prevOut, __m256 &valueResetOutput,
activation_mode_t actGate) {
valueUpdateGate = activation(valueUpdateGate, actGate);
valueResetGate = activation(valueResetGate, actGate);
valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate);
}
#endif
#endif
};
template <typename T>
class gru_finalOutput {
public:
HOSTDEVICE void operator()(T &valueUpdateGate, T &valueFrameState, T &prevOut,
T &valueOutput, activation_mode_t actInput) {
valueFrameState = activation(valueFrameState, actInput);
valueOutput = prevOut - (valueUpdateGate * prevOut) +
(valueUpdateGate * valueFrameState);
}
#ifndef __NVCC__
#ifndef __AVX__
static const bool avx = false;
#else
static const bool avx = true;
HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueFrameState,
__m256 &prevOut, __m256 &valueOutput,
activation_mode_t actInput) {
valueFrameState = activation(valueFrameState, actInput);
valueOutput = _mm256_add_ps(
_mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)),
_mm256_mul_ps(valueUpdateGate, valueFrameState));
}
#endif
#endif
};
} // namespace forward
namespace backward {
template <typename T>
class gru_stateGrad {
public:
HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
T &valueFrameState, T &gradFrameState,
T &valuePrevOut, T &gradPrevOut, T &gradOutput,
activation_mode_t actInput) {
gradUpdateGate = (gradOutput * valueFrameState);
gradUpdateGate -= (gradOutput * valuePrevOut);
gradPrevOut -= (gradOutput * valueUpdateGate);
gradPrevOut += gradOutput;
gradFrameState =
activation(gradOutput * valueUpdateGate, valueFrameState, actInput);
}
#ifndef __NVCC__
#ifndef __AVX__
static const bool avx = false;
#else
static const bool avx = true;
HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
__m256 &valueFrameState, __m256 &gradFrameState,
__m256 &valuePrevOut, __m256 &gradPrevOut,
__m256 &gradOutput, activation_mode_t actInput) {
gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState);
gradUpdateGate =
_mm256_sub_ps(gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut));
gradPrevOut = _mm256_add_ps(
_mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)),
gradOutput);
gradFrameState = activation(_mm256_mul_ps(gradOutput, valueUpdateGate),
valueFrameState, actInput);
}
#endif
#endif
};
template <typename T>
class gru_resetGrad {
public:
HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
T &valueResetGate, T &gradResetGate,
T &valuePrevOut, T &gradPrevOut,
T &gradResetOutput, activation_mode_t actGate) {
gradResetGate = (gradResetOutput * valuePrevOut);
gradPrevOut += (gradResetOutput * valueResetGate);
gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
gradResetGate = activation(gradResetGate, valueResetGate, actGate);
}
#ifndef __NVCC__
#ifndef __AVX__
static const bool avx = false;
#else
static const bool avx = true;
HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
__m256 &valueResetGate, __m256 &gradResetGate,
__m256 &valuePrevOut, __m256 &gradPrevOut,
__m256 &gradResetOutput,
activation_mode_t actGate) {
gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut);
gradPrevOut = _mm256_add_ps(gradPrevOut,
_mm256_mul_ps(gradResetOutput, valueResetGate));
gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
gradResetGate = activation(gradResetGate, valueResetGate, actGate);
}
#endif
#endif
};
} // namespace backward
} // namespace detail
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/gru_compute.h"
#include "paddle/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/operators/math/detail/gru_kernel.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
struct GRUUnitFunctor<platform::CPUPlace, T> {
static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, int frameSize, int batchSize,
activation_mode_t active_node,
activation_mode_t active_gate) {
#ifndef __NVCC__
if (value.prevOutValue) {
math::gemm<platform::CPUPlace, T>(
context, false, false, batchSize, frameSize * 2, frameSize, 1,
value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
value.gateValue, frameSize * 3);
}
detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
frameSize, batchSize, active_gate);
if (value.prevOutValue) {
math::gemm<platform::CPUPlace, T>(
context, false, false, batchSize, frameSize, frameSize, 1,
value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
value.gateValue + frameSize * 2, frameSize * 3);
}
detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
frameSize, batchSize, active_node);
#endif
}
};
template <typename T>
struct GRUUnitGradFunctor<platform::CPUPlace, T> {
static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
int batchSize, activation_mode_t active_node,
activation_mode_t active_gate) {
#ifndef __NVCC__
detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
grad, frameSize, batchSize, active_node);
if (value.prevOutValue && grad.prevOutGrad) {
math::gemm<platform::CPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize, 1,
grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
frameSize, 0, grad.resetOutputGrad, frameSize);
if (grad.stateWeightGrad) {
math::gemm<platform::CPUPlace, T>(
context, true, false, frameSize, frameSize, batchSize, 1,
value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
frameSize * 3, 1, grad.stateWeightGrad, frameSize);
}
}
detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
grad, frameSize, batchSize, active_gate);
if (grad.prevOutGrad && value.prevOutValue) {
math::gemm<platform::CPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize * 2, 1,
grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
grad.prevOutGrad, frameSize);
if (grad.gateWeightGrad) {
math::gemm<platform::CPUPlace, T>(
context, true, false, frameSize, frameSize * 2, batchSize, 1,
value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
grad.gateWeightGrad, frameSize * 2);
}
}
#endif
}
};
template struct GRUUnitFunctor<platform::CPUPlace, float>;
template struct GRUUnitFunctor<platform::CPUPlace, double>;
template struct GRUUnitGradFunctor<platform::CPUPlace, float>;
template struct GRUUnitGradFunctor<platform::CPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/detail/gru_gpu_kernel.h"
#include "paddle/operators/math/detail/gru_kernel.h"
#include "paddle/operators/math/gru_compute.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
struct GRUUnitFunctor<platform::GPUPlace, T> {
static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, int frameSize, int batchSize,
activation_mode_t active_node,
activation_mode_t active_gate) {
auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
dim3 threads;
dim3 grid;
if (batchSize == 1) {
int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
int frameBlocks = (frameSize + 1024 - 1) / 1024;
threads = dim3(framePerBlock, 1);
grid = dim3(frameBlocks, 1);
} else {
threads = dim3(32, 32);
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
}
if (value.prevOutValue) {
math::gemm<platform::GPUPlace, T>(
context, false, false, batchSize, frameSize * 2, frameSize, 1,
value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
value.gateValue, frameSize * 3);
}
if (batchSize == 1) {
detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
/* isBatch= */ false,
T><<<grid, threads, 0, stream>>>(
detail::forward::gru_resetOutput<T>(), value.gateValue,
value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
active_gate);
} else {
detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
/* isBatch= */ true,
T><<<grid, threads, 0, stream>>>(
detail::forward::gru_resetOutput<T>(), value.gateValue,
value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
active_gate);
}
if (value.prevOutValue) {
math::gemm<platform::GPUPlace, T>(
context, false, false, batchSize, frameSize, frameSize, 1,
value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
value.gateValue + frameSize * 2, frameSize * 3);
}
if (batchSize == 1) {
detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
/* isBatch= */ false,
T><<<grid, threads, 0, stream>>>(
detail::forward::gru_finalOutput<T>(), value.gateValue,
value.prevOutValue, value.outputValue, frameSize, batchSize,
active_node);
} else {
detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
/* isBatch= */ true,
T><<<grid, threads, 0, stream>>>(
detail::forward::gru_finalOutput<T>(), value.gateValue,
value.prevOutValue, value.outputValue, frameSize, batchSize,
active_node);
}
}
};
template <typename T>
struct GRUUnitGradFunctor<platform::GPUPlace, T> {
static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
int batchSize, activation_mode_t active_node,
activation_mode_t active_gate) {
auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
dim3 threads;
dim3 grid;
if (batchSize == 1) {
int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
int frameBlocks = (frameSize + 1024 - 1) / 1024;
threads = dim3(framePerBlock, 1);
grid = dim3(frameBlocks, 1);
} else {
threads = dim3(32, 32);
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
}
if (batchSize == 1) {
detail::KeGruBackwardStateGrad<
detail::backward::gru_stateGrad<T>,
/* isBatch= */ false><<<grid, threads, 0, stream>>>(
detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
batchSize, active_node);
} else {
detail::KeGruBackwardStateGrad<
detail::backward::gru_stateGrad<T>,
/* isBatch= */ true><<<grid, threads, 0, stream>>>(
detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
batchSize, active_node);
}
if (value.prevOutValue && grad.prevOutGrad) {
math::gemm<platform::GPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize, 1,
grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
frameSize, 0, grad.resetOutputGrad, frameSize);
if (grad.stateWeightGrad) {
math::gemm<platform::GPUPlace, T>(
context, true, false, frameSize, frameSize, batchSize, 1,
value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
frameSize * 3, 1, grad.stateWeightGrad, frameSize);
}
}
if (batchSize == 1) {
detail::KeGruBackwardResetGrad<
detail::backward::gru_resetGrad<T>,
/* isBatch= */ false><<<grid, threads, 0, stream>>>(
detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
batchSize, active_gate);
} else {
detail::KeGruBackwardResetGrad<
detail::backward::gru_resetGrad<T>,
/* isBatch= */ true><<<grid, threads, 0, stream>>>(
detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
batchSize, active_gate);
}
if (grad.prevOutGrad && value.prevOutValue) {
math::gemm<platform::GPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize * 2, 1,
grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
grad.prevOutGrad, frameSize);
if (grad.gateWeightGrad) {
math::gemm<platform::GPUPlace, T>(
context, true, false, frameSize, frameSize * 2, batchSize, 1,
value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
grad.gateWeightGrad, frameSize * 2);
}
}
}
};
template struct GRUUnitFunctor<platform::GPUPlace, float>;
template struct GRUUnitFunctor<platform::GPUPlace, double>;
template struct GRUUnitGradFunctor<platform::GPUPlace, float>;
template struct GRUUnitGradFunctor<platform::GPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/operators/math/lstm_compute.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"
namespace paddle {
namespace operators {
namespace math {
// TODO(guosheng): refine code style in gru_compute
template <typename T>
struct hl_gru_value {
T *gateWeight;
T *stateWeight;
T *gateValue;
T *resetOutputValue;
T *outputValue;
T *prevOutValue;
};
template <typename T>
struct hl_gru_grad {
T *gateWeightGrad;
T *stateWeightGrad;
T *gateGrad;
T *resetOutputGrad;
T *outputGrad;
T *prevOutGrad;
};
template <typename Place, typename T>
struct GRUUnitFunctor {
static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, int frameSize, int batchSize,
activation_mode_t active_node,
activation_mode_t active_gate);
};
template <typename Place, typename T>
struct GRUUnitGradFunctor {
static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
int batchSize, activation_mode_t active_node,
activation_mode_t active_gate);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/sequence_pooling.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class MaxSeqPoolFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::LoDTensor& input, framework::Tensor* output,
framework::Tensor* index) {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto idx_dims = index->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1);
PADDLE_ENFORCE_GT(out_dims.size(), 1);
for (int64_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
}
PADDLE_ENFORCE_EQ(idx_dims, out_dims);
auto starts = input.lod()[0];
const T* in_data = input.data<T>();
T* out_data = output->data<T>();
int* max_index = index->data<int>();
int64_t num_seq = out_dims[0];
int64_t dim = output->numel() / num_seq;
for (int64_t i = 0; i < num_seq; ++i) {
for (int64_t k = 0; k < dim; ++k) {
out_data[i * dim + k] = in_data[starts[i] * dim + k];
max_index[i * dim + k] = starts[i];
}
for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
for (int64_t k = 0; k < dim; ++k) {
if (in_data[j * dim + k] > out_data[i * dim + k]) {
out_data[i * dim + k] = in_data[j * dim + k];
max_index[i * dim + k] = j;
}
}
}
}
}
};
template <typename T>
class MaxSeqPoolGradFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& out_grad,
const framework::Tensor& index,
framework::LoDTensor* in_grad) {
auto og_dims = out_grad.dims();
auto ig_dims = in_grad->dims();
auto idx_dims = index.dims();
PADDLE_ENFORCE_GT(og_dims.size(), 1);
PADDLE_ENFORCE_GT(ig_dims.size(), 1);
for (int64_t i = 1; i < og_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
}
PADDLE_ENFORCE_EQ(idx_dims, og_dims);
const T* og_data = out_grad.data<T>();
const int* max_index = index.data<int>();
T* ig_data = in_grad->data<T>();
SetConstant<platform::CPUPlace, T> set_zero;
set_zero(context, in_grad, static_cast<T>(0.0));
int64_t num_seq = og_dims[0];
int64_t dim = out_grad.numel() / num_seq;
for (int64_t i = 0; i < num_seq; ++i) {
for (int64_t j = 0; j < dim; ++j) {
int step_id = max_index[i * dim + j];
ig_data[step_id * dim + j] = og_data[i * dim + j];
}
}
}
};
template class MaxSeqPoolFunctor<platform::CPUPlace, float>;
template class MaxSeqPoolFunctor<platform::CPUPlace, double>;
template class MaxSeqPoolGradFunctor<platform::CPUPlace, float>;
template class MaxSeqPoolGradFunctor<platform::CPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence_pooling.h"
namespace paddle {
namespace operators {
namespace math {
#define FLT_MAX __FLT_MAX__
template <typename T>
__global__ void KeMaxSequencePool(const T* input, const size_t* starts,
T* output, int* index, int64_t num_seq,
int64_t dim) {
int dim_idx = threadIdx.x;
int seq_id = blockIdx.x;
if (seq_id >= num_seq) return;
size_t start = starts[seq_id];
size_t end = starts[seq_id + 1];
for (int64_t i = dim_idx; i < dim; i += blockDim.x) {
T max_val = static_cast<T>(-FLT_MAX);
int max_id = -1;
for (size_t step_id = start; step_id < end; step_id++) {
if (max_val < input[step_id * dim + i]) {
max_val = input[step_id * dim + i];
max_id = step_id;
}
}
output[seq_id * dim + i] = max_val;
index[seq_id * dim + i] = max_id;
}
}
template <typename T>
class MaxSeqPoolFunctor<platform::GPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::LoDTensor& input, framework::Tensor* output,
framework::Tensor* index) {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto idx_dims = index->dims();
PADDLE_ENFORCE_GT(in_dims.size(), static_cast<int64_t>(1));
PADDLE_ENFORCE_GT(out_dims.size(), 1);
for (int64_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
}
PADDLE_ENFORCE_EQ(idx_dims, out_dims);
auto starts = input.lod()[0];
const T* in_data = input.data<T>();
T* out_data = output->data<T>();
int* max_index = index->data<int>();
int64_t num_seq = out_dims[0];
int64_t dim = output->numel() / num_seq;
dim3 threads(256, 1);
dim3 grid(num_seq, 1);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
in_data, starts.data(), out_data, max_index, num_seq, dim);
}
};
template <typename T>
__global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index,
T* in_grad, int64_t num_seq,
int64_t dim) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int col_idx = idx % dim;
if (idx < num_seq * dim) {
int step_id = max_index[idx];
in_grad[step_id * dim + col_idx] = out_grad[idx];
}
}
template <typename T>
class MaxSeqPoolGradFunctor<platform::GPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& out_grad,
const framework::Tensor& index,
framework::LoDTensor* in_grad) {
auto og_dims = out_grad.dims();
auto idx_dims = index.dims();
auto ig_dims = in_grad->dims();
PADDLE_ENFORCE_GT(og_dims.size(), static_cast<int64_t>(1));
PADDLE_ENFORCE_GT(ig_dims.size(), static_cast<int64_t>(1));
for (int64_t i = 1; i < og_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
}
PADDLE_ENFORCE_EQ(idx_dims, og_dims);
const T* og_data = out_grad.data<T>();
const int* max_index = index.data<int>();
T* ig_data = in_grad->data<T>();
SetConstant<platform::GPUPlace, T> set_zero;
set_zero(context, in_grad, static_cast<T>(0.0));
int64_t num_seq = og_dims[0];
int64_t dim = out_grad.numel() / num_seq;
unsigned int blocks = (num_seq * dim + 128 - 1) / 128;
dim3 threads(128, 1);
dim3 grid(blocks, 1);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
KeMaxSequencePoolGrad<T><<<grid, threads, 0, stream>>>(
og_data, max_index, ig_data, num_seq, dim);
}
};
template class MaxSeqPoolFunctor<platform::GPUPlace, float>;
template class MaxSeqPoolFunctor<platform::GPUPlace, double>;
template class MaxSeqPoolGradFunctor<platform::GPUPlace, float>;
template class MaxSeqPoolGradFunctor<platform::GPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
#define FLT_MAX __FLT_MAX__
template <typename Place, typename T>
class MaxSeqPoolFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::LoDTensor& input, framework::Tensor* output,
framework::Tensor* index);
};
template <typename Place, class T>
class MaxSeqPoolGradFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& out_grad,
const framework::Tensor& index,
framework::LoDTensor* in_grad);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/positive_negative_pair_op.h"
namespace paddle {
namespace operators {
class PositiveNegativePairOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("Score"),
"Input(Score) of PositiveNegativePairOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("Label"),
"Input(Label) of PositiveNegativePairOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("QueryID"),
"Input(QueryID) of PositiveNegativePairOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("PositivePair"),
"Output(PositivePair) of PositiveNegativePairOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("NegativePair"),
"Output(NegativePair) of PositiveNegativePairOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("NeutralPair"),
"Output(NeutralPair) of PositiveNegativePairOp should not be null.");
auto scalar_dim = framework::make_ddim({1});
if (ctx->HasInput("AccumulatePositivePair") ||
ctx->HasInput("AccumulateNegativePair") ||
ctx->HasInput("AccumulateNeutralPair")) {
PADDLE_ENFORCE(ctx->HasInput("AccumulatePositivePair") &&
ctx->HasInput("AccumulateNegativePair") &&
ctx->HasInput("AccumulateNeutralPair"),
"All optional inputs(AccumulatePositivePair, "
"AccumulateNegativePair, AccumulateNeutralPair) of "
"PositiveNegativePairOp are required if one of them is "
"specified.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulatePositivePair"), scalar_dim,
"Shape of AccumulatePositivePair should be {1}.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNegativePair"), scalar_dim,
"Shape of AccumulateNegativePair should be {1}.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNeutralPair"), scalar_dim,
"Shape of AccumulateNeutralPair should be {1}.");
}
auto score_dim = ctx->GetInputDim("Score");
auto label_dim = ctx->GetInputDim("Label");
auto query_dim = ctx->GetInputDim("QueryID");
PADDLE_ENFORCE_EQ(score_dim.size(), 2, "Score should be a 2-D tensor.");
PADDLE_ENFORCE_EQ(label_dim.size(), 2, "Label should be a 2-D tensor.");
PADDLE_ENFORCE_EQ(
label_dim[0], score_dim[0],
"Tensor Score and Label should have the same height (batch size).");
PADDLE_ENFORCE_EQ(label_dim[1], 1,
"The width of Label should be 1, i.e. each item should "
"have a scalar label.");
PADDLE_ENFORCE(query_dim == label_dim,
"QueryID should have the same shape as Label.");
if (ctx->HasInput("Weight")) {
PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim,
"Weight should have the same shape as Label.");
}
int column = ctx->Attrs().Get<int>("column");
auto depth = score_dim[1];
PADDLE_ENFORCE(column < depth && column >= -depth,
"Attribute column should be in the range of [-%l, %l)",
depth, depth);
ctx->SetOutputDim("PositivePair", scalar_dim);
ctx->SetOutputDim("NegativePair", scalar_dim);
ctx->SetOutputDim("NeutralPair", scalar_dim);
}
protected:
framework::DataType IndicateDataType(
const framework::ExecutionContext &ctx) const override {
return framework::ToDataType(ctx.Input<Tensor>("Score")->type());
}
};
class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker {
public:
PositiveNegativePairOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Score",
"(Tensor, float) Model Score on an item (with "
"respect to QueryID). It's a 2-D tensor with shape [batch_size, "
"depth], where the column specified by the attribute \"column\" "
"is used as item score.");
AddInput("Label",
"(Tensor, float) Label of an item (with repsect to "
"QueryId). It's a 2-D tensor with shape [batch_size, 1].");
AddInput("QueryID",
"(Tensor, int64) Query ID that indicates the context. Its shape "
"should be the same as Label.");
AddInput(
"AccumulatePositivePair",
"(float) Optional. The accumulated number of positive pairs over a "
"stream of data. If provided, the output PositivePair will be "
"initialized with this number rather than 0. it won't be modified "
"in place.")
.AsDispensable();
AddInput(
"AccumulateNegativePair",
"(float) Optional. The accumulated number of negative pairs over a "
"stream of data. If provided, the output NegativePair will be "
"initialized with this number rather than 0. it won't be modified "
"in place.")
.AsDispensable();
AddInput("AccumulateNeutralPair",
"(float) Optional. The accumulated number of neutral pairs over a "
"stream of data. If provided, the output NeutralPair will be "
"initialized with this number rather than 0. it won't be modified "
"in place.")
.AsDispensable();
AddInput("Weight",
"(float) Optional. Weight of current item. If specified, its "
"shape should be the same as Label, and the meaning of the output "
"changes from numbers of pairs to the total sum of pairs' "
"weights. Weight of a pair of items is the average of their "
"weights.")
.AsDispensable();
AddOutput("PositivePair",
"(float) Number of positive pairs, i.e. the pairs of "
"items that are ranked correctly.");
AddOutput("NegativePair",
"(float) Number of negative pairs, i.e. the pairs of "
"items that are ranked incorrectly.");
AddOutput("NeutralPair",
"(float) Number of neutral pairs, i.e. the pairs of items "
"that have the same score.")
.AsDispensable();
AddAttr<int>(
"column",
"(int, default -1) The column position of Score used to rank items in "
"descending order. It must be in the range of [-rank(Score), "
"rank(Score)). "
"If `dim < 0`, the dim to reduce is `rank + dim`. "
"Noting that reducing on the first dim will make the LoD info lost.")
.SetDefault(0);
AddComment(R"DOC(
PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR)
model performance.
Within some context, e.g. the "query", a LTR model generates scores
for a list of items, which gives a partial order of the items.
PositiveNegativePairOp takes a list of reference rank order
(Input("Label")) and the model generated scores (Input(Score)) as
inputs and counts the pairs that ranked correctly and incorrectly.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(positive_negative_pair,
ops::PositiveNegativePairOp,
ops::PositiveNegativePairOpMaker);
REGISTER_OP_CPU_KERNEL(
positive_negative_pair,
ops::PositiveNegativePairKernel<paddle::platform::CPUPlace, float>,
ops::PositiveNegativePairKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <unordered_map>
#include <vector>
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/utils/Logging.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename Place, typename T>
class PositiveNegativePairKernel : public framework::OpKernel<T> {
public:
struct PredictionResult {
PredictionResult(T score, T label, T weight)
: score(score), label(label), weight(weight) {}
T score;
T label;
T weight;
};
void Compute(const framework::ExecutionContext& context) const override {
auto score_t = context.Input<Tensor>("Score");
auto label_t = context.Input<Tensor>("Label");
auto query_t = context.Input<Tensor>("QueryID");
auto acc_positive_t = context.Input<Tensor>("AccumulatePositivePair");
auto acc_negative_t = context.Input<Tensor>("AccumulateNegativePair");
auto acc_neutral_t = context.Input<Tensor>("AccumulateNeutralPair");
auto positive_t = context.Output<Tensor>("PositivePair");
auto negative_t = context.Output<Tensor>("NegativePair");
auto neutral_t = context.Output<Tensor>("NeutralPair");
auto weight_t = context.Input<Tensor>("Weight");
auto score = score_t->data<T>();
auto label = label_t->data<T>();
auto query = query_t->data<int64_t>();
const T* weight = nullptr;
if (weight_t != nullptr) {
weight = weight_t->data<T>();
}
T* positive = positive_t->mutable_data<T>(context.GetPlace());
T* negative = negative_t->mutable_data<T>(context.GetPlace());
T* neutral = neutral_t->mutable_data<T>(context.GetPlace());
auto score_dim = score_t->dims();
auto batch_size = score_dim[0];
auto width = score_dim[1];
auto column = context.Attr<int32_t>("column");
if (column < 0) {
column += width;
}
// construct document instances for each query: Query => List[<score#0,
// label#0, weight#0>, ...]
std::unordered_map<int64_t, std::vector<PredictionResult>> predictions;
for (auto i = 0; i < batch_size; ++i) {
if (predictions.find(query[i]) == predictions.end()) {
predictions.emplace(
std::make_pair(query[i], std::vector<PredictionResult>()));
}
predictions[query[i]].emplace_back(score[i * width + column], label[i],
weight_t != nullptr ? weight[i] : 1.0);
}
// for each query, accumulate pair counts
T pos = 0, neg = 0, neu = 0;
if (acc_positive_t != nullptr && acc_negative_t != nullptr &&
acc_neutral_t != nullptr) {
pos = acc_positive_t->data<T>()[0];
neg = acc_negative_t->data<T>()[0];
neu = acc_neutral_t->data<T>()[0];
}
auto evaluate_one_list = [&pos, &neg,
&neu](std::vector<PredictionResult> vec) {
for (auto ite1 = vec.begin(); ite1 != vec.end(); ++ite1) {
for (auto ite2 = ite1 + 1; ite2 != vec.end(); ++ite2) {
if (ite1->label == ite2->label) { // labels are equal, ignore.
continue;
}
T w = (ite1->weight + ite2->weight) * 0.5;
if (ite1->score == ite2->score) {
neu += w;
}
(ite1->score - ite2->score) * (ite1->label - ite2->label) > 0.0
? pos += w
: neg += w;
}
}
};
for (auto prediction : predictions) {
evaluate_one_list(prediction.second);
}
*positive = pos;
*negative = neg;
*neutral = neu;
}
};
} // namespace operators
} // namespace paddle
......@@ -27,6 +27,11 @@ class SequencePoolOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SequencePoolOp should not be null.");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
if (ctx->Attrs().Get<std::string>("pooltype") == "MAX") {
PADDLE_ENFORCE(ctx->HasOutput("MaxIndex"),
"Output(MaxIndex) of SequencePoolOp should not be null.");
ctx->SetOutputDim("MaxIndex", ctx->GetInputDim("X"));
}
}
};
......@@ -35,10 +40,14 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
SequencePoolOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(LoDTensor), the variable-length input of SequencePoolOp");
AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp");
AddOutput("Out",
"(Tensor), output of SequencePoolOp, which does not contain LoD "
"(Tensor) The output of SequencePoolOp does not contain LoD "
"infomation.");
AddOutput("MaxIndex",
"(Tensor<int>) This tensor is used for the sequence max-pooling "
"to record the max indexes.")
.AsIntermediate();
AddAttr<std::string>(
"pooltype",
"(int, default AVERAGE) the pooling pooltype of SequencePoolOp.")
......@@ -96,6 +105,12 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
}
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
}
protected:
framework::DataType IndicateDataType(
const framework::ExecutionContext& ctx) const override {
return framework::ToDataType(ctx.Input<Tensor>("X")->type());
}
};
} // namespace operators
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence_pooling.h"
namespace paddle {
namespace operators {
......@@ -34,7 +35,7 @@ class SequencePoolKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* out = context.Output<LoDTensor>("Out");
auto* out = context.Output<Tensor>("Out");
std::string pooltype = context.Attr<std::string>("pooltype");
auto dims = in->dims();
......@@ -53,6 +54,16 @@ class SequencePoolKernel : public framework::OpKernel<T> {
auto lod_level_0 = lod[0];
out->mutable_data<T>(context.GetPlace());
if (pooltype == "MAX") {
math::MaxSeqPoolFunctor<Place, T> max_pool;
auto* index = context.Output<Tensor>("MaxIndex");
index->Resize({dims});
index->mutable_data<int>(context.GetPlace());
max_pool(context.device_context(), *in, out, index);
return;
}
auto place = context.GetEigenDevice<Place>();
for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
Tensor in_t = in->Slice(static_cast<int>(lod_level_0[i]),
......@@ -69,8 +80,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
} else if (pooltype == "SQRT") {
out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
std::sqrt(static_cast<T>(h));
} else if (pooltype == "MAX") {
out_e.device(place) = in_e.maximum(Eigen::array<int, 1>({{0}}));
} else if (pooltype == "LAST") {
out_e.device(place) = in_e.chip(h - 1, 0);
} else if (pooltype == "FIRST") {
......@@ -87,8 +96,8 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<LoDTensor>("X");
auto* out_g = context.Input<Tensor>(framework::GradVarName("Out"));
auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
std::string pooltype = context.Attr<std::string>("pooltype");
auto dims = in->dims();
......@@ -96,6 +105,14 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
int64_t w = in->numel() / dims[0];
in_g->mutable_data<T>(context.GetPlace());
if (pooltype == "MAX") {
math::MaxSeqPoolGradFunctor<Place, T> max_pool_grad;
auto* index = context.Input<Tensor>("MaxIndex");
max_pool_grad(context.device_context(), *out_g, *index, in_g);
return;
}
if (pooltype == "LAST" || pooltype == "FIRST") {
// set X@Grad be zero at first when pooltype is LAST/FIRST
math::SetConstant<Place, T> functor;
......@@ -118,20 +135,6 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
} else if (pooltype == "SQRT") {
in_g_e.device(place) =
(out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
} else if (pooltype == "MAX") {
auto in_t =
in->Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
in_t_map(in_t.data<T>(), h, w);
int row_id;
Eigen::array<int, 2> extents{{1, 1}};
for (int col_id = 0; col_id < w; col_id++) {
in_t_map.col(col_id).maxCoeff(&row_id);
Eigen::array<int, 2> in_offsets{{row_id, col_id}};
Eigen::array<int, 2> out_offsets{{0, col_id}};
in_g_e.slice(in_offsets, extents).device(place) =
out_g_e.slice(out_offsets, extents);
}
} else if (pooltype == "LAST") {
in_g_e.chip(h - 1, 0).device(place) = out_g_e;
} else if (pooltype == "FIRST") {
......
......@@ -4,13 +4,13 @@
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/softmax_with_cross_entropy_op.h"
#include <paddle/function/TensorType.h>
......@@ -30,12 +30,10 @@ class SoftmaxWithCrossEntropyOpMaker
"which is a 2-D tensor with shape [N x K]. N is the batch_size, "
"and K is the class number.");
AddInput("Label",
"(Tensor, default: Tensor<int>), The ground truth which is a 2-D "
"tensor. "
"If softLabel is set to false, Label is a Tensor<int> with shape "
"[N x 1]."
"If softLabel is set to true, Label is a Tensor<float/double> "
"with shape [N x K].");
"(Tensor) The ground truth which is a 2-D tensor. If soft_label "
"is set to false, Label is a Tensor<int64> with shape [N x 1]. If "
"soft_label is set to true, Label is a Tensor<float/double> with "
"shape [N x K].");
AddOutput(
"Softmax",
"(Tensor, default: Tensor<float>), A 2-D tensor with shape [N x K]. "
......@@ -62,7 +60,7 @@ Because this operator performs a softmax on logits internally, it expects
unscaled logits. This operator should not be used with the output of
softmax operator since that would produce incorrect results.
When the attribute softLabel is set false, this operators expects mutually
When the attribute soft_label is set false, this operators expects mutually
exclusive hard labels, each sample in a batch is in exactly one class with a
probability of 1.0. Each sample in the batch will have a single label.
......@@ -198,6 +196,8 @@ REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyOpGrad);
REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy,
ops::SoftmaxWithCrossEntropyKernel<float>);
ops::SoftmaxWithCrossEntropyKernel<float>,
ops::SoftmaxWithCrossEntropyKernel<double>);
REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradKernel<float>);
ops::SoftmaxWithCrossEntropyGradKernel<float>,
ops::SoftmaxWithCrossEntropyGradKernel<double>);
......@@ -4,13 +4,13 @@
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
......@@ -24,7 +24,7 @@ using Tensor = framework::Tensor;
namespace {
template <typename T>
__global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad,
const int* labels, const int batch_size,
const int64_t* labels, const int batch_size,
const int class_num) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int sample_idx = tid / class_num;
......@@ -50,7 +50,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
int ids = blockIdx.x * blockDim.x + threadIdx.x;
if (ids < batch_size * class_num) {
int row_ids = ids / class_num;
logit_grad[ids] = logit_grad[ids] * (loss_grad[row_ids] - labels[ids]);
logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]);
}
}
} // namespace
......@@ -104,7 +104,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
.stream()>>>(logit_grad_data, loss_grad_data,
label_data, batch_size, class_num);
} else {
const int* label_data = labels->data<int>();
const int64_t* label_data = labels->data<int64_t>();
CrossEntropyGrad<T><<<
grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
context.device_context())
......@@ -119,6 +119,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy,
ops::SoftmaxWithCrossEntropyCUDAKernel<float>);
ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>);
ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
......@@ -4,13 +4,13 @@
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
......@@ -60,25 +60,25 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
const int class_num = logit_grad->dims()[1];
auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
if (context.Attr<bool>("soft_label")) {
auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
auto lbl_mat = EigenMatrix<T>::From(*labels);
logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
logit_grad_mat *
(out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) -
lbl_mat);
out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) *
(logit_grad_mat - lbl_mat);
} else {
logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
logit_grad_mat *
out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num));
const int batch_size = logit_grad->dims()[0];
const int* label_data = labels->data<int>();
const T* out_grad_data = out_grad->data<T>();
const int64_t* label_data = labels->data<int64_t>();
T* logit_grad_data = logit_grad->data<T>();
const T* out_grad_data = out_grad->data<T>();
for (int i = 0; i < batch_size; ++i) {
int index = i * class_num + label_data[i];
logit_grad_data[index] =
out_grad_data[i] * (logit_grad_data[index] - 1.);
logit_grad_data[i * class_num + label_data[i]] -= out_grad_data[i];
}
}
}
......
......@@ -24,10 +24,16 @@ class SumOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null");
auto x_dims = ctx->GetInputsDim("X");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SumOp should not be null.");
if (ctx->IsRuntime() &&
ctx->GetOutputsVarType("Out")[0] ==
framework::VarDesc::LOD_TENSOR_ARRAY) {
return; // skip runtime infershape when is tensor array;
}
auto x_dims = ctx->GetInputsDim("X");
size_t N = x_dims.size();
PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
......@@ -39,6 +45,28 @@ class SumOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Out", in_dim);
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::DataType IndicateDataType(
const framework::ExecutionContext& ctx) const override {
auto x_vars = ctx.MultiInputVar("X");
if (x_vars[0]->IsType<framework::LoDTensor>()) {
return framework::ToDataType(
x_vars[0]->Get<framework::LoDTensor>().type());
} else if (x_vars[0]->IsType<framework::SelectedRows>()) {
return framework::ToDataType(
x_vars[0]->Get<framework::SelectedRows>().value().type());
} else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
auto& array = x_vars[0]->Get<framework::LoDTensorArray>();
for (auto& each : array) {
if (each.numel() != 0) {
return framework::ToDataType(each.type());
}
}
}
PADDLE_THROW("Unexpected branch. Input type is %s",
x_vars[0]->Type().name());
}
};
class SumOpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -63,18 +91,32 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
void operator()(const framework::OpDescBind& op_desc,
framework::BlockDescBind* block) const override {
auto& inputs = op_desc.Input("X");
auto default_var_type = framework::VarDesc::SELECTED_ROWS;
auto var_type = framework::VarDesc::SELECTED_ROWS;
bool any_input_is_lod_tensor = std::any_of(
inputs.begin(), inputs.end(), [block](const std::string& name) {
return block->Var(name)->GetType() == framework::VarDesc::LOD_TENSOR;
});
if (any_input_is_lod_tensor) {
default_var_type = framework::VarDesc::LOD_TENSOR;
auto is_tensor_array = [block](const std::string& name) {
return block->Var(name)->GetType() ==
framework::VarDesc::LOD_TENSOR_ARRAY;
};
bool any_input_is_tensor_array =
std::any_of(inputs.begin(), inputs.end(), is_tensor_array);
bool all_inputs_are_tensor_array =
std::all_of(inputs.begin(), inputs.end(), is_tensor_array);
if (any_input_is_tensor_array) {
PADDLE_ENFORCE(all_inputs_are_tensor_array);
var_type = framework::VarDesc::LOD_TENSOR_ARRAY;
} else if (any_input_is_lod_tensor) {
var_type = framework::VarDesc::LOD_TENSOR;
}
auto out_var_name = op_desc.Output("Out").front();
block->Var(out_var_name)->SetType(default_var_type);
block->Var(out_var_name)->SetType(var_type);
}
};
......
......@@ -11,6 +11,7 @@ limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/selected_rows_functor.h"
......@@ -28,7 +29,7 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename Place, typename T>
class SumKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
void Compute(const framework::ExecutionContext &context) const override {
auto in_vars = context.MultiInputVar("X");
int N = in_vars.size();
auto out_var = context.OutputVar("Out");
......@@ -36,7 +37,7 @@ class SumKernel : public framework::OpKernel<T> {
bool in_place = out_var == in_vars[0];
if (out_var->IsType<framework::LoDTensor>()) {
auto* out = context.Output<Tensor>("Out");
auto *out = context.Output<Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
auto result = EigenVector<T>::Flatten(*out);
......@@ -51,11 +52,11 @@ class SumKernel : public framework::OpKernel<T> {
// If in_place, just skip the first tensor
for (int i = in_place ? 1 : 0; i < N; i++) {
if (in_vars[i]->IsType<framework::LoDTensor>()) {
auto& in_t = in_vars[i]->Get<framework::LoDTensor>();
auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
auto in = EigenVector<T>::Flatten(in_t);
result.device(place) = result + in;
} else if (in_vars[i]->IsType<framework::SelectedRows>()) {
auto& in_t = in_vars[i]->Get<framework::SelectedRows>();
auto &in_t = in_vars[i]->Get<framework::SelectedRows>();
functor(context.device_context(), in_t, out);
} else {
PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
......@@ -63,8 +64,8 @@ class SumKernel : public framework::OpKernel<T> {
}
} else if (out_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now");
auto* out = context.Output<SelectedRows>("Out");
auto* out_value = out->mutable_value();
auto *out = context.Output<SelectedRows>("Out");
auto *out_value = out->mutable_value();
// Runtime InferShape
size_t first_dim = 0;
......@@ -88,9 +89,36 @@ class SumKernel : public framework::OpKernel<T> {
offset, out);
offset += in_vars[i]->Get<SelectedRows>().value().numel();
}
} else if (out_var->IsType<framework::LoDTensorArray>()) {
auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
"Only support all inputs are TensorArray");
auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
for (size_t i = 0; i < in_array.size(); ++i) {
if (in_array[i].numel() != 0) {
if (i >= out_array.size()) {
out_array.resize(i + 1);
}
if (out_array[i].numel() == 0) {
out_array[i].CopyFrom(in_array[i], in_array[i].place(),
context.device_context());
out_array[i].set_lod(in_array[i].lod());
} else {
PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
auto in = EigenVector<T>::Flatten(in_array[i]);
auto result = EigenVector<T>::Flatten(out_array[i]);
result.device(context.GetEigenDevice<Place>()) = result + in;
}
}
}
}
} else {
PADDLE_THROW("Unexpected branch, output variable type is %s",
out_var->Type().name());
}
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
class ArrayOpBase : public framework::OperatorBase {
public:
ArrayOpBase(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {}
protected:
size_t GetOffset(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const {
auto *i = scope.FindVar(Input("I"));
PADDLE_ENFORCE(i != nullptr, "I must be set");
auto &i_tensor = i->Get<framework::LoDTensor>();
PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
size_t offset;
if (platform::is_gpu_place(i_tensor.place())) {
// FIXME: Avoid copy from GPU to CPU
framework::Tensor t;
t.CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx);
dev_ctx.Wait();
offset = static_cast<size_t>(*t.data<int64_t>());
} else {
offset = static_cast<size_t>(*i_tensor.data<int64_t>());
}
return offset;
}
};
class WriteToArrayOp : public ArrayOpBase {
public:
WriteToArrayOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: ArrayOpBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
auto *x = scope.FindVar(Input("X"));
PADDLE_ENFORCE(x != nullptr, "X must be set");
auto &x_tensor = x->Get<framework::LoDTensor>();
size_t offset = GetOffset(scope, dev_ctx);
auto *out =
scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
if (offset >= out->size()) {
out->resize(offset + 1);
}
auto *out_tensor = &out->at(offset);
out_tensor->CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx);
out_tensor->set_lod(x_tensor.lod());
}
};
class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
WriteToArrayOpProtoMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(LoDTensor) the tensor will be written to tensor array");
AddInput(
"I",
"(Tensor) the subscript index in tensor array. The number of element "
"should be 1");
AddOutput("Out", "(TensorArray) the tensor array will be written");
AddComment(R"DOC(Write a LoDTensor to a LoDTensor array.
Assume T is LoDTensor, i is the subscript of the array, and A is the array. The
equation is
A[i] = T
)DOC");
}
};
class WriteToArrayInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index");
PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1,
"The number of element of subscript index must be 1");
PADDLE_ENFORCE(context->HasInput("X"), NotHasXError());
PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError());
context->SetOutputDim("Out", context->GetInputDim("X"));
}
protected:
virtual const char *NotHasXError() const { return "Must set the lod tensor"; }
virtual const char *NotHasOutError() const {
return "Must set the lod tensor array";
}
};
class WriteToArrayInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDescBind &op_desc,
framework::BlockDescBind *block) const override {
for (auto &out_var : op_desc.OutputArgumentNames()) {
VLOG(10) << "Set Variable " << out_var << " as LOD_TENSOR_ARRAY";
block->Var(out_var)->SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
}
}
};
class ReadFromArrayOp : public ArrayOpBase {
public:
ReadFromArrayOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: ArrayOpBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
auto *x = scope.FindVar(Input("X"));
PADDLE_ENFORCE(x != nullptr, "X must be set");
auto &x_array = x->Get<framework::LoDTensorArray>();
auto *out = scope.FindVar(Output("Out"));
PADDLE_ENFORCE(out != nullptr, "Out must be set");
auto *out_tesnor = out->GetMutable<framework::LoDTensor>();
size_t offset = GetOffset(scope, dev_ctx);
PADDLE_ENFORCE_LT(offset, x_array.size());
out_tesnor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx);
out_tesnor->set_lod(x_array[offset].lod());
}
};
class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
ReadFromArrayProtoMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(TensorArray) the array will be read from.");
AddInput("I",
"(Tensor) the subscript index in tensor array. The number of "
"element should be 1");
AddOutput("Out", "(LoDTensor) the tensor will be read from.");
AddComment(R"DOC(Read a LoDTensor from a LoDTensor Array
Assume T is LoDTensor, i is th e subscript of the array, and A is the array. The
equation is
T = A[i]
)DOC");
}
};
class ReadFromArrayInferShape : public WriteToArrayInferShape {
protected:
const char *NotHasXError() const override {
return "The input array X must be set";
}
const char *NotHasOutError() const override {
return "The output tensor out must be set";
}
};
class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDescBind> Apply() const override {
auto *grad_op = new framework::OpDescBind();
grad_op->SetType("read_from_array");
grad_op->SetInput("I", Input("I"));
grad_op->SetInput("X", OutputGrad("Out"));
grad_op->SetOutput("Out", InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(grad_op);
}
};
class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDescBind> Apply() const override {
auto *grad_op = new framework::OpDescBind();
grad_op->SetType("write_to_array");
grad_op->SetInput("I", Input("I"));
grad_op->SetInput("X", OutputGrad("Out"));
grad_op->SetOutput("Out", InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(grad_op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(write_to_array, ops::WriteToArrayOp,
ops::WriteToArrayInferShape, ops::WriteToArrayOpProtoMaker,
ops::WriteToArrayGradMaker, ops::WriteToArrayInferVarType);
REGISTER_OPERATOR(read_from_array, ops::ReadFromArrayOp,
ops::ReadFromArrayInferShape, ops::ReadFromArrayProtoMaker,
ops::ReadFromArrayGradMaker);
include_directories(${CMAKE_CURRENT_BINARY_DIR})
set(OPITMIZER_SRCS
adadelta_optimizer.cc
adagrad_optimizer.cc
......@@ -9,11 +7,6 @@ set(OPITMIZER_SRCS
sgd_optimizer.cc
)
add_library(paddle_optimizer STATIC ${OPITMIZER_SRCS})
add_dependencies(paddle_optimizer paddle_proto ${external_project_dependencies})
if(WITH_TESTING)
add_simple_unittest(serialization_test)
add_simple_unittest(parameter_optimizer_test)
endif()
cc_library(paddle_optimizer STATIC SRCS ${OPITMIZER_SRCS} DEPS paddle_proto glog)
cc_test(serialization_test SRCS serialization_test.cc DEPS paddle_proto)
cc_test(parameter_optimizer_test SRCS parameter_optimizer_test.cc DEPS paddle_optimizer)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "adadelta_optimizer.h"
#include <algorithm>
#include <cmath>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "parameter_optimizer.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include "adagrad_optimizer.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "parameter_optimizer.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "adam_optimizer.h"
#include <cmath>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "parameter_optimizer.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "optimizer.h"
#include <glog/logging.h>
#include <cstdlib>
......@@ -6,8 +20,8 @@
#include "parameter_optimizer.h"
using namespace paddle;
using namespace paddle::optimizer;
using paddle::optimizer::ParameterOptimizer;
using paddle::optimizer::Tensor;
template <paddle_element_type VALUE>
struct EnumToType {};
......@@ -15,22 +29,21 @@ struct EnumToType {};
template <class T>
struct TypeToEnum {};
#define MATCH_ENUM_TYPE(TYPE, ENUM) \
template <> \
struct TypeToEnum<TYPE> { \
static paddle_element_type v() { return ENUM; }; \
static constexpr TYPE value = ENUM; \
}; \
template <> \
struct EnumToType<ENUM> { \
typedef TYPE Type; \
#define MATCH_ENUM_TYPE(TYPE, ENUM) \
template <> \
struct TypeToEnum<TYPE> { \
static paddle_element_type v() { return ENUM; } \
static constexpr TYPE value = ENUM; \
}; \
template <> \
struct EnumToType<ENUM> { \
typedef TYPE Type; \
}
MATCH_ENUM_TYPE(int32_t, PADDLE_ELEMENT_TYPE_INT32);
MATCH_ENUM_TYPE(uint32_t, PADDLE_ELEMENT_TYPE_UINT32);
MATCH_ENUM_TYPE(int64_t, PADDLE_ELEMENT_TYPE_INT64);
MATCH_ENUM_TYPE(uint64_t, PADDLE_ELEMENT_TYPE_UINT64);
// TODO(zhihong): only implement below type, need to fix
MATCH_ENUM_TYPE(float, PADDLE_ELEMENT_TYPE_FLOAT32);
MATCH_ENUM_TYPE(double, PADDLE_ELEMENT_TYPE_FLOAT64);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdbool.h>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "adadelta_optimizer.h"
#include "adagrad_optimizer.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <glog/logging.h>
......
......@@ -110,7 +110,7 @@ public:
int s = 0;
float* newp = (float*)opts_[i]->get_weight(&s);
EXPECT_EQ(s, kSize);
EXPECT_EQ(static_cast<size_t>(s), kSize);
for (size_t j = 0; j < kSize; ++j) {
EXPECT_EQ(newp[j], (*p)[j]);
}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "sgd_optimizer.h"
#include "serialization.h"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "parameter_optimizer.h"
......@@ -15,7 +29,6 @@ public:
nesterov_(n) {
if (momentum_ != 0.0) {
size_t size = parameter->size();
// TODO: fix it with align aware allocator bind to Tensor
momentums_ = new Tensor(size);
}
}
......
......@@ -97,6 +97,15 @@ namespace pybind {
using namespace paddle::framework; // NOLINT
template <typename T>
static py::bytes SerializeMessage(T &self) {
// Check IsInitialized in Python
std::string retv;
PADDLE_ENFORCE(self.Proto()->SerializePartialToString(&retv),
"Cannot serialize message");
return retv;
}
// Bind Methods
void BindProgramDesc(py::module &m) {
py::class_<ProgramDescBind>(m, "ProgramDesc", "")
......@@ -132,17 +141,7 @@ void BindProgramDesc(py::module &m) {
.def("block", &ProgramDescBind::MutableBlock,
py::return_value_policy::reference)
.def("num_blocks", &ProgramDescBind::Size)
.def("serialize_to_string",
[](ProgramDescBind &program_desc) -> py::bytes {
const ProgramDesc *desc = program_desc.Proto();
PADDLE_ENFORCE(desc->IsInitialized(),
"ProgramDesc has not been initialized.");
std::string res;
PADDLE_ENFORCE(
desc->SerializeToString(&res),
"Serialize ProgramDesc Error. This could be a bug of Paddle.");
return res;
})
.def("serialize_to_string", SerializeMessage<ProgramDescBind>)
.def("parse_from_string",
[](ProgramDescBind &program_desc, const std::string &data) {
ProgramDesc *desc = program_desc.Proto();
......@@ -181,16 +180,7 @@ void BindBlockDesc(py::module &m) {
py::return_value_policy::reference)
.def("op_size", &BlockDescBind::OpSize)
.def("op", &BlockDescBind::Op, py::return_value_policy::reference)
.def("serialize_to_string", [](BlockDescBind &block_desc) -> py::bytes {
const BlockDesc *desc = block_desc.Proto();
PADDLE_ENFORCE(desc->IsInitialized(),
"BlockDesc has not been initialized.");
std::string res;
PADDLE_ENFORCE(
desc->SerializeToString(&res),
"Serialize BlockDesc Error. This could be a bug of Paddle.");
return res;
});
.def("serialize_to_string", SerializeMessage<BlockDescBind>);
}
void BindVarDsec(py::module &m) {
......@@ -219,17 +209,7 @@ void BindVarDsec(py::module &m) {
.def("set_lod_level", &VarDescBind::SetLoDLevel)
.def("type", &VarDescBind::GetType)
.def("set_type", &VarDescBind::SetType)
.def("serialize_to_string",
[](VarDescBind &var_desc) -> py::bytes {
const VarDesc *desc = var_desc.Proto();
PADDLE_ENFORCE(desc->IsInitialized(),
"VarDesc has not been initialized.");
std::string res;
PADDLE_ENFORCE(
desc->SerializeToString(&res),
"Serialize VarDesc Error. This could be a bug of Paddle.");
return res;
})
.def("serialize_to_string", SerializeMessage<VarDescBind>)
.def("persistable", &VarDescBind::Persistable)
.def("set_persistable", &VarDescBind::SetPersistable);
......@@ -274,16 +254,7 @@ void BindOpDesc(py::module &m) {
.def("check_attrs", &OpDescBind::CheckAttrs)
.def("infer_shape", &OpDescBind::InferShape)
.def("infer_var_type", &OpDescBind::InferVarType)
.def("serialize_to_string", [](OpDescBind &op_desc) -> py::bytes {
const OpDesc *desc = op_desc.Proto();
PADDLE_ENFORCE(desc->IsInitialized(),
"OpDesc has not been initialized.");
std::string res;
PADDLE_ENFORCE(
desc->SerializeToString(&res),
"Serialize OpDesc Error. This could be a bug of Paddle.");
return res;
});
.def("serialize_to_string", SerializeMessage<OpDescBind>);
}
} // namespace pybind
......
......@@ -2,171 +2,185 @@
set -xe
# Set BASE_IMAGE according to env variables
if [[ ${WITH_GPU} == "ON" ]]; then
BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
else
BASE_IMAGE="ubuntu:16.04"
fi
DOCKERFILE_GPU_ENV=""
DOCKERFILE_CUDNN_DSO=""
if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
fi
mkdir -p /paddle/build
cd /paddle/build
# build script will not fail if *.deb does not exist
rm *.deb 2>/dev/null || true
# delete previous built whl packages
rm -rf /paddle/paddle/dist 2>/dev/null || true
cat <<EOF
========================================
Configuring cmake in /paddle/build ...
-DCMAKE_BUILD_TYPE=Release
-DWITH_DOC=OFF
-DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_MKLDNN=${WITH_MKLDNN:-ON}
-DWITH_MKLML=${WITH_MKLML:-ON}
-DWITH_AVX=${WITH_AVX:-OFF}
-DWITH_GOLANG=${WITH_GOLANG:-ON}
-DWITH_SWIG_PY=ON
-DWITH_C_API=${WITH_C_API:-OFF}
-DWITH_PYTHON=${WITH_PYTHON:-ON}
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-DCUDNN_ROOT=/usr/
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-DWITH_TESTING=${WITH_TESTING:-ON}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
# docker environment is fully controlled by this script.
# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
cmake .. \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_DOC=OFF \
-DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
-DWITH_MKLML=${WITH_MKLML:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \
-DWITH_GOLANG=${WITH_GOLANG:-ON} \
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-DWITH_C_API=${WITH_C_API:-OFF} \
-DWITH_PYTHON=${WITH_PYTHON:-ON} \
-DCUDNN_ROOT=/usr/ \
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-DWITH_TESTING=${WITH_TESTING:-ON} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
cat <<EOF
============================================
Building in /paddle/build ...
============================================
EOF
make -j `nproc`
function cmake_gen() {
# Set BASE_IMAGE according to env variables
if [[ ${WITH_GPU} == "ON" ]]; then
BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
else
BASE_IMAGE="ubuntu:16.04"
fi
DOCKERFILE_GPU_ENV=""
DOCKERFILE_CUDNN_DSO=""
if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
fi
mkdir -p /paddle/build
cd /paddle/build
# build script will not fail if *.deb does not exist
rm *.deb 2>/dev/null || true
# delete previous built whl packages
rm -rf /paddle/paddle/dist 2>/dev/null || true
if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
cat <<EOF
========================================
Running unit tests ...
========================================
cat <<EOF
========================================
Configuring cmake in /paddle/build ...
-DCMAKE_BUILD_TYPE=Release
-DWITH_DOC=OFF
-DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_MKLDNN=${WITH_MKLDNN:-ON}
-DWITH_MKLML=${WITH_MKLML:-ON}
-DWITH_AVX=${WITH_AVX:-OFF}
-DWITH_GOLANG=${WITH_GOLANG:-ON}
-DWITH_SWIG_PY=ON
-DWITH_C_API=${WITH_C_API:-OFF}
-DWITH_PYTHON=${WITH_PYTHON:-ON}
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-DCUDNN_ROOT=/usr/
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-DWITH_TESTING=${WITH_TESTING:-ON}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
========================================
EOF
ctest --output-on-failure
# make install should also be test when unittest
make install -j `nproc`
pip install /usr/local/opt/paddle/share/wheels/*.whl
paddle version
fi
if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
# Disable UNITTEST_USE_VIRTUALENV in docker because
# docker environment is fully controlled by this script.
# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
cmake .. \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_DOC=OFF \
-DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
-DWITH_MKLML=${WITH_MKLML:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \
-DWITH_GOLANG=${WITH_GOLANG:-ON} \
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-DWITH_C_API=${WITH_C_API:-OFF} \
-DWITH_PYTHON=${WITH_PYTHON:-ON} \
-DCUDNN_ROOT=/usr/ \
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-DWITH_TESTING=${WITH_TESTING:-ON} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
}
function run_build() {
cat <<EOF
========================================
Building documentation ...
In /paddle/build_doc
========================================
============================================
Building in /paddle/build ...
============================================
EOF
mkdir -p /paddle/build_doc
pushd /paddle/build_doc
cmake .. \
-DWITH_DOC=ON \
-DWITH_GPU=OFF \
-DWITH_AVX=${WITH_AVX:-ON} \
-DWITH_SWIG_PY=ON \
-DWITH_STYLE_CHECK=OFF
make -j `nproc` gen_proto_py
make -j `nproc` paddle_docs paddle_docs_cn
popd
fi
if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
make -j `nproc`
}
function run_test() {
if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
cat <<EOF
========================================
Converting C++ source code into HTML ...
========================================
========================================
Running unit tests ...
========================================
EOF
export WOBOQ_OUT=/paddle/build/woboq_out
mkdir -p $WOBOQ_OUT
cp -rv /woboq/data $WOBOQ_OUT/../data
/woboq/generator/codebrowser_generator \
-b /paddle/build \
-a \
-o $WOBOQ_OUT \
-p paddle:/paddle
/woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
fi
cat <<EOF
========================================
Generate /paddle/build/Dockerfile ...
========================================
ctest --output-on-failure
# make install should also be test when unittest
make install -j `nproc`
pip install /usr/local/opt/paddle/share/wheels/*.whl
paddle version
fi
}
function gen_docs() {
if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
cat <<EOF
========================================
Building documentation ...
In /paddle/build_doc
========================================
EOF
mkdir -p /paddle/build_doc
pushd /paddle/build_doc
cmake .. \
-DWITH_DOC=ON \
-DWITH_GPU=OFF \
-DWITH_AVX=${WITH_AVX:-ON} \
-DWITH_SWIG_PY=ON \
-DWITH_STYLE_CHECK=OFF
make -j `nproc` gen_proto_py
make -j `nproc` paddle_docs paddle_docs_cn
popd
fi
if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
cat <<EOF
========================================
Converting C++ source code into HTML ...
========================================
EOF
export WOBOQ_OUT=/paddle/build/woboq_out
mkdir -p $WOBOQ_OUT
cp -rv /woboq/data $WOBOQ_OUT/../data
/woboq/generator/codebrowser_generator \
-b /paddle/build \
-a \
-o $WOBOQ_OUT \
-p paddle:/paddle
/woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
fi
}
cat > /paddle/build/Dockerfile <<EOF
FROM ${BASE_IMAGE}
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ENV HOME /root
function gen_dockerfile() {
cat <<EOF
========================================
Generate /paddle/build/Dockerfile ...
========================================
EOF
if [[ -n ${APT_MIRROR} ]]; then
cat >> /paddle/build/Dockerfile <<EOF
RUN sed -i '${APT_MIRROR}' /etc/apt/sources.list
cat > /paddle/build/Dockerfile <<EOF
FROM ${BASE_IMAGE}
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ENV HOME /root
EOF
fi
if [[ ${WITH_GPU} == "ON" ]]; then
NCCL_DEPS="apt-get install -y libnccl-dev &&"
else
NCCL_DEPS=""
fi
cat >> /paddle/build/Dockerfile <<EOF
ADD python/dist/*.whl /
# run paddle version to install python packages first
RUN apt-get update &&\
${NCCL_DEPS}\
apt-get install -y wget python-pip && pip install -U pip && \
pip install /*.whl; apt-get install -f -y && \
apt-get clean -y && \
rm -f /*.whl && \
paddle version && \
ldconfig
${DOCKERFILE_CUDNN_DSO}
${DOCKERFILE_GPU_ENV}
ADD go/cmd/pserver/pserver /usr/bin/
ADD go/cmd/master/master /usr/bin/
ADD paddle/pybind/print_operators_doc /usr/bin/
# default command shows the paddle version and exit
CMD ["paddle", "version"]
if [[ ${WITH_GPU} == "ON" ]]; then
NCCL_DEPS="apt-get install -y libnccl-dev &&"
else
NCCL_DEPS=""
fi
cat >> /paddle/build/Dockerfile <<EOF
ADD python/dist/*.whl /
# run paddle version to install python packages first
RUN apt-get update &&\
${NCCL_DEPS}\
apt-get install -y wget python-pip && pip install -U pip && \
pip install /*.whl; apt-get install -f -y && \
apt-get clean -y && \
rm -f /*.whl && \
paddle version && \
ldconfig
${DOCKERFILE_CUDNN_DSO}
${DOCKERFILE_GPU_ENV}
ADD go/cmd/pserver/pserver /usr/bin/
ADD go/cmd/master/master /usr/bin/
ADD paddle/pybind/print_operators_doc /usr/bin/
# default command shows the paddle version and exit
CMD ["paddle", "version"]
EOF
}
set +xe
cmake_gen
run_build
run_test
gen_docs
gen_dockerfile
printf "If you need to install PaddlePaddle in develop docker image,"
printf "please make install or pip install build/python/dist/*.whl.\n"
......@@ -17,8 +17,7 @@ limitations under the License. */
#include <fenv.h>
#if (defined(__APPLE__) || defined(__OSX__)) && !defined(__arm__) && \
!defined(__aarch64__)
#if defined(__APPLE__) || defined(__OSX__)
int fegetexcept(void);
int feenableexcept(unsigned int excepts);
......
......@@ -14,9 +14,13 @@ limitations under the License. */
#include "paddle/utils/Excepts.h"
#if (defined(__APPLE__) || defined(__OSX__)) && !defined(__arm__) && \
!defined(__aarch64__)
#if defined(__APPLE__) || defined(__OSX__)
#if defined(__arm__) || defined(__arm64__)
// TODO(liuyiqun): implement the arm version
int fegetexcept(void) { return -1; }
int feenableexcept(unsigned int excepts) { return -1; }
int fedisableexcept(unsigned int excepts) { return -1; }
#else
int fegetexcept(void) {
static fenv_t fenv;
return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
......@@ -49,5 +53,5 @@ int fedisableexcept(unsigned int excepts) {
return (fesetenv(&fenv) ? -1 : old_excepts);
}
#endif
#endif
......@@ -18,6 +18,6 @@ limitations under the License. */
TEST(StringUtil, to) {
ASSERT_NEAR(paddle::str::to<double>("12.45"), 12.45, 1e-5);
ASSERT_DEATH(paddle::str::to<double>("12.45x23"), ".*");
ASSERT_DEATH(paddle::str::to<int>(""), ".*");
ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<double>("12.45x23"), ".*");
ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<int>(""), ".*");
}
......@@ -44,6 +44,7 @@ add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/pad
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND touch stub.cc
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
......
......@@ -143,6 +143,7 @@ __all__ = [
'scale_shift_layer',
'img_conv3d_layer',
'resize_layer',
'sub_seq_layer',
]
......@@ -252,6 +253,7 @@ class LayerType(object):
SCALE_SHIFT_LAYER = 'scale_shift'
RESIZE = 'resize'
SUB_SEQ_LAYER = 'subseq'
@staticmethod
def is_layer_type(type_name):
......@@ -6980,3 +6982,58 @@ def resize_layer(input, size, name=None):
"""
Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size)
return LayerOutput(name, LayerType.RESIZE, parents=[input], size=input.size)
@wrap_act_default(act=LinearActivation())
@wrap_name_default('sub_seq')
def sub_seq_layer(input, offsets, sizes, act=None, bias_attr=None, name=None):
"""
sub_seq_layer will return sub-sequences from the input sequences. For each
sequence in the input sequence layer, sub_seq_layer will slice it by given
offset and size. Please notice that, number of offset value and size value
both are equal to the number of sequence in the input layer.
.. code-block:: python
sub_seq = sub_seq_layer(input=input_seq, offsets=offsets, sizes=sizes)
:param name: The name of this layer. It is optional.
:type name: basestring
:param input: The input of this layer, which should be sequence.
:type input: LayerOutput
:param offsets: offset indices to slice the input sequence, which should be
sequence type.
:type offsets: LayerOutput
:param sizes: sizes of the sub-sequences, which should be sequence type.
:type sizes: LayerOutput
:param act: Layer activation, default is LinearActivation
:type act: BaseActivation.
:param bias_attr: The Bias Attribute. If the parameter is set to
False or something not type of ParameterAttribute,
no bias is defined. If the parameter is set to
True, the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any
:return: LayerOutput object.
:rtype: LayerOutput
"""
assert isinstance(input, LayerOutput), (
'The first input of sub_seq_layer layer must be a PaddlePaddle layer.')
assert isinstance(offsets, LayerOutput), (
'The offset indices for sub_seq_layer, '
'must be a PaddlePaddle layer.')
assert isinstance(sizes, LayerOutput), (
'The sizes of sub-sequences, must be a PaddlePaddle layer.')
Layer(
name=name,
type=LayerType.SUB_SEQ_LAYER,
inputs=[input.name, offsets.name, sizes.name],
active_type=act.name,
bias=ParamAttr.to_bias(bias_attr))
return LayerOutput(
name,
LayerType.SUB_SEQ_LAYER,
parents=[input, offsets, sizes],
size=input.size)
......@@ -116,7 +116,7 @@ class AdamOptimizer(BaseSGDOptimizer):
m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
:param beta1: the :math:`\\beta_1` in equation.
:type beta1: float
......
......@@ -23,32 +23,32 @@ from paddle.v2.topology import Topology
def merge_v2_model(net, param_file, output_file):
'''Integrate the model config and model parameters into one file.
'''Merge the model config and parameters into one file.
The model configuration file describes the model structure which
ends with .py. The parameters file stores the parameters of the model
which ends with .tar.gz.
@param net The output layer of the network.
@param param_file Path of the model parameters(.tar.gz) which is stored by v2 api.
@param net The output layer of the network for inference.
@param param_file Path of the parameters (.tar.gz) which is stored by v2 api.
@param output_file Path of the merged file which will be generated.
Usage:
from paddle.util.merge_model import merge_v2_model
from paddle.utils.merge_model import merge_v2_model
# import your network configuration
from mobilenet import mobile_net
net = mobile_net(3*224*224, 102)
from example_net import net_conf
net = net_conf(is_predict=True)
param_file = './param_pass_00000.tar.gz'
output_file = './output.paddle'
merge_v2_model(net, param_file, output_file)
'''
assert isinstance(net, LayerOutput), \
"The net should be the output of the network"
"The net should be the output of the network for inference"
assert os.path.exists(param_file), \
"The model parameters file %s does not exists " % (param_file)
......
import paddle.v2.framework.core as core
from paddle.v2.framework.framework import Block, Program
from paddle.v2.framework.framework import Block, Program, g_main_program
g_scope = core.Scope()
......@@ -18,7 +18,7 @@ class Executor(object):
self.executor = core.Executor(act_places)
def run(self,
program,
program=None,
feed=None,
fetch_list=None,
feed_var_name='feed',
......@@ -29,6 +29,9 @@ class Executor(object):
if fetch_list is None:
fetch_list = []
if program is None:
program = g_main_program
if not isinstance(program, Program):
raise TypeError()
......
......@@ -12,6 +12,14 @@ def unique_name(prefix):
return "_".join([prefix, str(uid)])
def _debug_string_(proto):
error_fields = list()
if not proto.IsInitialized(error_fields):
raise ValueError("{0} are not initialized\nThe message is {1}".format(
error_fields, proto))
return proto.__str__()
class Variable(object):
def __init__(self,
block,
......@@ -95,7 +103,7 @@ class Variable(object):
def __str__(self):
protostr = self.desc.serialize_to_string()
proto = framework_pb2.VarDesc.FromString(str(protostr))
return proto.__str__()
return _debug_string_(proto)
__repr__ = __str__
......@@ -286,7 +294,7 @@ class Operator(object):
def __str__(self):
protostr = self.desc.serialize_to_string()
proto = framework_pb2.OpDesc.FromString(str(protostr))
return proto.__str__()
return _debug_string_(proto)
__repr__ = __str__
......@@ -343,7 +351,7 @@ class Block(object):
def __str__(self):
protostr = self.desc.serialize_to_string()
proto = framework_pb2.BlockDesc.FromString(str(protostr))
return proto.__str__()
return _debug_string_(proto)
__repr__ = __str__
......@@ -448,7 +456,7 @@ class Program(object):
def __str__(self):
protostr = self.desc.serialize_to_string()
proto = framework_pb2.ProgramDesc.FromString(str(protostr))
return proto.__str__()
return _debug_string_(proto)
def clone(self):
p = Program()
......
import paddle.v2.framework.core as core
from paddle.v2.framework.framework import OpProtoHolder, Variable, Program, Operator
from paddle.v2.framework.initializer import ConstantInitializer, NormalInitializer
from paddle.v2.framework.framework import OpProtoHolder, Variable, Program, \
Operator
from paddle.v2.framework.initializer import ConstantInitializer, \
NormalInitializer
from paddle.v2.framework.layer_helper import LayerHelper, unique_name
import re
......@@ -372,11 +374,13 @@ def sequence_pool(input, pool_type, **kwargs):
helper = LayerHelper('sequence_pool', input=input, **kwargs)
dtype = helper.input_dtype()
pool_out = helper.create_tmp_variable(dtype)
max_index = helper.create_tmp_variable(dtype)
helper.append_op(
type="sequence_pool",
inputs={"X": [input]},
outputs={"Out": [pool_out]},
inputs={"X": input},
outputs={"Out": pool_out,
"MaxIndex": max_index},
attrs={"pooltype": pool_type.upper()})
return pool_out
......@@ -577,25 +581,45 @@ class StaticRNN(object):
if self.status != StaticRNN.IN_RNN_BLOCK:
raise ValueError("You must invoke {0} in rnn block".format(method))
def memory(self, init=None, shape=None, dtype=None, init_value=0):
def memory(self,
init=None,
shape=None,
batch_ref=None,
init_value=0.0,
init_batch_dim_idx=0,
ref_batch_dim_idx=1):
'''
:param init: boot memory, if not set, a shape, batch_ref must be provided
:param shape: shape of the boot memory
:param batch_ref: batch size reference variable
:param init_value: the init value of boot memory
:param init_batch_dim_idx: the index of batch size in init's dimension
:param ref_batch_dim_idx: the index of batch size in batch_ref's dimension
:return: boot memory
'''
self._assert_in_rnn_block_('memory')
if init is None:
if shape is None or dtype is None:
if shape is None or batch_ref is None:
raise ValueError(
"if init is None, memory at least need shape and dtype")
"if init is None, memory at least need shape and batch_ref")
parent_block = self.parent_block()
var_name = unique_name("@".join([self.helper.name, "memory_boot"]))
boot_var = parent_block.create_var(
name=var_name, shape=shape, dtype=dtype, persistable=False)
name=var_name,
shape=shape,
dtype=batch_ref.data_type,
persistable=False)
parent_block.append_op(
type="fill_constant",
inputs={},
type="fill_constant_batch_size_like",
inputs={'Input': [batch_ref]},
outputs={'Out': [boot_var]},
attrs={
'value': init_value,
'shape': [40] + list(boot_var.shape[1:]),
'data_type': boot_var.data_type
'shape': boot_var.shape,
'data_type': boot_var.data_type,
'input_dim_idx': ref_batch_dim_idx,
'output_dim_idx': init_batch_dim_idx
})
return self.memory(init=boot_var)
......@@ -749,3 +773,68 @@ def lod_rank_table(x, level=0, main_program=None):
outputs={'Out': table},
attrs={'level': level})
return table
def fill_constant(shape, dtype, value, main_program=None):
helper = LayerHelper("ones", **locals())
out = helper.create_tmp_variable(dtype=dtype)
helper.append_op(
type='fill_constant',
inputs={},
outputs={'Out': [out]},
attrs={
'shape': shape,
'data_type': out.data_type,
'value': float(value)
})
out.stop_gradient = True
return out
def ones(shape, dtype, main_program=None):
return fill_constant(value=1.0, **locals())
def zeros(shape, dtype, main_program=None):
return fill_constant(value=0.0, **locals())
def increment(x, value=1.0, main_program=None):
helper = LayerHelper("increment", **locals())
tmp = helper.create_tmp_variable(dtype=x.data_type)
helper.append_op(
type='increment',
inputs={'X': [x]},
outputs={'Out': [tmp]},
attrs={'step': value})
return tmp
def array_write(x, i, array=None, main_program=None):
helper = LayerHelper('array_write', **locals())
if array is None:
array = helper.create_variable(
name="{0}.out".format(helper.name),
type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
dtype=x.data_type)
helper.append_op(
type='write_to_array',
inputs={'X': [x],
'I': [i]},
outputs={'Out': [array]})
return array
def array_read(array, i, main_program=None):
helper = LayerHelper('array_read', **locals())
if not isinstance(
array,
Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
raise TypeError("array should be tensor array vairable")
out = helper.create_tmp_variable(dtype=array.data_type)
helper.append_op(
type='read_from_array',
inputs={'X': [array],
'I': [i]},
outputs={'Out': [out]})
return out
import unittest
import paddle.v2.framework.core as core
import paddle.v2.framework.layers as layers
from paddle.v2.framework.executor import Executor
from paddle.v2.framework.backward import append_backward_ops
from paddle.v2.framework.framework import g_main_program
import numpy
class TestArrayReadWrite(unittest.TestCase):
def test_read_write(self):
x = [
layers.data(
name='x0', shape=[100]), layers.data(
name='x1', shape=[100]), layers.data(
name='x2', shape=[100])
]
for each_x in x:
each_x.stop_gradient = False
i = layers.zeros(shape=[1], dtype='int64')
arr = layers.array_write(x=x[0], i=i)
i = layers.increment(x=i)
i.stop_gradient = True
arr = layers.array_write(x=x[1], i=i, array=arr)
i = layers.increment(x=i)
i.stop_gradient = True
arr = layers.array_write(x=x[2], i=i, array=arr)
i = layers.zeros(shape=[1], dtype='int64')
a0 = layers.array_read(array=arr, i=i)
i = layers.increment(x=i)
i.stop_gradient = True # index should not calculate gradient
a1 = layers.array_read(array=arr, i=i)
i = layers.increment(x=i)
i.stop_gradient = True
a2 = layers.array_read(array=arr, i=i)
mean_a0 = layers.mean(x=a0)
mean_a1 = layers.mean(x=a1)
mean_a2 = layers.mean(x=a2)
a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2])
mean_x0 = layers.mean(x=x[0])
mean_x1 = layers.mean(x=x[1])
mean_x2 = layers.mean(x=x[2])
x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2])
scope = core.Scope()
cpu = core.CPUPlace()
exe = Executor(cpu)
tensor = core.LoDTensor()
tensor.set(numpy.random.random(size=(100, 100)).astype('float32'), cpu)
outs = map(numpy.array,
exe.run(feed={'x0': tensor,
'x1': tensor,
'x2': tensor},
fetch_list=[a_sum, x_sum],
scope=scope))
self.assertEqual(outs[0], outs[1])
total_sum = layers.sums(input=[a_sum, x_sum])
total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)
append_backward_ops(total_sum_scaled)
g_vars = map(g_main_program.global_block().var,
[each_x.name + "@GRAD" for each_x in x])
g_out = [
item.sum()
for item in map(
numpy.array,
exe.run(feed={'x0': tensor,
'x1': tensor,
'x2': tensor},
fetch_list=g_vars))
]
g_out_sum = numpy.array(g_out).sum()
# since our final gradient is 1 and the neural network are all linear
# with mean_op.
# the input gradient should also be 1
self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
if __name__ == '__main__':
unittest.main()
......@@ -61,25 +61,23 @@ class TestConv2dOp(OpTest):
def test_check_grad(self):
self.check_grad(
set(['Input', 'Filter']), 'Output', max_relative_error=0.05)
set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
def test_check_grad_no_filter(self):
self.check_grad(
['Input'],
'Output',
max_relative_error=0.05,
max_relative_error=0.02,
no_grad_set=set(['Filter']))
def test_check_grad_no_input(self):
self.check_grad(
['Filter'],
'Output',
max_relative_error=0.05,
max_relative_error=0.02,
no_grad_set=set(['Input']))
def init_test_case(self):
# self.groups = 1
# self.op_type = "conv2d"
self.pad = [0, 0]
self.stride = [1, 1]
self.dilations = [1, 1]
......@@ -103,6 +101,9 @@ class TestWithGroup(TestConv2dOp):
self.op_type = "conv2d"
#----------------Conv2dCudnn----------------
class TestCudnn(TestConv2dOp):
def init_group(self):
self.groups = 1
......
......@@ -58,36 +58,37 @@ class TestConv2dTransposeOp(OpTest):
print 'check output here for', self.op_type
self.check_output()
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.dilations = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3]
def init_op_type(self):
self.op_type = "conv2d_transpose"
def test_check_grad_no_input(self):
self.check_grad(
['Filter'],
'Output',
max_relative_error=0.05,
max_relative_error=0.02,
no_grad_set=set(['Input']))
def test_check_grad_no_filter(self):
self.check_grad(
['Input'],
'Output',
max_relative_error=0.05,
max_relative_error=0.02,
no_grad_set=set(['Filter']))
def test_check_grad(self):
self.check_grad(
set(['Input', 'Filter']), 'Output', max_relative_error=0.05)
set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.dilations = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3]
def init_op_type(self):
self.op_type = "conv2d_transpose"
# ------------ test_cudnn ------------
class TestCudnn(TestConv2dTransposeOp):
def init_op_type(self):
self.op_type = "conv2d_transpose_cudnn"
......
import unittest
import numpy as np
from op_test import OpTest
def conv3d_forward_naive(input, filter, group, conv_param):
in_n, in_c, in_d, in_h, in_w = input.shape
out_c, f_c, f_d, f_h, f_w = filter.shape
assert f_c * group == in_c
assert np.mod(out_c, group) == 0
sub_out_c = out_c / group
stride, pad = conv_param['stride'], conv_param['pad']
out_d = 1 + (in_d + 2 * pad[0] - f_h) / stride[0]
out_h = 1 + (in_h + 2 * pad[1] - f_h) / stride[1]
out_w = 1 + (in_w + 2 * pad[2] - f_w) / stride[2]
out = np.zeros((in_n, out_c, out_d, out_h, out_w))
input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], ),
(pad[2], )),
mode='constant',
constant_values=0)
for d in range(out_d):
for i in range(out_h):
for j in range(out_w):
for g in range(group):
input_pad_masked = \
input_pad[:, g * f_c:(g + 1) * f_c,
d * stride[0]:d * stride[0] + f_d,
i * stride[1]:i * stride[1] + f_h,
j * stride[2]:j * stride[2] + f_w]
f_sub = filter[g * sub_out_c:(g + 1) *
sub_out_c, :, :, :, :]
for k in range(sub_out_c):
out[:, g * sub_out_c + k, d, i, j] = \
np.sum(input_pad_masked * f_sub[k, :, :, :, :],
axis=(1, 2, 3, 4))
return out
class TestConv3dOp(OpTest):
def setUp(self):
self.init_group()
self.init_op_type()
self.init_test_case()
conv3d_param = {'stride': self.stride, 'pad': self.pad}
input = np.random.random(self.input_size).astype("float32")
filter = np.random.random(self.filter_size).astype("float32")
output = conv3d_forward_naive(input, filter, self.groups,
conv3d_param).astype("float32")
self.inputs = {'Input': input, 'Filter': filter}
self.attrs = {
'strides': self.stride,
'paddings': self.pad,
'groups': self.groups
}
self.outputs = {'Output': output}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(
set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
def test_check_grad_no_filter(self):
self.check_grad(
['Input'],
'Output',
max_relative_error=0.03,
no_grad_set=set(['Filter']))
def test_check_grad_no_input(self):
self.check_grad(
['Filter'],
'Output',
max_relative_error=0.03,
no_grad_set=set(['Input']))
def init_test_case(self):
self.pad = [0, 0, 0]
self.stride = [1, 1, 1]
self.input_size = [2, 3, 4, 4, 4] # NCDHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3, 3]
def init_group(self):
self.groups = 1
def init_op_type(self):
self.op_type = "conv3d"
class TestCase1(TestConv3dOp):
def init_test_case(self):
self.pad = [1, 1, 1]
self.stride = [1, 1, 1]
self.input_size = [2, 3, 4, 4, 4] # NCDHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3, 3]
def init_group(self):
self.groups = 1
def init_op_type(self):
self.op_type = "conv3d"
class TestWithGroup1(TestConv3dOp):
def init_group(self):
self.groups = 3
def init_op_type(self):
self.op_type = "conv3d"
class TestWithGroup2(TestCase1):
def init_group(self):
self.groups = 3
def init_op_type(self):
self.op_type = "conv3d"
if __name__ == '__main__':
unittest.main()
import unittest
import numpy as np
from op_test import OpTest
def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
# [2, 3, 5, 5, 5]
in_n, in_c, in_d, in_h, in_w = input_.shape
# [3, 6, 3, 3, 3]
f_c, out_c, f_d, f_h, f_w = filter_.shape
assert in_c == f_c
stride, pad = conv3dtranspose_param['stride'], conv3dtranspose_param['pad']
out_d = (in_d - 1) * stride[0] + f_d
out_h = (in_h - 1) * stride[1] + f_h
out_w = (in_w - 1) * stride[2] + f_w
out = np.zeros((in_n, out_c, out_d, out_h, out_w))
for n in range(in_n):
for d in range(in_d):
for i in range(in_h):
for j in range(in_w):
input_masked = input_[n, :, d, i, j] # (c)
input_masked = np.reshape(input_masked, (in_c, 1, 1, 1))
input_masked = np.tile(input_masked, (1, f_d, f_h, f_w))
for k in range(out_c):
tmp_out = np.sum(input_masked * filter_[:, k, :, :, :],
axis=0)
d1, d2 = d * stride[0], d * stride[0] + f_d
i1, i2 = i * stride[1], i * stride[1] + f_h
j1, j2 = j * stride[2], j * stride[2] + f_w
out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out
return out
class TestConv3dTransposeOp(OpTest):
def setUp(self):
# init as conv transpose
self.init_op_type()
# [2, 3, 5, 5, 5] -> kernel [3, 6, 3, 3, 3] -> output [2, 6, 7, 7, 7]
self.init_test_case()
conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad}
input_ = np.random.random(self.input_size).astype("float32")
filter_ = np.random.random(self.filter_size).astype("float32")
output = conv3dtranspose_forward_naive(
input_, filter_, conv3dtranspose_param).astype("float32")
# print 'deconv output py', output, output.shape
self.inputs = {'Input': input_, 'Filter': filter_}
self.attrs = {
'strides': self.stride,
'paddings': self.pad,
# 'dilations': self.dilations
}
self.outputs = {'Output': output}
def test_check_output(self):
print 'check output here'
self.check_output()
def test_check_grad(self):
self.check_grad(
set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
def test_check_grad_no_filter(self):
self.check_grad(
['Input'],
'Output',
max_relative_error=0.02,
no_grad_set=set(['Filter']))
def test_check_grad_no_input(self):
self.check_grad(
['Filter'],
'Output',
max_relative_error=0.02,
no_grad_set=set(['Input']))
def init_test_case(self):
self.pad = [0, 0, 0]
self.stride = [1, 1, 1]
self.dilations = [1, 1, 1]
self.input_size = [2, 3, 5, 5, 5] # NCHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3, 3]
def init_op_type(self):
self.op_type = "conv3d_transpose"
if __name__ == '__main__':
unittest.main()
......@@ -21,9 +21,14 @@ class TestFillConstantBatchSizeLikeWhenSecondDimIsBatchSize(OpTest):
def setUp(self):
self.op_type = "fill_constant_batch_size_like"
self.inputs = {'Input': np.random.random((219, 232)).astype("float32")}
self.attrs = {'value': 3.5, 'shape': [132, -1, 7], 'dim_idx': 1}
out = np.random.random((132, 232, 7)).astype("float32")
self.attrs = {
'value': 3.5,
'shape': [132, -1, 7],
'input_dim_idx': 0,
'output_dim_idx': 1
}
out = np.random.random((132, 219, 7)).astype("float32")
out.fill(3.5)
self.outputs = {'Out': out}
......
import unittest
from paddle.v2.framework.framework import Program
class TestDebugStringFramework(unittest.TestCase):
def test_debug_str(self):
p = Program()
p.current_block().create_var(name='t', shape=[0, 1])
self.assertRaises(ValueError, callableObj=p.__str__)
if __name__ == '__main__':
unittest.main()
import unittest
import numpy as np
import math
from op_test import OpTest
from test_lstm_op import identity, sigmoid, tanh, relu
class TestGRUOp(OpTest):
batch_size = 9
frame_size = 5
activate = {
'identity': identity,
'sigmoid': sigmoid,
'tanh': tanh,
'relu': relu
}
@staticmethod
def seq_to_batch(lod, is_reverse):
idx_in_seq_list = []
seq_starts = lod[0]
seq_lens = []
for i in range(len(seq_starts) - 1):
seq_lens.append(seq_starts[i + 1] - seq_starts[i])
sorted_seqs = sorted(
range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x])
num_batch = seq_lens[sorted_seqs[0]]
for batch_idx in range(num_batch):
idx_in_seq = []
for i in range(len(seq_lens)):
if seq_lens[sorted_seqs[i]] <= batch_idx:
break
idx = (seq_starts[sorted_seqs[i] + 1] - 1 - batch_idx
) if is_reverse else (
seq_starts[sorted_seqs[i]] + batch_idx)
idx_in_seq.append(idx)
idx_in_seq_list.append(idx_in_seq)
return idx_in_seq_list
def gru_step(self, x, h_p, w, b):
batch_size = x.shape[0]
frame_size = w.shape[0]
g = x + np.tile(b, (batch_size, 1))
w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
(frame_size, frame_size * 2))
u_r = self.activate[self.attrs['gate_activation']](np.dot(
h_p, w_u_r) + g[:, :frame_size * 2])
u = u_r[:, :frame_size]
r = u_r[:, frame_size:frame_size * 2]
r_h_p = r * h_p
w_c = w.flatten()[frame_size * frame_size * 2:].reshape(
(frame_size, frame_size))
c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
g[:, frame_size * 2:])
g = np.hstack((u_r, c))
h = u * c + (1 - u) * h_p
return g, r_h_p, h
def gru(self):
input, lod = self.inputs['Input']
w = self.inputs['Weight']
b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros(
(1, self.frame_size * 3))
batch_gate = self.outputs['BatchGate']
batch_reset_hidden_prev = self.outputs['BatchResetHiddenPrev']
batch_hidden = self.outputs['BatchHidden']
hidden = self.outputs['Hidden']
idx_in_seq_list = self.idx_in_seq_list
h_p = self.inputs['H0'] if self.inputs.has_key('H0') else np.zeros(
(len(idx_in_seq_list[0]), self.frame_size))
num_batch = len(idx_in_seq_list)
end_idx = 0
for batch_idx in range(num_batch):
x = input[idx_in_seq_list[batch_idx]]
g, r_h_p, h = self.gru_step(x, h_p, w, b)
if batch_idx < (num_batch - 1):
h_p = h[:len(idx_in_seq_list[batch_idx + 1])]
start_idx = end_idx
end_idx = start_idx + len(idx_in_seq_list[batch_idx])
batch_gate[start_idx:end_idx] = g
batch_reset_hidden_prev[start_idx:end_idx] = r_h_p
batch_hidden[start_idx:end_idx] = h
hidden[idx_in_seq_list[batch_idx]] = h
return batch_gate, batch_reset_hidden_prev, hidden
def set_data(self):
lod = [[0, 2, 6, self.batch_size]]
self.idx_in_seq_list = self.seq_to_batch(lod, self.is_reverse)
batch_size = self.batch_size
frame_size = self.frame_size
input = np.random.rand(batch_size, frame_size * 3).astype('float64')
h0 = np.random.rand(len(self.idx_in_seq_list[0]),
frame_size).astype('float64')
weight = np.random.rand(frame_size, frame_size * 3).astype('float64')
bias = np.random.rand(1, frame_size * 3).astype('float64')
self.inputs = {
'Input': (input, lod),
'H0': h0,
'Weight': weight,
'Bias': bias
}
self.outputs = {
'BatchGate': np.zeros(
(batch_size, frame_size * 3), dtype='float64'),
'BatchResetHiddenPrev': np.zeros(
(batch_size, frame_size), dtype='float64'),
'BatchHidden': np.zeros(
(batch_size, frame_size), dtype='float64'),
'Hidden': np.zeros(
(batch_size, frame_size), dtype='float64')
}
def set_confs(self):
self.is_reverse = False
self.attrs = {
'activation': 'tanh',
'gate_activation': 'sigmoid',
'is_reverse': self.is_reverse
}
def setUp(self):
self.op_type = "gru"
self.set_confs()
self.set_data()
self.gru()
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
class TestGRUOpNoInitial(TestGRUOp):
def set_data(self):
super(TestGRUOpNoInitial, self).set_data()
self.inputs.pop('H0')
def test_check_grad(self):
self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden'])
class TestGRUOpReverse(TestGRUOp):
def set_confs(self):
self.is_reverse = True
self.attrs = {
'activation': 'identity',
'gate_activation': 'sigmoid',
'is_reverse': self.is_reverse
}
if __name__ == "__main__":
unittest.main()
......@@ -21,7 +21,8 @@ class TestHuberLossOp(OpTest):
'Y': np.random.uniform(0, 1., (samples_num, 1)).astype('float32'),
}
residual = self.inputs['Y'] - self.inputs['X']
loss = np.vectorize(huber_loss_forward)(residual, delta)
loss = np.vectorize(huber_loss_forward)(residual,
delta).astype('float32')
self.attrs = {'delta': delta}
self.outputs = {
'Residual': residual,
......@@ -43,6 +44,5 @@ class TestHuberLossOp(OpTest):
['X'], 'Out', max_relative_error=0.008, no_grad_set=set('residual'))
# TODO(typhoonzero): should add this back till we fix it
#if __name__ == '__main__':
# unittest.main()
if __name__ == '__main__':
unittest.main()
import unittest
import itertools
import numpy as np
from op_test import OpTest
def py_pnpair_op(score, label, query, column=-1, weight=None):
# group by query id
predictions = {}
batch_size = label.shape[0]
if weight is None:
weight = np.ones(shape=(batch_size, 1)).astype('float32')
for s, l, q, w in zip(score, label, query, weight):
s, l, q, w = s[column], l[0], q[0], w[0]
if q not in predictions:
predictions[q] = []
predictions[q].append((s, l, w))
# accumulate statistics
pos, neg, neu = 0, 0, 0
for _, ranks in predictions.items():
for e1, e2 in itertools.combinations(ranks, 2):
s1, s2, l1, l2, w1, w2 = e1[0], e2[0], e1[1], e2[1], e1[2], e2[2]
w = (w1 + w2) * 0.5
if l1 == l2:
continue
if s1 == s2:
neu += w
elif (s1 - s2) * (l1 - l2) > 0:
pos += w
else:
neg += w
return np.array(pos).astype('float32'), np.array(neg).astype(
'float32'), np.array(neu).astype('float32')
class TestPositiveNegativePairOp(OpTest):
def setUp(self):
self.op_type = 'positive_negative_pair'
batch_size = 20
max_query_id = 5
score = np.random.normal(size=(batch_size, 1)).astype('float32')
label = np.random.normal(size=(batch_size, 1)).astype('float32')
query = np.array(
[np.random.randint(max_query_id) for i in range(batch_size)])
query = np.reshape(query, newshape=(batch_size, 1)).astype('int64')
pos, neg, neu = py_pnpair_op(score, label, query)
self.inputs = {'Score': score, 'Label': label, 'QueryID': query}
self.attrs = {'column': -1}
self.outputs = {
'PositivePair': pos,
'NegativePair': neg,
'NeutralPair': neu
}
def test_check_output(self):
self.check_output()
class TestPositiveNegativePairOpAccumulateWeight(OpTest):
def setUp(self):
self.op_type = 'positive_negative_pair'
batch_size = 20
max_query_id = 5
max_random_num = 2 << 15
score_dim = 2
score = np.random.normal(size=(batch_size, 2)).astype('float32')
label = np.random.normal(size=(batch_size, 1)).astype('float32')
weight = np.random.normal(size=(batch_size, 1)).astype('float32')
query = np.array(
[np.random.randint(max_query_id) for i in range(batch_size)])
query = np.reshape(query, newshape=(batch_size, 1)).astype('int64')
acc_pos = np.reshape(
np.random.randint(max_random_num), newshape=(1)).astype('float32')
acc_neg = np.reshape(
np.random.randint(max_random_num), newshape=(1)).astype('float32')
acc_neu = np.reshape(
np.random.randint(max_random_num), newshape=(1)).astype('float32')
column = np.random.randint(score_dim)
pos, neg, neu = py_pnpair_op(
score, label, query, column=column, weight=weight)
self.inputs = {
'Score': score,
'Label': label,
'QueryID': query,
'AccumulatePositivePair': acc_pos,
'AccumulateNegativePair': acc_neg,
'AccumulateNeutralPair': acc_neu,
'Weight': weight
}
self.attrs = {'column': column}
self.outputs = {
'PositivePair': pos + acc_pos,
'NegativePair': neg + acc_neg,
'NeutralPair': neu + acc_neu
}
def test_check_output(self):
self.check_output()
if __name__ == '__main__':
unittest.main()
import unittest
import logging
from op_test import get_numeric_gradient
from paddle.v2.framework.layers import *
import paddle.v2.framework.layers as layers
from paddle.v2.framework.framework import Program
from paddle.v2.framework.executor import Executor
from paddle.v2.framework.backward import append_backward_ops
......@@ -16,8 +13,8 @@ class PyRNNBase(object):
self.x = np.ones(shape=input_shape).astype("float32")
self.y = np.zeros(shape=output_shape).astype("float32")
def step(self):
pass
def step(self, step_id, x):
raise NotImplementedError
def forward(self):
for step_id in range(self.x.shape[0]):
......@@ -116,30 +113,30 @@ class RecurrentOpTest1(unittest.TestCase):
self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape)
self.output = mean(x=self.create_rnn_op(), **self.p_info)
self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)
def create_rnn_op(self):
x = data(
x = layers.data(
shape=[self.sent_len, self.batch_size, self.input_dim],
data_type='float32',
name='x',
append_batch_size=False,
**self.p_info)
x.stop_gradient = False
h_boot = data(
h_boot = layers.data(
shape=[self.input_dim],
data_type='float32',
name='h_boot',
**self.p_info)
h_boot.stop_gradient = False
rnn = StaticRNN(main_program=self.main_program)
rnn = layers.StaticRNN(main_program=self.main_program)
with rnn.step():
h_pre = rnn.memory(init=h_boot)
x_t = rnn.step_input(x)
h = scale(
x=elementwise_add(
h = layers.scale(
x=layers.elementwise_add(
x=h_pre, y=x_t, **self.p_info),
scale=self.py_rnn.scale,
**self.p_info)
......@@ -249,41 +246,41 @@ class RecurrentOpTest2(RecurrentOpTest1):
self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape)
self.output = mean(x=self.create_rnn_op(), **self.p_info)
self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)
def create_rnn_op(self):
x = data(
x = layers.data(
shape=[self.sent_len, self.batch_size, self.input_dim],
data_type='float32',
name='x',
append_batch_size=False,
**self.p_info)
x.stop_gradient = False
h_boot = data(
h_boot = layers.data(
shape=[self.input_dim],
data_type='float32',
name='h_boot',
**self.p_info)
h_boot.stop_gradient = False
rnn = StaticRNN(main_program=self.main_program)
rnn = layers.StaticRNN(main_program=self.main_program)
with rnn.step():
h_pre = rnn.memory(init=h_boot)
x_t = rnn.step_input(x)
temp_l = fc(input=x_t,
size=self.input_dim,
param_attr={'name': 'W'},
bias_attr=False,
**self.p_info)
temp_r = fc(input=h_pre,
size=self.input_dim,
param_attr={'name': 'U'},
bias_attr=False,
**self.p_info)
h = sigmoid(
x=elementwise_add(
temp_l = layers.fc(input=x_t,
size=self.input_dim,
param_attr={'name': 'W'},
bias_attr=False,
**self.p_info)
temp_r = layers.fc(input=h_pre,
size=self.input_dim,
param_attr={'name': 'U'},
bias_attr=False,
**self.p_info)
h = layers.sigmoid(
x=layers.elementwise_add(
x=temp_l, y=temp_r, **self.p_info),
**self.p_info)
......@@ -293,7 +290,7 @@ class RecurrentOpTest2(RecurrentOpTest1):
return rnn()
class RecurrentOpTest3(RecurrentOpTest1):
class RecurrentOpMultipleMemoryTest(RecurrentOpTest1):
'''
Test RNNOp with two memories
equation:
......@@ -310,8 +307,8 @@ class RecurrentOpTest3(RecurrentOpTest1):
class PySimpleRNN3(PyRNNBase):
def __init__(self, input_shape, output_shape):
super(RecurrentOpTest3.PySimpleRNN3, self).__init__(input_shape,
output_shape)
super(RecurrentOpMultipleMemoryTest.PySimpleRNN3, self).__init__(
input_shape, output_shape)
seq_len, batch_size, input_dim = input_shape
self.h_boot1 = np.random.normal(size=(batch_size,
......@@ -345,27 +342,27 @@ class RecurrentOpTest3(RecurrentOpTest1):
self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
self.py_rnn = RecurrentOpTest3.PySimpleRNN3(self.input_shape,
self.output_shape)
self.py_rnn = RecurrentOpMultipleMemoryTest.PySimpleRNN3(
self.input_shape, self.output_shape)
self.output = mean(x=self.create_rnn_op(), **self.p_info)
self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)
def create_rnn_op(self):
x = data(
x = layers.data(
shape=[self.sent_len, self.batch_size, self.input_dim],
data_type='float32',
name='x',
append_batch_size=False,
**self.p_info)
x.stop_gradient = False
h_boot1 = data(
h_boot1 = layers.data(
shape=[self.batch_size, self.input_dim],
data_type='float32',
name='h_boot1',
append_batch_size=False,
**self.p_info)
h_boot1.stop_gradient = False
h_boot2 = data(
h_boot2 = layers.data(
shape=[self.batch_size, self.input_dim],
data_type='float32',
name='h_boot2',
......@@ -373,15 +370,15 @@ class RecurrentOpTest3(RecurrentOpTest1):
**self.p_info)
h_boot2.stop_gradient = False
rnn = StaticRNN(main_program=self.main_program)
rnn = layers.StaticRNN(main_program=self.main_program)
with rnn.step():
h_pre1 = rnn.memory(init=h_boot1)
h_pre2 = rnn.memory(init=h_boot2)
x_t = rnn.step_input(x)
mem1 = scale(x=h_pre1, scale=1.0, **self.p_info)
mem2 = scale(x=h_pre2, scale=1.0, **self.p_info)
out = sums(input=[mem1, x_t, mem2], **self.p_info)
mem1 = layers.scale(x=h_pre1, scale=1.0, **self.p_info)
mem2 = layers.scale(x=h_pre2, scale=1.0, **self.p_info)
out = layers.sums(input=[mem1, x_t, mem2], **self.p_info)
rnn.update_memory(h_pre1, mem1)
rnn.update_memory(h_pre2, mem2)
......@@ -390,5 +387,70 @@ class RecurrentOpTest3(RecurrentOpTest1):
return rnn()
class RecurrentOpNoMemBootTest(RecurrentOpTest1):
'''
Test RNNOp with two memories
equation:
mem = x + mem_pre
y = mem
vars:
- x
memories:
- mem
outputs:
- y
'''
class PySimpleRNN4(PyRNNBase):
def __init__(self, input_shape, output_shape):
super(RecurrentOpNoMemBootTest.PySimpleRNN4, self).__init__(
input_shape, output_shape)
men_dim = input_shape
self.mems = np.zeros(shape=men_dim).astype("float32")
def step(self, step_id, x):
if step_id == 0:
pre_mem = np.zeros_like(x)
else:
pre_mem = self.mems[step_id - 1]
self.mems[step_id] = pre_mem + x
self.y[step_id] = self.mems[step_id]
input_dim = 1
batch_size = 1
sent_len = 2
def setUp(self):
self.setup_program()
self.data_field = {"x"}
self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(self.input_shape,
self.output_shape)
self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)
print self.main_program
def create_rnn_op(self):
x = layers.data(
shape=[self.sent_len, self.batch_size, self.input_dim],
data_type='float32',
name='x',
append_batch_size=False,
**self.p_info)
x.stop_gradient = False
rnn = layers.StaticRNN(main_program=self.main_program)
with rnn.step():
mem_pre = rnn.memory(shape=[-1, self.input_dim], batch_ref=x)
x_t = rnn.step_input(x)
mem = layers.elementwise_add(x=mem_pre, y=x_t, **self.p_info)
rnn.update_memory(mem_pre, mem)
rnn.output(mem)
return rnn()
if __name__ == '__main__':
unittest.main()
......@@ -29,6 +29,9 @@ class TestSeqAvgPool(OpTest):
self.check_output()
def test_check_grad(self):
# Remove MaxIndex after check_grad is refined.
self.outputs['MaxIndex'] = \
np.zeros(self.outputs['Out'].shape).astype('int32')
self.check_grad(["X"], "Out")
......@@ -85,31 +88,53 @@ class TestSeqSqrtPool2D(TestSeqAvgPool2D):
out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(len), (3, 17))
def test_check_grad(self):
# Remove MaxIndex after check_grad is refined.
self.outputs['MaxIndex'] = \
np.zeros(self.outputs['Out'].shape).astype('int32')
self.check_grad(["X"], "Out", max_relative_error=0.06)
class TestSeqMaxPool(TestSeqAvgPool):
def set_data(self):
self.op_type = 'sequence_pool'
x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
lod = [[0, 4, 5, 8, 13]]
for i in range(4):
l = lod[0][i + 1] - lod[0][i]
x[lod[0][i] + np.random.randint(l), :] += 2.0
self.inputs = {'X': (x, lod)}
out = np.zeros((4, 23)).astype('float32')
self.outputs = {'Out': out}
return x, lod, out
def compute(self, x, lod, out):
self.attrs = {'pooltype': "MAX"}
for i in range(4):
sub_x = x[lod[0][i]:lod[0][i + 1], :]
out[i] = np.amax(sub_x, axis=0)
def test_check_grad(self):
# Remove MaxPool2D from gradient check to confirm the success of CI.
return
class TestSeqMaxPool2D(TestSeqAvgPool2D):
def set_data(self):
self.op_type = 'sequence_pool'
x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32')
lod = [[0, 4, 5, 8, 13]]
self.inputs = {'X': (x, lod)}
for i in range(4):
l = lod[0][i + 1] - lod[0][i]
x[lod[0][i] + np.random.randint(l), :] += 1.0
out = np.zeros((4, 3, 11)).astype('float32')
self.outputs = {'Out': out}
return x, lod, out
def compute(self, x, lod, out):
self.attrs = {'pooltype': "MAX"}
for i in range(4):
sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 17))
def test_check_grad(self):
# Remove MaxPool2D from gradient check to confirm the success of CI.
return
sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 11))
out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
class TestSeqLastPool(TestSeqAvgPool):
......
......@@ -12,30 +12,30 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
def setUp(self):
self.op_type = "softmax_with_cross_entropy"
batch_size = 3
batch_size = 2
class_num = 37
logits = np.random.uniform(0.1, 1.0,
[batch_size, class_num]).astype("float32")
[batch_size, class_num]).astype("float64")
softmax = np.apply_along_axis(stable_softmax, 1, logits)
labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int32")
labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")
cross_entropy = np.asmatrix(
[[-np.log(softmax[i][labels[i][0]])]
for i in range(softmax.shape[0])],
dtype="float32")
dtype="float64")
self.inputs = {"Logits": logits, "Label": labels}
self.outputs = {
"Softmax": softmax.astype('float32'),
"Loss": cross_entropy.astype('float32')
"Softmax": softmax.astype("float64"),
"Loss": cross_entropy.astype("float64")
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
self.check_grad(["Logits"], "Loss")
class TestSoftmaxWithCrossEntropyOp2(OpTest):
......@@ -49,19 +49,19 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
class_num = 37
logits = np.random.uniform(0.1, 1.0,
[batch_size, class_num]).astype("float32")
[batch_size, class_num]).astype("float64")
softmax = np.apply_along_axis(stable_softmax, 1, logits)
labels = np.random.uniform(0.1, 1.0,
[batch_size, class_num]).astype("float32")
[batch_size, class_num]).astype("float64")
labels /= np.sum(labels, axis=1, keepdims=True)
cross_entropy = (-labels * np.log(softmax)).sum(
axis=1, keepdims=True).astype("float32")
axis=1, keepdims=True).astype("float64")
self.inputs = {"Logits": logits, "Label": labels}
self.outputs = {
"Softmax": softmax.astype('float32'),
"Loss": cross_entropy.astype('float32')
"Softmax": softmax.astype("float64"),
"Loss": cross_entropy.astype("float64")
}
self.attrs = {"soft_label": True}
......@@ -69,9 +69,8 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
self.check_output()
def test_check_grad(self):
self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
self.check_grad(["Logits"], "Loss")
if __name__ == "__main__":
exit(0) # FIXME: xe has bug
unittest.main()
......@@ -11,11 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Optimizers(update equation) for SGD method.
TODO(yuyang18): Complete comments.
"""
import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
import paddle.trainer_config_helpers.optimizers as v1_optimizers
......@@ -101,32 +96,37 @@ class Optimizer(object):
class Momentum(Optimizer):
"""
SGD Optimizer.
SGD is an optimization method, trying to find a neural network that
minimize the "cost/error" of it by iteration. In paddle's implementation
SGD Optimizer is synchronized, which means all gradients will be wait to
calculate and reduced into one gradient, then do optimize operation.
Momentum Optimizer.
The neural network consider the learning problem of minimizing an objective
function, that has the form of a sum
When sparse=False, the momentum update formula is as follows:
.. math::
Q(w) = \\sum_{i}^{n} Q_i(w)
v_{t} &= k * v_{t-1} - \\gamma_t / (g_{t} + \\lambda w_{t-1}) \\\\
w_{t} &= w_{t-1} + v_{t} \\\\
The value of function Q sometimes is the cost of neural network (Mean
Square Error between prediction and label for example). The function Q is
parametrised by w, the weight/bias of neural network. And weights is what to
be learned. The i is the i-th observation in (trainning) data.
where, :math:`k` is momentum, :math:`\\lambda` is decay rate,
:math:`\\gamma_t` is learning rate at the t'th iteration.
:math:`w_{t}` is the weight as the t'th iteration.
And the :math:`v_{t}` is the history momentum variable.
So, the SGD method will optimize the weight by
When sparse=True, the update scheme:
.. math::
w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
where :math:`\\eta` is learning rate. And :math:`n` is batch size.
\\alpha_t &= \\alpha_{t-1} / k \\\\
\\beta_t &= \\beta_{t-1} / (1 + \\lambda \\gamma_t) \\\\
u_t &= u_{t-1} - \\alpha_t \\gamma_t g_t \\\\
v_t &= v_{t-1} + \\tau_{t-1} \\alpha_t \\gamma_t g_t \\\\
\\tau_t &= \\tau_{t-1} + \\beta_t / \\alpha_t
where :math:`k` is momentum, :math:`\\lambda` is decay rate,
:math:`\\gamma_t` is learning rate at the t'th iteration.
:param momentum: the momentum factor.
:type momentum: float
:param sparse: with sparse support or not, False by default.
:type sparse: bool
"""
def __init__(self, momentum=None, sparse=False, **kwargs):
......@@ -146,7 +146,7 @@ class Adam(Optimizer):
m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
:param beta1: the :math:`\\beta_1` in equation.
:type beta1: float
......
from setuptools import setup, Distribution
from setuptools import setup, Distribution, Extension
class BinaryDistribution(Distribution):
def has_ext_modules(foo):
return True
......@@ -41,6 +41,7 @@ setup(name='paddlepaddle',
description='Parallel Distributed Deep Learning',
install_requires=setup_requires,
packages=packages,
ext_modules=[Extension('_foo', ['stub.cc'])],
package_data={
'paddle.v2.master': ['libpaddle_master.so'],
'paddle.v2.framework': ['core.so'],
......@@ -54,6 +55,5 @@ setup(name='paddlepaddle',
'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
},
scripts=paddle_bins,
distclass=BinaryDistribution,
data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册