Commit 89f95ea2 authored by dzhwinter

merge develop branch

@@ -141,12 +141,6 @@ else()
  set(THIRD_PARTY_BUILD_TYPE Release)
endif()
-if(WITH_MKL)
-  option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF)
-  if (MKL_SPLIT_GEMM)
-    add_definitions(-DPADDLE_MKL_SPLIT_GEMM)
-  endif()
-endif()
set(WITH_MKLML ${WITH_MKL})
if (NOT DEFINED WITH_MKLDNN)
  if (WITH_MKL AND AVX2_FOUND)
......
@@ -50,7 +50,11 @@ if(NOT WITH_PROFILER)
endif(NOT WITH_PROFILER)

if(NOT CMAKE_CROSSCOMPILING)
-  if(WITH_AVX AND AVX_FOUND)
+  if(WITH_AVX AND AVX512F_FOUND)
    set(SIMD_FLAG ${AVX512F_FLAG})
  elseif(WITH_AVX AND AVX2_FOUND)
    set(SIMD_FLAG ${AVX2_FLAG})
  elseif(WITH_AVX AND AVX_FOUND)
    set(SIMD_FLAG ${AVX_FLAG})
  elseif(SSE3_FOUND)
    set(SIMD_FLAG ${SSE3_FLAG})
@@ -104,15 +108,20 @@ if(WITH_GPU)
  endif()
  if(WITH_ANAKIN)
    if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
-      message(FATAL_ERROR "Anakin needs CUDA >= 8.0 to compile")
+      message(WARNING "Anakin needs CUDA >= 8.0 to compile. Force WITH_ANAKIN=OFF")
      set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDA >= 8.0." FORCE)
    endif()
    if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-      message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
+      message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF")
      set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE)
    endif()
-    set(ENV{CUDNN_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
-    set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY})
-    message(STATUS "cudnn include header is ${CUDNN_INCLUDE_DIR}/cudnn.h")
-    message(STATUS "cudnn library is ${CUDNN_LIBRARY}")
  endif()
  if(WITH_ANAKIN)
    # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR
    # is a softlink to real cudnn.h directory
    set(ENV{CUDNN_INCLUDE_DIR} "${CUDNN_INCLUDE_DIR}/")
    get_filename_component(CUDNN_LIBRARY_DIR ${CUDNN_LIBRARY} DIRECTORY)
    set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY_DIR})
  endif()
elseif(WITH_AMD_GPU)
  add_definitions(-DPADDLE_WITH_HIP)
......
@@ -2,6 +2,11 @@ if (NOT WITH_ANAKIN)
  return()
endif()

option(ANAKIN_ENABLE_OP_TIMER "Get more detailed information with Anakin op time" OFF)
if(ANAKIN_ENABLE_OP_TIMER)
  add_definitions(-DPADDLE_ANAKIN_ENABLE_OP_TIMER)
endif()

INCLUDE(ExternalProject)
set(ANAKIN_SOURCE_DIR ${THIRD_PARTY_PATH}/anakin)
# the anakin install dir is only default one now
@@ -11,13 +16,21 @@ set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR})
set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so)
set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so)

-# TODO(luotao): ANAKIN_MODLE_URL will move to demo ci later.
+# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL}")
+execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")

include_directories(${ANAKIN_INCLUDE})
include_directories(${ANAKIN_INCLUDE}/saber/)
include_directories(${ANAKIN_INCLUDE}/saber/core/)
include_directories(${ANAKIN_INCLUDE}/saber/funcs/impl/x86/)
include_directories(${ANAKIN_INCLUDE}/saber/funcs/impl/cuda/base/cuda_c/)

set(ANAKIN_COMPILE_EXTRA_FLAGS
    -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
@@ -25,9 +38,12 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
    -Wno-error=format-extra-args -Wno-format-extra-args
    -Wno-error=comment -Wno-comment
    -Wno-error=format -Wno-format
    -Wno-error=maybe-uninitialized -Wno-maybe-uninitialized
    -Wno-error=switch -Wno-switch
    -Wno-error=return-type -Wno-return-type
    -Wno-error=non-virtual-dtor -Wno-non-virtual-dtor
    -Wno-error=ignored-qualifiers
    -Wno-ignored-qualifiers
    -Wno-sign-compare
    -Wno-reorder
    -Wno-error=cpp)
@@ -38,7 +54,7 @@ ExternalProject_Add(
    DEPENDS ${MKLML_PROJECT}
    # Anakin codes error on Intel(R) Xeon(R) Gold 5117 CPU, temporary do not compile avx512 related code.
    GIT_REPOSITORY "https://github.com/luotao1/Anakin"
-    GIT_TAG "bcf17aabe7921ceb7bce591244b4f9dce7dba5c8"
+    GIT_TAG "211d1fc5d813d70c0c14072f9083cf25f40940ea"
    PREFIX ${ANAKIN_SOURCE_DIR}
    UPDATE_COMMAND ""
    CMAKE_ARGS -DUSE_GPU_PLACE=YES
@@ -47,6 +63,8 @@ ExternalProject_Add(
               -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
               -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
               -DCUDNN_ROOT=${CUDNN_ROOT}
               -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
               -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
               ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
)
......
@@ -54,7 +54,7 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS ${MKLDNN_DEPENDS}
    GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
+    GIT_TAG "64e03a1939e0d526aa8e9f2e3f7dc0ad8d372944"
    PREFIX ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND ""
    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
@@ -10,6 +10,7 @@ if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID
    set(SSE3_FLAG "-msse3")
    set(AVX_FLAG "-mavx")
    set(AVX2_FLAG "-mavx2")
    set(AVX512F_FLAG "-mavx512f")
elseif(MSVC)
    set(MMX_FLAG "/arch:MMX")
    set(SSE2_FLAG "/arch:SSE2")
@@ -81,5 +82,16 @@ int main()
    return 0;
}" AVX2_FOUND)

# Check AVX512F
set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
    __m512i a = _mm512_undefined_epi32();
    return 0;
}" AVX512F_FOUND)

set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
-mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
+mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
# Distributed Training with NCCL2
We design a pattern that can enable training with `ParallelExecutor` and
-using [NCCL2](https://developer.nvidia.com/nccl) as it's collective
+use [NCCL2](https://developer.nvidia.com/nccl) as its collective
communication library.

In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast`
@@ -9,14 +9,14 @@ to do multi GPU training. And if we initialize NCCL2 communicators as
ranks in a distributed environment, we can simply run the `ParallelExecutor`
as a distributed program! The only thing that may be different than in
the single node version is that we need to broadcast the NCCL unique ID
-to all the nodes, and initialize communicators using that ID, so NCCL2
-will know each other as ranks.
+to all the nodes and initialize communicators using that ID, so NCCL2
+can know each other as ranks.

To achieve this feature, we introduce a new operator: the `gen_nccl_id` op,
so we are ***not*** bound to running NCCL2 with MPI; we can run it on
-what ever platform you like.
+whatever platform you like.

-It have two running modes:
+It has two running modes:

1. Generate and broadcast mode, which should be used on trainer 0;
1. Listen and fetch mode, which should be used on trainers other than 0.
@@ -29,7 +29,7 @@ initialize NCCL communicator objects.

<img src="src/ncc2_design.png">

The above figure indicates the general process when training with NCCL2
-distributed. Each trainer have the number of communicators equal to the
+distributed. Each trainer has the number of communicators equal to the
number of GPUs, but the ranks should match the global ranks number: here
we have 8 GPUs in total, so `nranks==8`; for each trainer, the ranks should
be from 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1.
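
As a concrete illustration of the handshake described above, here is a minimal sketch using the raw NCCL C API that the `gen_nccl_id` op wraps. This is not the Paddle implementation; `BroadcastToTrainers` and `FetchFromTrainer0` are hypothetical stand-ins for the RPC step.

```cpp
#include <cuda_runtime.h>
#include <nccl.h>

// Hypothetical RPC helpers standing in for the gen_nccl_id op's two modes.
void BroadcastToTrainers(const ncclUniqueId* id);  // generate-and-broadcast
void FetchFromTrainer0(ncclUniqueId* id);          // listen-and-fetch

ncclComm_t InitRank(int global_rank, int nranks, int local_gpu,
                    bool generates_id) {
  ncclUniqueId id;
  if (generates_id) {
    ncclGetUniqueId(&id);      // trainer 0 generates the unique ID once
    BroadcastToTrainers(&id);
  } else {
    FetchFromTrainer0(&id);    // other trainers wait for the ID
  }
  cudaSetDevice(local_gpu);
  ncclComm_t comm;
  // Every rank passes the same id; global_rank follows the global numbering
  // (0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1 in the 8-GPU example above).
  ncclCommInitRank(&comm, nranks, id, global_rank);
  return comm;
}
```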
@@ -28,7 +28,7 @@ def get_symbol(num_classes=10, **kwargs):

-Variable here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own NodeAttr. There is an op field in the NodeAttr class; when a Symbol represents a Variable (often input data), the op field is null.
+Variable here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own AnyAttr. There is an op field in the AnyAttr class; when a Symbol represents a Variable (often input data), the op field is null.

Symbol contains a data member, std::vector<NodeEntry> outputs, and NodeEntry contains a pointer to Node. We can follow the Node pointers to recover the whole Graph.
......
@@ -36,19 +36,19 @@
<tbody>
<tr>
<td>OpProtoMake definition </td>
-<td>`.cc` file; a backward Op does not need to define an OpProtoMake </td>
+<td>.cc file; a backward Op does not need to define an OpProtoMake </td>
</tr>
<tr>
<td>Op definition </td>
-<td>`.cc` file</td>
+<td>.cc file</td>
</tr>
<tr>
<td>Kernel implementation </td>
-<td>A Kernel shared by CPU and CUDA is implemented in a `.h` file; otherwise the CPU implementation goes in a `.cc` file and the CUDA implementation in a `.cu` file.</td>
+<td>A Kernel shared by CPU and CUDA is implemented in a .h file; otherwise the CPU implementation goes in a .cc file and the CUDA implementation in a .cu file.</td>
</tr>
<tr>
<td>Op registration </td>
-<td>Op registration goes in a `.cc` file; CPU Kernel registration goes in a `.cc` file and CUDA Kernel registration in a `.cu` file</td>
+<td>Op registration goes in a .cc file; CPU Kernel registration goes in a .cc file and CUDA Kernel registration in a .cu file</td>
</tr>
</tbody>
</table>
@@ -119,10 +119,29 @@ $$Out = scale*X$$

This example contains `AddAttr<AttrType>("scale", "...").SetDefault(1.0);`: it adds a `scale` coefficient as an attribute and sets its default value to 1.0.

### Defining the GradProtoMaker class
Every Op must have a corresponding GradProtoMaker. If no custom GradProtoMaker is defined for a forward Op, fluid provides DefaultGradProtoMaker, which by default registers all inputs and outputs (Input, Output, Output@Grad, and so on); carrying variables that are not actually needed wastes GPU memory.
The following example defines the GradProtoMaker of ScaleOp.

```cpp
class ScaleGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

  std::unique_ptr<framework::OpDesc> Apply() const override {
    auto *grad_op = new framework::OpDesc();
    grad_op->SetType("scale");
    grad_op->SetInput("X", OutputGrad("Out"));
    grad_op->SetOutput("Out", InputGrad("X"));
    grad_op->SetAttr("scale", GetAttr("scale"));
    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
};
```
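
In the actual codebase the maker is then wired in at Op registration time. A hedged sketch of that registration (check scale_op.cc for the authoritative form):

```cpp
REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker);
```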
### Defining the Operator class

-The following points implement the definition of MulOp:
+The following implements the definition of MulOp:

```cpp
class MulOp : public framework::OperatorWithKernel {
@@ -334,3 +353,83 @@ ctest -R test_mul_op
- The type name used when registering an Op must be identical to the Op's name. Registering `REGISTER_OPERATOR(B, ...)` inside `A_op.cc` is not allowed and will break the unit tests.
- If an Op has no CUDA Kernel, do not create an empty `*_op.cu`; it will break the unit tests.
- If several Ops depend on shared helper functions, put them in a file that does not follow the `*_op.*` naming pattern, such as `gather.h`.
### Notes on using PADDLE_ENFORCE

When implementing an Op, data validity checks should use the PADDLE_ENFORCE and PADDLE_ENFORCE_EQ family of macros. The basic format is:

```
PADDLE_ENFORCE(expression, error message)
PADDLE_ENFORCE_EQ(object A, object B, error message)
```

If the expression is true, or object A equals object B, the check passes; otherwise the program aborts and the error message is reported back to the user.
To keep the messages friendly and easy to understand, developers need to write them with care.

#### General principle
Every place that uses a PADDLE_ENFORCE or PADDLE_ENFORCE_** check must carry an appropriately detailed explanation; the **error message** must not be empty!

#### Standards for writing messages
1. [required] What went wrong? Why is it wrong?
    - e.g. `ValueError: Mismatched label shape`
2. [optional] What input was expected? What was actually received?
    - e.g. `Expected labels dimension=1. Received 4.`
3. [optional] Can a fix be suggested?
    - e.g. `Suggested Fix: If your classifier expects one-hot encoding label, check your n_classes argument to the estimator and/or the shape of your label. Otherwise, check the shape of your label.`

If not every point is necessary, or a brief description already makes these points clear, a shorter message is fine; a combined example follows.
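
Putting the three points together, a hypothetical check (not taken from the codebase; it assumes printf-style format arguments, which PADDLE_ENFORCE supports):

```cpp
// Hypothetical: the name label_dims is illustrative only.
PADDLE_ENFORCE_EQ(label_dims.size(), 1UL,
                  "Mismatched label shape: expected labels dimension=1, "
                  "received %d. If your labels are one-hot encoded, "
                  "flatten them before feeding this operator.",
                  label_dims.size());
```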
##### FAQ: typical problems

1. No error message, or a message too brief to give the user a useful hint!

  Bad example 1: no message at all
  ```
  PADDLE_ENFORCE(ctx->HasInput("X"), "");
  ```

  Bad example 2: message too brief
  ```
  PADDLE_ENFORCE(i != nullptr, "i must be set");  // what is i?
  ```

2. Using developer-defined variable abbreviations in the message, which users cannot be expected to understand!

  Bad example:
  ```
  PADDLE_ENFORCE(forward_pd != nullptr,
                 "Fail to find eltwise_fwd_pd in device context");  // users may not know what eltwise_fwd_pd is
  ```

3. Calling an illegal interface inside an Op: an Op must not contain Output = ShareDataWith(Input).

  Bad example:
  ```cpp
  auto *out = ctx.Output<framework::LoDTensor>("Out");
  auto *in = ctx.Input<framework::LoDTensor>("X");
  out->ShareDataWith(*in);
  ```

  If Output = ShareDataWith(Input) appears inside an Op, it effectively adds a hidden edge to the operator graph connecting Input and Output. This edge cannot be expressed in graph analysis, so it breaks graph-based optimizations; a copy-based alternative is sketched after this list.
4. Performance practice in Op implementations

  Calling Eigen operations such as broadcast and chop can be several times slower than a handwritten CUDA kernel. In such cases the CPU implementation can still reuse Eigen, while the GPU implementation should be a dedicated CUDA kernel.
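
As referenced in item 3 above, a minimal copy-based sketch that avoids ShareDataWith, assuming the `TensorCopy` helper from `paddle/fluid/framework/tensor_util.h`:

```cpp
auto *out = ctx.Output<framework::LoDTensor>("Out");
auto *in = ctx.Input<framework::LoDTensor>("X");
out->mutable_data<T>(ctx.GetPlace());
// Copy instead of aliasing, so no hidden Input->Output edge is introduced
// and graph analysis sees the real dependency.
framework::TensorCopy(*in, ctx.GetPlace(), out);
```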
#### Special notes on InferShape check messages

- For checks on input and output variables, uniformly follow this format:

  `Input(variable name) of OP-name operator should not be null.`

  Good example:
  ```
  PADDLE_ENFORCE(ctx->HasInput("Input"),
                 "Input(Input) of LSTMP operator should not be null.");
  ```

- For the input and output checks of a backward Op, spell out the backward Op's name.

  Good example:
  ```
  PADDLE_ENFORCE(ctx->HasInput("X"),
                 "Input(X) of LoDResetGrad operator should not be null.");
  ```
@@ -7,7 +7,7 @@

The Eigen Tensor module provides strong support for element-wise computation, and a single piece of code can run on both CPU and GPU. However, Eigen Tensor is a module under active development, so its tests may be incomplete and its documentation sparse.

-For a detailed introduction to the Eigen Tensor module, see [Document 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [Document 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md)
+For a detailed introduction to the Eigen Tensor module, see the [Eigen documentation](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md)

## paddle::framework::Tensor
......
# Distributed Training with NCCL2 and RDMA

-When doing distributed multi-GPU training, network bandwith often becomes the
-bottle neck. We introduce a way to use NCCL2 to do such training job to
-achieve best performace.
+When doing distributed multi-GPU training, network bandwidth often becomes the
+bottleneck. We introduce a way to use NCCL2 to do such training jobs to
+achieve the best performance.

-## Prepare Hardwares with RDMA and Multiple GPUs
+## Prepare Hardware with RDMA and Multiple GPUs

-I'm using two Linux servers each of them is installed with 8 GPUs and
+I'm using two Linux servers, each of them installed with 8 GPUs and
one 100Gb RDMA card.
Base environment is:
@@ -25,7 +25,7 @@ In general, the steps including:
1. Use docker to run tests and make sure GPUs and RDMA can work inside
   the container.

-I'll ommit section "Install GPU drivers" because we can find it easily
+I'll omit the section "Install GPU drivers" because we can find it easily
somewhere else.

### Install RDMA drivers
@@ -33,7 +33,7 @@ somewhere else.
For my case, I've got two machines with device
"Mellanox Technologies MT27700 Family [ConnectX-4]" installed. The OS was
"CentOS 7.4" and I updated the kernel to version 4.4 so that docker can
-work with latest overlay2 filesystem.
+work with the latest overlay2 filesystem.

***NOTE: before you start, make sure you have a way to get a console
of the server other than ssh because we may need to re-configure the
@@ -45,14 +45,14 @@ network device.***
1. Run `./mlnxofedinstall --add-kernel-support` in the software package.
1. Run `/etc/init.d/openibd restart` to make everything work; note that
   this operation may cause the network to go down if you are using this
-   RDMA device as default network device and use ssh to login the server.
+   RDMA device as the default network device and use ssh to log in to the server.
1. Re-configure the network interface, for example:
   `ifconfig eth2 192.168.16.30/20 up`, then add routes if needed:
   `ip route add default via 192.168.16.1 dev eth2`.
1. Do the same thing on the other node.
1. Use `ping` to test if the two nodes have a typical ICMP connection.
1. Use either `udaddy` or `ib_write_bw` to test that the network connection is
-   ready and have the desired bandwith.
+   ready and has the desired bandwidth.

### Prepare Docker Image to Run RDMA Programs
@@ -60,7 +60,7 @@ network device.***
   package in it.
1. Start a docker container and mount GPU driver libs into it (you can
   skip this step if you are using nvidia-docker).
-1. Mount RDMA dirvers and libs into the docker image (see the section below),
+1. Mount RDMA drivers and libs into the docker image (see the section below),
   also `udaddy` and `ib_write_bw` if needed.
1. Mount GPU devices and RDMA devices into the container using `--device`
   or just use privileged mode `--privileged`.
......
@@ -78,7 +78,7 @@ paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', '
paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True))
-paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0))
@@ -153,6 +153,7 @@ paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'n
paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -161,6 +162,7 @@ paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
+paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
@@ -190,7 +192,7 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None
paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None)
@@ -250,7 +252,6 @@ paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwarg
paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
......
@@ -114,10 +114,19 @@ else()
  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
endif()

if (NOT WIN32)
-  cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass)
+  cc_library(parallel_executor SRCS parallel_executor.cc DEPS
+        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
+        graph graph_viz_pass multi_devices_graph_pass
+        multi_devices_graph_print_pass multi_devices_graph_check_pass
+        fast_threaded_ssa_graph_executor)
endif() # NOT WIN32

cc_library(prune SRCS prune.cc DEPS framework_proto)
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace framework {
template <typename T, size_t N>
class Array {
static_assert(N > 0, "The size of array must be larger than 0");
public:
HOSTDEVICE Array() {}
HOSTDEVICE explicit Array(const T &val) {
for (size_t i = 0; i < N; ++i) data_[i] = val;
}
HOSTDEVICE const T *Get() const { return data_; }
HOSTDEVICE T *GetMutable() { return data_; }
HOSTDEVICE T &operator[](size_t index) { return data_[index]; }
HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; }
HOSTDEVICE constexpr size_t size() const { return N; }
private:
T data_[N];
};
} // namespace framework
} // namespace paddle
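
A quick usage sketch for the new Array class above (illustration only, not part of the commit):

```cpp
#include "paddle/fluid/framework/array.h"

void ArrayExample() {
  // The fill-constructor sets all N elements to the given value.
  paddle::framework::Array<int, 3> dims(1);
  dims[2] = 42;                  // element access, usable on host and device
  size_t n = dims.size();        // always N (3 here), a compile-time constant
  int *raw = dims.GetMutable();  // raw pointer view, e.g. for CUDA kernels
  (void)n;
  (void)raw;
}
```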
@@ -42,3 +42,5 @@ cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_b
cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor)
#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
#            device_context reduce_op_handle )
cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
        DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
@@ -19,10 +19,13 @@ namespace framework {
namespace details {

struct ExecutionStrategy {
  enum ExecutorType { kDefault = 0, kExperimental = 1 };

  size_t num_threads_{0};
  bool use_cuda_{true};
  bool allow_op_delay_{false};
  size_t num_iteration_per_drop_scope_{100};
  ExecutorType type_{kDefault};
};

}  // namespace details
......
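
A hedged sketch of how a caller might opt into the new executor through this field; everything except `ExecutionStrategy` and `ExecutorType` is illustrative:

```cpp
#include "paddle/fluid/framework/details/execution_strategy.h"

paddle::framework::details::ExecutionStrategy MakeFastStrategy() {
  using paddle::framework::details::ExecutionStrategy;
  ExecutionStrategy strategy;
  strategy.num_threads_ = 4;                          // worker thread count
  strategy.type_ = ExecutionStrategy::kExperimental;  // pick the fast executor
  return strategy;
}
```

A ParallelExecutor built with such a strategy would construct the FastThreadedSSAGraphExecutor added below instead of the default threaded executor.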
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
namespace paddle {
namespace framework {
namespace details {
FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph)
: strategy_(strategy),
local_scopes_(local_scopes),
places_(places),
graph_(std::move(graph)),
pool_(strategy.num_threads_ +
      1),  // add one more thread for generating op_deps
fetch_ctxs_(places) {
auto &ops = graph_->Get<details::GraphOps>("ops");
for (auto &op : ops) {
int dep = static_cast<int>(op->NotReadyInputSize());
op_deps_.emplace(op.get(), dep);
if (dep == 0) {
bootstrap_ops_.emplace_back(op.get());
}
}
PrepareAtomicOpDeps();
}
FeedFetchList FastThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) {
std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>
op_deps = atomic_op_deps_.get();
PrepareAtomicOpDeps();
paddle::framework::FeedFetchList fetches;
fetches.resize(fetch_tensors.size());
std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
std::vector<std::unique_ptr<ir::Node>> fetch_nodes;
std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
for (auto &fetch_var_name : fetch_tensors) {
for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) {
fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
}
}
}
for (size_t i = 0; i < fetch_tensors.size(); ++i) {
auto &var_name = fetch_tensors[i];
auto fetched_var_it = fetched_vars.find(var_name);
PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
"Cannot find fetched variable.(Perhaps the main_program "
"is not set to ParallelExecutor)");
auto &vars = fetched_var_it->second;
fetch_nodes.emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation));
auto *op = new FetchOpHandle(fetch_nodes.back().get(), &fetches, i,
&local_scopes_);
fetch_ops.emplace_back(op);
for (auto &p : places_) {
op->SetDeviceContext(p, fetch_ctxs_.Get(p));
}
for (auto *var : vars) {
op->AddInput(var);
}
(*op_deps)[op] = static_cast<int>(op->NotReadyInputSize());
}
size_t num_complete = 0;
remaining_ = 0;
BlockingQueue<size_t> complete_q;
for (auto op : bootstrap_ops_) {
RunOpAsync(op_deps.get(), op, &complete_q);
}
while (num_complete != op_deps->size()) {
size_t num_comp = complete_q.Pop();
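    // -1UL is a sentinel pushed by RunOpAsync when an op throws; drain the
    // queue until no ops remain in flight, then rethrow the exception.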
if (num_comp == -1UL) {
int remaining = 0;
while (true) {
remaining = remaining_;
if (remaining == 0) {
break;
}
for (int i = 0; i < remaining; ++i) {
complete_q.Pop();
}
}
exception_.ReThrow();
}
num_complete += num_comp;
}
// Wait FetchOps.
if (!fetch_ops.empty()) {
fetch_ops.clear();
}
return fetches;
}
void FastThreadedSSAGraphExecutor::RunOpAsync(
std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
OpHandleBase *op, BlockingQueue<size_t> *complete_q) {
++remaining_;
this->pool_.enqueue([=] {
OpHandleBase *op_to_run = op;
size_t complete = 0;
while (op_to_run != nullptr) {
try {
op_to_run->Run(strategy_.use_cuda_);
++complete;
} catch (...) {
exception_.Catch(std::current_exception());
--remaining_;
complete_q->Push(-1UL);
return;
}
auto &outputs = op_to_run->Outputs();
op_to_run = nullptr;
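      // Run the first ready successor inline on this thread; spawn any
      // additional ready successors asynchronously.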
for (auto &output : outputs) {
for (auto &pending_op : output->PendingOps()) {
std::atomic<int> &deps = op_deps->at(pending_op);
if (deps.fetch_sub(1) == 1) { // pending_op ready
if (op_to_run == nullptr) {
op_to_run = pending_op;
} else {
this->RunOpAsync(op_deps, pending_op, complete_q);
}
}
}
}
}
--remaining_;
complete_q->Push(complete);
});
}
void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
atomic_op_deps_ = pool_.enqueue([&] {
std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps =
new std::unordered_map<OpHandleBase *, std::atomic<int>>;
for (auto &pair : op_deps_) {
(*op_deps)[pair.first] = pair.second;
}
return std::unique_ptr<
std::unordered_map<OpHandleBase *, std::atomic<int>>>(op_deps);
});
}
const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <atomic>
#include <future>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "ThreadPool.h"
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
namespace paddle {
namespace framework {
class Scope;
namespace details {
class OpHandleBase;
class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
public:
FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph);
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
const ir::Graph &Graph() const override;
private:
ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
std::unique_ptr<ir::Graph> graph_;
std::unordered_map<OpHandleBase *, int> op_deps_;
std::vector<OpHandleBase *> bootstrap_ops_;
::ThreadPool pool_;
platform::DeviceContextPool fetch_ctxs_;
std::atomic<int> remaining_;
void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
OpHandleBase *op, BlockingQueue<size_t> *complete_q);
void PrepareAtomicOpDeps();
std::future<
std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
atomic_op_deps_;
ExceptionHolder exception_;
};
} // namespace details
} // namespace framework
} // namespace paddle
@@ -763,6 +763,8 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
// Create RPC related op handles that connects its in ops and out ops.
void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
                                          ir::Node *node) const {
  // FIXME(typhoonzero): Cleanup this deps for both sync mode and async mode
  // put them into transpiler.
  int op_dev_id = -1;
  if (node->Op()->Type() == "send") {
    // TODO(paddle-dev): getting the first var is not safe.
@@ -771,26 +773,42 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
                   "This hack no longer holds, please fix.");
    // the variable name which contains .block means it was splited by
    // split_byref op
-    // so that we can balance the variable blocks to all the pserver
-    // instances.
    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
        node->inputs[0]->Name().find(".block") == std::string::npos) {
      std::vector<std::string> input_var_names;
      for (ir::Node *n : node->inputs) {
        input_var_names.push_back(n->Name());
      }
-      op_dev_id = GetAppropriateDeviceID(input_var_names);
      auto send_param_grad = boost::get<std::vector<std::string>>(
          node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
      PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U);
      op_dev_id = GetAppropriateDeviceID({send_param_grad[1]});
      VLOG(10) << "send grad " << input_var_names[0] << " origin "
               << send_param_grad[1] << " place: " << op_dev_id;
      for (auto &varname : input_var_names) {
        result->Get<ShardedVarDevice>(kShardedVarDevice)
            .emplace(varname, op_dev_id);
      }
      result->Get<ShardedVarDevice>(kShardedVarDevice)
          .emplace(send_param_grad[1], op_dev_id);
    }
  } else if (node->Op()->Type() == "recv") {
    std::vector<std::string> output_var_names;
    for (ir::Node *n : node->outputs) {
      output_var_names.push_back(n->Name());
    }
    auto recv_param_grad = boost::get<std::vector<std::string>>(
        node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
    // FIXME(typhoonzero): assume each recv op output one param
    // Use the same place as send.
    if (recv_param_grad.size() == 2U) {
      op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]);
      VLOG(10) << "recv param " << recv_param_grad[0]
               << " get grad place: " << recv_param_grad[1]
               << " place: " << op_dev_id;
    } else {
      op_dev_id = GetAppropriateDeviceID(output_var_names);
    }
    for (auto &varname : output_var_names) {
      result->Get<ShardedVarDevice>(kShardedVarDevice)
          .emplace(varname, op_dev_id);
......
@@ -54,7 +54,8 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
           << "\\n"
           << var_handle_ptr->place_ << "\\n"
-           << var_handle_ptr->version_ << "\"]" << std::endl;
+           << "scope: " << var_handle_ptr->scope_idx_ << "\\n"
+           << "v" << var_handle_ptr->version_ << "\"]" << std::endl;
    } else if (dummy_ptr) {
      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
    }
......
@@ -158,6 +158,16 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p,
#endif
}

size_t OpHandleBase::NotReadyInputSize() const {
  std::unordered_set<VarHandleBase *> res;
  for (auto *var : inputs_) {
    if (var->GeneratedOp() != nullptr) {
      res.emplace(var);
    }
  }
  return res.size();
}

}  // namespace details
}  // namespace framework
}  // namespace paddle
@@ -81,6 +81,8 @@ class OpHandleBase {
    return res.size();
  }

  size_t NotReadyInputSize() const;

  const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }

  size_t NoDummyInputSize() const;
......
@@ -5,8 +5,12 @@ cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits)
cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detecter)
cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass)

cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detecter graph pass graph_traits framework_proto)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
bool VarOutLinksToOp(Node* node, const std::string& op_type) {
for (auto* out : node->outputs) {
if (out->IsOp() && out->Op()->Type() == op_type) {
return true;
}
}
return false;
}
void BuildFCPattern(PDPattern* pattern) {
// make sure the selected MUL op has one input argument is a parameter.
auto* mul_parameter_var = pattern->NewNode(
[](Node* node) {
return node->IsVar() && node->outputs.size() == 1UL &&
node->outputs.front()->Op()->Type() == "mul" && node->Var() &&
node->Var()->Persistable(); // check is a parameter
},
"mul_weight" /*name*/);
auto* mul_tmp_input_var = pattern->NewNode(
[](Node* node) {
bool result =
node->IsVar() && node->outputs.size() >= 1UL && node->Var() &&
!node->Var()->Persistable(); // this input is not a parameter.
if (!result) return false;
// check whether one output is MUL op.
for (auto* op : node->outputs) {
if (op->IsOp() && op->Op()->Type() == "mul") return true;
}
return false;
},
"mul_tmp_var" /*name*/);
// select a MUL op
auto* mul_op = pattern->NewNode(
[](Node* node) {
return node->IsOp() && // start from an Op
node->Op()->Type() == "mul"; // type is mul
// the output should be consumed only by one element_add, that check
// leaves in a Var PDNode.
},
"mul" /*name*/);
// make sure the MUL op's output has only one consumer and links to an
// ELEMENTWISE_ADD op.
auto* mul_out_var = pattern->NewNode(
[](Node* node) {
return node->IsVar() && // starts from a Var
node->outputs.size() == 1UL && // only has one consumer
node->outputs.front()->IsOp() && // check basic logic
node->Var() && // not a ControlDepVar
node->outputs.front()->Op()->Type() ==
"elementwise_add"; // a very strong validation
},
"mul_out");
// this check is not essential, just to make the corresponding variable Node
// retrieval easier.
auto* elementwise_add_tmp_var = pattern->NewNode(
[](Node* node) {
return node->IsVar() && node->outputs.size() >= 1UL && node->Var() &&
VarOutLinksToOp(node, "elementwise_add");
},
"elementwise_add_tmpvar");
// select an ELEMENTWISE_ADD op
auto* elementwise_add_op = pattern->NewNode(
[](Node* node) {
return node->IsOp() && node->Op()->Type() == "elementwise_add";
},
"elementwise_add" /*name*/);
// get the ELEMENTWISE_ADD op's output
auto* elementwise_add_out_var = pattern->NewNode(
[](Node* node) {
return node->IsVar() && node->inputs.size() == 1UL && node->Var() &&
node->inputs.front()->Op()->Type() == "elementwise_add";
},
"elementwise_add_out");
pattern->AddEdge(mul_parameter_var, mul_op);
pattern->AddEdge(mul_tmp_input_var, mul_op);
pattern->AddEdge(mul_op, mul_out_var);
pattern->AddEdge(mul_out_var, elementwise_add_op);
pattern->AddEdge(elementwise_add_tmp_var, elementwise_add_op);
pattern->AddEdge(elementwise_add_op, elementwise_add_out_var);
}
// Replace the node `from` in the links to `to`
bool LinksReplace(std::vector<Node*>* links, Node* from, Node* to) {
for (auto*& n : *links) {
if (n == from) {
n = to;
return true;
}
}
return false;
}
std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
std::unordered_set<Node*> nodes2delete;
GraphPatternDetecter gpd;
BuildFCPattern(gpd.mutable_pattern());
#define GET_NODE(id) \
PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetriveNode(#id)), \
"pattern has no Node called %s", #id); \
auto* id = subgraph.at(gpd.pattern().RetriveNode(#id)); \
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
auto handler = [&](const GraphPatternDetecter::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle FC fuse";
// Currently, there is no FC op available, so I will just simulate the
// scenario.
// FC's fusion is simple, just op fuse, no need to process the
// parameters.
GET_NODE(mul_tmp_var); // x
GET_NODE(mul_weight); // Y
GET_NODE(elementwise_add_tmpvar); // bias
GET_NODE(elementwise_add_out); // Out
GET_NODE(mul); // MUL op
GET_NODE(elementwise_add); // ELEMENT_ADD op
GET_NODE(mul_out); // tmp
#undef GET_NODE
// Create an FC Node.
OpDesc desc;
std::string fc_x_in = mul_tmp_var->Name();
std::string fc_Y_in = mul_weight->Name();
std::string fc_bias_in = elementwise_add_tmpvar->Name();
std::string fc_out = elementwise_add_out->Name();
desc.SetInput("Input", std::vector<std::string>({fc_x_in}));
desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
desc.SetOutput("Out", std::vector<std::string>({fc_out}));
desc.SetType("fc");
auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied.
fc_node->inputs =
std::vector<Node*>({mul_tmp_var, mul_weight, elementwise_add_tmpvar});
fc_node->outputs.push_back(elementwise_add_out);
// Update link relations
PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node));
PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node));
PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs,
elementwise_add, fc_node));
PADDLE_ENFORCE(
LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node));
// Drop old nodes
graph->RemoveNode(mul);
graph->RemoveNode(elementwise_add);
graph->RemoveNode(mul_out); // tmp variable
};
gpd(graph.get(), handler);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Fuse the MUL and ELEMENTWISE_ADD to a FCOp.
*/
class FCFusePass : public Pass {
public:
virtual ~FCFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetInput("Xs", inputs);
op->SetOutput("Ys", outputs);
}
// a->OP0->b
// a->OP1->c
// (b, c)->mul->d
// (d, e)->elementwise_add->f
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v : std::vector<std::string>({"a", "b", "c", "d", "e", "f"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
if (v == "c") {
var->SetPersistable(true);
}
}
SetOp(&prog, "OP0", std::vector<std::string>({"a"}),
std::vector<std::string>({"b"}));
SetOp(&prog, "OP1", std::vector<std::string>({"a"}),
std::vector<std::string>({"c"}));
SetOp(&prog, "mul", std::vector<std::string>({"b", "c"}),
std::vector<std::string>({"d"}));
SetOp(&prog, "elementwise_add", std::vector<std::string>({"d", "e"}),
std::vector<std::string>({"f"}));
return prog;
}
TEST(FCFusePass, basic) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("fc_fuse_pass");
int pre_nodes = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int after_nodes = graph->Nodes().size();
  // Remove 3 nodes: MUL, ELEMENTWISE_ADD, mul_out
// Add 1 Node: FC
EXPECT_EQ(pre_nodes - 2, after_nodes);
// Assert fc op in newly generated graph
int fc_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()->Type() == "fc") {
++fc_count;
}
}
EXPECT_EQ(fc_count, 1);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(fc_fuse_pass);
@@ -117,7 +117,15 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
    }
    // For output args, always create a new var.
    for (auto &each_var_name : op->OutputArgumentNames()) {
-     ir::Node *var = CreateVarNode(all_vars.at(each_var_name));
+     ir::Node *var = nullptr;
+     if (all_vars.count(each_var_name) != 0) {
+       var = CreateVarNode(all_vars.at(each_var_name));
+     } else {
+       // Operation output vars can be @EMPTY@. For example, while_grad
+       // can have multi @EMPTY@ outputs with no VarDesc.
+       // TODO(panyx0718): Add a test.
+       var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable);
+     }
      var_nodes[each_var_name].push_back(var);
      node->outputs.push_back(var);
      var->inputs.push_back(node);
@@ -208,7 +216,8 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
      // Add write after write dependence
      ir::Node *upstream_op =
          (*it_old)->inputs.empty() ? nullptr : (*it_old)->inputs[0];
-     if (upstream_op) {
+     // TODO(zcd): Add a test.
+     if (upstream_op && upstream_op != write_op) {
        ir::Node *dep_var = CreateControlDepVar();
        write_op->inputs.push_back(dep_var);
        upstream_op->outputs.push_back(dep_var);
...
@@ -98,11 +98,13 @@ class Graph {
  // Create a normal variable with non-null VarDesc.
  ir::Node *CreateVarNode(VarDesc *var_desc) {
+   PADDLE_ENFORCE(var_desc);
    return AddNode(new ir::Node(var_desc));
  }

  // Create a normal runnable operator with OpDesc.
  ir::Node *CreateOpNode(OpDesc *op_desc) {
+   PADDLE_ENFORCE(op_desc);
    return AddNode(new ir::Node(op_desc));
  }
@@ -134,6 +136,12 @@ class Graph {
    return ret;
  }

+ void RemoveNode(ir::Node *node) {
+   PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
+   node_set_.erase(node);
+   nodes_.erase(node);
+ }
+
 private:
  // This method takes ownership of `node`.
  ir::Node *AddNode(ir::Node *node) {
@@ -143,14 +151,8 @@ class Graph {
    return node;
  }

- void RemoveNode(ir::Node *node) {
-   PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
-   node_set_.erase(node);
-   nodes_.erase(node);
- }
-
  // NOTE: program_ shouldn't be exposed to user.
- const ProgramDesc &program_;
+ const ProgramDesc program_;
  std::map<std::string, boost::any> attrs_;
  std::map<std::string, std::function<void(void)>> attr_dels_;
  std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_;
...
...@@ -104,7 +104,7 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList( ...@@ -104,7 +104,7 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
for (auto &adj_n : var->inputs) { for (auto &adj_n : var->inputs) {
PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
adj_list[n].insert(adj_n); adj_list[n].insert(adj_n);
VLOG(3) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n) VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
<< " -> " << n->Name() << reinterpret_cast<void *>(n) << " -> " << n->Name() << reinterpret_cast<void *>(n)
<< " via " << var->Name() << reinterpret_cast<void *>(var); << " via " << var->Name() << reinterpret_cast<void *>(var);
} }
......
@@ -25,12 +25,30 @@ namespace paddle {
namespace framework {
namespace ir {

+size_t PDPattern::id_ = 0UL;
+
PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) {
+  if (!name.empty()) {
+    PADDLE_ENFORCE_EQ(node_map_.count(name), 0,
+                      "PDNode's name should be unique, get duplicate [%s]",
+                      name);
+  }
+
  nodes_.emplace_back(new PDNode(std::move(teller), name));
  auto* cur = nodes_.back().get();
+  node_map_[name] = cur;
  return cur;
}

+PDNode* PDPattern::RetriveNode(const std::string& id) const {
+  auto it = node_map_.find(id);
+  if (it == node_map_.end()) {
+    return nullptr;
+  }
+  return it->second;
+}
+
void PDPattern::AddEdge(PDNode* a, PDNode* b) {
  PADDLE_ENFORCE(a);
  PADDLE_ENFORCE(b);
@@ -51,15 +69,18 @@ void GraphPatternDetecter::operator()(Graph* graph,
}

bool GraphPatternDetecter::MarkPDNodesInGraph(const ir::Graph& graph) {
+  VLOG(4) << "mark pdnodes in graph";
  if (graph.Nodes().empty()) return false;

  for (auto& node : GraphTraits::DFS(graph)) {
    for (const auto& pdnode : pattern_.nodes()) {
      if (pdnode->Tell(&node)) {
+        VLOG(4) << "pdnode " << pdnode->name() << " marked";
        pdnodes2nodes_[pdnode.get()].insert(&node);
      }
    }
  }
+  VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
  return !pdnodes2nodes_.empty();
}
@@ -67,10 +88,20 @@ struct HitGroup {
  std::unordered_map<PDNode*, Node*> roles;

  bool Match(Node* node, PDNode* pat) {
+    if (nodes_.count(node)) {
+      if (!roles.count(pat)) return false;
+      return roles[pat] == node;
+    }
    return !roles.count(pat) || roles.at(pat) == node;
  }

-  void Register(Node* node, PDNode* pat) { roles[pat] = node; }
+  void Register(Node* node, PDNode* pat) {
+    roles[pat] = node;
+    nodes_.insert(node);
+  }
+
+ private:
+  std::unordered_set<Node*> nodes_;
};
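The nodes_ bookkeeping makes Match reject a graph node that is already bound to a different role. A worked trace of the logic above, with hypothetical nodes n1, n2 and roles pat_a, pat_b:

//   Register(n1, pat_a);            // n1 is now bound to role pat_a
//   Match(n1, pat_a) -> true        // same node offered for the same role
//   Match(n1, pat_b) -> false       // n1 is already taken by pat_a
//   Match(n2, pat_a) -> false       // role pat_a is already filled by n1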
// Tell whether Node a links to b.
@@ -104,6 +135,7 @@ GraphPatternDetecter::DetectPatterns() {
  // Extend a PDNode to subgraphs by deducing the connection relations defined
  // in edges of PDNodes.
  for (const auto& edge : pattern_.edges()) {
+    VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
    // Each role has two PDNodes, which indicates two roles.
    // Detect two Nodes that can match these two roles and they are connected.
    auto& pre_groups = bi_records[step % 2];
@@ -127,6 +159,7 @@ GraphPatternDetecter::DetectPatterns() {
        }
      }
    }
+    VLOG(3) << "step " << step << " get records: " << cur_groups.size();
  }

  for (auto& group : bi_records[step % 2]) {
...
@@ -96,7 +96,8 @@ class PDPattern {
  void AddEdge(PDNode* a, PDNode* b);

-  PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = "");
+  PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID());
+  PDNode* RetriveNode(const std::string& id) const;

  const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
  const std::vector<edge_t>& edges() const { return edges_; }
@@ -107,8 +108,12 @@ class PDPattern {
  FRIEND_TEST(PDPattern, NewNode);
#endif

+  static std::string NewID() { return "pdnode-" + std::to_string(id_++); }
+
  std::vector<std::unique_ptr<PDNode>> nodes_;
  std::vector<edge_t> edges_;
+  std::unordered_map<std::string, PDNode*> node_map_;
+
+  static size_t id_;
};

/*
...
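End to end, the pattern API above is used like this; a minimal sketch restricted to calls that appear in this commit (the tellers, role names and the surrounding `graph` variable are illustrative):

GraphPatternDetecter gpd;
auto* pattern = gpd.mutable_pattern();
// A PDNode matches any graph node its teller accepts; the name is the key
// that RetriveNode later maps back to a concrete node.
auto* mul_op = pattern->NewNode(
    [](Node* x) { return x && x->IsOp() && x->Op()->Type() == "mul"; }, "mul");
auto* mul_out =
    pattern->NewNode([](Node* x) { return x && x->IsVar(); }, "mul_out");
pattern->AddEdge(mul_op, mul_out);  // mul writes mul_out
gpd(graph.get(), [&](const GraphPatternDetecter::subgraph_t& subgraph,
                     Graph* g) {
  // Map the role back to the node matched in this particular subgraph.
  Node* op = subgraph.at(gpd.pattern().RetriveNode("mul"));
  VLOG(4) << "matched mul: " << op->Name();
});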
@@ -163,8 +163,8 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
  // 3. Detect op2 -> var2 -> op4
  // 4. Detect op2 -> var3 -> op5
  // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2
-  ASSERT_GE(count, 1UL);
-  ASSERT_LE(count, 2UL);
+  ASSERT_GE(count, 1);
+  ASSERT_LE(count, 2);
}

}  // namespace ir
...
@@ -200,9 +200,11 @@ TEST(GraphTest, WriteAfterWrite) {
      ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
      control_dep2 = n->inputs[1];
      ASSERT_EQ(n->inputs.size(), 2);
-      ASSERT_EQ(control_dep1, control_dep2);
    }
  }
+  ASSERT_NE(control_dep1, nullptr);
+  ASSERT_NE(control_dep2, nullptr);
+  ASSERT_EQ(control_dep1, control_dep2);
}
}  // namespace framework
}  // namespace paddle
@@ -12,7 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

+#pragma once
+
#include <stack>
+#include <vector>
+
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
...
@@ -25,6 +25,7 @@ static const char kGraphVizPath[] = "graph_viz_path";
std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  const std::string graph_viz_path = Get<std::string>(kGraphVizPath);
+  VLOG(3) << "draw IR graph viz to " << graph_viz_path;
  std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path));
  PADDLE_ENFORCE(fout->good());
  std::ostream& sout = *fout;
...
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class InferCleanGraphPass : public Pass {
public:
virtual ~InferCleanGraphPass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
    // A node should be removed if it is a control-dependency variable that
    // no longer carries a VarDesc.
    auto is_invalid_node = [](Node* x) {
      return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
    };
std::unordered_set<Node*> invalid_nodes;
for (auto* node : graph->Nodes()) {
      if (is_invalid_node(node)) {
invalid_nodes.insert(node);
}
}
// remove nodes from the graph.
for (auto* node : invalid_nodes) {
graph->RemoveNode(node);
}
// clean edges.
for (auto* node : graph->Nodes()) {
CleanEdges(&node->inputs, invalid_nodes);
CleanEdges(&node->outputs, invalid_nodes);
}
return graph;
}
void CleanEdges(std::vector<Node*>* nodes,
const std::unordered_set<Node*>& to_remove) const {
auto it = std::remove_if(nodes->begin(), nodes->end(),
[&](Node* x) { return to_remove.count(x); });
nodes->erase(it, nodes->end());
}
};
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(infer_clean_graph_pass,
paddle::framework::ir::InferCleanGraphPass);
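For inference, this pass is meant to run before fusion passes such as fc_fuse_pass, so matched patterns are not obscured by stale control-dependency edges. A minimal chaining sketch, assuming the paddle::framework::ir namespace is open, a ProgramDesc named prog, and only the pass names registered in this commit:

std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
for (const std::string& name : {"infer_clean_graph_pass", "fc_fuse_pass"}) {
  auto pass = PassRegistry::Instance().Get(name);
  graph = pass->Apply(std::move(graph));  // each pass hands the graph on
}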
@@ -17,7 +17,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
namespace ir {
-const char Node::kControlDepVarName[] = "__control_var";
+constexpr char Node::kControlDepVarName[];
}  // namespace ir
}  // namespace framework
}  // namespace paddle
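For context (this block is not part of the commit), the change relies on the standard pre-C++17 idiom: the initializer moves to the in-class declaration, while one translation unit still provides an out-of-line definition for when the member is odr-used:

struct S {
  static constexpr char kName[] = "__control_var";  // declaration with value
};
constexpr char S::kName[];  // definition in a single .cc, no initializer here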
@@ -27,21 +27,21 @@ namespace ir {
class Node {
 public:
  enum class Type { kOperation, kVariable };
-  static const char kControlDepVarName[];
+  static constexpr char kControlDepVarName[] = "__control_var";

  explicit Node(const std::string& name, Type type)
      : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}

  explicit Node(VarDesc* var_desc)
      : name_(var_desc->Name()),
-        var_desc_(var_desc),
+        var_desc_(new VarDesc(*var_desc)),
        op_desc_(nullptr),
        type_(Type::kVariable) {}

  explicit Node(OpDesc* op_desc)
      : name_(op_desc->Type()),
        var_desc_(nullptr),
-        op_desc_(op_desc),
+        op_desc_(new OpDesc(*op_desc, op_desc->Block())),
        type_(Type::kOperation) {}

  Type NodeType() const { return type_; }
@@ -50,12 +50,12 @@ class Node {
  VarDesc* Var() {
    PADDLE_ENFORCE(type_ == Type::kVariable);
-    return var_desc_;
+    return var_desc_.get();
  }

  OpDesc* Op() {
-    PADDLE_ENFORCE(type_ == Type::kOperation);
-    return op_desc_;
+    PADDLE_ENFORCE(IsOp());
+    return op_desc_.get();
  }

  bool IsOp() const { return type_ == Type::kOperation; }
@@ -66,8 +66,8 @@ class Node {
 protected:
  const std::string name_;
-  VarDesc* var_desc_;
-  OpDesc* op_desc_;
+  std::unique_ptr<VarDesc> var_desc_;
+  std::unique_ptr<OpDesc> op_desc_;
  Type type_;

 private:
...
@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/platform/nccl_helper.h"
#endif

+#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
@@ -193,8 +194,14 @@ ParallelExecutor::ParallelExecutor(
        member_->local_scopes_, member_->use_cuda_, build_strategy);
#endif

+  if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
    member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
        exec_strategy, member_->local_scopes_, places, std::move(graph)));
+  } else {
+    member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
+        exec_strategy, member_->local_scopes_, places, std::move(graph)));
+  }
+
  member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, std::move(var_infos),
      member_->places_, std::move(member_->executor_)));
...
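How the new branch is reached: any strategy type other than kDefault now selects the fast executor. A sketch, assuming the alternative enum value is named kExperimental (that name does not appear in this diff):

details::ExecutionStrategy strategy;
strategy.type_ = details::ExecutionStrategy::kExperimental;  // assumed name
// A ParallelExecutor built with this strategy wraps a
// FastThreadedSSAGraphExecutor; kDefault keeps the ThreadedSSAGraphExecutor.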
@@ -55,11 +55,20 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
    auto all_ops = blocks_[block_id]->AllOps();
    for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) {
      auto &op = all_ops[op_id];

      for (const std::string &attr_name : op->AttrNames()) {
        if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
          int sub_block_id =
              o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
          op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
+        } else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) {
+          std::vector<int> sub_block_ids =
+              o.Block(block_id).Op(op_id)->GetBlocksAttrIds(attr_name);
+          std::vector<BlockDesc *> block_descs;
+          for (int block_id : sub_block_ids) {
+            block_descs.push_back(MutableBlock(block_id));
+          }
+          op->SetBlocksAttr(attr_name, block_descs);
        }
      }
    }
@@ -68,24 +77,16 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
  desc_ = desc;
-  for (auto &block_desc : *desc_.mutable_blocks()) {
-    blocks_.emplace_back(new BlockDesc(this, &block_desc));
-  }
-  for (auto &block : blocks_) {
-    for (auto *op : block->AllOps()) {
-      for (const auto &attr : op->Proto()->attrs()) {
-        if (attr.type() == proto::AttrType::BLOCK) {
-          size_t blk_idx = attr.block_idx();
-          op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
-        }
-      }
-    }
-  }
+  InitFromProto();
}

ProgramDesc::ProgramDesc(const std::string &binary_str) {
  PADDLE_ENFORCE(desc_.ParseFromString(binary_str),
                 "Fail to parse program_desc from binary string.");
+  InitFromProto();
+}
+
+void ProgramDesc::InitFromProto() {
  for (auto &block_desc : *desc_.mutable_blocks()) {
    blocks_.emplace_back(new BlockDesc(this, &block_desc));
  }
@@ -95,6 +96,13 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
        if (attr.type() == proto::AttrType::BLOCK) {
          size_t blk_idx = attr.block_idx();
          op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
+        } else if (attr.type() == proto::AttrType::BLOCKS) {
+          auto blks_idx = attr.blocks_idx();
+          std::vector<BlockDesc *> block_descs;
+          for (int blk_idx : blks_idx) {
+            block_descs.push_back(this->MutableBlock(blk_idx));
+          }
+          op->SetBlocksAttr(attr.name(), block_descs);
        }
      }
    }
...
@@ -76,6 +76,8 @@ class ProgramDesc {
  void SetFetchHolderName(const std::string &fetch_holder_name);

 private:
+  void InitFromProto();
+
  proto::ProgramDesc desc_;

  std::vector<std::unique_ptr<BlockDesc>> blocks_;
...
@@ -42,6 +42,19 @@ TEST(ProgramDesc, copy_ctor) {
  out->SetType(proto::VarType::LOD_TENSOR);
  op->SetOutput("Y", {out->Name()});

+  BlockDesc* new_block = program.AppendBlock(*global_block);
+  op = new_block->AppendOp();
+  op->SetType("mul");
+
+  op = global_block->AppendOp();
+  op->SetType("op_with_subblock");
+  op->SetAttr("sub_block", new_block);
+
+  std::vector<BlockDesc*> sub_blocks;
+  sub_blocks.push_back(program.AppendBlock(*global_block));
+  sub_blocks.push_back(program.AppendBlock(*global_block));
+  op->SetAttr("sub_blocks", sub_blocks);
+
  ProgramDesc program_copy(program);

  auto* global_block_copy = program_copy.MutableBlock(0);
@@ -64,6 +77,8 @@ TEST(ProgramDesc, copy_ctor) {
  assert_same_var("Y", y);
  assert_same_var("Out", out);

+  bool found_sub_block = false;
+  bool found_sub_blocks = false;
  for (size_t i = 0; i < global_block->OpSize(); ++i) {
    auto op_origin = global_block->Op(i);
    auto op_copy = global_block_copy->Op(i);
@@ -74,8 +89,17 @@ TEST(ProgramDesc, copy_ctor) {
    ASSERT_EQ(op_copy->Proto()->SerializeAsString(),
              op_origin->Proto()->SerializeAsString());
+
+    if (op->Type() == "op_with_subblock") {
+      ASSERT_EQ(1, op->GetBlockAttrId("sub_block"));
+      found_sub_block = true;
+
+      ASSERT_EQ(2, op->GetBlocksAttrIds("sub_blocks").size());
+      found_sub_blocks = true;
+    }
  }
+  ASSERT_TRUE(found_sub_block);
+  ASSERT_TRUE(found_sub_blocks);
  // Not check block's protostr are same it because the order of vars could be
  // different and it is correct.
}
...
@@ -139,7 +139,7 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) {
  }
  auto write_iter = id_to_index_.find(key);
  if (write_iter == id_to_index_.end()) {
-    size_t row_num = rows_.size();
+    int row_num = rows_.size();
    if (row_num == value_->dims()[0]) {
      rwlock_->UNLock();
      PADDLE_THROW("selected rows is full, then length exceed %d", row_num);
@@ -182,7 +182,7 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
  PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
                    "output tensor should have the same shape with table "
                    "except the dims[0].");
-  for (size_t i = 0; i < ids.numel(); ++i) {
+  for (int i = 0; i < ids.numel(); ++i) {
    int64_t index = AutoGrownIndex(ids.data<int64_t>()[i], auto_grown);
    framework::VisitDataType(
        framework::ToDataType(value_->type()),
...
@@ -31,7 +31,8 @@ size_t Tensor::memory_size() const {
  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
}

-void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+void* Tensor::mutable_data(platform::Place place, std::type_index type,
+                           size_t requested_size) {
  if (holder_ != nullptr) {
    holder_->set_type(type);
  }
@@ -39,7 +40,7 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type) {
                 "When calling this method, the Tensor's numel must be "
                 "equal or larger than zero. "
                 "Please check Tensor::Resize has been called first.");
-  int64_t size = numel() * SizeOfType(type);
+  size_t size = requested_size ? requested_size : numel() * SizeOfType(type);
  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||
      holder_->size() < size + offset_) {
@@ -68,10 +69,10 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type) {
                                    offset_);
}

-void* Tensor::mutable_data(platform::Place place) {
+void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
  PADDLE_ENFORCE(this->holder_ != nullptr,
                 "Cannot invoke mutable data if current hold nothing.");
-  return mutable_data(place, holder_->type());
+  return mutable_data(place, holder_->type(), requested_size);
}

Tensor& Tensor::ShareDataWith(const Tensor& src) {
...
@@ -89,22 +89,24 @@ class Tensor {
   * @note If not exist, then allocation.
   */
  template <typename T>
-  T* mutable_data(platform::Place place);
+  T* mutable_data(platform::Place place, size_t requested_size = 0);

-  void* mutable_data(platform::Place place, std::type_index type);
+  void* mutable_data(platform::Place place, std::type_index type,
+                     size_t requested_size = 0);

-  void* mutable_data(platform::Place place);
+  void* mutable_data(platform::Place place, size_t requested_size = 0);

  /**
   * @brief   Return a pointer to mutable memory block.
   *
   * @param[in] dims           The dimensions of the memory block.
   * @param[in] place          The place of the memory block.
+   * @param[in] requested_size The size of the block in bytes.
   *
   * @note    If not exist, then allocation.
   */
  template <typename T>
-  T* mutable_data(DDim dims, platform::Place place);
+  T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0);

  /*! Return the dimensions of the memory block. */
  const DDim& dims() const;
...
@@ -46,16 +46,17 @@ inline T* Tensor::data() {
}

template <typename T>
-inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
+inline T* Tensor::mutable_data(DDim dims, platform::Place place,
+                               size_t requested_size) {
  static_assert(std::is_pod<T>::value, "T must be POD");
  Resize(dims);
-  return mutable_data<T>(place);
+  return mutable_data<T>(place, requested_size);
}

template <typename T>
-inline T* Tensor::mutable_data(platform::Place place) {
+inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) {
  static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T), requested_size));
}

inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
...
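A usage sketch for the new parameter, assuming a CPU place: requested_size asks the holder for at least that many bytes, so later growth within that bound reuses the allocation instead of reallocating.

framework::Tensor t;
t.Resize({10});
// 10 floats only need 40 bytes, but reserve 4096 up front.
float* p = t.mutable_data<float>(platform::CPUPlace(), 4096 /* bytes */);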
+cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass)
cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
+    analyzer.cc
+    helper.cc
+    # passes
    fluid_to_data_flow_graph_pass.cc
    data_flow_graph_to_fluid_pass.cc
    dfg_graphviz_draw_pass.cc
    tensorrt_subgraph_pass.cc
    tensorrt_subgraph_node_mark_pass.cc
-    analyzer.cc
-    helper.cc
+    fluid_to_ir_pass.cc
    model_store_pass.cc
-    DEPS framework_proto proto_desc)
+    DEPS framework_proto proto_desc ir_pass_manager graph pass)
cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis)
@@ -18,7 +22,7 @@ function (inference_analysis_test TARGET)
  if(WITH_TESTING)
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS)
+    set(multiValueArgs SRCS EXTRA_DEPS)
    cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    set(mem_opt "")
@@ -27,19 +31,51 @@ function (inference_analysis_test TARGET)
    endif()
    cc_test(${TARGET}
      SRCS "${analysis_test_SRCS}"
-      DEPS analysis
+      DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detecter pass ${analysis_test_EXTRA_DEPS}
      ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt})
    set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
  endif(WITH_TESTING)
endfunction(inference_analysis_test)

+set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz")
+set(DITU_RNN_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz")
+set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/ditu_rnn" CACHE PATH "Ditu RNN model and data root." FORCE)
+set(DITU_RNN_MODEL ${DITU_INSTALL_DIR}/model)
+set(DITU_RNN_DATA ${DITU_INSTALL_DIR}/data.txt)
+
+function (inference_download_and_uncompress target url gz_filename)
+  message(STATUS "Download inference test stuff ${gz_filename} from ${url}")
+  execute_process(COMMAND bash -c "mkdir -p ${DITU_INSTALL_DIR}")
+  execute_process(COMMAND bash -c "cd ${DITU_INSTALL_DIR} && wget -q ${url}")
+  execute_process(COMMAND bash -c "cd ${DITU_INSTALL_DIR} && tar xzf ${gz_filename}")
+  message(STATUS "finish downloading ${gz_filename}")
+endfunction(inference_download_and_uncompress)
+
+if (NOT EXISTS ${DITU_INSTALL_DIR})
+  inference_download_and_uncompress(ditu_rnn_model ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz")
+  inference_download_and_uncompress(ditu_rnn_data ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz")
+endif()
+
+inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
+  EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
+  # ir
+  fc_fuse_pass
+  graph_viz_pass
+  infer_clean_graph_pass
+  graph_pattern_detecter
+  pass
+  ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model
+       --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
+       --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
+
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
+inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
-inference_analysis_test(test_analyzer SRCS analyzer_tester.cc)
inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
@@ -17,22 +17,23 @@
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
#include "paddle/fluid/inference/analysis/model_store_pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"

-namespace paddle {
-
-DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, true,
+DEFINE_bool(IA_enable_tensorrt_subgraph_engine, false,
            "Enable subgraph to TensorRT engine for acceleration");
-DEFINE_string(inference_analysis_graphviz_log_root, "./",
+DEFINE_bool(IA_enable_ir, false, "Turn on IR support");
+DEFINE_string(IA_graphviz_log_root, "./",
              "Graphviz debugger for data flow graphs.");
-DEFINE_string(inference_analysis_output_storage_path, "",
-              "optimized model output path");
+DEFINE_string(IA_output_storage_path, "", "optimized model output path");

+namespace paddle {
namespace inference {
namespace analysis {
@@ -40,11 +41,38 @@ class DfgPassManagerImpl final : public DfgPassManager {
 public:
  DfgPassManagerImpl() {
    // TODO(Superjomn) set the key with pass reprs.
+    LOG(INFO)
+        << "-----------------------------------------------------------------";
-    AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
-    if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
+    if (FLAGS_IA_enable_ir) {
+      AddPass("fluid-to-ir-pass", new FluidToIrPass);
+    } else {
+      AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
+    }
+    TryAddTensorRtPass();
+
+    AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
+    if (!FLAGS_IA_output_storage_path.empty()) {
+      AddPass("model-store-pass", new ModelStorePass);
+    }
+    LOG(INFO)
+        << "-----------------------------------------------------------------";
+  }
+
+  std::string repr() const override { return "dfg-pass-manager"; }
+  std::string description() const override { return "DFG pass manager."; }
+
+ private:
+  void AddPass(const std::string& name, Pass* pass) {
+    VLOG(3) << "Adding pass " << name;
+    Register(name, pass);
+    AddGraphvizDebugerPass(pass);
+  }
+
+  void TryAddTensorRtPass() {
+    if (FLAGS_IA_enable_tensorrt_subgraph_engine) {
      auto trt_teller = [&](const Node* node) {
        std::unordered_set<std::string> teller_set(
-            {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax"});
+            {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax",
+             "depthwise_conv2d", "batch_norm"});
        if (!node->IsFunction()) return false;

        const auto* func = static_cast<const Function*>(node);
@@ -59,20 +87,6 @@ class DfgPassManagerImpl final : public DfgPassManager {
            new TensorRTSubgraphNodeMarkPass(trt_teller));
        AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
      }
-    AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
-    if (!FLAGS_inference_analysis_output_storage_path.empty()) {
-      AddPass("model-store-pass", new ModelStorePass);
-    }
-  }
-
-  std::string repr() const override { return "dfg-pass-manager"; }
-  std::string description() const override { return "DFG pass manager."; }
-
- private:
-  void AddPass(const std::string& name, Pass* pass) {
-    LOG(INFO) << "Adding pass " << name;
-    Register(name, pass);
-    AddGraphvizDebugerPass(pass);
  }

  // Add the graphviz debugger pass if the parent pass has one.
...
@@ -39,14 +39,14 @@ limitations under the License. */
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"

-namespace paddle {
-
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
-DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
-DECLARE_string(inference_analysis_graphviz_log_root);
-DECLARE_string(inference_analysis_output_storage_path);
+DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
+DECLARE_string(IA_graphviz_log_root);
+DECLARE_string(IA_output_storage_path);
+DECLARE_bool(IA_enable_ir);

+namespace paddle {
namespace inference {
namespace analysis {
...
@@ -13,15 +13,25 @@
// limitations under the License.

#include "paddle/fluid/inference/analysis/analyzer.h"
+
#include <google/protobuf/text_format.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
+DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
+DEFINE_int32(batch_size, 10, "batch size.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");

namespace paddle {
namespace inference {
namespace analysis {

TEST(Analyzer, analysis_without_tensorrt) {
-  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
+  FLAGS_IA_enable_tensorrt_subgraph_engine = false;
  Argument argument;
  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
  Analyzer analyser;
@@ -29,13 +39,331 @@ TEST(Analyzer, analysis_without_tensorrt) {
}

TEST(Analyzer, analysis_with_tensorrt) {
-  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
+  FLAGS_IA_enable_tensorrt_subgraph_engine = true;
  Argument argument;
  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
  Analyzer analyser;
  analyser.Run(&argument);
}
void TestWord2vecPrediction(const std::string &model_path) {
NativeConfig config;
config.model_dir = model_path;
config.use_gpu = false;
config.device = 0;
auto predictor =
::paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
config);
// One single batch
int64_t data[4] = {1, 2, 3, 4};
PaddleTensor tensor;
tensor.shape = std::vector<int>({4, 1});
tensor.data = PaddleBuf(data, sizeof(data));
tensor.dtype = PaddleDType::INT64;
// For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> slots(4, tensor);
std::vector<PaddleTensor> outputs;
CHECK(predictor->Run(slots, &outputs));
  PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
  // Check the output buffer size and the result values.
  PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL);
float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
0.000932706};
const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << "data: "
<< static_cast<float *>(outputs.front().data.data())[i];
PADDLE_ENFORCE(static_cast<float *>(outputs.front().data.data())[i],
result[i]);
}
}
namespace {
struct DataRecord {
std::vector<std::vector<std::vector<float>>> link_step_data_all;
std::vector<std::vector<float>> week_data_all, minute_data_all;
std::vector<size_t> lod1, lod2, lod3;
std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
rnn_minute_datas;
size_t batch_iter{0};
size_t batch_size{1};
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
    // NOTE: skip the final batch if not enough data is provided.
if (batch_end <= link_step_data_all.size()) {
data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
link_step_data_all.begin() + batch_end);
data.week_data_all.assign(week_data_all.begin() + batch_iter,
week_data_all.begin() + batch_end);
data.minute_data_all.assign(minute_data_all.begin() + batch_iter,
minute_data_all.begin() + batch_end);
// Prepare LoDs
data.lod1.push_back(0);
data.lod2.push_back(0);
data.lod3.push_back(0);
CHECK(!data.link_step_data_all.empty()) << "empty";
CHECK(!data.week_data_all.empty());
CHECK(!data.minute_data_all.empty());
CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size());
CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size());
for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
for (const auto &d : data.link_step_data_all[j]) {
data.rnn_link_data.push_back(d);
}
data.rnn_week_datas.push_back(data.week_data_all[j]);
data.rnn_minute_datas.push_back(data.minute_data_all[j]);
// calculate lod
data.lod1.push_back(data.lod1.back() +
data.link_step_data_all[j].size());
data.lod3.push_back(data.lod3.back() + 1);
for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) {
data.lod2.push_back(data.lod2.back() +
data.link_step_data_all[j].size());
}
}
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, ':', &data);
std::vector<std::vector<float>> link_step_data;
std::vector<std::string> link_datas;
split(data[0], '|', &link_datas);
for (auto &step_data : link_datas) {
std::vector<float> tmp;
split_to_float(step_data, ',', &tmp);
link_step_data.push_back(tmp);
}
// load week data
std::vector<float> week_data;
split_to_float(data[2], ',', &week_data);
// load minute data
std::vector<float> minute_data;
split_to_float(data[1], ',', &minute_data);
link_step_data_all.push_back(std::move(link_step_data));
week_data_all.push_back(std::move(week_data));
minute_data_all.push_back(std::move(minute_data));
}
}
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
week_tensor, minute_tensor;
lod_attention_tensor.name = "data_lod_attention";
init_zero_tensor.name = "cell_init";
lod_tensor_tensor.name = "data";
week_tensor.name = "week";
minute_tensor.name = "minute";
auto one_batch = data->NextBatch();
std::vector<int> rnn_link_data_shape(
{static_cast<int>(one_batch.rnn_link_data.size()),
static_cast<int>(one_batch.rnn_link_data.front().size())});
lod_attention_tensor.shape.assign({1, 2});
lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2});
init_zero_tensor.shape.assign({batch_size, 15});
init_zero_tensor.lod.assign({one_batch.lod3});
lod_tensor_tensor.shape = rnn_link_data_shape;
lod_tensor_tensor.lod.assign({one_batch.lod1});
// clang-format off
week_tensor.shape.assign(
{static_cast<int>(one_batch.rnn_week_datas.size()),
static_cast<int>(one_batch.rnn_week_datas.front().size())});
week_tensor.lod.assign({one_batch.lod3});
minute_tensor.shape.assign(
{static_cast<int>(one_batch.rnn_minute_datas.size()),
static_cast<int>(one_batch.rnn_minute_datas.front().size())});
minute_tensor.lod.assign({one_batch.lod3});
// clang-format on
// assign data
TensorAssignData(&lod_attention_tensor,
std::vector<std::vector<float>>({{0, 0}}));
std::vector<float> tmp_zeros(batch_size * 15, 0.);
TensorAssignData(&init_zero_tensor, {tmp_zeros});
TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data);
TensorAssignData(&week_tensor, one_batch.rnn_week_datas);
TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas);
// Set inputs.
auto init_zero_tensor1 = init_zero_tensor;
init_zero_tensor1.name = "hidden_init";
input_slots->assign({week_tensor, init_zero_tensor, minute_tensor,
init_zero_tensor1, lod_attention_tensor,
lod_tensor_tensor});
for (auto &tensor : *input_slots) {
tensor.dtype = PaddleDType::FLOAT32;
}
}
std::string DescribeTensor(const PaddleTensor &tensor) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
os << " - type: ";
switch (tensor.dtype) {
case PaddleDType::FLOAT32:
os << "float32";
break;
case PaddleDType::INT64:
os << "int64";
break;
default:
os << "unset";
}
os << '\n';
os << " - shape: " << to_string(tensor.shape) << '\n';
os << " - lod: ";
for (auto &l : tensor.lod) {
os << to_string(l) << "; ";
}
os << "\n";
os << " - data: ";
int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
[](int a, int b) { return a * b; });
for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " ";
}
os << '\n';
return os.str();
}
} // namespace
const float ditu_rnn_target_data[] = {
104.711, 11.2431, 1.35422, 0, 0, 0, 0, 0,
27.7039, 1.41486, 7.09526, 0, 0, 0, 0, 0,
7.6481, 6.5324, 56.383, 2.88018, 8.92918, 132.007, 4.27429, 2.02934,
14.1727, 10.7461, 25.0616, 16.0197, 14.4163, 16.9199, 6.75517, 0,
80.0249, 4.77739, 0, 0, 0, 0, 0, 0,
47.5643, 2.67029, 8.76252, 0, 0, 0, 0, 0,
51.8822, 4.4411, 0, 0, 0, 0, 0, 0,
10.7286, 12.0595, 10.6672, 0, 0, 0, 0, 0,
93.5771, 3.84641, 0, 0, 0, 0, 0, 0,
169.426, 0, 0, 0, 0, 0, 0, 0};
// Test with a really complicated model.
void TestDituRNNPrediction(const std::string &model_path,
const std::string &data_path, int batch_size,
bool use_analysis, bool activate_ir,
int num_times = 1) {
FLAGS_IA_enable_ir = activate_ir;
FLAGS_IA_enable_tensorrt_subgraph_engine = false;
FLAGS_IA_output_storage_path = "./analysis.out";
std::string model_out;
if (use_analysis) {
Argument argument(model_path);
argument.model_output_store_path.reset(new std::string("./analysis.out"));
Analyzer analyzer;
analyzer.Run(&argument);
// Should get the transformed model stored to ./analysis.out
model_out = "./analysis.out";
ASSERT_TRUE(PathExists(model_out));
} else {
model_out = FLAGS_infer_ditu_rnn_model;
}
NativeConfig config;
config.prog_file = model_out + "/__model__";
config.param_file = model_out + "/param";
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
std::vector<PaddleTensor> input_slots;
DataRecord data(data_path, batch_size);
// Prepare inputs.
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs;
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictor->Run(input_slots, &outputs);
}
LOG(INFO) << "===========profile result===========";
LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times
<< ", latency: " << timer.toc() / num_times << "ms";
LOG(INFO) << "=====================================";
for (auto &out : outputs) {
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
float *data = static_cast<float *>(out.data.data());
for (size_t i = 0;
i < std::min(sizeof(ditu_rnn_target_data) / sizeof(float), size);
i++) {
EXPECT_NEAR(data[i], ditu_rnn_target_data[i], 1e-3);
}
}
}
// Turn on IR pass support, run a real inference and check the result.
TEST(Analyzer, SupportIRPass) {
FLAGS_IA_enable_ir = true;
FLAGS_IA_enable_tensorrt_subgraph_engine = false;
FLAGS_IA_output_storage_path = "./analysis.out";
Argument argument(FLAGS_inference_model_dir);
argument.model_output_store_path.reset(new std::string("./analysis.out"));
Analyzer analyzer;
analyzer.Run(&argument);
// Should get the transformed model stored to ./analysis.out
ASSERT_TRUE(PathExists("./analysis.out"));
// Inference from this path.
TestWord2vecPrediction("./analysis.out");
}
// Directly infer with the original model.
TEST(Analyzer, DituRNN_without_analysis) {
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
FLAGS_batch_size, false, false, FLAGS_repeat);
}
// Inference with the original model with the analysis turned on, the analysis
// module will transform the program to a data flow graph.
TEST(Analyzer, DituRNN_with_analysis) {
LOG(INFO) << "ditu rnn with analysis";
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
FLAGS_batch_size, true, false, FLAGS_repeat);
}
// Inference with analysis and IR. The IR module will fuse some large kernels.
TEST(Analyzer, DituRNN_with_analysis_with_IR) {
LOG(INFO) << "ditu rnn with analysis and IR fuse";
TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
FLAGS_batch_size, true, true, FLAGS_repeat);
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_PASS(fc_fuse_pass);
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
...@@ -19,14 +19,16 @@ limitations under the License. */ ...@@ -19,14 +19,16 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
using ir_node_t = framework::ir::Node;
using ir_graph_t = framework::ir::Graph;
// It would be better if the inputs and outputs of this graph were set // It would be better if the inputs and outputs of this graph were set
// manually beforehand, but there must be a Pass that helps to prune the // manually beforehand, but there must be a Pass that helps to prune the
// unnecessary ops that do not contribute to the given targets, so analyzing // unnecessary ops that do not contribute to the given targets, so analyzing
// and extracting the inputs and outputs in this pass is OK. // and extracting the inputs and outputs in this pass is OK.
void DataFlowGraph::Build() { void DataFlowGraph::Build() {
inputs.clear(); inputs_.clear();
outputs.clear(); outputs_.clear();
std::unordered_set<Node *> ins; std::unordered_set<Node *> ins;
std::unordered_set<Node *> outs; std::unordered_set<Node *> outs;
for (auto &node : nodes.nodes()) { for (auto &node : nodes.nodes()) {
...@@ -42,18 +44,140 @@ void DataFlowGraph::Build() { ...@@ -42,18 +44,140 @@ void DataFlowGraph::Build() {
  // similarly, the nodes that are in outs but not in ins are the graph's outputs // similarly, the nodes that are in outs but not in ins are the graph's outputs
for (auto *in : ins) { for (auto *in : ins) {
if (!outs.count(in)) { if (!outs.count(in)) {
inputs.push_back(in); inputs_.push_back(in);
} }
} }
for (auto *out : outs) { for (auto *out : outs) {
if (!outs.count(out)) { if (!ins.count(out)) {
outputs.push_back(out); outputs_.push_back(out);
} }
} }
Clean(); Clean();
} }
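// A freestanding illustration of the deduction rule used by Build() above: a
// node that only ever appears as an edge source is a graph input, and one
// that only appears as an edge target is a graph output. Toy integer graph,
// not the Paddle Node API:
#include <iostream>
#include <unordered_set>
#include <utility>
#include <vector>

inline void DeduceInputsOutputs() {
  // edges of a tiny graph: 0 -> 1, 1 -> 2, 0 -> 2
  const std::vector<std::pair<int, int>> edges = {{0, 1}, {1, 2}, {0, 2}};
  std::unordered_set<int> ins, outs;
  for (const auto &e : edges) {
    ins.insert(e.first);    // appears as a source
    outs.insert(e.second);  // appears as a target
  }
  for (int v : ins)
    if (!outs.count(v)) std::cout << "input: " << v << "\n";  // prints 0
  for (int v : outs)
    if (!ins.count(v)) std::cout << "output: " << v << "\n";  // prints 2
}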
void DataFlowGraph::Build(const framework::proto::ProgramDesc &prog) {
// insert vars
  // The `var2id` keeps a map from a variable's name to its Node-id; the
  // Node-id keeps updating to its latest alias during graph-building.
std::unordered_map<std::string, size_t> var2id;
auto &main_block = prog.blocks(framework::kRootBlockIndex);
for (int i = 0; i < main_block.vars_size(); i++) {
const auto &var = main_block.vars(i);
auto *v = nodes.Create(Node::Type::kValue);
v->SetName(var.name());
v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var)));
v->SetPbMsg(var.SerializeAsString());
var2id[var.name()] = v->id();
}
  // A variable in SSA form can only be written once, so if a variable is
  // written multiple times (quite common in our ProgramDesc design), multiple
  // alias Nodes of this variable will be created, and each will be written
  // just once. A set keeps the names of all the variables (the originals, not
  // the aliases) that have been written (as outputs). Once an Op's output
  // variable hits the set, a new alias is created and the global alias for
  // this variable is updated. That is what makes the Data Flow Graph an SSA.
std::unordered_set<Node *> unique_written_vars;
for (int i = 0; i < main_block.ops_size(); i++) {
const auto &op = main_block.ops(i);
auto *o = nodes.Create(Node::Type::kFunction);
o->SetName(op.type());
static_cast<Function *>(o)->SetFuncType(op.type());
    // Link to the original protobuf message's memory, which makes it easier
    // to generate a fluid ProgramDesc back from a data flow graph.
o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op)));
o->SetPbMsg(op.SerializeAsString());
// set inputs and outputs
for (int j = 0; j < op.inputs_size(); j++) {
auto &in_var = op.inputs(j);
for (int k = 0; k < in_var.arguments_size(); k++) {
auto *in = nodes.GetMutable(var2id.at(in_var.arguments(k)));
in->outlinks.push_back(o);
o->inlinks.push_back(in);
unique_written_vars.insert(in);
}
}
for (int j = 0; j < op.outputs_size(); j++) {
auto &out_var = op.outputs(j);
for (int k = 0; k < out_var.arguments_size(); k++) {
auto *out = nodes.GetMutable(var2id[out_var.arguments(k)]);
if (unique_written_vars.count(out)) {
// Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
auto *out_alias = nodes.Create(Node::Type::kValue);
out_alias->SetName(out->name());
out_alias->SetPbDesc(out->pb_desc());
out_alias->SetPbMsg(out->pb_msg());
var2id[out_alias->name()] =
out_alias->id(); // update variable's alias Node
LOG(INFO) << "loop found in graph, create SSA alias node ["
<< out_alias->repr() << "] for [" << out->repr() << "]";
out = out_alias;
}
out->inlinks.push_back(o);
o->outlinks.push_back(out);
}
}
}
  // Analyze and extract the inputs and outputs of this graph.
Build();
}
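// The SSA renaming described in the comments above, reduced to its essence:
// `latest` maps a variable name to the id of its newest alias, and rewriting
// an already-written name mints a fresh id (a = op(a) becomes a1 = op(a)).
// A toy sketch, not the real Node machinery:
#include <string>
#include <unordered_map>
#include <unordered_set>

inline int WriteVar(std::unordered_map<std::string, int> *latest,
                    std::unordered_set<int> *written, int *next_id,
                    const std::string &name) {
  int id = latest->at(name);
  if (written->count(id)) {  // loop found: this alias was written before
    id = (*next_id)++;       // mint a fresh alias id
    (*latest)[name] = id;    // later readers resolve to the newest alias
  }
  written->insert(id);
  return id;
}
// Starting from latest = {{"a", 0}}, the first WriteVar(..., "a") returns 0
// and the second returns a fresh id: exactly the a1 = op(a) rewrite.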
void DataFlowGraph::Build(const framework::ir::Graph &graph) {
// Create nodes
std::unordered_map<ir_node_t *, Node *> ir_node_map;
for (auto *ir_node : graph.Nodes()) {
Node *x{nullptr};
if (ir_node->IsOp()) {
PADDLE_ENFORCE(ir_node->Op());
VLOG(4) << "get op " << ir_node << " " << ir_node->Name();
x = nodes.Create(Node::Type::kFunction);
x->attr("ir_node").Pointer() = ir_node;
PADDLE_ENFORCE(ir_node->Op()->Proto());
x->SetName(ir_node->Op()->Proto()->type());
x->SetPbMsg(ir_node->Op()->Proto()->SerializeAsString());
} else if (ir_node->IsVar()) {
      // Do not create a Node for an IR ControlDepVar, considering Inference
      // is currently only used in a single-thread scenario.
VLOG(4) << "get var " << ir_node->Name();
x = nodes.Create(Node::Type::kValue);
x->attr("ir_node").Pointer() = ir_node;
x->SetName(ir_node->Name());
// x->SetPbMsg(ir_node->Var()->Proto()->SerializeAsString());
} else {
PADDLE_THROW("Failed to create an Node from IR, unknown type");
}
ir_node_map.emplace(ir_node, x);
}
VLOG(4) << "finish creating Nodes";
VLOG(4) << "to create edge";
// Create links
for (auto *ir_node : graph.Nodes()) {
auto it = ir_node_map.find(ir_node);
// Skip ControlDepVar.
if (it == ir_node_map.end()) continue;
auto *node = it->second;
for (auto *x : ir_node->inputs) {
if (!ir_node_map.count(x)) continue;
node->inlinks.push_back(ir_node_map.at(x));
}
for (auto *x : ir_node->outputs) {
if (!ir_node_map.count(x)) continue;
node->outlinks.push_back(ir_node_map.at(x));
}
}
Build();
  PADDLE_ENFORCE(!inputs_.empty(),
                 "Can't deduce any inputs from the graph. Is the graph empty?");
ir_graph = &graph;
VLOG(3) << "finished build from IR";
}
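// The two phases above -- first map every IR node to a fresh Node, then wire
// edges through that map while skipping sources that were never mapped (the
// ControlDepVars) -- form a generic graph-translation pattern. A toy sketch
// with made-up SrcNode/DstNode types; ownership is elided for brevity:
#include <unordered_map>
#include <vector>

struct SrcNode { std::vector<SrcNode *> inputs; };
struct DstNode { std::vector<DstNode *> inlinks; };

inline void TranslateGraph(const std::vector<SrcNode *> &src,
                           std::vector<DstNode *> *dst) {
  std::unordered_map<SrcNode *, DstNode *> node_map;
  for (SrcNode *s : src) {  // phase 1: create the target nodes
    dst->push_back(new DstNode);
    node_map.emplace(s, dst->back());
  }
  for (SrcNode *s : src) {  // phase 2: create links via the map
    for (SrcNode *in : s->inputs) {
      auto it = node_map.find(in);
      if (it == node_map.end()) continue;  // skip unmapped sources
      node_map.at(s)->inlinks.push_back(it->second);
    }
  }
}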
void DataFlowGraph::Clean() { void DataFlowGraph::Clean() {
for (auto &node : nodes.nodes()) { for (auto &node : nodes.nodes()) {
std::unordered_set<Node *> inlinks_set(node->inlinks.begin(), std::unordered_set<Node *> inlinks_set(node->inlinks.begin(),
...@@ -61,11 +185,9 @@ void DataFlowGraph::Clean() { ...@@ -61,11 +185,9 @@ void DataFlowGraph::Clean() {
std::unordered_set<Node *> outlinks_set(node->outlinks.begin(), std::unordered_set<Node *> outlinks_set(node->outlinks.begin(),
node->outlinks.end()); node->outlinks.end());
if (inlinks_set.size() < node->inlinks.size()) { if (inlinks_set.size() < node->inlinks.size()) {
LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
node->inlinks.assign(inlinks_set.begin(), inlinks_set.end()); node->inlinks.assign(inlinks_set.begin(), inlinks_set.end());
} }
if (outlinks_set.size() < node->outlinks.size()) { if (outlinks_set.size() < node->outlinks.size()) {
LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
node->outlinks.assign(outlinks_set.begin(), outlinks_set.end()); node->outlinks.assign(outlinks_set.begin(), outlinks_set.end());
} }
} }
...@@ -112,10 +234,10 @@ GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator( ...@@ -112,10 +234,10 @@ GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
const std::vector<Node *> &source) const std::vector<Node *> &source)
: queue_(source.begin(), source.end()) {} : queue_(source.begin(), source.end()) {}
// GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator( GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
// GraphTraits<DataFlowGraph>::NodesBFSIterator &&other) noexcept GraphTraits<DataFlowGraph>::NodesBFSIterator &&other) noexcept
// : queue_(std::move(other.queue_)), : queue_(std::move(other.queue_)),
// visited_(std::move(other.visited_)) {} visited_(std::move(other.visited_)) {}
GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator( GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
const GraphTraits<DataFlowGraph>::NodesBFSIterator &other) const GraphTraits<DataFlowGraph>::NodesBFSIterator &other)
...@@ -159,7 +281,7 @@ bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==( ...@@ -159,7 +281,7 @@ bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==(
if (queue_.empty()) return other.queue_.empty(); if (queue_.empty()) return other.queue_.empty();
if ((!queue_.empty()) && (!other.queue_.empty())) { if ((!queue_.empty()) && (!other.queue_.empty())) {
return queue_.front() == other.queue_.front() && return queue_.front() == other.queue_.front() &&
visited_.size() == other.visited_.size(); // here need to check the visited_.size() == other.visited_.size();
// equality of queue and // equality of queue and
    // visited. Just a light but weak implementation. // visited. Just a light but weak implementation.
} }
...@@ -174,10 +296,10 @@ GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator( ...@@ -174,10 +296,10 @@ GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
for (auto *x : source) stack_.push(x); for (auto *x : source) stack_.push(x);
} }
// GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator( GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
// GraphTraits<DataFlowGraph>::NodesDFSIterator &&other) noexcept GraphTraits<DataFlowGraph>::NodesDFSIterator &&other) noexcept
// : stack_(std::move(other.stack_)), : stack_(std::move(other.stack_)),
// visited_(std::move(other.visited_)) {} visited_(std::move(other.visited_)) {}
GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator( GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
const GraphTraits<DataFlowGraph>::NodesDFSIterator &other) const GraphTraits<DataFlowGraph>::NodesDFSIterator &other)
...@@ -339,7 +461,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT ...@@ -339,7 +461,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
std::vector<Node *> op_nodes; std::vector<Node *> op_nodes;
for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) { for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) {
if (node.type() == Node::Type::kValue || node.deleted()) { if (node.type() == Node::Type::kValue || node.deleted()) {
continue; continue;
} }
......
...@@ -26,6 +26,7 @@ limitations under the License. */ ...@@ -26,6 +26,7 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/inference/analysis/graph_traits.h" #include "paddle/fluid/inference/analysis/graph_traits.h"
#include "paddle/fluid/inference/analysis/node.h" #include "paddle/fluid/inference/analysis/node.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -41,19 +42,43 @@ namespace analysis { ...@@ -41,19 +42,43 @@ namespace analysis {
*/ */
struct DataFlowGraph { struct DataFlowGraph {
NodeMap nodes; NodeMap nodes;
std::vector<Node *> inputs; // inputs and outputs are deduced from the graph.
std::vector<Node *> outputs; // Used to interact with IR.
const framework::ir::Graph *ir_graph{nullptr};
// Extract inputs and outputs of the graph. // Extract inputs and outputs of the graph.
void Build(); void Build();
void Build(const framework::proto::ProgramDesc &prog);
// Build a graph from ir::Graph.
void Build(const framework::ir::Graph &graph);
// Get an attribute.
AnyAttr &Attr(const std::string &key) { return attrs_[key]; }
// Output a DOT graph file for debug. // Output a DOT graph file for debug.
std::string DotString() const; std::string DotString() const;
std::string HumanReadableInfo(bool show_values = true, std::string HumanReadableInfo(bool show_values = true,
bool show_functions = true) const; bool show_functions = true) const;
const std::vector<Node *> &inputs() const {
PADDLE_ENFORCE(!inputs_.empty(),
"No inputs are deduced, need to Build() first.");
return inputs_;
}
const std::vector<Node *> &outputs() const {
PADDLE_ENFORCE(!outputs_.empty(),
"No outputs are deduced, need to Build() first.");
return outputs_;
}
private: private:
mutable std::vector<Node *> inputs_;
mutable std::vector<Node *> outputs_;
std::unordered_map<std::string, AnyAttr> attrs_;
// Remove duplicate edges and so on. // Remove duplicate edges and so on.
void Clean(); void Clean();
}; };
...@@ -70,7 +95,7 @@ struct GraphTraits<DataFlowGraph> { ...@@ -70,7 +95,7 @@ struct GraphTraits<DataFlowGraph> {
: public std::iterator<std::forward_iterator_tag, Node *> { : public std::iterator<std::forward_iterator_tag, Node *> {
NodesBFSIterator() = default; NodesBFSIterator() = default;
explicit NodesBFSIterator(const std::vector<Node *> &source); explicit NodesBFSIterator(const std::vector<Node *> &source);
// NodesBFSIterator(NodesBFSIterator &&other) noexcept; NodesBFSIterator(NodesBFSIterator &&other) noexcept;
// NOTE Heavy to use. // NOTE Heavy to use.
NodesBFSIterator(const NodesBFSIterator &other); NodesBFSIterator(const NodesBFSIterator &other);
...@@ -93,8 +118,8 @@ struct GraphTraits<DataFlowGraph> { ...@@ -93,8 +118,8 @@ struct GraphTraits<DataFlowGraph> {
struct NodesDFSIterator struct NodesDFSIterator
: public std::iterator<std::forward_iterator_tag, Node *> { : public std::iterator<std::forward_iterator_tag, Node *> {
NodesDFSIterator() = default; NodesDFSIterator() = default;
explicit NodesDFSIterator(const std::vector<Node *> &source); NodesDFSIterator(const std::vector<Node *> &source);
// NodesDFSIterator(NodesDFSIterator &&other) noexcept; NodesDFSIterator(NodesDFSIterator &&other) noexcept;
NodesDFSIterator(const NodesDFSIterator &other); NodesDFSIterator(const NodesDFSIterator &other);
Node &operator*(); Node &operator*();
...@@ -116,7 +141,7 @@ struct GraphTraits<DataFlowGraph> { ...@@ -116,7 +141,7 @@ struct GraphTraits<DataFlowGraph> {
struct NodesTSIterator struct NodesTSIterator
: public std::iterator<std::forward_iterator_tag, Node *> { : public std::iterator<std::forward_iterator_tag, Node *> {
NodesTSIterator() = default; NodesTSIterator() = default;
explicit NodesTSIterator(const std::vector<Node *> &source); NodesTSIterator(const std::vector<Node *> &source);
NodesTSIterator(NodesTSIterator &&other) NodesTSIterator(NodesTSIterator &&other)
: sorted_(std::move(other.sorted_)), cursor_(other.cursor_) { : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
other.cursor_ = 0; other.cursor_ = 0;
...@@ -138,7 +163,7 @@ struct GraphTraits<DataFlowGraph> { ...@@ -138,7 +163,7 @@ struct GraphTraits<DataFlowGraph> {
size_t cursor_{0}; size_t cursor_{0};
}; };
explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {} explicit GraphTraits(const DataFlowGraph &graph) : graph_(graph) {}
// default use BFS to visit the nodes. // default use BFS to visit the nodes.
iterator_range<NodesBFSIterator> nodes() { iterator_range<NodesBFSIterator> nodes() {
...@@ -156,20 +181,20 @@ struct GraphTraits<DataFlowGraph> { ...@@ -156,20 +181,20 @@ struct GraphTraits<DataFlowGraph> {
private: private:
NodesBFSIterator nodes_bfs_begin() { NodesBFSIterator nodes_bfs_begin() {
return NodesBFSIterator(graph_->inputs); return NodesBFSIterator(graph_.inputs());
} }
NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); } NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
NodesDFSIterator nodes_dfs_begin() { NodesDFSIterator nodes_dfs_begin() {
return NodesDFSIterator(graph_->inputs); return NodesDFSIterator(graph_.inputs());
} }
NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); } NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }
NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_->inputs); } NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_.inputs()); }
NodesTSIterator nodes_ts_end() { return NodesTSIterator(); } NodesTSIterator nodes_ts_end() { return NodesTSIterator(); }
private: private:
DataFlowGraph *graph_; const DataFlowGraph &graph_;
}; };
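// The BFS behind these iterators is a queue plus a visited set seeded with
// the graph inputs; the operator== above only compares the queue front and
// the visited count, hence the "weak implementation" caveat. The walk itself,
// freestanding over a toy adjacency list:
#include <deque>
#include <unordered_set>
#include <vector>

inline std::vector<int> BfsOrder(const std::vector<std::vector<int>> &adj,
                                 const std::vector<int> &sources) {
  std::deque<int> queue(sources.begin(), sources.end());
  std::unordered_set<int> visited;
  std::vector<int> order;
  while (!queue.empty()) {
    int n = queue.front();
    queue.pop_front();
    if (!visited.insert(n).second) continue;  // already visited
    order.push_back(n);
    for (int next : adj[n])
      if (!visited.count(next)) queue.push_back(next);
  }
  return order;
}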
// Extract the inputs and outputs of a graph. The inputs and outputs of a // Extract the inputs and outputs of a graph. The inputs and outputs of a
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
namespace paddle { namespace paddle {
...@@ -24,20 +25,18 @@ TEST(DataFlowGraph, BFS) { ...@@ -24,20 +25,18 @@ TEST(DataFlowGraph, BFS) {
auto dfg = ProgramDescToDFG(desc); auto dfg = ProgramDescToDFG(desc);
dfg.Build(); dfg.Build();
for (auto *in : dfg.inputs) { for (auto* in : dfg.inputs()) {
LOG(INFO) << "inputs: " << in->name() << " " LOG(INFO) << "inputs: " << in->name() << " "
<< static_cast<int>(in->type()); << static_cast<int>(in->type());
} }
for (auto *out : dfg.outputs) { for (auto* out : dfg.outputs()) {
LOG(INFO) << "outputs: " << out->name() << " " LOG(INFO) << "outputs: " << out->name() << " "
<< static_cast<int>(out->type()); << static_cast<int>(out->type());
} }
GraphTraits<DataFlowGraph> trait(&dfg);
auto nodes = trait.nodes();
size_t count = 0; size_t count = 0;
for (auto it = nodes.begin(); it != nodes.end(); ++it) { for (auto& node : GraphTraits<DataFlowGraph>(dfg).nodes()) {
LOG(INFO) << "visiting " << it->name(); LOG(INFO) << "visiting " << node.name();
++count; ++count;
} }
ASSERT_EQ(count, dfg.nodes.size()); ASSERT_EQ(count, dfg.nodes.size());
...@@ -45,13 +44,11 @@ TEST(DataFlowGraph, BFS) { ...@@ -45,13 +44,11 @@ TEST(DataFlowGraph, BFS) {
TEST(DataFlowGraph, DFS) { TEST(DataFlowGraph, DFS) {
auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
auto dfg = ProgramDescToDFG(desc); DataFlowGraph dfg;
dfg.Build(); dfg.Build(desc);
GraphTraits<DataFlowGraph> trait(&dfg);
auto nodes = trait.nodes_in_DFS();
size_t count = 0; size_t count = 0;
for (auto it = nodes.begin(); it != nodes.end(); ++it) { for (auto& node : GraphTraits<DataFlowGraph>(dfg).nodes_in_DFS()) {
LOG(INFO) << "visiting " << it->name(); LOG(INFO) << "visiting " << node.name();
++count; ++count;
} }
ASSERT_EQ(count, dfg.nodes.size()); ASSERT_EQ(count, dfg.nodes.size());
...@@ -74,21 +71,17 @@ TEST(DataFlowGraph, TS) { ...@@ -74,21 +71,17 @@ TEST(DataFlowGraph, TS) {
DataFlowGraph graph; DataFlowGraph graph;
for (int i = 0; i < 8; i++) { for (int i = 0; i < 8; i++) {
auto *node = graph.nodes.Create(Node::Type::kValue); auto* node = graph.nodes.Create(Node::Type::kValue);
node->SetName("node-" + std::to_string(i)); node->SetName("node-" + std::to_string(i));
} }
auto add_link = [&](int i, int j) { auto add_link = [&](int i, int j) {
Node *source = graph.nodes.GetMutable(i); Node* source = graph.nodes.GetMutable(i);
Node *target = graph.nodes.GetMutable(j); Node* target = graph.nodes.GetMutable(j);
target->inlinks.push_back(source); target->inlinks.push_back(source);
source->outlinks.push_back(target); source->outlinks.push_back(target);
}; };
graph.inputs.push_back(graph.nodes.GetMutable(0));
graph.inputs.push_back(graph.nodes.GetMutable(1));
graph.inputs.push_back(graph.nodes.GetMutable(2));
add_link(0, 4); add_link(0, 4);
add_link(0, 5); add_link(0, 5);
add_link(1, 6); add_link(1, 6);
...@@ -97,8 +90,9 @@ TEST(DataFlowGraph, TS) { ...@@ -97,8 +90,9 @@ TEST(DataFlowGraph, TS) {
add_link(4, 7); add_link(4, 7);
add_link(4, 3); add_link(4, 3);
add_link(7, 3); add_link(7, 3);
graph.Build();
auto its = GraphTraits<DataFlowGraph>(&graph).nodes_in_TS(); auto its = GraphTraits<DataFlowGraph>(graph).nodes_in_TS();
std::vector<int> sorted_ids; std::vector<int> sorted_ids;
for (auto it = its.begin(); it != its.end(); ++it) { for (auto it = its.begin(); it != its.end(); ++it) {
LOG(INFO) << it->name(); LOG(INFO) << it->name();
...@@ -122,6 +116,50 @@ TEST(DataFlowGraph, TS) { ...@@ -122,6 +116,50 @@ TEST(DataFlowGraph, TS) {
assert_positive_sequence_pair(4, 7); assert_positive_sequence_pair(4, 7);
} }
TEST(DataFlowGraph, Build_ProgramDesc) {
auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
DataFlowGraph graph;
graph.Build(desc);
ASSERT_EQ(graph.nodes.size(), 38UL);
}
void SetOp(framework::ProgramDesc* prog, const std::string& type,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetInput("Xs", inputs);
op->SetOutput("Xs", outputs);
}
TEST(DataFlowGraph, Build_IR_Graph) {
framework::ProgramDesc prog;
for (auto& v : std::vector<std::string>({"a", "b", "c", "d", "e", "f"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(framework::proto::VarType::SELECTED_ROWS);
if (v == "c") {
var->SetPersistable(true);
}
}
SetOp(&prog, "OP0", std::vector<std::string>({"a"}),
std::vector<std::string>({"b"}));
SetOp(&prog, "OP1", std::vector<std::string>({"a"}),
std::vector<std::string>({"c"}));
SetOp(&prog, "mul", std::vector<std::string>({"b", "c"}),
std::vector<std::string>({"d"}));
SetOp(&prog, "elementwise_add", std::vector<std::string>({"d", "e"}),
std::vector<std::string>({"f"}));
DataFlowGraph graph;
framework::ir::Graph ir_graph(prog);
graph.Build(ir_graph);
ASSERT_EQ(graph.nodes.size(), ir_graph.Nodes().size());
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -23,9 +23,6 @@ ...@@ -23,9 +23,6 @@
namespace paddle { namespace paddle {
namespace inference { namespace inference {
DEFINE_int32(tensorrt_max_batchsize, 3, "TensorRT maximum batch size");
DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
namespace analysis { namespace analysis {
using framework::proto::ProgramDesc; using framework::proto::ProgramDesc;
...@@ -52,19 +49,15 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { ...@@ -52,19 +49,15 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
bool DataFlowGraphToFluidPass::Finalize() { return true; } bool DataFlowGraphToFluidPass::Finalize() { return true; }
void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
FilterRedundantOutputOfSubGraph(graph); // FilterRedundantOutputOfSubGraph(graph);
LOG(INFO) << "graph.inputs " << graph->inputs.size(); for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) {
for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
if (node.deleted()) continue; if (node.deleted()) continue;
switch (node.type()) { switch (node.type()) {
case Node::Type::kFunction: { case Node::Type::kFunction: {
LOG(INFO) << "add function " << node.repr();
AddFluidOp(&node); AddFluidOp(&node);
} break; } break;
case Node::Type::kFunctionBlock: { case Node::Type::kFunctionBlock: {
LOG(INFO) << "add engine op " << node.repr() << " , "
<< static_cast<FunctionBlock *>(&node)->subgraph.size();
AddEngineOp(&node); AddEngineOp(&node);
} break; } break;
default: default:
...@@ -76,15 +69,27 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { ...@@ -76,15 +69,27 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
} }
void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc()); PADDLE_ENFORCE(node);
PADDLE_ENFORCE(node->IsFunction());
PADDLE_ENFORCE(node->pb_desc() || !node->pb_msg().empty(),
"node has invalid protobuf repr.");
// currently only the main block is analyzed. // currently only the main block is analyzed.
PADDLE_ENFORCE(desc_);
auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto *op = main_block->add_ops(); auto *op = main_block->add_ops();
*op = *ori_op; // copy the attributes, by default, these will not be changed
if (node->pb_desc()) {
auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc());
*op =
*ori_op; // copy the attributes, by default, these will not be changed
    // by the analysis phase. // by the analysis phase.
// The inputs and outputs of the existing ops are not changed by tensorrt // The inputs and outputs of the existing ops are not changed by tensorrt
// subgraph pass. // subgraph pass.
// NOTE It might be changed by other passes in the long run. // NOTE It might be changed by other passes in the long run.
} else {
op->ParseFromString(node->pb_msg());
}
} }
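// The branch above is a common protobuf idiom: prefer a structured copy from
// a live message, otherwise rehydrate from the serialized bytes. A generic
// sketch over any google::protobuf::Message (names are illustrative):
#include <google/protobuf/message.h>
#include <string>

inline void FillFromDescOrBytes(google::protobuf::Message *op,
                                const google::protobuf::Message *live_desc,
                                const std::string &serialized) {
  if (live_desc != nullptr) {
    op->CopyFrom(*live_desc);         // cheap structured copy
  } else {
    op->ParseFromString(serialized);  // rebuild from the serialized string
  }
}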
void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
...@@ -191,8 +196,6 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, ...@@ -191,8 +196,6 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
// Set attrs // Set attrs
SetAttr(desc.Proto(), "subgraph", block->SerializeAsString()); SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
SetAttr(desc.Proto(), "output_name_mapping", output_mapping); SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
node->SetPbMsg(desc.Proto()->SerializeAsString()); node->SetPbMsg(desc.Proto()->SerializeAsString());
...@@ -221,10 +224,9 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { ...@@ -221,10 +224,9 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
framework::BlockDesc block_desc(nullptr, &proto); framework::BlockDesc block_desc(nullptr, &proto);
block_desc.Proto()->set_parent_idx(-1); block_desc.Proto()->set_parent_idx(-1);
block_desc.Proto()->set_idx(0); block_desc.Proto()->set_idx(0);
LOG(INFO) << "origin variable size: " VLOG(4) << "origin variable size: "
<< argument_->origin_program_desc->blocks(0).vars().size(); << argument_->origin_program_desc->blocks(0).vars().size();
LOG(INFO) << "transformed variable size: " VLOG(4) << "transformed variable size: " << block_desc.Proto()->vars().size();
<< block_desc.Proto()->vars().size();
// copy ops. // copy ops.
for (auto *node : block_node->subgraph) { for (auto *node : block_node->subgraph) {
...@@ -258,7 +260,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { ...@@ -258,7 +260,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_inference_analysis_graphviz_log_root, FLAGS_IA_graphviz_log_root,
"data_flow_graph_to_fluid_graphviz_debugger")); "data_flow_graph_to_fluid_graphviz_debugger"));
} }
......
...@@ -27,9 +27,6 @@ ...@@ -27,9 +27,6 @@
namespace paddle { namespace paddle {
namespace inference { namespace inference {
DECLARE_int32(tensorrt_max_batchsize);
DECLARE_int32(tensorrt_workspace_size);
namespace analysis { namespace analysis {
class DataFlowGraphToFluidPass final : public DataFlowGraphPass { class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
public: public:
......
...@@ -29,7 +29,7 @@ void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) { ...@@ -29,7 +29,7 @@ void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {
auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png"; auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png";
std::string message; std::string message;
LOG(INFO) << "draw to " << png_path; VLOG(3) << "draw to " << png_path;
ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message); ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message);
} }
......
...@@ -52,72 +52,7 @@ bool FluidToDataFlowGraphPass::Finalize() { return true; } ...@@ -52,72 +52,7 @@ bool FluidToDataFlowGraphPass::Finalize() { return true; }
void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
PADDLE_ENFORCE(graph); PADDLE_ENFORCE(graph);
PADDLE_ENFORCE(desc_); PADDLE_ENFORCE(desc_);
// insert vars graph->Build(*desc_);
  // The `var2id` keeps a map from a variable's name to its Node-id; the
  // Node-id keeps updating to its latest alias during graph-building.
std::unordered_map<std::string, size_t> var2id;
auto &main_block = desc_->blocks(framework::kRootBlockIndex);
for (int i = 0; i < main_block.vars_size(); i++) {
const auto &var = main_block.vars(i);
auto *v = graph->nodes.Create(Node::Type::kValue);
v->SetName(var.name());
v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var)));
v->SetPbMsg(var.SerializeAsString());
var2id[var.name()] = v->id();
}
  // A variable in SSA form can only be written once, so if a variable is
  // written multiple times (quite common in our ProgramDesc design), multiple
  // alias Nodes of this variable will be created, and each will be written
  // just once. A set keeps the names of all the variables (the originals, not
  // the aliases) that have been written (as outputs). Once an Op's output
  // variable hits the set, a new alias is created and the global alias for
  // this variable is updated. That is what makes the Data Flow Graph an SSA.
std::unordered_set<Node *> unique_written_vars;
for (int i = 0; i < main_block.ops_size(); i++) {
const auto &op = main_block.ops(i);
auto *o = graph->nodes.Create(Node::Type::kFunction);
o->SetName(op.type());
static_cast<Function *>(o)->SetFuncType(op.type());
    // Link to the original protobuf message's memory, which makes it easier
    // to generate a fluid ProgramDesc back from a data flow graph.
o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op)));
o->SetPbMsg(op.SerializeAsString());
// set inputs and outputs
for (int j = 0; j < op.inputs_size(); j++) {
auto &in_var = op.inputs(j);
for (int k = 0; k < in_var.arguments_size(); k++) {
auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
in->outlinks.push_back(o);
o->inlinks.push_back(in);
}
}
for (int j = 0; j < op.outputs_size(); j++) {
auto &out_var = op.outputs(j);
for (int k = 0; k < out_var.arguments_size(); k++) {
auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
if (unique_written_vars.count(out)) {
// Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
auto *out_alias = graph->nodes.Create(Node::Type::kValue);
out_alias->SetName(out->name());
out_alias->SetPbDesc(out->pb_desc());
out_alias->SetPbMsg(out->pb_msg());
var2id[out_alias->name()] =
out_alias->id(); // update variable's alias Node
LOG(INFO) << "loop found in graph, create SSA alias node ["
<< out_alias->repr() << "] for [" << out->repr() << "]";
out = out_alias;
}
out->inlinks.push_back(o);
o->outlinks.push_back(out);
unique_written_vars.insert(out);
}
}
}
  // Analyze and extract the inputs and outputs of this graph.
graph->Build();
} }
namespace { namespace {
...@@ -133,7 +68,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { ...@@ -133,7 +68,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const { Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_inference_analysis_graphviz_log_root, "fluid-to-dfg-debuger")); FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger"));
} }
} // namespace analysis } // namespace analysis
......
...@@ -30,7 +30,7 @@ TEST(FluidToDataFlowGraphPass, Test) { ...@@ -30,7 +30,7 @@ TEST(FluidToDataFlowGraphPass, Test) {
ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL); ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL);
pass.Finalize(); pass.Finalize();
ASSERT_FALSE(argument.main_dfg->DotString().empty()); ASSERT_FALSE(argument.main_dfg->DotString().empty());
EXPECT_FALSE(argument.main_dfg->inputs.empty()); EXPECT_FALSE(argument.main_dfg->inputs().empty());
} }
} // namespace analysis } // namespace analysis
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle {
namespace inference {
namespace analysis {
class FluidToIrPass final : public DataFlowGraphPass {
public:
FluidToIrPass() = default;
bool Initialize(Argument *argument) override {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
if (argument->origin_program_desc) {
LOG(WARNING) << "argument's origin_program_desc is already set, might "
"duplicate called";
}
// set fluid model program path
if (!argument->fluid_model_program_path) {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir);
argument->fluid_model_program_path.reset(
new std::string(*argument->fluid_model_dir + "/__model__"));
}
ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
// Load program.
auto program = LoadProgramDesc(*argument->fluid_model_program_path);
argument->origin_program_desc.reset(
new framework::proto::ProgramDesc(program));
// Create main data flow graph.
if (!argument->main_dfg) {
argument->main_dfg.reset(new DataFlowGraph);
}
    // Persist the ProgramDesc in the graph's attribute. The IR graph just
    // keeps the address, and will segfault if the original ProgramDesc is
    // destroyed.
auto &ir_program_p = argument->main_dfg->Attr("ir_program_desc").Pointer();
ir_program_p = new framework::ProgramDesc(program);
argument_ = argument;
return true;
}
bool Finalize() override { return true; }
void Run(DataFlowGraph *graph) override {
// Call all the IR Passes
IRPassManager ir_passes(*static_cast<framework::ProgramDesc *>(
argument_->main_dfg->Attr("ir_program_desc").Pointer()));
ir_passes.Apply(std::vector<std::string>(
{// Manual update the passes here.
"graph_viz_pass", "infer_clean_graph_pass", "graph_viz_pass",
"fc_fuse_pass", "graph_viz_pass"}));
PADDLE_ENFORCE(argument_->main_dfg.get());
argument_->main_dfg->Build(ir_passes.graph());
// PADDLE_ENFORCE(argument_->main_dfg->IsFullyConnected());
}
std::string repr() const override { return "fluid-to-ir-pass"; }
private:
Argument *argument_{nullptr};
};
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/ut_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
TEST(FluidToIrPass, Test) {
FluidToIrPass pass;
Argument argument(FLAGS_inference_model_dir);
pass.Initialize(&argument);
pass.Run(argument.main_dfg.get());
}
} // namespace analysis
} // namespace inference
} // namespace paddle
USE_PASS(fc_fuse_pass);
USE_PASS(graph_viz_pass);
USE_PASS(infer_clean_graph_pass);
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <sys/stat.h>
#include <cstdio> #include <cstdio>
#include <fstream> #include <fstream>
#include <string> #include <string>
...@@ -151,6 +152,23 @@ static framework::proto::ProgramDesc LoadProgramDesc( ...@@ -151,6 +152,23 @@ static framework::proto::ProgramDesc LoadProgramDesc(
return program_desc; return program_desc;
} }
static bool FileExists(const std::string &filepath) {
std::ifstream file(filepath);
bool exists = file.is_open();
file.close();
return exists;
}
static bool PathExists(const std::string &path) {
struct stat statbuf;
if (stat(path.c_str(), &statbuf) != -1) {
if (S_ISDIR(statbuf.st_mode)) {
return true;
}
}
return false;
}
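// A hypothetical caller of the two helpers above, e.g. validating a model
// directory before loading; the "__model__" file name follows the convention
// used by LoadProgramDesc elsewhere in this file:
static bool ModelDirReady(const std::string &model_dir) {
  return PathExists(model_dir) && FileExists(model_dir + "/__model__");
}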
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include <string>
namespace paddle {
namespace inference {
namespace analysis {
IRPassManager::IRPassManager(const ProgramDesc& program) {
graph_.reset(new framework::ir::Graph(program));
}
void IRPassManager::Apply(const std::vector<std::string>& passes) {
graph_->Set("graph_viz_path", new std::string("./1.dot"));
// Apply all the passes
std::string pre_pass;
for (const std::string& pass_name : passes) {
LOG(WARNING) << "Running IR pass [" << pass_name << "]";
auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
if (pass_name == "graph_viz_pass") {
std::string dot_file_path =
"ir_" + (pre_pass.empty() ? "origin" : pre_pass) + ".dot";
pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
}
graph_ = pass->Apply(std::move(graph_));
pre_pass = pass_name;
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
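// A hypothetical driver for IRPassManager, mirroring the pass list applied by
// FluidToIrPass above; it assumes the named passes have been registered via
// USE_PASS, as done at the end of the tests in this commit.
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"

void RunInferenceIRPasses(const paddle::framework::ProgramDesc &program) {
  paddle::inference::analysis::IRPassManager manager(program);
  manager.Apply({"infer_clean_graph_pass", "fc_fuse_pass"});
}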
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

/*
 * This file defines IRPassManager, which helps control the passes in IR. The
 * inference phase will load the model program and parameters from disk, which
 * is quite different from the training phase.
 * This manager controls the Passes and makes the passes in IR work smoothly
 * for inference.
 */

#include <memory>
#include <string>
#include <vector>

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace inference {
namespace analysis {
using framework::ProgramDesc;
class IRPassManager final {
public:
IRPassManager(const ProgramDesc& program);
void Apply(const std::vector<std::string>& passes);
framework::ir::Graph& graph() const { return *graph_; }
private:
std::unique_ptr<framework::ir::Graph> graph_;
};
} // namespace analysis
} // namespace inference
} // namespace paddle
...@@ -35,19 +35,21 @@ void ModelStorePass::Run(DataFlowGraph *x) { ...@@ -35,19 +35,21 @@ void ModelStorePass::Run(DataFlowGraph *x) {
std::stringstream ss; std::stringstream ss;
  // NOTE these commands only work on Linux. // NOTE these commands only work on Linux.
ss << "mkdir -p " << *argument_->model_output_store_path; ss << "mkdir -p " << *argument_->model_output_store_path;
LOG(INFO) << "run command: " << ss.str(); VLOG(3) << "run command: " << ss.str();
PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
ss.str(""); ss.str("");
ss << "cp " << *argument_->fluid_model_dir << "/*" ss << "cp " << *argument_->fluid_model_dir << "/*"
<< " " << *argument_->model_output_store_path; << " " << *argument_->model_output_store_path;
LOG(INFO) << "run command: " << ss.str(); VLOG(3) << "run command: " << ss.str();
PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
// Store program // Store program
PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc, PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc,
"program desc is not transformed, should call " "program desc is not transformed, should call "
"DataFlowGraphToFluidPass first."); "DataFlowGraphToFluidPass first.");
VLOG(3) << "store analyzed program to "
<< *argument_->model_output_store_path;
const std::string program_output_path = const std::string program_output_path =
*argument_->model_output_store_path + "/__model__"; *argument_->model_output_store_path + "/__model__";
std::ofstream file(program_output_path, std::ios::binary); std::ofstream file(program_output_path, std::ios::binary);
...@@ -58,6 +60,8 @@ void ModelStorePass::Run(DataFlowGraph *x) { ...@@ -58,6 +60,8 @@ void ModelStorePass::Run(DataFlowGraph *x) {
file.write(serialized_message.c_str(), serialized_message.size()); file.write(serialized_message.c_str(), serialized_message.size());
} }
bool ModelStorePass::Finalize() { return true; }
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -44,6 +44,8 @@ class ModelStorePass : public DataFlowGraphPass { ...@@ -44,6 +44,8 @@ class ModelStorePass : public DataFlowGraphPass {
model in the disk, and that model can be reloaded for prediction again.)DD"; model in the disk, and that model can be reloaded for prediction again.)DD";
} }
bool Finalize() override;
private: private:
Argument* argument_{nullptr}; Argument* argument_{nullptr};
}; };
......
...@@ -30,7 +30,7 @@ TEST(DFG_StorePass, test) { ...@@ -30,7 +30,7 @@ TEST(DFG_StorePass, test) {
argument.model_output_store_path.reset( argument.model_output_store_path.reset(
new std::string("./_dfg_store_pass_tmp")); new std::string("./_dfg_store_pass_tmp"));
  // disable storage in the analyzer // disable storage in the analyzer
FLAGS_inference_analysis_output_storage_path = ""; FLAGS_IA_output_storage_path = "";
analyzer.Run(&argument); analyzer.Run(&argument);
ModelStorePass pass; ModelStorePass pass;
......
...@@ -20,17 +20,6 @@ namespace paddle { ...@@ -20,17 +20,6 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
template <>
std::string &NodeAttr::As<std::string>() {
if (data_.empty()) {
type_index_ = std::type_index(typeid(std::string));
}
PADDLE_ENFORCE_EQ(type_index_, std::type_index(typeid(std::string)));
return data_;
}
std::string &NodeAttr::String() { return As<std::string>(); }
std::vector<Dot::Attr> Value::dot_attrs() const { std::vector<Dot::Attr> Value::dot_attrs() const {
return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"), return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
Dot::Attr("shape", "box"), Dot::Attr("shape", "box"),
......
...@@ -29,6 +29,7 @@ limitations under the License. */ ...@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/inference/analysis/device.h" #include "paddle/fluid/inference/analysis/device.h"
#include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/dot.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -37,41 +38,36 @@ namespace analysis { ...@@ -37,41 +38,36 @@ namespace analysis {
class NodeMap; class NodeMap;
// A helper class to maintain the status from Pass. // A helper class to maintain the status from Pass.
struct NodeAttr { struct AnyAttr {
using any_t =
boost::variant<bool, float, int32_t, int64_t, void *, std::string>;
  // NOTE T should be a primitive type or a struct combined of several // NOTE T should be a primitive type or a struct combined of several
  // primitive types. // primitive types.
  // NOTE STL containers should not be used here. // NOTE STL containers should not be used here.
// Some usages // Some usages
// Attr attr; // Attr attr;
// attr.Bool() = true; // attr.Bool() = true;
bool &Bool() { return As<bool>(); } bool &Bool() { return As<bool>(); }
float &Float() { return As<float>(); } float &Float() { return As<float>(); }
int32_t &Int32() { return As<int32_t>(); } int32_t &Int32() { return As<int32_t>(); }
int64_t &Int64() { return As<int64_t>(); } int64_t &Int64() { return As<int64_t>(); }
void *&Pointer() { return As<void *>(); } void *&Pointer() { return As<void *>(); }
std::string &String(); std::string &String() { return As<std::string>(); }
private:
template <typename T> template <typename T>
T &As() { T &As() {
// init storage in the first usage. if (type_index_ == typeid(AnyAttr)) {
if (data_.empty()) { type_index_ = typeid(T);
VLOG(4) << "resize data to " << sizeof(T); any_data_ = T();
type_index_ = std::type_index(typeid(T)); } else {
data_.resize(sizeof(T)); PADDLE_ENFORCE(type_index_ == typeid(T), "fetch error type");
} }
PADDLE_ENFORCE(framework::IsType<T>(type_index_), return boost::get<T>(any_data_);
"type not matched, origin is %s, want %s",
DataTypeNamer::Global().repr(type_index_),
DataTypeNamer::Global().repr<T>());
PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
return *reinterpret_cast<T *>(&data_[0]);
} }
private: private:
std::string data_; any_t any_data_;
std::type_index type_index_{typeid(NodeAttr)}; std::type_index type_index_{typeid(AnyAttr)};
}; };
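// The semantics of As<T>() above in isolation: the first access latches the
// stored type, and a later access with a different type trips the check. A
// toy analogue (assuming Boost.Variant, which this header already uses):
#include <boost/variant.hpp>
#include <cassert>
#include <string>
#include <typeindex>

struct ToyAttr {
  template <typename T>
  T &As() {
    if (type_ == std::type_index(typeid(ToyAttr))) {  // first use: latch T
      type_ = std::type_index(typeid(T));
      data_ = T();
    }
    assert(type_ == std::type_index(typeid(T)) && "fetch error type");
    return boost::get<T>(data_);
  }
  boost::variant<bool, int, std::string> data_;
  std::type_index type_{typeid(ToyAttr)};
};
// Usage: ToyAttr a; a.As<int>() = 42;  // a later a.As<bool>() would assert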
/* /*
...@@ -108,7 +104,7 @@ class Node { ...@@ -108,7 +104,7 @@ class Node {
// Get an additional attribute and convert it to T data type. NOTE this will // Get an additional attribute and convert it to T data type. NOTE this will
// silently create a new attribute if not exists. // silently create a new attribute if not exists.
NodeAttr &attr(const std::string &name) const { return attrs_[name]; } AnyAttr &attr(const std::string &name) const { return attrs_[name]; }
int id() const { return id_; } int id() const { return id_; }
...@@ -153,7 +149,7 @@ class Node { ...@@ -153,7 +149,7 @@ class Node {
Type type_{Type::kNone}; Type type_{Type::kNone};
// Mark this node is deleted by some pass. // Mark this node is deleted by some pass.
bool deleted_{false}; bool deleted_{false};
mutable std::unordered_map<std::string, NodeAttr> attrs_; mutable std::unordered_map<std::string, AnyAttr> attrs_;
}; };
class Function; class Function;
......
...@@ -20,6 +20,24 @@ namespace paddle { ...@@ -20,6 +20,24 @@ namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
TEST(NodeAttr, bool) {
AnyAttr x;
x.Bool() = true;
ASSERT_EQ(x.Bool(), true);
}
TEST(NodeAttr, int32) {
AnyAttr x;
x.Int32() = 32;
ASSERT_EQ(x.Int32(), 32);
}
TEST(NodeAttr, string) {
AnyAttr x;
x.String() = "Hello";
ASSERT_EQ(x.String(), "Hello");
}
TEST(Node, Attr) { TEST(Node, Attr) {
// Node is an abstract class, use Value instead for they share the same Attr // Node is an abstract class, use Value instead for they share the same Attr
// logic. // logic.
...@@ -27,6 +45,9 @@ TEST(Node, Attr) { ...@@ -27,6 +45,9 @@ TEST(Node, Attr) {
auto* node = nodes.Create(Node::Type::kValue); auto* node = nodes.Create(Node::Type::kValue);
node->attr("v0").Int32() = 2008; node->attr("v0").Int32() = 2008;
ASSERT_EQ(node->attr("v0").Int32(), 2008); ASSERT_EQ(node->attr("v0").Int32(), 2008);
node->attr("str").String() = "hello world";
ASSERT_EQ(node->attr("str").String(), "hello world");
} }
} // namespace analysis } // namespace analysis
......
...@@ -63,7 +63,7 @@ class Pass { ...@@ -63,7 +63,7 @@ class Pass {
// Human-readable short representation. // Human-readable short representation.
virtual std::string repr() const = 0; virtual std::string repr() const = 0;
// Human-readable long description. // Human-readable long description.
virtual std::string description() const = 0; virtual std::string description() const { return "No DOC"; }
}; };
// NodePass process on any Node types. // NodePass process on any Node types.
......
...@@ -22,7 +22,7 @@ namespace analysis { ...@@ -22,7 +22,7 @@ namespace analysis {
bool PassManager::Initialize(Argument* argument) { bool PassManager::Initialize(Argument* argument) {
argument_ = argument; argument_ = argument;
for (auto& pass : data_) { for (auto& pass : data_) {
LOG(INFO) << "Initializing pass " << pass->repr(); LOG(WARNING) << "Initializing pass [" << pass->repr() << "]";
if (!pass->Initialize(argument)) { if (!pass->Initialize(argument)) {
LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
return false; return false;
...@@ -33,8 +33,9 @@ bool PassManager::Initialize(Argument* argument) { ...@@ -33,8 +33,9 @@ bool PassManager::Initialize(Argument* argument) {
void DfgPassManager::RunAll() { void DfgPassManager::RunAll() {
PADDLE_ENFORCE(argument_); PADDLE_ENFORCE(argument_);
LOG(INFO) << "Total " << data_.size() << " passes";
for (auto& pass : data_) { for (auto& pass : data_) {
VLOG(4) << "Running pass [" << pass->repr() << "]"; LOG(WARNING) << "Running pass [" << pass->repr() << "]";
pass->Run(argument_->main_dfg.get()); pass->Run(argument_->main_dfg.get());
} }
} }
...@@ -42,8 +43,7 @@ void DfgPassManager::RunAll() { ...@@ -42,8 +43,7 @@ void DfgPassManager::RunAll() {
void NodePassManager::RunAll() { void NodePassManager::RunAll() {
PADDLE_ENFORCE(argument_); PADDLE_ENFORCE(argument_);
PADDLE_ENFORCE(argument_->main_dfg.get()); PADDLE_ENFORCE(argument_->main_dfg.get());
auto trait = auto trait = GraphTraits<DataFlowGraph>(*argument_->main_dfg).nodes_in_DFS();
GraphTraits<DataFlowGraph>(argument_->main_dfg.get()).nodes_in_DFS();
for (auto& node : trait) { for (auto& node : trait) {
for (auto& pass : data_) { for (auto& pass : data_) {
pass->Run(&node); pass->Run(&node);
......
...@@ -34,7 +34,7 @@ inline void MarkOutLinksInSubGraph(const Function *func) { ...@@ -34,7 +34,7 @@ inline void MarkOutLinksInSubGraph(const Function *func) {
} }
void SubGraphSplitter::MarkNodesInsideSubGraph() { void SubGraphSplitter::MarkNodesInsideSubGraph() {
for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) { for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes()) {
if (node_inside_subgraph_teller_(&node)) { if (node_inside_subgraph_teller_(&node)) {
node.attr(kMarkerAttrName).Bool() = true; node.attr(kMarkerAttrName).Bool() = true;
if (node.type() == Node::Type::kFunction) { if (node.type() == Node::Type::kFunction) {
...@@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { ...@@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() { std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
std::vector<Node *> marked_nodes; std::vector<Node *> marked_nodes;
for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes_in_TS()) { for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes_in_TS()) {
if (node.attr(kMarkerAttrName).Bool()) { if (node.attr(kMarkerAttrName).Bool()) {
marked_nodes.push_back(&node); marked_nodes.push_back(&node);
} }
...@@ -153,6 +153,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { ...@@ -153,6 +153,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() {
inlink_or_outlink_cleaner(o->inlinks); inlink_or_outlink_cleaner(o->inlinks);
} }
} }
FilterRedundantOutputOfSubGraph(graph_);
} }
} // namespace analysis } // namespace analysis
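// UnionFindCombine above merges the marked nodes into connected subgraphs;
// the disjoint-set idea behind it, as a freestanding sketch:
#include <numeric>
#include <vector>

struct DisjointSet {
  std::vector<size_t> parent;
  explicit DisjointSet(size_t n) : parent(n) {
    std::iota(parent.begin(), parent.end(), 0);  // every node is its own root
  }
  size_t Find(size_t x) {
    while (parent[x] != x) x = parent[x] = parent[parent[x]];  // path halving
    return x;
  }
  void Union(size_t a, size_t b) { parent[Find(a)] = Find(b); }
};
// Nodes that share a root after all Union calls belong to the same subgraph.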
......
...@@ -69,8 +69,8 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass { ...@@ -69,8 +69,8 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass {
}; };
Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const { Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
DFG_GraphvizDrawPass::Config config( DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root,
FLAGS_inference_analysis_graphviz_log_root, "tensorrt_marked_node"); "tensorrt_marked_node");
return new DfgDebuggerPass(config); return new DfgDebuggerPass(config);
} }
bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; } bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; }
......
...@@ -18,7 +18,10 @@ if(APPLE) ...@@ -18,7 +18,10 @@ if(APPLE)
endif(APPLE) endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api) set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager
graph_viz_pass fc_fuse_pass
infer_clean_graph_pass
)
if(WITH_GPU AND TENSORRT_FOUND) if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
...@@ -62,17 +65,20 @@ endif() ...@@ -62,17 +65,20 @@ endif()
if (WITH_ANAKIN AND WITH_GPU) # only needed in CI if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
# compile the libinference_anakin_api.a and anakin.so. # compile the libinference_anakin_api.a and anakin.so.
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml)
#nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin) cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
function(anakin_target target_name) function(anakin_target target_name)
target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endfunction() endfunction()
anakin_target(inference_anakin_api) anakin_target(inference_anakin_api)
#anakin_target(inference_anakin_api_shared) anakin_target(inference_anakin_api_shared)
if (WITH_TESTING) if (WITH_TESTING)
cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api dynload_cuda SERIAL) DEPS inference_anakin_api_shared dynload_cuda SERIAL)
target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc
ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
--datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt
DEPS inference_anakin_api_shared dynload_cuda SERIAL)
endif(WITH_TESTING) endif(WITH_TESTING)
endif() endif()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
...@@ -13,9 +13,22 @@ ...@@ -13,9 +13,22 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/api/api_anakin_engine.h" #include "paddle/fluid/inference/api/api_anakin_engine.h"
#ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
#endif
#include <mkl_service.h>
#include <omp.h>
#include <map>
#include <string>
#include <utility>
#include <vector> #include <vector>
#include "framework/core/net/net.h"
#include "framework/operators/ops.h"
#include "saber/funcs/timer.h"
namespace paddle { namespace paddle {
template <typename Target> template <typename Target>
...@@ -23,16 +36,24 @@ PaddleInferenceAnakinPredictor<Target>::PaddleInferenceAnakinPredictor( ...@@ -23,16 +36,24 @@ PaddleInferenceAnakinPredictor<Target>::PaddleInferenceAnakinPredictor(
const AnakinConfig &config) { const AnakinConfig &config) {
CHECK(Init(config)); CHECK(Init(config));
} }
template <>
PaddleInferenceAnakinPredictor<anakin::X86>::PaddleInferenceAnakinPredictor(
const AnakinConfig &config) {
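// Pin OpenMP and MKL to a single thread for the CPU predictor. This
// avoids thread oversubscription when several predictors run in the same
// process; callers that want intra-op parallelism would need to raise
// these limits themselves.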
omp_set_dynamic(0);
omp_set_num_threads(1);
mkl_set_num_threads(1);
CHECK(Init(config));
}
template <typename Target> template <typename Target>
bool PaddleInferenceAnakinPredictor<Target>::Init(const AnakinConfig &config) { bool PaddleInferenceAnakinPredictor<Target>::Init(const AnakinConfig &config) {
if (!(graph_.load(config.model_file))) { if (!(graph_.load(config.model_file))) {
LOG(FATAL) << "fail to load graph from " << config.model_file; VLOG(3) << "fail to load graph from " << config.model_file;
return false; return false;
} }
auto inputs = graph_.get_ins(); auto inputs = graph_.get_ins();
for (auto &input_str : inputs) { for (auto &input_str : inputs) {
graph_.ResetBatchSize(input_str, config.max_batch_size); graph_.ResetBatchSize(input_str, config.max_batch_size);
max_batch_size_ = config.max_batch_size;
} }
// optimization for graph // optimization for graph
if (!(graph_.Optimize())) { if (!(graph_.Optimize())) {
...@@ -52,14 +73,14 @@ bool PaddleInferenceAnakinPredictor<Target>::Run( ...@@ -52,14 +73,14 @@ bool PaddleInferenceAnakinPredictor<Target>::Run(
std::vector<PaddleTensor> *output_data, int batch_size) { std::vector<PaddleTensor> *output_data, int batch_size) {
for (const auto &input : inputs) { for (const auto &input : inputs) {
if (input.dtype != PaddleDType::FLOAT32) { if (input.dtype != PaddleDType::FLOAT32) {
LOG(ERROR) << "Only support float type inputs. " << input.name VLOG(3) << "Only support float type inputs. " << input.name
<< "'s type is not float"; << "'s type is not float";
return false; return false;
} }
auto d_tensor_in_p = executor_p_->get_in(input.name); auto d_tensor_in_p = executor_p_->get_in(input.name);
auto net_shape = d_tensor_in_p->valid_shape(); auto net_shape = d_tensor_in_p->shape();
if (net_shape.size() != input.shape.size()) { if (net_shape.size() != input.shape.size()) {
LOG(ERROR) << " input " << input.name VLOG(3) << " input " << input.name
<< "'s shape size should be equal to that of net"; << "'s shape size should be equal to that of net";
return false; return false;
} }
...@@ -79,21 +100,45 @@ bool PaddleInferenceAnakinPredictor<Target>::Run( ...@@ -79,21 +100,45 @@ bool PaddleInferenceAnakinPredictor<Target>::Run(
} }
d_tensor_in_p->reshape(tmp_shape); d_tensor_in_p->reshape(tmp_shape);
if (input.lod.size() > 0) {
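// Anakin accepts only a single LoD level: a flat offset vector such as
// {0, len0, len0 + len1, ...} marking where each sequence in the batch
// starts and ends.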
if (input.lod.size() > 1) {
VLOG(3) << " input lod first dim should <=1, but you set "
<< input.lod.size();
return false;
}
std::vector<int> offset(input.lod[0].begin(), input.lod[0].end());
d_tensor_in_p->set_seq_offset(offset);
VLOG(3) << "offset.size(): " << offset.size();
for (int i = 0; i < offset.size(); i++) {
VLOG(3) << offset[i];
}
}
float *d_data_p = d_tensor_in_p->mutable_data(); float *d_data_p = d_tensor_in_p->mutable_data();
#ifdef PADDLE_WITH_CUDA
if (std::is_same<anakin::NV, Target>::value) {
if (cudaMemcpy(d_data_p, static_cast<float *>(input.data.data()), if (cudaMemcpy(d_data_p, static_cast<float *>(input.data.data()),
d_tensor_in_p->valid_size() * sizeof(float), d_tensor_in_p->valid_size() * sizeof(float),
cudaMemcpyHostToDevice) != 0) { cudaMemcpyHostToDevice) != 0) {
LOG(ERROR) << "copy data from CPU to GPU error"; VLOG(3) << "copy data from CPU to GPU error";
return false; return false;
} }
cudaStreamSynchronize(NULL);
} }
#endif
if (std::is_same<anakin::X86, Target>::value) {
memcpy(d_data_p, static_cast<float *>(input.data.data()),
d_tensor_in_p->valid_size() * sizeof(float));
}
}
#ifdef PADDLE_WITH_CUDA
cudaDeviceSynchronize(); cudaDeviceSynchronize();
executor_p_->prediction(); executor_p_->prediction();
cudaDeviceSynchronize(); cudaDeviceSynchronize();
#else
// Without CUDA the predictor still has to run; otherwise CPU-only builds
// would copy inputs and outputs around a net that never executed.
executor_p_->prediction();
#endif
if (output_data->empty()) { if (output_data->empty()) {
LOG(ERROR) << "At least one output should be set with tensors' names."; VLOG(3) << "At least one output should be set with tensors' names.";
return false; return false;
} }
for (auto &output : *output_data) { for (auto &output : *output_data) {
...@@ -102,14 +147,22 @@ bool PaddleInferenceAnakinPredictor<Target>::Run( ...@@ -102,14 +147,22 @@ bool PaddleInferenceAnakinPredictor<Target>::Run(
if (output.data.length() < tensor->valid_size() * sizeof(float)) { if (output.data.length() < tensor->valid_size() * sizeof(float)) {
output.data.Resize(tensor->valid_size() * sizeof(float)); output.data.Resize(tensor->valid_size() * sizeof(float));
} }
#ifdef PADDLE_WITH_CUDA
if (std::is_same<anakin::NV, Target>::value) {
// Copy data from GPU -> CPU // Copy data from GPU -> CPU
if (cudaMemcpy(output.data.data(), tensor->mutable_data(), if (cudaMemcpy(output.data.data(), tensor->mutable_data(),
tensor->valid_size() * sizeof(float), tensor->valid_size() * sizeof(float),
cudaMemcpyDeviceToHost) != 0) { cudaMemcpyDeviceToHost) != 0) {
LOG(ERROR) << "copy data from GPU to CPU error"; VLOG(3) << "copy data from GPU to CPU error";
return false; return false;
} }
cudaStreamSynchronize(NULL); }
#endif
if (std::is_same<anakin::X86, Target>::value) {
memcpy(output.data.data(), tensor->mutable_data(),
tensor->valid_size() * sizeof(float));
}
} }
return true; return true;
} }
...@@ -132,7 +185,7 @@ PaddleInferenceAnakinPredictor<Target>::Clone() { ...@@ -132,7 +185,7 @@ PaddleInferenceAnakinPredictor<Target>::Clone() {
auto anakin_predictor_p = auto anakin_predictor_p =
dynamic_cast<PaddleInferenceAnakinPredictor<Target> *>(cls.get()); dynamic_cast<PaddleInferenceAnakinPredictor<Target> *>(cls.get());
if (!anakin_predictor_p) { if (!anakin_predictor_p) {
LOG(ERROR) << "fail to call Init"; VLOG(3) << "fail to call Init";
return nullptr; return nullptr;
} }
anakin_predictor_p->get_executer().init(graph_); anakin_predictor_p->get_executer().init(graph_);
...@@ -162,6 +215,44 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -162,6 +215,44 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
VLOG(3) << "Anakin Predictor create on unknown platform."; VLOG(3) << "Anakin Predictor create on unknown platform.";
return nullptr; return nullptr;
} }
}; }
#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER
template <typename Target>
using executor_t =
anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>;
template <typename Target>
void DisplayOpTimer(executor_t<Target> *net_executor, int epoch) {
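// Prints each op's average latency over `epoch` runs, then totals
// grouped by op_param.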
std::vector<float> op_time = net_executor->get_op_time();
auto exec_funcs = net_executor->get_exec_funcs();
auto op_param = net_executor->get_op_param();
for (int i = 0; i < op_time.size(); i++) {
LOG(INFO) << "name: " << exec_funcs[i].name
<< " op_type: " << exec_funcs[i].op_name
<< " op_param: " << op_param[i] << " time " << op_time[i] / epoch;
}
std::map<std::string, float> op_map;
for (int i = 0; i < op_time.size(); i++) {
auto it = op_map.find(op_param[i]);
if (it != op_map.end())
op_map[op_param[i]] += op_time[i];
else
op_map.insert(std::pair<std::string, float>(op_param[i], op_time[i]));
}
for (auto it = op_map.begin(); it != op_map.end(); ++it) {
LOG(INFO) << it->first << " " << (it->second) / epoch << " ms";
}
}
#endif
template <typename Target>
PaddleInferenceAnakinPredictor<Target>::~PaddleInferenceAnakinPredictor() {
#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER
DisplayOpTimer<Target>(executor_p_, max_batch_size_);
#endif
delete executor_p_;
executor_p_ = nullptr;
}
} // namespace paddle } // namespace paddle
...@@ -47,10 +47,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { ...@@ -47,10 +47,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>& anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
get_executer(); get_executer();
~PaddleInferenceAnakinPredictor() override { ~PaddleInferenceAnakinPredictor() override;
delete executor_p_;
executor_p_ = nullptr;
};
private: private:
bool Init(const AnakinConfig& config); bool Init(const AnakinConfig& config);
...@@ -60,6 +57,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { ...@@ -60,6 +57,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>* anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>*
executor_p_{nullptr}; executor_p_{nullptr};
AnakinConfig config_; AnakinConfig config_;
int max_batch_size_{0};
}; };
} // namespace paddle } // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include <sys/time.h>
#include <time.h>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <thread> // NOLINT
#include <vector>
#include "framework/core/net/net.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string(model, "", "Directory of the inference model.");
DEFINE_string(datapath, "", "Path of the dataset.");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
// A simple wall-clock timer for benchmarking.
class Timer {
public:
double start;
double startu;
void tic() {
struct timeval tp;
gettimeofday(&tp, NULL);
start = tp.tv_sec;
startu = tp.tv_usec;
}
double toc() {
struct timeval tp;
gettimeofday(&tp, NULL);
double used_time_ms =
(tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
return used_time_ms;
}
};
std::vector<std::string> string_split(std::string in_str,
std::string delimiter) {
std::vector<std::string> seq;
size_t found = in_str.find(delimiter);
size_t pre_found = std::string::npos;
while (found != std::string::npos) {
if (pre_found == std::string::npos) {
seq.push_back(in_str.substr(0, found));
} else {
seq.push_back(in_str.substr(pre_found + delimiter.length(),
found - delimiter.length() - pre_found));
}
pre_found = found;
found = in_str.find(delimiter, pre_found + delimiter.length());
}
// Tail: push the whole input if no delimiter was found, otherwise the
// remainder after the last delimiter.
if (pre_found == std::string::npos) {
seq.push_back(in_str);
} else {
seq.push_back(in_str.substr(pre_found + delimiter.length()));
}
return seq;
}
std::vector<std::string> string_split(
std::string in_str, std::vector<std::string>& delimiter) { // NOLINT
std::vector<std::string> in;
std::vector<std::string> out;
out.push_back(in_str);
for (auto del : delimiter) {
in = out;
out.clear();
for (auto s : in) {
auto out_s = string_split(s, del);
for (auto o : out_s) {
out.push_back(o);
}
}
}
return out;
}
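For orientation, here is a minimal sketch of how the two overloads compose on the kind of record parsed below; the sample string and the `demo_split` name are made up for illustration:

```cpp
// Sketch only: assumes the two string_split overloads defined above.
#include <string>
#include <vector>

int demo_split() {
  std::string line = "0.1,0.2|0.3,0.4";        // hypothetical record
  std::vector<std::string> delims = {"|"};
  auto links = string_split(line, delims);     // {"0.1,0.2", "0.3,0.4"}
  auto values = string_split(links[0], ",");   // {"0.1", "0.2"}
  return static_cast<int>(links.size() + values.size());
}
```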
class Data {
public:
Data(std::string file_name, int batch_size)
: _batch_size(batch_size), _total_length(0) {
_file.open(file_name);
_file.seekg(0, _file.end);
_total_length = _file.tellg();
_file.seekg(0, _file.beg);
}
void get_batch_data(std::vector<std::vector<float>>& fea, // NOLINT
std::vector<std::vector<float>>& week_fea, // NOLINT
std::vector<std::vector<float>>& time_fea, // NOLINT
std::vector<long unsigned int>& seq_offset); // NOLINT
private:
std::fstream _file;
int _total_length;
int _batch_size;
};
void Data::get_batch_data(
std::vector<std::vector<float>>& fea, // NOLINT
std::vector<std::vector<float>>& week_fea, // NOLINT
std::vector<std::vector<float>>& time_fea, // NOLINT
std::vector<long unsigned int>& seq_offset) { // NOLINT
int seq_num = 0;
long unsigned int cum = 0; // NOLINT
char buf[10000];
seq_offset.clear();
seq_offset.push_back(0);
fea.clear();
week_fea.clear();
time_fea.clear();
while (_file.getline(buf, 10000)) {
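// Each line is assumed to look like
//   f0,f1,...|f0,f1,...:t0,...:w0,...
// i.e. '|'-separated per-link feature vectors, then time features, then
// week features, with ':' between the three fields (a format inferred
// from the parsing below).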
std::string s = buf;
std::vector<std::string> deli_vec = {":"};
std::vector<std::string> data_vec = string_split(s, deli_vec);
std::vector<std::string> seq;
seq = string_split(data_vec[0], {"|"});
for (auto link : seq) {
std::vector<std::string> data = string_split(link, ",");
std::vector<float> vec;
for (int i = 0; i < data.size(); i++) {
vec.push_back(atof(data[i].c_str()));
}
fea.push_back(vec);
}
std::vector<std::string> week_data;
std::vector<std::string> time_data;
week_data = string_split(data_vec[2], ",");
std::vector<float> vec_w;
for (int i = 0; i < week_data.size(); i++) {
vec_w.push_back(atof(week_data[i].c_str()));
}
week_fea.push_back(vec_w);
time_data = string_split(data_vec[1], ",");
std::vector<float> vec_t;
for (int i = 0; i < time_data.size(); i++) {
vec_t.push_back(atof(time_data[i].c_str()));
}
time_fea.push_back(vec_t);
cum += seq.size();
seq_offset.push_back(cum);
seq_num++;
if (seq_num >= _batch_size) {
break;
}
}
}
namespace paddle {
AnakinConfig GetConfig() {
AnakinConfig config;
// Use AnakinConfig::X86 to run inference on the CPU.
config.target_type = AnakinConfig::X86;
config.model_file = FLAGS_model;
config.device = 0;
config.max_batch_size = 1000; // the max number of tokens
return config;
}
void set_tensor(std::string name, std::vector<int> shape,
std::vector<PaddleTensor>& vec) { // NOLINT
int sum = 1;
std::for_each(shape.begin(), shape.end(), [&](int n) { sum *= n; });
float* data = new float[sum];
PaddleTensor tensor;
tensor.name = name;
tensor.shape = shape;
// PaddleBuf's (data, length) constructor takes the length in bytes and
// does not take ownership of the buffer.
tensor.data = PaddleBuf(data, sum * sizeof(float));
tensor.dtype = PaddleDType::FLOAT32;
vec.push_back(tensor);
}
void single_test() {
AnakinConfig config = GetConfig();
auto predictor =
CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
int max_batch_size = 1000;
std::string feature_file = FLAGS_datapath;
Data map_data(feature_file, FLAGS_batch_size);
std::vector<std::vector<float>> fea;
std::vector<std::vector<float>> week_fea;
std::vector<std::vector<float>> time_fea;
std::vector<long unsigned int> seq_offset; // NOLINT
paddle::PaddleTensor tensor_0, tensor_1, tensor_2;
tensor_0.name = "input_0";
tensor_1.name = "input_4";
tensor_2.name = "input_5";
PaddleTensor tensor_out;
tensor_out.name = "final_output.tmp_1_gout";
tensor_out.shape = std::vector<int>({});
tensor_out.data = PaddleBuf();
tensor_out.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> inputs;
std::vector<PaddleTensor> outputs(1, tensor_out);
int data_0_dim = 38;
int data_1_dim = 10;
int data_2_dim = 10;
float data_0[max_batch_size * data_0_dim]; // NOLINT
float data_1[max_batch_size * data_1_dim]; // NOLINT
float data_2[max_batch_size * data_2_dim]; // NOLINT
int count = 0;
while (true) {
if (count++ > 0) break; // only run the first batch in ci.
seq_offset.clear();
map_data.get_batch_data(fea, week_fea, time_fea, seq_offset);
if (seq_offset.size() <= 1) {
LOG(FATAL) << "seq_offset.size() <= 1, exit.";
break;
}
std::vector<std::vector<long unsigned int>> seq_offset_vec; // NOLINT
seq_offset_vec.push_back(seq_offset);
tensor_0.lod = seq_offset_vec;
int p_shape_0[] = {(int)fea.size(), 1, 1, data_0_dim}; // NOLINT
int p_shape_1[] = {(int)week_fea.size(), data_1_dim, 1, 1}; // NOLINT
int p_shape_2[] = {(int)time_fea.size(), data_2_dim, 1, 1}; // NOLINT
std::vector<int> shape_0(p_shape_0, p_shape_0 + 4);
std::vector<int> shape_1(p_shape_1, p_shape_1 + 4);
std::vector<int> shape_2(p_shape_2, p_shape_2 + 4);
tensor_0.shape = shape_0;
tensor_1.shape = shape_1;
tensor_2.shape = shape_2;
for (int i = 0; i < fea.size(); i++) {
memcpy(data_0 + i * data_0_dim, &fea[i][0], sizeof(float) * data_0_dim);
}
for (int i = 0; i < week_fea.size(); i++) {
memcpy(data_1 + i * data_1_dim, &week_fea[i][0],
sizeof(float) * data_1_dim);
}
for (int i = 0; i < time_fea.size(); i++) {
memcpy(data_2 + i * data_2_dim, &time_fea[i][0],
sizeof(float) * data_2_dim);
}
tensor_0.data =
paddle::PaddleBuf(data_0, fea.size() * sizeof(float) * data_0_dim);
tensor_1.data =
paddle::PaddleBuf(data_1, week_fea.size() * sizeof(float) * data_1_dim);
tensor_2.data =
paddle::PaddleBuf(data_2, time_fea.size() * sizeof(float) * data_2_dim);
tensor_0.dtype = paddle::PaddleDType::FLOAT32;
tensor_1.dtype = paddle::PaddleDType::FLOAT32;
tensor_2.dtype = paddle::PaddleDType::FLOAT32;
inputs.clear();
inputs.push_back(tensor_1);
inputs.push_back(tensor_2);
inputs.push_back(tensor_0);
Timer timer;
timer.tic();
for (int i = 0; i < FLAGS_repeat; i++) predictor->Run(inputs, &outputs);
LOG(INFO) << "batch_size = " << FLAGS_batch_size
<< ", repeat = " << FLAGS_repeat
<< ", sequence_length = " << seq_offset[seq_offset.size() - 1]
<< ", latency: " << timer.toc() / FLAGS_repeat << "ms";
float* data_o = static_cast<float*>(outputs[0].data.data());
VLOG(3) << "outputs[0].data.length() = " << outputs[0].data.length();
// data.length() is in bytes, so convert it to an element count.
for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) {
VLOG(3) << "output[" << j << "]: " << data_o[j];
}
}
}
} // namespace paddle
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
logger::init(argv[0]);
paddle::single_test();
/* multi-threads
std::vector<std::thread> threads;
int num = 1;
for (int i = 0; i < num; i++) {
LOG(INFO) << " thread id : " << i;
threads.emplace_back(paddle::single_test);
}
for (int i = 0; i < num; i++) {
threads[i].join();
}
threads.clear();
*/
return 0;
}
...@@ -137,9 +137,12 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs, ...@@ -137,9 +137,12 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
return false; return false;
} }
for (size_t i = 0; i < feed_target_names_.size(); ++i) { for (size_t i = 0; i < feed_target_names_.size(); ++i) {
VLOG(4) << "setting " << i << "-th target"; if (config_.specify_input_name) {
feed_targets[inputs[i].name] = &feeds[i];
} else {
feed_targets[feed_target_names_[i]] = &feeds[i]; feed_targets[feed_target_names_[i]] = &feeds[i];
} }
}
// get fetch variable // get fetch variable
std::map<std::string, framework::LoDTensor *> fetch_targets; std::map<std::string, framework::LoDTensor *> fetch_targets;
std::vector<framework::LoDTensor> fetchs; std::vector<framework::LoDTensor> fetchs;
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/operators/tensorrt_engine_op.h" #include "paddle/fluid/operators/tensorrt_engine_op.h"
...@@ -32,7 +33,8 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { ...@@ -32,7 +33,8 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) { bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
VLOG(3) << "Predictor::init()"; VLOG(3) << "Predictor::init()";
FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
FLAGS_tensorrt_workspace_size = config_.workspace_size;
if (config_.use_gpu) { if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device);
} else { } else {
...@@ -150,3 +152,12 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>( ...@@ -150,3 +152,12 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
} }
} // namespace paddle } // namespace paddle
USE_TRT_CONVERTER(elementwise_add_weight);
USE_TRT_CONVERTER(mul);
USE_TRT_CONVERTER(conv2d);
USE_TRT_CONVERTER(relu);
USE_TRT_CONVERTER(fc);
USE_TRT_CONVERTER(pool2d);
USE_TRT_CONVERTER(softmax);
USE_TRT_CONVERTER(batch_norm);
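// These USE_TRT_CONVERTER declarations force the linker to keep each
// converter's static registration alive when linking the subgraph engine.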
...@@ -23,7 +23,7 @@ namespace paddle { ...@@ -23,7 +23,7 @@ namespace paddle {
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
void CompareTensorRTWithFluid(bool enable_tensorrt) { void CompareTensorRTWithFluid(bool enable_tensorrt) {
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt; FLAGS_IA_enable_tensorrt_subgraph_engine = enable_tensorrt;
//# 1. Create PaddlePredictor with a config. //# 1. Create PaddlePredictor with a config.
NativeConfig config0; NativeConfig config0;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <sys/time.h>
#include <algorithm>
#include <numeric>  // std::accumulate, used by TensorAssignData below
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
namespace inference {
// A simple wall-clock timer for benchmarking.
class Timer {
public:
double start;
double startu;
void tic() {
struct timeval tp;
gettimeofday(&tp, NULL);
start = tp.tv_sec;
startu = tp.tv_usec;
}
double toc() {
struct timeval tp;
gettimeofday(&tp, NULL);
double used_time_ms =
(tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
return used_time_ms;
}
};
void split(const std::string &str, char sep, std::vector<std::string> *pieces) {
pieces->clear();
if (str.empty()) {
return;
}
size_t pos = 0;
size_t next = str.find(sep, pos);
while (next != std::string::npos) {
pieces->push_back(str.substr(pos, next - pos));
pos = next + 1;
next = str.find(sep, pos);
}
if (!str.substr(pos).empty()) {
pieces->push_back(str.substr(pos));
}
}
void split_to_float(const std::string &str, char sep, std::vector<float> *fs) {
std::vector<std::string> pieces;
split(str, sep, &pieces);
std::transform(pieces.begin(), pieces.end(), std::back_inserter(*fs),
[](const std::string &v) { return std::stof(v); });
}
template <typename T>
std::string to_string(const std::vector<T> &vec) {
std::stringstream ss;
for (const auto &c : vec) {
ss << c << " ";
}
return ss.str();
}
template <>
std::string to_string<std::vector<float>>(
const std::vector<std::vector<float>> &vec) {
std::stringstream ss;
for (const auto &piece : vec) {
ss << to_string(piece) << "\n";
}
return ss.str();
}
template <>
std::string to_string<std::vector<std::vector<float>>>(
const std::vector<std::vector<std::vector<float>>> &vec) {
std::stringstream ss;
for (const auto &line : vec) {
for (const auto &rcd : line) {
ss << to_string(rcd) << ";\t";
}
ss << '\n';
}
return ss.str();
}
// clang-format off
void TensorAssignData(PaddleTensor *tensor, const std::vector<std::vector<float>> &data) {
// Assign buffer
int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1, [](int a, int b) { return a * b; });
tensor->data.Resize(sizeof(float) * dim);
int c = 0;
for (const auto &f : data) {
for (float v : f) { static_cast<float *>(tensor->data.data())[c++] = v; }
}
}
} // namespace inference
} // namespace paddle
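Since this helper header is new, a short usage sketch may help; it builds only on the functions above, and the tensor name and shape are illustrative, not taken from this patch:

```cpp
// A minimal sketch using the helpers above (assumes this header is
// included); "my_input" and the 2x3 shape are illustrative only.
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void demo_helpers() {
  paddle::inference::Timer timer;
  timer.tic();

  std::vector<float> row;
  paddle::inference::split_to_float("0.1,0.2,0.3", ',', &row);  // {0.1f, 0.2f, 0.3f}

  paddle::PaddleTensor tensor;
  tensor.name = "my_input";
  tensor.shape = {2, 3};
  tensor.dtype = paddle::PaddleDType::FLOAT32;
  paddle::inference::TensorAssignData(&tensor, {row, row});  // fills 2x3 buffer

  double elapsed_ms = timer.toc();
  (void)elapsed_ms;
}
```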
...@@ -65,13 +65,13 @@ config.model_dir = "xxx"; ...@@ -65,13 +65,13 @@ config.model_dir = "xxx";
config.use_gpu = false; config.use_gpu = false;
// Create a native PaddlePredictor // Create a native PaddlePredictor
auto predictor = auto predictor =
paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); paddle::CreatePaddlePredictor<paddle::NativeConfig, paddle::PaddleEngineKind::kNative>(config);
// Create the input tensor // Create the input tensor
int64_t data[4] = {1, 2, 3, 4}; int64_t data[4] = {1, 2, 3, 4};
paddle::PaddleTensor tensor{.name = "", paddle::PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}), .shape = std::vector<int>({4, 1}),
.data = PaddleBuf(data, sizeof(data)), .data = paddle::PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64}; .dtype = paddle::PaddleDType::INT64};
// Create the output tensors; their memory can be reused // Create the output tensors; their memory can be reused
std::vector<paddle::PaddleTensor> outputs; std::vector<paddle::PaddleTensor> outputs;
// Run the prediction // Run the prediction
......
...@@ -45,7 +45,7 @@ class PaddleBuf { ...@@ -45,7 +45,7 @@ class PaddleBuf {
PaddleBuf(void* data, size_t length) PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {} : data_(data), length_(length), memory_owned_{false} {}
// Own memory. // Own memory.
explicit PaddleBuf(size_t length) PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {} : data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes. // Resize to `length` bytes.
void Resize(size_t length); void Resize(size_t length);
...@@ -70,7 +70,7 @@ struct PaddleTensor { ...@@ -70,7 +70,7 @@ struct PaddleTensor {
std::vector<int> shape; std::vector<int> shape;
PaddleBuf data; // blob of data. PaddleBuf data; // blob of data.
PaddleDType dtype; PaddleDType dtype;
std::vector<std::vector<uint64_t>> lod; // lod data std::vector<std::vector<size_t>> lod; // Tensor+LoD equals LoDTensor
}; };
enum class PaddleEngineKind { enum class PaddleEngineKind {
...@@ -120,6 +120,8 @@ struct NativeConfig : public PaddlePredictor::Config { ...@@ -120,6 +120,8 @@ struct NativeConfig : public PaddlePredictor::Config {
bool use_gpu{false}; bool use_gpu{false};
int device{0}; int device{0};
float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization. float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization.
// Specify the variable's name of each input.
bool specify_input_name{false};
std::string prog_file; std::string prog_file;
std::string param_file; std::string param_file;
...@@ -137,6 +139,14 @@ struct AnakinConfig : public PaddlePredictor::Config { ...@@ -137,6 +139,14 @@ struct AnakinConfig : public PaddlePredictor::Config {
struct TensorRTConfig : public NativeConfig { struct TensorRTConfig : public NativeConfig {
// Determine whether a subgraph will be executed by TRT. // Determine whether a subgraph will be executed by TRT.
int min_subgraph_size{1}; int min_subgraph_size{1};
// While TensorRT allows an engine optimized for a given max batch size
// to run at any smaller batch size, performance at those smaller sizes
// may not be as well-optimized. The max batch size should therefore
// match the expected runtime batch size as closely as possible.
int max_batch_size{1};
// For workspace_size, see:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
int workspace_size{1 << 30};
}; };
// A factory to help create different predictors. // A factory to help create different predictors.
......
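Putting the new `NativeConfig` and `TensorRTConfig` fields together, usage looks roughly like this; the paths and sizes are placeholders, not values from this patch:

```cpp
// A sketch of configuring the new fields (paths/sizes are placeholders).
paddle::TensorRTConfig config;
config.model_dir = "/path/to/model";   // placeholder
config.use_gpu = true;
config.device = 0;
config.fraction_of_gpu_memory = 0.15;
config.specify_input_name = true;      // feed inputs by PaddleTensor::name
config.max_batch_size = 8;             // match the expected runtime batch
config.workspace_size = 1 << 30;       // 1 GiB of TRT scratch space
auto predictor = paddle::CreatePaddlePredictor<
    paddle::TensorRTConfig, paddle::PaddleEngineKind::kAutoMixedTensorRT>(
    config);
```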
nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto) nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
add_subdirectory(convert) add_subdirectory(convert)
# Add TRT tests # Add TRT tests
nv_library(tensorrt_converter nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
activation_op.cc softmax_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry) DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS nv_test(test_op_converter SRCS test_op_converter.cc DEPS
...@@ -24,3 +24,6 @@ nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc ...@@ -24,3 +24,6 @@ nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <math.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class BatchNormOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
LOG(INFO) << "convert a fluid batch norm op to tensorrt batch_norm";
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1); // Bias is a weight
PADDLE_ENFORCE_EQ(op_desc.Input("Mean").size(), 1); // Mean is a weight
PADDLE_ENFORCE_EQ(op_desc.Input("Scale").size(), 1); // Scale is a weight
PADDLE_ENFORCE_EQ(op_desc.Input("Variance").size(),
1); // Variance is a weight
PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1);
auto* X = engine_->GetITensor(op_desc.Input("X").front());
// Declare weights
auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front());
auto* Mean_v = scope.FindVar(op_desc.Input("Mean").front());
auto* Scale_v = scope.FindVar(op_desc.Input("Scale").front());
auto* Variance_v = scope.FindVar(op_desc.Input("Variance").front());
const float eps = boost::get<float>(op_desc.GetAttr("epsilon"));
PADDLE_ENFORCE_NOT_NULL(Bias_v);
PADDLE_ENFORCE_NOT_NULL(Mean_v);
PADDLE_ENFORCE_NOT_NULL(Scale_v);
PADDLE_ENFORCE_NOT_NULL(Variance_v);
// get tensor
auto* Bias_t = Bias_v->GetMutable<framework::LoDTensor>();
auto* Mean_t = Mean_v->GetMutable<framework::LoDTensor>();
auto* Scale_t = Scale_v->GetMutable<framework::LoDTensor>();
auto* Variance_t = Variance_v->GetMutable<framework::LoDTensor>();
// create temp tensor for weights
framework::LoDTensor bias_tensor;
framework::LoDTensor mean_tensor;
framework::LoDTensor scale_tensor;
framework::LoDTensor variance_tensor;
bias_tensor.Resize(Bias_t->dims());
mean_tensor.Resize(Mean_t->dims());
scale_tensor.Resize(Scale_t->dims());
variance_tensor.Resize(Variance_t->dims());
platform::CPUPlace cpu_place;
// copy data from gpu to cpu
TensorCopySync((*Bias_t), cpu_place, &bias_tensor);
TensorCopySync((*Mean_t), cpu_place, &mean_tensor);
TensorCopySync((*Scale_t), cpu_place, &scale_tensor);
TensorCopySync((*Variance_t), cpu_place, &variance_tensor);
auto* bias_data = bias_tensor.mutable_data<float>(platform::CPUPlace());
auto* mean_data = mean_tensor.mutable_data<float>(platform::CPUPlace());
auto* scale_data = scale_tensor.mutable_data<float>(platform::CPUPlace());
auto* variance_data =
variance_tensor.mutable_data<float>(platform::CPUPlace());
std::unique_ptr<framework::LoDTensor> combile_scale_tensor(
new framework::LoDTensor());
std::unique_ptr<framework::LoDTensor> combile_bias_tensor(
new framework::LoDTensor());
combile_scale_tensor->Resize(scale_tensor.dims());
combile_bias_tensor->Resize(bias_tensor.dims());
auto* combile_scale_data =
combile_scale_tensor->mutable_data<float>(platform::CPUPlace());
auto* combile_bias_data =
combile_bias_tensor->mutable_data<float>(platform::CPUPlace());
size_t ele_num = combile_scale_tensor->memory_size() / sizeof(float);
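// Fold the four batch-norm parameters into one per-channel affine
// transform y = a * x + b, with a = scale / sqrt(variance + eps) and
// b = bias - mean * a, which maps directly onto TensorRT's IScaleLayer
// in kCHANNEL mode.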
for (size_t i = 0; i < ele_num; i++) {
float scale = scale_data[i];
float bias = bias_data[i];
float mean = mean_data[i];
float variance = variance_data[i];
combile_scale_data[i] = scale / sqrtf(variance + eps);
combile_bias_data[i] = bias - mean * combile_scale_data[i];
}
TensorRTEngine::Weight scale_weights{
nvinfer1::DataType::kFLOAT, static_cast<void*>(combile_scale_data),
combile_scale_tensor->memory_size() / sizeof(float)};
TensorRTEngine::Weight shift_weights{
nvinfer1::DataType::kFLOAT, static_cast<void*>(combile_bias_data),
combile_bias_tensor->memory_size() / sizeof(float)};
TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
0};
nvinfer1::IScaleLayer* layer =
TRT_ENGINE_ADD_LAYER(engine_, Scale, *const_cast<nvinfer1::ITensor*>(X),
nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(),
scale_weights.get(), power_weights.get());
auto output_name = op_desc.Output("Y").front();
engine_->weight_map[op_desc.Input("Bias").front()] =
std::move(combile_bias_tensor);
engine_->weight_map[op_desc.Input("Scale").front()] =
std::move(combile_scale_tensor);
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
engine_->DeclareOutput(output_name);
}
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(batch_norm, BatchNormOpConverter);
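The scale/bias folding in the converter above is the standard inference-time rewrite of batch normalization as a per-channel affine map, with Scale = γ, Bias = β, Mean = μ, Variance = σ²:

```latex
y = \gamma \, \frac{x - \mu}{\sqrt{\sigma^{2} + \epsilon}} + \beta
  = a x + b,
\qquad
a = \frac{\gamma}{\sqrt{\sigma^{2} + \epsilon}},
\qquad
b = \beta - a \mu .
```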
...@@ -35,12 +35,20 @@ class Conv2dOpConverter : public OpConverter { ...@@ -35,12 +35,20 @@ class Conv2dOpConverter : public OpConverter {
auto* Y_v = scope.FindVar(op_desc.Input("Filter").front()); auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(Y_v); PADDLE_ENFORCE_NOT_NULL(Y_v);
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>(); auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL); platform::CPUPlace cpu_place;
const int n_output = Y_t->dims()[0]; std::unique_ptr<framework::LoDTensor> weight_tensor(
const int filter_h = Y_t->dims()[2]; new framework::LoDTensor());
const int filter_w = Y_t->dims()[3]; weight_tensor->Resize(Y_t->dims());
TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
auto* weight_data =
weight_tensor->mutable_data<float>(platform::CPUPlace());
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
const int n_output = weight_tensor->dims()[0];
const int filter_h = weight_tensor->dims()[2];
const int filter_w = weight_tensor->dims()[3];
const int groups = boost::get<int>(op_desc.GetAttr("groups")); const int groups = boost::get<int>(op_desc.GetAttr("groups"));
const std::vector<int> dilations = const std::vector<int> dilations =
...@@ -57,7 +65,7 @@ class Conv2dOpConverter : public OpConverter { ...@@ -57,7 +65,7 @@ class Conv2dOpConverter : public OpConverter {
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data), static_cast<void*>(weight_data),
Y_t->memory_size() / sizeof(float)}; weight_tensor->memory_size() / sizeof(float)};
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
auto* layer = TRT_ENGINE_ADD_LAYER( auto* layer = TRT_ENGINE_ADD_LAYER(
...@@ -70,6 +78,8 @@ class Conv2dOpConverter : public OpConverter { ...@@ -70,6 +78,8 @@ class Conv2dOpConverter : public OpConverter {
layer->setNbGroups(groups); layer->setNbGroups(groups);
auto output_name = op_desc.Output("Output").front(); auto output_name = op_desc.Output("Output").front();
engine_->weight_map[op_desc.Input("Filter").front()] =
std::move(weight_tensor);
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { if (test_mode) {
engine_->DeclareOutput(output_name); engine_->DeclareOutput(output_name);
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle { namespace paddle {
...@@ -40,10 +39,17 @@ class ElementwiseWeightOpConverter : public OpConverter { ...@@ -40,10 +39,17 @@ class ElementwiseWeightOpConverter : public OpConverter {
auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
PADDLE_ENFORCE_NOT_NULL(Y_v); PADDLE_ENFORCE_NOT_NULL(Y_v);
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>(); auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
platform::CPUPlace cpu_place;
std::unique_ptr<framework::LoDTensor> weight_tensor(
new framework::LoDTensor());
weight_tensor->Resize(Y_t->dims());
TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
auto* weight_data =
weight_tensor->mutable_data<float>(platform::CPUPlace());
auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
std::vector<int> dims_y = framework::vectorize2int(Y_t->dims()); std::vector<int> dims_y = framework::vectorize2int(weight_tensor->dims());
if (static_cast<int>(dims_y.size()) == dims_x.nbDims + 1) { if (static_cast<int>(dims_y.size()) == dims_x.nbDims + 1) {
if (dims_y[0] == 1) dims_y.erase(dims_y.begin()); if (dims_y[0] == 1) dims_y.erase(dims_y.begin());
} }
...@@ -70,9 +76,9 @@ class ElementwiseWeightOpConverter : public OpConverter { ...@@ -70,9 +76,9 @@ class ElementwiseWeightOpConverter : public OpConverter {
PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!"); PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!");
} }
TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, TensorRTEngine::Weight shift_weights{
static_cast<void*>(weight_data), nvinfer1::DataType::kFLOAT, static_cast<void*>(weight_data),
Y_t->memory_size() / sizeof(float)}; weight_tensor->memory_size() / sizeof(float)};
TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr, TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr,
0}; 0};
TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
...@@ -82,6 +88,8 @@ class ElementwiseWeightOpConverter : public OpConverter { ...@@ -82,6 +88,8 @@ class ElementwiseWeightOpConverter : public OpConverter {
engine_, Scale, *const_cast<nvinfer1::ITensor*>(X), scale_mode, engine_, Scale, *const_cast<nvinfer1::ITensor*>(X), scale_mode,
shift_weights.get(), scale_weights.get(), power_weights.get()); shift_weights.get(), scale_weights.get(), power_weights.get());
auto output_name = op_desc.Output("Out")[0]; auto output_name = op_desc.Output("Out")[0];
engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside. // output, so place the declaration inside.
......
...@@ -12,12 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,12 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/platform/place.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -73,19 +68,26 @@ class FcOpConverter : public OpConverter { ...@@ -73,19 +68,26 @@ class FcOpConverter : public OpConverter {
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>(); auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
// This may trigger a GPU->CPU copy, because TRT's weight can only be // This may trigger a GPU->CPU copy, because TRT's weight can only be
// assigned from CPU memory, which can't be avoided. // assigned from CPU memory, which can't be avoided.
auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace()); platform::CPUPlace cpu_place;
PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL); // a matrix framework::LoDTensor weight_tensor;
size_t n_output = Y_t->dims()[1]; weight_tensor.Resize(Y_t->dims());
TensorCopySync((*Y_t), cpu_place, &weight_tensor);
framework::LoDTensor tmp; auto* weight_data = weight_tensor.mutable_data<float>(platform::CPUPlace());
tmp.Resize(Y_t->dims());
memcpy(tmp.mutable_data<float>(platform::CPUPlace()), weight_data, PADDLE_ENFORCE_EQ(weight_tensor.dims().size(), 2UL); // a matrix
size_t n_output = weight_tensor.dims()[1];
std::unique_ptr<framework::Tensor> tmp(new framework::LoDTensor());
tmp->Resize(weight_tensor.dims());
memcpy(tmp->mutable_data<float>(platform::CPUPlace()), weight_data,
Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float)); Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data), static_cast<void*>(weight_data),
Y_t->memory_size() / sizeof(float)}; Y_t->memory_size() / sizeof(float)};
TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT, TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
static_cast<void*>(tmp.data<float>()), static_cast<void*>(tmp->data<float>()),
Y_t->memory_size() / sizeof(float)); Y_t->memory_size() / sizeof(float));
weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]}); weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
tmp_weight.dims = weight.dims; tmp_weight.dims = weight.dims;
...@@ -106,6 +108,7 @@ class FcOpConverter : public OpConverter { ...@@ -106,6 +108,7 @@ class FcOpConverter : public OpConverter {
auto output_name = op_desc.Output("Out").front(); auto output_name = op_desc.Output("Out").front();
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp);
if (test_mode) { if (test_mode) {
engine_->DeclareOutput(output_name); engine_->DeclareOutput(output_name);
} }
......
...@@ -33,6 +33,7 @@ class Pool2dOpConverter : public OpConverter { ...@@ -33,6 +33,7 @@ class Pool2dOpConverter : public OpConverter {
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
bool global_pooling = boost::get<bool>(op_desc.GetAttr("global_pooling"));
std::string pool_type = std::string pool_type =
boost::get<std::string>(op_desc.GetAttr("pooling_type")); boost::get<std::string>(op_desc.GetAttr("pooling_type"));
std::vector<int> ksize = std::vector<int> ksize =
...@@ -42,7 +43,13 @@ class Pool2dOpConverter : public OpConverter { ...@@ -42,7 +43,13 @@ class Pool2dOpConverter : public OpConverter {
std::vector<int> paddings = std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings")); boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
const nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
if (global_pooling == true) {
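// Global pooling: use the entire H x W extent of the input as the
// kernel, so each channel reduces to a single value.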
nvinfer1::Dims input_shape = input1->getDimensions();
int nbDims = input_shape.nbDims;
nv_ksize.d[0] = input_shape.d[nbDims - 2];
nv_ksize.d[1] = input_shape.d[nbDims - 1];
}
const nvinfer1::DimsHW nv_strides(strides[0], strides[1]); const nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(batch_norm_op, test) {
std::unordered_set<std::string> parameters(
{"batch_norm_scale", "batch_norm_bias", "batch_norm_mean",
"batch_norm_variance"});
framework::Scope scope;
TRTConvertValidation validator(5, parameters, scope, 1 << 15);
std::vector<int> param_shape{2};
validator.DeclInputVar("batch_norm_X", nvinfer1::DimsCHW(2, 5, 5));
validator.DeclParamVar("batch_norm_scale", param_shape);
validator.DeclParamVar("batch_norm_bias", param_shape);
validator.DeclParamVar("batch_norm_mean", param_shape);
validator.DeclParamVar("batch_norm_variance", param_shape);
validator.DeclOutputVar("batch_norm_Y", nvinfer1::DimsCHW(2, 5, 5));
validator.DeclOutputVar("batch_norm_save_mean", param_shape);
validator.DeclOutputVar("batch_norm_save_variance", param_shape);
// Prepare Op description
framework::OpDesc desc;
desc.SetType("batch_norm");
desc.SetInput("X", {"batch_norm_X"});
desc.SetInput("Scale", {"batch_norm_scale"});
desc.SetInput("Bias", {"batch_norm_bias"});
desc.SetInput("Mean", {"batch_norm_mean"});
desc.SetInput("Variance", {"batch_norm_variance"});
desc.SetOutput("Y", {"batch_norm_Y"});
desc.SetOutput("MeanOut", {"batch_norm_mean"});
desc.SetOutput("VarianceOut", {"batch_norm_variance"});
desc.SetOutput("SavedMean", {"batch_norm_save_mean"});
desc.SetOutput("SavedVariance", {"batch_norm_save_variance"});
float eps = 1e-5f;
bool is_test = true;
desc.SetAttr("epsilon", eps);
desc.SetAttr("is_test", is_test);
validator.SetOp(*desc.Proto());
std::unordered_set<std::string> neglected_output = {
"batch_norm_save_mean", "batch_norm_save_variance", "batch_norm_mean",
"batch_norm_variance"};
validator.Execute(3, neglected_output);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(batch_norm);
...@@ -57,6 +57,7 @@ TEST(OpConverter, ConvertBlock) { ...@@ -57,6 +57,7 @@ TEST(OpConverter, ConvertBlock) {
auto* x = scope.Var("conv2d-Y"); auto* x = scope.Var("conv2d-Y");
auto* x_tensor = x->GetMutable<framework::LoDTensor>(); auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec)); x_tensor->Resize(framework::make_ddim(dim_vec));
x_tensor->mutable_data<float>(platform::CUDAPlace(0));
OpConverter converter; OpConverter converter;
converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope, converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope,
......
...@@ -20,7 +20,7 @@ namespace paddle { ...@@ -20,7 +20,7 @@ namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
TEST(Pool2dOpConverter, main) { void test_pool2d(bool global_pooling) {
framework::Scope scope; framework::Scope scope;
std::unordered_set<std::string> parameters; std::unordered_set<std::string> parameters;
TRTConvertValidation validator(5, parameters, scope, 1 << 15); TRTConvertValidation validator(5, parameters, scope, 1 << 15);
...@@ -28,6 +28,9 @@ TEST(Pool2dOpConverter, main) { ...@@ -28,6 +28,9 @@ TEST(Pool2dOpConverter, main) {
// The ITensor's Dims should not contain the batch size. // The ITensor's Dims should not contain the batch size.
// So, the ITensor's Dims of input and output should be C * H * W. // So, the ITensor's Dims of input and output should be C * H * W.
validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4)); validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4));
if (global_pooling)
validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 1, 1));
else
validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2)); validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2));
// Prepare Op description // Prepare Op description
...@@ -45,6 +48,7 @@ TEST(Pool2dOpConverter, main) { ...@@ -45,6 +48,7 @@ TEST(Pool2dOpConverter, main) {
desc.SetAttr("ksize", ksize); desc.SetAttr("ksize", ksize);
desc.SetAttr("strides", strides); desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings); desc.SetAttr("paddings", paddings);
desc.SetAttr("global_pooling", global_pooling);
LOG(INFO) << "set OP"; LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto()); validator.SetOp(*desc.Proto());
...@@ -53,6 +57,10 @@ TEST(Pool2dOpConverter, main) { ...@@ -53,6 +57,10 @@ TEST(Pool2dOpConverter, main) {
validator.Execute(3); validator.Execute(3);
} }
TEST(Pool2dOpConverter, normal) { test_pool2d(false); }
TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true); }
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
......
...@@ -24,6 +24,7 @@ limitations under the License. */ ...@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/engine.h"
...@@ -48,11 +49,17 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place, ...@@ -48,11 +49,17 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
auto dims = tensor->dims(); auto dims = tensor->dims();
size_t num_elements = analysis::AccuDims(dims, dims.size()); size_t num_elements = analysis::AccuDims(dims, dims.size());
PADDLE_ENFORCE_GT(num_elements, 0); PADDLE_ENFORCE_GT(num_elements, 0);
auto* data = tensor->mutable_data<float>(place);
platform::CPUPlace cpu_place;
framework::LoDTensor temp_tensor;
temp_tensor.Resize(dims);
auto* temp_data = temp_tensor.mutable_data<float>(cpu_place);
for (size_t i = 0; i < num_elements; i++) { for (size_t i = 0; i < num_elements; i++) {
*(data + i) = random(0., 1.); *(temp_data + i) = random(0., 1.);
} }
TensorCopySync(temp_tensor, place, tensor);
} }
/* /*
...@@ -91,18 +98,26 @@ class TRTConvertValidation { ...@@ -91,18 +98,26 @@ class TRTConvertValidation {
engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims); engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims);
} }
void DeclParamVar(const std::string& name, const std::vector<int> dim_vec) {
DeclVar(name, dim_vec);
}
// Declare a parameter variable in the scope. // Declare a parameter variable in the scope.
void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) { void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
DeclVar(name, dims, true); DeclVar(name, dims, true);
} }
void DeclOutputVar(const std::string& name, const std::vector<int> dim_vec) {
DeclVar(name, dim_vec);
}
void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) { void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
DeclVar(name, dims); DeclVar(name, dims);
} }
void DeclVar(const std::string& name, const std::vector<int> dim_vec) { void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CPUPlace place; platform::CUDAPlace place;
platform::CPUDeviceContext ctx(place); platform::CUDADeviceContext ctx(place);
auto* x = scope_.Var(name); auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>(); auto* x_tensor = x->GetMutable<framework::LoDTensor>();
...@@ -141,18 +156,22 @@ class TRTConvertValidation { ...@@ -141,18 +156,22 @@ class TRTConvertValidation {
PADDLE_ENFORCE(var); PADDLE_ENFORCE(var);
auto tensor = var->GetMutable<framework::LoDTensor>(); auto tensor = var->GetMutable<framework::LoDTensor>();
engine_->SetInputFromCPU( engine_->SetInputFromGPU(
input, static_cast<void*>(tensor->data<void>()), input, static_cast<void*>(tensor->data<void>()),
sizeof(float) * sizeof(float) *
analysis::AccuDims(tensor->dims(), tensor->dims().size())); analysis::AccuDims(tensor->dims(), tensor->dims().size()));
} }
} }
void Execute(int batch_size) { // We use the 'neglected_output' set here because for some ops, such
// as batch norm, the outputs listed in the op desc are only used during
// training, so they should be ignored during inference.
void Execute(int batch_size,
std::unordered_set<std::string> neglected_output = {}) {
// Execute Fluid Op // Execute Fluid Op
PADDLE_ENFORCE_LE(batch_size, max_batch_size_); PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
platform::CPUPlace place; platform::CUDAPlace place;
platform::CPUDeviceContext ctx(place); platform::CUDADeviceContext ctx(place);
op_->Run(scope_, place); op_->Run(scope_, place);
// Execute TRT. // Execute TRT.
engine_->Execute(batch_size); engine_->Execute(batch_size);
...@@ -161,6 +180,7 @@ class TRTConvertValidation { ...@@ -161,6 +180,7 @@ class TRTConvertValidation {
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
const size_t output_space_size = 3000; const size_t output_space_size = 3000;
for (const auto& output : op_desc_->OutputArgumentNames()) { for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue;
std::vector<float> fluid_out; std::vector<float> fluid_out;
std::vector<float> trt_out(output_space_size); std::vector<float> trt_out(output_space_size);
engine_->GetOutputInCPU(output, &trt_out[0], output_space_size); engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
......
@@ -33,6 +33,7 @@ void TensorRTEngine::Build(const DescType &paddle_model) {
 }
 void TensorRTEngine::Execute(int batch_size) {
+  freshDeviceId();
   batch_size_ = batch_size;
   std::vector<void *> buffers;
   for (auto &buf : buffers_) {
@@ -60,6 +61,7 @@ TensorRTEngine::~TensorRTEngine() {
 }
 void TensorRTEngine::FreezeNetwork() {
+  freshDeviceId();
   PADDLE_ENFORCE(infer_builder_ != nullptr,
                  "Call InitNetwork first to initialize network.");
   PADDLE_ENFORCE(infer_network_ != nullptr,
@@ -241,6 +243,13 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
 int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
+void TensorRTEngine::freshDeviceId() {
+  int count;
+  cudaGetDeviceCount(&count);
+  PADDLE_ENFORCE_LT(device_, count);
+  cudaSetDevice(device_);
+}
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/utils/singleton.h"
@@ -52,13 +53,15 @@ class TensorRTEngine : public EngineBase {
   };
   TensorRTEngine(int max_batch, int max_workspace,
-                 cudaStream_t* stream = nullptr,
+                 cudaStream_t* stream = nullptr, int device = 0,
                  nvinfer1::ILogger& logger = NaiveLogger::Global())
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
         stream_(stream ? stream : &default_stream_),
-        logger_(logger) {
-    cudaStreamCreate(&default_stream_);
+        logger_(logger),
+        device_(device) {
+    freshDeviceId();
+    cudaStreamCreate(stream_);
   }
   virtual ~TensorRTEngine();
@@ -119,6 +122,15 @@ class TensorRTEngine : public EngineBase {
   nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
   void SetRuntimeBatch(size_t batch_size);
   int GetRuntimeBatch();
+  int GetDevice() { return device_; }
+  // TRT weights need a pointer to CPU memory, but before TRT runs, fluid
+  // loads the weights into GPU storage, so the op converter has to copy them
+  // from GPU back to CPU. The map keeps these CPU copies alive so the weight
+  // memory is not released before the TRT op is constructed.
+  std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
+      weight_map;
  private:
   // the max batch size
@@ -140,6 +152,8 @@ class TensorRTEngine : public EngineBase {
   std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
   std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
       itensor_map_;
+  // The specific GPU id that the TensorRTEngine is bound to.
+  int device_;
   // TensorRT related internal members
   template <typename T>
@@ -156,6 +170,10 @@ class TensorRTEngine : public EngineBase {
   infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
   infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
   infer_ptr<nvinfer1::IExecutionContext> infer_context_;
+  // Each ICudaEngine object is bound to a specific GPU when it is
+  // instantiated; freshDeviceId() ensures the calling thread is associated
+  // with the correct device.
+  void freshDeviceId();
 };  // class TensorRTEngine
 // Add a layer__ into engine__ with args ARGS.
@@ -188,8 +206,8 @@ class TRT_EngineManager {
   // Create or get an engine called `name`
   TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream,
-                         const std::string& name) {
-    auto* p = new TensorRTEngine(max_batch, max_workspace, stream);
+                         const std::string& name, int gpu_device = 0) {
+    auto* p = new TensorRTEngine(max_batch, max_workspace, stream, gpu_device);
     engines_[name].reset(p);
     return p;
   }
......
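A hedged, framework-free sketch of the lifetime guarantee that weight_map provides: TensorRT keeps raw pointers into the CPU weight copies, so the engine must own those buffers at least until the network is built. std::vector<float> stands in for framework::Tensor here.

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

std::unordered_map<std::string, std::unique_ptr<std::vector<float>>> weight_map;

const float* RegisterWeight(const std::string& name,
                            std::vector<float> cpu_copy) {
  auto owned = std::make_unique<std::vector<float>>(std::move(cpu_copy));
  const float* raw = owned->data();     // pointer TRT would hold on to
  weight_map[name] = std::move(owned);  // keep the buffer alive
  return raw;
}

int main() {
  const float* w = RegisterWeight("conv1.w", {0.1f, 0.2f, 0.3f});
  return w == nullptr;  // w stays valid as long as weight_map holds the entry
}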
@@ -27,7 +27,7 @@ namespace tensorrt {
 class TensorRTEngineTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    ASSERT_EQ(0, cudaStreamCreate(&stream_));
+    // ASSERT_EQ(0, cudaStreamCreate(&stream_));
     engine_ = new TensorRTEngine(10, 1 << 10, &stream_);
     engine_->InitNetwork();
   }
......
@@ -85,7 +85,7 @@ function(op_library TARGET)
   #remove windows unsupported op
   if (WIN32)
     foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
       if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
         return()
       endif()
@@ -109,7 +113,8 @@ function(op_library TARGET)
   endif()
   # Define operators that don't need pybind here.
-  foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+  foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
+          "tensor_array_read_write_op" "tensorrt_engine_op")
     if ("${TARGET}" STREQUAL "${manual_pybind_op}")
       set(pybind_flag 1)
     endif()
@@ -257,6 +262,7 @@ op_library(softmax_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
 if (WITH_GPU AND TENSORRT_FOUND)
   op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
+  file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n")
   nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
     DEPS tensorrt_engine_op
     analysis)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/attention_lstm_op.h"
#include <sys/time.h>
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of AttentionLSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("C0"),
"Input(C0) of AttentionLSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"),
"Input(LSTMWeight) of AttentionLSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LSTMBias"),
"Input(LSTMBias) of AttentionLSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"),
"Input(AttentionWeight) of AttentionLSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of AttentionLSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Cell"),
"Output(Cell) of AttentionLSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"),
"Output(AttentionedX) of AttentionLSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"),
"Output(AttentionFCOut) of AttentionLSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("LSTMX"),
"Output(LSTMX) of AttentionLSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"),
"Output(LSTMOUT) of AttentionLSTM should not be null.");
auto x_dims = ctx->GetInputDim("X");
const int M = x_dims[1];
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
auto w_dims = ctx->GetInputDim("LSTMWeight");
const int D = w_dims[1] / 4;
PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(LSTMWeight)'s rank must be 2.");
  PADDLE_ENFORCE_EQ(w_dims[0], D + M,
                    "LSTMWeight dims should be (%d + %d) * %d.", D, M, 4 * D);
auto b_dims = ctx->GetInputDim("LSTMBias");
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(LSTMBias)'s rank must be 2.");
PADDLE_ENFORCE_EQ(b_dims[0], 1, "LSTMBias dims should be 1 x %d.", 4 * D);
PADDLE_ENFORCE_EQ(b_dims[1], 4 * D, "LSTMBias dims should be 1 x %d.", 4 * D);
auto c_dims = ctx->GetInputDim("C0");
PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2.");
PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D);
if (ctx->HasInput("H0")) {
auto h_dims = ctx->GetInputDim("H0");
PADDLE_ENFORCE(h_dims == c_dims,
"The dimension of Input(H0) and Input(C0) "
"should be the same.");
}
auto atten_w_dims = ctx->GetInputDim("AttentionWeight");
PADDLE_ENFORCE_EQ(atten_w_dims.size(), 2,
"Input(AttentionWeight)'s rank must be 2.");
PADDLE_ENFORCE_EQ(atten_w_dims[0], M + D,
"AttentionWeight shapes must be (%d + %d) * 1.", M, D);
PADDLE_ENFORCE_EQ(atten_w_dims[1], 1,
"AttentionWeight shapes must be (%d + %d) * 1.", M, D);
if (ctx->HasInput("AttentionBias")) {
auto atten_b_dims = ctx->GetInputDim("AttentionBias");
PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2,
"Input(AttentionBias)'s rank must be 2.");
PADDLE_ENFORCE_EQ(atten_b_dims[0], 1,
"AttentionBias shapes must be 1 * 1.");
PADDLE_ENFORCE_EQ(atten_b_dims[1], 1,
"AttentionBias shapes must be 1 * 1.");
}
if (ctx->HasInput("AttentionScalar")) {
auto dims = ctx->GetInputDim("AttentionScalar");
PADDLE_ENFORCE_EQ(dims.size(), 2,
"Input(AttentionScalar)'s rank must be 2.");
PADDLE_ENFORCE_EQ(dims[0], 1, "AttentionScalar shapes must be 1 * 1.");
PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalar shapes must be 1 * 1.");
}
if (ctx->HasInput("AttentionScalarBias")) {
auto dims = ctx->GetInputDim("AttentionScalarBias");
PADDLE_ENFORCE(
ctx->HasInput("AttentionScalar"),
"AttentionScalar should not be null when have AttentionScalarBias.");
PADDLE_ENFORCE_EQ(dims.size(), 2,
"Input(AttentionScalarBias)'s rank must be 2.");
PADDLE_ENFORCE_EQ(dims[0], 1, "AttentionScalarBias shapes must be 1 * 1.");
PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalarBias shapes must be 1 * 1.");
}
framework::DDim out_dims({x_dims[0], D});
ctx->SetOutputDim("Hidden", out_dims);
ctx->SetOutputDim("Cell", out_dims);
ctx->SetOutputDim("AttentionedX", {x_dims[0], 1});
ctx->SetOutputDim("LSTMX", {1, M});
ctx->SetOutputDim("LSTMOUT", {1, 4 * D});
// AttentionFCOut should be reshape as (maxseqlen,1) in runtime
ctx->ShareLoD("X", "Hidden");
ctx->ShareLoD("X", "Cell");
}
framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
void AttentionLSTMOpMaker::Make() {
AddInput("X",
"(LoDTensor) the input is a LodTensor, which support "
"variable-time length input sequence. The underlying tensor in "
"this LoDTensor is a matrix with shape (T X M), where T is the "
"total time steps in this mini-batch, M is the dim size of x.");
AddInput("C0",
"(Tensor) LSTM C0"
"This is a tensor with shape (N x D), where N is the batch size, D "
"is the gate size."
"C0 is necessary because of attention.");
AddInput("H0",
"(Tensor, optional) LSTM H0"
"This is a tensor with shape (N x D), where N is the "
"batch size and D is the gate size.")
.AsDispensable();
AddInput("AttentionWeight",
"(Tensor) the weights of attention fc. Always relu the fc result."
"The shape is ((M+D) x 1), where M is the dim size of x, D is the "
"gate size of LSTM.");
AddInput("AttentionBias",
"(Tensor, optional) the bias of attention fc."
"The shape is (1 x 1)")
.AsDispensable();
AddInput("AttentionScalar",
"(Tensor, optional) the scalar on the result of attentioned fc. "
"Always relu the Scalar."
"The shape is (1 x 1)")
.AsDispensable();
AddInput("AttentionScalarBias",
"(Tensor, optional) the scalar bias of attention fc."
"The shape is (1 x 1)")
.AsDispensable();
AddInput("LSTMWeight",
"(Tensor) the combined weight of LSTM"
" - The shape is ((D+M) x 4D), where D is the hidden gate size, M "
"is the dim size of x"
" - Weight = {W_forget, W_input, W_output, W_cell}");
AddInput("LSTMBias",
"(Tensor) the combined bias of LSTM, shape (1x4D)."
"Note: we should add the bias of hidden and context accorindg to "
"the same gate: "
"{B_forget, B_input, B_output, B_cell}");
AddOutput("Hidden",
"(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. "
"The shape is (T x D), and lod is the same with the `Input`.");
AddOutput("Cell",
"(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. "
"The shape is (T x D), and lod is the same with the `Input`.");
AddOutput("AttentionedX",
"(Tensor) shape is (T x 1), the result after X * AttentionWeight,"
" where T is the total time steps in this mini-batch,"
" D is the hidden size.")
.AsIntermediate();
AddOutput("AttentionFCOut",
"(Tensor) (max_seq_len, 1), compute at each step.")
.AsIntermediate();
AddOutput("LSTMX",
"(Tensor) the input X of LSTM for each step."
"Shape is (1 x M), where M is the x frame size")
.AsIntermediate();
AddOutput(
"LSTMOUT",
"(Tensor) the output of LSTM X(1*(D+M))* weight((D+M)*4D) for each step."
"Shape is (1 x 4D), where M is the x frame size")
.AsIntermediate();
AddAttr<std::string>("gate_activation",
"(string, default: sigmoid)"
"The activation for input gate, forget gate and output "
"gate, `sigmoid` by default.")
.SetDefault("sigmoid")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddAttr<std::string>("cell_activation",
"(string, default: tanh)"
"The activation for cell output, `tanh` by defalut.")
.SetDefault("tanh")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddAttr<std::string>("candidate_activation",
"(string, default: tanh)"
"The activation for candidate hidden state, "
"`tanh` by default.")
.SetDefault("tanh")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddComment(R"DOC(
Attention Long-Short Term Memory (LSTM) Operator.
Attention part:
concat( x(seqlen * M), expand( cell_t-1(1,D) ) ) => tmp(seqlen*(M+D))
tmp(seqlen*(M+D)) * fc((M+D)*1) => fcout(seqlen*1) with bias, relu
fcout(seqlen*1) * scalar => fcout(seqlen*1) with bias, relu
dotmul and sum pool ( fcout(seqlen*1), x(seqlen * M) ) => lstm_x_t(1, M)
LSTM part:
use lstm_x_t as input and compute as standard LSTM.
)DOC");
}
// y[i] = (x[i] + bias[0]) > 0 ? (x[i] + bias[0]) : 0;
template <typename T>
inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
if (bias) {
for (int i = 0; i < n; ++i) {
y[i] = x[i] + bias[0];
}
math::vec_relu<T>(n, y, y);
} else {
math::vec_relu<T>(n, x, y);
}
}
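A minimal scalar restatement of bias_relu for reference; the real kernel dispatches the relu to math::vec_relu.

#include <algorithm>
#include <cassert>

// y[i] = max(x[i] + bias[0], 0); bias is optional, matching the code above.
void bias_relu_ref(int n, const float* x, const float* bias, float* y) {
  for (int i = 0; i < n; ++i) {
    float v = bias ? x[i] + bias[0] : x[i];
    y[i] = std::max(v, 0.0f);
  }
}

int main() {
  float x[3] = {-2.0f, 0.5f, 1.0f}, b = 1.0f, y[3];
  bias_relu_ref(3, x, &b, y);
  assert(y[0] == 0.0f && y[1] == 1.5f && y[2] == 2.0f);
  return 0;
}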
template <typename DeviceContext, typename T>
inline void vec_softmax(const math::BlasT<DeviceContext, T>& blas, const int n,
const T* x, T* y) {
T scalar = x[0];
// max
for (int i = 1; i < n; ++i) {
scalar = scalar < x[i] ? x[i] : scalar;
}
// sub
for (int i = 0; i < n; ++i) {
y[i] = x[i] - scalar;
}
// exp
blas.VEXP(n, y, y);
// sum
scalar = T(0);
for (int i = 0; i < n; ++i) {
scalar += y[i];
}
// scale
blas.SCAL(n, static_cast<T>(1) / scalar, y);
}
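vec_softmax implements the usual max-subtraction trick; a self-contained scalar sketch shows why the subtraction changes nothing mathematically (the exp(-max) factor cancels in the ratio) while preventing overflow for large inputs.

#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> softmax_ref(const std::vector<float>& x) {
  float m = *std::max_element(x.begin(), x.end());
  std::vector<float> y(x.size());
  float sum = 0.0f;
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = std::exp(x[i] - m);  // shifted exponent never exceeds exp(0)
    sum += y[i];
  }
  for (float& v : y) v /= sum;  // same effect as blas.SCAL(n, 1 / sum, y)
  return y;
}

int main() {
  auto y = softmax_ref({1000.0f, 1001.0f});  // naive exp here would overflow
  return y[0] + y[1] > 0.99f ? 0 : 1;
}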
template <typename T>
class AttentionLSTMKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X");
auto* h0 = ctx.Input<Tensor>("H0");
auto* c0 = ctx.Input<Tensor>("C0");
auto* atten_w = ctx.Input<Tensor>("AttentionWeight");
auto* atten_b = ctx.Input<Tensor>("AttentionBias");
auto* atten_scalar = ctx.Input<Tensor>("AttentionScalar");
auto* atten_scalar_bias = ctx.Input<Tensor>("AttentionScalarBias");
auto* lstm_w = ctx.Input<Tensor>("LSTMWeight");
auto* lstm_b = ctx.Input<Tensor>("LSTMBias");
auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
auto* cell_out = ctx.Output<LoDTensor>("Cell");
auto* atted_x = ctx.Output<Tensor>("AttentionedX");
auto* fc_out = ctx.Output<Tensor>("AttentionFCOut");
auto* lstm_x = ctx.Output<Tensor>("LSTMX");
auto* lstm_out = ctx.Output<Tensor>("LSTMOUT");
    // some shapes should be reshaped here, since InferShape cannot get LoD info
auto x_lod = x->lod();
const int N = x_lod[0].size() - 1; // batch size
auto x_dims = x->dims(); // T x M
auto w_dims = lstm_w->dims(); // (D+M) x 4D
const int total_T = x_dims[0];
const int M = x_dims[1]; // x frame size
const int D = w_dims[1] / 4; // gate frame size
const int D2 = D * 2;
const int D3 = D * 3;
const int D4 = w_dims[1];
int max_seq_len = x_lod[0][1];
for (int i = 1; i < N; ++i) {
int len = x_lod[0][i + 1] - x_lod[0][i];
max_seq_len = max_seq_len < len ? len : max_seq_len;
}
PADDLE_ENFORCE_EQ(x_lod.size(), 1, "Input(X)'s lod size must be 1.");
PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D);
fc_out->Resize({max_seq_len, 1});
math::VecActivations<T> act_functor;
std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand;
act_gate = act_functor(ctx.Attr<std::string>("gate_activation"));
act_cell = act_functor(ctx.Attr<std::string>("cell_activation"));
act_cand = act_functor(ctx.Attr<std::string>("candidate_activation"));
const T* x_data = x->data<T>();
const T* h0_data = h0 ? h0->data<T>() : NULL;
const T* c0_data = c0->data<T>();
const T* lstm_w_data = lstm_w->data<T>();
const T* lstm_b_data = lstm_b->data<T>();
const T* atten_w_data = atten_w->data<T>();
const T* atten_b_data = atten_b ? atten_b->data<T>() : NULL;
const T* atten_scalar_data = atten_scalar ? atten_scalar->data<T>() : NULL;
const T* atten_scalar_bias_data =
atten_scalar_bias ? atten_scalar_bias->data<T>() : NULL;
T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());
T* atted_x_data = atted_x->mutable_data<T>(ctx.GetPlace());
T* fc_out_data = fc_out->mutable_data<T>(ctx.GetPlace());
T* lstm_x_data = lstm_x->mutable_data<T>(ctx.GetPlace());
T* lstm_out_data = lstm_out->mutable_data<T>(ctx.GetPlace());
// x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1
auto blas = math::GetBlas<DeviceContext, T>(ctx);
math::FCCompute<DeviceContext, T>(blas, total_T, 1, M, x_data, atten_w_data,
atted_x_data, atten_b_data);
const T* cur_atten_x_data = atted_x_data;
const T* cur_x_data = x_data;
const T* prev_cell_data = NULL;
const T* prev_hidden_data = NULL;
T* cur_cell_out_data = cell_out_data;
T* cur_hidden_out_data = hidden_out_data;
for (int i = 0; i < N; ++i) {
int seq_len = x_lod[0][i + 1] - x_lod[0][i];
prev_cell_data = c0_data + i * D;
prev_hidden_data = h0_data ? h0_data + i * D : NULL;
for (int step = 0; step < seq_len; ++step) {
/// 1. compute attention vector
// 1a. prev_cell(1xD) * fc(D) rest part of atten_wgt
T prev_cell_bias = blas.DOT(D, prev_cell_data, atten_w_data + M);
// 1b. add cell bias and relu
bias_relu<T>(seq_len, cur_atten_x_data, &prev_cell_bias, fc_out_data);
// 1c. fc scalar
if (atten_scalar_data) {
blas.SCAL(seq_len, *atten_scalar_data, fc_out_data);
bias_relu<T>(seq_len, fc_out_data, atten_scalar_bias_data,
fc_out_data);
}
// 1d. softmax
vec_softmax<DeviceContext, T>(blas, seq_len, fc_out_data, fc_out_data);
// mul x(seq_len*M) and sum pool
math::FCCompute<DeviceContext, T>(blas, 1, M, seq_len, fc_out_data,
cur_x_data, lstm_x_data);
/// 2. compute LSTM step
// lstm weight : concat[forget , input , output , tilde]
// shape : (D + M) x (4 * D)
// fc inputX(1xM) * weightX(M*(4D)) => 1 x 4D
blas.MatMul(1, D4, M, lstm_x_data, lstm_w_data + D * D4, lstm_out_data);
if (prev_hidden_data) {
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
prev_hidden_data, D, lstm_w_data, D4, static_cast<T>(1),
lstm_out_data, D4);
}
        // since the input is 1xM, we can simply add the bias
blas.VADD(D4, lstm_b_data, lstm_out_data, lstm_out_data);
// gate act: sigmoid
act_gate(D3, lstm_out_data, lstm_out_data);
        // candidate act: tanh
act_cand(D, lstm_out_data + D3, lstm_out_data + D3);
// a = forget * prev_cell
blas.VMUL(D, lstm_out_data, prev_cell_data, lstm_out_data);
// b = input * tilde
blas.VMUL(D, lstm_out_data + D, lstm_out_data + D3, lstm_out_data + D);
// cell_out = a + b
blas.VADD(D, lstm_out_data, lstm_out_data + D, cur_cell_out_data);
// state act tanh(cell_out) * output_gate
act_cell(D, cur_cell_out_data, lstm_out_data);
blas.VMUL(D, lstm_out_data, lstm_out_data + D2, cur_hidden_out_data);
prev_hidden_data = cur_hidden_out_data;
prev_cell_data = cur_cell_out_data;
cur_cell_out_data = cur_cell_out_data + D;
cur_hidden_out_data = cur_hidden_out_data + D;
}
cur_x_data = cur_x_data + seq_len * M;
cur_atten_x_data = cur_atten_x_data + seq_len;
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(attention_lstm, ops::AttentionLSTMOp,
ops::AttentionLSTMOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(attention_lstm, ops::AttentionLSTMKernel<float>,
ops::AttentionLSTMKernel<double>);
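The step loop above fuses the gate math through BLAS calls; below is a scalar reference of one LSTM step with the same {forget, input, output, candidate} gate layout, useful for checking the optimized path. A sketch only, not the shipped kernel.

#include <cmath>

void lstm_step_ref(int D, const float* gates /* 4*D pre-activations */,
                   const float* prev_cell, float* cell, float* hidden) {
  auto sigmoid = [](float v) { return 1.0f / (1.0f + std::exp(-v)); };
  for (int d = 0; d < D; ++d) {
    float f = sigmoid(gates[d]);            // forget gate
    float i = sigmoid(gates[D + d]);        // input gate
    float o = sigmoid(gates[2 * D + d]);    // output gate
    float c = std::tanh(gates[3 * D + d]);  // candidate state
    cell[d] = f * prev_cell[d] + i * c;     // "a + b" in the kernel above
    hidden[d] = o * std::tanh(cell[d]);     // state act * output gate
  }
}

int main() {
  float gates[4] = {0.0f, 0.0f, 0.0f, 0.0f};  // D = 1, all pre-activations 0
  float prev_cell = 1.0f, cell, hidden;
  lstm_step_ref(1, gates, &prev_cell, &cell, &hidden);
  return cell == 0.5f ? 0 : 1;  // 0.5 * 1 + 0.5 * tanh(0) = 0.5
}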
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
class AttentionLSTMOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class AttentionLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
} // namespace operators
} // namespace paddle
@@ -62,9 +62,21 @@ class ConcatGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* out_grad =
         ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto ins = ctx.MultiInput<framework::LoDTensor>("X");
     auto out_var_names = ctx.Outputs(framework::GradVarName("X"));
-    auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
+    auto outs =
+        ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));
+    {
+      auto dx = outs;
+      auto x = ins;
+      for (size_t i = 0; i < dx.size(); ++i) {
+        if (dx[i] != nullptr) {
+          dx[i]->set_lod(x[i]->lod());
+        }
+      }
+    }
     int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
     // get output tensor that the name is not kEmptyVarName
......
@@ -29,9 +29,9 @@ class ConditionalOp : public framework::OperatorBase {
  protected:
   std::vector<const framework::LoDTensor *> InputTensors(
-      const framework::Scope &scope) const {
+      const framework::Scope &scope, const std::string &in_name) const {
     std::vector<const framework::LoDTensor *> retv;
-    auto xs = Inputs("X");
+    auto xs = Inputs(in_name);
     retv.resize(xs.size(), nullptr);
     std::transform(
         xs.begin(), xs.end(), retv.begin(),
@@ -81,12 +81,18 @@ class ConditionalBlockOp : public ConditionalOp {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &dev_place) const override {
-    auto xs = InputTensors(scope);
     bool need_run;
     if (Attr<bool>("is_scalar_condition")) {
+      // When is_scalar_condition is True, the conditional variable is a
+      // scalar; whether the operators in the sub-block need to run depends
+      // on the conditional variable (Cond).
+      auto xs = InputTensors(scope, "Cond");
       need_run = ScalarCondition(xs);
     } else {
+      // When is_scalar_condition is False, the conditional variable may be a
+      // vector or tensor; whether the operators in the sub-block need to run
+      // depends on the input variables (Input).
+      auto xs = InputTensors(scope, "Input");
       need_run = std::all_of(
           xs.begin(), xs.end(),
           [](const framework::LoDTensor *t) { return t->numel() != 0; });
@@ -110,11 +116,11 @@ class ConditionalBlockOp : public ConditionalOp {
 class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X",
-             "The conditional variable of this operator. If X is empty, the "
+    AddInput("Cond",
+             "The conditional variable of this operator. If Cond is empty, the "
              "whole sub-block will not be executed.")
         .AsDuplicable();
-    AddInput("Params", "The input variables of the sub-block.").AsDuplicable();
+    AddInput("Input", "The input variables of the sub-block.").AsDuplicable();
     AddOutput("Out", "The output variables of the sub-block.").AsDuplicable();
     AddOutput("Scope",
               "(std::vector<Scope*>) The step scope of conditional block. To "
@@ -123,13 +129,18 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<framework::BlockDesc *>(
         "sub_block", "The step block of conditional block operator");
     AddAttr<bool>("is_scalar_condition",
-                  "the input X is used as scalar "
-                  "condition")
+                  "The conditional variable (Cond) is used as scalar "
+                  "condition.")
         .SetDefault(false);
     AddComment(R"DOC(Conditional block operator
-Run the sub-block if X is not empty. Params is the other inputs and Out is the
-outputs of the sub-block.
+If `is_scalar_condition` is True, the conditional variable (Cond) is a scalar;
+the operators in the sub-block run if Cond is True.
+If `is_scalar_condition` is False, the conditional variable (Cond) is a vector
+or tensor; the operators in the sub-block run if all input variables are not
+empty.
 )DOC");
   }
 };
@@ -145,12 +156,12 @@ class ConditionalBlockGradOp : public ConditionalOp {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &dev_place) const override {
-    auto xs = this->InputTensors(scope);
     bool need_run;
     if (Attr<bool>("is_scalar_condition")) {
+      auto xs = this->InputTensors(scope, "Cond");
       need_run = ScalarCondition(xs);
     } else {
+      auto xs = this->InputTensors(scope, "Input");
       need_run = std::all_of(
           xs.begin(), xs.end(),
           [](const framework::LoDTensor *t) { return t->numel() != 0; });
@@ -166,11 +177,11 @@ class ConditionalBlockGradOp : public ConditionalOp {
       auto *block = Attr<framework::BlockDesc *>("sub_block");
       exec.Run(*block->Program(), &cur_scope, block->ID(), false);
-      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"),
-                                  Outputs(framework::GradVarName("Params")));
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Input"),
+                                  Outputs(framework::GradVarName("Input")));
-      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"),
-                                  Outputs(framework::GradVarName("X")));
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Cond"),
+                                  Outputs(framework::GradVarName("Cond")));
     }
   }
@@ -199,15 +210,15 @@ class ConditionalBlockGradOp : public ConditionalOp {
 class ConditionalBlockGradInferShape : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInputs("X"));
-    if (context->HasInputs("Params")) {
-      PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Params")));
-      context->SetOutputsDim(framework::GradVarName("Params"),
-                             context->GetInputsDim("Params"));
+    PADDLE_ENFORCE(context->HasInputs("Cond"));
+    if (context->HasInputs("Input")) {
+      PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Input")));
+      context->SetOutputsDim(framework::GradVarName("Input"),
+                             context->GetInputsDim("Input"));
     }
-    if (context->HasOutputs(framework::GradVarName("X"))) {
-      context->SetOutputsDim(framework::GradVarName("X"),
-                             context->GetInputsDim("X"));
+    if (context->HasOutputs(framework::GradVarName("Cond"))) {
+      context->SetOutputsDim(framework::GradVarName("Cond"),
+                             context->GetInputsDim("Cond"));
     }
   }
 };
@@ -220,14 +231,15 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
   std::unique_ptr<framework::OpDesc> Apply() const override {
     auto grad_op = new framework::OpDesc();
     grad_op->SetType("conditional_block_grad");
-    grad_op->SetInput("X", Input("X"));
-    grad_op->SetInput("Params", Input("Params"));
+    grad_op->SetInput("Cond", Input("Cond"));
+    grad_op->SetInput("Input", Input("Input"));
     grad_op->SetInput("Out", Output("Out"));
     grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
     grad_op->SetInput("Scope", Output("Scope"));
-    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
-    grad_op->SetOutput(framework::GradVarName("Params"),
-                       InputGrad("Params", false));
+    grad_op->SetOutput(framework::GradVarName("Cond"),
+                       InputGrad("Cond", false));
+    grad_op->SetOutput(framework::GradVarName("Input"),
+                       InputGrad("Input", false));
     grad_op->SetBlockAttr("sub_block", this->grad_block_[0]);
     grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition"));
     return std::unique_ptr<framework::OpDesc>(grad_op);
......
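The two branches above reduce to a small decision function; a standalone restatement with illustrative names only.

#include <cstdint>
#include <vector>

// In scalar mode the single Cond value gates the sub-block; otherwise the
// sub-block runs only if every Input tensor has at least one element.
bool need_run(bool is_scalar_condition, bool cond_value,
              const std::vector<int64_t>& input_numels) {
  if (is_scalar_condition) return cond_value;
  for (int64_t n : input_numels) {
    if (n == 0) return false;  // any empty input skips the sub-block
  }
  return true;
}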
@@ -53,6 +53,18 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
     key_ += "-BWD";
   }
+  size_t GetDstMemorySize() const {
+    return conv_pd_->dst_primitive_desc().get_size();
+  }
+  size_t GetDiffWeightsMemorySize() const {
+    return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size();
+  }
+  size_t GetDiffSourceMemorySize() const {
+    return conv_bwd_data_pd_->diff_src_primitive_desc().get_size();
+  }
   std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
       std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
@@ -126,6 +138,15 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
                                pipeline);
   }
+  std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_bias_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto user_bias_pd = user_bias_memory_p->get_primitive_desc();
+    auto bias_pd = conv_pd_->bias_primitive_desc();
+    return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p,
+                               "@bias_mem_p", pipeline);
+  }
   std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
       std::shared_ptr<mkldnn::memory> src_memory_p,
       std::shared_ptr<mkldnn::memory> weights_memory_p,
@@ -147,6 +168,28 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
     return conv_p;
   }
+  std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
+      std::shared_ptr<mkldnn::memory> src_memory_p,
+      std::shared_ptr<mkldnn::memory> weights_memory_p,
+      std::shared_ptr<mkldnn::memory> bias_memory_p,
+      std::shared_ptr<mkldnn::memory> dst_memory_p) {
+    auto prim_key = key_ + "@conv_p";
+    auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find convolution primitive in device context");
+    if (conv_p == nullptr) {
+      conv_p = std::make_shared<mkldnn::convolution_forward>(
+          *conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
+          *(bias_memory_p.get()), *(dst_memory_p.get()));
+      dev_ctx_.SetBlob(prim_key, conv_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_p;
+  }
   std::shared_ptr<mkldnn::convolution_backward_weights>
   AcquireConvolutionBackwardWeights(
       std::shared_ptr<mkldnn::memory> src_memory_p,
@@ -229,6 +272,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto* input = ctx.Input<Tensor>("Input");
     auto* filter = ctx.Input<Tensor>("Filter");
+    auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     auto* output = ctx.Output<Tensor>("Output");
     PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
@@ -237,6 +281,17 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
                        filter->format() != memory::format::format_undef,
                    "Wrong layout/format set for Filter tensor");
+    PADDLE_ENFORCE(input->dims().size() == 4,
+                   "Input must be with 4 dimensions, i.e. NCHW");
+    PADDLE_ENFORCE(filter->dims().size() == 4,
+                   "Filter must be with 4 dimensions, i.e. OIHW");
+    if (bias) {
+      PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
+                         bias->format() != memory::format::format_undef,
+                     "Wrong layout/format set for Bias tensor");
+      PADDLE_ENFORCE(bias->dims().size() == 1,
+                     "Bias must only have 1 dimension, i.e. X");
+    }
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
@@ -251,12 +306,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    PADDLE_ENFORCE(input->dims().size() == 4,
-                   "Input must be with 4 dimensions, i.e. NCHW");
-    PADDLE_ENFORCE(filter->dims().size() == 4,
-                   "Filter must be with 4 dimensions, i.e. OIHW");
     std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
     std::vector<int> weights_tz =
@@ -288,13 +337,23 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
         weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+    std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
+                               // Currently used whenever bias is != nullptr.
     auto dst_md = platform::MKLDNNMemDesc(
         dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     // create a conv primitive descriptor and save it for usage in backward
-    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
-        ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
-                             mkldnn_engine);
+    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
+    if (bias) {
+      bias_tz = paddle::framework::vectorize2int(bias->dims());
+      auto bias_md = platform::MKLDNNMemDesc(
+          bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
+      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
+                                     strides, paddings, mkldnn_engine);
+    } else {
+      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
+                                     paddings, mkldnn_engine);
+    }
     // Save conv_pd/src_memory/weights_memory for backward pass
     dev_ctx.SetBlob(key_conv_pd, conv_pd);
@@ -306,6 +365,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto user_weights_memory_p = handler.AcquireWeightsMemory(
         user_weights_md, to_void_cast<T>(filter_data));
+    T* output_data =
+        output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
     // create reorder primitive if the input format is not the preferred one
     auto src_memory_p =
         handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
@@ -315,8 +376,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
     // create convolution op primitive
-    auto conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
-                                             dst_memory_p);
+    std::shared_ptr<mkldnn::convolution_forward> conv_p;
+    if (bias) {
+      const T* bias_data = bias->data<T>();
+      auto user_bias_md = platform::MKLDNNMemDesc(
+          {bias_tz}, platform::MKLDNNGetDataType<T>(), memory::format::x);
+      auto user_bias_memory_p =
+          handler.AcquireBiasMemory(user_bias_md, to_void_cast<T>(bias_data));
+      auto bias_memory_p =
+          handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline);
+      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                          bias_memory_p, dst_memory_p);
+    } else {
+      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                          dst_memory_p);
+    }
     // push primitive to stream and wait until it's executed
     pipeline.push_back(*conv_p);
@@ -346,6 +421,27 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
         p_conv_pd);
   }
+  std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
+  ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
+                       const memory::desc& bias, const memory::desc& dst,
+                       const std::vector<int>& strides,
+                       const std::vector<int>& paddings,
+                       const mkldnn::engine& engine) const {
+    memory::dims stride_dims = {strides[0], strides[1]};
+    memory::dims padding_dims = {paddings[0], paddings[1]};
+    auto conv_desc = mkldnn::convolution_forward::desc(
+        mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights,
+        bias, dst, stride_dims, padding_dims, padding_dims,
+        mkldnn::padding_kind::zero);
+    auto p_conv_pd =
+        new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
+    return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
+        p_conv_pd);
+  }
 };
 template <typename T>
@@ -393,13 +489,6 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     T* input_grad_data = nullptr;
     T* filter_grad_data = nullptr;
-    if (input_grad) {
-      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-    }
-    if (filter_grad) {
-      filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-    }
     std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
     std::vector<int> weights_tz =
         paddle::framework::vectorize2int(filter->dims());
@@ -485,6 +574,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         handler.AcquireDiffDstMemoryFromWeightsPrimitive(
             user_diff_dst_memory_p, pipeline);
+    const size_t size = handler.GetDiffWeightsMemorySize();
+    filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace(), size);
     auto diff_weights_memory_p =
         handler.AcquireDiffWeightsMemoryFromWeightsPrimitive(
             reinterpret_cast<void*>(filter_grad_data));
@@ -507,6 +599,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         handler.AcquireDiffDstMemoryFromDataPrimitive(user_diff_dst_memory_p,
                                                       pipeline);
+    const size_t size = handler.GetDiffSourceMemorySize();
+    input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace(), size);
     auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive(
         reinterpret_cast<void*>(input_grad_data));
......
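Why the forward and backward kernels now size their output buffers from the handler (GetDstMemorySize and friends): MKL-DNN may pick a blocked layout whose buffer exceeds numel() * sizeof(T). A sketch with assumed numbers, in the style of nChw16c channel padding.

#include <cassert>
#include <cstddef>

// Bytes implied by the logical NCHW dims alone.
size_t logical_bytes(size_t n, size_t c, size_t h, size_t w) {
  return n * c * h * w * sizeof(float);
}
// Bytes for a blocked layout that rounds C up to a multiple of the block.
size_t blocked_bytes(size_t n, size_t c, size_t h, size_t w, size_t block) {
  size_t c_padded = (c + block - 1) / block * block;
  return n * c_padded * h * w * sizeof(float);
}
int main() {
  // A 1x3x224x224 fp32 image in an nChw16c-style layout pads C from 3 to 16,
  // so allocating from the logical dims would under-size the buffer.
  assert(blocked_bytes(1, 3, 224, 224, 16) > logical_bytes(1, 3, 224, 224));
  return 0;
}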
@@ -37,6 +37,7 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
   auto in_dims = ctx->GetInputDim("Input");
   auto filter_dims = ctx->GetInputDim("Filter");
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
   int groups = ctx->Attrs().Get<int>("groups");
@@ -57,7 +58,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups,
                     "The number of input channels should be equal to filter "
                     "channels * groups.");
   PADDLE_ENFORCE_EQ(
       filter_dims[0] % groups, 0,
       "The number of output channels should be divisible by groups.");
@@ -122,6 +122,11 @@ void Conv2DOpMaker::Make() {
            "H is the height of the filter, and W is the width of the filter. "
            "If the groups attribute is greater than 1, C equals the number of "
            "input image channels divided by the groups.");
+  AddInput("Bias",
+           "(Tensor) Bias to be added to each output of filter application. "
+           "The format of output tensor is X (one-dimensional) of size equal "
+           "to the number of output channels. Only used with MKL-DNN.")
+      .AsDispensable();
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator. "
             "The format of output tensor is also NCHW.")
......
@@ -85,6 +85,199 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     int* track_value =
         track.mutable_data<int>(emission_dims, platform::CPUPlace());
+#ifdef __AVX__
+// It uses AVX or AVX512 instructions to process the data as vectors of 8 or
+// 16 elements per iteration, which enables parallel processing.
+// Only the float type is optimized.
+#ifdef __AVX512F__
+    size_t step_size = 16;
+#else
+    size_t step_size = 8;
+#endif
+    if (std::is_same<T, float>::value && (tag_num >= step_size)) {
+      size_t steps = tag_num / step_size;
+      size_t remain = tag_num % step_size;
+      int last_offset = static_cast<int>(remain) - static_cast<int>(step_size);
+      // Setup the alpha initial value.
+      size_t i_offset = 0;
+      for (size_t i = 0; i <= steps; ++i) {
+#ifdef __AVX512F__
+        // Declare the variables for the content of weights, input and alpha
+        // values.
+        __m512 w_content, x_content, alpha_content;
+        // Load the relevant data into the variables from an unaligned address.
+        w_content = _mm512_loadu_ps((const float*)(w + i_offset));
+        x_content = _mm512_loadu_ps((const float*)(x + i_offset));
+        alpha_content = _mm512_add_ps(w_content, x_content);
+        // Save the alpha value.
+        _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
+                         alpha_content);
+#else
+        // Declare the variables for the content of weights, input and alpha
+        // values.
+        __m256 w_content, x_content, alpha_content;
+        // Load the relevant data into the variables from an unaligned address.
+        w_content = _mm256_loadu_ps((const float*)(w + i_offset));
+        x_content = _mm256_loadu_ps((const float*)(x + i_offset));
+        alpha_content = _mm256_add_ps(w_content, x_content);
+        // Save the alpha value.
+        _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
+                         alpha_content);
+#endif
+        i_offset += step_size;
+        if (i == steps - 1) {
+          if (remain > 0) {
+            i_offset += last_offset;
+          } else {
+            break;
+          }
+        }
+      }
+      // Use the column-major strategy to get the location of maximum score.
+      size_t seq_offset = 0;
+      for (size_t k = 1; k < seq_len; ++k) {
+        size_t j_offset = 0;
+        for (size_t j = 0; j <= steps; ++j) {
+#ifdef __AVX512F__
+          // Initialize the variables of maximum score and location.
+          __m512 max_score = _mm512_set1_ps(-std::numeric_limits<T>::max());
+          __m512i max_j = _mm512_setzero_si512();
+#else
+          // Initialize the variables of maximum score and location.
+          __m256 max_score = _mm256_set1_ps(-std::numeric_limits<T>::max());
+          __m256i max_j = _mm256_set1_epi32(0);
+#endif
+          // Calculate the offset of transition_weights.
+          size_t trans_offset = state_trans_base_idx * tag_num + j_offset;
+          for (size_t i = 0; i < tag_num; ++i) {
+#ifdef __AVX512F__
+            // Initialize the content of the alpha variable with the related
+            // offset.
+            __m512 alpha_content =
+                _mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i));
+            // Obtain the content of weights from an unaligned address.
+            __m512 w_content =
+                _mm512_loadu_ps((const float*)(w + trans_offset));
+            __m512 score_v = _mm512_add_ps(alpha_content, w_content);
+            __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
+            // According to the mask value, it updates the index of the
+            // max_score location.
+            max_j = _mm512_mask_set1_epi32(max_j, mask, i);
+            // Update the max_score value.
+            max_score = _mm512_max_ps(max_score, score_v);
+#else
+            // Initialize the content of the alpha variable with the related
+            // offset.
+            __m256 alpha_content = _mm256_broadcast_ss(
+                (const float*)(alpha_value + seq_offset + i));
+            // Obtain the content of weights from an unaligned address.
+            __m256 w_content =
+                _mm256_loadu_ps((const float*)(w + trans_offset));
+            __m256 score_v = _mm256_add_ps(alpha_content, w_content);
+            __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
+#ifdef __AVX2__
+            // According to the mask value, it updates the index of the
+            // max_score location.
+            max_j = _mm256_or_si256(
+                _mm256_andnot_si256((__m256i)mask, max_j),
+                _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
+#else
+            __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
+            __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
+            __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);
+            __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);
+            lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
+            hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
+            lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
+            hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
+            lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
+            hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
+            // According to the mask value, it updates the index of the
+            // max_score location.
+            max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
+            max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
+#endif
+            // Update the max_score value.
+            max_score = _mm256_max_ps(max_score, score_v);
+#endif
+            trans_offset += tag_num;
+          }
+#ifdef __AVX512F__
+          // Update the alpha and track values.
+          __m512 x_content = _mm512_loadu_ps(
+              (const float*)(x + seq_offset + tag_num + j_offset));
+          max_score = _mm512_add_ps(max_score, x_content);
+          _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
+                                                    tag_num + j_offset),
+                           max_score);
+          _mm512_storeu_si512(
+              reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num +
+                                         j_offset),
+              max_j);
+#else
+          // Update the alpha and track values.
+          __m256 x_content = _mm256_loadu_ps(
+              (const float*)(x + seq_offset + tag_num + j_offset));
+          max_score = _mm256_add_ps(max_score, x_content);
+          _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
+                                                    tag_num + j_offset),
+                           max_score);
+          _mm256_storeu_si256(
+              reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num +
+                                         j_offset),
+              max_j);
+#endif
+          // Calculate the offset of the next step.
+          j_offset += step_size;
+          if (j == steps - 1) {
+            if (remain > 0) {
+              j_offset += last_offset;
+            } else {
+              break;
+            }
+          }
+        }
+        seq_offset += tag_num;
+      }
+    } else {
+      for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
+      for (size_t k = 1; k < seq_len; ++k) {
+        for (size_t i = 0; i < tag_num; ++i) {
+          T max_score = -std::numeric_limits<T>::max();
+          int max_j = 0;
+          for (size_t j = 0; j < tag_num; ++j) {
+            T score = alpha_value[(k - 1) * tag_num + j] +
+                      w[(j + state_trans_base_idx) * tag_num + i];
+            if (score > max_score) {
+              max_score = score;
+              max_j = j;
+            }
+          }
+          alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
+          track_value[k * tag_num + i] = max_j;
+        }
+      }
+    }
+#else
     for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
     for (size_t k = 1; k < seq_len; ++k) {
@@ -105,6 +298,7 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
       }
     }
+#endif
     T max_score = -std::numeric_limits<T>::max();
     int max_i = 0;
     for (size_t i = 0; i < tag_num; ++i) {
......
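The AVX loops above handle a ragged tail by shifting the final vector iteration back so that it overlaps the previous one; a scalar sketch of just that offset logic follows. Re-processing a few elements is harmless here and avoids a masked or scalar tail loop.

#include <cstddef>
#include <vector>

std::vector<size_t> chunk_offsets(size_t tag_num, size_t step_size) {
  size_t steps = tag_num / step_size;
  size_t remain = tag_num % step_size;
  int last_offset = static_cast<int>(remain) - static_cast<int>(step_size);
  std::vector<size_t> offsets;
  size_t offset = 0;
  for (size_t i = 0; i <= steps; ++i) {
    offsets.push_back(offset);  // chunk covers [offset, offset + step_size)
    offset += step_size;
    if (i == steps - 1) {
      if (remain > 0) {
        offset += last_offset;  // step back so the last chunk stays in bounds
      } else {
        break;  // tag_num divides evenly; no extra chunk needed
      }
    }
  }
  return offsets;  // e.g. tag_num=20, step_size=8 -> {0, 8, 12}
}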
@@ -130,12 +130,13 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
       checkpoint_notify_id != -1,
       "when checkpoint_notify_id = -1, there should be no RPC invoke.");
-  auto* lt_var = scope->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
+  // TODO(tangwei12): find out why using the passed-in scope causes an error.
+  auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
   lt_var->clear();
   lt_var->append(out_var_name);
   VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: "
           << out_var_name;
-  executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope);
+  executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope_);
   return true;
 }
......
@@ -137,9 +137,10 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
 };
 template <typename T>
-class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
+class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
......
...@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/blas.h"
...@@ -136,9 +137,11 @@ elementwise_add_grad(const framework::ExecutionContext& ctx,
 }
 template <typename DeviceContext, typename T>
-class ElementwiseAddGradKernel : public framework::OpKernel<T> {
+class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
......
...@@ -14,8 +14,8 @@ limitations under the License. */
 #pragma once
+#include "paddle/fluid/operators/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
 namespace paddle {
 namespace operators {
...@@ -53,9 +53,10 @@ struct DivGradDY {
 };
 template <typename DeviceContext, typename T>
-class ElementwiseDivGradKernel : public framework::OpKernel<T> {
+class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
     auto* x = ctx.Input<Tensor>("X");
......
...@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
+#include "paddle/fluid/operators/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
 namespace paddle {
...@@ -55,9 +56,10 @@ struct MaxGradDy {
 };
 template <typename DeviceContext, typename T>
-class ElementwiseMaxGradKernel : public framework::OpKernel<T> {
+class ElementwiseMaxGradKernel : public ElemwiseGradKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
     auto* x = ctx.Input<Tensor>("X");
......
...@@ -14,8 +14,8 @@ limitations under the License. */
 #pragma once
+#include "paddle/fluid/operators/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
 namespace paddle {
 namespace operators {
...@@ -55,9 +55,10 @@ struct MinGradDy {
 };
 template <typename DeviceContext, typename T>
-class ElementwiseMinGradKernel : public framework::OpKernel<T> {
+class ElementwiseMinGradKernel : public ElemwiseGradKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
     auto* x = ctx.Input<Tensor>("X");
......
...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include "paddle/fluid/operators/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
 namespace paddle {
 namespace operators {
...@@ -23,6 +25,37 @@ struct MulFunctor {
   inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
 };
+template <typename DeviceContext, typename T>
+void default_elementwise_mul(const framework::ExecutionContext& ctx,
+                             const framework::Tensor* x,
+                             const framework::Tensor* y, framework::Tensor* z) {
+  int axis = ctx.Attr<int>("axis");
+  ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                        MulFunctor<T>(), z);
+}
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_mul(const framework::ExecutionContext& ctx,
+                const framework::Tensor* x, const framework::Tensor* y,
+                framework::Tensor* z) {
+  auto blas = math::GetBlas<DeviceContext, T>(ctx);
+  blas.VMUL(x->numel(), x->data<T>(), y->data<T>(),
+            z->mutable_data<T>(ctx.GetPlace()));
+}
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    !std::is_floating_point<T>::value ||
+    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_mul(const framework::ExecutionContext& ctx,
+                const framework::Tensor* x, const framework::Tensor* y,
+                framework::Tensor* z) {
+  default_elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+}
 template <typename DeviceContext, typename T>
 class ElementwiseMulKernel : public framework::OpKernel<T> {
  public:
...@@ -33,9 +66,11 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
     auto* y = ctx.Input<Tensor>("Y");
     auto* z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          MulFunctor<T>(), z);
+    if (x->numel() == y->numel()) {
+      elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+    } else {
+      default_elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+    }
   }
 };
...@@ -50,9 +85,10 @@ struct MulGradDY {
 };
 template <typename DeviceContext, typename T>
-class ElementwiseMulGradKernel : public framework::OpKernel<T> {
+class ElementwiseMulGradKernel : public ElemwiseGradKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
     auto* x = ctx.Input<Tensor>("X");
......
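The pair of elementwise_mul overloads added above use std::enable_if so that the MKL VMUL fast path is only instantiated for floating-point types on the CPU device; every other combination falls through to the generic broadcast kernel. A self-contained sketch of that SFINAE dispatch (toy context types, not the Paddle headers):

#include <iostream>
#include <type_traits>

struct CPUCtx {};  // stands in for platform::CPUDeviceContext
struct GPUCtx {};

// Fast path: only viable for floating-point T on the CPU context.
template <typename Ctx, typename T>
typename std::enable_if<std::is_floating_point<T>::value &&
                        std::is_same<Ctx, CPUCtx>::value>::type
Mul(const T* x, const T* y, T* z, int n) {
  for (int i = 0; i < n; ++i) z[i] = x[i] * y[i];  // would call blas.VMUL
  std::cout << "vectorized path\n";
}

// Fallback: everything else goes through the generic kernel.
template <typename Ctx, typename T>
typename std::enable_if<!std::is_floating_point<T>::value ||
                        !std::is_same<Ctx, CPUCtx>::value>::type
Mul(const T* x, const T* y, T* z, int n) {
  for (int i = 0; i < n; ++i) z[i] = x[i] * y[i];
  std::cout << "generic path\n";
}

int main() {
  float a[2] = {1, 2}, b[2] = {3, 4}, c[2];
  Mul<CPUCtx>(a, b, c, 2);  // vectorized path
  int d[2] = {1, 2}, e[2] = {3, 4}, f[2];
  Mul<CPUCtx>(d, e, f, 2);  // generic path (int is not floating point)
  return 0;
}

Exactly one overload is viable for every (Ctx, T) pair, so overload resolution is unambiguous; this is the same design choice the commit applies to elementwise_mul.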
...@@ -205,6 +205,20 @@ class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
   }
 };
+template <typename T>
+class ElemwiseGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dx =
+        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    if (dx != nullptr) {
+      auto& dout =
+          *context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+      dx->set_lod(dout.lod());
+    }
+  }
+};
 }  // namespace operators
 }  // namespace paddle
......
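The new ElemwiseGradKernel base class exists so every elementwise gradient kernel propagates the sequence structure (LoD) of dOut onto dX before computing values; that is why each grad kernel in this commit now derives from it and calls ElemwiseGradKernel<T>::Compute(ctx) first. A toy model of what that base call buys (hypothetical minimal types, not the Paddle framework):

#include <cassert>
#include <vector>

// Toy LoD tensor: data plus level-of-detail (sequence offsets).
struct ToyLoDTensor {
  std::vector<float> data;
  std::vector<size_t> lod;  // e.g. {0, 2, 5}: two sequences, lengths 2 and 3
};

// Essence of ElemwiseGradKernel<T>::Compute: before the derived kernel fills
// in dX's values, copy dOut's sequence structure onto dX.
void ForwardLoD(const ToyLoDTensor& dout, ToyLoDTensor* dx) {
  if (dx != nullptr) dx->lod = dout.lod;
}

int main() {
  ToyLoDTensor dout{{0.1f, 0.2f, 0.3f, 0.4f, 0.5f}, {0, 2, 5}};
  ToyLoDTensor dx;
  ForwardLoD(dout, &dx);
  assert(dx.lod == dout.lod);  // sequence boundaries survive the backward pass
  return 0;
}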
...@@ -80,6 +80,9 @@ inline framework::DDim trim_trailing_singular_dims(
   for (int i = 0; i < actual_dims_size; ++i) {
     trim_dims[i] = dims[i];
   }
+  if (trim_dims.size() == 0) {
+    return framework::DDim(framework::make_dim());
+  }
   framework::DDim actual_dims = framework::make_ddim(trim_dims);
   return actual_dims;
 }
......
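The added guard covers the case where every dimension is singular: trim_dims is then empty, and building the result through make_ddim on an empty vector was the failing path, so a rank-0 DDim is constructed directly via make_dim(). A toy illustration of the trimming rule itself (hypothetical helper on plain shapes, not the Paddle DDim API):

#include <cassert>
#include <vector>

// Drop trailing 1s from a shape: [4, 1, 1] trims to [4]; an all-singular
// shape like [1, 1] trims to the empty (rank-0) shape, which is the case the
// new branch in trim_trailing_singular_dims handles explicitly.
std::vector<int> TrimTrailingSingularDims(const std::vector<int>& dims) {
  size_t n = dims.size();
  while (n > 0 && dims[n - 1] == 1) --n;
  return std::vector<int>(dims.begin(), dims.begin() + n);
}

int main() {
  assert((TrimTrailingSingularDims({4, 1, 1}) == std::vector<int>{4}));
  assert((TrimTrailingSingularDims({1, 1}).empty()));
  return 0;
}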
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include "paddle/fluid/operators/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
 namespace paddle {
...@@ -50,9 +51,10 @@ struct SubGradDY {
 };
 template <typename DeviceContext, typename T>
-class ElementwiseSubGradKernel : public framework::OpKernel<T> {
+class ElementwiseSubGradKernel : public ElemwiseGradKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
......
...@@ -15,8 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/fc_op.h"
 #include <vector>
 #include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/fc_compute.h"
-DECLARE_int32(paddle_num_threads);
 namespace paddle {
 namespace operators {
...@@ -36,9 +35,14 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
   if (ctx->HasInput("Bias")) {
     auto bias_dims = ctx->GetInputDim("Bias");
+    if (bias_dims.size() == 2) {
       PADDLE_ENFORCE_EQ(bias_dims[0], 1, "The shape of Bias must be [1, dim].");
       PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1],
                         "The shape of Bias must be [1, dim].");
+    } else if (bias_dims.size() == 1) {
+      PADDLE_ENFORCE_EQ(bias_dims[0], w_dims[1],
+                        "The shape of Bias must be [1, dim].");
+    }
   }
   PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
                  "Fully Connected input should be 2-D or 4-D tensor.");
...@@ -110,13 +114,8 @@ void FCOpMaker::Make() {
   AddComment(R"DOC(
 Fully Connected Operator.
-The fully connected operation calculates the output based on the input, weights and bias attribute.
+The fully connected operation calculates the output based on the input, weights and bias.
 The size of each dimension of the parameters checked in the infer-shape.
-The matrix of bias is generated by the mkldnn framework, when the bias_attr is True.
-Additional parametrs are use_mkldnn and bias_attr.
-The input(X) size and output(Out) size may be diffrent.
-The fully connected layer only supports MKLDNN version
 )DOC");
 }
...@@ -133,26 +132,15 @@ class FCOpKernel : public framework::OpKernel<T> {
     auto in_dims = input->dims();
     auto w_dims = w->dims();
-    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(dev_ctx);
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    blas.GEMM(CblasNoTrans, CblasNoTrans, in_dims[0], w_dims[1], w_dims[0],
-              static_cast<T>(1), input_data, w_data, static_cast<T>(0),
-              output_data);
-    if (bias) {
-      const T* bias_data = bias->data<T>();
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
-#endif
-      for (int bs = 0; bs < in_dims[0]; bs++) {
-        blas.AXPY(w_dims[1], static_cast<T>(1), bias_data,
-                  output_data + bs * w_dims[1]);
-      }
-    }
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+    math::FCCompute<platform::CPUDeviceContext, T>(
+        blas, in_dims[0], w_dims[1], w_dims[0], input_data, w_data, output_data,
+        bias ? bias->data<T>() : NULL);
+    // TODO(TJ): fuse act
   }
 };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fusion_lstm_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
namespace paddle {
namespace operators {
void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("WeightX"),
"Input(WeightX) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("WeightH"),
"Input(WeightH) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Bias"),
"Input(Bias) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("XX"),
"Output(XX) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Cell"),
"Output(Cell) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"),
"Output(BatchedGate) of LSTM should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
                 "Output(BatchCellPreAct) of LSTM should not be null.");
auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
if (ctx->HasInput("H0")) {
PADDLE_ENFORCE(ctx->HasInput("C0"),
"Input(Cell) and Input(Hidden) of LSTM should not "
"be null at the same time.");
auto h_dims = ctx->GetInputDim("H0");
auto c_dims = ctx->GetInputDim("C0");
PADDLE_ENFORCE(h_dims == c_dims,
"The dimension of Input(H0) and Input(C0) "
"should be the same.");
}
auto wx_dims = ctx->GetInputDim("WeightX");
PADDLE_ENFORCE_EQ(wx_dims.size(), 2,
"The rank of Input(WeightX) should be 2.");
PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1],
"The first dimension of Input(WeightX) "
"should be %d.",
x_dims[1]);
int frame_size = wx_dims[1] / 4;
auto wh_dims = ctx->GetInputDim("WeightH");
PADDLE_ENFORCE_EQ(wh_dims.size(), 2,
"The rank of Input(WeightH) should be 2.");
PADDLE_ENFORCE_EQ(wh_dims[0], frame_size,
"The first dimension of Input(WeightH) "
"should be %d.",
frame_size);
PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size,
"The second dimension of Input(WeightH) "
"should be 4 * %d.",
frame_size);
auto b_dims = ctx->GetInputDim("Bias");
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
PADDLE_ENFORCE_EQ(b_dims[0], 1,
"The first dimension of Input(Bias) should be 1.");
PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_peepholes"),
"Do not support peephole yet.");
  PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
                    "The second dimension of Input(Bias) should be "
                    "4 * %d if peephole connections are disabled.",
frame_size);
framework::DDim out_dims({x_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims);
ctx->SetOutputDim("Cell", out_dims);
ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchCellPreAct", out_dims);
ctx->ShareLoD("X", "Hidden");
ctx->ShareLoD("X", "Cell");
int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
ctx->SetOutputDim("XX", {x_dims[0], xx_width});
ctx->ShareLoD("X", "XX");
}
framework::OpKernelType FusionLSTMOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
void FusionLSTMOpMaker::Make() {
  AddInput("X",
           "(LoDTensor) the input is a LoDTensor, which supports "
"variable-time length input sequence. The underlying tensor in "
"this LoDTensor is a matrix with shape (T X M), where T is the "
"total time steps in this mini-batch, M is the dim size of x.");
AddInput("WeightX",
"(Tensor) the learnable weights of X."
" - The shape is (M x 4D), where M is the dim size of x, D is the "
"hidden size. "
" - Weight = {W_cx, W_ix, W_fx, W_ox}");
AddInput("WeightH",
"(Tensor) same as LSTMOp, the learnable hidden-hidden weights."
" - The shape is (D x 4D), where D is the hidden size. "
" - Weight = {W_ch, W_ih, W_fh, W_oh}");
AddInput("Bias",
"(Tensor) the learnable weights. Almost same as LSTMOp"
"Note: we should add the fc bias into this (1x4D) in bias."
"input-hidden bias weight and peephole connections weight if "
"setting `use_peepholes` True. "
"1. `use_peepholes = False` "
" - The shape is (1 x 4D). "
" - Bias = {b_c, b_i, b_f, b_o}."
"2. `use_peepholes = True` "
" - The shape is (1 x 7D). "
" - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
AddInput("H0",
"(Tensor, optional) (same as LSTMOp) the initial hidden state is an "
"optional "
"input. This is a tensor with shape (N x D), where N is the "
"batch size and D is the hidden size.")
.AsDispensable();
  AddInput("C0",
           "(Tensor, optional) (same as LSTMOp) the initial cell state is an "
"optional "
"input. This is a tensor with shape (N x D), where N is the "
"batch size. `H0` and `C0` can be NULL but only at the same time.")
.AsDispensable();
AddOutput("Hidden",
"(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. "
"The shape is (T x D), and lod is the same with the `Input`.");
AddOutput("Cell",
"(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. "
"The shape is (T x D), and lod is the same with the `Input`.");
AddOutput("XX",
"(LoDTensor) the result after X * WeightX (size is T x 4D)"
" or batched_X (size is T x M), this will be automatically chosen,"
" where T is the total time steps in this mini-batch,"
" D is the hidden size, M is the dim size of x input.")
.AsIntermediate();
AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate();
AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).")
.AsIntermediate();
  AddAttr<bool>("use_peepholes",
                "(bool, default: True) "
"whether to enable diagonal/peephole connections.")
.SetDefault(true);
  AddAttr<bool>("is_reverse",
                "(bool, default: False) "
"whether to compute reversed LSTM.")
.SetDefault(false);
AddAttr<std::string>("gate_activation",
"(string, default: sigmoid)"
"The activation for input gate, forget gate and output "
"gate, `sigmoid` by default.")
.SetDefault("sigmoid")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
  AddAttr<std::string>("cell_activation",
                       "(string, default: tanh)"
                       "The activation for cell output, `tanh` by default.")
.SetDefault("tanh")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddAttr<std::string>("candidate_activation",
"(string, default: tanh)"
"The activation for candidate hidden state, "
"`tanh` by default.")
.SetDefault("tanh")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddComment(R"DOC(
Fusion Long-Short Term Memory (LSTM) Operator.
This operator fuses X into the LSTM; for more details, please refer to the LSTM op.
)DOC");
}
template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src,
framework::Vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace());
// TODO(TJ): check mem copy perf
row_shuffle(ctx, src, index_lod, dst, indexed_src);
}
template <typename DeviceContext, typename T>
class FuisonLSTMKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<LoDTensor>("X");
auto* wx = ctx.Input<Tensor>("WeightX");
auto* wh = ctx.Input<Tensor>("WeightH");
auto* bias = ctx.Input<Tensor>("Bias");
auto* hidden_t0 = ctx.Input<Tensor>("H0");
auto* cell_t0 = ctx.Input<Tensor>("C0");
auto* xx = ctx.Output<LoDTensor>("XX");
auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate");
auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
auto* cell_out = ctx.Output<LoDTensor>("Cell");
bool is_reverse = ctx.Attr<bool>("is_reverse");
T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace());
hidden_out->mutable_data<T>(ctx.GetPlace());
cell_out->mutable_data<T>(ctx.GetPlace());
const T* x_data = x->data<T>();
const T* wx_data = wx->data<T>();
auto x_dims = x->dims();
auto wx_dims = wx->dims();
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
if (x_dims[1] > wx_dims[1]) {
math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
x_data, wx_data, xx_data,
bias->data<T>());
to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
} else {
to_batch(dev_ctx, *x, xx, true, is_reverse);
batched_gate->set_lod(xx->lod());
math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
xx_data, wx_data, batched_gate_data,
bias->data<T>());
}
int frame_size = static_cast<int>(wx_dims[1] / 4);
framework::DDim out_dims({x_dims[0], frame_size});
math::LstmMetaValue<T> lstm_value;
// no peephole
lstm_value.check_ig = nullptr;
lstm_value.check_fg = nullptr;
lstm_value.check_og = nullptr;
lstm_value.prev_state_value = nullptr;
Tensor ordered_c0;
framework::Vector<size_t> order(batched_gate->lod()[2]);
if (cell_t0) {
      // Since the batch computing for LSTM reorders the input sequences
      // according to their length, the initialized cell state also needs
      // to be reordered.
ReorderInitState<DeviceContext, T>(dev_ctx, *cell_t0, order, &ordered_c0,
true);
lstm_value.prev_state_value = ordered_c0.data<T>();
}
    // Use local variables here.
LoDTensor batch_hidden, batch_cell;
auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
batch_hidden.mutable_data<T>(out_dims, ctx.GetPlace());
batch_cell.mutable_data<T>(out_dims, ctx.GetPlace());
batch_cell_pre_act->mutable_data<T>(out_dims, ctx.GetPlace());
auto batch_starts = batched_gate->lod()[0];
size_t max_seq_len = batch_starts.size() - 1;
auto gate_act = math::detail::GetActivationType(
ctx.Attr<std::string>("gate_activation"));
auto cell_act = math::detail::GetActivationType(
ctx.Attr<std::string>("cell_activation"));
auto cand_act = math::detail::GetActivationType(
ctx.Attr<std::string>("candidate_activation"));
for (size_t n = 0; n < max_seq_len; n++) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
Tensor gate_t = batched_gate->Slice(bstart, bend);
Tensor out_t = batch_hidden.Slice(bstart, bend);
Tensor cell_t = batch_cell.Slice(bstart, bend);
Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
int cur_batch_size = bend - bstart;
if (n > 0) {
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
int pre_h_end = pre_h_start + cur_batch_size;
auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
// TODO(TJ): use gemm directly
blas.MatMul(pre_hidden_t, false, *wh, false, static_cast<T>(1.0),
&gate_t, static_cast<T>(1.0));
} else if (hidden_t0) {
// TODO(TJ): move h0 outside for
        // If n == 0 and there is no initialized hidden state, that is to say
        // H0 is all zeros, the calculation of W_h * H0 will be skipped.
        // If n == 0 and there is an initialized hidden state, calculate W_h * H0.
        // Since the batch computing for LSTM reorders the input sequences
        // according to their length, the initialized hidden state also needs
        // to be reordered.
Tensor ordered_h0;
ReorderInitState<DeviceContext, T>(dev_ctx, *hidden_t0, order,
&ordered_h0, true);
// TODO(TJ): use gemm directly
blas.MatMul(ordered_h0, false, *wh, false, static_cast<T>(1.0), &gate_t,
static_cast<T>(1.0));
}
lstm_value.gate_value = gate_t.data<T>();
lstm_value.output_value = out_t.data<T>();
lstm_value.state_value = cell_t.data<T>();
lstm_value.state_active_value = cell_pre_act_t.data<T>();
math::LstmUnitFunctor<DeviceContext, T>::compute(
dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act,
cand_act);
lstm_value.prev_state_value = lstm_value.state_value;
}
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
batch_hidden.set_lod(batched_gate->lod());
// restore the output hidden in LoDTensor from the batch hidden
to_seq(dev_ctx, batch_hidden, hidden_out);
batch_cell.set_lod(batched_gate->lod());
// restore the output cell state in LoDTensor from the batch cell
to_seq(dev_ctx, batch_cell, cell_out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(
fusion_lstm,
ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, float>,
ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, double>);
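For readers tracing the fused kernel: after the batched GEMMs produce the pre-activation gates, LstmUnitFunctor applies a standard no-peephole LSTM cell. Using the weight and bias layout documented in the op maker above, and writing phi_g for the gate activation, phi_cand for the candidate activation, and phi_cell for the cell-output activation (sigmoid/tanh/tanh by default), the recurrence is a standard formulation consistent with those layouts:

$$
\begin{aligned}
i_t &= \phi_g(W_{ix} x_t + W_{ih} h_{t-1} + b_i), &
f_t &= \phi_g(W_{fx} x_t + W_{fh} h_{t-1} + b_f), \\
o_t &= \phi_g(W_{ox} x_t + W_{oh} h_{t-1} + b_o), &
\tilde{c}_t &= \phi_{cand}(W_{cx} x_t + W_{ch} h_{t-1} + b_c), \\
c_t &= f_t \odot c_{t-1} + i_t \odot \tilde{c}_t, &
h_t &= o_t \odot \phi_{cell}(c_t).
\end{aligned}
$$

The assignment lstm_value.prev_state_value = lstm_value.state_value at the end of each batch step is what carries c_{t-1} forward into the next step.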
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
class FusionLSTMOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class FusionLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
} // namespace operators
} // namespace paddle
...@@ -92,6 +92,7 @@ class LoadOp : public framework::OperatorBase {
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);
     framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
+    selectedRows->SyncIndex();
   }
 };
......
...@@ -90,6 +90,11 @@ class Blas {
   void GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T* A,
             int lda, const T* B, int ldb, T beta, T* C, int ldc) const;
+  template <typename T>
+  void GEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
+            T alpha, const T* A, int lda, const T* B, int ldb, T beta, T* C,
+            int ldc) const;
 #ifdef PADDLE_WITH_MKLML
   template <typename T>
   T* GEMM_ALLOC(const CBLAS_IDENTIFIER id, const int M, const int N,
...@@ -109,6 +114,10 @@ class Blas {
   void GEMM_FREE(T* data) const;
 #endif
+  template <typename T>
+  void MatMul(const int M, const int N, const int K, const T* A, const T* B,
+              T* C) const;
   template <typename T>
   void MatMul(const framework::Tensor& mat_a, bool trans_a,
               const framework::Tensor& mat_b, bool trans_b, T alpha,
...@@ -134,13 +143,25 @@ class Blas {
   template <typename T>
   void VADD(int n, const T* x, const T* y, T* z) const;
+  template <typename T>
+  void VMUL(int n, const T* x, const T* y, T* z) const;
   template <typename T>
   void VCOPY(int n, const T* x, T* y) const;
+  template <typename T>
+  void VEXP(int n, const T* x, T* y) const;
   template <typename T>
   void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta,
             T* C) const;
+  template <typename T>
+  T DOT(int n, const T* x, const T* y) const;
+  template <typename T>
+  void SCAL(int n, const T a, T* x) const;
   template <typename T>
   void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N,
                    int K, T alpha, const T* A, const T* B, T beta, T* C,
...@@ -202,16 +223,36 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template VADD<T>(args...);
   }
+  template <typename... ARGS>
+  void VMUL(ARGS... args) const {
+    Base()->template VMUL<T>(args...);
+  }
   template <typename... ARGS>
   void VCOPY(ARGS... args) const {
     Base()->template VCOPY<T>(args...);
   }
+  template <typename... ARGS>
+  void VEXP(ARGS... args) const {
+    Base()->template VEXP<T>(args...);
+  }
   template <typename... ARGS>
   void GEMV(ARGS... args) const {
     Base()->template GEMV<T>(args...);
   }
+  template <typename... ARGS>
+  T DOT(ARGS... args) const {
+    return Base()->template DOT<T>(args...);
+  }
+  template <typename... ARGS>
+  void SCAL(ARGS... args) const {
+    Base()->template SCAL<T>(args...);
+  }
   template <typename... ARGS>
   void BatchedGEMM(ARGS... args) const {
     Base()->template BatchedGEMM<T>(args...);
......
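The BlasT wrappers above forward to the type-erased Blas methods with the scalar type pinned once, via Base()->template F<T>(args...), so op kernels can write blas.DOT(n, x, y) without repeating T at every call. A compilable miniature of that layering (toy stand-ins for the Paddle classes, only DOT shown):

#include <iostream>

struct CPUCtx {};  // toy stand-in for platform::CPUDeviceContext

// Type-erased layer: methods are templates over the scalar type T.
template <typename Context>
struct ToyBlas {
  template <typename T>
  T DOT(int n, const T* x, const T* y) const {
    T s = 0;
    for (int i = 0; i < n; ++i) s += x[i] * y[i];
    return s;
  }
};

// Typed layer: pins T once, forwarding variadically like the real BlasT.
template <typename Context, typename T>
struct ToyBlasT : private ToyBlas<Context> {
  template <typename... ARGS>
  T DOT(ARGS... args) const {
    return Base()->template DOT<T>(args...);  // forward with T fixed
  }

 private:
  const ToyBlas<Context>* Base() const {
    return static_cast<const ToyBlas<Context>*>(this);
  }
};

int main() {
  ToyBlasT<CPUCtx, float> blas;
  const float x[3] = {1, 2, 3}, y[3] = {4, 5, 6};
  std::cout << blas.DOT(3, x, y) << "\n";  // prints 32
  return 0;
}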
...@@ -73,6 +73,16 @@ struct CBlas<float> {
     platform::dynload::cblas_sgemv(args...);
   }
+  template <typename... ARGS>
+  static float DOT(ARGS... args) {
+    return platform::dynload::cblas_sdot(args...);
+  }
+  template <typename... ARGS>
+  static void SCAL(ARGS... args) {
+    platform::dynload::cblas_sscal(args...);
+  }
   template <typename... ARGS>
   static void GEMM_BATCH(ARGS... args) {
     platform::dynload::cblas_sgemm_batch(args...);
...@@ -82,6 +92,16 @@ struct CBlas<float> {
   static void VADD(ARGS... args) {
     platform::dynload::vsAdd(args...);
   }
+  template <typename... ARGS>
+  static void VMUL(ARGS... args) {
+    platform::dynload::vsMul(args...);
+  }
+  template <typename... ARGS>
+  static void VEXP(ARGS... args) {
+    platform::dynload::vsExp(args...);
+  }
 };
 template <>
...@@ -133,6 +153,16 @@ struct CBlas<double> {
     platform::dynload::cblas_dgemv(args...);
   }
+  template <typename... ARGS>
+  static double DOT(ARGS... args) {
+    return platform::dynload::cblas_ddot(args...);
+  }
+  template <typename... ARGS>
+  static void SCAL(ARGS... args) {
+    platform::dynload::cblas_dscal(args...);
+  }
   template <typename... ARGS>
   static void GEMM_BATCH(ARGS... args) {
     platform::dynload::cblas_dgemm_batch(args...);
...@@ -142,6 +172,16 @@ struct CBlas<double> {
   static void VADD(ARGS... args) {
     platform::dynload::vdAdd(args...);
   }
+  template <typename... ARGS>
+  static void VMUL(ARGS... args) {
+    platform::dynload::vdMul(args...);
+  }
+  template <typename... ARGS>
+  static void VEXP(ARGS... args) {
+    platform::dynload::vdExp(args...);
+  }
 };
 #else
...@@ -199,6 +239,10 @@ struct CBlas<platform::float16> {
   static void SMM_GEMM(...) {
     PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
   }
+  static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
+  static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
+  static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
+  static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
 #ifdef PADDLE_WITH_MKLML
   static void GEMM_BATCH(...) {
     PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
...@@ -206,64 +250,6 @@ struct CBlas<platform::float16> {
 #endif
 };
-template <typename T>
-inline bool UseXSMM(const int &m, const int &n, const int &k, bool transa,
-                    bool transb, const T &alpha, const T &beta) {
-#ifdef PADDLE_WITH_LIBXSMM
-  // Refer to https://github.com/hfp/libxsmm/blob/master/README.md
-  // But the threshold is custom
-  constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20;
-  if (m * n * k > LIBXSMM_THRESHOLD || transa || transb ||
-      std::abs<T>(alpha - static_cast<T>(1) >
-                  std::numeric_limits<T>::epsilon()) ||
-      std::abs<T>(beta) > std::numeric_limits<T>::epsilon()) {
-    return false;
-  } else {
-    return true;
-  }
-#endif
-  return false;
-}
-template <>
-inline bool UseXSMM<platform::float16>(const int &m, const int &n, const int &k,
-                                       bool transa, bool transb,
-                                       const platform::float16 &alpha,
-                                       const platform::float16 &beta) {
-  return false;
-}
-template <typename T>
-inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA,
-                      CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha,
-                      const T *A, int lda, const T *B, int ldb, T beta, T *C,
-                      int ldc) {
-#ifdef PADDLE_WITH_LIBXSMM
-  if (UseXSMM<T>(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
-                 beta)) {
-    // Note: SMM use ColMajor
-    const char transa = 'N';
-    const char transb = 'N';
-    CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda,
-                       &beta, C, &ldc);
-    return;
-  }
-#endif
-#ifdef PADDLE_MKL_SPLIT_GEMM
-  constexpr int bs = 2;
-  if (M % bs == 0 && transA == CblasNoTrans && transB == CblasNoTrans) {
-    for (int off = 0; off < M; off += bs) {
-      CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, bs, N, K, alpha,
-                     A + off * lda, lda, B, ldb, beta, C + off * ldb, ldc);
-    }
-    return;
-  }
-#endif
-  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-                 beta, C, ldc);
-}
 #ifdef PADDLE_WITH_MKLML
 template <>
 template <typename T>
...@@ -308,7 +294,7 @@ void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
   int lda = (transA == CblasNoTrans) ? K : M;
   int ldb = (transB == CblasNoTrans) ? N : K;
   int ldc = N;
-  GEMM_WARP<T>(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-               beta, C, ldc);
+  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+                 beta, C, ldc);
 }
...@@ -318,11 +304,22 @@ void Blas<platform::CPUDeviceContext>::GEMM(bool transA, bool transB, int M,
                                             int N, int K, T alpha, const T *A,
                                             int lda, const T *B, int ldb,
                                             T beta, T *C, int ldc) const {
-  GEMM_WARP<T>(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-               transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-               lda, B, ldb, beta, C, ldc);
+  CBlas<T>::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+                 transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+                 lda, B, ldb, beta, C, ldc);
 }
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
+                                            CBLAS_TRANSPOSE transB, int M,
+                                            int N, int K, T alpha, const T *A,
+                                            int lda, const T *B, int ldb,
+                                            T beta, T *C, int ldc) const {
+  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+                 beta, C, ldc);
+}
 template <typename DeviceContext>
 template <typename T>
 void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a, bool trans_a,
...@@ -374,6 +371,61 @@ void Blas<platform::CPUDeviceContext>::VADD(int n, const T *x, const T *y,
 #endif
 }
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::VMUL(int n, const T *x, const T *y,
+                                            T *z) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VMUL(n, x, y, z);
+#else
+  // try to find if openblas support vmul
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] * y[i];
+  }
+#endif
+}
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VEXP(n, x, y);
+#else
+  // try to find if openblas support vexp
+  for (int i = 0; i < n; ++i) {
+    y[i] = std::exp(x[i]);
+  }
+#endif
+}
+template <>
+template <typename T>
+T Blas<platform::CPUDeviceContext>::DOT(int n, const T *x, const T *y) const {
+#ifdef PADDLE_WITH_MKLML
+  return CBlas<T>::DOT(n, x, 1, y, 1);
+#else
+  // try to find if openblas support cblas_dot
+  T sum = 0;
+  for (int i = 0; i < n; ++i) {
+    sum += x[i] * y[i];
+  }
+  return sum;
+#endif
+}
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::SCAL(int n, const T a, T *x) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::SCAL(n, a, x, 1);
+#else
+  // try to find if openblas support cblas_scal
+  for (int i = 0; i < n; ++i) {
+    x[i] = a * x[i];
+  }
+#endif
+}
 template <>
 template <typename T>
 void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,
...@@ -415,6 +467,42 @@ void Blas<platform::CPUDeviceContext>::BatchedGEMM(
 #endif
 }
+template <typename DeviceContext>
+template <typename T>
+void Blas<DeviceContext>::MatMul(const int M, const int N, const int K,
+                                 const T *A, const T *B, T *C) const {
+  this->template GEMM<T>(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K,
+                         static_cast<T>(1), A, K, B, N, static_cast<T>(0), C,
+                         N);
+}
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::MatMul(const int M, const int N,
+                                              const int K, const T *A,
+                                              const T *B, T *C) const {
+#ifdef PADDLE_WITH_LIBXSMM
+  // Refer to https://github.com/hfp/libxsmm/blob/master/README.md
+  // The generic threshold would be LIBXSMM_THRESHOLD = 20 * 20 * 20; but the
+  // matrices here are known to be small, so each call is already very fast
+  // and an if (M * N * K < LIBXSMM_THRESHOLD) check would only add overhead;
+  // use xsmm directly.
+  // Note: SMM use ColMajor
+  const char transa = 'N';
+  const char transb = 'N';
+  const T alpha = static_cast<T>(1);
+  const T beta = static_cast<T>(0);
+  CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta,
+                     C, &N);
+  return;
+#endif
+  CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K,
+                 static_cast<T>(1), A, K, B, N, static_cast<T>(0), C, N);
+}
 template <typename DeviceContext>
 template <typename T>
 void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
......
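One subtlety in the new small-matrix MatMul: libxsmm's SMM_GEMM is column-major, so the row-major product C = A * B is obtained by handing the column-major kernel the swapped operands, computing C^T = B^T * A^T in place (a row-major M x N buffer has the same layout as a column-major N x M one). A standalone check of that identity, with a plain loop standing in for the column-major GEMM:

#include <cassert>

// Column-major GEMM reference: c (m x n) = a (m x k) * b (k x n).
void ColMajorGemm(int m, int n, int k, const float* a, int lda,
                  const float* b, int ldb, float* c, int ldc) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      float s = 0;
      for (int p = 0; p < k; ++p) s += a[p * lda + i] * b[j * ldb + p];
      c[j * ldc + i] = s;
    }
}

int main() {
  const int M = 2, N = 3, K = 2;
  // Row-major A (M x K) and B (K x N).
  const float A[M * K] = {1, 2, 3, 4};
  const float B[K * N] = {5, 6, 7, 8, 9, 10};
  float C[M * N] = {0};
  // Swapped call, exactly like the SMM_GEMM invocation above: treat C's
  // buffer as column-major N x M and compute C^T = B^T * A^T.
  ColMajorGemm(N, M, K, B, N, A, K, C, N);
  // Row-major C is then A * B = [[21, 24, 27], [47, 54, 61]].
  assert(C[0] == 21 && C[1] == 24 && C[2] == 27);
  assert(C[3] == 47 && C[4] == 54 && C[5] == 61);
  return 0;
}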
...@@ -71,7 +71,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input,
-                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const std::vector<const framework::LoDTensor*>& ref_inputs,
                   const int axis, std::vector<framework::Tensor*>* outputs) {
     // TODO(zcd): Add input data validity checking
     size_t num = outputs->size();
......
...@@ -189,7 +189,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
-                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const std::vector<const framework::LoDTensor*>& ref_inputs,
                   const int axis, std::vector<framework::Tensor*>* outputs) {
     // TODO(zcd): Add input data validity checking
     int o_num = outputs->size();
......
...@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 namespace paddle {
 namespace operators {
...@@ -57,7 +57,7 @@ template <typename DeviceContext, typename T>
 class ConcatGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const std::vector<const framework::LoDTensor*>& ref_inputs,
                   const int axis, std::vector<framework::Tensor*>* outputs);
 };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace math {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
template <typename T>
inline T sigmoid(T x) {
  return 1. / (1. + std::exp(-x));
}
template <typename T>
inline T tanh(T x) {
return 2. * sigmoid(2. * x) - 1.;
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_identity(const int n, const T* x, T* y) {
// do nothing
return;
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_sigmoid(const int n, const T* x, T* y) {
const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX;
for (int i = 0; i < n; ++i) {
T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
y[i] = 1.0 / (1.0 + std::exp(-tmp));
}
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_tanh(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = tanh<T>(x[i]);
}
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
inline void vec_relu(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
template <>
inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
float* y) {
// TODO(TJ): complete me
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
template <>
inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
float* y) {
// TODO(TJ): complete me
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
class VecActivations {
public:
std::function<void(const int, const T*, T*)> operator()(
const std::string& type) {
if (type == "sigmoid") {
return vec_sigmoid<T, isa>;
} else if (type == "relu") {
return vec_relu<T, isa>;
} else if (type == "tanh") {
return vec_tanh<T, isa>;
} else if (type == "identity" || type == "") {
return vec_identity<T, isa>;
}
    PADDLE_THROW("Not supported type %s.", type);
}
};
} // namespace math
} // namespace operators
} // namespace paddle
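Note that vec_sigmoid above clips its input to [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX] before exponentiating, so extreme logits hit the saturated sigmoid values instead of feeding huge magnitudes to exp. A standalone check of that clipping behavior (local re-statement of the loop body, not the Paddle header):

#include <cassert>
#include <cmath>

// Same per-element rule as vec_sigmoid: clamp, then apply the logistic
// function; inputs beyond the thresholds all map to the same saturated value.
float ClippedSigmoid(float x) {
  const float kMin = -40.0f;  // SIGMOID_THRESHOLD_MIN
  const float kMax = 13.0f;   // SIGMOID_THRESHOLD_MAX
  float t = x < kMin ? kMin : (x > kMax ? kMax : x);
  return 1.0f / (1.0f + std::exp(-t));
}

int main() {
  assert(ClippedSigmoid(0.0f) == 0.5f);
  assert(ClippedSigmoid(1000.0f) == ClippedSigmoid(13.0f));    // clipped high
  assert(ClippedSigmoid(-1000.0f) == ClippedSigmoid(-40.0f));  // clipped low
  return 0;
}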
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/operators/math/blas.h"
DECLARE_int32(paddle_num_threads);
namespace paddle {
namespace operators {
namespace math {
template <typename DeviceContext, typename T>
inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
const int N, const int K, const T* X, const T* W, T* Y,
const T* B = NULL, bool relu = false) {
blas.MatMul(M, N, K, X, W, Y);
if (B == NULL) {
return;
}
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#endif
for (int i = 0; i < M; i++) {
blas.AXPY(N, static_cast<T>(1), B, Y + i * N);
}
if (!relu) {
return;
}
// TODO(TJ): fuse relu
LOG(FATAL) << "Not implemented!";
}
} // namespace math
} // namespace operators
} // namespace paddle
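FCCompute above is the contract the rewritten fc_op kernel relies on: Y(M x N) = X(M x K) * W(K x N), after which the single 1 x N bias row is added to each of the M output rows (via AXPY in the real code; the relu flag is still unimplemented and aborts). A plain-loop reference of the same contract, useful for checking the Blas-backed version:

#include <cassert>

// Reference FC: Y = X * W, then broadcast-add the bias row B (may be null).
void FCReference(int M, int N, int K, const float* X, const float* W,
                 const float* B, float* Y) {
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j) {
      float s = 0;
      for (int k = 0; k < K; ++k) s += X[i * K + k] * W[k * N + j];
      Y[i * N + j] = s + (B ? B[j] : 0.0f);
    }
}

int main() {
  const float X[2 * 2] = {1, 2, 3, 4};  // M=2, K=2
  const float W[2 * 2] = {1, 0, 0, 1};  // identity, N=2
  const float B[2] = {10, 20};
  float Y[4];
  FCReference(2, 2, 2, X, W, B, Y);
  assert(Y[0] == 11 && Y[1] == 22 && Y[2] == 13 && Y[3] == 24);
  return 0;
}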
...@@ -54,9 +54,9 @@ class MulOp : public framework::OperatorWithKernel {
     auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
     auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
-    PADDLE_ENFORCE_EQ(
-        x_mat_dims[1], y_mat_dims[0],
-        "First matrix's width must be equal with second matrix's height.");
+    PADDLE_ENFORCE_EQ(x_mat_dims[1], y_mat_dims[0],
+                      "First matrix's width must be equal with second matrix's "
+                      "height. %s, %s");
     std::vector<int64_t> output_dims;
     output_dims.reserve(
         static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
......
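The check above compares the flattened 2-D views of X and Y: x_num_col_dims marks where the shape splits into [rows, cols], so X of shape [2, 3, 4] with x_num_col_dims = 1 is viewed as a 2 x 12 matrix. A toy restatement of flatten_to_2d's rule (hypothetical helper, not the framework function):

#include <cassert>
#include <utility>
#include <vector>

// Dims before num_col_dims multiply into the row count, the rest into cols.
std::pair<int, int> FlattenTo2D(const std::vector<int>& dims,
                                int num_col_dims) {
  int rows = 1, cols = 1;
  for (int i = 0; i < num_col_dims; ++i) rows *= dims[i];
  for (size_t i = static_cast<size_t>(num_col_dims); i < dims.size(); ++i)
    cols *= dims[i];
  return {rows, cols};
}

int main() {
  auto x_mat = FlattenTo2D({2, 3, 4}, 1);  // -> 2 x 12
  assert(x_mat.first == 2 && x_mat.second == 12);
  auto y_mat = FlattenTo2D({12, 5}, 1);    // -> 12 x 5
  // mul's enforce: x_mat_dims[1] must equal y_mat_dims[0].
  assert(x_mat.second == y_mat.first);
  return 0;
}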
...@@ -62,23 +62,31 @@ class MulGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
     int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
-    const Tensor* x = ctx.Input<Tensor>("X");
-    const Tensor* y = ctx.Input<Tensor>("Y");
-    const Tensor x_matrix = x->dims().size() > 2
-                                ? framework::ReshapeToMatrix(*x, x_num_col_dims)
-                                : *x;
-    const Tensor y_matrix = y->dims().size() > 2
-                                ? framework::ReshapeToMatrix(*y, y_num_col_dims)
-                                : *y;
-    const Tensor* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* y = ctx.Input<framework::LoDTensor>("Y");
+    auto x_matrix = x->dims().size() > 2
+                        ? framework::ReshapeToMatrix(*x, x_num_col_dims)
+                        : static_cast<const Tensor&>(*x);
+    auto y_matrix = y->dims().size() > 2
+                        ? framework::ReshapeToMatrix(*y, y_num_col_dims)
+                        : static_cast<const Tensor&>(*y);
+    auto* dout = ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
     Tensor dout_mat;
     dout_mat.ShareDataWith(*dout);
     dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0],
                      framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
-    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    auto* dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
+    if (dx != nullptr) {
+      dx->set_lod(x->lod());
+    }
+    if (dy != nullptr) {
+      dy->set_lod(y->lod());
+    }
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     if (dx) {
......
...@@ -57,6 +57,8 @@ class RecvOp : public framework::OperatorBase {
 class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() {
+    AddInput("X", "(Any) Dummy inputs, used for control dependency")
+        .AsDuplicable();
     AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable();
     AddComment(R"DOC(
 Recv operator
......
...@@ -54,7 +54,7 @@ class SamplingIdKernel : public framework::OpKernel<T> {
         static_cast<T>(context.Attr<float>("max")));
     std::vector<T> ids(batch_size);
-    for (size_t i = 0; i < batch_size; ++i) {
+    for (int i = 0; i < batch_size; ++i) {
       T r = dist(engine);
       int idx = width - 1;
       for (int j = 0; j < width; ++j) {
...@@ -63,7 +63,7 @@ class SamplingIdKernel : public framework::OpKernel<T> {
           break;
         }
       }
-      ids[i] = ins_vector[i * width + idx];
+      ids[i] = ins_vector[idx];
     }
     std::vector<int64_t> out_dim;
......
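The sampling loop above is inverse-CDF sampling per row: draw r, scan the columns for the first one whose running value reaches r (the comparison itself is collapsed in this diff), and default to the last column. A standalone sketch under the assumption that each row holds a non-decreasing cumulative distribution ending at 1.0:

#include <cassert>
#include <vector>

// Return the first column whose cumulative probability exceeds r, falling
// back to the last column, as the kernel's idx = width - 1 default does.
int SampleId(const std::vector<float>& cum_row, float r) {
  int idx = static_cast<int>(cum_row.size()) - 1;
  for (size_t j = 0; j < cum_row.size(); ++j) {
    if (r < cum_row[j]) {
      idx = static_cast<int>(j);
      break;
    }
  }
  return idx;
}

int main() {
  std::vector<float> cum = {0.2f, 0.7f, 1.0f};  // probs {0.2, 0.5, 0.3}
  assert(SampleId(cum, 0.1f) == 0);
  assert(SampleId(cum, 0.5f) == 1);
  assert(SampleId(cum, 0.9f) == 2);
  return 0;
}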
...@@ -142,6 +142,8 @@ class SaveOp : public framework::OperatorBase {
     std::string filename = lt_var->data();
     VLOG(4) << "SaveSelectedRows get File name: " << filename;
+    MkDirRecursively(DirName(filename).c_str());
     auto &selectedRows = var->Get<framework::SelectedRows>();
     // get device context from pool
......
...@@ -81,8 +81,8 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "The source input of scatter op");
     AddInput("Ids", "The index input of scatter op where X will be updated");
-    AddInput("Updates", "The updated value of updates op");
-    AddOutput("Out", "The output of add op");
+    AddInput("Updates", "The updated value of scatter op");
+    AddOutput("Out", "The output of scatter op");
     AddComment(R"DOC(
 Scatter Operator.
...@@ -90,7 +90,7 @@ This operator obtains output by updating the input on selected indices on the fi
 $$
 Out = X \\
-Out[Ids] = X[Ids] + Updates
+Out[Ids] = Updates
 $$
 )DOC");
......
...@@ -34,9 +34,9 @@ class ScatterOpKernel : public framework::OpKernel<T> {
     auto *Updates = ctx.Input<Tensor>("Updates");
     auto *Out = ctx.Output<Tensor>("Out");
-    // In place output: Out = X, Out[Ids] += Updates
+    // In place output: Out = X, Out[Ids] = Updates
     framework::TensorCopySync(*X, ctx.GetPlace(), Out);
-    // Apply ScatterUpdate: Out[index] += Updates[:]
+    // Apply ScatterUpdate: Out[index] = Updates[:]
     ScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
   }
 };
...@@ -55,7 +55,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
     // In place gradient: dX = dO
     framework::TensorCopySync(*dOut, ctx.GetPlace(), dX);
     dUpdates->mutable_data<T>(ctx.GetPlace());
-    // Gradient by Gather: dUpdates += dO[Ids]
+    // Gradient by Gather: dUpdates = dO[Ids]
     CPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
   }
 };
......
...@@ -37,23 +37,20 @@ class SendBarrierOp : public framework::OperatorBase { ...@@ -37,23 +37,20 @@ class SendBarrierOp : public framework::OperatorBase {
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override { const platform::Place& place) const override {
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints"); std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
bool sync_mode = Attr<bool>("sync_mode");
distributed::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; VLOG(3) << "SendBarrierOp sync";
// need to wait before sending send_barrier message // need to wait before sending send_barrier message
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
if (sync_mode) {
for (auto& ep : eps) { for (auto& ep : eps) {
VLOG(3) << "send barrier, ep: " << ep; VLOG(3) << "send barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep); rpc_client->AsyncSendBatchBarrier(ep);
} }
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
} }
}
}; };
class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker { class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -70,7 +67,6 @@ the Parameter Server would know all variables have been sent. ...@@ -70,7 +67,6 @@ the Parameter Server would know all variables have been sent.
"(string vector, default 127.0.0.1:6164)" "(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.") "Server endpoints to send variables to.")
.SetDefault({"127.0.0.1:6164"}); .SetDefault({"127.0.0.1:6164"});
AddAttr<bool>("sync_mode", "work in sync_mode or not").SetDefault(true);
} }
}; };
......
...@@ -66,6 +66,8 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -66,6 +66,8 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() { void Make() {
AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
.AsDuplicable(); .AsDuplicable();
AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
.AsDuplicable();
AddComment(R"DOC( AddComment(R"DOC(
Send operator Send operator
......
...@@ -68,7 +68,9 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> { ...@@ -68,7 +68,9 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out")); auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
auto* x = ctx.Input<LoDTensor>("X"); auto* x = ctx.Input<LoDTensor>("X");
auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X")); auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
if (x_grad) {
x_grad->set_lod(x->lod());
}
auto lod = x->lod(); auto lod = x->lod();
const size_t level = lod.size() - 1; const size_t level = lod.size() - 1;
......
...@@ -66,6 +66,9 @@ class SequenceSoftmaxGradKernel : public framework::OpKernel<T> { ...@@ -66,6 +66,9 @@ class SequenceSoftmaxGradKernel : public framework::OpKernel<T> {
auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out")); auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
auto* x = ctx.Input<LoDTensor>("X"); auto* x = ctx.Input<LoDTensor>("X");
auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X")); auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
if (x_grad) {
x_grad->set_lod(x->lod());
}
auto lod = x->lod(); auto lod = x->lod();
const size_t level = lod.size() - 1; const size_t level = lod.size() - 1;
......
...@@ -23,9 +23,9 @@ class SqueezeOpInferShape : public framework::InferShapeBase { ...@@ -23,9 +23,9 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
public: public:
void operator()(framework::InferShapeContext *ctx) const override { void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SqueezeOp should not be null."); "Input(X) of Squeeze operator should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SqueezeOp should not be null."); "Output(Out) of Squeeze operator should not be null.");
const auto &x_dims = ctx->GetInputDim("X"); const auto &x_dims = ctx->GetInputDim("X");
// Check input tensor dims (<6) Eigen limit. // Check input tensor dims (<6) Eigen limit.
...@@ -107,7 +107,6 @@ class SqueezeOp : public framework::OperatorBase { ...@@ -107,7 +107,6 @@ class SqueezeOp : public framework::OperatorBase {
framework::AttributeMap attrs; framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(out_dims); attrs["shape"] = framework::vectorize2int(out_dims);
attrs["inplace"] = Attr<bool>("inplace");
// Invoke Reshape Op // Invoke Reshape Op
auto reshape_op = framework::OpRegistry::CreateOp( auto reshape_op = framework::OpRegistry::CreateOp(
"reshape", {{"X", {Input("X")}}, {"Shape", {}}}, "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
...@@ -125,12 +124,6 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -125,12 +124,6 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
"(std::vector<int>). List of integers," "(std::vector<int>). List of integers,"
" indicating the dimensions to squeeze.") " indicating the dimensions to squeeze.")
.SetDefault({}); .SetDefault({});
AddAttr<bool>("inplace",
"(default: false) Squeeze the source tensor's shape without "
"memory copy. When Attr(inplace) is set true, the output "
"tensor shares memory with Input(X), otherwise, a new output "
"tensor is created, and its data are copied from Input(x).")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Squeeze Operator. Squeeze Operator.
...@@ -180,7 +173,6 @@ class SqueezeGradOp : public framework::OperatorBase { ...@@ -180,7 +173,6 @@ class SqueezeGradOp : public framework::OperatorBase {
auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims(); auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
framework::AttributeMap attrs; framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims); attrs["shape"] = framework::vectorize2int(x_dims);
attrs["inplace"] = Attr<bool>("inplace");
auto reshape_op = framework::OpRegistry::CreateOp( auto reshape_op = framework::OpRegistry::CreateOp(
"reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}}, "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/stack_op.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;
REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker,
ops::StackGradOpDescMaker);
REGISTER_OPERATOR(stack_grad, ops::StackOpGrad);
REGISTER_OP_CPU_KERNEL(stack, ops::StackKernel<plat::CPUDeviceContext, float>,
ops::StackKernel<plat::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(stack_grad,
ops::StackGradKernel<plat::CPUDeviceContext, float>,
ops::StackGradKernel<plat::CPUDeviceContext, double>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/stack_op.h"
namespace plat = paddle::platform;
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel<plat::CUDADeviceContext, float>,
ops::StackKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(stack_grad,
ops::StackGradKernel<plat::CUDADeviceContext, float>,
ops::StackGradKernel<plat::CUDADeviceContext, double>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
#ifdef __NVCC__
#include <thrust/device_vector.h>
#include "paddle/fluid/framework/array.h"
#endif
namespace paddle {
namespace operators {
class StackOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_GT(ctx->Inputs("X").size(), 0,
"Number of Inputs(X) must be larger than 0");
PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist.");
auto input_dims = ctx->GetInputsDim("X");
for (size_t i = 1; i < input_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0],
"Dims of all Inputs(X) must be the same");
}
// Only lod of X[0] would be shared with Y
ctx->ShareLoD("X", /*->*/ "Y");
int axis = ctx->Attrs().Get<int>("axis");
int rank = input_dims[0].size();
PADDLE_ENFORCE(
axis >= -(rank + 1) && axis < rank + 1,
"Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank);
if (axis < 0) axis += (rank + 1);
auto vec = framework::vectorize2int(input_dims[0]);
vec.insert(vec.begin() + axis, input_dims.size());
ctx->SetOutputDim("Y", framework::make_ddim(vec));
}
};
class StackOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The input of stack op.").AsDuplicable();
AddOutput("Y", "The output of stack op.");
AddAttr<int>("axis",
"The axis along which all of the Inputs(X) should be stacked.")
.SetDefault(0);
AddComment(R"DOC(
Stack Operator.
Stack all of the Inputs(X) into one tensor along Attr(axis). The dims of all Inputs(X) must be the same.
)DOC");
}
};
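// Shape example (hypothetical sizes, for illustration only): stacking n = 3
// inputs of shape [2, 4] along axis = 1 inserts a new dimension of size 3 at
// position 1, so Y has shape [2, 3, 4]; axis = -1 is first normalized by
// axis += rank + 1 (here to 2), giving Y of shape [2, 4, 3].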
template <typename VecXType, typename T>
struct StackFunctor {
HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post)
: x_(x), y_(y), n_(n), post_(post) {}
HOSTDEVICE void operator()(int idx) {
int i = idx / (n_ * post_);
int which_x = idx / post_ - i * n_;
int x_index = i * post_ + idx % post_;
y_[idx] = x_[which_x][x_index];
}
private:
VecXType x_;
T *y_;
int n_;
int post_;
};
template <typename VecDxType, typename T>
struct StackGradFunctor {
HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post)
: dx_(dx), dy_(dy), n_(n), post_(post) {}
HOSTDEVICE void operator()(int idx) {
int i = idx / (n_ * post_);
int which_x = idx / post_ - i * n_;
int x_index = i * post_ + idx % post_;
dx_[which_x][x_index] = dy_[idx];
}
private:
VecDxType dx_;
const T *dy_;
int n_;
int post_;
};
template <typename DeviceContext, typename VecXType, typename T>
static inline void StackFunctorForRange(const DeviceContext &ctx,
const VecXType &x, T *y, int total_num,
int n, int post) {
platform::ForRange<DeviceContext> for_range(ctx, total_num);
for_range(StackFunctor<VecXType, T>(x, y, n, post));
}
template <typename DeviceContext, typename VecDxType, typename T>
static inline void StackGradFunctorForRange(const DeviceContext &ctx,
const VecDxType &dx, const T *dy,
int total_num, int n, int post) {
platform::ForRange<DeviceContext> for_range(ctx, total_num);
for_range(StackGradFunctor<VecDxType, T>(dx, dy, n, post));
}
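// Worked index example for the two functors above (hypothetical sizes):
// with pre = 2, n = 3, post = 4 the stacked tensor holds pre * n * post = 24
// elements, and for a flat output index idx:
//   i       = idx / (n * post)       -> which "pre" slice idx falls in
//   which_x = idx / post - i * n     -> which of the n inputs to address
//   x_index = i * post + idx % post  -> flat offset inside that input,
// each input being a contiguous block of pre * post elements.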
template <typename DeviceContext, typename T>
class StackKernel : public framework::OpKernel<T> {
using Tensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto x = ctx.MultiInput<Tensor>("X");
auto *y = ctx.Output<Tensor>("Y");
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size());
auto *y_data = y->mutable_data<T>(ctx.GetPlace());
std::vector<const T *> x_datas(n);
for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
int pre = 1, post = 1;
auto &dim = x[0]->dims();
for (auto i = 0; i < axis; ++i) pre *= dim[i];
for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
int total_num = pre * n * post;
auto &dev_ctx = ctx.template device_context<DeviceContext>();
constexpr auto kMaxThreshold = 16;
if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value ||
n > kMaxThreshold) {
#ifdef __NVCC__
VLOG(10) << "Stack more than " << kMaxThreshold
<< " tensors on GPU may be slow.";
thrust::device_vector<const T *> device_x_vec(x_datas);
auto x_data_arr = device_x_vec.data().get();
#else
auto x_data_arr = x_datas.data();
#endif
StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
#ifdef __NVCC__
// Wait() must be called because device_x_vec may be destructed before
// kernel ends
dev_ctx.Wait();
#endif
}
#ifdef __NVCC__
else { // NOLINT
framework::Array<const T *, kMaxThreshold> x_data_arr;
for (int i = 0; i < n; ++i) x_data_arr[i] = x_datas[i];
StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post);
}
#endif
}
};
class StackOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@Grad) must exist.");
int axis = ctx->Attrs().Get<int>("axis");
auto dy_dim = ctx->GetInputDim(framework::GradVarName("Y"));
int rank = dy_dim.size();
PADDLE_ENFORCE(axis >= -rank && axis < rank,
"Attr(axis) must be inside [-rank, rank), where rank = %d",
rank);
if (axis < 0) axis += rank;
PADDLE_ENFORCE_EQ(ctx->Outputs(framework::GradVarName("X")).size(),
static_cast<size_t>(dy_dim[axis]),
"Number of Outputs(X@Grad) is wrong");
auto vec = framework::vectorize2int(dy_dim);
vec.erase(vec.begin() + axis);
ctx->SetOutputsDim(
framework::GradVarName("X"),
std::vector<framework::DDim>(dy_dim[axis], framework::make_ddim(vec)));
}
};
class StackGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("stack_grad");
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
op->SetAttrMap(Attrs());
return op;
}
};
template <typename DeviceContext, typename T>
class StackGradKernel : public framework::OpKernel<T> {
using Tensor = framework::LoDTensor;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto dx = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += dy->dims().size();
int n = dy->dims()[axis];
std::vector<T *> dx_datas(n); // NOLINT
for (int i = 0; i < n; i++) {
dx_datas[i] = dx[i]->mutable_data<T>(ctx.GetPlace());
}
auto dy_data = dy->data<T>();
int pre = 1;
for (int i = 0; i < axis; ++i) pre *= dy->dims()[i];
int total_num = dy->numel();
int post = total_num / (n * pre);
auto &dev_ctx = ctx.template device_context<DeviceContext>();
constexpr auto kMaxThreshold = 16;
if (std::is_same<DeviceContext, platform::CPUDeviceContext>::value ||
n > kMaxThreshold) {
#ifdef __NVCC__
VLOG(10) << "Stack more than " << kMaxThreshold
<< " tensors on GPU may be slow.";
thrust::device_vector<T *> device_dx_vec(dx_datas);
auto dx_data_arr = device_dx_vec.data().get();
#else
auto dx_data_arr = dx_datas.data();
#endif
StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n,
post);
#ifdef __NVCC__
// Wait() must be called because device_dx_vec may be destructed before
// kernel ends
dev_ctx.Wait();
#endif
}
#ifdef __NVCC__
else { // NOLINT
framework::Array<T *, kMaxThreshold> dx_data_arr;
for (int i = 0; i < n; ++i) dx_data_arr[i] = dx_datas[i];
StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n,
post);
}
#endif
}
};
} // namespace operators
} // namespace paddle
...@@ -34,15 +34,15 @@ ...@@ -34,15 +34,15 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::CPUDeviceContext;
using framework::DataLayout; using framework::DataLayout;
using mkldnn::memory; using mkldnn::memory;
using mkldnn::primitive; using mkldnn::primitive;
using mkldnn::reorder;
using mkldnn::stream; using mkldnn::stream;
using mkldnn::sum; using mkldnn::sum;
using mkldnn::reorder; using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::MKLDNNDeviceContext;
using platform::to_void_cast; using platform::to_void_cast;
template <typename T> template <typename T>
...@@ -175,18 +175,35 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -175,18 +175,35 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto& sel_row = get_selected_row(i); auto& sel_row = get_selected_row(i);
first_dim += sel_row.rows().size(); first_dim += sel_row.rows().size();
} }
auto in_dim =
framework::vectorize(get_selected_row(N - 1).value().dims()); std::vector<int64_t> in_dim;
for (int i = 0; i < N; i++) {
auto& sel_row = get_selected_row(i);
if (sel_row.rows().size() > 0) {
in_dim = framework::vectorize(sel_row.value().dims());
break;
}
}
if (in_dim.empty()) {
VLOG(3) << "WARNING: all the inputs are empty";
in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
} else {
in_dim[0] = static_cast<int64_t>(first_dim);
}
in_dim[0] = static_cast<int64_t>(first_dim); in_dim[0] = static_cast<int64_t>(first_dim);
out_value->Resize(framework::make_ddim(in_dim)); out_value->Resize(framework::make_ddim(in_dim));
out_value->mutable_data<T>(ctx.GetPlace());
// if all the input sparse vars are empty, no need to // if all the input sparse vars are empty, no need to
// merge these vars. // merge these vars.
if (first_dim == 0UL) { if (first_dim == 0UL) {
return; return;
} }
out_value->mutable_data<T>(ctx.GetPlace());
math::SelectedRowsAddTo<CPUDeviceContext, T> functor; math::SelectedRowsAddTo<CPUDeviceContext, T> functor;
int64_t offset = 0; int64_t offset = 0;
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
......
...@@ -105,18 +105,30 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -105,18 +105,30 @@ class SumKernel : public framework::OpKernel<T> {
auto &sel_row = get_selected_row(i); auto &sel_row = get_selected_row(i);
first_dim += sel_row.rows().size(); first_dim += sel_row.rows().size();
} }
auto in_dim =
framework::vectorize(get_selected_row(N - 1).value().dims()); std::vector<int64_t> in_dim;
for (int i = 0; i < N; i++) {
auto &sel_row = get_selected_row(i);
if (sel_row.rows().size() > 0) {
in_dim = framework::vectorize(sel_row.value().dims());
break;
}
}
if (in_dim.empty()) {
VLOG(3) << "WARNING: all the inputs are empty";
in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
} else {
in_dim[0] = static_cast<int64_t>(first_dim); in_dim[0] = static_cast<int64_t>(first_dim);
}
out_value->Resize(framework::make_ddim(in_dim)); out_value->Resize(framework::make_ddim(in_dim));
out_value->mutable_data<T>(context.GetPlace());
// if all the input sparse vars are empty, no need to // if all the input sparse vars are empty, no need to
// merge these vars. // merge these vars.
if (first_dim == 0UL) { if (first_dim == 0UL) {
return; return;
} }
out_value->mutable_data<T>(context.GetPlace());
math::SelectedRowsAddTo<DeviceContext, T> functor; math::SelectedRowsAddTo<DeviceContext, T> functor;
......
...@@ -17,112 +17,16 @@ ...@@ -17,112 +17,16 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/operators/tensorrt_engine_op.h" #include "paddle/fluid/operators/tensorrt_engine_op.h"
namespace paddle { namespace paddle {
DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size");
DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size");
namespace operators { namespace operators {
using inference::Singleton;
using inference::tensorrt::TRT_EngineManager;
using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;
namespace {
TRT_DT FluidDataType2TRT(FluidDT type) {
switch (type) {
case FluidDT::VarType_Type_FP32:
return TRT_DT::kFLOAT;
case FluidDT::VarType_Type_INT32:
return TRT_DT::kINT32;
default:
return TRT_DT::kINT32;
}
PADDLE_THROW("unkown type");
return TRT_DT::kINT32;
}
nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
PADDLE_ENFORCE_GT(shape.size(), 1UL,
"TensorRT' tensor input requires at least 2 dimensions");
PADDLE_ENFORCE_LE(shape.size(), 4UL,
"TensorRT' tensor input requires at most 4 dimensions");
PADDLE_ENFORCE_EQ(shape.size(), 4UL);
return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
}
} // namespace
template <typename DeviceContext, typename T>
void TensorRTEngineKernel<DeviceContext, T>::Prepare(
const framework::ExecutionContext &context) const {
VLOG(4) << "Prepare engine";
// Get the ProgramDesc and pass to convert.
framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
int max_batch = context.Attr<int>("max_batch");
auto max_workspace = context.Attr<int>("max_workspace");
auto params = context.Attr<std::vector<std::string>>("parameters");
std::unordered_set<std::string> parameters;
for (const auto &param : params) {
parameters.insert(param);
}
std::vector<std::string> output_maps =
context.Attr<std::vector<std::string>>("output_name_mapping");
// TODO(Superjomn) replace this with a different stream
auto *engine = Singleton<TRT_EngineManager>::Global().Create(
max_batch, max_workspace, nullptr /*engine hold its own stream*/,
context.Attr<std::string>("engine_uniq_key"));
engine->InitNetwork();
framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
VLOG(4) << "parsed var size " << block.AllVars().size();
// Add inputs
VLOG(4) << "declare inputs";
for (auto &input : context.Inputs("Xs")) {
if (parameters.count(input)) continue;
VLOG(4) << "declare input " << input;
auto *var = block.FindVar(input);
// TensorRT engine needs to create parameters. The parameter's description
// should be set in
PADDLE_ENFORCE(var, "no variable called %s", input);
PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
"TensorRT engine only takes LoDTensor as input");
auto shape = var->GetShape();
// For the special batch_size placeholder -1, drop it and pass the real
// shape of data.
// TODO(Superjomn) fix this with batch broadcast, or it can't handle
// a variable batch size.
if (shape[0] == -1) {
shape[0] = FLAGS_tensorrt_engine_batch_size;
}
engine->DeclareInput(
input, FluidDataType2TRT(
var->Proto()->type().lod_tensor().tensor().data_type()),
Vec2TRT_Dims(shape));
}
inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
block_desc, parameters, context.scope(), engine);
// Add outputs
for (auto &output : output_maps) {
engine->DeclareOutput(output);
}
engine->FreezeNetwork();
}
class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -130,8 +34,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -130,8 +34,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Ys", "A list of outputs").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable();
AddAttr<std::string>("subgraph", "the subgraph."); AddAttr<std::string>("subgraph", "the subgraph.");
AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine."); AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
AddAttr<int>("max_batch", "the maximum batch size.");
AddAttr<int>("max_workspace", "the maximum batch size.");
AddComment("TensorRT engine operator."); AddComment("TensorRT engine operator.");
} }
}; };
...@@ -150,11 +52,4 @@ namespace ops = paddle::operators; ...@@ -150,11 +52,4 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker); ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);
REGISTER_OP_CPU_KERNEL(
tensorrt_engine,
ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, float>,
ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, double>,
ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/tensorrt_engine_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
tensorrt_engine,
ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, float>,
ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, double>,
ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int>,
ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int64_t>);
...@@ -19,16 +19,51 @@ ...@@ -19,16 +19,51 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/engine.h"
namespace paddle { namespace paddle {
DECLARE_int32(tensorrt_engine_batch_size); DECLARE_int32(tensorrt_engine_batch_size);
DECLARE_int32(tensorrt_max_batch_size);
DECLARE_int32(tensorrt_workspace_size);
namespace operators { namespace operators {
using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;
namespace {
TRT_DT FluidDataType2TRT(FluidDT type) {
switch (type) {
case FluidDT::VarType_Type_FP32:
return TRT_DT::kFLOAT;
case FluidDT::VarType_Type_INT32:
return TRT_DT::kINT32;
default:
return TRT_DT::kINT32;
}
PADDLE_THROW("unkown type");
return TRT_DT::kINT32;
}
nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
PADDLE_ENFORCE_GT(shape.size(), 1UL,
"TensorRT' tensor input requires at least 2 dimensions");
PADDLE_ENFORCE_LE(shape.size(), 4UL,
"TensorRT' tensor input requires at most 4 dimensions");
PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
if (shape.size() == 4UL)
return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
return nvinfer1::DimsCHW(shape[1], 1, 1);
}
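// Mapping example (hypothetical shapes): [8, 3, 224, 224] becomes
// DimsCHW(3, 224, 224) and [8, 100] becomes DimsCHW(100, 1, 1); the leading
// batch dimension is dropped because TensorRT tracks the batch size separately.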
} // namespace
using inference::Singleton; using inference::Singleton;
using inference::tensorrt::TRT_EngineManager; using inference::tensorrt::TRT_EngineManager;
...@@ -47,7 +82,7 @@ class TensorRTEngineOp : public framework::OperatorWithKernel { ...@@ -47,7 +82,7 @@ class TensorRTEngineOp : public framework::OperatorWithKernel {
.FindVar(input0) .FindVar(input0)
->GetMutable<framework::LoDTensor>() ->GetMutable<framework::LoDTensor>()
->type()), ->type()),
platform::CPUPlace()); ctx.GetPlace());
return kt; return kt;
} }
}; };
...@@ -64,7 +99,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -64,7 +99,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
auto input_names = context.op().Inputs("Xs"); auto input_names = context.op().Inputs("Xs");
PADDLE_ENFORCE(!input_names.empty(), "should pass at least one input"); PADDLE_ENFORCE(!input_names.empty(), "should pass at least one input");
PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
context.Attr<int>("max_batch")); FLAGS_tensorrt_max_batch_size);
std::vector<std::string> output_maps = std::vector<std::string> output_maps =
context.Attr<std::vector<std::string>>("output_name_mapping"); context.Attr<std::vector<std::string>>("output_name_mapping");
...@@ -94,12 +129,19 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -94,12 +129,19 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// Convert output tensor from engine to fluid // Convert output tensor from engine to fluid
int output_index = 0; int output_index = 0;
VLOG(4) << "TensorRT Engine Op Outputs:";
for (const auto& y : context.Outputs("Ys")) { for (const auto& y : context.Outputs("Ys")) {
VLOG(4) << y;
// convert output and copy to fluid. // convert output and copy to fluid.
nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]); nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
auto dims = trt_t->getDimensions(); auto dims = trt_t->getDimensions();
// Use the output ITensor's dims to reshape the Fluid Tensor. // Use the output ITensor's dims to reshape the Fluid Tensor.
std::vector<int> ddim(dims.d, dims.d + dims.nbDims); // The ITensor doesn't contain the batch size dim.
std::vector<int> ddim;
ddim.push_back(FLAGS_tensorrt_engine_batch_size);
for (int i = 0; i < dims.nbDims; i++) {
ddim.push_back(dims.d[i]);
}
auto* fluid_v = context.scope().FindVar(y); auto* fluid_v = context.scope().FindVar(y);
PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
...@@ -113,8 +155,10 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -113,8 +155,10 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// TODO(Superjomn) change this float to dtype size. // TODO(Superjomn) change this float to dtype size.
auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) * auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
FLAGS_tensorrt_engine_batch_size; FLAGS_tensorrt_engine_batch_size;
engine->GetOutputInCPU(output_maps[output_index], engine->GetOutputInGPU(
fluid_t->mutable_data<float>(platform::CPUPlace()), output_maps[output_index],
fluid_t->mutable_data<float>(platform::CUDAPlace(
boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
size * sizeof(float)); size * sizeof(float));
//} else { //} else {
// engine->GetOutputInGPU( // engine->GetOutputInGPU(
...@@ -128,8 +172,67 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -128,8 +172,67 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
} }
protected: protected:
// Build the engine. void Prepare(const framework::ExecutionContext& context) const {
void Prepare(const framework::ExecutionContext& context) const; VLOG(4) << "Prepare engine";
// Get the ProgramDesc and pass to convert.
framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
int max_batch = FLAGS_tensorrt_max_batch_size;
auto max_workspace = FLAGS_tensorrt_workspace_size;
auto params = context.Attr<std::vector<std::string>>("parameters");
std::unordered_set<std::string> parameters;
for (const auto& param : params) {
parameters.insert(param);
}
std::vector<std::string> output_maps =
context.Attr<std::vector<std::string>>("output_name_mapping");
// TODO(Superjomn) replace this with a different stream
auto* engine = Singleton<TRT_EngineManager>::Global().Create(
max_batch, max_workspace, nullptr /*engine hold its own stream*/,
context.Attr<std::string>("engine_uniq_key"),
boost::get<platform::CUDAPlace>(context.GetPlace()).device);
engine->InitNetwork();
framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
VLOG(4) << "parsed var size " << block.AllVars().size();
// Add inputs
VLOG(4) << "declare inputs";
for (auto& input : context.Inputs("Xs")) {
if (parameters.count(input)) continue;
VLOG(4) << "declare input " << input;
auto* var = block.FindVar(input);
// TensorRT engine needs to create parameters. The parameter's description
// should be set in
PADDLE_ENFORCE(var, "no variable called %s", input);
PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
"TensorRT engine only takes LoDTensor as input");
auto shape = var->GetShape();
// For the special batch_size placeholder -1, drop it and pass the real
// shape of data.
// TODO(Superjomn) fix this with batch broadcast, or it can't handle
// a variable batch size.
if (shape[0] == -1) {
shape[0] = FLAGS_tensorrt_engine_batch_size;
}
engine->DeclareInput(
input, FluidDataType2TRT(
var->Proto()->type().lod_tensor().tensor().data_type()),
Vec2TRT_Dims(shape));
}
inference::Singleton<inference::tensorrt::OpConverter>::Global()
.ConvertBlock(block_desc, parameters, context.scope(), engine);
// Add outputs
for (auto& output : output_maps) {
engine->DeclareOutput(output);
}
engine->FreezeNetwork();
}
}; };
} // namespace operators } // namespace operators
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/tensorrt_engine_op.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
...@@ -23,20 +24,20 @@ limitations under the License. */ ...@@ -23,20 +24,20 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
USE_CPU_ONLY_OP(tensorrt_engine); USE_CUDA_ONLY_OP(tensorrt_engine);
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace { namespace {
void CreateCPUTensor(framework::Scope* scope, const std::string& name, void CreateCUDATensor(framework::Scope* scope, const std::string& name,
const std::vector<int64_t>& shape) { const std::vector<int64_t>& shape) {
auto* var = scope->Var(name); auto* var = scope->Var(name);
auto* tensor = var->GetMutable<framework::LoDTensor>(); auto* tensor = var->GetMutable<framework::LoDTensor>();
auto dims = framework::make_ddim(shape); auto dims = framework::make_ddim(shape);
tensor->Resize(dims); tensor->Resize(dims);
platform::CPUPlace place; platform::CUDAPlace place;
platform::CPUDeviceContext ctx(place); platform::CUDADeviceContext ctx(place);
inference::tensorrt::RandomizeTensor(tensor, place, ctx); inference::tensorrt::RandomizeTensor(tensor, place, ctx);
} }
...@@ -57,6 +58,8 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block, ...@@ -57,6 +58,8 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
using inference::analysis::SetAttr; using inference::analysis::SetAttr;
TEST(TensorRTEngineOp, manual) { TEST(TensorRTEngineOp, manual) {
FLAGS_tensorrt_engine_batch_size = 2;
FLAGS_tensorrt_max_batch_size = 2;
framework::ProgramDesc program; framework::ProgramDesc program;
auto* block_ = program.Proto()->add_blocks(); auto* block_ = program.Proto()->add_blocks();
block_->set_idx(0); block_->set_idx(0);
...@@ -98,8 +101,6 @@ TEST(TensorRTEngineOp, manual) { ...@@ -98,8 +101,6 @@ TEST(TensorRTEngineOp, manual) {
engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"})); engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
SetAttr<std::string>(engine_op_desc.Proto(), "subgraph", SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
block_->SerializeAsString()); block_->SerializeAsString());
SetAttr<int>(engine_op_desc.Proto(), "max_batch", 100);
SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 1 << 10);
SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters", SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
std::vector<std::string>({})); std::vector<std::string>({}));
...@@ -112,15 +113,15 @@ TEST(TensorRTEngineOp, manual) { ...@@ -112,15 +113,15 @@ TEST(TensorRTEngineOp, manual) {
LOG(INFO) << "engine_op " << engine_op.get(); LOG(INFO) << "engine_op " << engine_op.get();
framework::Scope scope; framework::Scope scope;
platform::CPUPlace place; platform::CUDAPlace place;
platform::CPUDeviceContext ctx(place); platform::CUDADeviceContext ctx(place);
// Prepare variables. // Prepare variables.
CreateCPUTensor(&scope, "x", std::vector<int64_t>({2, 4})); CreateCUDATensor(&scope, "x", std::vector<int64_t>({2, 4}));
CreateCPUTensor(&scope, "y", std::vector<int64_t>({4, 6})); CreateCUDATensor(&scope, "y", std::vector<int64_t>({4, 6}));
CreateCPUTensor(&scope, "z", std::vector<int64_t>({2, 6})); CreateCUDATensor(&scope, "z", std::vector<int64_t>({2, 6}));
CreateCPUTensor(&scope, "y0", std::vector<int64_t>({6, 8})); CreateCUDATensor(&scope, "y0", std::vector<int64_t>({6, 8}));
CreateCPUTensor(&scope, "z0", std::vector<int64_t>({2, 8})); CreateCUDATensor(&scope, "z0", std::vector<int64_t>({2, 8}));
// Execute them. // Execute them.
LOG(INFO) << "engine_op run"; LOG(INFO) << "engine_op run";
...@@ -128,10 +129,12 @@ TEST(TensorRTEngineOp, manual) { ...@@ -128,10 +129,12 @@ TEST(TensorRTEngineOp, manual) {
} }
void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
FLAGS_tensorrt_engine_batch_size = batch_size;
FLAGS_tensorrt_max_batch_size = batch_size;
framework::ProgramDesc program; framework::ProgramDesc program;
framework::Scope scope; framework::Scope scope;
platform::CPUPlace place; platform::CUDAPlace place;
platform::CPUDeviceContext ctx(place); platform::CUDADeviceContext ctx(place);
auto* block_ = program.Proto()->add_blocks(); auto* block_ = program.Proto()->add_blocks();
block_->set_idx(0); block_->set_idx(0);
...@@ -165,10 +168,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -165,10 +168,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
// Prepare variables. // Prepare variables.
if (!x_created) { if (!x_created) {
CreateCPUTensor(&scope, x_name, std::vector<int64_t>(x_shape)); CreateCUDATensor(&scope, x_name, std::vector<int64_t>(x_shape));
} }
CreateCPUTensor(&scope, y_name, std::vector<int64_t>(y_shape)); CreateCUDATensor(&scope, y_name, std::vector<int64_t>(y_shape));
CreateCPUTensor(&scope, z_name, std::vector<int64_t>(z_shape)); CreateCUDATensor(&scope, z_name, std::vector<int64_t>(z_shape));
// It is weird; we need to copy manually. // It is weird; we need to copy manually.
*block_->add_ops() = *fc->Proto(); *block_->add_ops() = *fc->Proto();
......
...@@ -23,9 +23,9 @@ class UnsqueezeOpInferShape : public framework::InferShapeBase { ...@@ -23,9 +23,9 @@ class UnsqueezeOpInferShape : public framework::InferShapeBase {
public: public:
void operator()(framework::InferShapeContext *ctx) const override { void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of UnsqueezeOp should not be null."); "Input(X) of Unsqueeze operator should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of UnsqueezeOp should not be null."); "Output(Out) of Unsqueeze operator should not be null.");
const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes"); const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
const auto &x_dims = ctx->GetInputDim("X"); const auto &x_dims = ctx->GetInputDim("X");
...@@ -95,7 +95,6 @@ class UnsqueezeOp : public framework::OperatorBase { ...@@ -95,7 +95,6 @@ class UnsqueezeOp : public framework::OperatorBase {
framework::AttributeMap attrs; framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(out_dims); attrs["shape"] = framework::vectorize2int(out_dims);
attrs["inplace"] = Attr<bool>("inplace");
// Invoke Reshape op. // Invoke Reshape op.
auto reshape_op = framework::OpRegistry::CreateOp( auto reshape_op = framework::OpRegistry::CreateOp(
"reshape", {{"X", {Input("X")}}, {"Shape", {}}}, "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
...@@ -126,13 +125,6 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -126,13 +125,6 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
" within [1, 6] dimensions (Eigen limit)."); " within [1, 6] dimensions (Eigen limit).");
} }
}); });
AddAttr<bool>(
"inplace",
"(default: false) Unsqueeze the source tensor's shape without "
"memory copy. When Attr(inplace) is set true, the output "
"tensor shares memory with Input(X), otherwise, a new output "
"tensor is created, and its data are copied from Input(x).")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Unsqueeze Operator. Unsqueeze Operator.
...@@ -168,7 +160,6 @@ class UnsqueezeGradOp : public framework::OperatorBase { ...@@ -168,7 +160,6 @@ class UnsqueezeGradOp : public framework::OperatorBase {
framework::AttributeMap attrs; framework::AttributeMap attrs;
attrs["shape"] = framework::vectorize2int(x_dims); attrs["shape"] = framework::vectorize2int(x_dims);
attrs["inplace"] = Attr<bool>("inplace");
auto reshape_op = framework::OpRegistry::CreateOp( auto reshape_op = framework::OpRegistry::CreateOp(
"reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}}, "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
......
...@@ -57,12 +57,16 @@ class WhileOp : public framework::OperatorBase { ...@@ -57,12 +57,16 @@ class WhileOp : public framework::OperatorBase {
PADDLE_ENFORCE(platform::is_cpu_place(cond.place()), PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
"Condition of while op must in CPU memory."); "Condition of while op must in CPU memory.");
bool is_test = Attr<bool>("is_test");
auto ctx = executor.Prepare(*program, block->ID());
while (cond.data<bool>()[0]) { while (cond.data<bool>()[0]) {
auto &current_scope = scope.NewScope(); auto &current_scope = scope.NewScope();
step_scopes->push_back(&current_scope); step_scopes->push_back(&current_scope);
executor.RunPreparedContext(ctx.get(), &current_scope, false);
executor.Run(*program, &current_scope, block->ID(), if (is_test) {
false /*create_local_scope*/); scope.DeleteScope(&current_scope);
}
} }
} }
}; };
...@@ -88,6 +92,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -88,6 +92,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
"variables generated in the i'th step."); "variables generated in the i'th step.");
AddAttr<framework::BlockDesc *>(kStepBlock, AddAttr<framework::BlockDesc *>(kStepBlock,
"The step block inside WhileOp"); "The step block inside WhileOp");
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
)DOC"); )DOC");
} }
...@@ -103,12 +108,15 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -103,12 +108,15 @@ class WhileGradOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope &scope, void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override { const platform::Place &dev_place) const override {
PADDLE_ENFORCE(!Attr<bool>("is_test"),
"GradOp is only callable when is_test is false");
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place); auto &dev_ctx = *pool.Get(dev_place);
framework::Executor executor(dev_place); framework::Executor executor(dev_place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
auto ctx = executor.Prepare(*program, block->ID());
auto *step_scopes = auto *step_scopes =
scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>(); scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>();
...@@ -161,8 +169,7 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -161,8 +169,7 @@ class WhileGradOp : public framework::OperatorBase {
} }
} }
} }
executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false);
executor.Run(*program, *cur_scope_iter, block->ID(), false);
auto &pg_names = Outputs(kXGRAD); auto &pg_names = Outputs(kXGRAD);
auto &p_names = Inputs(kX); auto &p_names = Inputs(kX);
......
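Both hunks above adopt the prepare-once, run-many executor pattern; a minimal sketch of that usage in isolation (Cond() and step_scope are hypothetical stand-ins, simplified from the surrounding code):

    framework::Executor executor(place);
    // Parse and prepare the block exactly once, outside the loop.
    auto ctx = executor.Prepare(*program, block->ID());
    while (Cond()) {
      // Reuse the prepared context each iteration instead of calling
      // Executor::Run(), which would re-prepare the program every step.
      executor.RunPreparedContext(ctx.get(), &step_scope, false /*create_local_scope*/);
    }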
...@@ -115,15 +115,16 @@ size_t CUDAPinnedMaxChunkSize() { ...@@ -115,15 +115,16 @@ size_t CUDAPinnedMaxChunkSize() {
return CUDAPinnedMaxAllocSize() / 256; return CUDAPinnedMaxAllocSize() / 256;
} }
#ifdef PADDLE_WITH_XBYAK
namespace jit { namespace jit {
#ifdef PADDLE_WITH_XBYAK
static Xbyak::util::Cpu cpu; static Xbyak::util::Cpu cpu;
bool MayIUse(const cpu_isa_t cpu_isa) { bool MayIUse(const cpu_isa_t cpu_isa) {
using namespace Xbyak::util; // NOLINT using namespace Xbyak::util; // NOLINT
switch (cpu_isa) { switch (cpu_isa) {
case sse42: case sse42:
return cpu.has(Cpu::tSSE42); return cpu.has(Cpu::tSSE42);
case avx:
return cpu.has(Cpu::tAVX);
case avx2: case avx2:
return cpu.has(Cpu::tAVX2); return cpu.has(Cpu::tAVX2);
case avx512_common: case avx512_common:
...@@ -146,8 +147,16 @@ bool MayIUse(const cpu_isa_t cpu_isa) { ...@@ -146,8 +147,16 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
} }
return false; return false;
} }
#else
bool MayIUse(const cpu_isa_t cpu_isa) {
if (cpu_isa == isa_any) {
return true;
} else {
return false;
}
}
#endif
} // namespace jit } // namespace jit
#endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
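A usage sketch for the fallback above (call site hypothetical): when built without Xbyak, only isa_any is reported usable, so dispatch code degrades to the portable path.

    if (paddle::platform::jit::MayIUse(paddle::platform::jit::avx2)) {
      // take the AVX2-optimized kernel
    } else {
      // portable fallback; always taken when PADDLE_WITH_XBYAK is undefined
    }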
...@@ -37,12 +37,11 @@ size_t CUDAPinnedMinChunkSize(); ...@@ -37,12 +37,11 @@ size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator. //! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize(); size_t CUDAPinnedMaxChunkSize();
#ifdef PADDLE_WITH_XBYAK
namespace jit { namespace jit {
typedef enum { typedef enum {
isa_any, isa_any,
sse42, sse42,
avx,
avx2, avx2,
avx512_common, avx512_common,
avx512_core, avx512_core,
...@@ -55,7 +54,6 @@ typedef enum { ...@@ -55,7 +54,6 @@ typedef enum {
inline bool MayIUse(const cpu_isa_t cpu_isa); inline bool MayIUse(const cpu_isa_t cpu_isa);
} // namespace jit } // namespace jit
#endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -36,7 +36,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, ...@@ -36,7 +36,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
#if CUDA_VERSION < 9000 #if CUDA_VERSION < 9000
return __shfl_down(val, delta, width); return __shfl_down(val, delta, width);
#else #else
return __shfl_down_sync(mask, val, delta, width); return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
#endif #endif
} }
...@@ -46,9 +46,16 @@ template <> ...@@ -46,9 +46,16 @@ template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta, float16 val, int delta,
int width) { int width) {
half tmp = static_cast<half>(val); return float16(
__shfl_down(tmp, static_cast<unsigned>(delta), width); __shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
return float16(tmp); }
#else
template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta,
int width) {
return float16(__shfl_down_sync(mask, static_cast<half>(val),
static_cast<unsigned>(delta), width));
} }
#endif #endif
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <algorithm>
#include <iostream> #include <iostream>
#include <random> #include <random>
...@@ -123,7 +124,7 @@ void TestUnalign(size_t num, const int shift_bit) { ...@@ -123,7 +124,7 @@ void TestUnalign(size_t num, const int shift_bit) {
cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost); cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize(); cudaDeviceSynchronize();
for (size_t i = 0; i < num / 2; ++i) { for (size_t i = 0; i < num / 2; ++i) {
// NOTE(dzhwinter): the float16 add has small underflow/overflow // NOTE(dzhwinter): the float16 add has small truncation errors,
// so we use EXPECT_NEAR to check the result. // so we use EXPECT_NEAR to check the result.
EXPECT_NEAR(static_cast<float>(out[i]), EXPECT_NEAR(static_cast<float>(out[i]),
static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])), static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
...@@ -151,3 +152,83 @@ TEST(CudaAtomic, float16Unalign) { ...@@ -151,3 +152,83 @@ TEST(CudaAtomic, float16Unalign) {
TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3); TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3); TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
} }
// https://devblogs.nvidia.com/faster-parallel-reductions-kepler/
template <typename T>
static __forceinline__ __device__ T WarpReduceSum(T val) {
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
val += paddle::platform::CudaShuffleDownSync(mask, val, offset);
}
return val;
}
template <typename T>
__forceinline__ __device__ T BlockReduce(T val) {
static __shared__ T shared[32]; // Shared mem for 32 partial sums
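// (a block has at most 1024 threads = 32 warps of warpSize 32,
// so 32 partial-sum slots always suffice)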
int lane = threadIdx.x % warpSize;
int wid = threadIdx.x / warpSize;
val = WarpReduceSum(val); // Each warp performs partial reduction
if (lane == 0) shared[wid] = val; // Write reduced value to shared memory
__syncthreads(); // Wait for all partial reductions
// read from shared memory only if that warp existed
val =
(threadIdx.x < blockDim.x / warpSize) ? shared[lane] : static_cast<T>(0);
if (wid == 0) val = WarpReduceSum(val); // Final reduce within first warp
return val;
}
template <typename T>
__global__ void DeviceReduceSum(T* in, T* out, size_t N) {
T sum(0);
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
sum += in[i];
}
sum = BlockReduce<T>(sum);
__syncthreads();
if (threadIdx.x == 0) out[blockIdx.x] = sum;
}
template <typename T>
void TestReduce(size_t num, float atol = 0.01) {
T* in1;
T *d_in1, *d_in2;
size_t size = sizeof(T) * num;
cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
cudaMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
in1 = reinterpret_cast<T*>(malloc(size));
std::minstd_rand engine;
std::uniform_real_distribution<double> dist(0.0, 1.0);
for (size_t i = 0; i < num; ++i) {
in1[i] = static_cast<T>(dist(engine));
}
auto out = std::accumulate(in1, in1 + num, static_cast<T>(0));
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
EXPECT_NEAR(static_cast<float>(in1[0]), static_cast<float>(out), atol);
free(in1);
cudaFree(d_in1);
cudaFree(d_in2);
}
TEST(CudaShuffleSync, float16) {
TestReduce<float>(10);
TestReduce<float>(1000);
// float16 will overflow or accumulate truncation errors at large sizes.
TestReduce<float16>(10);
TestReduce<float16>(100, /*atol error*/ 1.0);
}
...@@ -22,7 +22,6 @@ ...@@ -22,7 +22,6 @@
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
......
...@@ -14,11 +14,10 @@ limitations under the License. */ ...@@ -14,11 +14,10 @@ limitations under the License. */
#pragma once #pragma once
#include <mkl.h> #include <mkl.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -50,25 +49,33 @@ extern void* mklml_dso_handle; ...@@ -50,25 +49,33 @@ extern void* mklml_dso_handle;
#define MKLML_ROUTINE_EACH(__macro) \ #define MKLML_ROUTINE_EACH(__macro) \
__macro(cblas_sgemm); \ __macro(cblas_sgemm); \
__macro(cblas_saxpy); \
__macro(cblas_scopy); \
__macro(cblas_sgemv); \
__macro(cblas_sgemm_batch); \
__macro(cblas_dgemm); \ __macro(cblas_dgemm); \
__macro(cblas_saxpy); \
__macro(cblas_daxpy); \ __macro(cblas_daxpy); \
__macro(cblas_scopy); \
__macro(cblas_dcopy); \ __macro(cblas_dcopy); \
__macro(cblas_sgemv); \
__macro(cblas_dgemv); \ __macro(cblas_dgemv); \
__macro(cblas_dgemm_batch); \
__macro(vsAdd); \
__macro(vdAdd); \
__macro(cblas_sgemm_alloc); \ __macro(cblas_sgemm_alloc); \
__macro(cblas_sgemm_pack); \
__macro(cblas_sgemm_compute); \
__macro(cblas_sgemm_free); \
__macro(cblas_dgemm_alloc); \ __macro(cblas_dgemm_alloc); \
__macro(cblas_sgemm_pack); \
__macro(cblas_dgemm_pack); \ __macro(cblas_dgemm_pack); \
__macro(cblas_sgemm_compute); \
__macro(cblas_dgemm_compute); \ __macro(cblas_dgemm_compute); \
__macro(cblas_sgemm_free); \
__macro(cblas_dgemm_free); \ __macro(cblas_dgemm_free); \
__macro(cblas_sgemm_batch); \
__macro(cblas_dgemm_batch); \
__macro(cblas_sdot); \
__macro(cblas_ddot); \
__macro(cblas_sscal); \
__macro(cblas_dscal); \
__macro(vsAdd); \
__macro(vdAdd); \
__macro(vsMul); \
__macro(vdMul); \
__macro(vsExp); \
__macro(vdExp); \
__macro(MKL_Set_Num_Threads) __macro(MKL_Set_Num_Threads)
MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
......
...@@ -16,8 +16,8 @@ limitations under the License. */ ...@@ -16,8 +16,8 @@ limitations under the License. */
#include <nccl.h> #include <nccl.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"
#include "warpctc/include/ctc.h" #include "warpctc/include/ctc.h"
namespace paddle { namespace paddle {
......
...@@ -35,6 +35,7 @@ limitations under the License. */ ...@@ -35,6 +35,7 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/to_string.h" #include "paddle/fluid/string/to_string.h"
......
...@@ -125,6 +125,11 @@ class MKLDNNHandler { ...@@ -125,6 +125,11 @@ class MKLDNNHandler {
return this->AcquireMemory(md, ptr, "@user_weights_mem_p"); return this->AcquireMemory(md, ptr, "@user_weights_mem_p");
} }
std::shared_ptr<mkldnn::memory> AcquireBiasMemory(
const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_bias_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDstMemory( std::shared_ptr<mkldnn::memory> AcquireDstMemory(
const mkldnn::memory::desc& md, void* ptr) { const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); return this->AcquireMemory(md, ptr, "@user_dst_mem_p");
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdexcept>
#include <string>
#if !defined(_WIN32)
#include <dlfcn.h> // for dladdr
#include <execinfo.h> // for backtrace
#else
#include <Shlwapi.h>
#include <Windows.h>
static void* dlsym(void* handle, const char* symbol_name) {
FARPROC found_symbol;
found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
if (found_symbol == NULL) {
throw std::runtime_error(std::string(symbol_name) + " not found.");
}
return reinterpret_cast<void*>(found_symbol);
}
#endif
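With this shim, the same dlsym call compiles on both platforms once a library handle has been obtained (dlopen on POSIX, LoadLibrary on Windows). A self-contained POSIX-side usage sketch; the library name is a Linux assumption:

#include <dlfcn.h>
#include <cstdio>

int main() {
  // Resolve cos() from libm at runtime.
  void* handle = dlopen("libm.so.6", RTLD_LAZY);
  if (handle == nullptr) return 1;
  auto cosine = reinterpret_cast<double (*)(double)>(dlsym(handle, "cos"));
  if (cosine != nullptr) std::printf("%f\n", cosine(0.0));  // prints 1.000000
  dlclose(handle);
  return 0;
}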
set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method) set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc)
if(NOT WIN32) if(NOT WIN32)
...@@ -7,12 +8,20 @@ endif() ...@@ -7,12 +8,20 @@ endif()
if(WITH_PYTHON) if(WITH_PYTHON)
if(WITH_AMD_GPU) if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED hip_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS} SRCS ${PYBIND_SRCS}
DEPS ${PYBIND_DEPS} DEPS ${PYBIND_DEPS}
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
else() else()
cc_library(paddle_pybind SHARED cc_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS} SRCS ${PYBIND_SRCS}
DEPS ${PYBIND_DEPS} DEPS ${PYBIND_DEPS}
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID AND NOT WIN32) if(NOT APPLE AND NOT ANDROID AND NOT WIN32)
......
...@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/const_value.h"
#include <paddle/fluid/framework/op_proto_maker.h> #include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
namespace paddle { namespace paddle {
...@@ -24,6 +25,8 @@ void BindConstValue(pybind11::module* m) { ...@@ -24,6 +25,8 @@ void BindConstValue(pybind11::module* m) {
m->def("kTempVarName", [] { return framework::kTempVarName; }); m->def("kTempVarName", [] { return framework::kTempVarName; });
m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
m->def("kControlDepVarName",
[] { return framework::ir::Node::kControlDepVarName; });
auto op_proto_and_checker_maker = auto op_proto_and_checker_maker =
m->def_submodule("op_proto_and_checker_maker"); m->def_submodule("op_proto_and_checker_maker");
......
...@@ -596,8 +596,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -596,8 +596,8 @@ All parameter, weight, gradient are variables in Paddle.
// -- python binds for parallel executor. // -- python binds for parallel executor.
py::class_<ParallelExecutor> pe(m, "ParallelExecutor"); py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
py::class_<ExecutionStrategy>(pe, "ExecutionStrategy") py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy");
.def(py::init()) exec_strategy.def(py::init())
.def_property( .def_property(
"num_threads", "num_threads",
[](const ExecutionStrategy &self) { return self.num_threads_; }, [](const ExecutionStrategy &self) { return self.num_threads_; },
...@@ -624,6 +624,16 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -624,6 +624,16 @@ All parameter, weight, gradient are variables in Paddle.
[](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
}); });
exec_strategy.def_property(
"use_experimental_executor",
[](const ExecutionStrategy &self) {
return self.type_ == ExecutionStrategy::kExperimental;
},
[](ExecutionStrategy &self, bool experimental) {
self.type_ = experimental ? ExecutionStrategy::kExperimental
: ExecutionStrategy::kDefault;
});
py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy"); py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy");
py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy") py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
......
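From the Python side the new property toggles between the default and the experimental executor. A minimal usage sketch, assuming a build of paddle.fluid at this revision:

import paddle.fluid as fluid

exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = True   # selects kExperimental
assert exec_strategy.use_experimental_executor   # the getter reads type_ back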
...@@ -116,7 +116,6 @@ function cmake_gen() { ...@@ -116,7 +116,6 @@ function cmake_gen() {
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
-DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON}
-DPY_VERSION=${PY_VERSION:-2.7} -DPY_VERSION=${PY_VERSION:-2.7}
======================================== ========================================
EOF EOF
...@@ -146,7 +145,6 @@ EOF ...@@ -146,7 +145,6 @@ EOF
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
-DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} \
-DPY_VERSION=${PY_VERSION:-2.7} -DPY_VERSION=${PY_VERSION:-2.7}
} }
......
...@@ -54,7 +54,7 @@ function cpu_config() { ...@@ -54,7 +54,7 @@ function cpu_config() {
if [ $platform == "Linux" ]; then if [ $platform == "Linux" ]; then
ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
elif [ $platform == "Darwin" ]; then elif [ $platform == "Darwin" ]; then
if [`sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu`]; then if [ `sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu` ]; then
# HT is OFF # HT is OFF
ht=1 ht=1
fi fi
......
...@@ -19,6 +19,7 @@ import hashlib ...@@ -19,6 +19,7 @@ import hashlib
import os import os
import errno import errno
import shutil import shutil
import six
import sys import sys
import importlib import importlib
import paddle.dataset import paddle.dataset
...@@ -94,6 +95,8 @@ def download(url, module_name, md5sum, save_name=None): ...@@ -94,6 +95,8 @@ def download(url, module_name, md5sum, save_name=None):
dl = 0 dl = 0
total_length = int(total_length) total_length = int(total_length)
for data in r.iter_content(chunk_size=4096): for data in r.iter_content(chunk_size=4096):
if six.PY2:
data = six.b(data)
dl += len(data) dl += len(data)
f.write(data) f.write(data)
done = int(50 * dl / total_length) done = int(50 * dl / total_length)
......
...@@ -35,6 +35,7 @@ import itertools ...@@ -35,6 +35,7 @@ import itertools
import functools import functools
from .common import download from .common import download
import tarfile import tarfile
import six
import scipy.io as scio import scipy.io as scio
from paddle.dataset.image import * from paddle.dataset.image import *
from paddle.reader import * from paddle.reader import *
...@@ -45,10 +46,10 @@ from six.moves import cPickle as pickle ...@@ -45,10 +46,10 @@ from six.moves import cPickle as pickle
from six.moves import zip from six.moves import zip
__all__ = ['train', 'test', 'valid'] __all__ = ['train', 'test', 'valid']
DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz' DATA_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/102flowers.tgz'
LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat' LABEL_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/imagelabels.mat'
SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat' SETID_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/setid.mat'
DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118' DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
# In official 'readme', tstid is the flag of test data # In official 'readme', tstid is the flag of test data
...@@ -120,7 +121,10 @@ def reader_creator(data_file, ...@@ -120,7 +121,10 @@ def reader_creator(data_file,
file = file.strip() file = file.strip()
batch = None batch = None
with open(file, 'rb') as f: with open(file, 'rb') as f:
if six.PY2:
batch = pickle.load(f) batch = pickle.load(f)
else:
batch = pickle.load(f, encoding='bytes')
data = batch['data'] data = batch['data']
labels = batch['label'] labels = batch['label']
for sample, label in zip(data, batch['label']): for sample, label in zip(data, batch['label']):
......
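The same version branch recurs in a few dataset readers; a compact equivalent of the logic above, as a sketch with a hypothetical helper name:

import pickle
import six

def load_batch(f):
    # Python 2's pickle has no `encoding` kwarg; Python 3 needs it to read
    # Python 2 pickles as raw bytes.
    if six.PY2:
        return pickle.load(f)
    return pickle.load(f, encoding='bytes')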
...@@ -24,7 +24,6 @@ import paddle.dataset.common ...@@ -24,7 +24,6 @@ import paddle.dataset.common
import subprocess import subprocess
import numpy import numpy
import platform import platform
import six
import tempfile import tempfile
from six.moves import range from six.moves import range
__all__ = ['train', 'test', 'convert'] __all__ = ['train', 'test', 'convert']
......
...@@ -49,6 +49,12 @@ EMPTY_VAR_NAME = core.kEmptyVarName() ...@@ -49,6 +49,12 @@ EMPTY_VAR_NAME = core.kEmptyVarName()
TEMP_VAR_NAME = core.kTempVarName() TEMP_VAR_NAME = core.kTempVarName()
GRAD_VAR_SUFFIX = core.kGradVarSuffix() GRAD_VAR_SUFFIX = core.kGradVarSuffix()
ZERO_VAR_SUFFIX = core.kZeroVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
def generate_control_dev_var_name():
import random
return CONTROL_DEP_VAR_PREFIX + "@" + str(random.random())
def grad_var_name(var_name): def grad_var_name(var_name):
...@@ -1363,6 +1369,13 @@ class Program(object): ...@@ -1363,6 +1369,13 @@ class Program(object):
self._current_role = core.op_proto_and_checker_maker.OpRole.Forward self._current_role = core.op_proto_and_checker_maker.OpRole.Forward
self._op_role_var = [] self._op_role_var = []
# for distributed training
self._is_distributed = False
self._is_chief = False
self._slice_vars_and_attrs = []
self._endpoints = []
self._distributed_lookup_table = None
@property @property
def op_role(self): def op_role(self):
""" """
......
...@@ -372,6 +372,7 @@ def load_vars(executor, ...@@ -372,6 +372,7 @@ def load_vars(executor,
load_vars( load_vars(
executor, executor,
dirname=dirname, dirname=dirname,
main_program=main_program,
vars=list(filter(predicate, main_program.list_vars())), vars=list(filter(predicate, main_program.list_vars())),
filename=filename) filename=filename)
else: else:
...@@ -403,9 +404,15 @@ def load_vars(executor, ...@@ -403,9 +404,15 @@ def load_vars(executor,
inputs={}, inputs={},
outputs={"Out": load_var_list}, outputs={"Out": load_var_list},
attrs={'file_path': os.path.join(dirname, filename)}) attrs={'file_path': os.path.join(dirname, filename)})
executor.run(load_prog) executor.run(load_prog)
if main_program is None:
main_program = default_main_program()
# load sliced vars on the pserver, if any.
_load_slice_up_vars(executor, dirname,
main_program._slice_vars_and_attrs)
def load_params(executor, dirname, main_program=None, filename=None): def load_params(executor, dirname, main_program=None, filename=None):
""" """
...@@ -659,11 +666,19 @@ def save_inference_model(dirname, ...@@ -659,11 +666,19 @@ def save_inference_model(dirname,
save_persistables(executor, dirname, inference_program, params_filename) save_persistables(executor, dirname, inference_program, params_filename)
# if there is a lookup table, trainer 0 will notify all pservers to save it.
if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
lookup_table_filename = os.path.join(dirname, "__lookup_table__")
_save_lookup_tables_by_notify(executor, lookup_table_filename,
main_program._distributed_lookup_table,
main_program._endpoints)
def load_inference_model(dirname, def load_inference_model(dirname,
executor, executor,
model_filename=None, model_filename=None,
params_filename=None): params_filename=None,
pserver_endpoints=None):
""" """
Load inference model from a directory Load inference model from a directory
...@@ -679,6 +694,10 @@ def load_inference_model(dirname, ...@@ -679,6 +694,10 @@ def load_inference_model(dirname,
parameters were saved in a single binary parameters were saved in a single binary
file. If parameters were saved in separate file. If parameters were saved in separate
files, set it as 'None'. files, set it as 'None'.
pserver_endpoints(list|None): Only needed for distributed inference.
When a distributed lookup table is used in training,
it is also needed in inference. The parameter is
a list of pserver endpoints.
Returns: Returns:
tuple: The return of this function is a tuple with three elements: tuple: The return of this function is a tuple with three elements:
...@@ -697,12 +716,16 @@ def load_inference_model(dirname, ...@@ -697,12 +716,16 @@ def load_inference_model(dirname,
exe = fluid.Executor(fluid.CPUPlace()) exe = fluid.Executor(fluid.CPUPlace())
path = "./infer_model" path = "./infer_model"
endpoints = ["127.0.0.1:2023","127.0.0.1:2024"]
[inference_program, feed_target_names, fetch_targets] = [inference_program, feed_target_names, fetch_targets] =
fluid.io.load_inference_model(dirname=path, executor=exe) fluid.io.load_inference_model(dirname=path, executor=exe)
results = exe.run(inference_program, results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_img}, feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets) fetch_list=fetch_targets)
# If a distributed lookup table is used, pass the pserver endpoints:
fluid.io.load_inference_model(dirname=path, executor=exe, pserver_endpoints=endpoints)
# In this example, the inference program was saved in the # In this example, the inference program was saved in the
# "./infer_model/__model__" and parameters were saved in # "./infer_model/__model__" and parameters were saved in
# separate files in "./infer_model". # separate files in "./infer_model".
...@@ -729,6 +752,9 @@ def load_inference_model(dirname, ...@@ -729,6 +752,9 @@ def load_inference_model(dirname,
program = Program.parse_from_string(program_desc_str) program = Program.parse_from_string(program_desc_str)
load_persistables(executor, dirname, program, params_filename) load_persistables(executor, dirname, program, params_filename)
if pserver_endpoints:
program = _endpoints_replacement(program, pserver_endpoints)
feed_target_names = program.desc.get_feed_target_names() feed_target_names = program.desc.get_feed_target_names()
fetch_target_names = program.desc.get_fetch_target_names() fetch_target_names = program.desc.get_fetch_target_names()
fetch_targets = [ fetch_targets = [
...@@ -738,6 +764,61 @@ def load_inference_model(dirname, ...@@ -738,6 +764,61 @@ def load_inference_model(dirname,
return [program, feed_target_names, fetch_targets] return [program, feed_target_names, fetch_targets]
def _save_lookup_tables_by_notify(executor, dirname, lookup_table,
pserver_endpoints):
"""
This function sends a checkpoint-notify message from trainer 0
to all the pservers.
The checkpoint-notify message contains the lookup table name and
the absolute path on the pserver where the lookup table should be saved.
Args:
executor(Executor): The executor that runs the checkpoint-notify program.
dirname(str): The folder where the lookup table is saved.
lookup_table(string): The lookup table name. When a distributed
lookup table is used, it can be obtained from
DistributeTranspiler.table_name.
pserver_endpoints(list): The parameter server ip:port list. When a
distributed lookup table is used, it can be obtained from the
distribute arguments.
Return:
None
Examples:
.. code-block:: python
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
table_name = "share_w"
ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
_save_lookup_tables_by_notify(executor=exe,
dirname=param_path, lookup_table=table_name,
pserver_endpoints=ps_endpoints)
"""
pserver_notify_program = Program()
pserver_notify_block = pserver_notify_program.global_block()
attrs = {}
attrs['epmap'] = pserver_endpoints
attrs['dir'] = dirname
attrs['lookup_table'] = lookup_table
pserver_notify_block.append_op(
type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
executor.run(pserver_notify_program)
def _endpoints_replacement(program, endpoints):
ENDPOINT_MAP = "epmap"
for op in program.global_block().ops:
if op.has_attr(ENDPOINT_MAP):
op.set_attr(ENDPOINT_MAP, endpoints)
program._sync_with_cpp()
return program
def get_parameter_value(para, executor): def get_parameter_value(para, executor):
""" """
Get the LoDTensor value of the given parameter. Get the LoDTensor value of the given parameter.
...@@ -799,3 +880,46 @@ def get_parameter_value_by_name(name, executor, program=None): ...@@ -799,3 +880,46 @@ def get_parameter_value_by_name(name, executor, program=None):
program = default_main_program() program = default_main_program()
var = program.global_block().var(name) var = program.global_block().var(name)
return get_parameter_value(var, executor) return get_parameter_value(var, executor)
def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
if not slice_vars_and_attrs:
return
load_prog = Program()
load_block = load_prog.global_block()
for var_tuple in slice_vars_and_attrs:
orig_var = var_tuple[0]
start = var_tuple[1]
slice_var = var_tuple[2]
end = start + reduce(lambda x, y: x * y, slice_var.shape)
clone_orig_var = load_block.create_var(
name=orig_var.name,
type=orig_var.type,
shape=orig_var.shape,
dtype=orig_var.dtype,
persistable=True)
clone_slice_var = load_block.create_var(
name=slice_var.name,
type=slice_var.type,
shape=slice_var.shape,
dtype=slice_var.dtype,
persistable=True)
load_block.append_op(
type='load',
inputs={},
outputs={'Out': [clone_orig_var]},
attrs={'file_path': os.path.join(dirname, clone_orig_var.name)})
load_block.append_op(
type="slice",
inputs={'Input': clone_orig_var},
outputs={'Out': clone_slice_var},
attrs={'axes': [0],
'starts': [start],
'ends': [end]})
executor.run(load_prog)
...@@ -661,6 +661,7 @@ class While(object): ...@@ -661,6 +661,7 @@ class While(object):
Args: Args:
cond (Variable): condition used to compare. cond (Variable): condition used to compare.
is_test(bool): A flag indicating whether execution is in test phase.
name (str): The name of this layer. name (str): The name of this layer.
Examples: Examples:
...@@ -683,7 +684,7 @@ class While(object): ...@@ -683,7 +684,7 @@ class While(object):
IN_WHILE_BLOCK = 1 IN_WHILE_BLOCK = 1
AFTER_WHILE_BLOCK = 2 AFTER_WHILE_BLOCK = 2
def __init__(self, cond, name=None): def __init__(self, cond, is_test=False, name=None):
self.helper = LayerHelper("while", name=name) self.helper = LayerHelper("while", name=name)
self.status = While.BEFORE_WHILE_BLOCK self.status = While.BEFORE_WHILE_BLOCK
if not isinstance(cond, Variable): if not isinstance(cond, Variable):
...@@ -694,6 +695,7 @@ class While(object): ...@@ -694,6 +695,7 @@ class While(object):
if reduce(lambda a, b: a * b, cond.shape, 1) != 1: if reduce(lambda a, b: a * b, cond.shape, 1) != 1:
raise TypeError("condition should be a bool scalar") raise TypeError("condition should be a bool scalar")
self.cond_var = cond self.cond_var = cond
self.is_test = is_test
def block(self): def block(self):
return WhileGuard(self) return WhileGuard(self)
...@@ -735,7 +737,8 @@ class While(object): ...@@ -735,7 +737,8 @@ class While(object):
}, },
outputs={'Out': out_vars, outputs={'Out': out_vars,
'StepScopes': [step_scope]}, 'StepScopes': [step_scope]},
attrs={'sub_block': while_block}) attrs={'sub_block': while_block,
"is_test": self.is_test})
def lod_rank_table(x, level=0): def lod_rank_table(x, level=0):
...@@ -1272,8 +1275,8 @@ class ConditionalBlock(object): ...@@ -1272,8 +1275,8 @@ class ConditionalBlock(object):
parent_block.append_op( parent_block.append_op(
type='conditional_block', type='conditional_block',
inputs={ inputs={
'X': self.inputs, 'Cond': self.inputs,
'Params': param_list, 'Input': param_list,
}, },
outputs={'Out': out_list, outputs={'Out': out_list,
'Scope': [step_scope]}, 'Scope': [step_scope]},
......
...@@ -24,7 +24,7 @@ from .layer_function_generator import templatedoc ...@@ -24,7 +24,7 @@ from .layer_function_generator import templatedoc
from .. import core from .. import core
from ..executor import global_scope from ..executor import global_scope
from ..framework import convert_np_dtype_to_dtype_, default_main_program, \ from ..framework import convert_np_dtype_to_dtype_, default_main_program, \
default_startup_program, program_guard, Program default_startup_program, program_guard, Program, Variable
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..unique_name import generate as unique_name from ..unique_name import generate as unique_name
...@@ -209,7 +209,7 @@ class ListenAndServ(object): ...@@ -209,7 +209,7 @@ class ListenAndServ(object):
}) })
def Send(endpoints, send_vars, sync=True): def Send(endpoints, send_vars, dummy_output=None, sync=True):
""" """
Send variables to the server side, and get vars from server Send variables to the server side, and get vars from server
side when server have finished running server side program. side when server have finished running server side program.
...@@ -223,6 +223,13 @@ def Send(endpoints, send_vars, sync=True): ...@@ -223,6 +223,13 @@ def Send(endpoints, send_vars, sync=True):
""" """
assert (type(send_vars) == list) assert (type(send_vars) == list)
if dummy_output is None:
dummy_output = []
elif isinstance(dummy_output, Variable):
dummy_output = [dummy_output]
assert (type(dummy_output) == list)
epmap = endpoints.split(",") epmap = endpoints.split(",")
endpoints = list(set(epmap)) endpoints = list(set(epmap))
...@@ -232,6 +239,7 @@ def Send(endpoints, send_vars, sync=True): ...@@ -232,6 +239,7 @@ def Send(endpoints, send_vars, sync=True):
helper.append_op( helper.append_op(
type="send", type="send",
inputs={"X": send_vars}, inputs={"X": send_vars},
outputs={"Out": dummy_output},
attrs={ attrs={
"endpoints": endpoints, "endpoints": endpoints,
"epmap": epmap, "epmap": epmap,
...@@ -241,7 +249,7 @@ def Send(endpoints, send_vars, sync=True): ...@@ -241,7 +249,7 @@ def Send(endpoints, send_vars, sync=True):
helper.append_op(type="send_barrier", attrs={"endpoints": endpoints}) helper.append_op(type="send_barrier", attrs={"endpoints": endpoints})
def Recv(endpoints, get_vars, sync=True): def Recv(endpoints, get_vars, dummy_input=None, sync=True):
""" """
Receive variables from server side Receive variables from server side
...@@ -256,13 +264,20 @@ def Recv(endpoints, get_vars, sync=True): ...@@ -256,13 +264,20 @@ def Recv(endpoints, get_vars, sync=True):
""" """
assert (type(get_vars) == list) assert (type(get_vars) == list)
if dummy_input is None:
dummy_input = []
elif isinstance(dummy_input, Variable):
dummy_input = [dummy_input]
assert (type(dummy_input) == list)
epmap = endpoints.split(",") epmap = endpoints.split(",")
endpoints = list(set(epmap)) endpoints = list(set(epmap))
helper = LayerHelper("Recv", **locals()) helper = LayerHelper("Recv", **locals())
helper.append_op( helper.append_op(
type="recv", type="recv",
inputs={"X": get_vars}, inputs={"X": dummy_input},
outputs={"Out": get_vars}, outputs={"Out": get_vars},
attrs={"endpoints": endpoints, attrs={"endpoints": endpoints,
"epmap": epmap}) "epmap": epmap})
......
...@@ -94,6 +94,7 @@ __all__ = [ ...@@ -94,6 +94,7 @@ __all__ = [
'image_resize_short', 'image_resize_short',
'resize_bilinear', 'resize_bilinear',
'gather', 'gather',
'scatter',
'random_crop', 'random_crop',
'mean_iou', 'mean_iou',
'relu', 'relu',
...@@ -102,6 +103,7 @@ __all__ = [ ...@@ -102,6 +103,7 @@ __all__ = [
'rank_loss', 'rank_loss',
'prelu', 'prelu',
'flatten', 'flatten',
'stack',
] ]
...@@ -5036,6 +5038,47 @@ def gather(input, index): ...@@ -5036,6 +5038,47 @@ def gather(input, index):
return out return out
def scatter(input, index, updates, name=None):
"""
**Scatter Layer**
Output is obtained by updating the input on the selected indices along
the first axis.
.. math::
Out = X
Out[Ids] = Updates
Args:
input (Variable): The source input with rank>=1.
index (Variable): The index input with rank=1. Its dtype should be
int32 or int64 as it is used as indexes.
updates (Variable): The updated value of scatter op.
name (str|None): The output variable name. Default None.
Returns:
output (Variable): The output is a tensor with the same shape as input.
Examples:
.. code-block:: python
output = fluid.layers.scatter(input, index, updates)
"""
helper = LayerHelper('scatter', **locals())
dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype)
helper.append_op(
type="scatter",
inputs={"X": input,
"Ids": index,
"Updates": updates},
outputs={"Out": out})
return out
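The update rule in the docstring is row-wise assignment along axis 0; a small numpy illustration of the same semantics, with hypothetical data:

import numpy as np

x = np.ones((3, 2), dtype=np.float32)
ids = np.array([2, 0], dtype=np.int64)
updates = np.array([[9., 9.], [7., 7.]], dtype=np.float32)

out = x.copy()        # Out = X
out[ids] = updates    # Out[Ids] = Updates
# out -> [[7., 7.], [1., 1.], [9., 9.]]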
@templatedoc() @templatedoc()
def random_crop(x, shape, seed=None): def random_crop(x, shape, seed=None):
""" """
...@@ -5475,3 +5518,17 @@ def flatten(x, axis=1, name=None): ...@@ -5475,3 +5518,17 @@ def flatten(x, axis=1, name=None):
outputs={'Out': out}, outputs={'Out': out},
attrs={"axis": axis}) attrs={"axis": axis})
return out return out
def stack(x, axis=0):
helper = LayerHelper('stack', **locals())
axis = 0 if axis is None else axis
if not isinstance(x, list) and not isinstance(x, tuple):
x = [x]
out = helper.create_tmp_variable(x[0].dtype)
helper.append_op(
type='stack', inputs={'X': x}, outputs={'Y': out},
attrs={'axis': axis})
return out
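stack joins a list of tensors along a new axis, matching numpy.stack; a quick illustration with hypothetical data:

import numpy as np

a = np.zeros((2, 3), dtype=np.float32)
b = np.ones((2, 3), dtype=np.float32)
print(np.stack([a, b], axis=0).shape)   # (2, 2, 3) -- new leading axis
print(np.stack([a, b], axis=2).shape)   # (2, 3, 2) -- new trailing axis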
...@@ -65,7 +65,6 @@ __all__ = [ ...@@ -65,7 +65,6 @@ __all__ = [
'uniform_random_batch_size_like', 'uniform_random_batch_size_like',
'gaussian_random', 'gaussian_random',
'gaussian_random_batch_size_like', 'gaussian_random_batch_size_like',
'scatter',
'sum', 'sum',
'slice', 'slice',
'shape', 'shape',
......
...@@ -30,7 +30,8 @@ import numpy as np ...@@ -30,7 +30,8 @@ import numpy as np
class TestMNISTIfElseOp(unittest.TestCase): class TestMNISTIfElseOp(unittest.TestCase):
def test_raw_api(self): # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379
def not_test_raw_api(self):
prog = Program() prog = Program()
startup_prog = Program() startup_prog = Program()
with program_guard(prog, startup_prog): with program_guard(prog, startup_prog):
...@@ -91,7 +92,8 @@ class TestMNISTIfElseOp(unittest.TestCase): ...@@ -91,7 +92,8 @@ class TestMNISTIfElseOp(unittest.TestCase):
return return
self.assertFalse(True) self.assertFalse(True)
def test_ifelse(self): # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379
def not_test_ifelse(self):
prog = Program() prog = Program()
startup_prog = Program() startup_prog = Program()
with program_guard(prog, startup_prog): with program_guard(prog, startup_prog):
...@@ -153,6 +155,13 @@ class TestIfElse(unittest.TestCase): ...@@ -153,6 +155,13 @@ class TestIfElse(unittest.TestCase):
self.cond_value = 0.5 self.cond_value = 0.5
self.data = np.random.rand(25, 1).astype(np.float32) self.data = np.random.rand(25, 1).astype(np.float32)
def numpy_cal(self):
s1 = self.data[np.where(self.data < self.cond_value)]
res = np.sum(np.exp(s1))
s2 = self.data[np.where(self.data >= self.cond_value)]
res += np.sum(np.tanh(s2))
return res
def compare_ifelse_op_and_numpy(self, place): def compare_ifelse_op_and_numpy(self, place):
self.set_test_case() self.set_test_case()
...@@ -166,10 +175,12 @@ class TestIfElse(unittest.TestCase): ...@@ -166,10 +175,12 @@ class TestIfElse(unittest.TestCase):
ie = layers.IfElse(ifcond) ie = layers.IfElse(ifcond)
with ie.true_block(): with ie.true_block():
true_target = ie.input(src) true_target = ie.input(src)
true_target = fluid.layers.exp(true_target)
ie.output(true_target) ie.output(true_target)
with ie.false_block(): with ie.false_block():
false_target = ie.input(src) false_target = ie.input(src)
false_target = fluid.layers.tanh(false_target)
ie.output(false_target) ie.output(false_target)
if_out = ie() if_out = ie()
out = layers.reduce_sum(if_out) out = layers.reduce_sum(if_out)
...@@ -180,7 +191,8 @@ class TestIfElse(unittest.TestCase): ...@@ -180,7 +191,8 @@ class TestIfElse(unittest.TestCase):
o1, = exe.run(fluid.default_main_program(), o1, = exe.run(fluid.default_main_program(),
feed={'data': self.data}, feed={'data': self.data},
fetch_list=[out]) fetch_list=[out])
o2 = np.sum(self.data) o2 = self.numpy_cal()
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
o1, o2, atol=1e-8), o1, o2, atol=1e-8),
......
...@@ -46,7 +46,8 @@ def cnn_model(data): ...@@ -46,7 +46,8 @@ def cnn_model(data):
pool_size=2, pool_size=2,
pool_stride=2, pool_stride=2,
act="relu", act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant())) param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.3)))
conv_pool_2 = fluid.nets.simple_img_conv_pool( conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1, input=conv_pool_1,
filter_size=5, filter_size=5,
...@@ -54,7 +55,8 @@ def cnn_model(data): ...@@ -54,7 +55,8 @@ def cnn_model(data):
pool_size=2, pool_size=2,
pool_stride=2, pool_stride=2,
act="relu", act="relu",
param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant())) param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=0.2)))
SIZE = 10 SIZE = 10
input_shape = conv_pool_2.shape input_shape = conv_pool_2.shape
...@@ -66,8 +68,7 @@ def cnn_model(data): ...@@ -66,8 +68,7 @@ def cnn_model(data):
size=SIZE, size=SIZE,
act="softmax", act="softmax",
param_attr=fluid.param_attr.ParamAttr( param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer( initializer=fluid.initializer.Constant(value=0.1)))
loc=0.0, scale=scale, seed=1)))
return predict return predict
......
...@@ -16,7 +16,6 @@ from __future__ import print_function ...@@ -16,7 +16,6 @@ from __future__ import print_function
import numpy as np import numpy as np
import argparse import argparse
import six
import time import time
import math import math
...@@ -130,7 +129,12 @@ class SE_ResNeXt(): ...@@ -130,7 +129,12 @@ class SE_ResNeXt():
input=conv, pool_size=7, pool_type='avg', global_pooling=True) input=conv, pool_size=7, pool_type='avg', global_pooling=True)
drop = fluid.layers.dropout(x=pool, dropout_prob=0.2) drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(input=drop, size=class_dim, act='softmax') out = fluid.layers.fc(
input=drop,
size=class_dim,
act='softmax',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.2)))
return out return out
def shortcut(self, input, ch_out, stride): def shortcut(self, input, ch_out, stride):
...@@ -180,7 +184,7 @@ class SE_ResNeXt(): ...@@ -180,7 +184,7 @@ class SE_ResNeXt():
act=None, act=None,
# avoid pserver CPU init differs from GPU # avoid pserver CPU init differs from GPU
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant()), initializer=fluid.initializer.Constant(value=0.2)),
bias_attr=False) bias_attr=False)
return fluid.layers.batch_norm(input=conv, act=act) return fluid.layers.batch_norm(input=conv, act=act)
...@@ -229,10 +233,8 @@ class DistSeResneXt2x2(TestDistRunnerBase): ...@@ -229,10 +233,8 @@ class DistSeResneXt2x2(TestDistRunnerBase):
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum( optimizer = fluid.optimizer.Momentum(
# FIXME(typhoonzero): add back LR decay once ParallelExecutor fixed. learning_rate=fluid.layers.piecewise_decay(
#learning_rate=fluid.layers.piecewise_decay( boundaries=bd, values=lr),
# boundaries=bd, values=lr),
learning_rate=base_lr,
momentum=0.9, momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4)) regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
......
...@@ -265,9 +265,9 @@ def main(role="pserver", ...@@ -265,9 +265,9 @@ def main(role="pserver",
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv) != 7: if len(sys.argv) != 8:
print( print(
"Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]" "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]"
) )
role = sys.argv[1] role = sys.argv[1]
endpoints = sys.argv[2] endpoints = sys.argv[2]
...@@ -275,6 +275,8 @@ if __name__ == "__main__": ...@@ -275,6 +275,8 @@ if __name__ == "__main__":
current_endpoint = sys.argv[4] current_endpoint = sys.argv[4]
trainers = int(sys.argv[5]) trainers = int(sys.argv[5])
is_dist = True if sys.argv[6] == "TRUE" else False is_dist = True if sys.argv[6] == "TRUE" else False
# FIXME(typhoonzero): refine this test.
is_async = True if sys.argv[7] == "TRUE" else False
main( main(
role=role, role=role,
endpoints=endpoints, endpoints=endpoints,
......
...@@ -56,8 +56,8 @@ def get_numeric_gradient(place, ...@@ -56,8 +56,8 @@ def get_numeric_gradient(place,
def get_output(): def get_output():
sum = [] sum = []
for output_name in output_names:
op.run(scope, place) op.run(scope, place)
for output_name in output_names:
sum.append( sum.append(
np.array(scope.find_var(output_name).get_tensor()).mean()) np.array(scope.find_var(output_name).get_tensor()).mean())
return np.array(sum).mean() return np.array(sum).mean()
......
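For context, get_numeric_gradient perturbs one input element at a time and re-runs the op, i.e. ordinary finite differences; the fix above runs the op once and then collects all outputs, instead of re-running it per output name. A standalone sketch of the idea, with a hypothetical helper:

import numpy as np

def numeric_grad(f, x, delta=1e-4):
    # Central-difference estimate of df/dx for a scalar-valued f.
    grad = np.zeros_like(x)
    flat_x, flat_g = x.reshape(-1), grad.reshape(-1)
    for i in range(flat_x.size):
        orig = flat_x[i]
        flat_x[i] = orig + delta
        y_pos = f(x)
        flat_x[i] = orig - delta
        y_neg = f(x)
        flat_x[i] = orig
        flat_g[i] = (y_pos - y_neg) / (2.0 * delta)
    return grad

# numeric_grad(lambda a: float(np.sum(a ** 2)), np.ones((2, 2))) -> all 2.0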
...@@ -38,7 +38,8 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -38,7 +38,8 @@ class TestParallelExecutorBase(unittest.TestCase):
seed=None, seed=None,
use_parallel_executor=True, use_parallel_executor=True,
use_reduce=False, use_reduce=False,
optimizer=fluid.optimizer.Adam): optimizer=fluid.optimizer.Adam,
use_fast_executor=False):
def run_executor(exe, feed, fetch_list, program=None): def run_executor(exe, feed, fetch_list, program=None):
if isinstance(exe, fluid.ParallelExecutor): if isinstance(exe, fluid.ParallelExecutor):
res = exe.run(fetch_list=fetch_list, feed=feed) res = exe.run(fetch_list=fetch_list, feed=feed)
...@@ -71,6 +72,8 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -71,6 +72,8 @@ class TestParallelExecutorBase(unittest.TestCase):
startup_exe.run(startup) startup_exe.run(startup)
exec_strategy = fluid.ExecutionStrategy() exec_strategy = fluid.ExecutionStrategy()
exec_strategy.allow_op_delay = allow_op_delay exec_strategy.allow_op_delay = allow_op_delay
if use_fast_executor:
exec_strategy.use_experimental_executor = True
build_strategy = fluid.BuildStrategy() build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
from test_fusion_lstm_op import fc, ACTIVATION
from test_softmax_op import stable_softmax
def attention_lstm(
x, # T x M
lod, # 1 x N
h0, # N x D
c0, # N x D
fcws, # (M+D) x 1, 1x1
fcbs, # 1 x 1, 1x1
w, # (M+D) x 4D
b, # 1 x 4D
act_gate,
act_cell,
act_cand):
T = sum(lod[0])
N = len(lod[0])
M = x.shape[1]
D = b.shape[1] // 4  # hidden size; integer division keeps D an int on Python 3
assert T == x.shape[0]
assert len(fcws) == len(fcbs)
hidden = []
cell = []
start_offset = 0
for bid in range(N):
seq_len = lod[0][bid]
xi = np.copy(x[start_offset:start_offset + seq_len, :]).reshape(seq_len,
M)
prev_cell = np.copy(c0[bid]).reshape([1, D])
prev_hidden = np.copy(h0[bid]).reshape([1, D])
for step in range(seq_len):
expanded_cell = np.repeat(prev_cell, seq_len, axis=0)
tmp = np.concatenate((xi, expanded_cell), axis=1)
assert tmp.shape[0] == seq_len
assert tmp.shape[1] == M + D
for fcid in range(len(fcbs)):
tmp = fc(tmp, fcws[fcid], fcbs[fcid])
tmp = ACTIVATION['relu'](tmp)
tmp = np.reshape(tmp, (1, seq_len))
tmp = stable_softmax(tmp).reshape(seq_len, 1)
lstmx = xi * tmp # seq * M
lstmx = np.sum(lstmx.reshape(seq_len, M), axis=0).reshape([1, M])
lstmin = np.concatenate((prev_hidden, lstmx), axis=1)
lstmout = fc(lstmin, w, b).reshape([1, 4 * D])
g_f, g_i, g_o, cand = np.split(lstmout, 4, axis=1)
g_f = act_gate(g_f).reshape([1, D])
g_i = act_gate(g_i).reshape([1, D])
g_o = act_gate(g_o).reshape([1, D])
cand = act_cand(cand).reshape([1, D])
cell_t = (prev_cell * g_f) + (g_i * cand)
hidden_t = g_o * act_cell(cell_t)
hidden.append(hidden_t.flatten())
cell.append(cell_t.flatten())
prev_cell = cell_t.reshape([1, D])
prev_hidden = hidden_t.reshape([1, D])
start_offset += seq_len
hidden = np.array(hidden).astype('float32').reshape([T, D])
cell = np.array(cell).astype('float32').reshape([T, D])
return hidden, cell
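# The step above is a standard LSTM cell driven by an attention-pooled input:
#   [g_f, g_i, g_o, cand] = split(fc([h_{t-1}, x~_t]; w, b), 4)
#   c_t = act_gate(g_f) * c_{t-1} + act_gate(g_i) * act_cand(cand)
#   h_t = act_gate(g_o) * act_cell(c_t)
# where x~_t is the softmax-attention-weighted sum of the sequence's inputs.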
class TestAttentionLSTMOp(OpTest):
def set_conf(self):
pass
def setUp(self):
self.op_type = 'attention_lstm'
self.lod = [[3]]
self.M = 30
self.D = 15
self.has_initial_hidden = True
self.act_gate = 'sigmoid'
self.act_cell = 'tanh'
self.act_cand = 'tanh'
self.set_conf()
T = sum(self.lod[0])
bs = len(self.lod[0])
x = np.random.normal(size=(T, self.M)).astype('float32')
c0 = np.random.normal(size=(bs, self.D)).astype('float32')
if self.has_initial_hidden:
h0 = np.random.normal(size=(bs, self.D)).astype('float32')
else:
h0 = np.zeros((bs, self.D)).astype('float32')
fcw1 = np.random.normal(size=(self.M + self.D, 1)).astype('float32')
fcb1 = np.random.normal(size=(1, 1)).astype('float32')
fcw2 = np.random.normal(size=(1, 1)).astype('float32')
fcb2 = np.random.normal(size=(1, 1)).astype('float32')
# lstm weight and bias
w = np.random.normal(size=(self.M + self.D,
self.D * 4)).astype('float32')
b = np.random.normal(size=(1, self.D * 4)).astype('float32')
h, c = attention_lstm(x, self.lod, h0, c0, [fcw1, fcw2], [fcb1, fcb2],
w, b, ACTIVATION[self.act_gate],
ACTIVATION[self.act_cell],
ACTIVATION[self.act_cand])
self.inputs = {
'X': (x, self.lod),
'C0': c0,
'AttentionWeight': fcw1,
'AttentionBias': fcb1,
'AttentionScalar': fcw2,
'AttentionScalarBias': fcb2,
'LSTMWeight': w,
'LSTMBias': b
}
if self.has_initial_hidden:
self.inputs['H0'] = h0
self.outputs = {
'Hidden': (h, self.lod),
'Cell': (c, self.lod),
}
self.attrs = {
'gate_activation': self.act_gate,
'cell_activation': self.act_cell,
'candidate_activation': self.act_cand
}
def test_check_output(self):
self.check_output()
class TestAttentionOpNonInit(TestAttentionLSTMOp):
def set_conf(self):
self.has_initial_hidden = False
class TestAttentionOpAct(TestAttentionLSTMOp):
def set_conf(self):
self.M = 3
self.D = 2
self.act_gate = 'relu'
self.act_cell = 'tanh'
self.act_cand = 'sigmoid'
class TestAttentionOpMD1(TestAttentionLSTMOp):
def set_conf(self):
self.M = 36
self.D = 8
class TestAttentionOpMD2(TestAttentionLSTMOp):
def set_conf(self):
self.M = 8
self.D = 8
class TestAttentionOpMD3(TestAttentionLSTMOp):
def set_conf(self):
self.M = 15
self.D = 30
class TestAttentionOpBS1(TestAttentionLSTMOp):
def set_conf(self):
self.lod = [[5]]
self.M = 16
self.D = 32
class TestAttentionOpBS2(TestAttentionLSTMOp):
def set_conf(self):
self.lod = [[3, 6]]
class TestAttentionOpBS5(TestAttentionLSTMOp):
def set_conf(self):
self.lod = [[3, 2, 4, 7, 5]]
if __name__ == '__main__':
unittest.main()
...@@ -27,6 +27,7 @@ import unittest ...@@ -27,6 +27,7 @@ import unittest
from multiprocessing import Process from multiprocessing import Process
import os import os
import signal import signal
import six
import collections import collections
SEED = 1 SEED = 1
...@@ -55,7 +56,8 @@ def cnn_model(data): ...@@ -55,7 +56,8 @@ def cnn_model(data):
# TODO(dzhwinter) : refine the initializer and random seed settting # TODO(dzhwinter) : refine the initializer and random seed settting
SIZE = 10 SIZE = 10
input_shape = conv_pool_2.shape input_shape = conv_pool_2.shape
param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] param_shape = [six.moves.reduce(lambda a, b: a * b, input_shape[1:], 1)
] + [SIZE]
scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
predict = fluid.layers.fc( predict = fluid.layers.fc(
...@@ -108,7 +110,7 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): ...@@ -108,7 +110,7 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
def operator_equal(a, b): def operator_equal(a, b):
for k, v in a.__dict__.iteritems(): for k, v in six.iteritems(a.__dict__):
if isinstance(v, fluid.framework.Program) or \ if isinstance(v, fluid.framework.Program) or \
isinstance(v, fluid.framework.Block): isinstance(v, fluid.framework.Block):
continue continue
...@@ -118,8 +120,8 @@ def operator_equal(a, b): ...@@ -118,8 +120,8 @@ def operator_equal(a, b):
raise ValueError("In operator_equal not equal:{0}\n".format(k)) raise ValueError("In operator_equal not equal:{0}\n".format(k))
elif isinstance(v, collections.OrderedDict): elif isinstance(v, collections.OrderedDict):
v0 = sorted(v.iteritems(), key=lambda x: x[0]) v0 = sorted(six.iteritems(v), key=lambda x: x[0])
v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0]) v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0])
if v0 != v1: if v0 != v1:
raise ValueError("In operator_equal not equal:{0}\n".format(k)) raise ValueError("In operator_equal not equal:{0}\n".format(k))
...@@ -131,7 +133,7 @@ def operator_equal(a, b): ...@@ -131,7 +133,7 @@ def operator_equal(a, b):
def block_equal(a, b): def block_equal(a, b):
for k, v in a.__dict__.iteritems(): for k, v in six.iteritems(a.__dict__):
if isinstance(v, core.ProgramDesc) or isinstance( if isinstance(v, core.ProgramDesc) or isinstance(
v, fluid.framework.Program) or isinstance(v, core.BlockDesc): v, fluid.framework.Program) or isinstance(v, core.BlockDesc):
continue continue
...@@ -143,8 +145,8 @@ def block_equal(a, b): ...@@ -143,8 +145,8 @@ def block_equal(a, b):
assert (len(a.ops) == len(b.ops)) assert (len(a.ops) == len(b.ops))
elif isinstance(v, collections.OrderedDict): elif isinstance(v, collections.OrderedDict):
v0 = sorted(v.iteritems(), key=lambda x: x[0]) v0 = sorted(six.iteritems(v), key=lambda x: x[0])
v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0]) v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0])
if v0 != v1: if v0 != v1:
raise ValueError("In block_equal not equal:{0}\n".format(k)) raise ValueError("In block_equal not equal:{0}\n".format(k))
...@@ -156,7 +158,7 @@ def block_equal(a, b): ...@@ -156,7 +158,7 @@ def block_equal(a, b):
def program_equal(a, b): def program_equal(a, b):
for k, v in a.__dict__.iteritems(): for k, v in six.iteritems(a.__dict__):
if isinstance(v, core.ProgramDesc): if isinstance(v, core.ProgramDesc):
continue continue
......
...@@ -30,7 +30,7 @@ class TestDistRunnerBase(object): ...@@ -30,7 +30,7 @@ class TestDistRunnerBase(object):
"get_model should be implemented by child classes.") "get_model should be implemented by child classes.")
def get_transpiler(self, trainer_id, main_program, pserver_endpoints, def get_transpiler(self, trainer_id, main_program, pserver_endpoints,
trainers): trainers, sync_mode):
# NOTE: import fluid until runtime, or else forking processes will cause error. # NOTE: import fluid until runtime, or else forking processes will cause error.
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
...@@ -39,17 +39,22 @@ class TestDistRunnerBase(object): ...@@ -39,17 +39,22 @@ class TestDistRunnerBase(object):
trainer_id=trainer_id, trainer_id=trainer_id,
program=main_program, program=main_program,
pservers=pserver_endpoints, pservers=pserver_endpoints,
trainers=trainers) trainers=trainers,
sync_mode=sync_mode)
return t return t
def run_pserver(self, pserver_endpoints, trainers, current_endpoint, def run_pserver(self,
trainer_id): pserver_endpoints,
trainers,
current_endpoint,
trainer_id,
sync_mode=True):
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
self.get_model(batch_size=2) self.get_model(batch_size=2)
t = self.get_transpiler(trainer_id, t = self.get_transpiler(trainer_id,
fluid.default_main_program(), pserver_endpoints, fluid.default_main_program(), pserver_endpoints,
trainers) trainers, sync_mode)
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
startup_prog = t.get_startup_program(current_endpoint, pserver_prog) startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
place = fluid.CPUPlace() place = fluid.CPUPlace()
...@@ -57,7 +62,13 @@ class TestDistRunnerBase(object): ...@@ -57,7 +62,13 @@ class TestDistRunnerBase(object):
exe.run(startup_prog) exe.run(startup_prog)
exe.run(pserver_prog) exe.run(pserver_prog)
def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): def run_trainer(self,
place,
endpoints,
trainer_id,
trainers,
is_dist=True,
sync_mode=True):
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
...@@ -65,7 +76,7 @@ class TestDistRunnerBase(object): ...@@ -65,7 +76,7 @@ class TestDistRunnerBase(object):
if is_dist: if is_dist:
t = self.get_transpiler(trainer_id, t = self.get_transpiler(trainer_id,
fluid.default_main_program(), endpoints, fluid.default_main_program(), endpoints,
trainers) trainers, sync_mode)
trainer_prog = t.get_trainer_program() trainer_prog = t.get_trainer_program()
else: else:
trainer_prog = fluid.default_main_program() trainer_prog = fluid.default_main_program()
...@@ -106,9 +117,9 @@ def runtime_main(test_class): ...@@ -106,9 +117,9 @@ def runtime_main(test_class):
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
if len(sys.argv) != 7: if len(sys.argv) != 8:
print( print(
"Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]" "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]"
) )
role = sys.argv[1] role = sys.argv[1]
endpoints = sys.argv[2] endpoints = sys.argv[2]
...@@ -116,34 +127,43 @@ def runtime_main(test_class): ...@@ -116,34 +127,43 @@ def runtime_main(test_class):
current_endpoint = sys.argv[4] current_endpoint = sys.argv[4]
trainers = int(sys.argv[5]) trainers = int(sys.argv[5])
is_dist = True if sys.argv[6] == "TRUE" else False is_dist = True if sys.argv[6] == "TRUE" else False
sync_mode = True if sys.argv[7] == "TRUE" else False
model = test_class() model = test_class()
if role == "pserver": if role == "pserver":
model.run_pserver(endpoints, trainers, current_endpoint, trainer_id) model.run_pserver(endpoints, trainers, current_endpoint, trainer_id,
sync_mode)
else: else:
p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace() ) else fluid.CPUPlace()
model.run_trainer(p, endpoints, trainer_id, trainers, is_dist) model.run_trainer(p, endpoints, trainer_id, trainers, is_dist,
sync_mode)
import paddle.compat as cpt import paddle.compat as cpt
class TestDistBase(unittest.TestCase): class TestDistBase(unittest.TestCase):
def _setup_config(self):
raise NotImplementedError("tests should have _setup_config implemented")
def setUp(self): def setUp(self):
self._trainers = 2 self._trainers = 2
self._pservers = 2 self._pservers = 2
self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
self._python_interp = "python" self._python_interp = "python"
self._sync_mode = True
self._setup_config()
def start_pserver(self, model_file, check_error_log): def start_pserver(self, model_file, check_error_log):
sync_mode_str = "TRUE" if self._sync_mode else "FALSE"
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \ ps0_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \
(self._python_interp, model_file, self._ps_endpoints, ps0_ep, (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
self._trainers) self._trainers, sync_mode_str)
ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \ ps1_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \
(self._python_interp, model_file, self._ps_endpoints, ps1_ep, (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
self._trainers) self._trainers, sync_mode_str)
ps0_pipe = subprocess.PIPE ps0_pipe = subprocess.PIPE
ps1_pipe = subprocess.PIPE ps1_pipe = subprocess.PIPE
...@@ -195,9 +215,10 @@ class TestDistBase(unittest.TestCase): ...@@ -195,9 +215,10 @@ class TestDistBase(unittest.TestCase):
# Run local to get a base line # Run local to get a base line
env_local = {"CUDA_VISIBLE_DEVICES": "0"} env_local = {"CUDA_VISIBLE_DEVICES": "0"}
env_local.update(required_envs) env_local.update(required_envs)
local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \ sync_mode_str = "TRUE" if self._sync_mode else "FALSE"
local_cmd = "%s %s trainer %s 0 %s %d FLASE %s" % \
(self._python_interp, model_file, (self._python_interp, model_file,
"127.0.0.1:1234", "127.0.0.1:1234", 1) "127.0.0.1:1234", "127.0.0.1:1234", 1, sync_mode_str)
if not check_error_log: if not check_error_log:
local_proc = subprocess.Popen( local_proc = subprocess.Popen(
local_cmd.split(" "), local_cmd.split(" "),
...@@ -226,12 +247,12 @@ class TestDistBase(unittest.TestCase): ...@@ -226,12 +247,12 @@ class TestDistBase(unittest.TestCase):
self._wait_ps_ready(ps1.pid) self._wait_ps_ready(ps1.pid)
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \ tr0_cmd = "%s %s trainer %s 0 %s %d TRUE %s" % \
(self._python_interp, model_file, self._ps_endpoints, ps0_ep, (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
self._trainers) self._trainers, sync_mode_str)
tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \ tr1_cmd = "%s %s trainer %s 1 %s %d TRUE %s" % \
(self._python_interp, model_file, self._ps_endpoints, ps1_ep, (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
self._trainers) self._trainers, sync_mode_str)
env0 = {"CUDA_VISIBLE_DEVICES": "0"} env0 = {"CUDA_VISIBLE_DEVICES": "0"}
env1 = {"CUDA_VISIBLE_DEVICES": "1"} env1 = {"CUDA_VISIBLE_DEVICES": "1"}
......
...@@ -17,10 +17,21 @@ import unittest ...@@ -17,10 +17,21 @@ import unittest
from test_dist_base import TestDistBase from test_dist_base import TestDistBase
class TestDistSeResneXt2x2(TestDistBase): class TestDistMnist2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
def test_se_resnext(self): def test_dist_mnist(self):
self.check_with_place("dist_mnist.py", delta=1e-7) self.check_with_place("dist_mnist.py", delta=1e-7)
class TestDistMnistAsync(TestDistBase):
def _setup_config(self):
self._sync_mode = False
def test_dist_mnist(self):
self.check_with_place("dist_mnist.py", delta=200)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -18,9 +18,20 @@ from test_dist_base import TestDistBase ...@@ -18,9 +18,20 @@ from test_dist_base import TestDistBase
class TestDistSeResneXt2x2(TestDistBase): class TestDistSeResneXt2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
def test_se_resnext(self): def test_se_resnext(self):
self.check_with_place("dist_se_resnext.py", delta=1e-7) self.check_with_place("dist_se_resnext.py", delta=1e-7)
class TestDistSeResneXt2x2Async(TestDistBase):
def _setup_config(self):
self._sync_mode = False
def test_se_resnext(self):
self.check_with_place("dist_se_resnext.py", delta=100)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -19,6 +19,9 @@ from test_dist_base import TestDistBase ...@@ -19,6 +19,9 @@ from test_dist_base import TestDistBase
class TestDistTransformer2x2(TestDistBase): class TestDistTransformer2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
def test_transformer(self): def test_transformer(self):
# TODO(paddle-dev): check if the delta is OK. # TODO(paddle-dev): check if the delta is OK.
# Usually start around ~8000 and converge to ~5000 # Usually start around ~8000 and converge to ~5000
......
...@@ -47,7 +47,6 @@ class TranspilerTest(unittest.TestCase): ...@@ -47,7 +47,6 @@ class TranspilerTest(unittest.TestCase):
avg_cost = fluid.layers.mean(cost) avg_cost = fluid.layers.mean(cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
return
def get_main_program(self): def get_main_program(self):
main = fluid.Program() main = fluid.Program()
...@@ -95,6 +94,7 @@ class TranspilerTest(unittest.TestCase): ...@@ -95,6 +94,7 @@ class TranspilerTest(unittest.TestCase):
def test_transpiler(self): def test_transpiler(self):
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, startup): with fluid.program_guard(main, startup):
self.transpiler_test_impl() self.transpiler_test_impl()
@@ -249,7 +249,6 @@ class TestLRDecay(TranspilerTest):
                 decay_rate=0.1,
                 staircase=True))
         sgd_optimizer.minimize(avg_cost)
-        return

     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -279,7 +278,6 @@ class TestLRDecayConditional(TranspilerTest):
             learning_rate=fluid.layers.piecewise_decay([10000, 20000],
                                                        [1.0, 0.5, 1.0]))
         sgd_optimizer.minimize(avg_cost)
-        return

     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -328,7 +326,6 @@ class TestL2Decay(TranspilerTest):
         avg_cost = fluid.layers.mean(cost)
         sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
         sgd_optimizer.minimize(avg_cost)
-        return

     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -363,7 +360,6 @@ class TestL2DecayWithPiecewise(TranspilerTest):
             momentum=0.9,
             regularization=fluid.regularizer.L2Decay(1e-4))
         sgd_optimizer.minimize(avg_cost)
-        return

     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -393,13 +389,14 @@ class TestDistLookupTableBase(TranspilerTest):
     def network_with_table(self, is_sparse, is_distributed):
         self.table_size = 1000
         self.emb_size = 64
+        self.lookup_table_name = 'shared_w'

         def emb_pool(ids):
             emb = fluid.layers.embedding(
                 input=ids,
                 size=[self.table_size, self.emb_size],
                 dtype='float32',
-                param_attr='shared_w',  # share parameter
+                param_attr=self.lookup_table_name,  # share parameter
                 is_sparse=is_sparse,
                 is_distributed=is_distributed)
             pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
@@ -572,7 +569,7 @@ class TestDistLookupTableSliceSize(TestDistLookupTableBase):

     def transpiler_test_impl(self):
         config = fluid.DistributeTranspilerConfig()
-        pserver1, startup1 = self.get_pserver(self.pserver1_ep, config)
+        pserver1, _ = self.get_pserver(self.pserver1_ep, config)

         self.assertTrue(self.transpiler.has_distributed_lookup_table)
         lookup_table_var = pserver1.global_block().vars[
@@ -582,6 +579,21 @@ class TestDistLookupTableSliceSize(TestDistLookupTableBase):
         self.assertEqual(row_size, calc_row_size)


+class TestDistArgsInProgram(TestDistLookupTableBase):
+    def net_conf(self):
+        self.network_with_table(is_sparse=True, is_distributed=True)
+
+    def transpiler_test_impl(self):
+        trainer, _ = self.get_trainer()
+
+        self.assertTrue(trainer._is_distributed)
+        self.assertTrue(trainer._is_chief)
+        self.assertEqual(trainer._distributed_lookup_table,
+                         self.lookup_table_name)
+        self.assertEqual(trainer._endpoints,
+                         [self.pserver1_ep, self.pserver2_ep])
+
+
 class TestRMSPropOptimizer(TranspilerTest):
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
@@ -595,7 +607,6 @@ class TestRMSPropOptimizer(TranspilerTest):
         avg_cost = fluid.layers.mean(cost)
         optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
         optimizer.minimize(avg_cost)
-        return

     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -612,5 +623,40 @@ class TestRMSPropOptimizer(TranspilerTest):
         self.assertEqual(moment_var.shape, (500, 1000))
class TestLoadSliceVar(TranspilerTest):
def net_conf(self):
x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1000,
act=None,
param_attr=fluid.ParamAttr(name='fc_w'),
bias_attr=fluid.ParamAttr(name='fc_b'))
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
optimizer.minimize(avg_cost)
def transpiler_test_impl(self):
pserver, _ = self.get_pserver(self.pserver1_ep)
pserver2, _ = self.get_pserver(self.pserver2_ep)
self.assertTrue(pserver._slice_vars_and_attrs)
self.assertTrue(pserver2._slice_vars_and_attrs)
for idx in xrange(len(pserver._slice_vars_and_attrs)):
self.assertEqual(pserver._slice_vars_and_attrs[idx][0],
pserver2._slice_vars_and_attrs[idx][0])
total_numel = reduce(lambda x, y: x * y,
pserver._slice_vars_and_attrs[idx][0].shape)
self.assertEqual(
total_numel,
reduce(lambda x, y: x * y,
pserver._slice_vars_and_attrs[idx][2].shape) + reduce(
lambda x, y: x * y,
pserver2._slice_vars_and_attrs[idx][2].shape))
 if __name__ == "__main__":
     unittest.main()
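Review note on the `reduce` assertion in TestLoadSliceVar above: it verifies that the element counts of the two pserver slices add up exactly to the element count of the original variable. A tiny self-contained check of that bookkeeping:

```python
from functools import reduce  # reduce is a builtin only on Python 2

import numpy as np

def numel(shape):
    return reduce(lambda x, y: x * y, shape)

# a (1000, 1000) parameter split row-wise across two pservers
full, slice0, slice1 = (1000, 1000), (500, 1000), (500, 1000)
assert numel(full) == numel(slice0) + numel(slice1)
assert np.prod(full) == np.prod(slice0) + np.prod(slice1)  # equivalent check
```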
@@ -18,9 +18,20 @@ from test_dist_base import TestDistBase


 class TestDistSeResneXt2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+
     def test_se_resnext(self):
         self.check_with_place("dist_word2vec.py", delta=1e-7)


+class TestDistSeResneXt2x2Async(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+
+    def test_se_resnext(self):
+        self.check_with_place("dist_word2vec.py", delta=1)
+
+
 if __name__ == "__main__":
     unittest.main()
@@ -64,27 +64,47 @@ class TestFCOp(OpTest):
         self.check_output()


-class TestFCOpBiasBoth(TestFCOp):
+class TestFCOpNoBias(TestFCOp):
     def init_shapes(self, mb, ic, oc, h, w):
-        for with_bias in {True, False}:
-            self.with_bias = with_bias
+        self.with_bias = False
         self.matrix = MatrixGenerate(mb, ic, oc, h, w)


-class TestFCOp1(TestFCOpBiasBoth):
+class TestFCOpWithBias(TestFCOp):
+    def init_shapes(self, mb, ic, oc, h, w):
+        self.with_bias = True
+        self.matrix = MatrixGenerate(mb, ic, oc, h, w)
+
+
+class TestFCOp1(TestFCOpNoBias):
     def init_op_type(self):
         self.init_shapes(2, 8, 10, 1, 1)


-class TestFCOp2(TestFCOpBiasBoth):
+class TestFCOp2(TestFCOpNoBias):
     def init_op_type(self):
         self.init_shapes(4, 5, 6, 2, 2)


-class TestFCOp4(TestFCOpBiasBoth):
+class TestFCOp4(TestFCOpNoBias):
     def init_op_type(self):
         self.init_shapes(1, 32, 64, 3, 3)


+class TestFCOpWithBias1(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(3, 8, 10, 2, 1)
+
+
+class TestFCOpWithBias2(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(4, 5, 6, 2, 2)
+
+
+class TestFCOpWithBias3(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(1, 64, 32, 3, 3)
+
+
 if __name__ == "__main__":
     unittest.main()
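Review note: the old TestFCOpBiasBoth looped `for with_bias in {True, False}` inside init_shapes, which just left `self.with_bias` at whichever value the set yielded last, so only one of the two paths was actually tested; the split into NoBias/WithBias fixtures covers both deterministically. A numpy sketch of the reference computation these cases check against, assuming the op flattens everything after the batch dimension:

```python
import numpy as np

def fc_ref(x, w, b=None):
    # flatten (ic, h, w) into one feature axis, then a plain matmul
    x2d = x.reshape(x.shape[0], -1)
    out = x2d.dot(w)
    if b is not None:
        out = out + b  # bias broadcasts over the batch dimension
    return out

x = np.random.rand(2, 8, 1, 1).astype('float32')  # mb=2, ic=8, h=w=1
w = np.random.rand(8, 10).astype('float32')       # oc=10
b = np.random.rand(10).astype('float32')
print(fc_ref(x, w).shape, fc_ref(x, w, b).shape)  # (2, 10) (2, 10)
```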
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
from test_lstm_op import lstm, ACTIVATION
def fc(x, w, b):
return np.dot(x, w) + b
def fusion_lstm(
x, # T x M
lod, # 1 x N
wx=None, # M x 4D
bx=None, # 1 x 4D
h0=None, # N x D
c0=None, # N x D
w_h=None, # D x 4D
w_b=None, # 1 x 4D
w_c=None, # 1 x 3D
is_reverse=False,
act_gate=None,
act_cell=None,
act_cand=None):
return lstm(
fc(x, wx, bx), lod, h0, c0, w_h, w_b, w_c, is_reverse, act_gate,
act_cell, act_cand)
class TestLstmOp(OpTest):
def set_argument(self):
self.lod = [[2, 3, 2]]
def setUp(self):
self.op_type = 'fusion_lstm'
self.lod = [[2, 3, 2]]
self.M = 8
self.D = 16
self.has_initial_state = False
self.is_reverse = False
self.act_gate = 'sigmoid'
self.act_cell = 'tanh'
self.act_cand = 'tanh'
self.use_peepholes = False
self.set_argument()
T = sum(self.lod[0])
bs = len(self.lod[0])
x = np.random.normal(size=(T, self.M)).astype('float64')
if self.has_initial_state:
h0 = np.random.normal(size=(bs, self.D)).astype('float64')
c0 = np.random.normal(size=(bs, self.D)).astype('float64')
else:
h0 = np.zeros((bs, self.D)).astype('float64')
c0 = np.zeros((bs, self.D)).astype('float64')
wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float64')
if self.use_peepholes:
b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
else:
b = np.random.normal(size=(1, 4 * self.D)).astype('float64')
w_b = np.copy(b[:, 0:4 * self.D])
w_c = b[:, 4 * self.D:] if self.use_peepholes else None
# this is the weight of fc
wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float64')
# this is the bias of fc
# and it should be manually added into the bias of this fusion LSTM
bx = np.random.normal(size=(1, 4 * self.D)).astype('float64')
b[0, 0:4 * self.D] += bx[0, :]
h, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c,
self.is_reverse, ACTIVATION[self.act_gate],
ACTIVATION[self.act_cell], ACTIVATION[self.act_cand])
self.inputs = {
'X': (x, self.lod),
'WeightX': wx,
'WeightH': wh,
'Bias': b
}
if self.has_initial_state:
self.inputs['H0'] = h0
self.inputs['C0'] = c0
self.outputs = {
'Hidden': (h, self.lod),
'Cell': (c, self.lod),
}
self.attrs = {
'use_peepholes': self.use_peepholes,
'is_reverse': self.is_reverse,
'gate_activation': self.act_gate,
'cell_activation': self.act_cell,
'candidate_activation': self.act_cand
}
def test_check_output(self):
self.check_output(atol=1e-8)
class TestLstmOpInitReverse(TestLstmOp):
def set_argument(self):
self.has_initial_state = True
self.is_reverse = True
class TestLstmOpMD1(TestLstmOp):
def set_argument(self):
self.M = 36
self.D = 8
class TestLstmOpMD2(TestLstmOp):
def set_argument(self):
self.M = 8
self.D = 8
class TestLstmOpMD3(TestLstmOp):
def set_argument(self):
self.M = 15
self.D = 3
class TestLstmOpBS1(TestLstmOp):
def set_argument(self):
self.lod = [[3]]
self.D = 16
if __name__ == '__main__':
unittest.main()
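Review note on `b[0, 0:4 * self.D] += bx[0, :]` above: the fused op has no separate fc bias input, and folding `bx` into the gate bias is exact because both biases are added to the same pre-activation. A quick numpy check of that identity:

```python
import numpy as np

M, D, T = 8, 16, 5
x = np.random.rand(T, M)
wx = np.random.rand(M, 4 * D)
bx = np.random.rand(1, 4 * D)  # fc bias
wb = np.random.rand(1, 4 * D)  # lstm gate bias

# separate fc bias vs. fc bias folded into the lstm gate bias
separate = (np.dot(x, wx) + bx) + wb
folded = np.dot(x, wx) + (wb + bx)
assert np.allclose(separate, folded)
```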
@@ -347,6 +347,25 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(loss)
             print(str(program))
def test_scatter(self):
program = Program()
with program_guard(program):
x = layers.data(
name='x',
shape=[3, 3],
append_batch_size=False,
dtype='float32')
idx = layers.data(
name='idx', shape=[2], append_batch_size=False, dtype='int32')
updates = layers.data(
name='updates',
shape=[2, 3],
append_batch_size=False,
dtype='float32')
out = layers.scatter(input=x, index=idx, updates=updates)
self.assertIsNotNone(out)
print(str(program))
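Review note: a numpy model of what the new test exercises, assuming the overwrite semantics the scatter op has for distinct indices (row i of `updates` replaces row `index[i]` of the input):

```python
import numpy as np

def scatter_ref(x, index, updates):
    out = x.copy()
    out[index] = updates  # overwrite the indexed rows
    return out

x = np.ones((3, 3), dtype='float32')
idx = np.array([2, 0], dtype='int32')
updates = np.arange(6, dtype='float32').reshape(2, 3)
print(scatter_ref(x, idx, updates))
```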
    def test_lod_reset(self):
        program = Program()
        with program_guard(program):
...
@@ -183,7 +183,9 @@ class TestMNIST(TestParallelExecutorBase):
             use_parallel_executor=True)

         self.assertAlmostEquals(
-            np.mean(parallel_first_loss), single_first_loss, delta=1e-6)
+            np.mean(parallel_first_loss),
+            single_first_loss,
+            delta=1e-6, )
         self.assertAlmostEquals(
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
@@ -191,7 +193,7 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_simple_fc_parallel_accuracy(True)
         self.check_simple_fc_parallel_accuracy(False)

-    def check_batchnorm_fc_convergence(self, use_cuda):
+    def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor):
         if use_cuda and not core.is_compiled_with_cuda():
             return
@@ -203,11 +205,13 @@ class TestMNIST(TestParallelExecutorBase):
             fc_with_batchnorm,
             feed_dict={"image": img,
                        "label": label},
-            use_cuda=use_cuda)
+            use_cuda=use_cuda,
+            use_fast_executor=use_fast_executor)

     def test_batchnorm_fc(self):
-        self.check_batchnorm_fc_convergence(True)
-        self.check_batchnorm_fc_convergence(False)
+        for use_cuda in (False, True):
+            for use_fast_executor in (False, True):
+                self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor)

     def test_batchnorm_fc_with_new_strategy(self):
         # FIXME(zcd): close this test temporally.
...
@@ -25,9 +25,6 @@ import paddle.fluid.core as core

 class TestProfiler(unittest.TestCase):
     def net_profiler(self, state, profile_path='/tmp/profile'):
-        enable_if_gpu = state == 'GPU' or state == "All"
-        if enable_if_gpu and not core.is_compiled_with_cuda():
-            return
         startup_program = fluid.Program()
         main_program = fluid.Program()
@@ -81,8 +78,6 @@ class TestProfiler(unittest.TestCase):
             pass_acc_calculator.add(value=acc, weight=b_size)
             pass_acc = pass_acc_calculator.eval()

-    @unittest.skipIf(not core.is_compiled_with_cuda(),
-                     "profiler is enabled only with GPU")
     def test_cpu_profiler(self):
         self.net_profiler('CPU')
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import time
import unittest
from multiprocessing import Process
import signal
import numpy
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layers.io import ListenAndServ
from paddle.fluid.layers.io import Recv
from paddle.fluid.layers.io import Send
from paddle.fluid.transpiler.details import program_to_code
class TestProgram2Code(unittest.TestCase):
def test_print(self):
place = fluid.CPUPlace()
self.init_serv(place)
self.init_client(place, 9123)
def init_serv(self, place):
main = fluid.Program()
with fluid.program_guard(main):
serv = ListenAndServ("127.0.0.1:0", ["X"], optimizer_mode=False)
with serv.do():
out_var = main.global_block().create_var(
name="scale_0.tmp_0",
persistable=True,
dtype="float32",
shape=[32, 32])
x = layers.data(
shape=[32, 32],
dtype='float32',
name="X",
append_batch_size=False)
fluid.initializer.Constant(value=1.0)(x, main.global_block())
layers.scale(x=x, scale=10.0, out=out_var)
program_to_code(main)
def init_client(self, place, port):
main = fluid.Program()
with fluid.program_guard(main):
x = layers.data(
shape=[32, 32],
dtype='float32',
name='X',
append_batch_size=False)
fluid.initializer.Constant(value=2.3)(x, main.global_block())
get_var = main.global_block().create_var(
name="scale_0.tmp_0", # server side var
dtype="float32",
persistable=False,
shape=[32, 32])
fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
Send("127.0.0.1:%d" % port, [x])
o = Recv("127.0.0.1:%d" % port, [get_var])
program_to_code(main)
if __name__ == "__main__":
unittest.main()
@@ -41,7 +41,7 @@ class TestSqueezeOp(OpTest):
         self.new_shape = (3, 5)

     def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": False}
+        self.attrs = {"axes": self.axes}


 # Correct: There is mins axis.
@@ -68,49 +68,5 @@ class TestSqueezeOp3(TestSqueezeOp):
         self.new_shape = (3, 5, 1, 4)


-# Correct: Inplace.
-class TestSqueezeOpInplace1(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = (0, 2)
-        self.new_shape = (3, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. There is mins axis.
-class TestSqueezeOpInplace2(TestSqueezeOp):
-    def inti_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = (0, -2)
-        self.new_shape = (3, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. No axes input.
-class TestSqueezeOpInplace3(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = ()
-        self.new_shape = (3, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inpalce. Just part of axes be squeezed.
-class TestSqueezeOpInplace4(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 1, 5, 1, 4, 1)
-        self.axes = (1, -1)
-        self.new_shape = (3, 5, 1, 4)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
 if __name__ == "__main__":
     unittest.main()
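Review note: with the `inplace` attribute gone, the remaining cases only vary the axes. The axes semantics in numpy terms:

```python
import numpy as np

x = np.ones((1, 3, 1, 5))
# explicit axes: only the listed size-1 dims are removed
print(np.squeeze(x, axis=(0, 2)).shape)   # (3, 5)
# empty axes in the op means "squeeze every size-1 dim"
print(np.squeeze(x).shape)                # (3, 5)
# negative axes count from the end, e.g. -2 is dim 2 here
print(np.squeeze(x, axis=(0, -2)).shape)  # (3, 5)
```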
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from op_test import OpTest
import numpy as np
import unittest
class TestStackOpBase(OpTest):
def initDefaultParameters(self):
self.num_inputs = 4
self.input_dim = (5, 6, 7)
self.axis = 0
self.dtype = 'float32'
def initParameters(self):
pass
def get_x_names(self):
x_names = []
for i in range(self.num_inputs):
x_names.append('x{}'.format(i))
return x_names
def setUp(self):
self.initDefaultParameters()
self.initParameters()
self.op_type = 'stack'
self.x = []
for i in range(self.num_inputs):
self.x.append(
np.random.random(size=self.input_dim).astype(self.dtype))
tmp = []
x_names = self.get_x_names()
for i in range(self.num_inputs):
tmp.append((x_names[i], self.x[i]))
self.inputs = {'X': tmp}
self.outputs = {'Y': np.stack(self.x, axis=self.axis)}
self.attrs = {'axis': self.axis}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(self.get_x_names(), 'Y')
class TestStackOp1(TestStackOpBase):
def initParameters(self):
self.num_inputs = 16
class TestStackOp2(TestStackOpBase):
def initParameters(self):
self.num_inputs = 20
class TestStackOp3(TestStackOpBase):
def initParameters(self):
self.axis = -1
class TestStackOp4(TestStackOpBase):
def initParameters(self):
self.axis = -4
class TestStackOp5(TestStackOpBase):
def initParameters(self):
self.axis = 1
class TestStackOp6(TestStackOpBase):
def initParameters(self):
self.axis = 3
if __name__ == '__main__':
unittest.main()
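Review note: the axis variants above map directly onto np.stack, including the negative axes that wrap around the output rank (rank of the inputs plus one):

```python
import numpy as np

xs = [np.random.rand(5, 6, 7) for _ in range(4)]
print(np.stack(xs, axis=0).shape)   # (4, 5, 6, 7)
print(np.stack(xs, axis=1).shape)   # (5, 4, 6, 7)
print(np.stack(xs, axis=3).shape)   # (5, 6, 7, 4)
print(np.stack(xs, axis=-1).shape)  # (5, 6, 7, 4)
print(np.stack(xs, axis=-4).shape)  # (4, 5, 6, 7): -4 wraps to 0 here
```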
@@ -17,6 +17,8 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator


 class TestSumOp(OpTest):
@@ -42,5 +44,66 @@ class TestSumOp(OpTest):
         pass
class TestSelectedRowsSumOp(OpTest):
def check_with_place(self, place):
scope = core.Scope()
self.check_input_and_optput(scope, place, True, True, True)
self.check_input_and_optput(scope, place, False, True, True)
self.check_input_and_optput(scope, place, False, False, True)
self.check_input_and_optput(scope, place, False, False, False)
def check_input_and_optput(self,
scope,
place,
w1_has_data=False,
w2_has_data=False,
w3_has_data=False):
self.create_selected_rows(scope, place, "W1", w1_has_data)
self.create_selected_rows(scope, place, "W2", w2_has_data)
self.create_selected_rows(scope, place, "W3", w3_has_data)
# create Out Variable
out = scope.var('Out').get_selected_rows()
# create and run sum operator
sum_op = Operator("sum", X=["W1", "W2", "W3"], Out='Out')
sum_op.run(scope, place)
has_data_w_num = 0
for w in [w1_has_data, w2_has_data, w3_has_data]:
if not w:
has_data_w_num += 1
self.assertEqual(7 * has_data_w_num, len(out.rows()))
def create_selected_rows(self, scope, place, var_name, isEmpty):
# create and initialize W Variable
if not isEmpty:
rows = [0, 1, 2, 3, 4, 5, 6]
row_numel = 12
else:
rows = []
row_numel = 12
var = scope.var(var_name)
w_selected_rows = var.get_selected_rows()
w_selected_rows.set_height(len(rows))
w_selected_rows.set_rows(rows)
w_array = np.ones((len(rows), row_numel)).astype("float32")
for i in range(len(rows)):
w_array[i] *= i
w_tensor = w_selected_rows.get_tensor()
w_tensor.set(w_array, place)
return var
def test_w_is_selected_rows(self):
places = [core.CPUPlace()]
# currently only support CPU
for place in places:
self.check_with_place(place)
 if __name__ == "__main__":
     unittest.main()
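Review note: a SelectedRows variable is a (rows, values) pair where `values` has one row per entry of `rows`. Note that check_input_and_optput passes its flags straight into create_selected_rows's `isEmpty` parameter, so the inputs flagged False are the ones that actually carry 7 rows, which is what the `7 * has_data_w_num` assertion counts. A sketch of the behavior that assertion implies, i.e. the sum op concatenating row lists rather than merging duplicate row ids:

```python
import numpy as np

def selected_rows_sum(inputs):
    # each input is (rows, values) with values.shape == (len(rows), width)
    rows, values = [], []
    for r, v in inputs:
        rows.extend(r)   # concatenate, do not merge duplicates
        values.append(v)
    return rows, np.concatenate(values)

w_full = ([0, 1, 2, 3, 4, 5, 6], np.ones((7, 12)))
w_empty = ([], np.empty((0, 12)))
rows, vals = selected_rows_sum([w_full, w_empty, w_full])
print(len(rows), vals.shape)  # 14 (14, 12)
```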
@@ -41,7 +41,7 @@ class TestUnsqueezeOp(OpTest):
         self.new_shape = (3, 1, 1, 5)

     def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": False}
+        self.attrs = {"axes": self.axes}


 # Correct: Single input index.
@@ -76,38 +76,5 @@ class TestUnsqueezeOp4(TestUnsqueezeOp):
         self.new_shape = (3, 1, 1, 2, 5, 1)


-# Correct: Inplace.
-class TestUnsqueezeOpInplace1(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 5)
-        self.axes = (0, 2)
-        self.new_shape = (1, 3, 1, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. There is mins index.
-class TestUnsqueezeOpInplace2(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 5)
-        self.axes = (0, -2)
-        self.new_shape = (1, 3, 1, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. There is duplicated axis.
-class TestUnsqueezeOpInplace3(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 2, 5)
-        self.axes = (0, 3, 3)
-        self.new_shape = (1, 3, 2, 1, 1, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
 if __name__ == "__main__":
     unittest.main()
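Review note: unsqueeze inserts size-1 dims one axis at a time, which is why a repeated axis still grows the rank on each insertion. The numpy equivalent:

```python
import numpy as np

x = np.ones((3, 5))
y = np.expand_dims(np.expand_dims(x, 0), 2)  # axes (0, 2)
print(y.shape)  # (1, 3, 1, 5)

# inserting at the same axis twice grows the rank each time, which is
# why axes (0, 3, 3) map (3, 2, 5) to (1, 3, 2, 1, 1, 5)
z = np.ones((3, 2, 5))
for ax in (0, 3, 3):
    z = np.expand_dims(z, ax)
print(z.shape)  # (1, 3, 2, 1, 1, 5)
```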
@@ -153,9 +153,6 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):

 def append_loss_ops(block, output_names):
     mean_inputs = list(map(block.var, output_names))
-    # for item in mean_inputs:
-    #     print(item)
-    #     print("Item", item.dtype)

     if len(mean_inputs) == 1:
         loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
...
@@ -285,6 +285,7 @@ class Trainer(object):
             self._load_checkpoint()

         if param_path and os.path.isdir(param_path):
+            with self._prog_and_scope_guard():
                 # load params from param_path into scope
                 io.load_persistables(
                     executor=exe,
...
@@ -16,6 +16,9 @@ from __future__ import print_function

 import six

+from paddle.fluid import core
+import paddle
+

 def delete_ops(block, ops):
     try:
@@ -39,3 +42,133 @@ def find_op_by_output_arg(block, arg_name):
         if arg_name in op.output_arg_names:
             return index
     return -1
def get_indent_space(indent, space_num=4):
ret = ""
for i in range(0, indent * space_num):
ret += " "
return ret
def variable_to_code(var):
"""
Get readable codes of fluid variable.
Args:
var: A fluid operator.
Returns:
string: The formatted string.
"""
var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})".\
format(i="{", e="}", name=var.name, type=var.type, shape=var.shape, dtype=var.dtype)
if type(var) == paddle.fluid.framework.Parameter:
if var.trainable:
var_str = "trainable parameter " + var_str
else:
var_str = "parameter " + var_str
else:
var_str = "var " + var_str
if var.persistable:
var_str = "persist " + var_str
return var_str
def op_to_code(op):
"""
Get readable codes of fluid operator.
Args:
op: A fluid operator.
Returns:
string: The formatted string.
"""
outputs_str = "{"
for i in range(0, len(op.output_names)):
outputs_str += "{name}=".format(name=op.output_names[i])
o = op.output(op.output_names[i])
outputs_str += "{value}".format(value=o)
if i != len(op.output_names) - 1:
outputs_str += ", "
outputs_str += "}"
inputs_str = "{"
for i in range(0, len(op.input_names)):
inputs_str += "{name}=".format(name=op.input_names[i])
o = op.input(op.input_names[i])
inputs_str += "{value}".format(value=o)
if i != len(op.input_names) - 1:
inputs_str += ", "
inputs_str += "}"
attrs_str = ""
for i in range(0, len(op.attr_names)):
name = op.attr_names[i]
attr_type = op.desc.attr_type(name)
if attr_type == core.AttrType.BLOCK:
a = "{name} = block[{value}]".format(
name=name, type=attr_type, value=op.block_attr_id(name))
attrs_str += a
continue
if attr_type == core.AttrType.BLOCKS:
a = "{name} = blocks{value}".format(
name=name, type=attr_type, value=op.blocks_attr_ids(name))
attrs_str += a
continue
a = "{name} = {value}".format(
name=name, type=attr_type, value=op.desc.attr(name))
attrs_str += a
if i != len(op.attr_names) - 1:
attrs_str += ", "
if outputs_str != "{}":
op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".\
format(outputs = outputs_str, op_type=op.type, inputs=inputs_str, attrs=attrs_str)
else:
op_str = "{op_type}(inputs={inputs}, {attrs})".\
format(op_type=op.type, inputs=inputs_str, attrs=attrs_str)
return op_str
def program_to_code(prog):
"""
Print readable codes of fluid program.
Args:
prog : A fluid program.
An example result like below:
https://github.com/PaddlePaddle/Paddle/pull/12673
"""
indent = 0
block_idx = 0
for block in prog.blocks:
print("{0}{1} // block {2}".format(
get_indent_space(indent), '{', block_idx))
indent += 1
# sort all vars
all_vars = sorted(block.vars.iteritems(), key=lambda x: x[0])
for var in all_vars:
print("{}{}".format(
get_indent_space(indent), variable_to_code(var[1])))
if len(all_vars) > 0:
print("")
for op in block.ops:
print("{}{}".format(get_indent_space(indent), op_to_code(op)))
indent -= 1
print("{0}{1}".format(get_indent_space(indent), '}'))
block_idx += 1
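Usage sketch for the new debug helper, mirroring test_program_code.py earlier in this commit (assumes a contemporary paddle.fluid build of this era):

```python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.transpiler.details import program_to_code

main = fluid.Program()
with fluid.program_guard(main):
    x = layers.data(name='X', shape=[32, 32], dtype='float32',
                    append_batch_size=False)
    out = layers.scale(x=x, scale=10.0)
# prints the vars of each block, then its ops, one indented block per Block
program_to_code(main)
```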
@@ -34,6 +34,7 @@ import math
 import random
 import numpy as np
 import collections
+import six

 from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
 from .. import core, framework
@@ -210,7 +211,19 @@ class DistributeTranspiler(object):
        ps_dispatcher = self.config.split_method(self.pserver_endpoints)
        self.has_distributed_lookup_table = self._has_distributed_lookup_table()

+        self.param_name_to_grad_name = dict()
+        self.grad_name_to_param_name = dict()
+        for param_var, grad_var in self.params_grads:
+            self.param_name_to_grad_name[param_var.name] = grad_var.name
+            self.grad_name_to_param_name[grad_var.name] = param_var.name
+
+        # add distributed attrs to program
+        self.origin_program._is_distributed = True
+        self.origin_program._endpoints = self.pserver_endpoints
+        self.origin_program._is_chief = self.trainer_id == 0
+        self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None
+
-        # split and create vars, then put splited vars in dicts for later use.
        # step 1: split and create vars, then put splited vars in dicts for later use.
        self._init_splited_vars()
        random.seed(self.origin_program.random_seed)
        random.shuffle(grad_var_mapping_items)

-        for orig_varname, splited_vars in grad_var_mapping_items:
+        grad_name_to_send_dummy_out = dict()
+        for grad_varname, splited_vars in grad_var_mapping_items:
            eplist = ps_dispatcher.dispatch(splited_vars)

            if not self.config.slice_var_up:
                assert (len(splited_vars) == 1)
+                splited_grad_varname = grad_varname

            if len(splited_vars) == 1:
-                orig_varname = splited_vars[0].name
+                splited_grad_varname = splited_vars[0].name
                index = find_op_by_output_arg(program.global_block(),
-                                              orig_varname)
+                                              splited_grad_varname)
            elif len(splited_vars) > 1:
-                orig_var = program.global_block().vars[orig_varname]
+                orig_var = program.global_block().vars[splited_grad_varname]
                index = find_op_by_output_arg(program.global_block(),
-                                              orig_varname)
+                                              splited_grad_varname)
                self._insert_split_op(program, orig_var, index, splited_vars)
                index += 1
            else:
                AssertionError("Can not insert the send op by original "
-                               "variable name :", orig_varname)
+                               "variable name :", splited_grad_varname)
+
+            dummy_output = program.global_block().create_var(
+                name=framework.generate_control_dev_var_name())
+            grad_name_to_send_dummy_out[grad_varname] = dummy_output

            program.global_block()._insert_op(
                index=index + 1,
                type="send",
                inputs={"X": splited_vars},
-                outputs={},
+                outputs={"Out": dummy_output},
                attrs={
                    "epmap": eplist,
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
+                    OP_ROLE_VAR_ATTR_NAME:
+                    [self.grad_name_to_param_name[grad_varname], grad_varname],
+                    "sync_mode": not self.sync_mode,
                })
            for _, var in enumerate(splited_vars):
                send_vars.append(var)
@@ -268,7 +290,6 @@ class DistributeTranspiler(object):
                outputs={},
                attrs={
                    "endpoints": pserver_endpoints,
-                    "sync_mode": self.sync_mode,
                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                })
@@ -284,19 +305,25 @@ class DistributeTranspiler(object):
            self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])

        # step4: Concat the parameters splits together after recv.
-        for varname, splited_var in six.iteritems(self.param_var_mapping):
+        for param_varname, splited_var in six.iteritems(self.param_var_mapping):
            eps = []
            for var in splited_var:
                index = [v.name for v in recv_vars].index(var.name)
                eps.append(eplist[index])
+            grad_send_dummy_out = grad_name_to_send_dummy_out[
+                self.param_name_to_grad_name[param_varname]]
            program.global_block().append_op(
                type="recv",
-                inputs={},
+                inputs={"X": [grad_send_dummy_out]},
                outputs={"Out": splited_var},
                attrs={
                    "epmap": eps,
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
+                    OP_ROLE_VAR_ATTR_NAME: [
+                        param_varname,
+                        self.param_name_to_grad_name[param_varname]
+                    ],
+                    "sync_mode": not self.sync_mode
                })

        if self.sync_mode:
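Review note: the send op now produces a dummy variable (via framework.generate_control_dev_var_name()) and the matching recv consumes it, so the send-before-recv ordering becomes an explicit dataflow edge rather than an accident of program order. A toy scheduler showing why the edge forces the order (illustrative only, not the fluid executor):

```python
# An executor that runs ops as soon as their inputs exist will schedule
# recv after send only if recv consumes something send produces.
ops = [
    ("recv", ["dummy_0"], ["param_0"]),  # deliberately listed first
    ("send", [], ["dummy_0"]),
]

ready, schedule = set(), []
pending = list(ops)
while pending:
    for op in list(pending):
        name, ins, outs = op
        if all(i in ready for i in ins):  # dataflow readiness check
            schedule.append(name)
            ready.update(outs)
            pending.remove(op)
print(schedule)  # ['send', 'recv'] -- the dummy edge forces the order
```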
@@ -309,10 +336,10 @@ class DistributeTranspiler(object):
                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                })

-        for varname, splited_var in six.iteritems(self.param_var_mapping):
+        for param_varname, splited_var in six.iteritems(self.param_var_mapping):
            if len(splited_var) <= 1:
                continue
-            orig_param = program.global_block().vars[varname]
+            orig_param = program.global_block().vars[param_varname]
            program.global_block().append_op(
                type="concat",
                inputs={"X": splited_var},
@@ -359,7 +386,7 @@ class DistributeTranspiler(object):
        # FIXME(gongwb): delete not need ops.
        # note that: some parameter is not trainable and those ops can't be deleted.

-        for varname, splited_var in self.param_var_mapping.iteritems():
+        for varname, splited_var in six.iteritems(self.param_var_mapping):
            # Get the eplist of recv vars
            eps = []
            for var in splited_var:
@@ -380,7 +407,7 @@ class DistributeTranspiler(object):
            op = startup_program.global_block().append_op(
                type="recv",
-                inputs={},
+                inputs={"X": []},
                outputs={"Out": splited_var},
                attrs={
                    "epmap": eps,
@@ -396,7 +423,7 @@ class DistributeTranspiler(object):
                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                })

-        for varname, splited_var in self.param_var_mapping.iteritems():
+        for varname, splited_var in six.iteritems(self.param_var_mapping):
            #add concat ops to merge splited parameters received from parameter servers.
            if len(splited_var) <= 1:
                continue
@@ -580,6 +607,8 @@ class DistributeTranspiler(object):
            checkpoint_block_id = self._create_checkpoint_save_block(
                pserver_program, table_opt_block.idx)

+            pserver_program._distributed_lookup_table = self.table_name
+
        # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
        # not be executed, so it's safe to use optimize_block to hold the place
        if self.has_distributed_lookup_table:
@@ -606,6 +635,10 @@ class DistributeTranspiler(object):
                outputs={},
                attrs=attrs)

+        # add distributed attrs
+        pserver_program._slice_vars_and_attrs = self._get_slice_vars_and_attrs(
+            endpoint)
+
        pserver_program._sync_with_cpp()
        return pserver_program
@@ -679,8 +712,31 @@ class DistributeTranspiler(object):
                inputs=new_inputs,
                outputs=new_outputs,
                attrs=op.all_attrs())
+
+        # add slice vars
+        s_prog._slice_vars_and_attrs = self._get_slice_vars_and_attrs(endpoint)
+
        return s_prog

+    def _get_slice_vars_and_attrs(self, endpoint):
+        slice_vars_and_attrs = []
+        block_suffix = "block"
+        for param in self.param_grad_ep_mapping[endpoint]["params"]:
+            orig_var_name, block_name, _ = self._get_varname_parts(param.name)
+            if not block_name:
+                continue
+
+            block_idx = int(block_name.split(block_suffix)[1])
+            orig_var = self.origin_program.global_block().vars[orig_var_name]
+
+            skip_numel = 0
+            slice_vars = self.param_var_mapping[orig_var_name]
+            for slice_var in slice_vars[:block_idx]:
+                skip_numel += reduce(lambda x, y: x * y, slice_var.shape)
+            slice_vars_and_attrs.append([orig_var, skip_numel, param])
+
+        return slice_vars_and_attrs
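Review note: `skip_numel` records how many elements of the flattened original variable precede a given slice, so a loader can copy the slice back into the right offset. A self-contained sketch of that bookkeeping with made-up shapes:

```python
from functools import reduce

import numpy as np

# a 10x4 parameter split row-wise into three blocks, as the transpiler does
orig = np.arange(40, dtype='float32').reshape(10, 4)
slices = [orig[:4], orig[4:8], orig[8:]]  # block0, block1, block2

def skip_numel(slices, block_idx):
    # elements of the flattened original that come before this block
    return sum(reduce(lambda x, y: x * y, s.shape) for s in slices[:block_idx])

flat = orig.reshape(-1)
for idx, s in enumerate(slices):
    off = skip_numel(slices, idx)
    assert np.array_equal(flat[off:off + s.size], s.reshape(-1))
```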
    # ====================== private transpiler functions =====================

    def _has_distributed_lookup_table(self):
@@ -786,13 +842,15 @@ class DistributeTranspiler(object):
                                                  self.config.min_block_size)
        assert (len(grad_blocks) == len(param_blocks))

-        # origin_varname -> [splited_var]
+        # origin_param_name -> [splited_param_vars]
        self.param_var_mapping = self._create_vars_from_blocklist(
            self.origin_program, param_blocks)
+        # origin_grad_name -> [splited_grad_vars]
        self.grad_var_mapping = self._create_vars_from_blocklist(
            self.origin_program,
            grad_blocks,
            add_trainer_suffix=self.trainer_num > 1)
+        # dict(grad_splited_var -> param_splited_var)
        self.grad_param_mapping = collections.OrderedDict()
        for g, p in zip(grad_blocks, param_blocks):
            g_name, g_bid, _ = g.split(":")
@@ -919,11 +977,15 @@ class DistributeTranspiler(object):
                index=op_index + 2,
                type="send",
                inputs={'X': self.trainer_side_table_grad_list},
-                outputs={},
+                outputs={'Out': []},
                attrs={
                    "sync_mode": True,
                    "epmap": pserver_endpoints,
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
+                    OP_ROLE_VAR_ATTR_NAME: [
+                        self.grad_name_to_param_name[table_grad_name],
+                        table_grad_name
+                    ]
                })
            break
@@ -1197,8 +1259,8 @@ class DistributeTranspiler(object):
        elif op_type == "momentum":
            if varkey == "Velocity":
                return param_shape
-        elif op_type == "":
-            if varkey == "Moment":
+        elif op_type == "rmsprop":
+            if varkey in ["Moment", "MeanSquare"]:
                return param_shape
        elif op_type == "sgd":
            pass
@@ -1277,8 +1339,6 @@ class DistributeTranspiler(object):
        pserver_block = program.global_block()
        new_inputs = collections.OrderedDict()
-        # update param/grad shape first, then other inputs like
-        # moment can use the updated shape

        def _get_param_block(opt_op):
            # param is already created on global program
            param_block = None
@@ -1291,22 +1351,6 @@ class DistributeTranspiler(object):
        for key in opt_op.input_names:
            if key == "Grad":
                new_inputs[key] = merged_var
-            # For RMSProp optimizer
-            elif key == "Moment" or key == "MeanSquare":
-                param_block = _get_param_block(opt_op)
-                if not param_block:
-                    return
-                moment_var = origin_program.global_block().vars[opt_op.input(
-                    key)[0]]
-                tmpvar = pserver_block.create_var(
-                    name=moment_var.name,
-                    persistable=moment_var.persistable,
-                    dtype=moment_var.dtype,
-                    # change to use same shape as param
-                    # TODO(typhoonzero): didn't append .block in the var name,
-                    # may affect checkpoint saving? Need to verify.
-                    shape=param_block.shape)
-                new_inputs[key] = tmpvar
            elif key == "Param":
                param_block = _get_param_block(opt_op)
                if not param_block:
@@ -1334,7 +1378,7 @@ class DistributeTranspiler(object):
        for key in opt_op.input_names:
            new_shape = None
-            if key in ["Param", "Grad", "LearningRate", "Moment", "MeanSquare"]:
+            if key in ["Param", "Grad", "LearningRate"]:
                continue
            var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
            # update accumulator variable shape
...
@@ -59,8 +59,12 @@ class InferenceTranspiler(object):
            scope = global_scope()
        if not isinstance(scope, core.Scope):
            raise TypeError("scope should be as Scope type or None")
-        self._fuse_batch_norm(program, place, scope)
+        use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
+        if use_mkldnn:
            self._fuse_relu_mkldnn(program)
+            self._fuse_conv_bias_mkldnn(program)
+        else:
+            self._fuse_batch_norm(program, place, scope)
    def _fuse_relu_mkldnn(self, program):
        '''
@@ -82,10 +86,6 @@ class InferenceTranspiler(object):
        :param program: program to transpile
        :type program: Program
        '''
-        use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
-        if not use_mkldnn:
-            return
-
        self.block = program.block(0)

        i = 0
@@ -106,6 +106,69 @@ class InferenceTranspiler(object):
        # And a better solution will be considered later.
        program = program.clone()
def _fuse_conv_bias_mkldnn(self, program):
'''
Transpile the program by fused convolution and elementwise_add.
Replace conv2d and elementwise_add ops with a new conv2d op
based on an old conv2d op and the :math:`Bias` taken from
elementwise_add.
For input :math:`X`:
- Conv process: :math:`X = input * W`
- Elementwise_add process: :math:`X = X + bias`
After fuse into one operation:
.. math::
X = input * W + bias
The operator transformation is:
- before:
- conv->elementwise_add->any_other_op
- after:
- conv->any_other_op
The transpile stages are:
1. Extract bias and output variables from elementwise_add.
2. Extract Input, Weight and attributes from conv op.
3. Create a new convolution op based on extracted params.
4. Remove old conv op.
5. Remove elementwise_add.
6. Remove unused variables.
Args:
program (Program): program to transpile
'''
self.block = program.block(0)
i = 0
while i < len(self.block.ops) - 2:
current_op = self.block.ops[i]
next_op = self.block.ops[i + 1]
# conv2d with bias
if current_op.type in ['conv2d'] and \
next_op.type in ['elementwise_add']:
self._fuse_conv_bias(i, current_op, next_op)
self.block._remove_op(i + 1) # Remove old conv
self.block._remove_op(i + 1) # Remove elementwise_add
i = i + 1
i = i + 1
self._remove_unused_var()
# TODO(luotao): use clone() method to flush the program.desc in force,
# since some large program.desc will not be flushed immediately.
# And a better solution will be considered later.
program = program.clone()
    def _fuse_batch_norm(self, program, place, scope):
        '''
        Transpile the program by fused batch normalization.
@@ -185,7 +248,6 @@ class InferenceTranspiler(object):
                self.block._remove_op(i + 2)
                i = i + 1
-            i = i + 1
        self._adjust_input()
        self._remove_unused_var()
        # TODO(luotao): use clone() method to flush the program.desc in force,
@@ -288,6 +350,33 @@ class InferenceTranspiler(object):
        # collect the renamed input
        self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
def _fuse_conv_bias(self, index, conv_op, elementwise_add_op):
'''
fuse the conv op with elementwise_add
:param index: index of the conv_op in ops list
:type index: Int
:param conv_op: convolution operator
:type conv_op: Operator
:param elementwise_add_op: convolution's bias operator
:type elementwise_add_op: Operator
'''
bias_var = self.block.var(elementwise_add_op.input("Y")[0])
out_var = self.block.var(elementwise_add_op.output("Out")[0])
filter_var = self.block.var(conv_op.input("Filter")[0])
in_var = self.block.var(conv_op.input("Input")[0])
attrs = {name: conv_op.attr(name) for name in conv_op.attr_names}
self.block._insert_op(
index,
type="conv2d",
inputs={"Input": in_var,
"Filter": filter_var,
"Bias": bias_var},
outputs={"Output": out_var},
attrs=attrs)
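Review note: the pass is valid because conv2d followed by a per-channel elementwise_add computes exactly what a single conv2d with a Bias input computes; the rewrite only collapses two ops into one. A toy dictionary-based version of the scan-and-merge pattern, mirroring _fuse_conv_bias_mkldnn above (not the fluid IR):

```python
ops = [
    {"type": "conv2d", "inputs": {"Input": "x", "Filter": "w"}, "out": "tmp"},
    {"type": "elementwise_add", "inputs": {"X": "tmp", "Y": "b"}, "out": "y"},
    {"type": "relu", "inputs": {"X": "y"}, "out": "z"},
]

i = 0
while i < len(ops) - 1:
    cur, nxt = ops[i], ops[i + 1]
    if cur["type"] == "conv2d" and nxt["type"] == "elementwise_add" \
            and nxt["inputs"]["X"] == cur["out"]:
        fused = {"type": "conv2d",
                 "inputs": dict(cur["inputs"], Bias=nxt["inputs"]["Y"]),
                 "out": nxt["out"]}  # write to elementwise_add's output
        ops[i:i + 2] = [fused]       # replace the pair with the fused op
    i += 1

print([op["type"] for op in ops])  # ['conv2d', 'relu']
```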
    def _adjust_input(self):
        for i in range(len(self.block.ops)):
            current_op = self.block.ops[i]
...
@@ -159,7 +159,9 @@ if '${WITH_MKL}' == 'ON':
    shutil.copy('${MKLML_LIB}', libs_path)
    shutil.copy('${MKLML_IOMP_LIB}', libs_path)
    package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so']
-if '${WITH_MKLDNN}' == 'ON':
+if '${CMAKE_BUILD_TYPE}' == 'Release':
+    # only change rpath in Release mode.
+    if '${WITH_MKLDNN}' == 'ON':
        # TODO(typhoonzero): use install_name_tool to patch mkl libs once
        # we can support mkl on mac.
        #
@@ -179,13 +181,15 @@ package_dir['paddle.libs']=libs_path
# The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and
# core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries.
# This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213
-if "@APPLE@" == "1":
+if '${CMAKE_BUILD_TYPE}' == 'Release':
+    # only change rpath in Release mode, since in Debug mode, core.so is too large to be changed.
+    if "@APPLE@" == "1":
        command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
    else:
        command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
    if os.system(command) != 0:
        raise Exception("patch core.so failed, command: %s" % command)
if '${WITH_FLUID_ONLY}'== 'OFF':
    # change rpath of _swig_paddle.so.
    if "@APPLE@" == "1":
        command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
...