提交 125e9166 编写于 作者: T tangwei12

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into sum_op_dim_fix

......@@ -5,6 +5,7 @@ python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
*.DS_Store
*.vs
build/
build_doc/
*.user
......@@ -15,6 +16,7 @@ build_doc/
.cproject
.pydevproject
.settings/
CMakeSettings.json
Makefile
.test_env/
third_party/
......
......@@ -27,15 +27,6 @@ script:
# 43min timeout
paddle/scripts/paddle_docker_build.sh ${JOB}
if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
- |
if [[ "$JOB" != "doc" ]]; then exit 0; fi;
# For document only
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
export DOCS_DIR=`pwd`
cd ..
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
notifications:
email:
on_success: change
......
......@@ -65,6 +65,7 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d
option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(WITH_INFERENCE "Compile fluid inference library" ON)
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
......@@ -72,6 +73,7 @@ option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VER
if(NOT PY_VERSION)
set(PY_VERSION 2.7)
endif()
set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......@@ -158,6 +160,7 @@ endif()
########################################################################################
include(external/mklml) # download mklml package
include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm
include(external/zlib) # download, build, install zlib
include(external/gflags) # download, build, install gflags
......@@ -174,6 +177,7 @@ include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/cares)
include(external/cub)
if(WITH_DISTRIBUTE)
if(WITH_GRPC)
......@@ -200,6 +204,14 @@ include(external/snappy) # download snappy
include(external/snappystream)
include(external/threadpool)
if(WITH_GPU)
include(cuda)
include(tensorrt)
include(external/anakin)
elseif()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
endif()
include(cudnn) # set cudnn libraries, must before configure
include(cupti)
include(configure) # add paddle env configuration
......@@ -228,14 +240,6 @@ set(EXTERNAL_LIBS
${PYTHON_LIBRARIES}
)
if(WITH_GPU)
include(cuda)
include(tensorrt)
include(external/anakin)
else()
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
endif()
if(WITH_AMD_GPU)
find_package(HIP)
include(hip)
......
......@@ -56,6 +56,10 @@ if(NOT CMAKE_CROSSCOMPILING)
set(SIMD_FLAG ${SSE3_FLAG})
endif()
endif()
if(UNIX AND NOT APPLE)
# except apple from nix*Os family
set(LINUX TRUE)
endif(UNIX AND NOT APPLE)
if(NOT WITH_GOLANG)
add_definitions(-DPADDLE_WITHOUT_GOLANG)
......@@ -97,6 +101,18 @@ if(WITH_GPU)
endif()
include_directories(${TENSORRT_INCLUDE_DIR})
endif()
if(WITH_ANAKIN)
if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
message(FATAL_ERROR "Anakin needs CUDA >= 8.0 to compile")
endif()
if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
endif()
set(ENV{CUDNN_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY})
message(STATUS "cudnn include header is ${CUDNN_INCLUDE_DIR}/cudnn.h")
message(STATUS "cudnn library is ${CUDNN_LIBRARY}")
endif()
elseif(WITH_AMD_GPU)
add_definitions(-DPADDLE_WITH_HIP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
......
......@@ -21,6 +21,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
${CUDNN_ROOT}/lib64
${CUDNN_ROOT}/lib
${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
$ENV{CUDNN_ROOT}
$ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib
......
......@@ -2,12 +2,25 @@ if (NOT WITH_ANAKIN)
return()
endif()
set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH
"Anakin install path." FORCE)
set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files")
set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
INCLUDE(ExternalProject)
set(ANAKIN_SOURCE_DIR ${THIRD_PARTY_PATH}/anakin)
# the anakin install dir is only default one now
set(ANAKIN_INSTALL_DIR ${THIRD_PARTY_PATH}/anakin/src/extern_anakin/output)
set(ANAKIN_INCLUDE ${ANAKIN_INSTALL_DIR})
set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR})
set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so)
set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so)
# TODO(luotao): ANAKIN_MODLE_URL will move to demo ci later.
set(ANAKIN_MODLE_URL "http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2.anakin.bin")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL}")
include_directories(${ANAKIN_INCLUDE})
include_directories(${ANAKIN_INCLUDE}/saber/)
set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
-Wno-error=unused-variable -Wno-unused-variable
-Wno-error=format-extra-args -Wno-format-extra-args
-Wno-error=comment -Wno-comment
......@@ -19,36 +32,32 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-reorder
-Wno-error=cpp)
set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
# A helper function used in Anakin, currently, to use it, one need to recursively include
# nearly all the header files.
function(fetch_include_recursively root_dir)
if (IS_DIRECTORY ${root_dir})
include_directories(${root_dir})
endif()
file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
foreach(sub ${ALL_SUB})
if (IS_DIRECTORY ${root_dir}/${sub})
fetch_include_recursively(${root_dir}/${sub})
endif()
endforeach()
endfunction()
if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
# download library
message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
endif()
ExternalProject_Add(
extern_anakin
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin"
GIT_TAG "04256ba78fa3da0beb74e8036c8efd68c12824d6"
PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DUSE_GPU_PLACE=YES
-DUSE_X86_PLACE=YES
-DBUILD_WITH_UNIT_TEST=NO
-DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
-DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-DCUDNN_ROOT=${CUDNN_ROOT}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
)
if (WITH_ANAKIN)
message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
fetch_include_recursively(${ANAKIN_INCLUDE})
link_directories(${ANAKIN_LIBRARY})
endif()
message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
add_library(anakin_shared SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB})
add_dependencies(anakin_shared extern_anakin protobuf mklml)
add_library(anakin_saber SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
add_dependencies(anakin_saber extern_anakin protobuf mklml)
list(APPEND external_project_dependencies anakin_shared anakin_saber)
if(NOT WITH_GPU)
return()
endif()
include(ExternalProject)
set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub)
set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub)
include_directories(${CUB_INCLUDE_DIR})
ExternalProject_Add(
extern_cub
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/NVlabs/cub.git"
GIT_TAG "v1.8.0"
PREFIX ${CUB_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c)
file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
add_library(cub STATIC ${dummyfile})
else()
add_library(cub INTERFACE)
endif()
add_dependencies(cub extern_cub)
LIST(APPEND externl_project_dependencies cub)
......@@ -24,7 +24,7 @@ SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
IF(WIN32 OR APPLE)
MESSAGE(WARNING
MESSAGE(WARNING
"Windows or Mac is not supported with MKLDNN in Paddle yet."
"Force WITH_MKLDNN=OFF")
SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE)
......@@ -57,8 +57,10 @@ ExternalProject_Add(
GIT_TAG "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}
CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
......
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set(WITH_XBYAK ON)
if(WIN32 OR APPLE)
SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
return()
endif()
include(ExternalProject)
set(XBYAK_PROJECT extern_xbyak)
set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak)
set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak)
set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include)
include_directories(${XBYAK_INC_DIR})
include_directories(${XBYAK_INC_DIR}/xbyak)
add_definitions(-DPADDLE_WITH_XBYAK)
# xbyak options
add_definitions(-DXBYAK64)
add_definitions(-DXBYAK_NO_OP_NAMES)
ExternalProject_Add(
${XBYAK_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ""
GIT_REPOSITORY "https://github.com/herumi/xbyak.git"
GIT_TAG "v5.661" # Jul 26th
PREFIX ${XBYAK_PREFIX_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/xbyak_dummy.c)
file(WRITE ${dummyfile} "const char *dummy_xbyak = \"${dummyfile}\";")
add_library(xbyak STATIC ${dummyfile})
else()
add_library(xbyak INTERFACE)
endif()
add_dependencies(xbyak ${XBYAK_PROJECT})
list(APPEND external_project_dependencies xbyak)
......@@ -264,7 +264,10 @@ function(cc_test TARGET_NAME)
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if (${cc_test_SERIAL})
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endif()
endfunction(cc_test)
......@@ -329,7 +332,10 @@ function(nv_test TARGET_NAME)
add_test(${TARGET_NAME} ${TARGET_NAME})
if (nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endif()
endfunction(nv_test)
......@@ -577,7 +583,9 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
FLAGS_cpu_deterministic=true
PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
......
......@@ -143,7 +143,7 @@ if (WITH_ANAKIN AND WITH_GPU)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
${ANAKIN_INSTALL_DIR} # anakin release
DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
list(APPEND inference_deps anakin_inference_lib)
endif()
......
......@@ -38,11 +38,3 @@ _switch_scope
.. autofunction:: paddle.fluid.executor._switch_scope
:noindex:
.. _api_fluid_executor_fetch_var:
fetch_var
---------
.. autofunction:: paddle.fluid.executor.fetch_var
:noindex:
......@@ -106,22 +106,6 @@ _switch_scope
.. autofunction:: paddle.fluid._switch_scope
:noindex:
.. _api_fluid_fetch_var:
fetch_var
---------
.. autofunction:: paddle.fluid.fetch_var
:noindex:
.. _api_fluid_Go:
Go
--
.. autoclass:: paddle.fluid.Go
:members:
:noindex:
.. _api_fluid_make_channel:
......
......@@ -177,8 +177,8 @@ graph = PassRegistry::Instance().Get("op_fuse_pass").Apply(std::move(grah));
auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass");
mem_opt_pass.SetNotOwned<int>("optimize_level", 1);
mem_opt_pass->Apply(std::move(graph));
graph = PassRegistry::Instance().Get("multi_device_pass").Apply(std::move(grah));
graph = PassRegistry::Instance().Get("multi_device_check_pass").Apply(std::move(grah));
graph = PassRegistry::Instance().Get("multi_devices_pass").Apply(std::move(grah));
graph = PassRegistry::Instance().Get("multi_devices_check_pass").Apply(std::move(grah));
Executor exe;
exe.Run(graph);
......
# 如何使用timeline工具做性能分析
1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。
1. 在训练的主循环外加上`profiler.start_profiler(...)``profiler.stop_profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。
**提示:**
请不要在timeline记录信息时运行太多次迭代,因为timeline中的记录数量和迭代次数是成正比的。
```python
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[])
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
if pass_id == 0 and batch_id == 5:
profiler.start_profiler("All")
elif pass_id == 0 and batch_id == 10:
profiler.stop_profiler("total", "/tmp/profile")
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[])
...
```
1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`,这个程序默认会生成一个`/tmp/timeline`文件,你也可以用命令行参数来修改这个路径,请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)
```python
python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
```
1. 打开chrome浏览器,访问<chrome://tracing/>,用`load`按钮来加载生成的`timeline`文件。
......
# how to use timeline tool to do profile
1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
1. Add `profiler.start_profiler(...)``profiler.stop_profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
```python
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[],
use_program_cache=True)
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
if pass_id == 0 and batch_id == 5:
profiler.start_profiler("All")
elif pass_id == 0 and batch_id == 10:
profiler.stop_profiler("total", "/tmp/profile")
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[])
...
```
......@@ -17,6 +19,10 @@
file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
```python
python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
```
1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
![chrome tracing](./tracing.jpeg)
......
......@@ -4,7 +4,7 @@ Paddle 预测 API
为了更简单方便的预测部署,Fluid 提供了一套高层 API
用来隐藏底层不同的优化实现。
`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference>`__
`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`__
包括
- 头文件 ``paddle_inference_api.h`` 定义了所有的接口
......@@ -104,5 +104,5 @@ engine
------------
- `inference
demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference/demo>`__
- `复杂单线程/多线程例子 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc>`__
demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api/demo_ci>`__
- `复杂单线程/多线程例子 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/api_impl_tester.cc>`__
# Operator fusion
Fusing multiple operators together is an important method to optimize the program execution, particularly for GPU or other specialized accelerators. An obvious benefit is to avoid the overhead of saving the intermediate result back into global memory.
There are generally two ways to fuse operators, fusing directly connected operators and fusing non directly connected operators. The first method is mainly used by [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second method is mainly used by Dynet and TensorFlow Fold to do auto-batching. The principle of fusing operator is according to some rules to combine multiple operations into one, for example, `Y = X * W` and `Z = Y + B` can be fused to `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused to `[Y1;Y2] = [X1;X2] * W`. In order to get a short-term profit, we decided to try to manually specify these rules.
## Challenge
The challenge of fusing operators is:
- how to make the rules.
- how to implement these rules efficiently.
### How to make the rules?
The problem of determining the best single location for a fusion operator is an NP-hard combinatorial problem. After analysis the operators of the DL model, we found there are two group of operators can be fused explicitly, one is the simple and adjacent operations, for example, `tmp = x + y` and `z = Relu(tmp)`, and the other is the operators that have the same function, for example, a serials of `SGD` or `Momentum`. They usually appear in the model in a large number. So we should think about how to fuse them separately first.
### How to implement these rules efficiently?
#### How to fuse the adjacent operations efficiently?
Here we use a template function to represent the fused operations. The pros of using a template function are that it is simple and efficient, and the cons are that it is not easy to expand, and it can only be used to express some simple operations. So taking into account our current needs, the template function is more appropriate.
#### How to fuse the operators that have the same function efficiently?
We take SGD operator as an example, the training model may have hundreds of parameters and correspondingly have the same number of SGD operators. The expression(`w = w - lr*w_g`) of those operators is the same, so during of training, the executor will execute this expression hundreds time in CPU or other specialized accelerators. If we can fuse them and make the address of all `w` and all `w_g` continuous respectively, we only need execute one time. For some accelerators, the time of launching kernel is not neglected, so the time of hundreds of times of launching and executing kernel may be larger than launching and executing only once. There usually are many operators that similar to `SGD` in the DL model, such as `AllReduce` and `FC`.
......@@ -6,7 +6,7 @@ paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=N
paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.inference_optimize ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.inference_optimize ArgSpec(args=['self', 'export_for_deployment'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
......@@ -18,6 +18,9 @@ paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=No
paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
......@@ -34,21 +37,10 @@ paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None,
paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.fetch_var ArgSpec(args=['name', 'scope', 'return_numpy'], varargs=None, keywords=None, defaults=(None, True))
paddle.fluid.Go.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Go.construct_go_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.make_channel ArgSpec(args=['dtype', 'capacity'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.channel_send ArgSpec(args=['channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.channel_recv ArgSpec(args=['channel', 'return_value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.channel_close ArgSpec(args=['channel'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Select.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Select.case ArgSpec(args=['self', 'channel_action_fn', 'channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Select.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
......@@ -62,20 +54,16 @@ paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', '
paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False))
paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.InferenceTranspiler.__init__
paddle.fluid.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0))
paddle.fluid.ParallelExecutor.bcast_params ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
......@@ -89,7 +77,7 @@ paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_pro
paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True))
paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
......@@ -167,10 +155,12 @@ paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale',
paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
......@@ -263,9 +253,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='ar
paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
......@@ -306,7 +294,9 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', '
paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
......@@ -336,15 +326,13 @@ paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=Non
paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.transpiler.InferenceTranspiler.__init__
paddle.fluid.transpiler.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
......
......@@ -5,5 +5,7 @@ add_subdirectory(operators)
add_subdirectory(pybind)
add_subdirectory(string)
add_subdirectory(recordio)
# NOTE: please add subdirectory inference at last.
add_subdirectory(inference)
if(WITH_INFERENCE)
# NOTE: please add subdirectory inference at last.
add_subdirectory(inference)
endif()
......@@ -7,6 +7,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
if(WITH_GPU)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
else()
......@@ -99,7 +100,7 @@ else()
endif()
cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
......
......@@ -17,6 +17,8 @@
#include <string>
#include <unordered_map>
using float16 = paddle::platform::float16;
namespace paddle {
namespace framework {
......@@ -53,7 +55,7 @@ static DataTypeMap* InitDataTypeMap() {
RegisterType<cc_type>(retv, proto_type, #cc_type)
// NOTE: Add your customize type here.
RegType(platform::float16, proto::VarType::FP16);
RegType(float16, proto::VarType::FP16);
RegType(float, proto::VarType::FP32);
RegType(double, proto::VarType::FP64);
RegType(int, proto::VarType::INT32);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/data_type.h"
#include <string>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
TEST(DataType, float16) {
using paddle::framework::Tensor;
using paddle::platform::CPUPlace;
using paddle::platform::float16;
namespace f = paddle::framework;
f::proto::VarType::Type dtype = f::proto::VarType::FP16;
Tensor tensor;
CPUPlace cpu;
tensor.mutable_data(cpu, f::ToTypeIndex(dtype));
// test fp16 tensor
EXPECT_EQ(tensor.type(), std::type_index(typeid(float16)));
// test fp16 size
EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u);
// test debug info
std::string type = "float16";
EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
}
......@@ -5,9 +5,9 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph graph_helper)
cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
......@@ -28,7 +28,7 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
......
......@@ -17,6 +17,7 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
......@@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent r("all_reduce", nullptr);
if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1;
} else {
......
......@@ -21,6 +21,26 @@ namespace framework {
namespace details {
struct BuildStrategy {
// ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and
// kReduce, for CPU and GPU. If you use kAllReduce, different threads
// optimize their parameters separately. If you use kReduce, the optimizations
// of parameters are distributed to different threads.
// For example, a model has 100 parameters and is running with four threads,
// if you choose kAllReduce, every thread is to optimize 100 parameters
// separately, if you choose kReduce, every thread is to optimize 25
// parameters.
// Of particular note is, if you use kReduce when using CPU training,
// all the parameters are shared between different threads. This feature will
// save memory.
// FIXME(zcd): The result of the two modes(kAllReduce and kReduce) maybe not
// equal for GPU. Because, the result of the different order of summing maybe
// different, for example, the result of `a+b+c+d` may be different with the
// result of `c+a+b+d`.
// For GPU, the implementation of kAllReduce and kReduce is adopted NCCL,
// so the result of kAllReduce and kReduce maybe not equal.
// For CPU, if you want to fix the order of summing to make the result
// of kAllReduce and kReduce no diff, you can add
// `FLAGS_cpu_deterministic=true` to env.
enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };
enum class GradientScaleStrategy {
......
......@@ -14,6 +14,7 @@
#pragma once
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......@@ -22,27 +23,24 @@ namespace details {
class ExceptionHolder {
public:
void Catch(const platform::EnforceNotMet& exp) {
std::lock_guard<std::mutex> lock(mu_);
exception_.reset(new platform::EnforceNotMet(exp));
type_ = kEnforceNotMet;
}
void Catch(const platform::EOFException& exp) {
std::lock_guard<std::mutex> lock(mu_);
// EOFException will not cover up existing EnforceNotMet.
if (exception_.get() == nullptr) {
exception_.reset(new platform::EOFException(exp));
type_ = kEOF;
void Catch(std::exception_ptr eptr) {
try {
std::rethrow_exception(eptr);
} catch (platform::EOFException exp) {
Catch(exp);
} catch (platform::EnforceNotMet exp) {
Catch(exp);
} catch (...) {
LOG(FATAL) << "Unknown exception caught";
}
}
bool ExceptionCatched() const {
bool IsCaught() const {
std::lock_guard<std::mutex> lock(mu_);
return exception_.get() != nullptr;
}
void Throw() {
void ReThrow() {
std::lock_guard<std::mutex> lock(mu_);
switch (type_) {
case kNone:
......@@ -50,27 +48,41 @@ class ExceptionHolder {
case kEnforceNotMet: {
auto e = *static_cast<platform::EnforceNotMet*>(exception_.get());
throw e;
break;
}
case kEOF: {
auto e = *static_cast<platform::EOFException*>(exception_.get());
throw e;
break;
}
default:
LOG(FATAL) << "Unknown exception.";
}
exception_.reset();
type_ = kNone;
ClearImpl();
}
void Clear() {
std::lock_guard<std::mutex> lock(mu_);
ClearImpl();
}
private:
void ClearImpl() {
exception_.reset();
type_ = kNone;
}
private:
void Catch(const platform::EnforceNotMet& exp) {
std::lock_guard<std::mutex> lock(mu_);
exception_.reset(new platform::EnforceNotMet(exp));
type_ = kEnforceNotMet;
}
void Catch(const platform::EOFException& exp) {
std::lock_guard<std::mutex> lock(mu_);
// EOFException will not cover up existing EnforceNotMet.
if (exception_.get() == nullptr) {
exception_.reset(new platform::EOFException(exp));
type_ = kEOF;
}
}
enum ExceptionType { kNone, kEnforceNotMet, kEOF };
ExceptionType type_{kNone};
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_checker.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph.h"
......@@ -86,7 +86,7 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
} // namespace framework
} // namespace paddle
REGISTER_PASS(multi_device_check_pass,
REGISTER_PASS(multi_devices_check_pass,
paddle::framework::details::SSAGraghBuilderWithChecker)
.RequireGraphAttr(paddle::framework::details::kGraphVars)
.RequireGraphAttr(paddle::framework::details::kGraphDepVars)
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include <string>
......@@ -22,7 +22,7 @@ namespace paddle {
namespace framework {
namespace details {
class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
class SSAGraghBuilderWithChecker : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
......
......@@ -21,7 +21,7 @@
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/data_balance_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
......@@ -33,6 +33,92 @@
namespace paddle {
namespace framework {
namespace details {
namespace {
void PolishGraphToSupportDataHazards(ir::Graph *graph) {
for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
for (auto &name_pair : var_map) {
if (name_pair.second.size() <= 1) {
continue;
}
auto it_new = name_pair.second.rbegin();
auto it_old = name_pair.second.rbegin();
++it_old;
for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
OpHandleBase *write_op = (*it_new)->GeneratedOp();
const auto &read_ops = (*it_old)->PendingOps();
for (auto *read_op : read_ops) {
// Manually add a dependency var from read_op to write_op;
if (read_op == write_op) {
// Read Write is the same op.
continue;
}
bool has_dep = false;
for (auto *r_out : read_op->Outputs()) {
for (auto *w_in : write_op->Inputs()) {
if (r_out->Node() == w_in->Node()) {
has_dep = true;
break;
}
}
}
if (has_dep) continue;
auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
read_op->AddOutput(dep_var);
write_op->AddInput(dep_var);
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
}
}
}
}
}
VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node,
const platform::Place &place,
size_t place_offset) {
auto &var_holders = graph->Get<GraphVars>(kGraphVars)[place_offset];
auto &var_holder = var_holders[node->Name()];
VarHandle *var = nullptr;
if (var_holder.empty()) {
if (node->Var()) {
var = new VarHandle(graph->CreateVarNode(node->Var()), 0, place_offset,
node->Name(), place);
} else {
var = new VarHandle(
graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable), 0,
place_offset, node->Name(), place);
}
var_holder.emplace_back(var);
} else {
var = var_holder.rbegin()->get();
}
return var;
}
void CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
ir::Node *new_node, const platform::Place &place,
size_t place_offset) {
auto &vars =
graph->Get<GraphVars>(kGraphVars)[place_offset][new_node->Name()];
size_t version = vars.size();
auto var =
new VarHandle(new_node, version, place_offset, new_node->Name(), place);
vars.emplace_back(var);
op_handle->AddOutput(var);
}
void AddOutputToLeafOps(ir::Graph *graph) {
for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
if (!op->Outputs().empty()) {
continue;
}
auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
op->AddOutput(dummy_leaf);
}
}
} // namespace
static const char kLossVarName[] = "loss_var_name";
static const char kPlaces[] = "places";
......@@ -275,7 +361,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
if (strategy_.gradient_scale_ !=
BuildStrategy::GradientScaleStrategy::kCustomized) {
// TODO(paddle-dev): Why is there no input for this op_handle?
CreateScaleLossGradOp(&result);
auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
CreateScaleLossGradOp(&result, loss_grad_name);
}
// This assumes the backward generating code will ensure IsScaleLossOp
// is true only for the op that scale the final scalar loss.
......@@ -535,7 +622,8 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
return got == sharded_var_device.end() ? -1 : got->second;
}
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const {
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
ir::Graph *result, const std::string &loss_grad_name) const {
for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
......@@ -558,10 +646,10 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const {
// loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss);
CreateOpOutput(result, op_handle,
result->CreateEmptyNode(GradVarName(loss_var_name_),
ir::Node::Type::kVariable),
places_[i], i);
CreateOpOutput(
result, op_handle,
result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable),
places_[i], i);
}
}
......@@ -749,7 +837,7 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
} // namespace framework
} // namespace paddle
REGISTER_PASS(multi_device_pass,
REGISTER_PASS(multi_devices_pass,
paddle::framework::details::MultiDevSSAGraphBuilder)
.RequirePassAttr(paddle::framework::details::kLossVarName)
.RequirePassAttr(paddle::framework::details::kPlaces)
......
......@@ -18,7 +18,7 @@
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
......@@ -30,7 +30,7 @@ namespace framework {
class Scope;
namespace details {
class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
class MultiDevSSAGraphBuilder : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
......@@ -75,7 +75,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const;
void CreateScaleLossGradOp(ir::Graph *result) const;
void CreateScaleLossGradOp(ir::Graph *result,
const std::string &loss_grad_name) const;
VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
int dst_dev_id) const;
void CreateComputationalOp(ir::Graph *result, ir::Node *node,
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_printer.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph.h"
......@@ -82,5 +82,5 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
} // namespace framework
} // namespace paddle
REGISTER_PASS(multi_device_print_pass,
REGISTER_PASS(multi_devices_print_pass,
paddle::framework::details::SSAGraghBuilderWithPrinter);
......@@ -18,7 +18,7 @@
#include <iosfwd>
#include <ostream>
#include <string>
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
namespace paddle {
namespace framework {
......@@ -35,7 +35,7 @@ class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
void Print(const ir::Graph& graph, std::ostream& sout) const override;
};
class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
class SSAGraghBuilderWithPrinter : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_helper.h"
namespace paddle {
namespace framework {
namespace details {} // namespace details
} // namespace framework
} // namespace paddle
......@@ -52,33 +52,6 @@ const char kGraphOps[] = "ops";
typedef std::unordered_map<std::string, int> ShardedVarDevice;
const char kShardedVarDevice[] = "sharded_var_device";
class SSAGraphBuilder : public ir::Pass {
public:
SSAGraphBuilder() {}
virtual ~SSAGraphBuilder() {}
DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
protected:
/*
Dependency graph has been constructed. However, there are still data
hazards need to be handled.
*/
static void PolishGraphToSupportDataHazards(ir::Graph *graph);
static VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node,
const platform::Place &place,
size_t place_offset);
// Add an output variable (each_var_name, place, place_offset) to op_handle,
// which belongs to graph
static void CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
ir::Node *new_node, const platform::Place &place,
size_t place_offset);
static void AddOutputToLeafOps(ir::Graph *graph);
};
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -16,12 +16,18 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(
cpu_deterministic, false,
"Whether to make the result of computation deterministic in CPU side.");
namespace paddle {
namespace framework {
namespace details {
void ReduceOpHandle::RunImpl() {
platform::RecordEvent r("reduce", nullptr);
if (places_.size() == 1) return;
// the input and output may have dummy var.
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
......@@ -89,11 +95,33 @@ void ReduceOpHandle::RunImpl() {
} else {
std::vector<const LoDTensor *> lod_tensors =
GetInputValues<LoDTensor>(in_var_handles, var_scopes);
if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
this->RunAndRecordEvent([&] {
ReduceLoDTensor func(lod_tensors,
out_var->GetMutable<framework::LoDTensor>());
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
// FIXME(zcd): The order of summing is important,
// especially when the type of data is float or double.
// For example, the result of `a+b+c+d` may be different
// with the result of `c+a+b+d`, so the summing order should be fixed.
if (!FLAGS_cpu_deterministic) {
ReduceLoDTensor func(lod_tensors,
out_var->GetMutable<framework::LoDTensor>());
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
} else {
// We sum lod_tensors to reduce_sum_trg which is in local_scopes_0
// here, but it doesn't mean reduce_sum_trg must be in local_scopes_0.
auto &reduce_sum_trg = *this->local_scopes_[0]
->FindVar(kLocalExecScopeName)
->Get<Scope *>()
->FindVar(out_var_handle->name_)
->GetMutable<framework::LoDTensor>();
ReduceLoDTensor func(lod_tensors, &reduce_sum_trg);
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
auto trg = out_var->GetMutable<framework::LoDTensor>();
if (reduce_sum_trg.data<void>() != trg->data<void>()) {
TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg);
}
}
});
} else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
#ifdef PADDLE_WITH_CUDA
......
......@@ -17,6 +17,7 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
......@@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
eptr = std::current_exception();
}
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
drop_scope_counter_ += 1;
if (!fetch_tensors.empty() ||
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include <utility>
namespace paddle {
namespace framework {
namespace details {
void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) {
for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
for (auto &name_pair : var_map) {
if (name_pair.second.size() <= 1) {
continue;
}
auto it_new = name_pair.second.rbegin();
auto it_old = name_pair.second.rbegin();
++it_old;
for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
OpHandleBase *write_op = (*it_new)->GeneratedOp();
const auto &read_ops = (*it_old)->PendingOps();
for (auto *read_op : read_ops) {
// Manually add a dependency var from read_op to write_op;
if (read_op == write_op) {
// Read Write is the same op.
continue;
}
bool has_dep = false;
for (auto *r_out : read_op->Outputs()) {
for (auto *w_in : write_op->Inputs()) {
if (r_out->Node() == w_in->Node()) {
has_dep = true;
break;
}
}
}
if (has_dep) continue;
auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
read_op->AddOutput(dep_var);
write_op->AddInput(dep_var);
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
}
}
}
}
}
VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
ir::Graph *graph, ir::Node *node, const platform::Place &place,
size_t place_offset) {
auto &var_holders = graph->Get<GraphVars>(kGraphVars)[place_offset];
auto &var_holder = var_holders[node->Name()];
VarHandle *var = nullptr;
if (var_holder.empty()) {
if (node->Var()) {
var = new VarHandle(graph->CreateVarNode(node->Var()), 0, place_offset,
node->Name(), place);
} else {
var = new VarHandle(
graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable), 0,
place_offset, node->Name(), place);
}
var_holder.emplace_back(var);
} else {
var = var_holder.rbegin()->get();
}
return var;
}
void SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
ir::Node *new_node,
const platform::Place &place,
size_t place_offset) {
auto &vars =
graph->Get<GraphVars>(kGraphVars)[place_offset][new_node->Name()];
size_t version = vars.size();
auto var =
new VarHandle(new_node, version, place_offset, new_node->Name(), place);
vars.emplace_back(var);
op_handle->AddOutput(var);
}
void SSAGraphBuilder::AddOutputToLeafOps(ir::Graph *graph) {
for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
if (!op->Outputs().empty()) {
continue;
}
auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
op->AddOutput(dummy_leaf);
}
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -14,7 +14,8 @@
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
......@@ -34,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) {
std::unique_ptr<platform::RecordEvent> event(
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
BlockingQueue<VarHandleBase *> ready_vars;
......@@ -84,6 +87,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Clean run context
run_op_futures_.clear();
exception_holder_.Clear();
event.reset(nullptr);
// Step 3. Execution
while (!pending_vars.empty()) {
......@@ -103,11 +107,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
if (timeout) {
if (exception_holder_.ExceptionCatched()) {
if (exception_holder_.IsCaught()) {
for (auto &run_op_future : run_op_futures_) {
run_op_future.wait();
}
exception_holder_.Throw();
exception_holder_.ReThrow();
} else {
continue;
}
......@@ -216,12 +220,8 @@ void ThreadedSSAGraphExecutor::RunOp(
running_ops_--;
ready_var_q->Extend(op->Outputs());
VLOG(10) << op << " " << op->Name() << "Signal posted";
} catch (platform::EOFException ex) {
exception_holder_.Catch(ex);
} catch (platform::EnforceNotMet ex) {
exception_holder_.Catch(ex);
} catch (...) {
LOG(FATAL) << "Unknown exception catched";
exception_holder_.Catch(std::current_exception());
}
};
if (pool_) {
......
......@@ -330,12 +330,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
}
for (auto& op : ctx->ops_) {
VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_);
// NOTE! Please do not delete this line, it's usefull because the debug
// string before and after op.run are different, after run the output
// will have right shape which is usefull for debug.
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
......
......@@ -3,7 +3,10 @@ cc_library(graph SRCS graph.cc DEPS node)
cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits)
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter)
......@@ -182,9 +182,11 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
}
/**
* We only handle write after read(WAR), since it should not have a write
* after write in program. If there are write after write operators, we need
* prune them.
* We should handle write after read(WAR) and write after write(WAW) here.
* Because some of the operators of the program can be executed parallelly.
* So, to make the program running in the right order, we should add the
* dependence of WAR and WAW.
*
*
* https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
*/
......@@ -201,6 +203,19 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
(*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
const auto &read_ops = (*it_old)->outputs;
PADDLE_ENFORCE(write_op, "The write_op should not be empty.");
// Add write after write dependence
ir::Node *upstream_op =
(*it_old)->inputs.empty() ? nullptr : (*it_old)->inputs[0];
if (upstream_op) {
ir::Node *dep_var = CreateControlDepVar();
write_op->inputs.push_back(dep_var);
upstream_op->outputs.push_back(dep_var);
dep_var->outputs.push_back(write_op);
dep_var->inputs.push_back(upstream_op);
}
for (auto *read_op : read_ops) {
// Manually add a dependency var from read_op to write_op;
if (read_op == write_op) {
......
......@@ -28,6 +28,38 @@ namespace paddle {
namespace framework {
namespace ir {
/*
* The graph is a Directed Acyclic Single Static Assignment Graph.
*
* In more detail, the following properties must hold:
*
* The graph shouldn't contain cycle. Each node is a black-box to the graph
* so the node itself could be a loop operator.
*
* Each Variable-type node has only one input (thus single static assignment).
*
* The output/input of operator is variable and the output/input of variable
* is operator.
*
* The following data harzards in Program are addressed in the Graph:
*
* Write-After-Read
* a = op1(x)
* x = op2(b)
* A control-dependency connection is created bettwen op1 and op2 such that
* op1->op2, so as to ensure correct order.
*
* Write-After-Write
* x = op1(a)
* x = op2(b)
* A control-dependency connection is created between op1 and op2 such that
* op1->op2, so as to ensure correct order.
*
* Other properties currently hold, but is not enforced yet:
*
* Variable-type node (not control dep) with the same variable name share
* the same underlying VarDesc.
*/
class Graph {
public:
explicit Graph(const ProgramDesc &program);
......
......@@ -116,8 +116,8 @@ TEST(GraphHelperTest, Basic) {
for (size_t i = 0; i < sorted.size(); ++i) {
node_map[sorted[i]->Name()] = i;
}
ASSERT_EQ(node_map.at("op1"), 0);
ASSERT_EQ(node_map.at("op2"), 1);
ASSERT_EQ(node_map.at("op1"), 0UL);
ASSERT_EQ(node_map.at("op2"), 1UL);
ASSERT_TRUE(node_map.at("op3") < node_map.at("op5"));
}
} // namespace ir
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) {
nodes_.emplace_back(new PDNode(std::move(teller), name));
auto* cur = nodes_.back().get();
return cur;
}
void PDPattern::AddEdge(PDNode* a, PDNode* b) {
PADDLE_ENFORCE(a);
PADDLE_ENFORCE(b);
PADDLE_ENFORCE(a != b, "can't connect to the same nodes.");
edges_.emplace_back(a, b);
}
void GraphPatternDetecter::operator()(Graph* graph,
GraphPatternDetecter::handle_t handler) {
if (!MarkPDNodesInGraph(*graph)) return;
auto subgraphs = DetectPatterns();
UniquePatterns(&subgraphs);
RemoveOverlappedMatch(&subgraphs);
for (auto& g : subgraphs) {
handler(g, graph);
}
}
bool GraphPatternDetecter::MarkPDNodesInGraph(const ir::Graph& graph) {
if (graph.Nodes().empty()) return false;
for (auto& node : GraphTraits::DFS(graph)) {
for (const auto& pdnode : pattern_.nodes()) {
if (pdnode->Tell(&node)) {
pdnodes2nodes_[pdnode.get()].insert(&node);
}
}
}
return !pdnodes2nodes_.empty();
}
struct HitGroup {
std::unordered_map<PDNode*, Node*> roles;
bool Match(Node* node, PDNode* pat) {
return !roles.count(pat) || roles.at(pat) == node;
}
void Register(Node* node, PDNode* pat) { roles[pat] = node; }
};
// Tell whether Node a links to b.
bool IsNodesLink(Node* a, Node* b) {
for (auto* node : a->outputs) {
if (b == node) {
return true;
}
}
return false;
}
std::vector<GraphPatternDetecter::subgraph_t>
GraphPatternDetecter::DetectPatterns() {
// Init empty subgraphs.
std::vector<GraphPatternDetecter::subgraph_t> result;
std::vector<HitGroup> init_groups;
PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed");
auto* first_pnode = pattern_.edges().front().first;
if (!pdnodes2nodes_.count(first_pnode)) return result;
for (auto* node : pdnodes2nodes_[first_pnode]) {
HitGroup group;
group.roles[first_pnode] = node;
init_groups.emplace_back(group);
}
int step = 0;
std::array<std::vector<HitGroup>, 2> bi_records;
bi_records[0] = std::move(init_groups);
// Extend a PDNode to subgraphs by deducing the connection relations defined
// in edges of PDNodes.
for (const auto& edge : pattern_.edges()) {
// Each role has two PDNodes, which indicates two roles.
// Detect two Nodes that can match these two roles and they are connected.
auto& pre_groups = bi_records[step % 2];
auto& cur_groups = bi_records[1 - (step++ % 2)];
cur_groups.clear();
// source -> target
for (Node* source : pdnodes2nodes_[edge.first]) {
for (Node* target : pdnodes2nodes_[edge.second]) {
// TODO(Superjomn) add some prune strategies.
for (const auto& group : pre_groups) {
HitGroup new_group = group;
if (IsNodesLink(source, target) &&
new_group.Match(source, edge.first)) {
new_group.Register(source, edge.first);
if (new_group.Match(target, edge.second)) {
new_group.Register(target, edge.second);
cur_groups.push_back(new_group);
// TODO(Superjomn) need to unique
}
}
}
}
}
}
for (auto& group : bi_records[step % 2]) {
GraphPatternDetecter::subgraph_t subgraph;
for (auto& role : group.roles) {
subgraph.emplace(role.first, role.second);
}
result.emplace_back(subgraph);
}
return result;
}
void GraphPatternDetecter::UniquePatterns(
std::vector<GraphPatternDetecter::subgraph_t>* subgraphs) {
if (subgraphs->empty()) return;
std::vector<GraphPatternDetecter::subgraph_t> result;
std::unordered_set<size_t> set;
for (auto& g : *subgraphs) {
size_t key = 0;
for (auto& item : g) {
key ^= std::hash<void*>{}(item.first);
key ^= std::hash<void*>{}(item.second);
}
if (!set.count(key)) {
result.emplace_back(g);
set.insert(key);
}
}
*subgraphs = result;
}
void GraphPatternDetecter::RemoveOverlappedMatch(
std::vector<subgraph_t>* subgraphs) {
std::vector<subgraph_t> result;
std::unordered_set<Node*> node_set;
for (const auto& subgraph : *subgraphs) {
bool valid = true;
for (auto& item : subgraph) {
if (node_set.count(item.second)) {
valid = false;
break;
}
}
if (valid) {
for (auto& item : subgraph) {
node_set.insert(item.second);
}
result.push_back(subgraph);
}
}
*subgraphs = result;
}
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest_prod.h>
#endif
#include <numeric>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
namespace paddle {
namespace framework {
namespace ir {
// Some basic torminolygies:
// - PDPattern: a pattern defined as a data flow graph.
// - PDNode: the node in the pattern, each PDNode represents an `ir::Node`
// that meets some conditions defined in `PDNode.teller`.
// - A pattern is defined with PDNodes with edges.
// Pattern detector node. This node helps to build a pattern.
struct PDNode {
// tell whether an ir::Node* is a candidation for a PDNode.
using teller_t = std::function<bool(Node*)>;
PDNode(teller_t&& teller, const std::string& name = "")
: teller_(teller), name_(name) {
PADDLE_ENFORCE(teller_ != nullptr, "invalid teller functer is set.");
}
PDNode(PDNode&& other) = default;
std::vector<PDNode*> inlinks;
std::vector<PDNode*> outlinks;
bool Tell(Node* node) const {
PADDLE_ENFORCE(teller_ != nullptr, "teller should be set for a PDNode");
return teller_(node);
}
const std::string& name() const { return name_; }
PDNode(const PDNode&) = delete;
PDNode& operator=(const PDNode&) = delete;
private:
teller_t teller_;
std::string name_;
};
/*
* A pattern in a graph, which defined with PDNode and edges. Most graph
* patterns can be divided into PDNodes and link relations between them.
*
* For example, the FC fusion need to filter the MUL and ELEMENTWISE_ADD
* operators from the computation graph, the MUL's output should have only one
* consumer which is the ELEMENTWISE_ADD.
* This pattern can be defined as with the following pseudo codes
*
* // Create two operator PDNodes.
* MUL = PDPattern.NewNode()
* ELE = PDPattern.NewNode()
* // Create the variable PDNodes.
* MUL_out = PDPattern.NewNode()
* // Add teller to define some rules that help to filter the target Nodes.
* MUL.teller = lambda(node): node->IsOp() && node->Op()->Type == "mul";
* ELE.teller = lambda(node): \
* node->IsOp() && node->Op()->Type == "elementwise_add";
* MUL_out.teller = lambda(node): node->IsVar() && (MUL in node->inputs)
* && (ELE in node->outputs)
*
* One can add more specific tellers for PDNodes or edges, both the Operator
* and Variable Nodes can be ruled in PDNode.teller.
*
* PDPattern can record the general patterns, such as the pattern represents
* - Op in CPU -> Op in GPU -> Op in CPU, to findout the IO abnormal place.
* - Ops whose inputs and outputs share the same variables
*/
class PDPattern {
public:
using edge_t = std::pair<PDNode*, PDNode*>;
void AddEdge(PDNode* a, PDNode* b);
PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = "");
const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
const std::vector<edge_t>& edges() const { return edges_; }
private:
#ifdef PADDLE_WITH_TESTING
FRIEND_TEST(PDPattern, AddEdge);
FRIEND_TEST(PDPattern, NewNode);
#endif
std::vector<std::unique_ptr<PDNode>> nodes_;
std::vector<edge_t> edges_;
};
/*
* GraphPatternDetecter helps to detect the specific patterns in the graph.
* Input a pattern, output a list of the matched subgraphs/nodes.
* This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.).
*
* The algorithm has three phases:
* 1. Mark the nodes that match the defined PDNodes in a PDPattern,
* 2. Extend a PDNode to subgraphs by deducing the connection relation defined
* in PAPattern(the edges),
* 3. Get the filtered subgraphs and treat them with a pre-defined handler.
*
* Usage:
* // Create a detector
* GraphPatternDetecter detector;
* // Define the detector's pattern, by adding PDNode and define the edges.
* auto* node0 = detector.mutable_pattern().AddNode(...)
* auto* node1 = detector.mutable_pattern().AddNode(...)
* node0->teller = some lambda.
* node1->teller = some lambda.
* detector.mutable_pattern().AddEdge(node0, node1);
* // Create an handler, to define the behavior of treating the filtered
* // subgraphs that comply with the patterns.
* GraphPatternDetecter::handle_t handler = some labmda
* // Execute the detector.
* detector(&graph, handler);
*/
class GraphPatternDetecter {
public:
using subgraph_t = std::unordered_map<PDNode*, Node*>;
// Operate on the detected pattern.
using handle_t =
std::function<void(const subgraph_t& /*hitted pattern*/, Graph*)>;
void operator()(Graph* graph, handle_t handler);
const PDPattern& pattern() const { return pattern_; }
PDPattern* mutable_pattern() { return &pattern_; }
private:
// Mark the nodes that fits the pattern.
bool MarkPDNodesInGraph(const ir::Graph& graph);
// Detect all the pattern and output the hit records.
std::vector<subgraph_t> DetectPatterns();
// Remove duplicate patterns.
void UniquePatterns(std::vector<subgraph_t>* subgraphs);
// Remove overlapped match subgraphs, when overlapped, keep the previous one.
void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
#ifdef PADDLE_WITH_TESTING
FRIEND_TEST(GraphPatternDetecter, MarkPDNodesInGraph);
FRIEND_TEST(GraphPatternDetecter, DetectPatterns);
#endif
private:
using hit_rcd_t =
std::pair<Node* /*node in graph*/, PDNode* /*node in pattern*/>;
PDPattern pattern_;
std::vector<hit_rcd_t> marked_records_;
std::unordered_map<const PDNode*, std::unordered_set<Node*>> pdnodes2nodes_;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
void BuildGraph(Graph* g) {
ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
// o1->v1->o2
o1->outputs.push_back(v1);
o2->inputs.push_back(v1);
v1->inputs.push_back(o1);
v1->outputs.push_back(o2);
// o2->v2->o3
// o2->v2->o4
o2->outputs.push_back(v2);
o3->inputs.push_back(v2);
o4->inputs.push_back(v2);
v2->inputs.push_back(o2);
v2->outputs.push_back(o3);
v2->outputs.push_back(o4);
// o2->v3->o5
o2->outputs.push_back(v3);
o5->inputs.push_back(v3);
v3->inputs.push_back(o2);
v3->outputs.push_back(o5);
// o3-v4->o5
o3->outputs.push_back(v4);
o5->inputs.push_back(v4);
v4->inputs.push_back(o3);
v4->outputs.push_back(o5);
}
TEST(PDPattern, NewNode) {
PDPattern x;
auto* n = x.NewNode([](Node* x) { return true; });
ASSERT_TRUE(n);
ASSERT_EQ(x.nodes_.size(), 1UL);
}
TEST(PDPattern, AddEdge) {
PDPattern x;
auto* a = x.NewNode([](Node* x) { return true; });
auto* b = x.NewNode([](Node* x) { return true; });
ASSERT_TRUE(a);
ASSERT_TRUE(b);
x.AddEdge(a, b);
ASSERT_EQ(x.nodes_.size(), 2UL);
ASSERT_EQ(x.edges_.size(), 1UL);
ASSERT_EQ(x.edges_.front().first, a);
ASSERT_EQ(x.edges_.front().second, b);
ASSERT_EQ(x.nodes().size(), 2UL);
ASSERT_EQ(x.edges().size(), 1UL);
ASSERT_EQ(x.edges().front().first, a);
ASSERT_EQ(x.edges().front().second, b);
}
TEST(GraphPatternDetecter, MarkPDNodesInGraph) {
GraphPatternDetecter x;
// mark o2, o3, v2
// The pattern is a graph:
// o2(a node named o2) -> v2(a node named v2)
// v2 -> o3(a node named o3)
auto* o2 = x.pattern_.NewNode([](Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->Name() == "op2" && node->IsOp();
});
auto* o3 = x.pattern_.NewNode([](Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->Name() == "op3" && node->IsOp();
});
auto* v2 = x.pattern_.NewNode([](Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->Name() == "var2" && node->IsVar();
});
ASSERT_FALSE(o2->Tell(nullptr));
ASSERT_FALSE(o3->Tell(nullptr));
ASSERT_FALSE(v2->Tell(nullptr));
x.pattern_.AddEdge(o2, v2);
x.pattern_.AddEdge(v2, o3);
ASSERT_EQ(x.pattern_.edges().size(), 2UL);
ASSERT_EQ(x.pattern_.edges()[0].first, o2);
ASSERT_EQ(x.pattern_.edges()[0].second, v2);
ASSERT_EQ(x.pattern_.edges()[1].first, v2);
ASSERT_EQ(x.pattern_.edges()[1].second, o3);
ProgramDesc program;
Graph graph(program);
BuildGraph(&graph);
x.MarkPDNodesInGraph(graph);
ASSERT_EQ(x.pdnodes2nodes_.size(), 3UL);
auto subgraphs = x.DetectPatterns();
ASSERT_EQ(subgraphs.size(), 1UL);
}
TEST(GraphPatternDetecter, MultiSubgraph) {
ProgramDesc program;
Graph graph(program);
BuildGraph(&graph);
GraphPatternDetecter x;
// The pattern is a graph:
// op -> var
auto* any_op = x.mutable_pattern()->NewNode(
[](Node* node) {
return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3");
},
"OP0");
auto* any_var = x.mutable_pattern()->NewNode(
[](Node* node) { return node->IsVar(); }, "VAR");
auto* any_op1 = x.mutable_pattern()->NewNode(
[](Node* node) { return node->IsOp(); }, "OP1");
x.mutable_pattern()->AddEdge(any_op, any_var);
x.mutable_pattern()->AddEdge(any_var, any_op1);
int count = 0;
GraphPatternDetecter::handle_t handle = [&](
const GraphPatternDetecter::subgraph_t& s, Graph* g) {
LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> "
<< s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name();
count++;
};
x(&graph, handle);
// 1. Detect op3 -> var4 -> op5
// 2. Detect op2 -> var2 -> op3
// 3. Detect op2 -> var2 -> op4
// 4. Detect op2 -> var3 -> op5
// But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2
ASSERT_GE(count, 1UL);
ASSERT_LE(count, 2UL);
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -36,7 +36,7 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "").AsDuplicable();
AddOutput("Out", "");
AddOutput("Out", "").AsDuplicable();
AddComment("");
}
};
......@@ -59,11 +59,27 @@ class SumOpVarTypeInference : public VarTypeInference {
block->Var(out_var_name)->SetType(default_var_type);
}
};
class DummyOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "").AsDuplicable();
AddOutput("Out", "").AsDuplicable();
AddComment("");
}
};
class DummyOpVarTypeInference : public VarTypeInference {
public:
void operator()(const OpDesc &op_desc, BlockDesc *block) const override {}
};
} // namespace framework
} // namespace paddle
REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker,
paddle::framework::SumOpVarTypeInference);
REGISTER_OPERATOR(dummy, paddle::framework::NOP, paddle::framework::SumOpMaker,
paddle::framework::SumOpVarTypeInference);
REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP,
paddle::framework::SumOpMaker);
......@@ -97,18 +113,96 @@ TEST(GraphTest, Basic) {
std::vector<ir::Node *> nodes(g->Nodes().begin(), g->Nodes().end());
for (ir::Node *n : nodes) {
if (n->Name() == "sum") {
ASSERT_EQ(n->inputs.size(), 3);
ASSERT_EQ(n->outputs.size(), 1);
ASSERT_EQ(n->inputs.size(), 3UL);
ASSERT_EQ(n->outputs.size(), 1UL);
} else if (n->Name() == "test_a" || n->Name() == "test_b" ||
n->Name() == "test_c") {
ASSERT_EQ(n->inputs.size(), 0);
ASSERT_EQ(n->outputs.size(), 1);
ASSERT_EQ(n->inputs.size(), 0UL);
ASSERT_EQ(n->outputs.size(), 1UL);
} else if (n->Name() == "test_out") {
ASSERT_EQ(n->inputs.size(), 1);
ASSERT_EQ(n->outputs.size(), 0);
ASSERT_EQ(n->inputs.size(), 1UL);
ASSERT_EQ(n->outputs.size(), 0UL);
}
}
ASSERT_EQ(nodes.size(), 5);
}
TEST(GraphTest, WriteAfterRead) {
// void Test() {
ProgramDesc prog;
auto *op = prog.MutableBlock(0)->AppendOp();
op->SetType("sum");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"b"});
op->SetAttr("op_role", 1);
op = prog.MutableBlock(0)->AppendOp();
op->SetType("dummy");
op->SetInput("X", {"c"});
op->SetOutput("Out", {"a"});
op->SetAttr("op_role", 1);
prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
ir::Node *control_dep1 = nullptr;
ir::Node *control_dep2 = nullptr;
for (ir::Node *n : g->Nodes()) {
if (n->Name() == "sum") {
ASSERT_EQ(n->outputs[0]->Name(), "b");
ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1]));
control_dep1 = n->outputs[1];
ASSERT_EQ(n->outputs.size(), 2);
}
if (n->Name() == "dummy") {
ASSERT_EQ(n->inputs[0]->Name(), "c");
ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
control_dep2 = n->inputs[1];
ASSERT_EQ(n->inputs.size(), 2);
}
}
ASSERT_EQ(control_dep1, control_dep2);
}
TEST(GraphTest, WriteAfterWrite) {
// void Test() {
ProgramDesc prog;
auto *op = prog.MutableBlock(0)->AppendOp();
op->SetType("sum");
op->SetInput("X", {"a"});
op->SetOutput("Out", {"b"});
op->SetAttr("op_role", 1);
op = prog.MutableBlock(0)->AppendOp();
op->SetType("dummy");
op->SetInput("X", {"c"});
op->SetOutput("Out", {"b"});
op->SetAttr("op_role", 1);
prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
ir::Node *control_dep1 = nullptr;
ir::Node *control_dep2 = nullptr;
for (ir::Node *n : g->Nodes()) {
if (n->Name() == "sum") {
ASSERT_EQ(n->outputs[0]->Name(), "b");
ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1]));
ASSERT_EQ(n->outputs.size(), 2);
control_dep1 = n->outputs[1];
}
if (n->Name() == "dummy") {
ASSERT_EQ(n->inputs[0]->Name(), "c");
ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
control_dep2 = n->inputs[1];
ASSERT_EQ(n->inputs.size(), 2);
ASSERT_EQ(control_dep1, control_dep2);
}
}
}
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/graph_traits.h"
namespace paddle {
namespace framework {
namespace ir {
//
// NodesDFSIterator
//
NodesDFSIterator::NodesDFSIterator(const std::vector<Node *> &source) {
for (auto *x : source) stack_.push(x);
}
NodesDFSIterator::NodesDFSIterator(NodesDFSIterator &&other) noexcept
: stack_(std::move(other.stack_)),
visited_(std::move(other.visited_)) {}
NodesDFSIterator::NodesDFSIterator(const NodesDFSIterator &other)
: stack_(other.stack_), visited_(other.visited_) {}
Node &NodesDFSIterator::operator*() {
PADDLE_ENFORCE(!stack_.empty());
return *stack_.top();
}
NodesDFSIterator &NodesDFSIterator::operator++() {
PADDLE_ENFORCE(!stack_.empty(), "the iterator exceeds range");
visited_.insert(stack_.top());
auto *cur = stack_.top();
stack_.pop();
for (auto *x : cur->outputs) {
if (!visited_.count(x)) {
stack_.push(x);
}
}
return *this;
}
bool NodesDFSIterator::operator==(const NodesDFSIterator &other) {
if (stack_.empty()) return other.stack_.empty();
if ((!stack_.empty()) && (!other.stack_.empty())) {
return stack_.top() == other.stack_.top();
}
return false;
}
NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) {
stack_ = other.stack_;
visited_ = other.visited_;
return *this;
}
Node *NodesDFSIterator::operator->() { return stack_.top(); }
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stack>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
namespace paddle {
namespace framework {
namespace ir {
template <typename IteratorT>
class iterator_range {
IteratorT begin_, end_;
public:
template <typename Container>
explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {}
iterator_range(const IteratorT &begin, const IteratorT &end)
: begin_(begin), end_(end) {}
const IteratorT &begin() const { return begin_; }
const IteratorT &end() const { return end_; }
};
// DFS iterator on nodes.
struct NodesDFSIterator
: public std::iterator<std::forward_iterator_tag, Node *> {
NodesDFSIterator() = default;
explicit NodesDFSIterator(const std::vector<Node *> &source);
NodesDFSIterator(NodesDFSIterator &&other) noexcept;
NodesDFSIterator(const NodesDFSIterator &other);
Node &operator*();
NodesDFSIterator &operator++();
// TODO(Superjomn) current implementation just compare the first
// element, need to compare the graph and all the elements in the queue and
// set.
NodesDFSIterator &operator=(const NodesDFSIterator &other);
bool operator==(const NodesDFSIterator &other);
bool operator!=(const NodesDFSIterator &other) { return !(*this == other); }
Node *operator->();
private:
std::stack<Node *> stack_;
std::unordered_set<Node *> visited_;
};
/*
* GraphTraits contains some graph traversal algorithms.
*
* Usage:
*
*/
struct GraphTraits {
static iterator_range<NodesDFSIterator> DFS(const Graph &g) {
auto start_points = ExtractStartPoints(g);
NodesDFSIterator x(start_points);
return iterator_range<NodesDFSIterator>(NodesDFSIterator(start_points),
NodesDFSIterator());
}
private:
// The nodes those have no input will be treated as start points.
static std::vector<Node *> ExtractStartPoints(const Graph &g) {
std::vector<Node *> result;
for (auto *node : g.Nodes()) {
if (node->inputs.empty()) {
result.push_back(node);
}
}
return result;
}
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -58,6 +58,9 @@ class Node {
return op_desc_;
}
bool IsOp() const { return type_ == Type::kOperation; }
bool IsVar() const { return type_ == Type::kVariable; }
std::vector<Node*> inputs;
std::vector<Node*> outputs;
......
......@@ -238,7 +238,20 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const {
}
}
int OpDesc::GetBlockAttr(const std::string &name) const {
std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
auto it = attrs_.find(name);
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
auto blocks = boost::get<std::vector<BlockDesc *>>(it->second);
std::vector<int> ids;
for (auto n : blocks) {
ids.push_back(n->ID());
}
return ids;
}
int OpDesc::GetBlockAttrId(const std::string &name) const {
auto it = attrs_.find(name);
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
return boost::get<BlockDesc *>(it->second)->ID();
......
......@@ -83,7 +83,9 @@ class OpDesc {
Attribute GetNullableAttr(const std::string &name) const;
int GetBlockAttr(const std::string &name) const;
int GetBlockAttrId(const std::string &name) const;
std::vector<int> GetBlocksAttrIds(const std::string &name) const;
void Rename(const std::string &old_name, const std::string &new_name);
......
......@@ -29,6 +29,13 @@ TEST(OpKernelType, ToString) {
ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
"data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type["
"CUDNN]");
using CUDAPlace = paddle::platform::CUDAPlace;
OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW,
LibraryType::kCUDNN);
ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2),
"data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_"
"type[CUDNN]");
}
TEST(OpKernelType, Hash) {
......
......@@ -40,6 +40,40 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
return OpProtoAndCheckerMaker::VariableBuilder{output};
}
void OpProtoAndCheckerMaker::Reuse(const std::string& name,
const std::string& reused_name) {
bool found = false;
proto::OpProto::Var* var;
for (auto& var : proto_->inputs()) {
if (var.name() == reused_name) {
found = true;
break;
}
}
PADDLE_ENFORCE(found == true,
"Input/Output name: %s reused_name: %s, one of them is not "
"exists or not matched.",
name, reused_name);
found = false;
for (int i = 0; i < proto_->outputs().size(); ++i) {
var = proto_->mutable_outputs()->Mutable(i);
if (var->name() == name) {
PADDLE_ENFORCE(!var->has_reuse(),
"Output(%s) has been set reused var of %s", name,
var->reuse());
found = true;
var->set_reuse(reused_name);
break;
}
}
PADDLE_ENFORCE(found == true,
"Input/Output name: %s reused_name: %s, one of them is not "
"exists or not matched.",
name, reused_name);
}
void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
std::unordered_set<std::string> names;
auto checker = [&](const std::string& name) {
......
......@@ -78,6 +78,8 @@ class OpProtoAndCheckerMaker {
VariableBuilder AddOutput(const std::string &name,
const std::string &comment);
void Reuse(const std::string &name, const std::string &reused_name);
template <typename T>
TypedAttrChecker<T> &AddAttr(const std::string &name,
const std::string &comment,
......
......@@ -49,6 +49,15 @@ TEST(ProtoMaker, DuplicatedInOut) {
}
class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "input of test op");
AddOutput("XOut", "output of test op").Reuse("X");
}
};
class TestInplaceProtoMaker2
: public paddle::framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "input of test op");
......@@ -58,12 +67,100 @@ class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
};
TEST(ProtoMaker, InplaceOutput) {
paddle::framework::proto::OpProto op_proto;
paddle::framework::proto::OpProto op_proto, op_proto2;
paddle::framework::OpAttrChecker op_checker;
TestInplaceProtoMaker proto_maker;
ASSERT_THROW(proto_maker(&op_proto, &op_checker),
TestInplaceProtoMaker2 proto_maker2;
proto_maker(&op_proto, &op_checker);
ASSERT_THROW(proto_maker2(&op_proto2, &op_checker),
paddle::platform::EnforceNotMet);
// proto_maker(&op_proto, &op_checker);
// proto_maker.Make();
// ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
}
// normal reuse
class TestReuseProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "input of test op");
AddInput("Y", "input of test op");
AddOutput("Out", "output of test op");
AddOutput("XOut", "output of test op");
// avoid destructor exception.
// Validate();
TestReuse();
}
virtual void TestReuse() {}
};
// test duplicate reuse error
class TestReuseProtoMaker2 : public TestReuseProtoMaker {
public:
void TestReuse() {
Reuse("Out", "X");
Reuse("Out", "Y");
}
};
// NotExists Input
class TestReuseProtoMaker3 : public TestReuseProtoMaker {
public:
void TestReuse() {
Reuse("Out", "NotExists");
Reuse("XOut", "X");
}
};
// NotExists Output
class TestReuseProtoMaker4 : public TestReuseProtoMaker {
public:
void TestReuse() { Reuse("NotExists", "X"); }
};
TEST(ProtoMaker, Reuse) {
paddle::framework::proto::OpProto op_proto;
paddle::framework::OpAttrChecker op_checker;
TestReuseProtoMaker proto_maker;
proto_maker(&op_proto, &op_checker);
}
// NOTE(dzhwinter):
// There is a Fatal CHECK on base class destructor, which will call abort inside
// instead of
// throw an exception. If we throw an exception in Make(), we will trigger the
// CHECK and terminate the tests.
//
// I had tried to replace the default CHECK with a exception, however, it's
// still not supported by glog.
// the details:
// https://github.com/google/glog/issues/249
// https://github.com/facebookresearch/TensorComprehensions/issues/351
/*
TEST(ProtoMaker, ReuseWithException) {
paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4;
paddle::framework::OpAttrChecker op_checker;
TestReuseProtoMaker2 proto_maker2;
TestReuseProtoMaker3 proto_maker3;
TestReuseProtoMaker4 proto_maker4;
EXPECT_THROW(proto_maker2(&op_proto2, &op_checker),
paddle::platform::EnforceNotMet);
EXPECT_THROW(proto_maker3(&op_proto3, &op_checker),
paddle::platform::EnforceNotMet);
EXPECT_THROW(proto_maker4(&op_proto4, &op_checker),
paddle::platform::EnforceNotMet);
}
void FailureFunction() {
throw std::runtime_error("Check failed in destructor.");
// return 0;
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
google::InstallFailureFunction(&FailureFunction);
return RUN_ALL_TESTS();
}
*/
......@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_type.h"
......@@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name,
}
if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>().dims();
const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) {
return DDim({-1});
}
return tensor.dims();
} else if (var->IsType<SelectedRows>()) {
if (get_actual_dim) {
return var->Get<SelectedRows>().value().dims();
......@@ -69,6 +74,26 @@ static DDim GetDims(const Scope& scope, const std::string& name,
}
}
static std::string GetDtype(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
return "";
}
if (var->IsType<LoDTensor>()) {
const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) {
return "";
}
return DataTypeToString(ToDataType(tensor.type()));
} else if (var->IsType<SelectedRows>()) {
return DataTypeToString(
ToDataType(var->Get<SelectedRows>().value().type()));
} else {
return "";
}
}
static int GetRowSize(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
......@@ -91,14 +116,18 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>().lod();
const LoDTensor& tensor = var->Get<LoDTensor>();
if (UNLIKELY(!tensor.IsInitialized())) {
return default_lod;
}
return tensor.lod();
} else {
return default_lod;
}
}
void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
VLOG(10) << "- " << DebugStringEx(&scope);
VLOG(4) << place << " " << DebugStringEx(&scope);
if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place);
......@@ -107,8 +136,10 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::SetDeviceId(dev_id);
#endif
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
VLOG(10) << "+ " << DebugStringEx(&scope);
VLOG(3) << place << " " << DebugStringEx(&scope);
}
bool OperatorBase::HasInputs(const std::string& name) const {
......@@ -172,6 +203,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
if (row_size >= 0) {
ss << "[row_size=" << row_size << "]";
}
std::string dtype = GetDtype(*scope, input.second[i]);
ss << ":" << dtype;
ss << "[" << GetDims(*scope, input.second[i], true) << "]";
ss << "(" << GetLoD(*scope, input.second[i]) << ")";
}
......@@ -608,9 +641,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
// For profiling, don't move out of this function because that will result
// in the failure of multi-GPU profiling.
platform::RecordEvent record_event(Type(), dev_ctx);
// check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels();
auto kernels_iter = all_op_kernels.find(type_);
......@@ -748,6 +778,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const {
auto& scope = ctx.scope();
int data_type = -1;
std::string last_input_name;
for (auto& input : this->inputs_) {
for (auto& ipt_name : input.second) {
auto* var = scope.FindVar(ipt_name);
......@@ -764,9 +795,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
int tmp = static_cast<int>(ToDataType(t->type()));
PADDLE_ENFORCE(
tmp == data_type || data_type == -1,
"DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
data_type, tmp);
"DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)",
Type(), last_input_name, data_type, ipt_name, tmp);
data_type = tmp;
last_input_name = ipt_name;
}
}
}
......
......@@ -25,9 +25,9 @@ limitations under the License. */
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/ssa_graph_checker.h"
#include "paddle/fluid/framework/details/ssa_graph_printer.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -57,39 +57,39 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
}
// Convert graph to run on multi-devices.
auto multi_device_pass =
ir::PassRegistry::Instance().Get("multi_device_pass");
multi_device_pass->SetNotOwned<const std::vector<platform::Place>>("places",
&places);
multi_device_pass->SetNotOwned<const std::string>("loss_var_name",
&loss_var_name);
multi_device_pass->SetNotOwned<const std::unordered_set<std::string>>(
auto multi_devices_pass =
ir::PassRegistry::Instance().Get("multi_devices_pass");
multi_devices_pass->SetNotOwned<const std::vector<platform::Place>>("places",
&places);
multi_devices_pass->SetNotOwned<const std::string>("loss_var_name",
&loss_var_name);
multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
"params", &param_names);
multi_device_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes);
multi_device_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
&local_scopes);
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
multi_device_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
multi_devices_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
graph = multi_device_pass->Apply(std::move(graph));
graph = multi_devices_pass->Apply(std::move(graph));
// Apply a graph print pass to record a graph with device info.
if (!strategy.debug_graphviz_path_.empty()) {
auto multi_device_print_pass =
ir::PassRegistry::Instance().Get("multi_device_print_pass");
multi_device_print_pass->SetNotOwned<const std::string>(
auto multi_devices_print_pass =
ir::PassRegistry::Instance().Get("multi_devices_print_pass");
multi_devices_print_pass->SetNotOwned<const std::string>(
"debug_graphviz_path", &strategy.debug_graphviz_path_);
multi_device_print_pass->Set<details::GraphvizSSAGraphPrinter>(
multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
"graph_printer", new details::GraphvizSSAGraphPrinter);
graph = multi_device_print_pass->Apply(std::move(graph));
graph = multi_devices_print_pass->Apply(std::move(graph));
}
// Verify that the graph is correct for multi-device executor.
auto multi_device_check_pass =
ir::PassRegistry::Instance().Get("multi_device_check_pass");
graph = multi_device_check_pass->Apply(std::move(graph));
auto multi_devices_check_pass =
ir::PassRegistry::Instance().Get("multi_devices_check_pass");
graph = multi_devices_check_pass->Apply(std::move(graph));
return graph;
}
......@@ -354,6 +354,6 @@ ParallelExecutor::~ParallelExecutor() {
} // namespace paddle
USE_PASS(graph_viz_pass);
USE_PASS(multi_device_pass);
USE_PASS(multi_device_check_pass);
USE_PASS(multi_device_print_pass);
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
......@@ -19,7 +19,7 @@ limitations under the License. */
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
......
......@@ -58,7 +58,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
for (const std::string &attr_name : op->AttrNames()) {
if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
int sub_block_id =
o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name);
o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
}
}
......
......@@ -112,5 +112,6 @@ Tensor& Tensor::Resize(const DDim& dims) {
const DDim& Tensor::dims() const { return dims_; }
int64_t Tensor::numel() const { return product(dims_); }
} // namespace framework
} // namespace paddle
......@@ -82,7 +82,7 @@ class Tensor {
template <typename T>
const T* data() const;
bool IsInitialized() const;
inline bool IsInitialized() const;
/**
* @brief Return a pointer to mutable memory block.
......
......@@ -59,6 +59,14 @@ inline T* Tensor::mutable_data(platform::Place place) {
}
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
int rank = src.dims().size();
PADDLE_ENFORCE_GE(
rank, 2,
"'ReshapeToMatrix()' is only used for flatten high rank "
"tensors to matrixs. Can not be used in reshaping vectors.");
if (rank == 2) {
return src;
}
Tensor res;
res.ShareDataWith(src);
res.Resize(flatten_to_2d(src.dims(), num_col_dims));
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/tensor.h"
#include <gtest/gtest.h>
#include <string>
#include "paddle/fluid/platform/float16.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
......@@ -213,3 +214,17 @@ TEST(Tensor, Layout) {
src.set_layout(framework::DataLayout::kAnyLayout);
ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
}
TEST(Tensor, FP16) {
using platform::float16;
framework::Tensor src;
float16* src_ptr = src.mutable_data<float16>({2, 3}, platform::CPUPlace());
for (int i = 0; i < 2 * 3; ++i) {
src_ptr[i] = static_cast<float16>(i);
}
EXPECT_EQ(src.memory_size(), 2 * 3 * sizeof(float16));
// EXPECT a human readable error message
// src.data<uint8_t>();
// Tensor holds the wrong type, it holds N6paddle8platform7float16E at
// [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43]
}
......@@ -20,6 +20,9 @@
DEFINE_int32(io_threadpool_size, 100,
"number of threads used for doing IO, default 100");
DEFINE_int32(dist_threadpool_size, 0,
"number of threads used for distributed executed.");
namespace paddle {
namespace framework {
......@@ -35,6 +38,10 @@ void ThreadPool::Init() {
if (threadpool_.get() == nullptr) {
// TODO(Yancey1989): specify the max threads number
int num_threads = std::thread::hardware_concurrency();
if (FLAGS_dist_threadpool_size > 0) {
num_threads = FLAGS_dist_threadpool_size;
VLOG(1) << "set dist_threadpool_size to " << num_threads;
}
PADDLE_ENFORCE_GT(num_threads, 0);
threadpool_.reset(new ThreadPool(num_threads));
}
......
......@@ -24,7 +24,7 @@
namespace paddle {
DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, true,
"Enable subgraph to TensorRT engine for acceleration");
DEFINE_string(inference_analysis_graphviz_log_root, "./",
......@@ -42,10 +42,19 @@ class DfgPassManagerImpl final : public DfgPassManager {
// TODO(Superjomn) set the key with pass reprs.
AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
auto trt_teller = [](const Node* node) {
auto trt_teller = [&](const Node* node) {
std::unordered_set<std::string> teller_set(
{"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax"});
if (!node->IsFunction()) return false;
return static_cast<const Function*>(node)->func_type() == "mul";
const auto* func = static_cast<const Function*>(node);
if (teller_set.count(func->func_type())) {
return true;
} else {
return false;
}
};
AddPass("tensorrt-subgraph-marker",
new TensorRTSubgraphNodeMarkPass(trt_teller));
AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
......
......@@ -23,6 +23,7 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
......
......@@ -337,6 +337,34 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
std::vector<Node *>(outputs.begin(), outputs.end()));
}
void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
std::vector<Node *> op_nodes;
for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
if (node.type() == Node::Type::kValue || node.deleted()) {
continue;
}
op_nodes.push_back(&node);
}
size_t op_num = op_nodes.size();
for (size_t i = 0; i < op_num; i++) {
if (op_nodes[i]->type() == Node::Type::kFunction) continue;
std::unordered_set<std::string> follow_up_input_names;
for (size_t j = i + 1; j < op_num; j++) {
for (auto *in : op_nodes[j]->inlinks) {
follow_up_input_names.insert(in->name());
}
}
std::vector<Node *> filtered_subgraph_outlinks;
for (auto *out : op_nodes[i]->outlinks) {
if (follow_up_input_names.count(out->name())) {
filtered_subgraph_outlinks.push_back(out);
}
}
PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL);
op_nodes[i]->outlinks = filtered_subgraph_outlinks;
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -176,8 +176,9 @@ struct GraphTraits<DataFlowGraph> {
// sub-graph is the inputs nodes and output nodes that doesn't inside the
// sub-graph.
std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph); // NOLINT
void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph);
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -23,7 +23,7 @@
namespace paddle {
namespace inference {
DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size");
DEFINE_int32(tensorrt_max_batchsize, 3, "TensorRT maximum batch size");
DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
namespace analysis {
......@@ -52,6 +52,7 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
bool DataFlowGraphToFluidPass::Finalize() { return true; }
void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
FilterRedundantOutputOfSubGraph(graph);
LOG(INFO) << "graph.inputs " << graph->inputs.size();
for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
if (node.deleted()) continue;
......@@ -87,34 +88,113 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
}
void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
const framework::proto::BlockDesc &block) {
framework::proto::BlockDesc *block) {
static int counter{0};
PADDLE_ENFORCE(node->IsFunctionBlock());
framework::OpDesc desc;
auto *func = static_cast<FunctionBlock *>(node);
// collect inputs
std::vector<std::string> io;
std::unordered_set<std::string> input_names;
for (auto *x : func->inlinks) {
io.push_back(x->name());
input_names.insert(x->name());
}
desc.SetInput("Xs", io);
desc.SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
// collect outputs
io.clear();
std::unordered_set<std::string> output_names;
for (auto *x : func->outlinks) {
io.push_back(x->name());
output_names.insert(x->name());
}
desc.SetOutput("Ys", io);
std::vector<std::string> output_temp(output_names.begin(),
output_names.end());
desc.SetOutput("Ys", output_temp);
desc.SetType("tensorrt_engine");
PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc");
std::unordered_map<std::string, std::string> output_name_map;
// The following procedure is used to rename all the intermediate
// variables and the output variables of the subgraph.
// Why we do this?
// During the transition from fluid OP to tensorrt OP, we map
// the input and output Tensor(fluid data structure) of fluid OP
// to the correspondin ITensor (trt data structure) through the
// Tensor name. When we set up ITensor for an variable, we must
// ensure that it has not been set before.
// If there is variable in the fluid graph, which is not only the
// input of a OP, but also the output of a Op, there will be problems.
// So we have to rename the variable in the subgraph to make sure
// it is either an OP's input or an OP's output.
auto subgraph_nodes = func->subgraph;
for (int index = 0; index < block->ops_size(); index++) {
framework::proto::OpDesc *op = block->mutable_ops(index);
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->name(), op->type());
std::unordered_map<std::string, size_t> var2id;
for (auto *in_var : correspond_node->inlinks) {
var2id[in_var->name()] = in_var->id();
}
// rename for the input variables of op inside subgraph
for (int i = 0; i < op->inputs_size(); i++) {
framework::proto::OpDesc_Var *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) {
std::string arg_value = in_var->arguments(k);
if (input_names.count(arg_value)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value +
std::to_string(var2id[arg_value]));
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outlinks) {
var2id[out_var->name()] = out_var->id();
}
// rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
if (output_names.count(arg_value)) {
output_name_map[arg_value] =
arg_value + std::to_string(var2id[arg_value]);
}
replaced_names.push_back(arg_value + std::to_string(var2id[arg_value]));
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
}
// When tensorrt engine runs at the end of the operation,
// output_mapping help us copy the data from the renamed ITensor
// to Tensor.
std::vector<std::string> output_mapping;
for (auto name : output_names) {
PADDLE_ENFORCE(output_name_map.count(name) != 0);
output_mapping.push_back(output_name_map[name]);
}
PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
// Set attrs
SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
node->SetPbMsg(desc.Proto()->SerializeAsString());
}
......@@ -146,15 +226,17 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
LOG(INFO) << "transformed variable size: "
<< block_desc.Proto()->vars().size();
// copy ops.
for (auto *node : block_node->subgraph) {
auto *op = block_desc.AppendOp();
PADDLE_ENFORCE(!node->pb_msg().empty());
op->Proto()->ParseFromString(node->pb_msg());
}
*block_desc.Proto()->mutable_vars() =
argument_->origin_program_desc->blocks(0).vars();
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto());
auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto *op = main_block->add_ops();
PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
......
......@@ -46,9 +46,9 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
for (size_t i = 0; i < graph->nodes.size(); i++) {
const Node &node = graph->nodes.Get(i);
if (!config_.display_deleted_node && node.deleted()) continue;
for (auto &in : node.inlinks) {
if (!config_.display_deleted_node && in->deleted()) continue;
dot.AddEdge(in->repr(), node.repr(), {});
for (auto &out : node.outlinks) {
if (!config_.display_deleted_node && out->deleted()) continue;
dot.AddEdge(node.repr(), out->repr(), {});
}
}
return dot.Build();
......
......@@ -12,11 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/model_store_pass.h"
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/argument.h"
#include "paddle/fluid/inference/analysis/model_store_pass.h"
namespace paddle {
namespace inference {
......
......@@ -17,6 +17,8 @@
* model in the disk, and that model can be reloaded for prediction.
*/
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/pass.h"
namespace paddle {
......
......@@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
std::vector<Node *> marked_nodes;
for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes_in_TS()) {
if (node.attr(kMarkerAttrName).Bool()) {
marked_nodes.push_back(&node);
}
......
......@@ -19,6 +19,7 @@ endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api)
if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
endif()
......@@ -44,7 +45,6 @@ endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor)
cc_test(test_paddle_inference_api
SRCS api_tester.cc
DEPS paddle_inference_api)
......@@ -60,20 +60,19 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
endif()
if (WITH_ANAKIN) # only needed in CI
# Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
# so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
# compile the libinference_anakin_api.a and anakin.so.
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
target_link_libraries(inference_anakin_api anakin anakin_saber_common)
target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
#nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)
function(anakin_target target_name)
target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endfunction()
anakin_target(inference_anakin_api)
#anakin_target(inference_anakin_api_shared)
if (WITH_TESTING)
cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api)
ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
DEPS inference_anakin_api dynload_cuda SERIAL)
target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif(WITH_TESTING)
endif(WITH_TESTING)
endif()
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
......@@ -40,19 +41,36 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
if (!other.memory_owned_) {
data_ = other.data_;
length_ = other.length_;
memory_owned_ = other.memory_owned_;
} else {
Resize(other.length());
memcpy(data_, other.data(), other.length());
length_ = other.length();
memory_owned_ = true;
}
return *this;
}
PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
// only the buffer with external memory can be copied
assert(!other.memory_owned_);
data_ = other.data_;
length_ = other.length_;
memory_owned_ = other.memory_owned_;
other.data_ = nullptr;
other.length_ = 0;
other.memory_owned_ = false;
return *this;
}
void PaddleBuf::Resize(size_t length) {
// Only the owned memory can be reset, the external memory can't be changed.
if (length_ == length) return;
assert(memory_owned_);
Free();
if (memory_owned_) {
Free();
}
data_ = new char[length];
length_ = length;
memory_owned_ = true;
......@@ -68,7 +86,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
void PaddleBuf::Free() {
if (memory_owned_ && data_) {
assert(length_ > 0);
delete static_cast<char*>(data_);
delete[] static_cast<char*>(data_);
data_ = nullptr;
length_ = 0;
}
......
......@@ -18,26 +18,36 @@
namespace paddle {
PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
template <typename Target>
PaddleInferenceAnakinPredictor<Target>::PaddleInferenceAnakinPredictor(
const AnakinConfig &config) {
CHECK(Init(config));
}
bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
template <typename Target>
bool PaddleInferenceAnakinPredictor<Target>::Init(const AnakinConfig &config) {
if (!(graph_.load(config.model_file))) {
LOG(FATAL) << "fail to load graph from " << config.model_file;
return false;
}
graph_.ResetBatchSize("input_0", config.max_batch_size);
auto inputs = graph_.get_ins();
for (auto &input_str : inputs) {
graph_.ResetBatchSize(input_str, config.max_batch_size);
}
// optimization for graph
if (!(graph_.Optimize())) {
return false;
}
// construct executer
executor_.init(graph_);
if (executor_p_ == nullptr) {
executor_p_ = new anakin::Net<Target, anakin::saber::AK_FLOAT,
anakin::Precision::FP32>(graph_, true);
}
return true;
}
bool PaddleInferenceAnakinPredictor::Run(
template <typename Target>
bool PaddleInferenceAnakinPredictor<Target>::Run(
const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data, int batch_size) {
for (const auto &input : inputs) {
......@@ -46,7 +56,29 @@ bool PaddleInferenceAnakinPredictor::Run(
<< "'s type is not float";
return false;
}
auto d_tensor_in_p = executor_.get_in(input.name);
auto d_tensor_in_p = executor_p_->get_in(input.name);
auto net_shape = d_tensor_in_p->valid_shape();
if (net_shape.size() != input.shape.size()) {
LOG(ERROR) << " input " << input.name
<< "'s shape size should be equal to that of net";
return false;
}
int sum = 1;
for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; });
if (sum > net_shape.count()) {
graph_.Reshape(input.name, input.shape);
delete executor_p_;
executor_p_ = new anakin::Net<Target, anakin::saber::AK_FLOAT,
anakin::Precision::FP32>(graph_, true);
d_tensor_in_p = executor_p_->get_in(input.name);
}
anakin::saber::Shape tmp_shape;
for (auto s : input.shape) {
tmp_shape.push_back(s);
}
d_tensor_in_p->reshape(tmp_shape);
float *d_data_p = d_tensor_in_p->mutable_data();
if (cudaMemcpy(d_data_p, static_cast<float *>(input.data.data()),
d_tensor_in_p->valid_size() * sizeof(float),
......@@ -56,16 +88,17 @@ bool PaddleInferenceAnakinPredictor::Run(
}
cudaStreamSynchronize(NULL);
}
executor_.prediction();
cudaDeviceSynchronize();
executor_p_->prediction();
cudaDeviceSynchronize();
if (output_data->empty()) {
LOG(ERROR) << "At least one output should be set with tensors' names.";
return false;
}
for (auto &output : *output_data) {
auto *tensor = executor_.get_out(output.name);
output.shape = tensor->shape();
auto *tensor = executor_p_->get_out(output.name);
output.shape = tensor->valid_shape();
if (output.data.length() < tensor->valid_size() * sizeof(float)) {
output.data.Resize(tensor->valid_size() * sizeof(float));
}
......@@ -81,19 +114,23 @@ bool PaddleInferenceAnakinPredictor::Run(
return true;
}
anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
&PaddleInferenceAnakinPredictor::get_executer() {
return executor_;
template <typename Target>
anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
&PaddleInferenceAnakinPredictor<Target>::get_executer() {
return *executor_p_;
}
// the cloned new Predictor of anakin share the same net weights from original
// Predictor
std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
template <typename Target>
std::unique_ptr<PaddlePredictor>
PaddleInferenceAnakinPredictor<Target>::Clone() {
VLOG(3) << "Anakin Predictor::clone";
std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
std::unique_ptr<PaddlePredictor> cls(
new PaddleInferenceAnakinPredictor<Target>());
// construct executer from other graph
auto anakin_predictor_p =
dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
dynamic_cast<PaddleInferenceAnakinPredictor<Target> *>(cls.get());
if (!anakin_predictor_p) {
LOG(ERROR) << "fail to call Init";
return nullptr;
......@@ -103,14 +140,28 @@ std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
return std::move(cls);
}
template class PaddleInferenceAnakinPredictor<anakin::NV>;
template class PaddleInferenceAnakinPredictor<anakin::X86>;
// A factory to help create difference predictor.
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
VLOG(3) << "Anakin Predictor create.";
std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor(config));
return x;
}
if (config.target_type == AnakinConfig::NVGPU) {
VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor<anakin::NV>(config));
return x;
} else if (config.target_type == AnakinConfig::X86) {
VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor<anakin::X86>(config));
return x;
} else {
VLOG(3) << "Anakin Predictor create on unknown platform.";
return nullptr;
}
};
} // namespace paddle
......@@ -20,14 +20,16 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
// from anakin
#include "framework/core/net/net.h"
#include "framework/graph/graph.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "saber/core/shape.h"
#include "saber/saber_types.h"
namespace paddle {
template <typename Target>
class PaddleInferenceAnakinPredictor : public PaddlePredictor {
public:
PaddleInferenceAnakinPredictor() {}
......@@ -42,19 +44,21 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
std::unique_ptr<PaddlePredictor> Clone() override;
anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
get_executer();
~PaddleInferenceAnakinPredictor() override{};
~PaddleInferenceAnakinPredictor() override {
delete executor_p_;
executor_p_ = nullptr;
};
private:
bool Init(const AnakinConfig& config);
anakin::graph::Graph<anakin::NV, anakin::saber::AK_FLOAT,
anakin::Precision::FP32>
anakin::graph::Graph<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
graph_;
anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
executor_;
anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>*
executor_p_{nullptr};
AnakinConfig config_;
};
......
......@@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string(model, "", "Directory of the inference model.");
DEFINE_string(model, "", "Directory of the inference model(mobile_v2).");
namespace paddle {
AnakinConfig GetConfig() {
AnakinConfig config;
// using AnakinConfig::X86 if you need to use cpu to do inference
config.target_type = AnakinConfig::NVGPU;
config.model_file = FLAGS_model;
config.device = 0;
config.max_batch_size = 1;
......@@ -36,7 +38,6 @@ TEST(inference, anakin) {
CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
float data[1 * 3 * 224 * 224] = {1.0f};
PaddleTensor tensor;
tensor.name = "input_0";
tensor.shape = std::vector<int>({1, 3, 224, 224});
......@@ -44,22 +45,20 @@ TEST(inference, anakin) {
tensor.dtype = PaddleDType::FLOAT32;
// For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> paddle_tensor_feeds;
paddle_tensor_feeds.emplace_back(std::move(tensor));
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
PaddleTensor tensor_out;
tensor_out.name = "prob_out";
tensor_out.shape = std::vector<int>({1000, 1});
tensor_out.shape = std::vector<int>({});
tensor_out.data = PaddleBuf();
tensor_out.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> outputs;
outputs.emplace_back(std::move(tensor_out));
std::vector<PaddleTensor> outputs(1, tensor_out);
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
float* data_o = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < 1000; ++j) {
for (size_t j = 0; j < outputs[0].data.length(); ++j) {
LOG(INFO) << "output[" << j << "]: " << data_o[j];
}
}
......
......@@ -22,6 +22,9 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(profile, false, "Turn on profiler for fluid");
namespace paddle {
namespace {
......@@ -58,6 +61,15 @@ bool NativePaddlePredictor::Init(
std::shared_ptr<framework::Scope> parent_scope) {
VLOG(3) << "Predictor::init()";
if (FLAGS_profile) {
LOG(WARNING) << "Profiler is actived, might affect the performance";
LOG(INFO) << "You can turn off by set gflags '-profile false'";
auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
: platform::ProfilerState::kCPU;
platform::EnableProfiler(tracking_device);
}
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
......@@ -102,6 +114,10 @@ bool NativePaddlePredictor::Init(
}
NativePaddlePredictor::~NativePaddlePredictor() {
if (FLAGS_profile) {
platform::DisableProfiler(platform::EventSortingKey::kTotal,
"./profile.log");
}
if (sub_scope_) {
scope_->DeleteScope(sub_scope_);
}
......
......@@ -20,8 +20,8 @@ limitations under the License. */
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/demo_ci/utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "utils.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
......
......@@ -40,11 +40,12 @@ class PaddleBuf {
// Copy only available when memory is managed externally.
explicit PaddleBuf(const PaddleBuf&);
PaddleBuf& operator=(const PaddleBuf&);
PaddleBuf& operator=(PaddleBuf&&);
// Do not own the memory.
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Own memory.
PaddleBuf(size_t length)
explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes.
void Resize(size_t length);
......@@ -126,9 +127,11 @@ struct NativeConfig : public PaddlePredictor::Config {
// Configurations for Anakin engine.
struct AnakinConfig : public PaddlePredictor::Config {
enum TargetType { NVGPU = 0, X86 };
int device;
std::string model_file;
int max_batch_size{-1};
TargetType target_type;
};
struct TensorRTConfig : public NativeConfig {
......
# Add TRT tests
nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
activation_op.cc softmax_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS
......@@ -13,6 +14,13 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
......@@ -20,11 +20,60 @@ namespace tensorrt {
class Conv2dOpConverter : public OpConverter {
public:
Conv2dOpConverter() {}
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
LOG(INFO)
<< "convert a fluid conv2d op to tensorrt conv layer without bias";
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1); // Y is a weight
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
auto* X = engine_->GetITensor(op_desc.Input("Input").front());
// Declare weights
auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL);
const int n_output = Y_t->dims()[0];
const int filter_h = Y_t->dims()[2];
const int filter_w = Y_t->dims()[3];
const int groups = boost::get<int>(op_desc.GetAttr("groups"));
const std::vector<int> dilations =
boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
Y_t->memory_size() / sizeof(float)};
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
auto* layer = TRT_ENGINE_ADD_LAYER(
engine_, Convolution, *const_cast<nvinfer1::ITensor*>(X), n_output,
nv_ksize, weight.get(), bias.get());
PADDLE_ENFORCE(layer != nullptr);
layer->setStride(nv_strides);
layer->setPadding(nv_paddings);
layer->setDilation(nv_dilations);
layer->setNbGroups(groups);
auto output_name = op_desc.Output("Output").front();
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
engine_->DeclareOutput(output_name);
}
}
};
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class ElementwiseWeightOpConverter : public OpConverter {
public:
ElementwiseWeightOpConverter() {}
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
framework::OpDesc op_desc(op, nullptr);
LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer";
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto* X = engine_->GetITensor(op_desc.Input("X").front());
nvinfer1::Dims dims_x = X->getDimensions();
PADDLE_ENFORCE(dims_x.nbDims >= 3);
auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
std::vector<int> dims_y = framework::vectorize2int(Y_t->dims());
if (static_cast<int>(dims_y.size()) == dims_x.nbDims + 1) {
if (dims_y[0] == 1) dims_y.erase(dims_y.begin());
}
if (static_cast<int>(dims_y.size()) == 1 && dims_y[0] == dims_x.d[0]) {
scale_mode = nvinfer1::ScaleMode::kCHANNEL;
} else if (static_cast<int>(dims_y.size()) == dims_x.nbDims &&
dims_y[0] == dims_x.d[0]) {
scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
for (int i = 1; i < dims_x.nbDims; i++) {
if (dims_y[i] != dims_x.d[i]) {
scale_mode = nvinfer1::ScaleMode::kCHANNEL;
break;
}
}
if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
for (int i = 1; i < dims_x.nbDims; i++) {
if (dims_y[i] != 1)
PADDLE_THROW(
"TensorRT unsupported weight shape for Elementwise op!");
}
}
} else {
PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!");
}
TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
Y_t->memory_size() / sizeof(float)};
TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr,
0};
TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
0};
nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *const_cast<nvinfer1::ITensor*>(X), scale_mode,
shift_weights.get(), scale_weights.get(), power_weights.get());
auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
};
class ElementwiseTensorOpConverter : public OpConverter {
public:
ElementwiseTensorOpConverter() {}
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
framework::OpDesc op_desc(op, nullptr);
LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer";
PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto* X = engine_->GetITensor(op_desc.Input("X").front());
auto* Y = engine_->GetITensor(op_desc.Input("Y").front());
nvinfer1::Dims dims_x = X->getDimensions();
nvinfer1::Dims dims_y = Y->getDimensions();
// The two input tensor should have the same dims
PADDLE_ENFORCE(dims_x.nbDims >= 3);
if (dims_x.nbDims == dims_y.nbDims) {
for (int i = 0; i < dims_x.nbDims; i++) {
if (dims_x.d[i] != dims_y.d[i])
PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
}
} else {
PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
}
auto op_pair = ops.find(op_type_);
if (op_pair == ops.end()) {
PADDLE_THROW("Wrong elementwise op type!");
}
nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
*const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
protected:
static const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
ops;
std::string op_type_;
};
const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
ElementwiseTensorOpConverter::ops = {
{"add", nvinfer1::ElementWiseOperation::kSUM},
{"mul", nvinfer1::ElementWiseOperation::kPROD},
{"sub", nvinfer1::ElementWiseOperation::kSUB},
{"div", nvinfer1::ElementWiseOperation::kDIV},
{"min", nvinfer1::ElementWiseOperation::kMIN},
{"pow", nvinfer1::ElementWiseOperation::kPOW},
{"max", nvinfer1::ElementWiseOperation::kMAX},
};
class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
};
class ElementwiseTensorMulOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorMulOpConverter() { op_type_ = "mul"; }
};
class ElementwiseTensorSubOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorSubOpConverter() { op_type_ = "sub"; }
};
class ElementwiseTensorDivOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorDivOpConverter() { op_type_ = "div"; }
};
class ElementwiseTensorMinOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorMinOpConverter() { op_type_ = "min"; }
};
class ElementwiseTensorMaxOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorMaxOpConverter() { op_type_ = "max"; }
};
class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
public:
ElementwiseTensorPowOpConverter() { op_type_ = "pow"; }
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor,
ElementwiseTensorAddOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_sub_tensor,
ElementwiseTensorSubOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_div_tensor,
ElementwiseTensorDivOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_mul_tensor,
ElementwiseTensorMulOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_max_tensor,
ElementwiseTensorMaxOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_min_tensor,
ElementwiseTensorMinOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_pow_tensor,
ElementwiseTensorPowOpConverter);
......@@ -38,7 +38,7 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
}
// indata c * k
// Reorder the data layout from CK to KC.
void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
void ReorderCKtoKC(TensorRTEngine::Weight& iweights, // NOLINT
TensorRTEngine::Weight* oweights) {
int c = iweights.dims[0];
int k = iweights.dims[1];
......
......@@ -55,6 +55,32 @@ class OpConverter {
it = Registry<OpConverter>::Lookup("fc");
}
}
if (op_desc.Type().find("elementwise") != std::string::npos) {
static std::unordered_set<std::string> add_tensor_op_set{
"add", "mul", "sub", "div", "max", "min", "pow"};
// TODO(xingzhaolong): all mul, sub, div
// static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
// "sub", "div"};
static std::unordered_set<std::string> add_weight_op_set{"add"};
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
int op_type_len = op_desc.Type().size();
std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len);
std::string Y = op_desc.Input("Y")[0];
if (parameters.count(Y)) {
PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0,
"Unsupported elementwise type" + op_type);
it =
Registry<OpConverter>::Lookup("elementwise_" + op_type + "_weight");
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type());
} else {
PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
"Unsupported elementwise type" + op_type);
it =
Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor");
}
}
if (!it) {
it = Registry<OpConverter>::Lookup(op_desc.Type());
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* SoftMaxOp, ISoftMaxLayer in TRT. This Layer doesn't has weights.
*/
class SoftMaxOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4)
<< "convert a fluid softmax op to tensorrt softmax layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, SoftMax,
*const_cast<nvinfer1::ITensor*>(input1));
auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
engine_->DeclareOutput(output_name);
}
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(softmax);
REGISTER_TRT_OP_CONVERTER(softmax, SoftMaxOpConverter);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(conv2d_op, test) {
std::unordered_set<std::string> parameters({"conv2d-Y"});
framework::Scope scope;
TRTConvertValidation validator(5, parameters, scope, 1 << 15);
validator.DeclInputVar("conv2d-X", nvinfer1::Dims3(2, 5, 5));
validator.DeclParamVar("conv2d-Y", nvinfer1::Dims4(3, 2, 3, 3));
validator.DeclOutputVar("conv2d-Out", nvinfer1::Dims3(3, 5, 5));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("conv2d");
desc.SetInput("Input", {"conv2d-X"});
desc.SetInput("Filter", {"conv2d-Y"});
desc.SetOutput("Output", {"conv2d-Out"});
const std::vector<int> strides({1, 1});
const std::vector<int> paddings({1, 1});
const std::vector<int> dilations({1, 1});
const int groups = 1;
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
desc.SetAttr("dilations", dilations);
desc.SetAttr("groups", groups);
validator.SetOp(*desc.Proto());
validator.Execute(3);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(conv2d);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(elementwise_op, add_weight_test) {
std::unordered_set<std::string> parameters({"elementwise_add-Y"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1 << 15);
validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1));
// validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("elementwise_add");
desc.SetInput("X", {"elementwise_add-X"});
desc.SetInput("Y", {"elementwise_add-Y"});
desc.SetOutput("Out", {"elementwise_add-Out"});
int axis = 1;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
validator.Execute(8);
}
TEST(elementwise_op, add_tensor_test) {
std::unordered_set<std::string> parameters;
framework::Scope scope;
TRTConvertValidation validator(8, parameters, scope, 1 << 15);
validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3));
// validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("elementwise_add");
desc.SetInput("X", {"elementwise_add-X"});
desc.SetInput("Y", {"elementwise_add-Y"});
desc.SetOutput("Out", {"elementwise_add-Out"});
// the defalut axis of elementwise op is -1
validator.SetOp(*desc.Proto());
validator.Execute(8);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(elementwise_add);
......@@ -25,12 +25,42 @@ TEST(OpConverter, ConvertBlock) {
framework::ProgramDesc prog;
auto* block = prog.MutableBlock(0);
auto* conv2d_op = block->AppendOp();
// init trt engine
cudaStream_t stream_;
std::unique_ptr<TensorRTEngine> engine_;
engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_));
engine_->InitNetwork();
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3(2, 5, 5));
conv2d_op->SetType("conv2d");
conv2d_op->SetInput("Input", {"conv2d-X"});
conv2d_op->SetInput("Filter", {"conv2d-Y"});
conv2d_op->SetOutput("Output", {"conv2d-Out"});
OpConverter converter;
const std::vector<int> strides({1, 1});
const std::vector<int> paddings({1, 1});
const std::vector<int> dilations({1, 1});
const int groups = 1;
conv2d_op->SetAttr("strides", strides);
conv2d_op->SetAttr("paddings", paddings);
conv2d_op->SetAttr("dilations", dilations);
conv2d_op->SetAttr("groups", groups);
// init scope
framework::Scope scope;
converter.ConvertBlock(*block->Proto(), {}, scope,
nullptr /*TensorRTEngine*/);
std::vector<int> dim_vec = {3, 2, 3, 3};
auto* x = scope.Var("conv2d-Y");
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
OpConverter converter;
converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope,
engine_.get() /*TensorRTEngine*/);
}
} // namespace tensorrt
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(SoftMaxOpConverter, main) {
framework::Scope scope;
std::unordered_set<std::string> parameters;
TRTConvertValidation validator(8, parameters, scope, 1000);
std::vector<int> tensor_shape{8, 10};
validator.DeclInputVar("softmax-X", tensor_shape,
nvinfer1::DimsCHW(10, 1, 1));
validator.DeclOutputVar("softmax-Out", nvinfer1::DimsCHW(10, 1, 1));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("softmax");
desc.SetInput("X", {"softmax-X"});
desc.SetOutput("Out", {"softmax-Out"});
LOG(INFO) << "set OP";
validator.SetOp(*desc.Proto());
LOG(INFO) << "execute";
validator.Execute(3);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(softmax);
......@@ -79,6 +79,12 @@ class TRTConvertValidation {
}
// Declare a Variable as input with random initialization.
void DeclInputVar(const std::string& name, const std::vector<int> tensor_dims,
const nvinfer1::Dims& trt_dims) {
DeclVar(name, tensor_dims);
engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, trt_dims);
}
void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) {
DeclVar(name, dims);
// Declare TRT inputs.
......@@ -94,12 +100,18 @@ class TRTConvertValidation {
DeclVar(name, dims);
}
// Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
bool is_param = false) {
void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place, ctx);
}
// Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
bool is_param = false) {
// Init Fluid tensor.
std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
// There is no batchsize in ITensor's shape, but We should add it to
......@@ -107,10 +119,8 @@ class TRTConvertValidation {
// if_add_batch_ flag is true, add the max batchsize to dim_vec.
if (is_param != true && if_add_batch_ == true)
dim_vec.insert(dim_vec.begin(), max_batch_size_);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place, ctx);
DeclVar(name, dim_vec);
}
void SetOp(const framework::proto::OpDesc& desc) {
......@@ -149,7 +159,7 @@ class TRTConvertValidation {
cudaStreamSynchronize(*engine_->stream());
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
const size_t output_space_size = 2000;
const size_t output_space_size = 3000;
for (const auto& output : op_desc_->OutputArgumentNames()) {
std::vector<float> fluid_out;
std::vector<float> trt_out(output_space_size);
......
......@@ -170,6 +170,9 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
elseif(${TARGET} STREQUAL "fc")
# HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif()
......@@ -235,7 +238,12 @@ else()
endif()
op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
if(WITH_GPU)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax cub)
else()
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
endif()
op_library(softmax_op DEPS softmax)
op_library(sequence_softmax_op DEPS softmax)
if (WITH_GPU AND TENSORRT_FOUND)
......@@ -273,9 +281,9 @@ op_library(squeeze_op DEPS reshape_op)
op_library(extract_rows_op DEPS memory)
op_library(flatten_op DEPS reshape_op)
if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col)
op_library(layer_norm_op DEPS cub)
else()
op_library(conv_op DEPS vol2col im2col)
endif()
......@@ -295,12 +303,6 @@ op_library(channel_recv_op DEPS concurrency)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF
# Because the fully connected layer has only one MKLDNN's operator
if(NOT WITH_MKLDNN)
list(REMOVE_ITEM GENERAL_OPS fc_op)
endif(NOT WITH_MKLDNN)
foreach(src ${GENERAL_OPS})
op_library(${src})
endforeach()
......
......@@ -26,6 +26,8 @@ namespace plat = paddle::platform;
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>);
ops::grad_functor<double>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
......@@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
const Out out_conj = Eigen::numext::conj(out);
dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
dx.device(d) = static_cast<T>(0.5) * dout / out;
}
};
......@@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(factor) *
x.pow(static_cast<T>(factor - static_cast<T>(1)));
x.pow(static_cast<T>(factor) - static_cast<T>(1));
}
};
......@@ -863,10 +862,11 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
T b = static_cast<T>(beta);
auto temp1 = static_cast<T>(1) /
(static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
dx.device(d) = dout * ((beta * out) + temp2);
(static_cast<T>(1) + (static_cast<T>(-b) * x).exp());
auto temp2 = temp1 * (static_cast<T>(1) - (b * out));
dx.device(d) = dout * ((b * out) + temp2);
}
};
......
......@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/assign_value_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
ops::AssignValueKernel<float>);
ops::AssignValueKernel<float>,
ops::AssignValueKernel<plat::float16>);
......@@ -20,10 +20,10 @@ limitations under the License. */
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
DEFINE_bool(cudnn_deterministic, true,
DEFINE_bool(cudnn_deterministic, false,
"Whether allow using an autotuning algorithm for convolution "
"operator. The autotuning algorithm may be non-deterministic. If "
"false, the algorithm is deterministic.");
"true, the algorithm is deterministic.");
namespace paddle {
namespace operators {
......@@ -39,6 +39,27 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
static_cast<size_t>(1024) * 1024 * 1024;
template <typename T, typename DeviceContext>
// bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx,
bool EnableFp16(const DeviceContext& dev_ctx,
cudnnConvolutionDescriptor_t cudnn_conv_desc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
return true;
} else {
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
}
#endif
return false;
}
template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> {
public:
......@@ -128,27 +149,14 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnnConvolutionFwdAlgo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
// Currently tensor core is only enabled using this algo
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
}
#endif
// get workspace size able to allocate
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
......@@ -272,7 +280,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
if (input_grad) {
if (FLAGS_cudnn_deterministic) {
if (!FLAGS_cudnn_deterministic) {
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
handle, cudnn_filter_desc,
......@@ -288,6 +296,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} else {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
......@@ -297,7 +308,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
if (filter_grad) {
if (FLAGS_cudnn_deterministic) {
if (!FLAGS_cudnn_deterministic) {
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
handle, cudnn_input_desc, cudnn_output_grad_desc,
......@@ -307,6 +318,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} else {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
}
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
}
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
......@@ -362,7 +376,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>);
paddle::operators::CUDNNConvGradOpKernel<double>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
......@@ -370,4 +385,5 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>);
paddle::operators::CUDNNConvGradOpKernel<double>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>)
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册