Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
557229bd
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
557229bd
编写于
11月 26, 2018
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'ups/develop' into fea/jit/rnn
上级
6a7f83d4
6224e61f
变更
79
隐藏空白更改
内联
并排
Showing
79 changed file
with
1943 addition
and
359 deletion
+1943
-359
CMakeLists.txt
CMakeLists.txt
+1
-0
Dockerfile
Dockerfile
+39
-0
cmake/external/eigen.cmake
cmake/external/eigen.cmake
+1
-1
cmake/external/mkldnn.cmake
cmake/external/mkldnn.cmake
+1
-1
cmake/external/rocprim.cmake
cmake/external/rocprim.cmake
+44
-0
cmake/flags.cmake
cmake/flags.cmake
+3
-0
cmake/generic.cmake
cmake/generic.cmake
+15
-11
cmake/hip.cmake
cmake/hip.cmake
+27
-5
cmake/operators.cmake
cmake/operators.cmake
+2
-1
paddle/fluid/API.spec
paddle/fluid/API.spec
+2
-2
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+7
-1
paddle/fluid/framework/executor.cc
paddle/fluid/framework/executor.cc
+3
-2
paddle/fluid/framework/naive_executor.cc
paddle/fluid/framework/naive_executor.cc
+1
-0
paddle/fluid/framework/op_desc.cc
paddle/fluid/framework/op_desc.cc
+6
-0
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+17
-32
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+4
-0
paddle/fluid/framework/transfer_scope_cache.cc
paddle/fluid/framework/transfer_scope_cache.cc
+62
-0
paddle/fluid/framework/transfer_scope_cache.h
paddle/fluid/framework/transfer_scope_cache.h
+41
-0
paddle/fluid/inference/CMakeLists.txt
paddle/fluid/inference/CMakeLists.txt
+1
-0
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+3
-1
paddle/fluid/inference/api/analysis_config.cc
paddle/fluid/inference/api/analysis_config.cc
+2
-0
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+27
-8
paddle/fluid/inference/api/analysis_predictor.h
paddle/fluid/inference/api/analysis_predictor.h
+2
-0
paddle/fluid/inference/api/api_impl.cc
paddle/fluid/inference/api/api_impl.cc
+20
-8
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+0
-2
paddle/fluid/inference/api/paddle_analysis_config.h
paddle/fluid/inference/api/paddle_analysis_config.h
+1
-1
paddle/fluid/inference/api/paddle_api.h
paddle/fluid/inference/api/paddle_api.h
+13
-0
paddle/fluid/inference/tensorrt/convert/split_op.cc
paddle/fluid/inference/tensorrt/convert/split_op.cc
+2
-10
paddle/fluid/inference/tensorrt/convert/test_split_op.cc
paddle/fluid/inference/tensorrt/convert/test_split_op.cc
+75
-13
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+127
-31
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+7
-2
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+25
-25
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+1
-0
paddle/fluid/inference/tests/api/config_printer.h
paddle/fluid/inference/tests/api/config_printer.h
+2
-0
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+11
-9
paddle/fluid/inference/utils/CMakeLists.txt
paddle/fluid/inference/utils/CMakeLists.txt
+2
-0
paddle/fluid/inference/utils/benchmark.cc
paddle/fluid/inference/utils/benchmark.cc
+49
-0
paddle/fluid/inference/utils/benchmark.h
paddle/fluid/inference/utils/benchmark.h
+52
-0
paddle/fluid/inference/utils/benchmark_tester.cc
paddle/fluid/inference/utils/benchmark_tester.cc
+39
-0
paddle/fluid/memory/allocation/retry_allocator_test.cc
paddle/fluid/memory/allocation/retry_allocator_test.cc
+1
-1
paddle/fluid/operators/detection/CMakeLists.txt
paddle/fluid/operators/detection/CMakeLists.txt
+1
-1
paddle/fluid/operators/detection/density_prior_box_op.cc
paddle/fluid/operators/detection/density_prior_box_op.cc
+21
-15
paddle/fluid/operators/detection/density_prior_box_op.cu
paddle/fluid/operators/detection/density_prior_box_op.cu
+170
-0
paddle/fluid/operators/detection/density_prior_box_op.h
paddle/fluid/operators/detection/density_prior_box_op.h
+35
-38
paddle/fluid/operators/distributed/grpc_client.cc
paddle/fluid/operators/distributed/grpc_client.cc
+5
-0
paddle/fluid/operators/distributed/grpc_server.cc
paddle/fluid/operators/distributed/grpc_server.cc
+20
-0
paddle/fluid/operators/distributed/sendrecvop_utils.cc
paddle/fluid/operators/distributed/sendrecvop_utils.cc
+2
-0
paddle/fluid/operators/fused/CMakeLists.txt
paddle/fluid/operators/fused/CMakeLists.txt
+5
-1
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
...uid/operators/fused/fusion_transpose_flatten_concat_op.cc
+114
-0
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
.../operators/fused/fusion_transpose_flatten_concat_op.cu.cc
+115
-0
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
...luid/operators/fused/fusion_transpose_flatten_concat_op.h
+50
-0
paddle/fluid/operators/lookup_sparse_table_op.cc
paddle/fluid/operators/lookup_sparse_table_op.cc
+1
-0
paddle/fluid/operators/math/fc_compute.h
paddle/fluid/operators/math/fc_compute.h
+1
-3
paddle/fluid/operators/sum_op.h
paddle/fluid/operators/sum_op.h
+3
-0
paddle/fluid/operators/tensor_array_to_tensor_op.cc
paddle/fluid/operators/tensor_array_to_tensor_op.cc
+5
-5
paddle/fluid/platform/cpu_helper.cc
paddle/fluid/platform/cpu_helper.cc
+1
-1
paddle/fluid/platform/dynload/cublas.cc
paddle/fluid/platform/dynload/cublas.cc
+3
-0
paddle/fluid/platform/dynload/cublas.h
paddle/fluid/platform/dynload/cublas.h
+20
-10
paddle/fluid/pybind/CMakeLists.txt
paddle/fluid/pybind/CMakeLists.txt
+2
-2
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+37
-1
paddle/testing/paddle_gtest_main.cc
paddle/testing/paddle_gtest_main.cc
+6
-1
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+10
-5
python/paddle/fluid/contrib/utils/__init__.py
python/paddle/fluid/contrib/utils/__init__.py
+3
-1
python/paddle/fluid/contrib/utils/lookup_table_utils.py
python/paddle/fluid/contrib/utils/lookup_table_utils.py
+256
-0
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+20
-0
python/paddle/fluid/io.py
python/paddle/fluid/io.py
+40
-11
python/paddle/fluid/layers/control_flow.py
python/paddle/fluid/layers/control_flow.py
+5
-3
python/paddle/fluid/layers/detection.py
python/paddle/fluid/layers/detection.py
+20
-23
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+19
-14
python/paddle/fluid/nets.py
python/paddle/fluid/nets.py
+8
-1
python/paddle/fluid/tests/book/test_image_classification.py
python/paddle/fluid/tests/book/test_image_classification.py
+1
-1
python/paddle/fluid/tests/test_detection.py
python/paddle/fluid/tests/test_detection.py
+32
-28
python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
...paddle/fluid/tests/unittests/test_density_prior_box_op.py
+19
-11
python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
...ests/unittests/test_fusion_transpose_flatten_concat_op.py
+105
-0
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+11
-0
python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
...on/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+7
-2
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+19
-10
tools/manylinux1/Dockerfile.x64
tools/manylinux1/Dockerfile.x64
+1
-1
tools/manylinux1/build_scripts/build_utils.sh
tools/manylinux1/build_scripts/build_utils.sh
+12
-3
未找到文件。
CMakeLists.txt
浏览文件 @
557229bd
...
...
@@ -204,6 +204,7 @@ include(external/eigen) # download eigen3
include
(
external/pybind11
)
# download pybind11
include
(
external/cares
)
include
(
external/cub
)
include
(
external/rocprim
)
include
(
external/xxhash
)
# download xxhash
include
(
external/dlpack
)
include
(
external/snappy
)
# download snappy
...
...
Dockerfile
浏览文件 @
557229bd
...
...
@@ -22,6 +22,27 @@ ENV HOME /root
# Add bash enhancements
COPY
./paddle/scripts/docker/root/ /root/
# Prepare packages for Python
RUN
apt-get update
&&
\
apt-get
install
-y
make build-essential libssl-dev zlib1g-dev libbz2-dev
\
libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev
\
xz-utils tk-dev libffi-dev liblzma-dev
# Install Python3.6
RUN
mkdir
-p
/root/python_build/
&&
wget
-q
https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz
&&
\
tar
-zxf
sqlite-autoconf-3250300.tar.gz
&&
cd
sqlite-autoconf-3250300
&&
\
./configure
-prefix
=
/usr/local
&&
make
-j8
&&
make
install
&&
cd
../
&&
rm
sqlite-autoconf-3250300.tar.gz
&&
\
wget
-q
https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz
&&
\
tar
-xzf
Python-3.6.0.tgz
&&
cd
Python-3.6.0
&&
\
CFLAGS
=
"-Wformat"
./configure
--prefix
=
/usr/local/
--enable-shared
>
/dev/null
&&
\
make
-j8
>
/dev/null
&&
make altinstall
>
/dev/null
# Install Python3.7
RUN
wget
-q
https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz
&&
\
tar
-xzf
Python-3.7.0.tgz
&&
cd
Python-3.7.0
&&
\
CFLAGS
=
"-Wformat"
./configure
--prefix
=
/usr/local/
--enable-shared
>
/dev/null
&&
\
make
-j8
>
/dev/null
&&
make altinstall
>
/dev/null
RUN
apt-get update
&&
\
apt-get
install
-y
--allow-downgrades
patchelf
\
python3 python3-dev python3-pip
\
...
...
@@ -74,6 +95,12 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
RUN
pip3
install
-U
wheel
&&
\
pip3
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
pip3
install
sphinx-rtd-theme
==
0.1.9 recommonmark
&&
\
pip3.6
install
-U
wheel
&&
\
pip3.6
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
pip3.6
install
sphinx-rtd-theme
==
0.1.9 recommonmark
&&
\
pip3.7
install
-U
wheel
&&
\
pip3.7
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
pip3.7
install
sphinx-rtd-theme
==
0.1.9 recommonmark
&&
\
easy_install
-U
pip
&&
\
pip
install
-U
pip setuptools wheel
&&
\
pip
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
...
...
@@ -82,22 +109,34 @@ RUN pip3 install -U wheel && \
RUN
pip3
install
'pre-commit==1.10.4'
'ipython==5.3.0'
&&
\
pip3
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip3
install
opencv-python
&&
\
pip3.6
install
'pre-commit==1.10.4'
'ipython==5.3.0'
&&
\
pip3.6
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip3.6
install
opencv-python
&&
\
pip3.7
install
'pre-commit==1.10.4'
'ipython==5.3.0'
&&
\
pip3.7
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip3.7
install
opencv-python
&&
\
pip
install
'pre-commit==1.10.4'
'ipython==5.3.0'
&&
\
pip
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip
install
opencv-python
#For docstring checker
RUN
pip3
install
pylint pytest astroid isort
RUN
pip3.6
install
pylint pytest astroid isort
RUN
pip3.7
install
pylint pytest astroid isort
RUN
pip
install
pylint pytest astroid isort LinkChecker
COPY
./python/requirements.txt /root/
RUN
pip3
install
-r
/root/requirements.txt
RUN
pip3.6
install
-r
/root/requirements.txt
RUN
pip3.7
install
-r
/root/requirements.txt
RUN
pip
install
-r
/root/requirements.txt
# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
RUN
apt-get
install
-y
libssl-dev libffi-dev
RUN
pip3
install
certifi urllib3[secure]
RUN
pip3.6
install
certifi urllib3[secure]
RUN
pip3.7
install
certifi urllib3[secure]
RUN
pip
install
certifi urllib3[secure]
...
...
cmake/external/eigen.cmake
浏览文件 @
557229bd
...
...
@@ -17,7 +17,7 @@ if(WITH_AMD_GPU)
extern_eigen3
${
EXTERNAL_PROJECT_LOG_ARGS
}
GIT_REPOSITORY
"https://github.com/sabreshao/hipeigen.git"
GIT_TAG
0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
GIT_TAG
7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e
PREFIX
${
EIGEN_SOURCE_DIR
}
UPDATE_COMMAND
""
CONFIGURE_COMMAND
""
...
...
cmake/external/mkldnn.cmake
浏览文件 @
557229bd
...
...
@@ -53,7 +53,7 @@ ExternalProject_Add(
${
EXTERNAL_PROJECT_LOG_ARGS
}
DEPENDS
${
MKLDNN_DEPENDS
}
GIT_REPOSITORY
"https://github.com/01org/mkl-dnn.git"
GIT_TAG
"
21fb5f2af1dd14e132af4f1b79160977ee487818
"
GIT_TAG
"
830a10059a018cd2634d94195140cf2d8790a75a
"
PREFIX
${
MKLDNN_SOURCES_DIR
}
UPDATE_COMMAND
""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=
${
CMAKE_CXX_COMPILER
}
...
...
cmake/external/rocprim.cmake
0 → 100644
浏览文件 @
557229bd
# rocprim is "ROCm Parallel Primitives" for short.
# It is a header-only library providing HIP and HC parallel primitives
# for developing performant GPU-accelerated code on AMD ROCm platform.
#
# This script registers the `extern_rocprim` external project and a `rocprim`
# target that depends on it. It does nothing unless WITH_AMD_GPU is set.
if(NOT WITH_AMD_GPU)
  return()
endif()

# Fall back to a default HCC location when HCC_HOME is unset or empty.
if("x${HCC_HOME}" STREQUAL "x")
  set(HCC_HOME "/opt/rocm/hcc")
endif()

include(ExternalProject)

# Where rocPRIM is checked out/built, where it is installed, and the include
# directory exported to the rest of the build.
set(ROCPRIM_SOURCE_DIR ${THIRD_PARTY_PATH}/rocprim)
set(ROCPRIM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocprim)
set(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include)

# Fetch a pinned rocPRIM revision and configure it with the HCC compiler.
ExternalProject_Add(
  extern_rocprim
  GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/rocPRIM.git"
  GIT_TAG 5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc
  PREFIX ${ROCPRIM_SOURCE_DIR}
  UPDATE_COMMAND ""
  CMAKE_ARGS
    -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc
    -DONLY_INSTALL=ON
    -DBUILD_TEST=OFF
    -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR}
  INSTALL_DIR ${ROCPRIM_INSTALL_DIR}
  ${EXTERNAL_PROJECT_LOG_ARGS}
)

include_directories(${ROCPRIM_INCLUDE_DIR})

# Expose a `rocprim` target other targets can depend on.
# NOTE(review): the pre-3.3 branch builds a one-line dummy static library
# instead of an INTERFACE library, presumably because older CMake cannot attach
# dependencies to INTERFACE targets -- confirm against the CMake release notes.
if(CMAKE_VERSION VERSION_LESS "3.3.0")
  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/rocprim_dummy.c)
  file(WRITE ${dummyfile} "const char *dummy_rocprim = \"${dummyfile}\";")
  add_library(rocprim STATIC ${dummyfile})
else()
  add_library(rocprim INTERFACE)
endif()

# Make sure rocPRIM is downloaded/installed before anything using `rocprim`.
add_dependencies(rocprim extern_rocprim)
cmake/flags.cmake
浏览文件 @
557229bd
...
...
@@ -129,6 +129,9 @@ set(COMMON_FLAGS
-Wno-error=parentheses-equality
# Warnings in pybind11
-Wno-error=ignored-attributes
# Warnings in Eigen, gcc 6.3
-Wno-error=terminate
# Warning in PADDLE_ENFORCE
-Wno-error=int-in-bool-context
# Warning in Eigen gcc 7.2
-Wimplicit-fallthrough=0
# Warning in tinyformat.h
-Wno-error=maybe-uninitialized
# Warning in boost gcc 7.2
)
set
(
GPU_COMMON_FLAGS
...
...
cmake/generic.cmake
浏览文件 @
557229bd
...
...
@@ -454,25 +454,29 @@ function(hip_library TARGET_NAME)
else
()
add_library
(
${
TARGET_NAME
}
STATIC
${
_cmake_options
}
${
_generated_files
}
${
_sources
}
)
set_target_properties
(
${
TARGET_NAME
}
PROPERTIES LINKER_LANGUAGE CXX
)
target_link_libraries
(
${
TARGET_NAME
}
/opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a
)
find_fluid_modules
(
${
TARGET_NAME
}
)
target_link_libraries
(
${
TARGET_NAME
}
/opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a
/opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so
)
find_fluid_modules
(
${
TARGET_NAME
}
)
endif
()
if
(
hip_library_DEPS
)
add_dependencies
(
${
TARGET_NAME
}
${
hip_library_DEPS
}
)
target_link_libraries
(
${
TARGET_NAME
}
${
hip_library_DEPS
}
)
if
(
"
${
hip_library_DEPS
}
"
MATCHES
"ARCHIVE_START"
)
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
# WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
target_circle_link_libraries
(
${
TARGET_NAME
}
${
hip_library_DEPS
}
)
list
(
REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END
)
else
()
target_link_libraries
(
${
TARGET_NAME
}
${
hip_library_DEPS
}
)
endif
()
# cpplint code style
foreach
(
source_file
${
hip_library_SRCS
}
)
string
(
REGEX REPLACE
"
\\
.[^.]*$"
""
source
${
source_file
}
)
if
(
EXISTS
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
source
}
.h
)
list
(
APPEND hip_library_HEADERS
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
source
}
.h
)
endif
()
string
(
REGEX REPLACE
"
\\
.[^.]*$"
""
source
${
source_file
}
)
if
(
EXISTS
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
source
}
.h
)
list
(
APPEND hip_library_HEADERS
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
source
}
.h
)
endif
()
endforeach
()
else
(
hip_library_SRCS
)
if
(
hip_library_DEPS
)
merge_static_libs
(
${
TARGET_NAME
}
${
hip_library_DEPS
}
)
merge_static_libs
(
${
TARGET_NAME
}
${
hip_library_DEPS
}
)
else
()
message
(
FATAL
"Please specify source file or library in nv_library."
)
message
(
FATAL
"Please specify source file or library in nv_library."
)
endif
()
endif
(
hip_library_SRCS
)
endif
()
...
...
cmake/hip.cmake
浏览文件 @
557229bd
...
...
@@ -3,6 +3,8 @@ if(NOT WITH_AMD_GPU)
endif
()
include_directories
(
"/opt/rocm/include"
)
include_directories
(
"/opt/rocm/hip/include"
)
include_directories
(
"/opt/rocm/miopen/include"
)
include_directories
(
"/opt/rocm/hipblas/include"
)
include_directories
(
"/opt/rocm/hiprand/include"
)
include_directories
(
"/opt/rocm/rocrand/include"
)
...
...
@@ -11,20 +13,40 @@ include_directories("/opt/rocm/thrust")
list
(
APPEND EXTERNAL_LIBS
"-L/opt/rocm/lib/ -lhip_hcc"
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-fPIC -DPADDLE_WITH_HIP -std=c++1
4
"
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-fPIC -DPADDLE_WITH_HIP -std=c++1
1
"
)
if
(
WITH_DSO
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_USE_DSO"
)
endif
(
WITH_DSO
)
if
(
WITH_DOUBLE
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_TYPE_DOUBLE"
)
endif
(
WITH_DOUBLE
)
if
(
WITH_TESTING
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITH_TESTING"
)
endif
(
WITH_TESTING
)
if
(
WITH_DISTRIBUTE
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITH_DISTRIBUTE"
)
endif
(
WITH_DISTRIBUTE
)
if
(
WITH_GRPC
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITH_GRPC"
)
endif
(
WITH_GRPC
)
if
(
NOT WITH_GOLANG
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITHOUT_GOLANG"
)
endif
(
NOT WITH_GOLANG
)
if
(
WITH_MKLDNN
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_WITH_MKLDNN"
)
endif
(
WITH_MKLDNN
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DANY_IMPL_ANY_CAST_MOVEABLE"
)
if
(
NOT WITH_RDMA
)
set
(
HIP_HCC_FLAGS
"
${
HIP_HCC_FLAGS
}
-DPADDLE_DISABLE_RDMA"
)
endif
(
NOT WITH_RDMA
)
if
(
CMAKE_BUILD_TYPE STREQUAL
"Debug"
)
list
(
APPEND HIP_HCC_FLAGS
${
CMAKE_CXX_FLAGS_DEBUG
}
)
elseif
(
CMAKE_BUILD_TYPE STREQUAL
"RelWithDebInfo"
)
...
...
cmake/operators.cmake
浏览文件 @
557229bd
...
...
@@ -109,7 +109,8 @@ function(op_library TARGET)
# Define operators that don't need pybind here.
foreach
(
manual_pybind_op
"compare_op"
"logical_op"
"nccl_op"
"tensor_array_read_write_op"
"tensorrt_engine_op"
"conv_fusion_op"
)
"tensor_array_read_write_op"
"tensorrt_engine_op"
"conv_fusion_op"
"fusion_transpose_flatten_concat_op"
)
if
(
"
${
TARGET
}
"
STREQUAL
"
${
manual_pybind_op
}
"
)
set
(
pybind_flag 1
)
endif
()
...
...
paddle/fluid/API.spec
浏览文件 @
557229bd
...
...
@@ -276,7 +276,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k
paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', '
name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5
, None))
paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', '
flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False
, None))
paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
...
...
@@ -342,7 +342,7 @@ paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], va
paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspilerConfig.__init__
paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True))
paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'
], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'
))
paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'
, 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None
))
paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
...
...
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
557229bd
...
...
@@ -116,8 +116,14 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library
(
op_info SRCS op_info.cc DEPS attribute framework_proto
)
cc_library
(
shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context
)
if
(
NOT WIN32
)
cc_library
(
transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context
)
cc_library
(
operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler transfer_scope_cache
)
else
()
cc_library
(
operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler
)
shape_inference data_transform lod_tensor
)
endif
(
NOT WIN32
)
cc_test
(
operator_test SRCS operator_test.cc DEPS operator op_registry device_context
)
...
...
paddle/fluid/framework/executor.cc
浏览文件 @
557229bd
...
...
@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
...
...
@@ -391,8 +392,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
int64_t
max_memory_size
=
GetEagerDeletionThreshold
();
std
::
unique_ptr
<
GarbageCollector
<
Tensor
>>
gc
;
// WhileOp would set keep_kids to
false
//
WhileGradOp would need the scopes created in WhileOp
// WhileOp would set keep_kids to
true,
//
because WhileGradOp needs the scopes created in WhileOp.
// Perhaps, we should not perform eager deletion in WhileOp
// The scopes and variables created by WhileOp would be deleted
// in WhileGradOp.
...
...
paddle/fluid/framework/naive_executor.cc
浏览文件 @
557229bd
...
...
@@ -83,6 +83,7 @@ void NaiveExecutor::Run() {
for
(
auto
&
op
:
ops_
)
{
VLOG
(
3
)
<<
std
::
this_thread
::
get_id
()
<<
" run "
<<
op
->
Type
()
<<
" on scope "
<<
scope_
;
op
->
SetIsCalledByExecutor
(
false
);
op
->
Run
(
*
scope_
,
place_
);
}
}
...
...
paddle/fluid/framework/op_desc.cc
浏览文件 @
557229bd
...
...
@@ -252,6 +252,12 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
this
->
attrs_
[
name
]
=
std
::
vector
<
int
>
();
break
;
}
case
proto
::
AttrType
::
LONGS
:
{
VLOG
(
110
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from LONGS to LONGS"
;
this
->
attrs_
[
name
]
=
std
::
vector
<
int64_t
>
();
break
;
}
case
proto
::
AttrType
::
FLOATS
:
{
VLOG
(
110
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to FLOATS"
;
...
...
paddle/fluid/framework/operator.cc
浏览文件 @
557229bd
...
...
@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
...
...
@@ -33,11 +34,6 @@ DEFINE_bool(check_nan_inf, false,
namespace
paddle
{
namespace
framework
{
// Combine two hash values to a single hash.
inline
size_t
CombineHash
(
size_t
seed
,
size_t
a
)
{
return
(
seed
^
a
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
);
}
std
::
vector
<
std
::
tuple
<
platform
::
Place
,
LibraryType
>>
kKernelPriority
=
{
std
::
make_tuple
(
platform
::
CUDAPlace
(
0
),
LibraryType
::
kCUDNN
),
std
::
make_tuple
(
platform
::
CUDAPlace
(
0
),
LibraryType
::
kPlain
),
...
...
@@ -797,17 +793,6 @@ void OperatorWithKernel::TransferInplaceVarsBack(
Scope
*
OperatorWithKernel
::
TryTransferData
(
const
Scope
&
scope
,
const
OpKernelType
&
expected_kernel_key
,
std
::
vector
<
std
::
string
>*
transfered_inplace_vars
)
const
{
// In the inference scenerio, the scopes will be reused across the batches, so
// the `new_scope` here will result in GPU memroy explosion over the running of
// operators.
// We use a thread_local cache to fix that issue, the key in the cache is the
// combination of the `scope` argument, from_kernel_type, target_kernel_type.
// Have a discussion with @Superjomn or the inference developers if some changes
// on this logic for this macro might not tested on the other scenerios.
#ifdef PADDLE_ON_INFERENCE
thread_local
std
::
unordered_map
<
size_t
,
Scope
*>
infer_transfer_scope_cache
;
#endif
Scope
*
new_scope
=
nullptr
;
for
(
auto
&
var_name_item
:
Inputs
())
{
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
...
...
@@ -838,23 +823,23 @@ Scope* OperatorWithKernel::TryTransferData(
VLOG
(
30
)
<<
"Transform Variable "
<<
var_name
<<
" from "
<<
kernel_type_for_var
<<
" to "
<<
expected_kernel_key
;
#ifdef PADDLE_ON_INFERENCE
size_t
infer_cache_key
=
CombineHash
(
OpKernelType
::
Hash
()(
kernel_type_for_var
),
OpKernelType
::
Hash
()(
expected_kernel_key
));
infer_cache_key
=
CombineHash
(
infer_cache_key
,
std
::
hash
<
const
Scope
*>
()(
&
scope
));
auto
it
=
infer_transfer_scope_cache
.
find
(
infer_cache_key
);
if
(
it
!=
infer_transfer_scope_cache
.
end
())
{
new_scope
=
infer_transfer_scope_cache
[
infer_cache_key
];
}
else
{
new_scope
=
&
scope
.
NewScope
();
infer_transfer_scope_cache
[
infer_cache_key
]
=
new_scope
;
// In the inference scenario, the scopes will be reused across the
// batches, so the `new_scope` here will result in GPU memory explosion
// over the running of operators.
// We use a thread_local cache to fix that issue, the key in the cache is
// the combination of the `scope` argument, from_kernel_type,
// target_kernel_type.
// Have a discussion with @Superjomn or the inference developers if some
// changes on this logic for this macro might not be tested on the other
// scenarios.
// If this op is not called by an Executor or ParallelExecutor, it should
// be called by a NaiveExecutor, the NaiveExecutor will cache the scopes and
// variables, that behavior is a lot different.
if
(
!
run_by_executor_
)
{
new_scope
=
TryCreateTransferScope
(
kernel_type_for_var
,
expected_kernel_key
,
&
scope
);
}
#endif
if
(
new_scope
==
nullptr
)
{
if
(
!
new_scope
)
{
new_scope
=
&
scope
.
NewScope
();
}
...
...
paddle/fluid/framework/operator.h
浏览文件 @
557229bd
...
...
@@ -127,6 +127,8 @@ class OperatorBase {
//! Get all outputs variable names
virtual
std
::
vector
<
std
::
string
>
OutputVars
(
bool
has_intermediate
)
const
;
void
SetIsCalledByExecutor
(
bool
x
)
{
run_by_executor_
=
x
;
}
protected:
std
::
string
type_
;
// NOTE: in case of OpGrad, inputs_ contains:
...
...
@@ -139,6 +141,8 @@ class OperatorBase {
// IG (Inputs Gradients)
VariableNameMap
outputs_
;
AttributeMap
attrs_
;
// Whether this operator executes in an Executor.
bool
run_by_executor_
{
true
};
private:
void
GenerateTemporaryNames
();
...
...
paddle/fluid/framework/transfer_scope_cache.cc
0 → 100644
浏览文件 @
557229bd
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/transfer_scope_cache.h"
namespace
paddle
{
namespace
framework
{
// Holds all the transfer scope across the process.
std
::
unordered_map
<
size_t
,
Scope
*>&
global_transfer_data_cache
()
{
typedef
std
::
unordered_map
<
size_t
,
Scope
*>
map_t
;
thread_local
std
::
unique_ptr
<
map_t
>
x
(
new
map_t
);
return
*
x
;
}
// Holds all the transfer scope for this thread.
std
::
unordered_set
<
Scope
*>&
global_transfer_scope_cache
()
{
typedef
std
::
unordered_set
<
Scope
*>
set_t
;
thread_local
std
::
unique_ptr
<
set_t
>
x
(
new
set_t
);
return
*
x
;
}
// Try to create a transfer scope. If a cached scope already matches the
// request, that one is returned instead of creating a new one.
// Inputs:
//   @type0: the source kernel type.
//   @type1: the target kernel type.
//   @scope: the execution scope of this op; must not be null, it is
//           dereferenced to create the child scope on a cache miss.
// Returns: A scope used to hold the transfer data across the different kernel
// types. The scope is also recorded in this thread's
// global_transfer_scope_cache().
//
// NOTE(review): the cache key is only a combined hash of (type0, type1, scope),
// so two distinct triples whose hashes collide would share one transfer scope.
Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,
                              const Scope* scope) {
  // Key = hash(type0) (+) hash(type1) (+) hash(scope pointer).
  size_t infer_cache_key =
      CombineHash(OpKernelType::Hash()(type0), OpKernelType::Hash()(type1));
  infer_cache_key =
      CombineHash(infer_cache_key, std::hash<const Scope*>()(scope));

  // Bind the thread-local cache once and reuse the iterator from find(), so
  // the key is hashed a single time on the hit path.
  auto& data_cache = global_transfer_data_cache();
  Scope* new_scope = nullptr;
  auto it = data_cache.find(infer_cache_key);
  if (it != data_cache.end()) {
    new_scope = it->second;
  } else {
    new_scope = &scope->NewScope();
    data_cache.emplace(infer_cache_key, new_scope);
  }

  // Remember every transfer scope handed out on this thread.
  global_transfer_scope_cache().insert(new_scope);
  return new_scope;
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/transfer_scope_cache.h
0 → 100644
浏览文件 @
557229bd
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <thread> // NOLINT
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/scope.h"
namespace
paddle
{
namespace
framework
{
std
::
unordered_map
<
size_t
,
Scope
*>&
global_transfer_data_cache
();
std
::
unordered_set
<
Scope
*>&
global_transfer_scope_cache
();
// Mixes the hash value `a` into `seed` and returns the combined hash.
// The result is (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2), computed
// in size_t arithmetic (wrapping on overflow).
static size_t CombineHash(size_t seed, size_t a) {
  const size_t kMagic = 0x9e3779b9;
  const size_t xored = seed ^ a;
  const size_t shifted = (seed << 6) + (seed >> 2);
  return xored + kMagic + shifted;
}
Scope
*
TryCreateTransferScope
(
OpKernelType
type0
,
OpKernelType
type1
,
const
Scope
*
scope
);
void
RemoveKidsFromTransferScopeCache
(
Scope
*
scope
);
}
// namespace framework
}
// namespace paddle
paddle/fluid/inference/CMakeLists.txt
浏览文件 @
557229bd
...
...
@@ -4,6 +4,7 @@ endif()
# analysis and tensorrt must be added before creating static library,
# otherwise, there would be undefined reference to them in static library.
add_subdirectory
(
analysis
)
add_subdirectory
(
utils
)
if
(
TENSORRT_FOUND
)
add_subdirectory
(
tensorrt
)
endif
()
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
557229bd
...
...
@@ -30,7 +30,9 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager
)
cc_library
(
zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce
)
cc_library
(
zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder DEPS zero_copy_tensor
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
analysis_config paddle_pass_builder zero_copy_tensor reset_tensor_array
)
cc_test
(
test_paddle_inference_api
SRCS api_tester.cc
...
...
paddle/fluid/inference/api/analysis_config.cc
浏览文件 @
557229bd
...
...
@@ -46,6 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
prog_file
=
other
.
prog_file
;
param_file
=
other
.
param_file
;
specify_input_name
=
other
.
specify_input_name
;
cpu_math_library_num_threads_
=
other
.
cpu_math_library_num_threads_
;
// fields from this.
enable_ir_optim
=
other
.
enable_ir_optim
;
use_feed_fetch_ops
=
other
.
use_feed_fetch_ops
;
...
...
@@ -72,6 +73,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
prog_file
=
other
.
prog_file
;
param_file
=
other
.
param_file
;
specify_input_name
=
other
.
specify_input_name
;
cpu_math_library_num_threads_
=
other
.
cpu_math_library_num_threads_
;
// fields from this.
enable_ir_optim
=
other
.
enable_ir_optim
;
use_feed_fetch_ops
=
other
.
use_feed_fetch_ops
;
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
557229bd
...
...
@@ -31,11 +31,11 @@
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#endif
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool
(
profile
);
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
...
...
@@ -67,7 +67,7 @@ bool AnalysisPredictor::Init(
#endif
// no matter with or without MKLDNN
paddle
::
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
paddle
::
platform
::
SetNumThreads
(
config_
.
cpu_math_library_num_threads
()
);
if
(
!
PrepareScope
(
parent_scope
))
{
return
false
;
...
...
@@ -160,6 +160,14 @@ bool AnalysisPredictor::PrepareExecutor() {
return
true
;
}
void
AnalysisPredictor
::
SetMkldnnThreadID
(
int
tid
)
{
#ifdef PADDLE_WITH_MKLDNN
platform
::
set_cur_thread_id
(
tid
);
#else
LOG
(
ERROR
)
<<
"Please compile with MKLDNN first to use MKLDNN"
;
#endif
}
bool
AnalysisPredictor
::
Run
(
const
std
::
vector
<
PaddleTensor
>
&
inputs
,
std
::
vector
<
PaddleTensor
>
*
output_data
,
int
batch_size
)
{
...
...
@@ -167,7 +175,6 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
inference
::
Timer
timer
;
timer
.
tic
();
// set feed variable
std
::
vector
<
framework
::
LoDTensor
>
feeds
;
framework
::
Scope
*
scope
=
sub_scope_
?
sub_scope_
:
scope_
.
get
();
if
(
!
SetFeed
(
inputs
,
scope
))
{
LOG
(
ERROR
)
<<
"fail to set feed"
;
...
...
@@ -208,17 +215,29 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework
::
DDim
ddim
=
framework
::
make_ddim
(
inputs
[
i
].
shape
);
void
*
input_ptr
;
if
(
inputs
[
i
].
dtype
==
PaddleDType
::
INT64
)
{
input_ptr
=
input
.
mutable_data
<
int64_t
>
(
ddim
,
pla
tform
::
CPUPlace
()
);
input_ptr
=
input
.
mutable_data
<
int64_t
>
(
ddim
,
pla
ce_
);
}
else
if
(
inputs
[
i
].
dtype
==
PaddleDType
::
FLOAT32
)
{
input_ptr
=
input
.
mutable_data
<
float
>
(
ddim
,
pla
tform
::
CPUPlace
()
);
input_ptr
=
input
.
mutable_data
<
float
>
(
ddim
,
pla
ce_
);
}
else
{
LOG
(
ERROR
)
<<
"unsupported feed type "
<<
inputs
[
i
].
dtype
;
return
false
;
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std
::
memcpy
(
static_cast
<
void
*>
(
input_ptr
),
inputs
[
i
].
data
.
data
(),
inputs
[
i
].
data
.
length
());
if
(
platform
::
is_cpu_place
(
place_
))
{
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std
::
memcpy
(
static_cast
<
void
*>
(
input_ptr
),
inputs
[
i
].
data
.
data
(),
inputs
[
i
].
data
.
length
());
}
else
{
#ifdef PADDLE_WITH_CUDA
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
);
memory
::
Copy
(
dst_gpu_place
,
static_cast
<
void
*>
(
input_ptr
),
platform
::
CPUPlace
(),
inputs
[
i
].
data
.
data
(),
inputs
[
i
].
data
.
length
(),
0
);
// stream 0 for sync copy
#else
PADDLE_THROW
(
"Not compile with CUDA, should not reach here."
);
#endif
}
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework
::
LoD
lod
;
for
(
auto
&
level
:
inputs
[
i
].
lod
)
{
...
...
paddle/fluid/inference/api/analysis_predictor.h
浏览文件 @
557229bd
...
...
@@ -69,6 +69,8 @@ class AnalysisPredictor : public PaddlePredictor {
framework
::
Scope
*
scope
()
{
return
scope_
.
get
();
}
framework
::
ProgramDesc
&
program
()
{
return
*
inference_program_
;
}
void
SetMkldnnThreadID
(
int
tid
);
protected:
bool
PrepareProgram
(
const
std
::
shared_ptr
<
framework
::
ProgramDesc
>
&
program
);
bool
PrepareScope
(
const
std
::
shared_ptr
<
framework
::
Scope
>
&
parent_scope
);
...
...
paddle/fluid/inference/api/api_impl.cc
浏览文件 @
557229bd
...
...
@@ -24,11 +24,11 @@ limitations under the License. */
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool
(
profile
,
false
,
"Turn on profiler for fluid"
);
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
{
...
...
@@ -76,7 +76,7 @@ bool NativePaddlePredictor::Init(
#endif
// no matter with or without MKLDNN
paddle
::
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
paddle
::
platform
::
SetNumThreads
(
config_
.
cpu_math_library_num_threads
()
);
if
(
config_
.
use_gpu
)
{
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device
);
...
...
@@ -139,7 +139,6 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
Timer
timer
;
timer
.
tic
();
// set feed variable
std
::
vector
<
framework
::
LoDTensor
>
feeds
;
framework
::
Scope
*
scope
=
sub_scope_
!=
nullptr
?
sub_scope_
:
scope_
.
get
();
if
(
!
SetFeed
(
inputs
,
scope
))
{
LOG
(
ERROR
)
<<
"fail to set feed"
;
...
...
@@ -195,17 +194,30 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
framework
::
DDim
ddim
=
framework
::
make_ddim
(
inputs
[
i
].
shape
);
void
*
input_ptr
;
if
(
inputs
[
i
].
dtype
==
PaddleDType
::
INT64
)
{
input_ptr
=
input
.
mutable_data
<
int64_t
>
(
ddim
,
pla
tform
::
CPUPlace
()
);
input_ptr
=
input
.
mutable_data
<
int64_t
>
(
ddim
,
pla
ce_
);
}
else
if
(
inputs
[
i
].
dtype
==
PaddleDType
::
FLOAT32
)
{
input_ptr
=
input
.
mutable_data
<
float
>
(
ddim
,
pla
tform
::
CPUPlace
()
);
input_ptr
=
input
.
mutable_data
<
float
>
(
ddim
,
pla
ce_
);
}
else
{
LOG
(
ERROR
)
<<
"unsupported feed type "
<<
inputs
[
i
].
dtype
;
return
false
;
}
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std
::
memcpy
(
static_cast
<
void
*>
(
input_ptr
),
inputs
[
i
].
data
.
data
(),
inputs
[
i
].
data
.
length
());
if
(
platform
::
is_cpu_place
(
place_
))
{
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std
::
memcpy
(
static_cast
<
void
*>
(
input_ptr
),
inputs
[
i
].
data
.
data
(),
inputs
[
i
].
data
.
length
());
}
else
{
#ifdef PADDLE_WITH_CUDA
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
);
memory
::
Copy
(
dst_gpu_place
,
static_cast
<
void
*>
(
input_ptr
),
platform
::
CPUPlace
(),
inputs
[
i
].
data
.
data
(),
inputs
[
i
].
data
.
length
(),
0
);
// stream 0 for sync copy
#else
PADDLE_THROW
(
"Not compile with CUDA, should not reach here."
);
#endif
}
// TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
framework
::
LoD
lod
;
for
(
auto
&
level
:
inputs
[
i
].
lod
)
{
...
...
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
浏览文件 @
557229bd
...
...
@@ -46,8 +46,6 @@ if(WITH_GPU)
endif
()
endif
(
NOT WIN32
)
endif
()
include_directories
(
"D:/Paddle/"
)
include_directories
(
"
${
PADDLE_LIB
}
"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/protobuf/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/glog/include"
)
...
...
paddle/fluid/inference/api/paddle_analysis_config.h
浏览文件 @
557229bd
...
...
@@ -51,9 +51,9 @@ struct AnalysisConfig : public NativeConfig {
int
max_batch_size
=
1
);
bool
use_tensorrt
()
const
{
return
use_tensorrt_
;
}
void
EnableMKLDNN
();
// NOTE: this is just for internal development; please do not use it.
// NOT stable yet.
void
EnableMKLDNN
();
bool
use_mkldnn
()
const
{
return
use_mkldnn_
;
}
friend
class
::
paddle
::
AnalysisPredictor
;
...
...
paddle/fluid/inference/api/paddle_api.h
浏览文件 @
557229bd
...
...
@@ -186,6 +186,19 @@ struct NativeConfig : public PaddlePredictor::Config {
// Specify the variable's name of each input if input tensors don't follow the
// `feeds` and `fetches` of the phase `save_inference_model`.
bool
specify_input_name
{
false
};
// Set and get the number of cpu math library threads.
void
SetCpuMathLibraryNumThreads
(
int
cpu_math_library_num_threads
)
{
cpu_math_library_num_threads_
=
cpu_math_library_num_threads
;
}
int
cpu_math_library_num_threads
()
const
{
return
cpu_math_library_num_threads_
;
}
protected:
// number of cpu math library (such as MKL, OpenBlas) threads for each
// instance.
int
cpu_math_library_num_threads_
{
1
};
};
// A factory to help create different predictors.
...
...
paddle/fluid/inference/tensorrt/convert/split_op.cc
浏览文件 @
557229bd
...
...
@@ -19,9 +19,6 @@ namespace paddle {
namespace
inference
{
namespace
tensorrt
{
/*
* SplitOp.
*/
class
SplitOpConverter
:
public
OpConverter
{
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
...
...
@@ -40,16 +37,11 @@ class SplitOpConverter : public OpConverter {
int
axis
=
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"axis"
));
std
::
vector
<
int
>
output_lengths
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"sections"
));
// split on batch is not supported in TensorRT
PADDLE_ENFORCE
(
axis
!=
0
);
if
(
axis
<
0
)
{
axis
+=
input_dims
.
nbDims
;
}
else
{
axis
-=
1
;
}
axis
+=
(
axis
<
0
)
?
input_dims
.
nbDims
:
-
1
;
PADDLE_ENFORCE
(
output_lengths
.
size
()
==
output_num
);
//
plugin
::
SplitPlugin
*
plugin
=
new
plugin
::
SplitPlugin
(
axis
,
output_lengths
);
nvinfer1
::
IPluginLayer
*
layer
=
engine_
->
AddPlugin
(
&
input
,
input_num
,
plugin
);
...
...
paddle/fluid/inference/tensorrt/convert/test_split_op.cc
浏览文件 @
557229bd
...
...
@@ -20,30 +20,92 @@ namespace paddle {
namespace
inference
{
namespace
tensorrt
{
TEST
(
split_op
,
test
)
{
template
<
int
BatchSize
,
int
Axis
>
void
TensorRTSplitTest
(
const
std
::
vector
<
int
>
&
in_shape
,
const
std
::
vector
<
int
>
&
sections
)
{
std
::
unordered_set
<
std
::
string
>
parameters
({
""
});
framework
::
Scope
scope
;
TRTConvertValidation
validator
(
10
,
parameters
,
scope
,
1000
);
validator
.
DeclInputVar
(
"split_input"
,
nvinfer1
::
DimsCHW
(
3
,
2
,
2
));
validator
.
DeclOutputVar
(
"split_out1"
,
nvinfer1
::
DimsCHW
(
2
,
2
,
2
));
validator
.
DeclOutputVar
(
"split_out2"
,
nvinfer1
::
DimsCHW
(
1
,
2
,
2
));
TRTConvertValidation
validator
(
BatchSize
+
1
,
parameters
,
scope
,
10000
);
auto
make_dim
=
[](
const
std
::
vector
<
int
>
&
shape
)
{
nvinfer1
::
DimsCHW
dim
;
dim
.
c
()
=
shape
[
0
];
dim
.
h
()
=
shape
[
1
];
dim
.
w
()
=
shape
[
2
];
return
dim
;
};
validator
.
DeclInputVar
(
"split_input"
,
make_dim
(
in_shape
));
std
::
vector
<
std
::
string
>
output_vars
;
for
(
size_t
i
=
0
;
i
<
sections
.
size
();
++
i
)
{
auto
out_shape
=
in_shape
;
out_shape
[
Axis
-
1
]
=
sections
[
i
];
std
::
string
output_name
=
"split_out"
+
std
::
to_string
(
i
);
validator
.
DeclOutputVar
(
output_name
,
make_dim
(
out_shape
));
output_vars
.
push_back
(
output_name
);
}
// Prepare Op description
framework
::
OpDesc
desc
;
desc
.
SetType
(
"split"
);
desc
.
SetInput
(
"X"
,
{
"split_input"
});
desc
.
SetOutput
(
"Out"
,
{
"split_out1"
,
"split_out2"
}
);
desc
.
SetOutput
(
"Out"
,
output_vars
);
int
num
=
0
;
int
axis
=
1
;
std
::
vector
<
int
>
output_lengths
=
{
2
,
1
};
desc
.
SetAttr
(
"axis"
,
axis
);
desc
.
SetAttr
(
"num"
,
num
);
desc
.
SetAttr
(
"sections"
,
output_lengths
);
desc
.
SetAttr
(
"axis"
,
Axis
);
desc
.
SetAttr
(
"num"
,
0
);
desc
.
SetAttr
(
"sections"
,
sections
);
validator
.
SetOp
(
*
desc
.
Proto
());
validator
.
Execute
(
1
);
validator
.
Execute
(
BatchSize
);
}
// batch = 1, axis = 1, same shape
TEST
(
split_op
,
test_same_shape_axis1_batch1
)
{
TensorRTSplitTest
<
1
,
1
>
({
4
,
2
,
2
},
{
2
,
2
});
}
// batch = 1, axis = 1, different shape
TEST
(
split_op
,
test_different_shape_axis1_batch1
)
{
TensorRTSplitTest
<
1
,
1
>
({
3
,
2
,
2
},
{
2
,
1
});
}
// batch = 10, axis = 1, same shape
TEST
(
split_op
,
test_same_shape_axis1_batch10
)
{
TensorRTSplitTest
<
10
,
1
>
({
4
,
2
,
2
},
{
2
,
2
});
}
// batch = 10, axis = 1, different shape
TEST
(
split_op
,
test_different_shape_axis1_batch10
)
{
TensorRTSplitTest
<
10
,
1
>
({
3
,
2
,
2
},
{
2
,
1
});
}
// batch = 1, axis = 2, same shape
TEST
(
split_op
,
test_same_shape_axis2_batch1
)
{
TensorRTSplitTest
<
1
,
2
>
({
3
,
4
,
2
},
{
2
,
2
});
}
// batch = 1, axis = 2, different shape
TEST
(
split_op
,
test_different_shape_axis2_batch1
)
{
TensorRTSplitTest
<
1
,
2
>
({
3
,
3
,
2
},
{
2
,
1
});
}
// batch = 10, axis = 2, same shape
TEST
(
split_op
,
test_same_shape_axis2_batch10
)
{
TensorRTSplitTest
<
10
,
2
>
({
3
,
4
,
2
},
{
2
,
2
});
}
// batch = 10, axis = 2, different shape
TEST
(
split_op
,
test_different_shape_axis2_batch10
)
{
TensorRTSplitTest
<
10
,
2
>
({
3
,
3
,
2
},
{
2
,
1
});
}
// batch = 1, axis = 3, same shape
TEST
(
split_op
,
test_same_shape_axis3_batch1
)
{
TensorRTSplitTest
<
1
,
3
>
({
3
,
2
,
4
},
{
2
,
2
});
}
// batch = 1, axis = 3, different shape
TEST
(
split_op
,
test_different_shape_axis3_batch1
)
{
TensorRTSplitTest
<
1
,
3
>
({
3
,
2
,
3
},
{
2
,
1
});
}
// batch = 10, axis = 3, same shape
TEST
(
split_op
,
test_same_shape_axis3_batch10
)
{
TensorRTSplitTest
<
10
,
3
>
({
3
,
2
,
4
},
{
2
,
2
});
}
// batch = 10, axis = 3, different shape
TEST
(
split_op
,
test_different_shape_axis3_batch10
)
{
TensorRTSplitTest
<
10
,
3
>
({
3
,
2
,
3
},
{
2
,
1
});
}
}
// namespace tensorrt
...
...
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
浏览文件 @
557229bd
...
...
@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_fp16.h>
#include <algorithm>
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
namespace
paddle
{
...
...
@@ -19,6 +21,52 @@ namespace inference {
namespace
tensorrt
{
namespace
plugin
{
// copied from operators::math::SplitFunctor
template
<
typename
T
>
__global__
void
SplitKernel
(
const
T
*
input_data
,
const
int
in_row
,
const
int
in_col
,
const
int
*
out_cols
,
int
out_cols_size
,
T
**
outputs_data
)
{
int
tid_x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
curr_segment
=
0
;
int
curr_offset
=
out_cols
[
0
];
for
(;
tid_x
<
in_col
;
tid_x
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
curr_col_offset
=
out_cols
[
curr_segment
+
1
];
while
(
curr_col_offset
<=
tid_x
)
{
curr_offset
=
curr_col_offset
;
++
curr_segment
;
curr_col_offset
=
out_cols
[
curr_segment
+
1
];
}
int
local_col
=
tid_x
-
curr_offset
;
int
segment_width
=
curr_col_offset
-
curr_offset
;
T
*
output_ptr
=
outputs_data
[
curr_segment
];
if
(
output_ptr
!=
nullptr
)
{
int
tid_y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
for
(;
tid_y
<
in_row
;
tid_y
+=
blockDim
.
y
*
gridDim
.
y
)
output_ptr
[
tid_y
*
segment_width
+
local_col
]
=
input_data
[
tid_y
*
in_col
+
tid_x
];
}
}
}
template
<
typename
T
>
__global__
void
SplitKernel
(
const
T
*
input_data
,
const
int
in_row
,
const
int
in_col
,
const
int
fixed_out_col
,
T
**
outputs_data
)
{
int
tid_x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid_x
<
in_col
;
tid_x
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
split
=
tid_x
/
fixed_out_col
;
int
in_offset
=
tid_x
-
split
*
fixed_out_col
;
T
*
output_ptr
=
outputs_data
[
split
];
if
(
output_ptr
!=
nullptr
)
{
int
tid_y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
for
(;
tid_y
<
in_row
;
tid_y
+=
blockDim
.
y
*
gridDim
.
y
)
output_ptr
[
tid_y
*
fixed_out_col
+
in_offset
]
=
input_data
[
tid_y
*
in_col
+
tid_x
];
}
}
}
nvinfer1
::
Dims
SplitPlugin
::
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
input_dims
,
int
num_inputs
)
{
PADDLE_ENFORCE_EQ
(
num_inputs
,
1
);
...
...
@@ -31,48 +79,96 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions(
int
SplitPlugin
::
initialize
()
{
PADDLE_ENFORCE_LE
(
axis_
,
nvinfer1
::
Dims
::
MAX_DIMS
);
// notice input dims is [C, H, W]
nvinfer1
::
Dims
dims
=
this
->
getInputDims
(
0
);
outer_rows_
=
1
;
inner_cols_
=
1
;
for
(
int
i
=
0
;
i
<
axis_
;
++
i
)
{
outer_rows_
*=
dims
.
d
[
i
];
}
for
(
int
i
=
axis_
+
1
;
i
<
dims
.
nbDims
;
++
i
)
{
inner_cols_
*=
dims
.
d
[
i
];
}
same_shape_
=
true
;
std
::
vector
<
int
>
segment_offsets
(
1
,
0
);
for
(
int
i
=
0
;
i
<
this
->
getNbOutputs
();
++
i
)
{
segment_offsets
.
push_back
(
segment_offsets
.
back
()
+
output_length_
[
i
]);
if
(
output_length_
[
i
]
!=
output_length_
[
0
])
{
same_shape_
=
false
;
}
segment_offsets
.
push_back
(
segment_offsets
.
back
()
+
output_length_
[
i
]
*
inner_cols_
);
}
segment_offsets_
=
segment_offsets
;
nvinfer1
::
Dims
dims
=
this
->
getInputDims
(
0
);
nx_
=
1
;
for
(
int
i
=
dims
.
nbDims
-
1
;
i
>
axis_
;
--
i
)
{
nx_
*=
dims
.
d
[
i
];
inner_cols_
*=
dims
.
d
[
axis_
];
d_segment_offsets_
=
segment_offsets
;
segment_offsets_
=
std
::
move
(
segment_offsets
);
d_output_ptrs_
.
resize
(
this
->
getNbOutputs
(),
nullptr
);
return
0
;
}
template
<
typename
T
>
inline
void
Split
(
cudaStream_t
stream
,
const
bool
same_shape
,
const
int
outer_rows
,
const
int
inner_cols
,
const
std
::
vector
<
int
>&
segment_offsets
,
const
int
*
d_segment_offsets
,
const
T
*
input
,
T
**
outputs
)
{
const
int
kThreadsPerBlock
=
1024
;
const
int
kMaxBlocks
=
65535
;
int
block_cols
=
kThreadsPerBlock
;
if
(
inner_cols
<
kThreadsPerBlock
)
{
// block_cols is aligned by 32.
block_cols
=
((
inner_cols
+
31
)
>>
5
)
<<
5
;
}
ny_
=
dims
.
d
[
axis_
];
nz_
=
1
;
for
(
int
i
=
axis_
-
1
;
i
>=
0
;
--
i
)
{
nz_
*=
dims
.
d
[
i
];
int
block_rows
=
kThreadsPerBlock
/
block_cols
;
dim3
block_size
=
dim3
(
block_cols
,
block_rows
,
1
);
int
grid_cols
=
std
::
min
((
inner_cols
+
block_cols
-
1
)
/
block_cols
,
kMaxBlocks
);
int
grid_rows
=
std
::
min
(
kMaxBlocks
/
grid_cols
,
std
::
max
(
outer_rows
/
block_rows
,
1
));
dim3
grid_size
=
dim3
(
grid_cols
,
grid_rows
,
1
);
if
(
same_shape
)
{
SplitKernel
<<<
grid_size
,
block_size
,
0
,
stream
>>>
(
input
,
outer_rows
,
inner_cols
,
segment_offsets
[
1
],
outputs
);
}
else
{
SplitKernel
<<<
grid_size
,
block_size
,
0
,
stream
>>>
(
input
,
outer_rows
,
inner_cols
,
d_segment_offsets
,
static_cast
<
int
>
(
segment_offsets
.
size
()),
outputs
);
}
return
0
;
}
int
SplitPlugin
::
enqueue
(
int
batchSize
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
{
auto
const
&
input_dims
=
this
->
getInputDims
(
0
);
int
input_size
=
0
;
float
const
*
idata
=
reinterpret_cast
<
float
const
*>
(
inputs
[
0
]);
float
**
odatas
=
reinterpret_cast
<
float
**>
(
outputs
);
// kernel impl here.
int
inputBatchOffset
=
nx_
*
ny_
*
nz_
;
for
(
size_t
i
=
0
;
i
<
this
->
getNbOutputs
();
i
++
)
{
for
(
size_t
j
=
0
;
j
<
batchSize
;
j
++
)
{
cudaMemcpyAsync
(
odatas
[
i
]
+
j
*
(
segment_offsets_
[
i
+
1
]
-
segment_offsets_
[
i
])
*
nx_
*
sizeof
(
float
),
inputs
[
0
]
+
(
inputBatchOffset
*
j
+
segment_offsets_
[
i
]
*
nx_
)
*
sizeof
(
float
),
(
segment_offsets_
[
i
+
1
]
-
segment_offsets_
[
i
])
*
nx_
*
sizeof
(
float
),
cudaMemcpyDeviceToDevice
,
stream
);
float
const
*
input_ptr
=
reinterpret_cast
<
float
const
*>
(
inputs
[
0
]);
if
(((
batchSize
==
1
&&
axis_
==
0
)
||
axis_
==
-
1
)
&&
this
->
getNbOutputs
()
<
10
)
{
float
**
output_ptrs
=
reinterpret_cast
<
float
**>
(
outputs
);
int
data_type_size
=
(
this
->
getDataType
()
==
nvinfer1
::
DataType
::
kFLOAT
)
?
sizeof
(
float
)
:
sizeof
(
__half
);
for
(
int
i
=
0
;
i
<
this
->
getNbOutputs
();
++
i
)
{
PADDLE_ENFORCE
(
cudaMemcpyAsync
(
output_ptrs
[
i
],
input_ptr
+
segment_offsets_
[
i
],
(
segment_offsets_
[
i
+
1
]
-
segment_offsets_
[
i
])
*
data_type_size
,
cudaMemcpyDeviceToDevice
,
stream
)
==
cudaSuccess
);
}
}
else
{
outer_rows_
*=
batchSize
;
const
int
*
d_segment_offsets_ptr
=
thrust
::
raw_pointer_cast
(
&
d_segment_offsets_
[
0
]);
float
**
output_ptrs
=
thrust
::
raw_pointer_cast
(
&
d_output_ptrs_
[
0
]);
PADDLE_ENFORCE
(
cudaMemcpyAsync
(
output_ptrs
,
outputs
,
this
->
getNbOutputs
()
*
sizeof
(
float
*
),
cudaMemcpyHostToDevice
,
stream
)
==
cudaSuccess
);
if
(
this
->
getDataType
()
==
nvinfer1
::
DataType
::
kFLOAT
)
{
Split
(
stream
,
same_shape_
,
outer_rows_
,
inner_cols_
,
segment_offsets_
,
d_segment_offsets_ptr
,
input_ptr
,
output_ptrs
);
}
else
{
Split
(
stream
,
same_shape_
,
outer_rows_
,
inner_cols_
,
segment_offsets_
,
d_segment_offsets_ptr
,
(
__half
*
)
input_ptr
,
// NOLINT
(
__half
**
)
output_ptrs
);
// NOLINT
}
}
return
cudaGetLastError
()
!=
cudaSuccess
;
}
...
...
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
浏览文件 @
557229bd
...
...
@@ -14,6 +14,7 @@
#pragma once
#include <thrust/device_vector.h>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...
...
@@ -25,7 +26,7 @@ namespace plugin {
class
SplitPlugin
:
public
PluginTensorRT
{
public:
SplitPlugin
(
int
axis
,
std
::
vector
<
int
>
const
&
output_lengths
)
:
axis_
(
axis
),
output_length_
(
output_lengths
)
{}
:
axis_
(
axis
),
same_shape_
(
true
),
output_length_
(
output_lengths
)
{}
SplitPlugin
(
void
const
*
serial_data
,
size_t
serial_length
)
{
deserializeBase
(
serial_data
,
serial_length
);
...
...
@@ -60,9 +61,13 @@ class SplitPlugin : public PluginTensorRT {
}
int
axis_
;
int
outer_rows_
;
int
inner_cols_
;
bool
same_shape_
;
std
::
vector
<
int
>
output_length_
;
int
nx_
,
ny_
,
nz_
;
std
::
vector
<
int
>
segment_offsets_
;
thrust
::
device_vector
<
int
>
d_segment_offsets_
;
thrust
::
device_vector
<
float
*>
d_output_ptrs_
;
};
}
// namespace plugin
...
...
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
557229bd
...
...
@@ -74,7 +74,7 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
# ocr
set
(
OCR_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/ocr"
)
if
(
NOT EXISTS
${
OCR_INSTALL_DIR
}
)
inference_download_and_uncompress
(
${
OCR_INSTALL_DIR
}
"http://paddlemodels.cdn.bcebos.com/"
"inference-vis-demos%2Focr.tar.gz"
)
inference_download_and_uncompress
(
${
OCR_INSTALL_DIR
}
"http://paddlemodels.cdn.bcebos.com/"
"inference-vis-demos%2Focr.tar.gz"
)
endif
()
inference_analysis_api_test
(
test_analyzer_ocr
${
OCR_INSTALL_DIR
}
analyzer_vis_tester.cc
)
...
...
@@ -88,31 +88,31 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
# anakin
if
(
WITH_ANAKIN AND WITH_MKL
)
# only needed in CI
# anakin rnn1
set
(
ANAKIN_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/anakin"
)
set
(
ANAKIN_RNN1_INSTALL_DIR
"
${
ANAKIN_INSTALL_DIR
}
/rnn1"
)
inference_download
(
${
ANAKIN_RNN1_INSTALL_DIR
}
${
INFERENCE_URL
}
"anakin_test%2Fditu_rnn.anakin2.model.bin"
)
inference_download
(
${
ANAKIN_RNN1_INSTALL_DIR
}
${
INFERENCE_URL
}
"anakin_test%2Fditu_rnn_data.txt"
)
cc_test
(
test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
ARGS --model=
${
ANAKIN_RNN1_INSTALL_DIR
}
/anakin_test%2Fditu_rnn.anakin2.model.bin
--datapath=
${
ANAKIN_RNN1_INSTALL_DIR
}
/anakin_test%2Fditu_rnn_data.txt
DEPS inference_anakin_api_shared SERIAL
)
# anakin mobilenet
if
(
WITH_GPU
)
set
(
ANAKIN_MOBILENET_INSTALL_DIR
"
${
ANAKIN_INSTALL_DIR
}
/mobilenet"
)
inference_download
(
${
ANAKIN_MOBILENET_INSTALL_DIR
}
${
INFERENCE_URL
}
"mobilenet_v2.anakin.bin"
)
cc_test
(
test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
ARGS --model=
${
ANAKIN_MOBILENET_INSTALL_DIR
}
/mobilenet_v2.anakin.bin
DEPS inference_anakin_api_shared dynload_cuda SERIAL
)
endif
()
# anakin rnn1
set
(
ANAKIN_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/anakin"
)
set
(
ANAKIN_RNN1_INSTALL_DIR
"
${
ANAKIN_INSTALL_DIR
}
/rnn1"
)
inference_download
(
${
ANAKIN_RNN1_INSTALL_DIR
}
${
INFERENCE_URL
}
"anakin_test%2Fditu_rnn.anakin2.model.bin"
)
inference_download
(
${
ANAKIN_RNN1_INSTALL_DIR
}
${
INFERENCE_URL
}
"anakin_test%2Fditu_rnn_data.txt"
)
cc_test
(
test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
ARGS --model=
${
ANAKIN_RNN1_INSTALL_DIR
}
/anakin_test%2Fditu_rnn.anakin2.model.bin
--datapath=
${
ANAKIN_RNN1_INSTALL_DIR
}
/anakin_test%2Fditu_rnn_data.txt
DEPS inference_anakin_api_shared SERIAL
)
# anakin mobilenet
if
(
WITH_GPU
)
set
(
ANAKIN_MOBILENET_INSTALL_DIR
"
${
ANAKIN_INSTALL_DIR
}
/mobilenet"
)
inference_download
(
${
ANAKIN_MOBILENET_INSTALL_DIR
}
${
INFERENCE_URL
}
"mobilenet_v2.anakin.bin"
)
cc_test
(
test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
ARGS --model=
${
ANAKIN_MOBILENET_INSTALL_DIR
}
/mobilenet_v2.anakin.bin
DEPS inference_anakin_api_shared dynload_cuda SERIAL
)
endif
()
endif
()
if
(
WITH_GPU AND TENSORRT_FOUND
)
set
(
TRT_MODEL_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/trt"
)
if
(
NOT EXISTS
${
TRT_MODEL_INSTALL_DIR
}
)
inference_download_and_uncompress
(
${
TRT_MODEL_INSTALL_DIR
}
${
INFERENCE_URL
}
/tensorrt_test
"trt_test_models.tar.gz"
)
endif
()
inference_analysis_test
(
test_trt_models SRCS trt_models_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
TRT_MODEL_INSTALL_DIR
}
/trt_test_models SERIAL
)
set
(
TRT_MODEL_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/trt"
)
if
(
NOT EXISTS
${
TRT_MODEL_INSTALL_DIR
}
)
inference_download_and_uncompress
(
${
TRT_MODEL_INSTALL_DIR
}
${
INFERENCE_URL
}
/tensorrt_test
"trt_test_models.tar.gz"
)
endif
()
inference_analysis_test
(
test_trt_models SRCS trt_models_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
TRT_MODEL_INSTALL_DIR
}
/trt_test_models SERIAL
)
endif
()
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
浏览文件 @
557229bd
...
...
@@ -27,6 +27,7 @@ void SetConfig(AnalysisConfig *cfg) {
cfg
->
device
=
0
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
specify_input_name
=
true
;
cfg
->
SetCpuMathLibraryNumThreads
(
FLAGS_paddle_num_threads
);
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
...
...
paddle/fluid/inference/tests/api/config_printer.h
浏览文件 @
557229bd
...
...
@@ -53,6 +53,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
os
<<
GenSpaces
(
num_spaces
)
<<
"param_file: "
<<
config
.
param_file
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"specify_input_name: "
<<
config
.
specify_input_name
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"cpu_num_threads: "
<<
config
.
cpu_math_library_num_threads
()
<<
"
\n
"
;
num_spaces
--
;
os
<<
GenSpaces
(
num_spaces
)
<<
"}
\n
"
;
return
os
;
...
...
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
557229bd
...
...
@@ -42,6 +42,7 @@ DEFINE_bool(use_analysis, true,
"Running the inference program in analysis mode."
);
DECLARE_bool
(
profile
);
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
inference
{
...
...
@@ -206,22 +207,23 @@ void TestMultiThreadPrediction(
int
batch_size
=
FLAGS_batch_size
;
int
num_times
=
FLAGS_repeat
;
std
::
vector
<
std
::
thread
>
threads
;
std
::
vector
<
std
::
unique_ptr
<
PaddlePredictor
>>
predictors
;
predictors
.
emplace_back
(
CreateTestPredictor
(
config
,
use_analysis
));
for
(
int
tid
=
1
;
tid
<
num_threads
;
++
tid
)
{
predictors
.
emplace_back
(
predictors
.
front
()
->
Clone
());
}
auto
main_predictor
=
CreateTestPredictor
(
config
,
use_analysis
);
size_t
total_time
{
0
};
for
(
int
tid
=
0
;
tid
<
num_threads
;
++
tid
)
{
threads
.
emplace_back
([
&
,
tid
]()
{
#ifdef PADDLE_WITH_MKLDNN
platform
::
set_cur_thread_id
(
static_cast
<
int
>
(
tid
)
+
1
);
#endif
// Each thread should have local inputs and outputs.
// The inputs of each thread are all the same.
std
::
vector
<
PaddleTensor
>
outputs_tid
;
auto
&
predictor
=
predictors
[
tid
];
// To ensure each thread is bound correctly,
// please clone the predictor inside the worker thread.
auto
predictor
=
main_predictor
->
Clone
();
#ifdef PADDLE_WITH_MKLDNN
if
(
use_analysis
)
{
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
())
->
SetMkldnnThreadID
(
static_cast
<
int
>
(
tid
)
+
1
);
}
#endif
// warmup run
LOG
(
INFO
)
<<
"Running thread "
<<
tid
<<
", warm up run..."
;
...
...
paddle/fluid/inference/utils/CMakeLists.txt
0 → 100644
浏览文件 @
557229bd
cc_library
(
benchmark SRCS benchmark.cc DEPS enforce
)
cc_test
(
test_benchmark SRCS benchmark_tester.cc DEPS benchmark
)
paddle/fluid/inference/utils/benchmark.cc
0 → 100644
浏览文件 @
557229bd
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/utils/benchmark.h"
#include <sstream>
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
inference
{
// Renders the benchmark as a small tab-separated table: a separator line, a
// header row (name, batch_size, num_threads, latency, qps) and one value row.
//
// qps is derived as 1000 / latency. The division is done in floating point so
// that the result is not truncated (e.g. a latency of 220 yields ~4.55 rather
// than 4), and a non-positive latency reports a qps of 0 instead of dividing
// by zero.
std::string Benchmark::SerializeToString() const {
  std::stringstream ss;
  ss << "-----------------------------------------------------\n";
  ss << "name\t";
  ss << "batch_size\t";
  ss << "num_threads\t";
  ss << "latency\t";
  ss << "qps";
  ss << '\n';

  ss << name_ << "\t";
  ss << batch_size_ << "\t";
  ss << num_threads_ << "\t";
  ss << latency_ << "\t";
  // Guard against a zero (or unset) latency before dividing.
  ss << (latency_ > 0 ? 1000.0 / latency_ : 0.0);
  ss << '\n';
  return ss.str();
}
// Appends the serialized benchmark record to the file at `path`.
// Fails through PADDLE_ENFORCE when the file cannot be opened.
void Benchmark::PersistToFile(const std::string &path) const {
  // std::ios::app: earlier records in the file are kept, new ones go to the end.
  std::ofstream out(path, std::ios::app);
  PADDLE_ENFORCE(out.is_open(), "Can not open %s to add benchmark", path);

  const std::string record = SerializeToString();
  out << record;
  out.flush();
  out.close();
}
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/utils/benchmark.h
0 → 100644
浏览文件 @
557229bd
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include <fstream>
#include <iostream>
#include <string>
namespace
paddle
{
namespace
inference
{
/*
 * Helper class to calculate the performance.
 *
 * Holds one benchmark record (name, batch size, thread count, GPU flag and
 * latency) and can render it as text or append it to a file.
 */
struct Benchmark {
  // Batch size of the measured run (defaults to 0).
  int batch_size() const { return batch_size_; }
  void SetBatchSize(int x) { batch_size_ = x; }

  // Number of threads of the measured run (defaults to 1).
  int num_threads() const { return num_threads_; }
  void SetNumThreads(int x) { num_threads_ = x; }

  // Whether the run used a GPU (defaults to false; can only be switched on).
  bool use_gpu() const { return use_gpu_; }
  void SetUseGpu() { use_gpu_ = true; }

  // Measured latency (defaults to 0 until SetLatency is called).
  // NOTE(review): presumably milliseconds, since qps is derived as
  // 1000 / latency when the record is serialized -- confirm with callers.
  int latency() const { return latency_; }
  void SetLatency(int x) { latency_ = x; }

  // Free-form identifier of the benchmark (defaults to an empty string).
  const std::string& name() const { return name_; }
  void SetName(const std::string& name) { name_ = name; }

  // Returns the record rendered as text.
  std::string SerializeToString() const;
  // Writes the rendered record to the file at `path`.
  void PersistToFile(const std::string& path) const;

 private:
  bool use_gpu_{false};
  int batch_size_{0};
  // Initialized explicitly: the member used to be left indeterminate, so
  // reading it before SetLatency() was undefined behavior.
  int latency_{0};
  int num_threads_{1};
  std::string name_;
};
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/utils/benchmark_tester.cc
0 → 100644
浏览文件 @
557229bd
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/utils/benchmark.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
using
namespace
paddle
::
inference
;
// Checks that the setters are reflected by the getters and that the
// serialized report contains the configured values.
TEST(Benchmark, basic) {
  Benchmark benchmark;
  benchmark.SetName("key0");
  benchmark.SetBatchSize(10);
  benchmark.SetUseGpu();
  benchmark.SetLatency(220);

  EXPECT_EQ(benchmark.name(), "key0");
  EXPECT_EQ(benchmark.batch_size(), 10);
  // SetNumThreads is never called, so the default of one thread applies.
  EXPECT_EQ(benchmark.num_threads(), 1);
  EXPECT_TRUE(benchmark.use_gpu());
  EXPECT_EQ(benchmark.latency(), 220);

  const std::string report = benchmark.SerializeToString();
  EXPECT_NE(report.find("key0"), std::string::npos);
  EXPECT_NE(report.find("220"), std::string::npos);
  LOG(INFO) << "benchmark:\n" << report;
}
// Smoke test: persisting the same record repeatedly must not fail.
TEST(Benchmark, PersistToFile) {
  Benchmark record;
  record.SetName("key0");
  record.SetBatchSize(10);
  record.SetUseGpu();
  record.SetLatency(220);

  // Write the record three times to the same log file.
  for (int round = 0; round < 3; ++round) {
    record.PersistToFile("1.log");
  }
}
\ No newline at end of file
paddle/fluid/memory/allocation/retry_allocator_test.cc
浏览文件 @
557229bd
...
...
@@ -41,7 +41,7 @@ TEST(RetryAllocator, RetryAllocator) {
size_t
thread_num
=
32
;
size_t
sleep_time
=
40
;
size_t
extra_time
=
2
;
size_t
extra_time
=
10
;
// Reserve to perform more tests in the future
std
::
vector
<
std
::
shared_ptr
<
Allocator
>>
allocators
;
...
...
paddle/fluid/operators/detection/CMakeLists.txt
浏览文件 @
557229bd
...
...
@@ -22,7 +22,7 @@ iou_similarity_op.cu)
detection_library
(
mine_hard_examples_op SRCS mine_hard_examples_op.cc
)
detection_library
(
multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc
)
detection_library
(
prior_box_op SRCS prior_box_op.cc prior_box_op.cu
)
detection_library
(
density_prior_box_op SRCS density_prior_box_op.cc
)
detection_library
(
density_prior_box_op SRCS density_prior_box_op.cc
density_prior_box_op.cu
)
detection_library
(
anchor_generator_op SRCS anchor_generator_op.cc
anchor_generator_op.cu
)
detection_library
(
target_assign_op SRCS target_assign_op.cc
...
...
paddle/fluid/operators/detection/density_prior_box_op.cc
浏览文件 @
557229bd
...
...
@@ -39,24 +39,27 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
auto
fixed_sizes
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
float
>>
(
"fixed_sizes"
);
auto
fixed_ratios
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
float
>>
(
"fixed_ratios"
);
auto
densities
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"densities"
);
bool
flatten
=
ctx
->
Attrs
().
Get
<
bool
>
(
"flatten_to_2d"
);
PADDLE_ENFORCE_EQ
(
fixed_sizes
.
size
(),
densities
.
size
(),
"The number of fixed_sizes and densities must be equal."
);
size_t
num_priors
=
0
;
if
((
fixed_sizes
.
size
()
>
0
)
&&
(
densities
.
size
()
>
0
))
{
for
(
size_t
i
=
0
;
i
<
densities
.
size
();
++
i
)
{
if
(
fixed_ratios
.
size
()
>
0
)
{
num_priors
+=
(
fixed_ratios
.
size
())
*
(
pow
(
densities
[
i
],
2
));
}
}
for
(
size_t
i
=
0
;
i
<
densities
.
size
();
++
i
)
{
num_priors
+=
(
fixed_ratios
.
size
())
*
(
pow
(
densities
[
i
],
2
));
}
if
(
!
flatten
)
{
std
::
vector
<
int64_t
>
dim_vec
(
4
);
dim_vec
[
0
]
=
input_dims
[
2
];
dim_vec
[
1
]
=
input_dims
[
3
];
dim_vec
[
2
]
=
num_priors
;
dim_vec
[
3
]
=
4
;
ctx
->
SetOutputDim
(
"Boxes"
,
framework
::
make_ddim
(
dim_vec
));
ctx
->
SetOutputDim
(
"Variances"
,
framework
::
make_ddim
(
dim_vec
));
}
else
{
int64_t
dim0
=
input_dims
[
2
]
*
input_dims
[
3
]
*
num_priors
;
ctx
->
SetOutputDim
(
"Boxes"
,
{
dim0
,
4
});
ctx
->
SetOutputDim
(
"Variances"
,
{
dim0
,
4
});
}
std
::
vector
<
int64_t
>
dim_vec
(
4
);
dim_vec
[
0
]
=
input_dims
[
2
];
dim_vec
[
1
]
=
input_dims
[
3
];
dim_vec
[
2
]
=
num_priors
;
dim_vec
[
3
]
=
4
;
ctx
->
SetOutputDim
(
"Boxes"
,
framework
::
make_ddim
(
dim_vec
));
ctx
->
SetOutputDim
(
"Variances"
,
framework
::
make_ddim
(
dim_vec
));
}
protected:
...
...
@@ -64,7 +67,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
framework
::
Tensor
>
(
"Input"
)
->
type
()),
platform
::
CPU
Place
());
ctx
.
Get
Place
());
}
};
...
...
@@ -101,7 +104,10 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
});
AddAttr
<
bool
>
(
"clip"
,
"(bool) Whether to clip out-of-boundary boxes."
)
.
SetDefault
(
true
);
AddAttr
<
bool
>
(
"flatten_to_2d"
,
"(bool) Whether to flatten to 2D and "
"the second dim is 4."
)
.
SetDefault
(
false
);
AddAttr
<
float
>
(
"step_w"
,
"Density prior boxes step across width, 0.0 for auto calculation."
)
...
...
paddle/fluid/operators/detection/density_prior_box_op.cu
0 → 100644
浏览文件 @
557229bd
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/density_prior_box_op.h"
namespace
paddle
{
namespace
operators
{
// Clamps `in` to the closed interval [0, 1].
//
// The bounds are expressed in T so that the float instantiation stays in
// single precision; with double literals the float value was promoted to
// double for both comparisons and then converted back.
template <typename T>
static __device__ inline T Clip(T in) {
  const T lower = static_cast<T>(0);
  const T upper = static_cast<T>(1);
  return min(max(in, lower), upper);
}
// Kernel that fills `out` and `var` with one 4-value entry per
// (row, column, prior) combination.
//
// `ratios_shift` is read as four consecutive segments of `num_priors` values:
// box widths, box heights, x shifts and y shifts. For every entry the box
// corners are computed around the shifted cell center, normalized by the image
// size and bounded to [0, +inf) / (-inf, 1]; when `is_clip` is set they are
// additionally passed through Clip. `var` receives the four var_* values.
//
// Rows and flattened (column, prior) indices are both walked with grid-stride
// loops, so any launch configuration covers the whole output.
template <typename T>
static __global__ void GenDensityPriorBox(
    const int height, const int width, const int im_height, const int im_width,
    const T offset, const T step_width, const T step_height,
    const int num_priors, const T* ratios_shift, bool is_clip,
    const T var_xmin, const T var_ymin, const T var_xmax, const T var_ymax,
    T* out, T* var) {
  const int first_col = blockIdx.x * blockDim.x + threadIdx.x;
  const int first_row = blockIdx.y * blockDim.y + threadIdx.y;
  const int col_stride = blockDim.x * gridDim.x;
  const int row_stride = blockDim.y * gridDim.y;

  // Views into the four segments of the packed per-prior table.
  const T* prior_width = ratios_shift;
  const T* prior_height = ratios_shift + num_priors;
  const T* prior_shift_x = ratios_shift + 2 * num_priors;
  const T* prior_shift_y = ratios_shift + 3 * num_priors;

  for (int row = first_row; row < height; row += row_stride) {
    for (int col = first_col; col < width * num_priors; col += col_stride) {
      // `col` enumerates (feature column, prior) pairs of this row.
      const int cell = col / num_priors;
      const int prior = col % num_priors;

      const T cell_center_x = (cell + offset) * step_width;
      const T cell_center_y = (row + offset) * step_height;
      const T box_center_x = cell_center_x + prior_shift_x[prior];
      const T box_center_y = cell_center_y + prior_shift_y[prior];

      const T half_width = prior_width[prior] / 2.;
      const T half_height = prior_height[prior] / 2.;

      T xmin = max((box_center_x - half_width) / im_width, 0.);
      T ymin = max((box_center_y - half_height) / im_height, 0.);
      T xmax = min((box_center_x + half_width) / im_width, 1.);
      T ymax = min((box_center_y + half_height) / im_height, 1.);

      if (is_clip) {
        xmin = Clip<T>(xmin);
        ymin = Clip<T>(ymin);
        xmax = Clip<T>(xmax);
        ymax = Clip<T>(ymax);
      }

      const int base = (row * width * num_priors + col) * 4;
      out[base] = xmin;
      out[base + 1] = ymin;
      out[base + 2] = xmax;
      out[base + 3] = ymax;

      var[base] = var_xmin;
      var[base + 1] = var_ymin;
      var[base + 2] = var_xmax;
      var[base + 3] = var_ymax;
    }
  }
}
// CUDA kernel wrapper of the density_prior_box op.
//
// Compute() builds, on the host, a packed table with the width, height and
// x/y center shift of every prior, copies it to the device and launches
// GenDensityPriorBox to fill the "Boxes" and "Variances" outputs.
template <typename T>
class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");

    auto variances = ctx.Attr<std::vector<float>>("variances");
    auto is_clip = ctx.Attr<bool>("clip");

    auto fixed_sizes = ctx.Attr<std::vector<float>>("fixed_sizes");
    auto fixed_ratios = ctx.Attr<std::vector<float>>("fixed_ratios");
    auto densities = ctx.Attr<std::vector<int>>("densities");

    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
    T offset = static_cast<T>(ctx.Attr<float>("offset"));

    // Dimensions 2 and 3 of both tensors are used as height and width.
    auto img_width = image->dims()[3];
    auto img_height = image->dims()[2];

    auto feature_width = input->dims()[3];
    auto feature_height = input->dims()[2];

    // A zero step in either direction means "derive the step from the
    // image / feature-map size ratio".
    T step_width, step_height;
    if (step_w == 0 || step_h == 0) {
      step_width = static_cast<T>(img_width) / feature_width;
      step_height = static_cast<T>(img_height) / feature_height;
    } else {
      step_width = step_w;
      step_height = step_h;
    }

    // Each density d contributes d * d boxes per fixed ratio. Pure integer
    // arithmetic avoids a round trip through floating-point pow().
    int num_priors = 0;
    for (size_t i = 0; i < densities.size(); ++i) {
      num_priors +=
          static_cast<int>(fixed_ratios.size()) * densities[i] * densities[i];
    }
    // With no priors the launch configuration below would be computed with a
    // block size of zero (integer division by zero).
    PADDLE_ENFORCE_GT(num_priors, 0,
                      "The number of density prior boxes must be positive.");

    int step_average = static_cast<int>((step_width + step_height) * 0.5);

    // Host-side table, laid out as four segments of num_priors values:
    // widths | heights | x shifts | y shifts.
    framework::Tensor h_temp;
    T* tdata = h_temp.mutable_data<T>({num_priors * 4}, platform::CPUPlace());
    int idx = 0;
    for (size_t s = 0; s < fixed_sizes.size(); ++s) {
      auto fixed_size = fixed_sizes[s];
      int density = densities[s];
      // density is used as a divisor below.
      PADDLE_ENFORCE_GT(density, 0, "Each density must be positive.");
      for (size_t r = 0; r < fixed_ratios.size(); ++r) {
        float ar = fixed_ratios[r];
        int shift = step_average / density;
        float box_width_ratio = fixed_size * sqrt(ar);
        float box_height_ratio = fixed_size / sqrt(ar);
        for (int di = 0; di < density; ++di) {
          for (int dj = 0; dj < density; ++dj) {
            float center_x_temp = shift / 2. + dj * shift - step_average / 2.;
            float center_y_temp = shift / 2. + di * shift - step_average / 2.;
            tdata[idx] = box_width_ratio;
            tdata[num_priors + idx] = box_height_ratio;
            tdata[2 * num_priors + idx] = center_x_temp;
            tdata[3 * num_priors + idx] = center_y_temp;
            idx++;
          }
        }
      }
    }

    boxes->mutable_data<T>(ctx.GetPlace());
    vars->mutable_data<T>(ctx.GetPlace());

    // Copy the table to the place the kernel runs on.
    framework::Tensor d_temp;
    framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp);

    // At least use 32 threads, at most 512 threads.
    // blockx is multiple of 32.
    int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L);
    int gridx = (feature_width * num_priors + blockx - 1) / blockx;
    dim3 threads(blockx, 1);
    dim3 grids(gridx, feature_height);

    auto stream =
        ctx.template device_context<platform::CUDADeviceContext>().stream();
    GenDensityPriorBox<T><<<grids, threads, 0, stream>>>(
        feature_height, feature_width, img_height, img_width, offset,
        step_width, step_height, num_priors, d_temp.data<T>(), is_clip,
        variances[0], variances[1], variances[2], variances[3],
        boxes->data<T>(), vars->data<T>());
  }
};  // class DensityPriorBoxOpCUDAKernel
}
// namespace operators
}
// namespace paddle
// Register the CUDA kernel of the density_prior_box op for float and double.
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(density_prior_box,
                        ops::DensityPriorBoxOpCUDAKernel<float>,
                        ops::DensityPriorBoxOpCUDAKernel<double>);
paddle/fluid/operators/detection/density_prior_box_op.h
浏览文件 @
557229bd
/* Copyright (c) 201
6
PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 201
8
PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
...
...
@@ -52,18 +52,16 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
step_height
=
step_h
;
}
int
num_priors
=
0
;
if
(
fixed_sizes
.
size
()
>
0
&&
densities
.
size
()
>
0
)
{
for
(
size_t
i
=
0
;
i
<
densities
.
size
();
++
i
)
{
if
(
fixed_ratios
.
size
()
>
0
)
{
num_priors
+=
(
fixed_ratios
.
size
())
*
(
pow
(
densities
[
i
],
2
));
}
}
for
(
size_t
i
=
0
;
i
<
densities
.
size
();
++
i
)
{
num_priors
+=
(
fixed_ratios
.
size
())
*
(
pow
(
densities
[
i
],
2
));
}
boxes
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
vars
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
e_boxes
=
framework
::
EigenTensor
<
T
,
4
>::
From
(
*
boxes
).
setConstant
(
0.0
);
auto
box_dim
=
vars
->
dims
();
boxes
->
Resize
({
feature_height
,
feature_width
,
num_priors
,
4
});
auto
e_boxes
=
framework
::
EigenTensor
<
T
,
4
>::
From
(
*
boxes
).
setConstant
(
0.0
);
int
step_average
=
static_cast
<
int
>
((
step_width
+
step_height
)
*
0.5
);
for
(
int
h
=
0
;
h
<
feature_height
;
++
h
)
{
...
...
@@ -76,36 +74,34 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
auto
fixed_size
=
fixed_sizes
[
s
];
int
density
=
densities
[
s
];
// Generate density prior boxes with fixed ratios.
if
(
fixed_ratios
.
size
()
>
0
)
{
for
(
size_t
r
=
0
;
r
<
fixed_ratios
.
size
();
++
r
)
{
float
ar
=
fixed_ratios
[
r
];
int
shift
=
step_average
/
density
;
float
box_width_ratio
=
fixed_size
*
sqrt
(
ar
);
float
box_height_ratio
=
fixed_size
/
sqrt
(
ar
);
for
(
int
di
=
0
;
di
<
density
;
++
di
)
{
for
(
int
dj
=
0
;
dj
<
density
;
++
dj
)
{
float
center_x_temp
=
center_x
-
step_average
/
2.
+
shift
/
2.
+
dj
*
shift
;
float
center_y_temp
=
center_y
-
step_average
/
2.
+
shift
/
2.
+
di
*
shift
;
e_boxes
(
h
,
w
,
idx
,
0
)
=
(
center_x_temp
-
box_width_ratio
/
2.
)
/
img_width
>=
0
?
(
center_x_temp
-
box_width_ratio
/
2.
)
/
img_width
:
0
;
e_boxes
(
h
,
w
,
idx
,
1
)
=
(
center_y_temp
-
box_height_ratio
/
2.
)
/
img_height
>=
0
?
(
center_y_temp
-
box_height_ratio
/
2.
)
/
img_height
:
0
;
e_boxes
(
h
,
w
,
idx
,
2
)
=
(
center_x_temp
+
box_width_ratio
/
2.
)
/
img_width
<=
1
?
(
center_x_temp
+
box_width_ratio
/
2.
)
/
img_width
:
1
;
e_boxes
(
h
,
w
,
idx
,
3
)
=
(
center_y_temp
+
box_height_ratio
/
2.
)
/
img_height
<=
1
?
(
center_y_temp
+
box_height_ratio
/
2.
)
/
img_height
:
1
;
idx
++
;
}
for
(
size_t
r
=
0
;
r
<
fixed_ratios
.
size
();
++
r
)
{
float
ar
=
fixed_ratios
[
r
];
int
shift
=
step_average
/
density
;
float
box_width_ratio
=
fixed_size
*
sqrt
(
ar
);
float
box_height_ratio
=
fixed_size
/
sqrt
(
ar
);
for
(
int
di
=
0
;
di
<
density
;
++
di
)
{
for
(
int
dj
=
0
;
dj
<
density
;
++
dj
)
{
float
center_x_temp
=
center_x
-
step_average
/
2.
+
shift
/
2.
+
dj
*
shift
;
float
center_y_temp
=
center_y
-
step_average
/
2.
+
shift
/
2.
+
di
*
shift
;
e_boxes
(
h
,
w
,
idx
,
0
)
=
(
center_x_temp
-
box_width_ratio
/
2.
)
/
img_width
>=
0
?
(
center_x_temp
-
box_width_ratio
/
2.
)
/
img_width
:
0
;
e_boxes
(
h
,
w
,
idx
,
1
)
=
(
center_y_temp
-
box_height_ratio
/
2.
)
/
img_height
>=
0
?
(
center_y_temp
-
box_height_ratio
/
2.
)
/
img_height
:
0
;
e_boxes
(
h
,
w
,
idx
,
2
)
=
(
center_x_temp
+
box_width_ratio
/
2.
)
/
img_width
<=
1
?
(
center_x_temp
+
box_width_ratio
/
2.
)
/
img_width
:
1
;
e_boxes
(
h
,
w
,
idx
,
3
)
=
(
center_y_temp
+
box_height_ratio
/
2.
)
/
img_height
<=
1
?
(
center_y_temp
+
box_height_ratio
/
2.
)
/
img_height
:
1
;
idx
++
;
}
}
}
...
...
@@ -139,6 +135,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
e_vars
=
var_et
.
broadcast
(
Eigen
::
DSizes
<
int
,
2
>
(
box_num
,
1
));
vars
->
Resize
(
var_dim
);
boxes
->
Resize
(
box_dim
);
}
};
// namespace operators
...
...
paddle/fluid/operators/distributed/grpc_client.cc
浏览文件 @
557229bd
...
...
@@ -22,6 +22,8 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool
(
rpc_disable_reuse_port
);
namespace
paddle
{
namespace
operators
{
namespace
distributed
{
...
...
@@ -383,6 +385,9 @@ std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
// Channel configurations:
grpc
::
ChannelArguments
args
;
args
.
SetInt
(
GRPC_ARG_MAX_RECONNECT_BACKOFF_MS
,
2000
);
if
(
FLAGS_rpc_disable_reuse_port
)
{
args
.
SetInt
(
GRPC_ARG_ALLOW_REUSEPORT
,
0
);
}
args
.
SetCompressionAlgorithm
(
GRPC_COMPRESS_NONE
);
args
.
SetMaxSendMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
args
.
SetMaxReceiveMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
...
...
paddle/fluid/operators/distributed/grpc_server.cc
浏览文件 @
557229bd
...
...
@@ -20,6 +20,8 @@ limitations under the License. */
using
::
grpc
::
ServerAsyncResponseWriter
;
DECLARE_bool
(
rpc_disable_reuse_port
);
namespace
paddle
{
namespace
operators
{
namespace
distributed
{
...
...
@@ -252,6 +254,20 @@ void AsyncGRPCServer::WaitServerReady() {
VLOG
(
40
)
<<
"AsyncGRPCServer WaitSeverReady"
;
}
// Server builder option that turns off SO_REUSEPORT on the server socket by
// setting GRPC_ARG_ALLOW_REUSEPORT to 0.
// Adapted from:
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
class NoReusePortOption : public ::grpc::ServerBuilderOption {
 public:
  void UpdateArguments(::grpc::ChannelArguments* channel_args) override {
    const int reuse_port_disabled = 0;
    channel_args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, reuse_port_disabled);
  }

  // This option does not touch the plugin list.
  void UpdatePlugins(
      std::vector<std::unique_ptr<::grpc::ServerBuilderPlugin>>* /*plugins*/)
      override {}
};
void
AsyncGRPCServer
::
StartServer
()
{
::
grpc
::
ServerBuilder
builder
;
builder
.
AddListeningPort
(
bind_address_
,
::
grpc
::
InsecureServerCredentials
(),
...
...
@@ -259,6 +275,10 @@ void AsyncGRPCServer::StartServer() {
builder
.
SetMaxSendMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
builder
.
SetMaxReceiveMessageSize
(
std
::
numeric_limits
<
int
>::
max
());
if
(
FLAGS_rpc_disable_reuse_port
)
{
builder
.
SetOption
(
std
::
unique_ptr
<::
grpc
::
ServerBuilderOption
>
(
new
NoReusePortOption
));
}
builder
.
RegisterService
(
&
service_
);
for
(
auto
t
:
rpc_call_map_
)
{
...
...
paddle/fluid/operators/distributed/sendrecvop_utils.cc
浏览文件 @
557229bd
...
...
@@ -22,6 +22,8 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
DEFINE_bool
(
rpc_disable_reuse_port
,
false
,
"Disable SO_REUSEPORT or not."
);
namespace
paddle
{
namespace
operators
{
namespace
distributed
{
...
...
paddle/fluid/operators/fused/CMakeLists.txt
浏览文件 @
557229bd
include
(
operators
)
register_operators
()
register_operators
(
EXCLUDES fusion_transpose_flatten_concat_op
)
if
(
WITH_GPU
)
op_library
(
fusion_transpose_flatten_concat_op
)
file
(
APPEND
${
pybind_file
}
"USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);
\n
"
)
endif
()
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc
0 → 100644
浏览文件 @
557229bd
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
using
framework
::
Tensor
;
// Operator definition of fusion_transpose_flatten_concat.
//
// InferShape permutes every input shape with `trans_axis`, flattens it to 2-D
// at `flatten_axis`, and sums the sizes along `concat_axis`; the other
// dimension must agree across all inputs.
class TransposeFlattenConcatFusionOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
                      "Inputs(X) of ConcatOp should not be empty.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of ConcatOp should not be null.");

    auto ins = ctx->GetInputsDim("X");
    const size_t n = ins.size();
    PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");

    std::vector<int> trans_axis =
        ctx->Attrs().Get<std::vector<int>>("trans_axis");
    int flatten_axis = ctx->Attrs().Get<int>("flatten_axis");
    int concat_axis = ctx->Attrs().Get<int>("concat_axis");

    // The flattened shapes are always 2-D, and concat_axis is used as an
    // index into them below, so anything but 0 or 1 would be out of bounds.
    PADDLE_ENFORCE(concat_axis == 0 || concat_axis == 1,
                   "The concat_axis should be 0 or 1, since the tensor is 2D "
                   "after flatting.");

    size_t x_rank = ins[0].size();
    size_t trans_axis_size = trans_axis.size();
    PADDLE_ENFORCE_EQ(x_rank, trans_axis_size,
                      "The input tensor's rank(%d) "
                      "should be equal to the permutation axis's size(%d)",
                      x_rank, trans_axis_size);

    auto dims0 =
        GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[0]));
    std::vector<int> out_dims(dims0);
    for (size_t i = 1; i < n; i++) {
      // Every input is permuted with the same trans_axis, so each one needs
      // the matching rank, not only the first.
      size_t rank_i = ins[i].size();
      PADDLE_ENFORCE_EQ(rank_i, trans_axis_size,
                        "The input tensor's rank(%d) "
                        "should be equal to the permutation axis's size(%d)",
                        rank_i, trans_axis_size);
      auto dimsi =
          GetFlattenShape(flatten_axis, GetPermuteShape(trans_axis, ins[i]));
      for (int j = 0; j < static_cast<int>(dims0.size()); j++) {
        if (j == concat_axis) {
          out_dims[concat_axis] += dimsi[j];
        } else {
          PADDLE_ENFORCE_EQ(out_dims[j], dimsi[j],
                            "After flatting, the %d-th dim should be same "
                            "except the specify axis.",
                            j);
        }
      }
    }
    // A negative sum is normalized to -1.
    if (out_dims[concat_axis] < 0) {
      out_dims[concat_axis] = -1;
    }
    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
  }
};
// Declares the inputs, outputs and attributes of
// fusion_transpose_flatten_concat.
class TransposeFlattenConcatFusionOpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput(
        "X",
        "(Tensor) The input tensor, tensors with rank up to 6 are supported.")
        .AsDuplicable();
    AddOutput("Out", "(Tensor)The output tensor.");
    AddAttr<std::vector<int>>(
        "trans_axis",
        "(vector<int>) A list of values, and the size of the list should be "
        "the same with the input tensor rank. This operator permutes the input "
        "tensor's axes according to the values given.");
    // Adjacent literals are concatenated verbatim, so each piece carries its
    // own trailing space.
    AddAttr<int>("flatten_axis",
                 "(int) "
                 "Indicate up to which input dimensions (exclusive) should be "
                 "flattened to the outer dimension of the output. The value "
                 "for axis must be in the range [0, R], where R is the rank of "
                 "the input tensor. When axis = 0, the shape of the output "
                 "tensor is (1, (d_0 X d_1 ... d_n)), where the shape of the "
                 "input tensor is (d_0, d_1, ... d_n).");
    AddAttr<int>("concat_axis",
                 "The axis along which the input tensors will be concatenated. "
                 "It should be 0 or 1, since the tensor is 2D after flatting.");
    AddComment(R"DOC(
Fused operator that transposes every input tensor according to trans_axis,
flattens the result to a 2D tensor at flatten_axis, and concatenates the
flattened tensors along concat_axis.
)DOC");
  }
};
}
// namespace operators
}
// namespace paddle
// Register the operator with its proto maker; EmptyGradOpMaker is passed as
// the gradient op maker.
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_transpose_flatten_concat,
                  ops::TransposeFlattenConcatFusionOp,
                  ops::TransposeFlattenConcatFusionOpMaker,
                  paddle::framework::EmptyGradOpMaker);
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc
0 → 100644
浏览文件 @
557229bd
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h"
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace
paddle
{
namespace
operators
{
// Local shorthand for platform::CudnnDataType<T>.
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
// CUDA kernel of fusion_transpose_flatten_concat.
//
// For each input, one cudnnTransformTensor call copies the input, described
// with permuted strides, into its slot of the output, described with the
// strides of the flattened and concatenated result.
template <typename T>
class TransposeFlattenConcatFusionKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto ins = ctx.MultiInput<framework::Tensor>("X");
    auto* out = ctx.Output<framework::Tensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());
    auto odims = out->dims();

    std::vector<int> trans_axis = ctx.Attr<std::vector<int>>("trans_axis");
    int flatten_axis = ctx.Attr<int>("flatten_axis");
    int concat_axis = ctx.Attr<int>("concat_axis");

    // The rank of the first input is used for all inputs.
    int rank = ins[0]->dims().size();
    // use at least 4D in cudnnTransformTensor
    int max_dim = rank < 4 ? 4 : rank;
    std::vector<int> stride_x(max_dim, 0);
    std::vector<int> stride_y(max_dim, 0);
    std::vector<int> dims_y(max_dim, 0);

    // NOTE(review): the descriptors are destroyed only at the end of this
    // function; if a CUDNN_ENFORCE in between fails by throwing, they look
    // like they would leak -- confirm.
    cudnnTensorDescriptor_t in_desc;
    cudnnTensorDescriptor_t out_desc;
    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&in_desc));
    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&out_desc));
    cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;

    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto handle = dev_ctx.cudnn_handle();

    // Write cursor into the output; advanced after every input.
    T* odata = out->data<T>();
    for (size_t k = 0; k < ins.size(); ++k) {
      auto perm_shape = GetPermuteShape(trans_axis, ins[k]->dims());
      int osize = 1;
      auto idims = ins[k]->dims();
      for (int i = 0; i < rank; i++) {
        // Stride of source axis trans_axis[i]: the product of all source
        // dimensions after that axis.
        stride_x[i] = 1;
        for (int j = trans_axis[i] + 1; j < rank; j++) {
          stride_x[i] *= idims[j];
        }
        dims_y[i] = perm_shape[i];
        // Total element count of this input.
        osize *= perm_shape[i];
      }
      // Output strides follow the permuted shape, except at the flatten
      // boundary when concatenating along axis 1, where the row stride is
      // the full output row length odims[1].
      stride_y[rank - 1] = 1;
      for (int i = rank - 2; i >= 0; i--) {
        if (((i + 1) == flatten_axis) && (concat_axis == 1)) {
          stride_y[i] = odims[1];
        } else {
          stride_y[i] = stride_y[i + 1] * perm_shape[i + 1];
        }
      }

      // Since concat is after flatten, the output is 2D tensor.
      // If concat_axis is 0, each input's permutated tensor is continuous.
      // If concat_axis is 1, the stride of 0-th dim of each input's
      // permutated tensor is odims()[1].

      // Pad the descriptors with size-1 dimensions up to max_dim.
      for (int i = rank; i < max_dim; i++) {
        stride_x[i] = 1;
        stride_y[i] = 1;
        dims_y[i] = 1;
      }

      // Both descriptors share the permuted dims and differ only in strides.
      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
          in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
          out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));

      CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor(
          handle, CudnnDataType<T>::kOne(), in_desc,
          static_cast<const void*>(ins[k]->data<T>()),
          CudnnDataType<T>::kZero(), out_desc, static_cast<void*>(odata)));
      if (concat_axis == 0) {
        // Inputs are stacked one after another: skip the whole input.
        odata += osize;
      } else {
        // Inputs sit side by side within a row: skip this input's row width.
        auto flat_shape = GetFlattenShape(flatten_axis, perm_shape);
        odata += flat_shape[1];
      }
    }
    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(in_desc));
    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(out_desc));
  }
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
fusion_transpose_flatten_concat
,
ops
::
TransposeFlattenConcatFusionKernel
<
float
>
,
ops
::
TransposeFlattenConcatFusionKernel
<
double
>
);
paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h
0 → 100644
浏览文件 @
557229bd
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
namespace
paddle
{
namespace
operators
{
// Returns the shape obtained by permuting `in_dims` with `axis`:
// element i of the result is in_dims[axis[i]]. The result has
// in_dims.size() elements.
inline std::vector<int32_t> GetPermuteShape(const std::vector<int>& axis,
                                            const framework::DDim& in_dims) {
  std::vector<int32_t> permuted(in_dims.size());
  size_t dst = 0;
  for (const int src_axis : axis) {
    permuted[dst] = in_dims[src_axis];
    ++dst;
  }
  return permuted;
}
// Collapses `in_dims` into a 2-D shape {outer, inner}: `outer` is the product
// of the dimensions before `axis`, `inner` the product of the dimensions from
// `axis` onwards. An empty product is 1.
inline std::vector<int32_t> GetFlattenShape(const int axis,
                                            const std::vector<int>& in_dims) {
  // products[0] accumulates the leading part, products[1] the trailing part.
  int64_t products[2] = {1, 1};
  const int rank = static_cast<int>(in_dims.size());
  for (int d = 0; d < rank; ++d) {
    products[d < axis ? 0 : 1] *= in_dims[d];
  }
  return std::vector<int32_t>{static_cast<int32_t>(products[0]),
                              static_cast<int32_t>(products[1])};
}
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/lookup_sparse_table_op.cc
浏览文件 @
557229bd
...
...
@@ -67,6 +67,7 @@ class LookupSparseTableOp : public framework::OperatorBase {
framework
::
proto
::
VarType
::
FP32
,
"The sparse table only support FP32"
);
w_t
->
Get
(
ids_t
,
out_t
,
true
,
is_test
);
out_t
->
set_lod
(
ids_t
.
lod
());
}
};
...
...
paddle/fluid/operators/math/fc_compute.h
浏览文件 @
557229bd
...
...
@@ -17,8 +17,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
operators
{
namespace
math
{
...
...
@@ -43,7 +41,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
.
template
Get
<
jitkernel
::
VAddKernel
<
T
>
>
(
N
);
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
if (FLAGS_paddle_num_threads > 1)
#pragma omp parallel for
#endif
for
(
int
i
=
0
;
i
<
M
;
i
++
)
{
T
*
dst
=
Y
+
i
*
N
;
...
...
paddle/fluid/operators/sum_op.h
浏览文件 @
557229bd
...
...
@@ -127,6 +127,9 @@ class SumKernel : public framework::OpKernel<T> {
math
::
scatter
::
MergeAdd
<
DeviceContext
,
T
>
merge_add
;
merge_add
(
context
.
template
device_context
<
DeviceContext
>(),
inputs
,
out
);
out
->
SyncIndex
();
}
else
{
// no data, just set an empty out tensor.
out
->
mutable_value
()
->
mutable_data
<
T
>
(
framework
::
make_ddim
({
0
}),
...
...
paddle/fluid/operators/tensor_array_to_tensor_op.cc
浏览文件 @
557229bd
...
...
@@ -106,9 +106,9 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
out_inx_dim
[
0
]
=
inx
.
size
();
out_inx
.
Resize
(
out_inx_dim
);
auto
&
local_scope
=
scope
.
NewScope
();
std
::
string
var_name
=
"out_index"
;
framework
::
Variable
*
tmp_index_var
=
const_cast
<
framework
::
Scope
&>
(
scope
).
Var
(
var_name
);
framework
::
Variable
*
tmp_index_var
=
local_scope
.
Var
(
var_name
);
auto
&
tmp_index_tensor
=
*
(
tmp_index_var
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
());
tmp_index_tensor
.
Resize
(
out_inx_dim
);
...
...
@@ -128,12 +128,12 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase {
out_dims
[
axis
]
=
out_dim_sum
;
out
.
Resize
(
out_dims
);
LodTensorArray2LodTensorVector
(
scope
,
base_name
,
Input
(
"X"
),
&
names
);
// Invoke
Reshape
Op
LodTensorArray2LodTensorVector
(
local_
scope
,
base_name
,
Input
(
"X"
),
&
names
);
// Invoke
concat
Op
auto
concat_op
=
framework
::
OpRegistry
::
CreateOp
(
"concat"
,
{{
"X"
,
names
}},
{{
"Out"
,
{
Output
(
"Out"
)}}},
attrs
);
concat_op
->
Run
(
scope
,
place
);
concat_op
->
Run
(
local_
scope
,
place
);
}
};
...
...
paddle/fluid/platform/cpu_helper.cc
浏览文件 @
557229bd
...
...
@@ -41,7 +41,7 @@ void SetNumThreads(int num_threads) {
#elif defined(PADDLE_WITH_MKLML)
int
real_num_threads
=
num_threads
>
1
?
num_threads
:
1
;
platform
::
dynload
::
MKL_Set_Num_Threads
(
real_num_threads
);
omp_set_num_threads
(
num_threads
);
omp_set_num_threads
(
real_
num_threads
);
#else
PADDLE_ENFORCE
(
false
,
"To be implemented."
);
#endif
...
...
paddle/fluid/platform/dynload/cublas.cc
浏览文件 @
557229bd
...
...
@@ -32,6 +32,9 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
CUBLAS_BLAS_ROUTINE_EACH_R3
(
DEFINE_WRAP
);
#endif
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4
CUBLAS_BLAS_ROUTINE_EACH_R4
(
DEFINE_WRAP
);
#endif
}
// namespace dynload
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/dynload/cublas.h
浏览文件 @
557229bd
...
...
@@ -90,23 +90,33 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 8.0
#if CUDA_VERSION >= 8000
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
(
cublasGemmEx
);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
(
cublasSgemmStridedBatched
);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
(
cublasDgemmStridedBatched
);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
(
cublasCgemmStridedBatched
);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
(
cublasZgemmStridedBatched
);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
(
cublasHgemmStridedBatched
);
#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
__macro(cublasGemmEx); \
__macro(cublasSgemmStridedBatched); \
__macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched);
CUBLAS_BLAS_ROUTINE_EACH_R2
(
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
)
#endif
// APIs available after CUDA 9.0
#if CUDA_VERSION >= 9000
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
(
cublasSetMathMode
);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
(
cublasGetMathMode
);
#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \
__macro(cublasSetMathMode); \
__macro(cublasGetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R3
(
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
)
#endif
// APIs available after CUDA 9.1
#if CUDA_VERSION >= 9010
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
(
cublasGemmBatchedEx
);
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
(
cublasGemmStridedBatchedEx
);
#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
__macro(cublasGemmBatchedEx); \
__macro(cublasGemmStridedBatchedEx);
CUBLAS_BLAS_ROUTINE_EACH_R4
(
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
...
...
paddle/fluid/pybind/CMakeLists.txt
浏览文件 @
557229bd
...
...
@@ -5,8 +5,8 @@ if(WITH_PYTHON)
if
(
WITH_AMD_GPU
)
hip_library
(
paddle_pybind SHARED
SRCS
${
PYBIND_SRCS
}
DEPS
${
PYBIND_DEPS
}
${
GLOB_OP_LIB
}
${
GLOB_OPERATOR_DEPS
}
)
DEPS
ARCHIVE_START
${
PYBIND_DEPS
}
${
GLOB_OP_LIB
}
${
GLOB_OPERATOR_DEPS
}
ARCHIVE_END
)
else
()
cc_library
(
paddle_pybind SHARED
SRCS
${
PYBIND_SRCS
}
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
557229bd
...
...
@@ -94,6 +94,30 @@ function cmake_gen() {
else
exit
1
fi
elif
[
"
$1
"
==
"cp36-cp36m"
]
;
then
if
[
-d
"/Library/Frameworks/Python.framework/Versions/3.6"
]
;
then
export
LD_LIBRARY_PATH
=
/Library/Frameworks/Python.framework/Versions/3.6/lib/
export
DYLD_LIBRARY_PATH
=
/Library/Frameworks/Python.framework/Versions/3.6/lib/
export
PATH
=
/Library/Frameworks/Python.framework/Versions/3.6/bin/:
${
PATH
}
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
WITH_FLUID_ONLY
=
${
WITH_FLUID_ONLY
:-
ON
}
else
exit
1
fi
elif
[
"
$1
"
==
"cp37-cp37m"
]
;
then
if
[
-d
"/Library/Frameworks/Python.framework/Versions/3.7"
]
;
then
export
LD_LIBRARY_PATH
=
/Library/Frameworks/Python.framework/Versions/3.7/lib/
export
DYLD_LIBRARY_PATH
=
/Library/Frameworks/Python.framework/Versions/3.7/lib/
export
PATH
=
/Library/Frameworks/Python.framework/Versions/3.7/bin/:
${
PATH
}
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib"
WITH_FLUID_ONLY
=
${
WITH_FLUID_ONLY
:-
ON
}
else
exit
1
fi
fi
else
if
[
"
$1
"
!=
""
]
;
then
...
...
@@ -116,6 +140,18 @@ function cmake_gen() {
export
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
elif
[
"
$1
"
==
"cp36-cp36m"
]
;
then
export
LD_LIBRARY_PATH
=
/opt/_internal/cpython-3.6.0/lib/:
${
LD_LIBRARY_PATH
}
export
PATH
=
/opt/_internal/cpython-3.6.0/bin/:
${
PATH
}
export
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so"
elif
[
"
$1
"
==
"cp37-cp37m"
]
;
then
export
LD_LIBRARY_PATH
=
/opt/_internal/cpython-3.7.0/lib/:
${
LD_LIBRARY_PATH
}
export
PATH
=
/opt/_internal/cpython-3.7.0/bin/:
${
PATH
}
export
PYTHON_FLAGS
=
"-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
fi
fi
fi
...
...
@@ -419,7 +455,7 @@ function assert_api_not_changed() {
source
.env/bin/activate
pip
install
${
PADDLE_ROOT
}
/build/python/dist/
*
whl
python
${
PADDLE_ROOT
}
/tools/print_signatures.py paddle.fluid
>
new.spec
if
[
"
$1
"
==
"cp35-cp35m"
]
;
then
if
[
"
$1
"
==
"cp35-cp35m"
]
||
[
"
$1
"
==
"cp36-cp36m"
]
||
[
"
$1
"
==
"cp37-cp37m"
]
;
then
# Use sed to keep the python2 and python3 specs the same
sed
-i
's/arg0: str/arg0: unicode/g'
new.spec
sed
-i
"s/
\(
.*Transpiler.*
\)
.__init__ ArgSpec(args=
\[
'self'].*/
\1
.__init__ /g"
new.spec
...
...
paddle/testing/paddle_gtest_main.cc
浏览文件 @
557229bd
...
...
@@ -28,9 +28,14 @@ int main(int argc, char** argv) {
for
(
int
i
=
0
;
i
<
argc
;
++
i
)
{
new_argv
.
push_back
(
argv
[
i
]);
}
#if
def PADDLE_WITH_CUDA
#if
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
new_argv
.
push_back
(
strdup
(
"--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy"
));
#elif __clang__
new_argv
.
push_back
(
strdup
(
"--tryfromenv=use_mkldnn,initial_cpu_memory_in_"
"mb,allocator_strategy"
));
new_argv
.
push_back
(
strdup
(
"--undefok=use_mkldnn,initial_cpu_memory_in_mb"
));
#else
new_argv
.
push_back
(
strdup
(
"--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_"
...
...
python/paddle/fluid/__init__.py
浏览文件 @
557229bd
...
...
@@ -91,6 +91,7 @@ def __bootstrap__():
"""
import
sys
import
os
import
platform
from
.
import
core
in_test
=
'unittest'
in
sys
.
modules
...
...
@@ -110,14 +111,17 @@ def __bootstrap__():
print
(
'PLEASE USE OMP_NUM_THREADS WISELY.'
,
file
=
sys
.
stderr
)
os
.
environ
[
'OMP_NUM_THREADS'
]
=
str
(
num_threads
)
sysstr
=
platform
.
system
()
read_env_flags
=
[
'
use_pinned_memory'
,
'check_nan_inf'
,
'benchmark'
,
'eager_delete_scope
'
,
'use_
mkldnn'
,
'use_ngraph'
,
'initial_cpu_memory_in_mb
'
,
'
init_allocated_mem'
,
'free_idle_memory'
,
'paddle_num_threads'
,
"dist_threadpool_size"
,
'eager_delete_tensor_gb'
,
'allocator_strategy'
,
'
check_nan_inf'
,
'benchmark'
,
'eager_delete_scope'
,
'use_mkldnn
'
,
'use_
ngraph'
,
'initial_cpu_memory_in_mb'
,
'init_allocated_mem
'
,
'
free_idle_memory'
,
'paddle_num_threads'
,
"dist_threadpool_size"
,
'eager_delete_tensor_gb'
,
'allocator_strategy'
,
'reader_queue_speed_test_mode'
,
'print_sub_graph_dir'
]
if
'Darwin'
not
in
sysstr
:
read_env_flags
.
append
(
'use_pinned_memory'
)
if
os
.
name
!=
'nt'
:
read_env_flags
.
append
(
'warpctc_dir'
)
read_env_flags
.
append
(
'cpu_deterministic'
)
...
...
@@ -129,6 +133,7 @@ def __bootstrap__():
read_env_flags
.
append
(
'rpc_send_thread_num'
)
read_env_flags
.
append
(
'rpc_get_thread_num'
)
read_env_flags
.
append
(
'rpc_prefetch_thread_num'
)
read_env_flags
.
append
(
'rpc_disable_reuse_port'
)
if
core
.
is_compiled_with_cuda
():
read_env_flags
+=
[
...
...
python/paddle/fluid/contrib/utils/__init__.py
浏览文件 @
557229bd
...
...
@@ -13,8 +13,10 @@
# limitations under the License.
from
__future__
import
print_function
from
.
import
lookup_table_utils
from
.lookup_table_utils
import
*
from
.
import
hdfs_utils
from
.hdfs_utils
import
*
# Export the public names of both utility modules. Two consecutive
# assignments to __all__ would let the second one discard the names
# exported by lookup_table_utils, so the lists are concatenated instead.
__all__ = list(lookup_table_utils.__all__) + list(hdfs_utils.__all__)
python/paddle/fluid/contrib/utils/lookup_table_utils.py
0 → 100644
浏览文件 @
557229bd
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
os
import
time
import
logging
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
from
paddle.fluid
import
io
from
paddle.fluid
import
Program
__all__
=
[
"load_inference_model"
,
"load_persistable_vars"
,
"convert_dist_to_sparse_program"
]
logging
.
basicConfig
(
format
=
'%(asctime)s - %(levelname)s - %(message)s'
)
_logger
=
logging
.
getLogger
(
"lookup_table_utils"
)
_logger
.
setLevel
(
logging
.
INFO
)
model_filename
=
"__model__"
lookup_table_dir
=
"__lookup_table__"
def __insert_lookup_sparse_table_op(main_program, idx, ids, w, out):
    """Insert a ``lookup_sparse_table`` op into the program's global block.

    Args:
        main_program: Program whose global block receives the new op.
        idx: Position in the global block's op list at which to insert.
        ids: Variable wired to the op's ``Ids`` input.
        w: Variable wired to the op's ``W`` input.
        out: Variable wired to the op's ``Out`` output.
    """
    op_inputs = {"Ids": [ids], "W": [w]}
    op_outputs = {"Out": [out]}
    op_attrs = {
        "is_distributed": False,
        "is_sparse": True,
        "grad_inplace": False,
    }
    block = main_program.global_block()
    block._insert_op(
        index=idx,
        type="lookup_sparse_table",
        inputs=op_inputs,
        outputs=op_outputs,
        attrs=op_attrs)
def __get_prefetch_op_tuples(main_program):
    """Find the first ``split_ids -> prefetch -> merge_ids`` op triple.

    The ops of the program's global block are scanned in order for a
    ``prefetch`` op that is immediately preceded by ``split_ids`` and
    immediately followed by ``merge_ids``.

    Args:
        main_program: Program whose global block is searched.

    Returns:
        None when no such triple exists, otherwise the tuple
        ``(split_ids_op_id, split_ids_inputs, merge_ids_outputs,
        need_delete_vars)`` where ``split_ids_op_id`` is the index of the
        ``split_ids`` op, ``split_ids_inputs`` is its ``Ids`` input,
        ``merge_ids_outputs`` is the ``Out`` output of ``merge_ids`` and
        ``need_delete_vars`` lists the ``X`` inputs followed by the ``Out``
        outputs of the ``prefetch`` op.
    """
    # current lookup tables op is split_ids->prefetch->merge_ids
    ops = main_program.global_block().ops
    op_types = [op.type for op in ops]

    # A prefetch op can only be the middle of the triple if it has both a
    # predecessor and a successor, so only interior positions are examined.
    # Looking at position 0 would make ``i - 1`` wrap around to the last op,
    # and looking at the last position would make ``i + 1`` raise IndexError.
    for i in range(1, len(op_types) - 1):
        if op_types[i] != "prefetch":
            continue
        if op_types[i - 1] != "split_ids" or op_types[i + 1] != "merge_ids":
            continue

        split_ids_op_id = i - 1
        split_ids_inputs = ops[i - 1].input("Ids")
        prefetch_op_inputs = ops[i].input("X")
        prefetch_op_outputs = ops[i].output("Out")
        merge_ids_outputs = ops[i + 1].output("Out")

        need_delete_vars = []
        need_delete_vars.extend(prefetch_op_inputs)
        need_delete_vars.extend(prefetch_op_outputs)

        # Only the first matching triple is reported.
        return (split_ids_op_id, split_ids_inputs, merge_ids_outputs,
                need_delete_vars)
    return None
def convert_dist_to_sparse_program(main_program):
    """Rewrite a distributed-lookup-table program to use a local sparse table.

    The variable named by ``main_program._distributed_lookup_table`` is
    renamed to ``<name>.origin`` and a new persistable SELECTED_ROWS variable
    is created under the original name. The ``split_ids``/``prefetch``/
    ``merge_ids`` ops are then removed and one ``lookup_sparse_table`` op is
    inserted in their place for each (ids input, merged output) pair.

    Args:
        main_program: Program to convert; it is modified in place.

    Returns:
        The converted ``main_program``, or None (after logging a warning)
        when the program has no distributed lookup table.

    Raises:
        ValueError: If no ``split_ids -> prefetch -> merge_ids`` op sequence
            is found in the program.
    """
    if not main_program._distributed_lookup_table:
        # Logger.warn is a deprecated alias; warning() is the supported name.
        _logger.warning(
            "There are no distributed lookup tables need to be converted")
        return

    # create table param and grad var in pserver program
    origin_emb_var = "{}.origin".format(main_program._distributed_lookup_table)
    emb_var = main_program._distributed_lookup_table
    main_program.global_block()._rename_var(emb_var, origin_emb_var)
    origin_param_var = main_program.global_block().vars[origin_emb_var]

    param_var = main_program.global_block().create_var(
        name=emb_var,
        shape=origin_param_var.shape,
        dtype=origin_param_var.dtype,
        type=core.VarDesc.VarType.SELECTED_ROWS,
        persistable=True)
    # parameter must be selected rows
    param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
    main_program._sync_with_cpp()

    prefetch_op_tuples = __get_prefetch_op_tuples(main_program)
    if prefetch_op_tuples is None:
        # Fail with a readable message instead of the opaque
        # "'NoneType' object is not subscriptable" from indexing None below.
        raise ValueError(
            "No split_ids->prefetch->merge_ids op sequence found in the "
            "program, the distributed lookup table cannot be converted")

    split_ids_id = prefetch_op_tuples[0]

    # Remove merge_ids, prefetch and split_ids, highest index first so the
    # indices of the ops still to be removed are not shifted.
    for idx in range(split_ids_id + 2, split_ids_id - 1, -1):
        main_program.global_block()._remove_op(idx)
    main_program.desc.flush()

    in_out_pairs = zip(prefetch_op_tuples[1], prefetch_op_tuples[2])

    for in_out_pair in in_out_pairs:
        idx = split_ids_id
        ids = main_program.global_block().vars[in_out_pair[0]]
        out = main_program.global_block().vars[in_out_pair[1]]
        __insert_lookup_sparse_table_op(main_program, idx, ids, param_var, out)
        main_program.desc.flush()
    return main_program
def load_persistable_vars(executor, dirname, program, lookup_table_var):
    """Load the persistable variables of ``program``, including its lookup table.

    Ordinary persistable variables are loaded with ``io.load_vars``; the
    variable named ``lookup_table_var`` is excluded from that pass and is
    instead rebuilt from the files stored in the lookup-table sub-directory
    of ``dirname``.

    Args:
        executor: Executor used to run the load programs.
        dirname: Directory the variables were saved to.
        program: Program whose variables are loaded.
        lookup_table_var: Name of the lookup table variable.

    Raises:
        ValueError: If ``dirname`` is not an existing directory.
    """

    def _is_checkpoint_var(exclude_fluid_vars=None):
        """Build the predicate deciding which variables ``io.load_vars`` loads.

        The returned predicate rejects variables of type
        FEED_MINIBATCH/FETCH_LIST/RAW, variables whose name contains
        "@GRAD", ".trainer_", ".block" or "tmp_", and variables whose name is
        listed in ``exclude_fluid_vars``. Every other variable is accepted
        only if it is persistable.

        Args:
            exclude_fluid_vars: Variable names to skip; None means no names.
        """

        if exclude_fluid_vars is None:
            exclude_fluid_vars = []

        def is_valid(var):
            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
                    var.desc.type() == core.VarDesc.VarType.RAW:
                return False
            # @GRAD are named for gradient variables, checkpoint will not save it.
            if "@GRAD" in var.name:
                return False
            # .trainer_ are named for distribute train variables, checkpoint will not save it.
            if ".trainer_" in var.name:
                return False
            # .block is named for distribute train variables, checkpoint will not save it.
            if ".block" in var.name:
                return False
            if "tmp_" in var.name:
                return False
            if var.name in exclude_fluid_vars:
                return False
            return var.persistable

        return is_valid

    def _load_lookup_table_vars(executor, dirname, main_program,
                                lookup_table_vars):
        """Rebuild the lookup table variable from its saved shard files.

        One SELECTED_ROWS variable is loaded per matching file, the loaded
        variables are combined with a ``sum`` op into the lookup table
        variable, and the temporaries are then deleted.
        """
        if not os.path.isdir(dirname):
            # Interpolate the name into the message; passing it as a second
            # argument would leave the "%s" placeholder unformatted.
            raise ValueError("There is no directory named '%s'" % dirname)

        lookup_table_dirname = os.path.join(dirname, lookup_table_dir)

        emb_var_name = lookup_table_vars[0]
        emb_var = main_program.global_block().var(emb_var_name)

        # Every file whose name contains the table name counts as a shard.
        emb_files = []
        for emb_name in os.listdir(lookup_table_dirname):
            if emb_var_name in emb_name:
                emb_files.append(emb_name)

        convert_program = Program()
        global_block = convert_program.global_block()

        emb_var = global_block.create_var(
            name=emb_var.name,
            shape=emb_var.shape,
            dtype=emb_var.dtype,
            type=core.VarDesc.VarType.SELECTED_ROWS,
            persistable=True)
        emb_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)

        sums = []

        # NOTE(review): the path that is loaded is derived from the shard
        # index, not from emb_file itself; this presumably relies on the
        # shards being saved as "<table name>_<i>" -- confirm against the
        # code that writes them.
        for i, emb_file in enumerate(emb_files):
            var_name = "{}_{}".format(emb_var.name, i)
            param_var = global_block.create_var(
                name=var_name,
                shape=emb_var.shape,
                dtype=emb_var.dtype,
                type=core.VarDesc.VarType.SELECTED_ROWS,
                persistable=True)
            param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
            global_block.append_op(
                type='load',
                inputs={},
                outputs={'Out': [param_var]},
                attrs={
                    'file_path': os.path.join(lookup_table_dirname, var_name)
                })
            sums.append(param_var)
        global_block.append_op(
            type='sum', inputs={"X": sums}, outputs={'Out': emb_var}, attrs={})
        global_block.append_op(type='delete_var', inputs={'X': sums})
        executor.run(convert_program)

    _logger.info("Start Load Sparse Program With "
                 "Distributed Lookup Table Vars from {}, time = {}".format(
                     dirname, time.ctime()))

    lookup_table_vars = [lookup_table_var]

    # The lookup table is excluded here and loaded separately below.
    io.load_vars(
        executor,
        dirname=dirname,
        main_program=program,
        predicate=_is_checkpoint_var(lookup_table_vars),
        filename=None)

    _load_lookup_table_vars(executor, dirname, program, lookup_table_vars)

    _logger.info("Finish Load Sparse Program With "
                 "Distributed Lookup Table Vars from {}, time = {}".format(
                     dirname, time.ctime()))
def load_inference_model(dirname, executor, lookup_table_var_name):
    """Load an inference program and its variables from ``dirname``.

    The serialized program is read from the model file inside ``dirname``,
    its version is checked, and its persistable variables (including the
    lookup table named ``lookup_table_var_name``) are loaded.

    Args:
        dirname: Directory containing the saved model.
        executor: Executor used to run the load programs.
        lookup_table_var_name: Name of the lookup table variable.

    Returns:
        list: ``[program, feed_target_names, fetch_targets]`` where
        ``fetch_targets`` are the variables of the program's global block
        looked up by the fetch target names.

    Raises:
        ValueError: If ``dirname`` is not an existing directory or the
            program version is not supported.
    """
    if not os.path.isdir(dirname):
        # Interpolate the name into the message; passing it as a second
        # argument would leave the "%s" placeholder unformatted.
        raise ValueError("There is no directory named '%s'" % dirname)

    local_model = os.path.join(dirname, model_filename)

    with open(local_model, "rb") as f:
        program_desc_str = f.read()

    program = Program.parse_from_string(program_desc_str)

    if not core._is_program_version_supported(program._version()):
        raise ValueError("Unsupported program version: %d\n" %
                         program._version())

    # Binary data also need version.
    load_persistable_vars(executor, dirname, program, lookup_table_var_name)

    feed_target_names = program.desc.get_feed_target_names()
    fetch_target_names = program.desc.get_fetch_target_names()
    fetch_targets = [
        program.global_block().var(name) for name in fetch_target_names
    ]

    return [program, feed_target_names, fetch_targets]
python/paddle/fluid/framework.py
浏览文件 @
557229bd
...
...
@@ -1698,6 +1698,7 @@ class Program(object):
p
.
_copy_param_info_from
(
self
)
p
.
_copy_data_info_from
(
self
)
p
.
_copy_dist_param_info_from
(
self
)
return
p
def
_prune
(
self
,
targets
):
...
...
@@ -1938,6 +1939,25 @@ class Program(object):
"program, with represent the same topology"
)
self
.
global_block
().
_copy_param_info_from
(
other
.
global_block
())
def _copy_dist_param_info_from(self, other):
    """
    Copy the distributed-training attributes of another program onto this one.

    The attributes copied are ``_is_distributed``, ``_is_chief``,
    ``_slice_vars_and_attrs``, ``_endpoints`` and
    ``_distributed_lookup_table``. They are assigned by reference, not
    deep-copied.

    Args:
        other(Program): Program to copy the attributes from.

    Returns:
        None

    Raises:
        TypeError: If ``other`` is not a Program.
    """
    if not isinstance(other, Program):
        raise TypeError("_copy_dist_param_info_from should be invoked with "
                        "Program")
    dist_attr_names = ("_is_distributed", "_is_chief",
                       "_slice_vars_and_attrs", "_endpoints",
                       "_distributed_lookup_table")
    for attr_name in dist_attr_names:
        setattr(self, attr_name, getattr(other, attr_name))
def
_copy_data_info_from
(
self
,
other
):
"""
Copy the information of data variables from other program.
...
...
python/paddle/fluid/io.py
浏览文件 @
557229bd
...
...
@@ -165,6 +165,7 @@ def save_vars(executor,
save_vars
(
executor
,
main_program
=
main_program
,
dirname
=
dirname
,
vars
=
list
(
filter
(
predicate
,
main_program
.
list_vars
())),
filename
=
filename
)
...
...
@@ -172,11 +173,18 @@ def save_vars(executor,
save_program
=
Program
()
save_block
=
save_program
.
global_block
()
if
main_program
is
None
:
main_program
=
default_main_program
()
if
not
isinstance
(
main_program
,
Program
):
raise
TypeError
(
"program should be as Program type or None"
)
save_var_map
=
{}
for
each_var
in
vars
:
# NOTE: don't save the variable which type is RAW
if
each_var
.
type
==
core
.
VarDesc
.
VarType
.
RAW
:
continue
if
each_var
.
name
==
main_program
.
_distributed_lookup_table
:
continue
new_var
=
_clone_var_in_block_
(
save_block
,
each_var
)
if
filename
is
None
:
save_block
.
append_op
(
...
...
@@ -198,6 +206,16 @@ def save_vars(executor,
outputs
=
{},
attrs
=
{
'file_path'
:
os
.
path
.
join
(
dirname
,
filename
)})
# if there is lookup table, the trainer 0 will notify all pserver to save.
if
main_program
.
_is_distributed
and
main_program
.
_is_chief
and
main_program
.
_distributed_lookup_table
:
lookup_table_filename
=
os
.
path
.
join
(
dirname
,
"__lookup_table__"
)
attrs
=
{}
attrs
[
'epmap'
]
=
main_program
.
_endpoints
attrs
[
'dir'
]
=
lookup_table_filename
attrs
[
'lookup_table'
]
=
main_program
.
_distributed_lookup_table
save_block
.
append_op
(
type
=
'checkpoint_notify'
,
inputs
=
{},
outputs
=
{},
attrs
=
attrs
)
executor
.
run
(
save_program
)
...
...
@@ -379,11 +397,22 @@ def load_vars(executor,
load_prog
=
Program
()
load_block
=
load_prog
.
global_block
()
if
main_program
is
None
:
main_program
=
default_main_program
()
if
not
isinstance
(
main_program
,
Program
):
raise
TypeError
(
"program should be as Program type or None"
)
load_slice_vars
=
[]
for
each_var
in
main_program
.
_slice_vars_and_attrs
:
load_slice_vars
.
append
(
each_var
[
2
].
name
)
load_var_map
=
{}
for
each_var
in
vars
:
assert
isinstance
(
each_var
,
Variable
)
if
each_var
.
type
==
core
.
VarDesc
.
VarType
.
RAW
:
continue
if
each_var
.
name
in
load_slice_vars
:
continue
new_var
=
_clone_var_in_block_
(
load_block
,
each_var
)
if
filename
is
None
:
load_block
.
append_op
(
...
...
@@ -406,9 +435,6 @@ def load_vars(executor,
attrs
=
{
'file_path'
:
os
.
path
.
join
(
dirname
,
filename
)})
executor
.
run
(
load_prog
)
if
main_program
is
None
:
main_program
=
default_main_program
()
# load slice vars on pserver, if have it.
_load_slice_up_vars
(
executor
,
dirname
,
main_program
.
_slice_vars_and_attrs
)
...
...
@@ -618,13 +644,6 @@ def save_inference_model(dirname,
if
main_program
is
None
:
main_program
=
default_main_program
()
# if there is lookup table, the trainer 0 will notify all pserver to save.
if
main_program
.
_is_distributed
and
main_program
.
_is_chief
and
main_program
.
_distributed_lookup_table
:
lookup_table_filename
=
os
.
path
.
join
(
dirname
,
"__lookup_table__"
)
_save_lookup_tables_by_notify
(
executor
,
lookup_table_filename
,
main_program
.
_distributed_lookup_table
,
main_program
.
_endpoints
)
# when a pserver and a trainer running on the same machine, mkdir may conflict
try
:
os
.
makedirs
(
dirname
)
...
...
@@ -642,6 +661,9 @@ def save_inference_model(dirname,
# it can only be loaded for inference directly. If it's false, the whole
# original program and related meta are saved so that future usage can be
# more flexible.
origin_program
=
main_program
.
clone
()
if
export_for_deployment
:
main_program
=
main_program
.
clone
()
global_block
=
main_program
.
global_block
()
...
...
@@ -666,8 +688,11 @@ def save_inference_model(dirname,
with
open
(
model_basename
+
".main_program"
,
"wb"
)
as
f
:
f
.
write
(
main_program
.
desc
.
serialize_to_string
())
main_program
.
_copy_dist_param_info_from
(
origin_program
)
if
params_filename
is
not
None
:
params_filename
=
os
.
path
.
basename
(
params_filename
)
save_persistables
(
executor
,
dirname
,
main_program
,
params_filename
)
...
...
@@ -897,6 +922,9 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
slice_var
=
var_tuple
[
2
]
end
=
start
+
slice_var
.
shape
[
0
]
orig_var_name
=
orig_var
.
name
orig_var
.
name
=
"{}.origin"
.
format
(
orig_var_name
)
clone_orig_var
=
load_block
.
create_var
(
name
=
orig_var
.
name
,
type
=
orig_var
.
type
,
...
...
@@ -915,7 +943,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
type
=
'load'
,
inputs
=
{},
outputs
=
{
'Out'
:
[
clone_orig_var
]},
attrs
=
{
'file_path'
:
os
.
path
.
join
(
dirname
,
clone_orig_var
.
name
)})
attrs
=
{
'file_path'
:
os
.
path
.
join
(
dirname
,
orig_var_
name
)})
load_block
.
append_op
(
type
=
"slice"
,
inputs
=
{
'Input'
:
clone_orig_var
},
...
...
@@ -924,6 +952,7 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
'starts'
:
[
start
],
'ends'
:
[
end
]})
need_delete_vars
.
append
(
clone_orig_var
)
load_block
.
append_op
(
type
=
'delete_var'
,
inputs
=
{
'X'
:
need_delete_vars
},
)
...
...
python/paddle/fluid/layers/control_flow.py
浏览文件 @
557229bd
...
...
@@ -896,9 +896,10 @@ def array_to_lod_tensor(x, table):
def
increment
(
x
,
value
=
1.0
,
in_place
=
True
):
"""
This function performs an operation that increments
each
value in the
This function performs an operation that increments
the
value in the
input :math:`x` by an amount: :math:`value` as mentioned in the input
parameter. This operation is performed in-place by default.
parameter. This operation is performed in-place by default. Notice that
the number of elements in :math:`x` must be equal to 1.
Args:
x (Variable|list): The tensor that has the input values.
...
...
@@ -911,7 +912,8 @@ def increment(x, value=1.0, in_place=True):
Examples:
.. code-block:: python
data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32')
data = fluid.layers.data(name='data', shape=[1], dtype='float32',
append_batch_size=False)
data = fluid.layers.increment(x=data, value=3.0, in_place=True)
"""
helper
=
LayerHelper
(
"increment"
,
**
locals
())
...
...
python/paddle/fluid/layers/detection.py
浏览文件 @
557229bd
...
...
@@ -1029,6 +1029,7 @@ def density_prior_box(input,
clip
=
False
,
steps
=
[
0.0
,
0.0
],
offset
=
0.5
,
flatten_to_2d
=
False
,
name
=
None
):
"""
**Density Prior Box Operator**
...
...
@@ -1065,22 +1066,24 @@ def density_prior_box(input,
height/weight of the input will be automatically calculated.
Default: [0., 0.]
offset(float): Prior boxes center offset. Default: 0.5
flatten_to_2d(bool): Whether to flatten output prior boxes and variance
to 2D shape, the second dim is 4. Default: False.
name(str): Name of the density prior box op. Default: None.
Returns:
tuple: A tuple with two Variable (boxes, variances)
boxes: the output density prior boxes of PriorBox.
The layout is [H, W, num_priors, 4]
.
H is the height of input, W is the width of input,
num_priors is the total
box count of each position of input.
The layout is [H, W, num_priors, 4] when flatten_to_2d is False
.
The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
H is the height of input, W is the width of input,
num_priors is the total
box count of each position of input.
variances: the expanded variances of PriorBox.
The layout is [H, W, num_priors, 4]
.
H is the height of input, W is the width of input
num_priors is the total
box count of each position of input
The layout is [H, W, num_priors, 4] when flatten_to_2d is False
.
The layout is [H * W * num_priors, 4] when flatten_to_2d is True.
H is the height of input, W is the width of input
num_priors is the total box count of each position of input.
Examples:
...
...
@@ -1089,14 +1092,11 @@ def density_prior_box(input,
box, var = fluid.layers.density_prior_box(
input=conv1,
image=images,
min_sizes=[100.],
max_sizes=[200.],
aspect_ratios=[1.0, 1.0 / 2.0, 2.0],
densities=[3, 4],
fixed_sizes=[50., 60.],
fixed_ratios=[1.0, 3.0, 1.0 / 3.0],
flip=True,
clip=True)
densities=[4, 2, 1],
fixed_sizes=[32.0, 64.0, 128.0],
fixed_ratios=[1.],
clip=True,
flatten_to_2d=True)
"""
helper
=
LayerHelper
(
"density_prior_box"
,
**
locals
())
dtype
=
helper
.
input_dtype
()
...
...
@@ -1127,14 +1127,11 @@ def density_prior_box(input,
'step_w'
:
steps
[
0
],
'step_h'
:
steps
[
1
],
'offset'
:
offset
,
'densities'
:
densities
,
'fixed_sizes'
:
fixed_sizes
,
'fixed_ratios'
:
fixed_ratios
,
'flatten_to_2d'
:
flatten_to_2d
,
}
if
densities
is
not
None
and
len
(
densities
)
>
0
:
attrs
[
'densities'
]
=
densities
if
fixed_sizes
is
not
None
and
len
(
fixed_sizes
)
>
0
:
attrs
[
'fixed_sizes'
]
=
fixed_sizes
if
fixed_ratios
is
not
None
and
len
(
fixed_ratios
)
>
0
:
attrs
[
'fixed_ratios'
]
=
fixed_ratios
box
=
helper
.
create_variable_for_type_inference
(
dtype
)
var
=
helper
.
create_variable_for_type_inference
(
dtype
)
helper
.
append_op
(
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
557229bd
...
...
@@ -2134,11 +2134,16 @@ def pool2d(input,
input tensor is NCHW, where N is batch size, C is
the number of channels, H is the height of the
feature, and W is the width of the feature.
pool_size (int): The side length of pooling windows. All pooling
windows are squares with pool_size on a side.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be a square of an int.
pool_type: ${pooling_type_comment}
pool_stride (int): stride of the pooling layer.
pool_padding (int): padding size.
pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
it must contain two integers, (pool_stride_Height, pool_stride_Width).
Otherwise, the pool stride size will be a square of an int.
pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple,
it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
Otherwise, the pool padding size will be a square of an int.
global_pooling (bool): ${global_pooling_comment}
use_cudnn (bool): ${use_cudnn_comment}
ceil_mode (bool): ${ceil_mode_comment}
...
...
@@ -6967,18 +6972,18 @@ def prelu(x, mode, param_attr=None, name=None):
"""
Equation:
y = \max(0, x) + alpha \min(0, x)
y = \max(0, x) + alpha
*
\min(0, x)
Args:
x (Variable): The input tensor.
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight (alpha).
mode (string): The mode for weight sharing
all: all elements share same weight
channel:elements in a channel share same weight
element:each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight (alpha).
mode (string): The mode for weight sharing
. It supports all, channel
and element.
all: all elements share same weight
channel:elements in a channel share same weight
element:each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The output tensor with the same shape as input.
...
...
@@ -6987,7 +6992,7 @@ def prelu(x, mode, param_attr=None, name=None):
.. code-block:: python
x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
mode = 'channel'
output = fluid.layers.prelu(x,mode)
"""
...
...
python/paddle/fluid/nets.py
浏览文件 @
557229bd
...
...
@@ -250,7 +250,8 @@ def sequence_conv_pool(input,
filter_size
,
param_attr
=
None
,
act
=
"sigmoid"
,
pool_type
=
"max"
):
pool_type
=
"max"
,
bias_attr
=
None
):
"""
The sequence_conv_pool is composed with Sequence Convolution and Pooling.
...
...
@@ -266,6 +267,11 @@ def sequence_conv_pool(input,
pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for
average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling.
Default :math:`max`.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
If it is set to False, no bias will be added to the output units.
If it is set to None or one attribute of ParamAttr, sequence_conv
will create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
Return:
Variable: The final result after Sequence Convolution and Pooling.
...
...
@@ -289,6 +295,7 @@ def sequence_conv_pool(input,
num_filters
=
num_filters
,
filter_size
=
filter_size
,
param_attr
=
param_attr
,
bias_attr
=
bias_attr
,
act
=
act
)
pool_out
=
layers
.
sequence_pool
(
input
=
conv_out
,
pool_type
=
pool_type
)
...
...
python/paddle/fluid/tests/book/test_image_classification.py
浏览文件 @
557229bd
...
...
@@ -239,7 +239,7 @@ def infer(use_cuda, save_dirname=None):
assert
len
(
results
[
0
])
==
len
(
transpiler_results
[
0
])
for
i
in
range
(
len
(
results
[
0
])):
np
.
testing
.
assert_almost_equal
(
results
[
0
][
i
],
transpiler_results
[
0
][
i
],
decimal
=
5
)
results
[
0
][
i
],
transpiler_results
[
0
][
i
],
decimal
=
4
)
print
(
"infer results: "
,
results
[
0
])
...
...
python/paddle/fluid/tests/test_detection.py
浏览文件 @
557229bd
...
...
@@ -112,38 +112,42 @@ class TestDetection(unittest.TestCase):
class
TestPriorBox
(
unittest
.
TestCase
):
def
test_prior_box
(
self
):
data_shape
=
[
3
,
224
,
224
]
images
=
fluid
.
layers
.
data
(
name
=
'pixel'
,
shape
=
data_shape
,
dtype
=
'float32'
)
conv1
=
fluid
.
layers
.
conv2d
(
images
,
3
,
3
,
2
)
box
,
var
=
layers
.
prior_box
(
input
=
conv1
,
image
=
images
,
min_sizes
=
[
100.0
],
aspect_ratios
=
[
1.
],
flip
=
True
,
clip
=
True
)
assert
len
(
box
.
shape
)
==
4
assert
box
.
shape
==
var
.
shape
assert
box
.
shape
[
3
]
==
4
program
=
Program
()
with
program_guard
(
program
):
data_shape
=
[
3
,
224
,
224
]
images
=
fluid
.
layers
.
data
(
name
=
'pixel'
,
shape
=
data_shape
,
dtype
=
'float32'
)
conv1
=
fluid
.
layers
.
conv2d
(
images
,
3
,
3
,
2
)
box
,
var
=
layers
.
prior_box
(
input
=
conv1
,
image
=
images
,
min_sizes
=
[
100.0
],
aspect_ratios
=
[
1.
],
flip
=
True
,
clip
=
True
)
assert
len
(
box
.
shape
)
==
4
assert
box
.
shape
==
var
.
shape
assert
box
.
shape
[
3
]
==
4
class
TestDensityPriorBox
(
unittest
.
TestCase
):
def
test_density_prior_box
(
self
):
data_shape
=
[
3
,
224
,
224
]
images
=
fluid
.
layers
.
data
(
name
=
'pixel'
,
shape
=
data_shape
,
dtype
=
'float32'
)
conv1
=
fluid
.
layers
.
conv2d
(
images
,
3
,
3
,
2
)
box
,
var
=
layers
.
density_prior_box
(
input
=
conv1
,
image
=
images
,
densities
=
[
3
,
4
],
fixed_sizes
=
[
50.
,
60.
],
fixed_ratios
=
[
1.0
],
clip
=
True
)
assert
len
(
box
.
shape
)
==
4
assert
box
.
shape
==
var
.
shape
assert
box
.
shape
[
3
]
==
4
program
=
Program
()
with
program_guard
(
program
):
data_shape
=
[
3
,
224
,
224
]
images
=
fluid
.
layers
.
data
(
name
=
'pixel'
,
shape
=
data_shape
,
dtype
=
'float32'
)
conv1
=
fluid
.
layers
.
conv2d
(
images
,
3
,
3
,
2
)
box
,
var
=
layers
.
density_prior_box
(
input
=
conv1
,
image
=
images
,
densities
=
[
3
,
4
],
fixed_sizes
=
[
50.
,
60.
],
fixed_ratios
=
[
1.0
],
clip
=
True
)
assert
len
(
box
.
shape
)
==
4
assert
box
.
shape
==
var
.
shape
assert
box
.
shape
[
-
1
]
==
4
class
TestAnchorGenerator
(
unittest
.
TestCase
):
...
...
python/paddle/fluid/tests/unittests/test_density_prior_box_op.py
浏览文件 @
557229bd
...
...
@@ -36,7 +36,8 @@ class TestDensityPriorBoxOp(OpTest):
'offset'
:
self
.
offset
,
'densities'
:
self
.
densities
,
'fixed_sizes'
:
self
.
fixed_sizes
,
'fixed_ratios'
:
self
.
fixed_ratios
'fixed_ratios'
:
self
.
fixed_ratios
,
'flatten_to_2d'
:
self
.
flatten_to_2d
}
self
.
outputs
=
{
'Boxes'
:
self
.
out_boxes
,
'Variances'
:
self
.
out_var
}
...
...
@@ -48,16 +49,17 @@ class TestDensityPriorBoxOp(OpTest):
self
.
set_data
()
def
set_density
(
self
):
self
.
densities
=
[]
self
.
fixed_sizes
=
[]
self
.
fixed_ratios
=
[]
self
.
densities
=
[
4
,
2
,
1
]
self
.
fixed_sizes
=
[
32.0
,
64.0
,
128.0
]
self
.
fixed_ratios
=
[
1.0
]
self
.
layer_w
=
17
self
.
layer_h
=
17
self
.
image_w
=
533
self
.
image_h
=
533
self
.
flatten_to_2d
=
False
def
init_test_params
(
self
):
self
.
layer_w
=
32
self
.
layer_h
=
32
self
.
image_w
=
40
self
.
image_h
=
40
self
.
set_density
()
self
.
step_w
=
float
(
self
.
image_w
)
/
float
(
self
.
layer_w
)
self
.
step_h
=
float
(
self
.
image_h
)
/
float
(
self
.
layer_h
)
...
...
@@ -69,8 +71,6 @@ class TestDensityPriorBoxOp(OpTest):
self
.
variances
=
[
0.1
,
0.1
,
0.2
,
0.2
]
self
.
variances
=
np
.
array
(
self
.
variances
,
dtype
=
np
.
float
).
flatten
()
self
.
set_density
()
self
.
clip
=
True
self
.
num_priors
=
0
if
len
(
self
.
fixed_sizes
)
>
0
and
len
(
self
.
densities
)
>
0
:
...
...
@@ -129,6 +129,9 @@ class TestDensityPriorBoxOp(OpTest):
(
self
.
layer_h
,
self
.
layer_w
,
self
.
num_priors
,
1
))
self
.
out_boxes
=
out_boxes
.
astype
(
'float32'
)
self
.
out_var
=
out_var
.
astype
(
'float32'
)
if
self
.
flatten_to_2d
:
self
.
out_boxes
=
self
.
out_boxes
.
reshape
((
-
1
,
4
))
self
.
out_var
=
self
.
out_var
.
reshape
((
-
1
,
4
))
class
TestDensityPriorBox
(
TestDensityPriorBoxOp
):
...
...
@@ -136,6 +139,11 @@ class TestDensityPriorBox(TestDensityPriorBoxOp):
self
.
densities
=
[
3
,
4
]
self
.
fixed_sizes
=
[
1.0
,
2.0
]
self
.
fixed_ratios
=
[
1.0
]
self
.
layer_w
=
32
self
.
layer_h
=
32
self
.
image_w
=
40
self
.
image_h
=
40
self
.
flatten_to_2d
=
True
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py
0 → 100644
浏览文件 @
557229bd
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
import
paddle.fluid.core
as
core
class TestFusionTransposeFlattenConcationOp(OpTest):
    """Checks the ``fusion_transpose_flatten_concat`` operator.

    The expected result is built with NumPy: every input is transposed by
    ``trans_axis``, collapsed to two dimensions around ``flatten_axis`` and
    the resulting matrices are concatenated along ``concat_axis``.
    Subclasses override ``init_test_case`` to vary the configuration.
    """

    def setUp(self):
        """Create random float32 inputs and the NumPy reference output."""
        self.init_test_case()
        self.op_type = "fusion_transpose_flatten_concat"

        named_inputs = []
        flattened = []
        for idx, shape in enumerate(self.shapes):
            data = np.random.random(shape).astype("float32")
            named_inputs.append(("x%d" % idx, data))

            transposed = data.transpose(self.trans_axis)
            # Collapse the dims before flatten_axis into rows and the
            # remaining dims into columns.
            rows = np.prod(transposed.shape[:self.flatten_axis])
            cols = np.prod(transposed.shape[self.flatten_axis:])
            flattened.append(transposed.reshape((rows, cols)))
        expected = np.concatenate(flattened, axis=self.concat_axis)

        self.inputs = {'X': named_inputs}
        self.attrs = {
            'trans_axis': list(self.trans_axis),
            'flatten_axis': self.flatten_axis,
            'concat_axis': self.concat_axis
        }
        self.outputs = {'Out': expected}

    def test_check_output(self):
        """Compare the op output on CUDA device 0; do nothing without CUDA."""
        if not core.is_compiled_with_cuda():
            return
        # 1e-6 is passed as the second positional argument
        # (presumably the comparison tolerance -- confirm against OpTest).
        self.check_output_with_place(core.CUDAPlace(0), 1e-6)

    def init_test_case(self):
        """Default configuration: three 4-D inputs, flatten at 1, concat on 1."""
        self.shapes = [(3, 4, 17, 17), (3, 8, 7, 7), (3, 12, 5, 5)]
        self.trans_axis = (0, 2, 3, 1)
        self.flatten_axis, self.concat_axis = 1, 1
class TestCase1(TestFusionTransposeFlattenConcationOp):
    """Variant with flatten_axis=2 and concat_axis=1 on differently shaped inputs."""

    def init_test_case(self):
        """Set the input shapes and operator attributes for this variant."""
        self.shapes = [
            (3, 4, 18, 17),
            (3, 8, 18, 7),
            (6, 12, 9, 5),
        ]
        self.trans_axis = (0, 2, 3, 1)
        self.flatten_axis, self.concat_axis = 2, 1
class TestCase2(TestFusionTransposeFlattenConcationOp):
    """Variant with flatten_axis=2 that concatenates along axis 0."""

    def init_test_case(self):
        """Set the input shapes and operator attributes for this variant."""
        self.shapes = [
            (3, 8, 20, 17),
            (3, 8, 19, 17),
            (3, 8, 40, 17),
        ]
        self.trans_axis = (0, 2, 3, 1)
        self.flatten_axis, self.concat_axis = 2, 0
class TestCase3(TestFusionTransposeFlattenConcationOp):
    """Variant using the (0, 3, 2, 1) permutation with flatten_axis=1."""

    def init_test_case(self):
        """Set the input shapes and operator attributes for this variant."""
        self.shapes = [
            (3, 8, 20, 17),
            (3, 8, 19, 17),
            (3, 8, 40, 17),
        ]
        self.trans_axis = (0, 3, 2, 1)
        self.flatten_axis, self.concat_axis = 1, 1
class TestCase4(TestFusionTransposeFlattenConcationOp):
    """Variant using the (0, 2, 1, 3) permutation with flatten_axis=3."""

    def init_test_case(self):
        """Set the input shapes and operator attributes for this variant."""
        self.shapes = [
            (3, 8, 9, 17),
            (8, 3, 9, 17),
            (4, 6, 9, 17),
        ]
        self.trans_axis = (0, 2, 1, 3)
        self.flatten_axis, self.concat_axis = 3, 1
class TestCase5(TestFusionTransposeFlattenConcationOp):
    """Variant with 5-D inputs, permutation (0, 2, 1, 4, 3) and flatten_axis=1."""

    def init_test_case(self):
        """Set the input shapes and operator attributes for this variant."""
        self.shapes = [
            (3, 8, 9, 17, 2),
            (3, 8, 2, 17, 9),
            (3, 17, 9, 8, 2),
        ]
        self.trans_axis = (0, 2, 1, 4, 3)
        self.flatten_axis, self.concat_axis = 1, 1
# Run the test cases through unittest when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
557229bd
...
...
@@ -202,6 +202,17 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
layers
.
sequence_unpad
(
x
=
x
,
length
=
length
))
print
(
str
(
program
))
def test_pool2d(self):
    """Build a pool2d layer with list/tuple size, stride and padding arguments."""
    prog = Program()
    with program_guard(prog):
        image = layers.data(name='x', shape=[3, 224, 224], dtype='float32')
        pooled = layers.pool2d(
            image,
            pool_size=[5, 3],
            pool_stride=[1, 2],
            pool_padding=(2, 1))
        # The layer call must hand back an output variable.
        self.assertIsNotNone(pooled)
def
test_lstm_unit
(
self
):
program
=
Program
()
with
program_guard
(
program
):
...
...
python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
浏览文件 @
557229bd
...
...
@@ -145,10 +145,15 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
lod
.
append
(
nmsed_num
)
if
nmsed_num
==
0
:
continue
tmp_det_out
=
[]
for
c
,
indices
in
nmsed_outs
.
items
():
for
idx
in
indices
:
xmin
,
ymin
,
xmax
,
ymax
=
boxes
[
n
][
idx
][:]
det_outs
.
append
([
c
,
scores
[
n
][
c
][
idx
],
xmin
,
ymin
,
xmax
,
ymax
])
tmp_det_out
.
append
(
[
c
,
scores
[
n
][
c
][
idx
],
xmin
,
ymin
,
xmax
,
ymax
])
sorted_det_out
=
sorted
(
tmp_det_out
,
key
=
lambda
tup
:
tup
[
0
],
reverse
=
False
)
det_outs
.
extend
(
sorted_det_out
)
return
det_outs
,
lod
...
...
@@ -210,7 +215,7 @@ class TestMulticlassNMSOp(OpTest):
class
TestMulticlassNMSOpNoOutput
(
TestMulticlassNMSOp
):
def
set_argument
(
self
):
# Here set 2.0 to test the case there is no outputs.
# In practical use, 0.0 < score_threshold < 1.0
# In practical use, 0.0 < score_threshold < 1.0
self
.
score_threshold
=
2.0
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
557229bd
...
...
@@ -644,6 +644,9 @@ in a single call.")
else
:
recv_inputs
.
append
(
single_trainer_var
)
self
.
_slice_params_and_optimizes
=
self
.
_get_slice_vars_and_attrs
(
endpoint
)
# step 3
# Create a union-find data structure from optimize ops,
# If two ops are connected, we could add these two ops
...
...
@@ -766,7 +769,7 @@ in a single call.")
grad_to_block_id
,
merged_var
,
lr_ops
)
# dedup grad to ids list
# dedup grad to ids list
grad_to_block_id
=
list
(
set
(
grad_to_block_id
))
# append global ops
if
global_ops
:
...
...
@@ -827,8 +830,8 @@ in a single call.")
attrs
=
attrs
)
# add distributed attrs
pserver_program
.
_slice_vars_and_attrs
=
self
.
_get_slice_vars_and_attrs
(
endpoint
)
pserver_program
.
_slice_vars_and_attrs
=
list
(
self
.
_slice_params_and_optimizes
.
values
()
)
pserver_program
.
_sync_with_cpp
()
# save pserver program to generate pserver side startup relatively.
...
...
@@ -941,12 +944,12 @@ to transpile() call.")
outputs
=
{
"Out"
:
startup_tmpvar
})
# add slice vars
s_prog
.
_slice_vars_and_attrs
=
self
.
_get_slice_vars_and_attrs
(
endpoint
)
s_prog
.
_slice_vars_and_attrs
=
pserver_program
.
_slice_vars_and_attrs
return
s_prog
def
_get_slice_vars_and_attrs
(
self
,
endpoint
):
slice_vars_and_attrs
=
[]
slice_vars_and_attrs
=
{}
block_suffix
=
"block"
for
param
in
self
.
param_grad_ep_mapping
[
endpoint
][
"params"
]:
orig_var_name
,
block_name
,
_
=
self
.
_get_varname_parts
(
param
.
name
)
...
...
@@ -960,8 +963,7 @@ to transpile() call.")
slice_vars
=
self
.
param_var_mapping
[
orig_var_name
]
for
slice_var
in
slice_vars
[:
block_idx
]:
skip_dim0
+=
slice_var
.
shape
[
0
]
slice_vars_and_attrs
.
append
([
orig_var
,
skip_dim0
,
param
])
slice_vars_and_attrs
[
param
.
name
]
=
[
orig_var
,
skip_dim0
,
param
]
return
slice_vars_and_attrs
# ====================== private transpiler functions =====================
...
...
@@ -1662,10 +1664,10 @@ to transpile() call.")
if
key
in
[
"Param"
,
"Grad"
,
"LearningRate"
]:
continue
var
=
self
.
origin_program
.
global_block
().
vars
[
opt_op
.
input
(
key
)[
0
]]
param_var
=
new_inputs
[
"Param"
]
# update accumulator variable shape
param_shape
=
new_inputs
[
"Param"
].
shape
new_shape
=
self
.
_get_optimizer_input_shape
(
opt_op
.
type
,
key
,
var
.
shape
,
param_shape
)
new_shape
=
self
.
_get_optimizer_input_shape
(
opt_op
.
type
,
key
,
var
.
shape
,
param_var
.
shape
)
tmpvar
=
pserver_block
.
create_var
(
name
=
var
.
name
,
persistable
=
var
.
persistable
,
...
...
@@ -1673,6 +1675,13 @@ to transpile() call.")
shape
=
new_shape
)
new_inputs
[
key
]
=
tmpvar
# var shape been changed
if
new_shape
!=
var
.
shape
:
slice_var_args
=
self
.
_slice_params_and_optimizes
[
param_var
.
name
]
self
.
_slice_params_and_optimizes
[
var
.
name
]
=
[
var
,
slice_var_args
[
1
],
tmpvar
]
# change output's ParamOut variable
outputs
=
self
.
_get_output_map_from_op
(
self
.
origin_program
.
global_block
().
vars
,
opt_op
)
...
...
tools/manylinux1/Dockerfile.x64
浏览文件 @
557229bd
...
...
@@ -16,7 +16,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
COPY build_scripts /build_scripts
RUN bash build_scripts/build.sh && \
bash build_scripts/install_nccl2.sh && rm -r build_scripts
bash build_scripts/install_nccl2.sh && rm -r
f
build_scripts
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
...
...
tools/manylinux1/build_scripts/build_utils.sh
浏览文件 @
557229bd
...
...
@@ -50,6 +50,15 @@ function do_cpython_build {
mkdir
-p
${
prefix
}
/lib
# -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
if
[
$(
lex_pyver
$py_ver
)
-eq
$(
lex_pyver 3.6
)
]
;
then
wget https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz
tar
-zxf
sqlite-autoconf-3250300.tar.gz
cd
sqlite-autoconf-3250300
./configure
--prefix
=
/usr/local
make
-j8
&&
make
install
cd
../
&&
rm
sqlite-autoconf-3250300.tar.gz
fi
# NOTE --enable-shared for generating libpython shared library needed for
# linking of some of the nupic.core test executables.
if
[
$(
lex_pyver
$py_ver
)
-ge
$(
lex_pyver 3.7
)
]
;
then
...
...
@@ -59,9 +68,9 @@ function do_cpython_build {
make
-j8
>
/dev/null
make altinstall
>
/dev/null
else
CFLAGS
=
"-Wformat"
./configure
--prefix
=
${
prefix
}
--enable-shared
$unicode_flags
>
/dev/null
make
-j8
>
/dev/null
make
install
>
/dev/null
LD_LIBRARY_PATH
=
/usr/local/lib:
${
LD_LIBRARY_PATH
}
CFLAGS
=
"-Wformat"
./configure
--prefix
=
${
prefix
}
--enable-shared
$unicode_flags
>
/dev/null
LD_LIBRARY_PATH
=
/usr/local/lib:
${
LD_LIBRARY_PATH
}
make
-j8
>
/dev/null
LD_LIBRARY_PATH
=
/usr/local/lib:
${
LD_LIBRARY_PATH
}
make
install
>
/dev/null
fi
popd
echo
"ZZZ looking for libpython"
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录