Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
5cedfb60
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5cedfb60
编写于
10月 15, 2018
作者:
S
sneaxiy
浏览文件
操作
浏览文件
下载
差异文件
test=develop
上级
84d93003
8e2fdc54
变更
144
隐藏空白更改
内联
并排
Showing
144 changed file
with
3719 addition
and
716 deletion
+3719
-716
.gitignore
.gitignore
+1
-0
CMakeLists.txt
CMakeLists.txt
+1
-0
Dockerfile
Dockerfile
+12
-2
cmake/cblas.cmake
cmake/cblas.cmake
+1
-1
cmake/cuda.cmake
cmake/cuda.cmake
+4
-1
cmake/external/eigen.cmake
cmake/external/eigen.cmake
+8
-0
cmake/external/openblas.cmake
cmake/external/openblas.cmake
+4
-4
cmake/flags.cmake
cmake/flags.cmake
+2
-0
paddle/fluid/API.spec
paddle/fluid/API.spec
+3
-0
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+18
-3
paddle/fluid/framework/data_type.h
paddle/fluid/framework/data_type.h
+0
-1
paddle/fluid/framework/details/reference_count_pass.cc
paddle/fluid/framework/details/reference_count_pass.cc
+9
-9
paddle/fluid/framework/ir/CMakeLists.txt
paddle/fluid/framework/ir/CMakeLists.txt
+2
-0
paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+327
-0
paddle/fluid/framework/ir/conv_bn_fuse_pass.h
paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+49
-0
paddle/fluid/framework/ir/graph_pattern_detector.cc
paddle/fluid/framework/ir/graph_pattern_detector.cc
+106
-0
paddle/fluid/framework/ir/graph_pattern_detector.h
paddle/fluid/framework/ir/graph_pattern_detector.h
+38
-0
paddle/fluid/framework/naive_executor.cc
paddle/fluid/framework/naive_executor.cc
+17
-0
paddle/fluid/framework/naive_executor.h
paddle/fluid/framework/naive_executor.h
+4
-0
paddle/fluid/framework/op_desc.cc
paddle/fluid/framework/op_desc.cc
+21
-0
paddle/fluid/framework/op_proto_maker.cc
paddle/fluid/framework/op_proto_maker.cc
+1
-3
paddle/fluid/framework/op_proto_maker.h
paddle/fluid/framework/op_proto_maker.h
+0
-1
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+49
-68
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+7
-0
paddle/fluid/framework/rw_lock.h
paddle/fluid/framework/rw_lock.h
+1
-0
paddle/fluid/framework/shape_inference.cc
paddle/fluid/framework/shape_inference.cc
+0
-10
paddle/fluid/framework/shape_inference.h
paddle/fluid/framework/shape_inference.h
+2
-1
paddle/fluid/framework/tensor_util.cc
paddle/fluid/framework/tensor_util.cc
+103
-1
paddle/fluid/framework/tensor_util.h
paddle/fluid/framework/tensor_util.h
+7
-0
paddle/fluid/framework/tensor_util_test.cc
paddle/fluid/framework/tensor_util_test.cc
+67
-21
paddle/fluid/framework/tensor_util_test.cu
paddle/fluid/framework/tensor_util_test.cu
+173
-3
paddle/fluid/inference/CMakeLists.txt
paddle/fluid/inference/CMakeLists.txt
+13
-4
paddle/fluid/inference/analysis/CMakeLists.txt
paddle/fluid/inference/analysis/CMakeLists.txt
+0
-2
paddle/fluid/inference/analysis/analyzer.cc
paddle/fluid/inference/analysis/analyzer.cc
+1
-1
paddle/fluid/inference/analysis/analyzer.h
paddle/fluid/inference/analysis/analyzer.h
+11
-9
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+0
-1
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+6
-1
paddle/fluid/inference/api/api_impl.cc
paddle/fluid/inference/api/api_impl.cc
+0
-1
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+1
-0
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+12
-0
paddle/fluid/inference/api/demo_ci/run.sh
paddle/fluid/inference/api/demo_ci/run.sh
+26
-0
paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+9
-9
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+82
-0
paddle/fluid/inference/api/demo_ci/utils.h
paddle/fluid/inference/api/demo_ci/utils.h
+59
-0
paddle/fluid/inference/api/demo_ci/vis_demo.cc
paddle/fluid/inference/api/demo_ci/vis_demo.cc
+6
-67
paddle/fluid/inference/api/helper.h
paddle/fluid/inference/api/helper.h
+19
-126
paddle/fluid/inference/api/paddle_inference_api.h
paddle/fluid/inference/api/paddle_inference_api.h
+2
-3
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+4
-2
paddle/fluid/inference/tensorrt/convert/pad_op.cc
paddle/fluid/inference/tensorrt/convert/pad_op.cc
+68
-0
paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
+52
-0
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+8
-0
paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
+0
-1
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+96
-0
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+3
-3
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+0
-2
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+123
-0
paddle/fluid/inference/tests/book/CMakeLists.txt
paddle/fluid/inference/tests/book/CMakeLists.txt
+0
-1
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+8
-5
paddle/fluid/operators/activation_op.cc
paddle/fluid/operators/activation_op.cc
+18
-2
paddle/fluid/operators/adadelta_op.cc
paddle/fluid/operators/adadelta_op.cc
+12
-0
paddle/fluid/operators/adadelta_op.h
paddle/fluid/operators/adadelta_op.h
+11
-0
paddle/fluid/operators/adagrad_op.h
paddle/fluid/operators/adagrad_op.h
+20
-13
paddle/fluid/operators/adam_op.h
paddle/fluid/operators/adam_op.h
+6
-0
paddle/fluid/operators/adamax_op.cc
paddle/fluid/operators/adamax_op.cc
+10
-0
paddle/fluid/operators/adamax_op.h
paddle/fluid/operators/adamax_op.h
+11
-0
paddle/fluid/operators/argsort_op.cc
paddle/fluid/operators/argsort_op.cc
+2
-2
paddle/fluid/operators/clip_by_norm_op.h
paddle/fluid/operators/clip_by_norm_op.h
+37
-3
paddle/fluid/operators/conv_shift_op.cc
paddle/fluid/operators/conv_shift_op.cc
+1
-1
paddle/fluid/operators/cub_reduce.h
paddle/fluid/operators/cub_reduce.h
+7
-1
paddle/fluid/operators/decayed_adagrad_op.cc
paddle/fluid/operators/decayed_adagrad_op.cc
+10
-0
paddle/fluid/operators/decayed_adagrad_op.h
paddle/fluid/operators/decayed_adagrad_op.h
+11
-0
paddle/fluid/operators/detection/roi_perspective_transform_op.cc
...fluid/operators/detection/roi_perspective_transform_op.cc
+5
-6
paddle/fluid/operators/detection/roi_perspective_transform_op.cu
...fluid/operators/detection/roi_perspective_transform_op.cu
+3
-3
paddle/fluid/operators/elementwise_op.h
paddle/fluid/operators/elementwise_op.h
+13
-8
paddle/fluid/operators/fake_dequantize_op.cc
paddle/fluid/operators/fake_dequantize_op.cc
+2
-1
paddle/fluid/operators/ftrl_op.cc
paddle/fluid/operators/ftrl_op.cc
+10
-0
paddle/fluid/operators/ftrl_op.h
paddle/fluid/operators/ftrl_op.h
+11
-0
paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
+2
-8
paddle/fluid/operators/fusion_gru_op.cc
paddle/fluid/operators/fusion_gru_op.cc
+3
-2
paddle/fluid/operators/fusion_lstm_op.cc
paddle/fluid/operators/fusion_lstm_op.cc
+2
-1
paddle/fluid/operators/isfinite_op.cc
paddle/fluid/operators/isfinite_op.cc
+113
-0
paddle/fluid/operators/isfinite_op.cu
paddle/fluid/operators/isfinite_op.cu
+33
-0
paddle/fluid/operators/isfinite_op.h
paddle/fluid/operators/isfinite_op.h
+71
-0
paddle/fluid/operators/lookup_table_op.cc
paddle/fluid/operators/lookup_table_op.cc
+1
-0
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+3
-3
paddle/fluid/operators/math/cpu_lstm_compute.cc
paddle/fluid/operators/math/cpu_lstm_compute.cc
+26
-1
paddle/fluid/operators/math/cpu_lstm_compute.h
paddle/fluid/operators/math/cpu_lstm_compute.h
+2
-19
paddle/fluid/operators/math/depthwise_conv.cu
paddle/fluid/operators/math/depthwise_conv.cu
+210
-65
paddle/fluid/operators/math/math_function.cc
paddle/fluid/operators/math/math_function.cc
+9
-0
paddle/fluid/operators/math/math_function.h
paddle/fluid/operators/math/math_function.h
+0
-12
paddle/fluid/operators/math/selected_rows_functor.cc
paddle/fluid/operators/math/selected_rows_functor.cc
+70
-20
paddle/fluid/operators/math/selected_rows_functor.h
paddle/fluid/operators/math/selected_rows_functor.h
+113
-0
paddle/fluid/operators/math/selected_rows_functor_test.cc
paddle/fluid/operators/math/selected_rows_functor_test.cc
+171
-0
paddle/fluid/operators/momentum_op.cc
paddle/fluid/operators/momentum_op.cc
+5
-0
paddle/fluid/operators/momentum_op.cu
paddle/fluid/operators/momentum_op.cu
+11
-0
paddle/fluid/operators/momentum_op.h
paddle/fluid/operators/momentum_op.h
+6
-0
paddle/fluid/operators/prelu_op.cc
paddle/fluid/operators/prelu_op.cc
+1
-1
paddle/fluid/operators/rmsprop_op.cc
paddle/fluid/operators/rmsprop_op.cc
+5
-0
paddle/fluid/operators/rnn_memory_helper_op.cc
paddle/fluid/operators/rnn_memory_helper_op.cc
+1
-1
paddle/fluid/operators/sequence_conv_op.cc
paddle/fluid/operators/sequence_conv_op.cc
+2
-2
paddle/fluid/operators/sequence_erase_op.cc
paddle/fluid/operators/sequence_erase_op.cc
+2
-2
paddle/fluid/operators/sequence_pool_op.cc
paddle/fluid/operators/sequence_pool_op.cc
+3
-2
paddle/fluid/operators/sequence_reshape_op.cc
paddle/fluid/operators/sequence_reshape_op.cc
+1
-1
paddle/fluid/operators/sequence_softmax_op.cc
paddle/fluid/operators/sequence_softmax_op.cc
+2
-1
paddle/fluid/operators/sgd_op.cc
paddle/fluid/operators/sgd_op.cc
+16
-13
paddle/fluid/operators/sgd_op.cu
paddle/fluid/operators/sgd_op.cu
+6
-0
paddle/fluid/operators/shrink_rnn_memory_op.cc
paddle/fluid/operators/shrink_rnn_memory_op.cc
+3
-3
paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
...e/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+1
-1
paddle/fluid/operators/tensorrt_engine_op.h
paddle/fluid/operators/tensorrt_engine_op.h
+1
-1
paddle/fluid/operators/top_k_op.cc
paddle/fluid/operators/top_k_op.cc
+0
-2
paddle/fluid/operators/top_k_op.cu
paddle/fluid/operators/top_k_op.cu
+64
-27
paddle/fluid/operators/truncated_gaussian_random_op.cc
paddle/fluid/operators/truncated_gaussian_random_op.cc
+1
-1
paddle/fluid/operators/truncated_gaussian_random_op.cu
paddle/fluid/operators/truncated_gaussian_random_op.cu
+2
-1
paddle/fluid/operators/uniform_random_op.cc
paddle/fluid/operators/uniform_random_op.cc
+16
-16
paddle/fluid/operators/while_op.cc
paddle/fluid/operators/while_op.cc
+6
-4
paddle/fluid/platform/gpu_info.cc
paddle/fluid/platform/gpu_info.cc
+5
-2
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+47
-18
paddle/fluid/pybind/const_value.cc
paddle/fluid/pybind/const_value.cc
+0
-3
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+140
-18
paddle/fluid/train/CMakeLists.txt
paddle/fluid/train/CMakeLists.txt
+0
-1
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+7
-7
python/CMakeLists.txt
python/CMakeLists.txt
+1
-1
python/paddle/dataset/flowers.py
python/paddle/dataset/flowers.py
+4
-3
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+1
-10
python/paddle/fluid/layers/control_flow.py
python/paddle/fluid/layers/control_flow.py
+4
-0
python/paddle/fluid/layers/io.py
python/paddle/fluid/layers/io.py
+5
-1
python/paddle/fluid/layers/ops.py
python/paddle/fluid/layers/ops.py
+12
-5
python/paddle/fluid/layers/tensor.py
python/paddle/fluid/layers/tensor.py
+54
-16
python/paddle/fluid/lod_tensor.py
python/paddle/fluid/lod_tensor.py
+1
-1
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+12
-0
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+19
-2
python/paddle/fluid/tests/CMakeLists.txt
python/paddle/fluid/tests/CMakeLists.txt
+1
-0
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+5
-3
python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
+57
-0
python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
...n/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+53
-0
python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py
.../fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py
+218
-0
python/paddle/fluid/tests/unittests/test_isfinite_op.py
python/paddle/fluid/tests/unittests/test_isfinite_op.py
+97
-0
python/paddle/fluid/tests/unittests/test_operator_desc.py
python/paddle/fluid/tests/unittests/test_operator_desc.py
+1
-1
python/paddle/fluid/tests/unittests/test_reduce_op.py
python/paddle/fluid/tests/unittests/test_reduce_op.py
+82
-0
python/paddle/fluid/tests/unittests/test_rmsprop_op.py
python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+6
-3
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+2
-1
python/paddle/fluid/transpiler/inference_transpiler.py
python/paddle/fluid/transpiler/inference_transpiler.py
+2
-2
python/paddle/reader/decorator.py
python/paddle/reader/decorator.py
+37
-1
python/paddle/reader/tests/decorator_test.py
python/paddle/reader/tests/decorator_test.py
+16
-0
未找到文件。
.gitignore
浏览文件 @
5cedfb60
...
...
@@ -25,5 +25,6 @@ third_party/
bazel-*
third_party/
build_*
# clion workspace.
cmake-build-*
CMakeLists.txt
浏览文件 @
5cedfb60
...
...
@@ -72,6 +72,7 @@ option(WITH_INFERENCE "Compile fluid inference library" ON)
option
(
WITH_INFERENCE_API_TEST
"Test fluid inference high-level api interface"
OFF
)
option
(
WITH_SYSTEM_BLAS
"Use system blas library"
OFF
)
option
(
PY_VERSION
"Compile PaddlePaddle with python3 support"
${
PY_VERSION
}
)
option
(
WITH_FAST_MATH
"Make use of fast math library, might affect the precision to some extent"
ON
)
# PY_VERSION
if
(
NOT PY_VERSION
)
...
...
Dockerfile
浏览文件 @
5cedfb60
...
...
@@ -24,6 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/
RUN
apt-get update
&&
\
apt-get
install
-y
--allow-downgrades
patchelf
\
python3 python3-dev python3-pip
\
git python-pip python-dev python-opencv openssh-server bison
\
libnccl2
=
2.1.2-1+cuda8.0 libnccl-dev
=
2.1.2-1+cuda8.0
\
wget unzip unrar
tar
xz-utils bzip2
gzip
coreutils ntp
\
...
...
@@ -70,24 +71,33 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
# version(1.7.1 for now), which causes building documentation failed.
RUN
easy_install
-U
pip
&&
\
RUN
pip3
install
-U
wheel
&&
\
pip3
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
pip3
install
sphinx-rtd-theme
==
0.1.9 recommonmark
&&
\
easy_install
-U
pip
&&
\
pip
install
-U
wheel
&&
\
pip
install
-U
docopt PyYAML
sphinx
==
1.5.6
&&
\
pip
install
sphinx-rtd-theme
==
0.1.9 recommonmark
RUN
pip
install
pre-commit
'ipython==5.3.0'
&&
\
RUN
pip3
install
pre-commit
'ipython==5.3.0'
&&
\
pip3
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip3
install
opencv-python
&&
\
pip
install
pre-commit
'ipython==5.3.0'
&&
\
pip
install
'ipykernel==4.6.0'
'jupyter==1.0.0'
&&
\
pip
install
opencv-python
#For docstring checker
RUN
pip3
install
pylint pytest astroid isort
RUN
pip
install
pylint pytest astroid isort LinkChecker
COPY
./python/requirements.txt /root/
RUN
pip3
install
-r
/root/requirements.txt
RUN
pip
install
-r
/root/requirements.txt
# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
RUN
apt-get
install
-y
libssl-dev libffi-dev
RUN
pip3
install
certifi urllib3[secure]
RUN
pip
install
certifi urllib3[secure]
...
...
cmake/cblas.cmake
浏览文件 @
5cedfb60
...
...
@@ -40,7 +40,7 @@ set(OPENBLAS_LIB_SEARCH_PATHS
/usr/local/opt/openblas/lib
)
find_path
(
OPENBLAS_INC_DIR NAMES cblas.h
PATHS
${
OPENBLAS_INCLUDE_SEARCH_PATHS
}
)
PATHS
${
OPENBLAS_INCLUDE_SEARCH_PATHS
}
NO_DEFAULT_PATH
)
find_path
(
OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
PATHS
${
OPENBLAS_INCLUDE_SEARCH_PATHS
}
)
find_library
(
OPENBLAS_LIB NAMES openblas
...
...
cmake/cuda.cmake
浏览文件 @
5cedfb60
...
...
@@ -175,7 +175,10 @@ list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list
(
APPEND CUDA_NVCC_FLAGS
"-Xcompiler -fPIC"
)
endif
(
NOT WIN32
)
list
(
APPEND CUDA_NVCC_FLAGS
"--use_fast_math"
)
if
(
WITH_FAST_MATH
)
# Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
list
(
APPEND CUDA_NVCC_FLAGS
"--use_fast_math"
)
endif
()
# in cuda9, suppress cuda warning on eigen
list
(
APPEND CUDA_NVCC_FLAGS
"-w"
)
# Set :expt-relaxed-constexpr to suppress Eigen warnings
...
...
cmake/external/eigen.cmake
浏览文件 @
5cedfb60
...
...
@@ -3,6 +3,14 @@ INCLUDE(ExternalProject)
SET
(
EIGEN_SOURCE_DIR
${
THIRD_PARTY_PATH
}
/eigen3
)
SET
(
EIGEN_INCLUDE_DIR
${
EIGEN_SOURCE_DIR
}
/src/extern_eigen3
)
INCLUDE_DIRECTORIES
(
${
EIGEN_INCLUDE_DIR
}
)
if
(
NOT WITH_FAST_MATH
)
# EIGEN_FAST_MATH: https://eigen.tuxfamily.org/dox/TopicPreprocessorDirectives.html
# enables some optimizations which might affect the accuracy of the result.
# This currently enables the SSE vectorization of sin() and cos(),
# and speedups sqrt() for single precision.
# Defined to 1 by default. Define it to 0 to disable.
add_definitions
(
-DEIGEN_FAST_MATH=0
)
endif
()
if
(
WITH_AMD_GPU
)
ExternalProject_Add
(
...
...
cmake/external/openblas.cmake
浏览文件 @
5cedfb60
...
...
@@ -27,7 +27,7 @@ IF(NOT ${CBLAS_FOUND})
SET
(
CBLAS_SOURCES_DIR
${
THIRD_PARTY_PATH
}
/openblas
)
SET
(
CBLAS_INSTALL_DIR
${
THIRD_PARTY_PATH
}
/install/openblas
)
SET
(
CBLAS_INC
LUDE
_DIR
"
${
CBLAS_INSTALL_DIR
}
/include"
CACHE PATH
"openblas include directory."
FORCE
)
SET
(
CBLAS_INC_DIR
"
${
CBLAS_INSTALL_DIR
}
/include"
CACHE PATH
"openblas include directory."
FORCE
)
SET
(
CBLAS_LIBRARIES
"
${
CBLAS_INSTALL_DIR
}
/lib/
${
CMAKE_STATIC_LIBRARY_PREFIX
}
openblas
${
CMAKE_STATIC_LIBRARY_SUFFIX
}
"
...
...
@@ -96,7 +96,7 @@ IF(NOT ${CBLAS_FOUND})
ENDIF
(
NOT WIN32
)
SET
(
CBLAS_PROVIDER openblas
)
IF
(
WITH_C_API
)
INSTALL
(
DIRECTORY
${
CBLAS_INC
LUDE
_DIR
}
DESTINATION third_party/openblas
)
INSTALL
(
DIRECTORY
${
CBLAS_INC_DIR
}
DESTINATION third_party/openblas
)
# Because libopenblas.a is a symbolic link of another library, thus need to
# install the whole directory.
IF
(
ANDROID
)
...
...
@@ -117,8 +117,8 @@ IF(NOT ${CBLAS_FOUND})
ENDIF
(
NOT
${
CBLAS_FOUND
}
)
MESSAGE
(
STATUS
"BLAS library:
${
CBLAS_LIBRARIES
}
"
)
MESSAGE
(
STATUS
"BLAS Include:
${
CBLAS_INC
LUDE
_DIR
}
"
)
INCLUDE_DIRECTORIES
(
${
CBLAS_INC
LUDE
_DIR
}
)
MESSAGE
(
STATUS
"BLAS Include:
${
CBLAS_INC_DIR
}
"
)
INCLUDE_DIRECTORIES
(
${
CBLAS_INC_DIR
}
)
# FIXME(gangliao): generate cblas target to track all high performance
# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
...
...
cmake/flags.cmake
浏览文件 @
5cedfb60
...
...
@@ -157,6 +157,8 @@ if (APPLE)
# On Mac OS X build fat binaries with x86_64 architectures by default.
set
(
CMAKE_OSX_ARCHITECTURES
"x86_64"
CACHE STRING
"Build architectures for OSX"
FORCE
)
endif
()
# On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0
set
(
COMMON_FLAGS -Wno-deprecated-register
)
endif
(
APPLE
)
if
(
LINUX
)
...
...
paddle/fluid/API.spec
浏览文件 @
5cedfb60
...
...
@@ -198,6 +198,9 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None
paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.has_inf ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.has_nan ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.isfinite ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
...
...
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
5cedfb60
# windows treat symbolic file as a real file, which is different with unix
# We create a hidden file and compile it instead of origin source file.
function
(
windows_symbolic TARGET
)
...
...
@@ -9,11 +10,23 @@ function(windows_symbolic TARGET)
if
(
NOT EXISTS
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
src
}
.cc OR NOT EXISTS
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
src
}
.cu
)
message
(
FATAL
"
${
src
}
.cc and
${
src
}
.cu must exsits, and
${
src
}
.cu must be symbolic file."
)
endif
()
add_custom_command
(
OUTPUT .
${
src
}
.cu
# only copy the xx.cu to .xx.cu when the content are modified
set
(
copy_flag 1
)
if
(
EXISTS
${
CMAKE_CURRENT_SOURCE_DIR
}
/.
${
src
}
.cu
)
file
(
READ
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
src
}
.cc SOURCE_STR
)
file
(
READ
${
CMAKE_CURRENT_SOURCE_DIR
}
/.
${
src
}
.cu TARGET_STR
)
if
(
SOURCE_STR STREQUAL TARGET_STR
)
set
(
copy_flag 0
)
endif
()
endif
()
if
(
copy_flag
)
add_custom_command
(
OUTPUT .
${
src
}
.cu
COMMAND
${
CMAKE_COMMAND
}
-E remove
${
CMAKE_CURRENT_SOURCE_DIR
}
/.
${
src
}
.cu
COMMAND
${
CMAKE_COMMAND
}
-E copy
"
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
src
}
.cc"
"
${
CMAKE_CURRENT_SOURCE_DIR
}
/.
${
src
}
.cu"
COMMENT
"create hidden file of
${
src
}
.cu"
)
add_custom_target
(
${
TARGET
}
ALL DEPENDS .
${
src
}
.cu
)
endif
(
copy_flag
)
add_custom_target
(
${
TARGET
}
ALL DEPENDS .
${
src
}
.cu
)
endforeach
()
endfunction
()
...
...
@@ -81,6 +94,8 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
if
(
WITH_GPU
)
if
(
WIN32
)
# windows treat symbolic file as a real file, which is different with unix
# We create a hidden file and compile it instead of origin source file.
windows_symbolic
(
hidden_file SRCS data_type_transform.cu
)
nv_library
(
data_type_transform SRCS .data_type_transform.cu DEPS tensor
)
add_dependencies
(
data_type_transform hidden_file
)
...
...
@@ -149,7 +164,7 @@ if(WITH_DISTRIBUTE)
set_source_files_properties
(
executor.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
else
()
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass
)
cc_test
(
test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor
op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass
elementwise_add_op
)
cc_test
(
test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op
)
endif
()
if
(
NOT WIN32
)
...
...
paddle/fluid/framework/data_type.h
浏览文件 @
5cedfb60
...
...
@@ -17,7 +17,6 @@ limitations under the License. */
#include <typeindex>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
...
...
paddle/fluid/framework/details/reference_count_pass.cc
浏览文件 @
5cedfb60
...
...
@@ -80,15 +80,15 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
// This is weird but there is really some variables without var_desc
// in computation_op
if
(
var_desc
==
nullptr
)
{
if
(
compute_op
->
Node
()
->
Op
()
->
Block
()
->
FindVar
(
var_name
)
==
nullptr
)
continue
;
}
else
{
if
(
var_desc
->
Persistable
())
continue
;
auto
var_type
=
var_desc
->
Proto
()
->
type
().
type
()
;
if
(
var_type
!=
proto
::
VarType
::
LOD_TENSOR
&&
var_type
!=
proto
::
VarType
::
SELECTED_ROWS
)
{
continue
;
}
var_desc
=
compute_op
->
Node
()
->
Op
()
->
Block
()
->
FindVar
(
var_name
);
if
(
var_desc
==
nullptr
)
continue
;
}
if
(
var_desc
->
Persistable
())
continue
;
auto
var_type
=
var_desc
->
Proto
()
->
type
().
type
();
if
(
var_type
!=
proto
::
VarType
::
LOD_TENSOR
&&
var_type
!=
proto
::
VarType
::
SELECTED_ROWS
)
{
continue
;
}
// compute op only runs in one device
...
...
paddle/fluid/framework/ir/CMakeLists.txt
浏览文件 @
5cedfb60
set
(
pass_file
${
PADDLE_BINARY_DIR
}
/paddle/fluid/inference/api/paddle_inference_pass.h
)
file
(
WRITE
${
pass_file
}
"// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!
\n\n
"
)
file
(
APPEND
${
pass_file
}
"
\#
pragma once
\n
"
)
file
(
APPEND
${
pass_file
}
"
\#
include
\"
paddle/fluid/framework/ir/pass.h
\"\n
"
)
...
...
@@ -37,6 +38,7 @@ pass_library(fc_lstm_fuse_pass inference)
pass_library
(
embedding_fc_lstm_fuse_pass inference
)
pass_library
(
fc_gru_fuse_pass inference
)
pass_library
(
seq_concat_fc_fuse_pass inference
)
pass_library
(
conv_bn_fuse_pass inference
)
cc_library
(
fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector
)
...
...
paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
0 → 100644
浏览文件 @
5cedfb60
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_bn_fuse_pass.h"
#include <functional>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
#define GET_CONV_BN_NODES(pattern_name) \
/* OPERATORS */
\
GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(batch_norm, batch_norm, pattern_name); \
/* CONV inputs */
\
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \
/* CONV outputs */
\
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \
/* BN inputs */
\
GET_IR_NODE_FROM_SUBGRAPH(bn_scale, bn_scale, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(bn_bias, bn_bias, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(bn_mean, bn_mean, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(bn_variance, bn_variance, pattern_name); \
/* BN outputs */
\
GET_IR_NODE_FROM_SUBGRAPH(bn_out, bn_out, pattern_name);
/* Out */
\
GET_IR_NODE_FROM_SUBGRAPH(bn_mean_out, bn_mean_out, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(bn_variance_out, bn_variance_out, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name)
template
<
typename
UnaryOperation
>
LoDTensor
tensor_apply
(
const
LoDTensor
&
vec
,
UnaryOperation
f
)
{
LoDTensor
vec_y
;
vec_y
.
Resize
(
vec
.
dims
());
const
float
*
x
=
vec
.
data
<
float
>
();
float
*
y
=
vec_y
.
mutable_data
<
float
>
(
platform
::
CPUPlace
());
for
(
int64_t
i
=
0
;
i
<
vec
.
numel
();
i
++
)
{
y
[
i
]
=
f
(
x
[
i
]);
}
return
vec_y
;
}
void
tensor_apply_inplace
(
LoDTensor
*
vec
,
float
(
*
f
)(
float
))
{
float
*
data
=
vec
->
mutable_data
<
float
>
(
platform
::
CPUPlace
());
for
(
int64_t
i
=
0
;
i
<
vec
->
numel
();
i
++
)
{
data
[
i
]
=
f
(
data
[
i
]);
}
}
template
<
typename
BinaryOperation
>
LoDTensor
tensor_apply_eltwise
(
const
LoDTensor
&
vec_a
,
const
LoDTensor
&
vec_b
,
BinaryOperation
f
)
{
PADDLE_ENFORCE_EQ
(
vec_a
.
dims
(),
vec_b
.
dims
());
LoDTensor
vec_y
;
vec_y
.
Resize
(
vec_a
.
dims
());
const
float
*
a
=
vec_a
.
data
<
float
>
();
const
float
*
b
=
vec_b
.
data
<
float
>
();
float
*
y
=
vec_y
.
mutable_data
<
float
>
(
platform
::
CPUPlace
());
for
(
int64_t
i
=
0
;
i
<
vec_a
.
numel
();
i
++
)
{
y
[
i
]
=
f
(
a
[
i
],
b
[
i
]);
}
return
vec_y
;
}
template
<
typename
BinaryOperation
>
LoDTensor
tensor_apply_eltwise_broadcast
(
const
LoDTensor
&
vec_a
,
const
LoDTensor
&
vec_b
,
BinaryOperation
f
)
{
PADDLE_ENFORCE_EQ
(
vec_a
.
dims
().
size
(),
2
);
PADDLE_ENFORCE_EQ
(
vec_b
.
dims
().
size
(),
2
);
PADDLE_ENFORCE_EQ
(
vec_a
.
dims
()[
0
],
vec_b
.
dims
()[
0
]);
PADDLE_ENFORCE_EQ
(
vec_b
.
dims
()[
1
],
1
);
LoDTensor
vec_y
;
vec_y
.
Resize
(
vec_a
.
dims
());
const
float
*
a
=
vec_a
.
data
<
float
>
();
const
float
*
b
=
vec_b
.
data
<
float
>
();
float
*
y
=
vec_y
.
mutable_data
<
float
>
(
platform
::
CPUPlace
());
size_t
a_height
=
vec_a
.
dims
()[
0
];
size_t
a_width
=
vec_a
.
dims
()[
1
];
for
(
size_t
h
=
0
;
h
<
a_height
;
h
++
)
{
for
(
size_t
w
=
0
;
w
<
a_width
;
++
w
)
{
*
(
y
++
)
=
f
(
*
(
a
++
),
b
[
h
]);
}
}
return
vec_y
;
}
// reshape to two dimensions {A, B * C * ...}
void
make_tensor_2d
(
LoDTensor
*
tensor_to_reshape
)
{
auto
dims_count
=
tensor_to_reshape
->
dims
().
size
();
PADDLE_ENFORCE_GT
(
dims_count
,
0
);
int
size2
=
1
;
for
(
int
i
=
1
;
i
<
dims_count
;
i
++
)
{
size2
*=
tensor_to_reshape
->
dims
()[
i
];
}
tensor_to_reshape
->
Resize
(
make_ddim
({
tensor_to_reshape
->
dims
()[
0
],
size2
}));
}
void
recompute_conv_weights
(
LoDTensor
*
weights
,
LoDTensor
*
tmp
)
{
// remember the weights tensor shape {A, B, C, ...}
auto
weights_shape
=
weights
->
dims
();
// reduce the weights to 2d {A, B * C * ...}
make_tensor_2d
(
weights
);
// make tmp tensor 2d by adding 1 as second dim {A, 1}
make_tensor_2d
(
tmp
);
*
weights
=
tensor_apply_eltwise_broadcast
(
*
weights
,
*
tmp
,
std
::
multiplies
<
float
>
());
// reshape weights to the original dims {A, B, C, ...}
weights
->
Resize
(
weights_shape
);
}
void
recompute_bias_and_weights
(
const
Scope
*
scope
,
ir
::
Node
*
conv_weight
,
//
const
ir
::
Node
&
bn_scale
,
//
const
LoDTensor
&
bn_bias_tensor
,
//
const
ir
::
Node
&
bn_mean
,
//
const
ir
::
Node
&
bn_variance
,
//
LoDTensor
*
eltwise_y_in_tensor
,
//
float
epsilon
)
{
// Re-compute bias of conv2d from BN
PADDLE_ENFORCE_EQ
(
eltwise_y_in_tensor
->
dims
(),
bn_bias_tensor
.
dims
());
auto
*
scale_tensor
=
scope
->
FindVar
(
bn_scale
.
Name
())
->
GetMutable
<
LoDTensor
>
();
auto
*
variance_tensor
=
scope
->
FindVar
(
bn_variance
.
Name
())
->
GetMutable
<
LoDTensor
>
();
auto
*
mean_tensor
=
scope
->
FindVar
(
bn_mean
.
Name
())
->
GetMutable
<
LoDTensor
>
();
auto
std_tensor
=
LoDTensor
();
std_tensor
.
Resize
(
bn_bias_tensor
.
dims
());
std_tensor
=
tensor_apply
(
*
variance_tensor
,
[
&
](
float
x
)
{
return
x
+
epsilon
;
});
using
EigenVectorArrayMap
=
Eigen
::
Map
<
Eigen
::
Array
<
float
,
Eigen
::
Dynamic
,
1
>>
;
EigenVectorArrayMap
std_vec
(
std_tensor
.
mutable_data
<
float
>
(
platform
::
CPUPlace
()),
std_tensor
.
numel
(),
1
);
std_vec
=
std_vec
.
sqrt
();
auto
tmp_tensor
=
tensor_apply_eltwise
(
*
scale_tensor
,
std_tensor
,
std
::
divides
<
float
>
());
auto
tensor_minus
=
tensor_apply_eltwise
(
*
eltwise_y_in_tensor
,
*
mean_tensor
,
std
::
minus
<
float
>
());
auto
tensor_mul
=
tensor_apply_eltwise
(
tensor_minus
,
tmp_tensor
,
std
::
multiplies
<
float
>
());
*
eltwise_y_in_tensor
=
tensor_apply_eltwise
(
tensor_mul
,
bn_bias_tensor
,
std
::
plus
<
float
>
());
// Re-compute weight of conv2d from BN
auto
*
current_param
=
scope
->
FindVar
(
conv_weight
->
Name
())
->
GetMutable
<
LoDTensor
>
();
recompute_conv_weights
(
current_param
,
&
tmp_tensor
);
}
std
::
unique_ptr
<
ir
::
Graph
>
ConvBNFusePass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
PADDLE_ENFORCE
(
graph
.
get
());
FusePassBase
::
Init
(
name_scope_
,
graph
.
get
());
auto
*
scope
=
param_scope
();
PADDLE_ENFORCE
(
scope
);
GraphPatternDetector
gpd
;
auto
*
conv_input
=
gpd
.
mutable_pattern
()
->
NewNode
(
patterns
::
PDNodeName
(
name_scope_
,
"conv_input"
))
->
AsInput
()
->
assert_is_op_input
(
"conv2d"
,
"Input"
);
patterns
::
ConvBN
conv_bn_pattern
(
gpd
.
mutable_pattern
(),
name_scope_
);
conv_bn_pattern
(
conv_input
,
false
/*with_eltwise_add*/
);
int
found_conv_bn_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
)
<<
"handle ConvBN fuse"
;
// conv, batch_norm,
// conv_weight, conv_out,
// bn_scale, bn_bias, bn_mean, bn_variance,
// bn_out, bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance
GET_CONV_BN_NODES
(
conv_bn_pattern
);
// Create eltwise_y (conv bias) variable
VarDesc
eltwise_y_in_desc
(
patterns
::
PDNodeName
(
name_scope_
,
"eltwise_y_in"
));
auto
*
eltwise_y_in_node
=
g
->
CreateVarNode
(
&
eltwise_y_in_desc
);
auto
*
eltwise_y_in_tensor
=
scope
->
Var
(
eltwise_y_in_node
->
Name
())
->
GetMutable
<
LoDTensor
>
();
// Get batch norm bias
auto
*
bn_bias_tensor
=
scope
->
FindVar
(
bn_bias
->
Name
())
->
GetMutable
<
LoDTensor
>
();
// Initialize eltwise_y
eltwise_y_in_tensor
->
Resize
(
bn_bias_tensor
->
dims
());
std
::
fill_n
(
eltwise_y_in_tensor
->
mutable_data
<
float
>
(
platform
::
CPUPlace
()),
eltwise_y_in_tensor
->
numel
(),
0.0
f
);
// update weights and biases
float
epsilon
=
boost
::
get
<
float
>
(
batch_norm
->
Op
()
->
GetAttr
(
"epsilon"
));
recompute_bias_and_weights
(
scope
,
conv_weight
,
*
bn_scale
,
*
bn_bias_tensor
,
*
bn_mean
,
*
bn_variance
,
eltwise_y_in_tensor
,
epsilon
);
// Create an elementwise add node
OpDesc
desc
;
desc
.
SetInput
(
"X"
,
std
::
vector
<
std
::
string
>
({
conv_out
->
Name
()}));
desc
.
SetInput
(
"Y"
,
std
::
vector
<
std
::
string
>
({
eltwise_y_in_node
->
Name
()}));
desc
.
SetOutput
(
"Out"
,
std
::
vector
<
std
::
string
>
({
bn_out
->
Name
()}));
desc
.
SetType
(
"elementwise_add"
);
desc
.
SetAttr
(
"axis"
,
1
);
bool
a
=
boost
::
get
<
bool
>
(
conv
->
Op
()
->
GetAttr
(
"use_mkldnn"
));
desc
.
SetAttr
(
"use_mkldnn"
,
a
);
auto
eltwise_op
=
g
->
CreateOpNode
(
&
desc
);
// OpDesc will be copied.
GraphSafeRemoveNodes
(
graph
.
get
(),
{
bn_scale
,
bn_bias
,
bn_mean
,
bn_variance
,
batch_norm
,
bn_mean_out
,
bn_variance_out
,
bn_saved_mean
,
bn_saved_variance
});
PADDLE_ENFORCE
(
subgraph
.
count
(
conv_input
));
IR_NODE_LINK_TO
(
conv_out
,
eltwise_op
);
IR_NODE_LINK_TO
(
eltwise_y_in_node
,
eltwise_op
);
IR_NODE_LINK_TO
(
eltwise_op
,
bn_out
);
found_conv_bn_count
++
;
};
gpd
(
graph
.
get
(),
handler
);
AddStatis
(
found_conv_bn_count
);
return
graph
;
}
std
::
unique_ptr
<
ir
::
Graph
>
ConvEltwiseAddBNFusePass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
PADDLE_ENFORCE
(
graph
.
get
());
FusePassBase
::
Init
(
name_scope_
,
graph
.
get
());
auto
*
scope
=
param_scope
();
PADDLE_ENFORCE
(
scope
);
GraphPatternDetector
gpd
;
auto
*
conv_input
=
gpd
.
mutable_pattern
()
->
NewNode
(
patterns
::
PDNodeName
(
name_scope_
,
"conv_input"
))
->
AsInput
()
->
assert_is_op_input
(
"conv2d"
,
"Input"
);
patterns
::
ConvBN
conv_bn_pattern
(
gpd
.
mutable_pattern
(),
name_scope_
);
conv_bn_pattern
(
conv_input
,
true
/*with_eltwise_add*/
);
int
found_conv_bn_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
)
<<
"handle ConvBN fuse"
;
// conv, batch_norm,
// conv_weight, conv_out,
// bn_scale, bn_bias, bn_mean, bn_variance,
// bn_out, bn_mean_out, bn_variance_out, bn_saved_mean,bn_saved_variance
GET_CONV_BN_NODES
(
conv_bn_pattern
);
// OPERATORS
GET_IR_NODE_FROM_SUBGRAPH
(
eltwise
,
eltwise
,
conv_bn_pattern
);
// BIAS inputs
GET_IR_NODE_FROM_SUBGRAPH
(
eltwise_y_in
,
eltwise_y_in
,
conv_bn_pattern
);
// BIAS outputs
GET_IR_NODE_FROM_SUBGRAPH
(
eltwise_out
,
eltwise_out
,
conv_bn_pattern
);
// Get eltwise_y (conv bias) variable
auto
*
eltwise_y_in_tensor
=
scope
->
FindVar
(
eltwise_y_in
->
Name
())
->
GetMutable
<
LoDTensor
>
();
// Get batch norm bias
auto
*
bn_bias_tensor
=
scope
->
FindVar
(
bn_bias
->
Name
())
->
GetMutable
<
LoDTensor
>
();
// update weights and biases
float
epsilon
=
boost
::
get
<
float
>
(
batch_norm
->
Op
()
->
GetAttr
(
"epsilon"
));
recompute_bias_and_weights
(
scope
,
conv_weight
,
*
bn_scale
,
*
bn_bias_tensor
,
*
bn_mean
,
*
bn_variance
,
eltwise_y_in_tensor
,
epsilon
);
// Update the elementwise_add node
eltwise
->
Op
()
->
SetAttr
(
"axis"
,
1
);
eltwise
->
Op
()
->
SetOutput
(
"Out"
,
std
::
vector
<
std
::
string
>
({
bn_out
->
Name
()}));
GraphSafeRemoveNodes
(
graph
.
get
(),
{
bn_scale
,
bn_bias
,
bn_mean
,
bn_variance
,
batch_norm
,
bn_mean_out
,
bn_variance_out
,
bn_saved_mean
,
bn_saved_variance
,
eltwise_out
});
PADDLE_ENFORCE
(
subgraph
.
count
(
conv_input
));
IR_NODE_LINK_TO
(
eltwise
,
bn_out
);
found_conv_bn_count
++
;
};
gpd
(
graph
.
get
(),
handler
);
AddStatis
(
found_conv_bn_count
);
return
graph
;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
REGISTER_PASS
(
conv_bn_fuse_pass
,
paddle
::
framework
::
ir
::
ConvBNFusePass
);
REGISTER_PASS
(
conv_eltwiseadd_bn_fuse_pass
,
paddle
::
framework
::
ir
::
ConvEltwiseAddBNFusePass
);
paddle/fluid/
inference/api/timer
.h
→
paddle/fluid/
framework/ir/conv_bn_fuse_pass
.h
浏览文件 @
5cedfb60
...
...
@@ -11,29 +11,39 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono> // NOLINT
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace
paddle
{
namespace
inference
{
namespace
framework
{
namespace
ir
{
/*
* Fuse the Conv and BatchNorm to a ConvBNMKLDNNOp.
*/
class
ConvBNFusePass
:
public
FusePassBase
{
public:
virtual
~
ConvBNFusePass
()
{}
// Timer for timer
class
Timer
{
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
;
const
std
::
string
name_scope_
{
"conv_bn_fuse"
};
};
class
ConvEltwiseAddBNFusePass
:
public
FusePassBase
{
public:
std
::
chrono
::
high_resolution_clock
::
time_point
start
;
std
::
chrono
::
high_resolution_clock
::
time_point
startu
;
void
tic
()
{
start
=
std
::
chrono
::
high_resolution_clock
::
now
();
}
double
toc
()
{
startu
=
std
::
chrono
::
high_resolution_clock
::
now
();
std
::
chrono
::
duration
<
double
>
time_span
=
std
::
chrono
::
duration_cast
<
std
::
chrono
::
duration
<
double
>>
(
startu
-
start
);
double
used_time_ms
=
static_cast
<
double
>
(
time_span
.
count
())
*
1000.0
;
return
used_time_ms
;
}
virtual
~
ConvEltwiseAddBNFusePass
()
{}
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
;
const
std
::
string
name_scope_
{
"conv_eltwiseadd_bn_fuse"
};
};
}
// namespace inference
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph_pattern_detector.cc
浏览文件 @
5cedfb60
...
...
@@ -626,6 +626,112 @@ bool VarLinksFromOp(Node *node, const std::string &op_type) {
return
false
;
}
PDNode
*
patterns
::
ConvBN
::
operator
()(
paddle
::
framework
::
ir
::
PDNode
*
conv_input
,
bool
with_eltwise_add
)
{
// Create Operators
conv_input
->
assert_is_op_input
(
"conv2d"
,
"Input"
);
auto
*
conv_op
=
pattern
->
NewNode
(
conv_repr
())
->
assert_is_op
(
"conv2d"
);
PDNode
*
eltwise_op
=
nullptr
;
if
(
with_eltwise_add
)
{
eltwise_op
=
pattern
->
NewNode
(
eltwise_repr
())
->
assert_is_op
(
"elementwise_add"
);
}
auto
*
batch_norm_op
=
pattern
->
NewNode
(
batch_norm_repr
())
->
assert_is_op
(
"batch_norm"
);
// Create variables
// Conv Filter
auto
*
conv_weight_var
=
pattern
->
NewNode
(
conv_weight_repr
())
->
AsInput
()
->
assert_is_persistable_var
()
->
assert_is_op_input
(
"conv2d"
,
"Filter"
);
auto
*
conv_out_var
=
pattern
->
NewNode
(
conv_out_repr
())
->
AsIntermediate
()
->
assert_is_only_output_of_op
(
"conv2d"
);
PDNode
*
eltwise_y_in_var
=
nullptr
;
PDNode
*
eltwise_out_var
=
nullptr
;
if
(
with_eltwise_add
)
{
// Conv output as Bias input
conv_out_var
->
assert_is_op_input
(
"elementwise_add"
,
"X"
);
// Bias
eltwise_y_in_var
=
pattern
->
NewNode
(
eltwise_y_in_repr
())
->
assert_is_op_input
(
"elementwise_add"
,
"Y"
)
->
AsInput
();
eltwise_out_var
=
pattern
->
NewNode
(
eltwise_out_repr
())
->
AsIntermediate
()
->
assert_is_only_output_of_op
(
"elementwise_add"
);
}
else
{
// Conv output as BN input
conv_out_var
->
assert_is_op_input
(
"batch_norm"
,
"X"
);
}
// BN Scale
auto
*
bn_scale_var
=
pattern
->
NewNode
(
bn_scale_repr
())
->
AsInput
()
->
assert_is_persistable_var
()
->
assert_is_op_input
(
"batch_norm"
,
"Scale"
);
// BN Bias
auto
*
bn_bias_var
=
pattern
->
NewNode
(
bn_bias_repr
())
->
AsInput
()
->
assert_is_persistable_var
()
->
assert_is_op_input
(
"batch_norm"
,
"Bias"
);
// BN Mean
auto
*
bn_mean_var
=
pattern
->
NewNode
(
bn_mean_repr
())
->
AsInput
()
->
assert_is_persistable_var
()
->
assert_is_op_input
(
"batch_norm"
,
"Mean"
);
// BN Variance
auto
*
bn_variance_var
=
pattern
->
NewNode
(
bn_variance_repr
())
->
AsInput
()
->
assert_is_persistable_var
()
->
assert_is_op_input
(
"batch_norm"
,
"Variance"
);
// BN output
auto
*
bn_out_var
=
pattern
->
NewNode
(
bn_out_repr
())
->
AsOutput
()
->
assert_is_op_output
(
"batch_norm"
);
auto
*
bn_mean_out_var
=
pattern
->
NewNode
(
bn_mean_out_repr
())
->
AsOutput
()
->
assert_is_op_output
(
"batch_norm"
,
"MeanOut"
);
auto
*
bn_variance_out_var
=
pattern
->
NewNode
(
bn_variance_out_repr
())
->
AsOutput
()
->
assert_is_op_output
(
"batch_norm"
,
"VarianceOut"
);
auto
*
bn_saved_mean_var
=
pattern
->
NewNode
(
bn_saved_mean_repr
())
->
AsOutput
()
->
assert_is_op_output
(
"batch_norm"
,
"SavedMean"
);
auto
*
bn_saved_variance_var
=
pattern
->
NewNode
(
bn_saved_variance_repr
())
->
AsOutput
()
->
assert_is_op_output
(
"batch_norm"
,
"SavedVariance"
);
conv_op
->
LinksFrom
({
conv_input
,
conv_weight_var
}).
LinksTo
({
conv_out_var
});
if
(
with_eltwise_add
)
{
eltwise_op
->
LinksFrom
({
conv_out_var
,
eltwise_y_in_var
})
.
LinksTo
({
eltwise_out_var
});
batch_norm_op
->
LinksFrom
({
eltwise_out_var
,
bn_scale_var
,
bn_bias_var
,
bn_mean_var
,
bn_variance_var
})
.
LinksTo
({
bn_out_var
,
bn_mean_out_var
,
bn_variance_out_var
,
bn_saved_mean_var
,
bn_saved_variance_var
});
}
else
{
batch_norm_op
->
LinksFrom
({
conv_out_var
,
bn_scale_var
,
bn_bias_var
,
bn_mean_var
,
bn_variance_var
})
.
LinksTo
({
bn_out_var
,
bn_mean_out_var
,
bn_variance_out_var
,
bn_saved_mean_var
,
bn_saved_variance_var
});
}
return
bn_out_var
;
}
PDNode
*
patterns
::
ConvReLU
::
operator
()(
paddle
::
framework
::
ir
::
PDNode
*
conv_input
)
{
// Create Operators
...
...
paddle/fluid/framework/ir/graph_pattern_detector.h
浏览文件 @
5cedfb60
...
...
@@ -375,6 +375,44 @@ struct PatternBase {
size_t
id_
;
};
// Conv with batch norm
// op: conv + (elementwise_add +) batch_norm
// named nodes:
// conv_weight, conv_out, conv,
// bn_x, bn_scale, bn_bias, bn_mean, bn_variance,
// bn_batch_norm, bn_y, bn_mean_out, bn_variance_out,
// bn_saved_mean, bn_saved_variance
struct
ConvBN
:
public
PatternBase
{
ConvBN
(
PDPattern
*
pattern
,
const
std
::
string
&
name_scope
)
:
PatternBase
(
pattern
,
name_scope
,
"conv_bn"
)
{}
PDNode
*
operator
()(
PDNode
*
conv_input
,
bool
with_eltwise_add
);
// declare operator node's name
PATTERN_DECL_NODE
(
conv
);
PATTERN_DECL_NODE
(
batch_norm
);
PATTERN_DECL_NODE
(
eltwise
);
// ELEMENTWISE_ADD
// CONV inputs
PATTERN_DECL_NODE
(
conv_weight
);
// Filter
// CONV outputs
PATTERN_DECL_NODE
(
conv_out
);
// tmp
// ELTWISE inputs
PATTERN_DECL_NODE
(
eltwise_y_in
);
// ELTWISE outputs
PATTERN_DECL_NODE
(
eltwise_out
);
// tmp
// BN inputs
PATTERN_DECL_NODE
(
bn_scale
);
PATTERN_DECL_NODE
(
bn_bias
);
PATTERN_DECL_NODE
(
bn_mean
);
PATTERN_DECL_NODE
(
bn_variance
);
// BN outputs
PATTERN_DECL_NODE
(
bn_out
);
// Out
PATTERN_DECL_NODE
(
bn_mean_out
);
PATTERN_DECL_NODE
(
bn_variance_out
);
PATTERN_DECL_NODE
(
bn_saved_mean
);
PATTERN_DECL_NODE
(
bn_saved_variance
);
};
// CONV with ReLU
// op: conv + relu
// named nodes:
...
...
paddle/fluid/framework/naive_executor.cc
浏览文件 @
5cedfb60
...
...
@@ -146,5 +146,22 @@ void NaiveExecutor::CleanFeedFetchOps() {
ops_
.
swap
(
ops
);
}
void
NaiveExecutor
::
EnableMKLDNN
(
const
ProgramDesc
&
program
)
{
#ifdef PADDLE_WITH_MKLDNN
VLOG
(
3
)
<<
"use_mkldnn=True"
;
for
(
size_t
block_id
=
0
;
block_id
<
program
.
Size
();
++
block_id
)
{
auto
*
block
=
const_cast
<
ProgramDesc
&>
(
program
).
MutableBlock
(
block_id
);
for
(
auto
*
op
:
block
->
AllOps
())
{
if
(
op
->
HasAttr
(
"use_mkldnn"
))
{
op
->
SetAttr
(
"use_mkldnn"
,
true
);
}
}
}
#else
LOG
(
WARNING
)
<<
"'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"
;
#endif
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/naive_executor.h
浏览文件 @
5cedfb60
...
...
@@ -14,6 +14,8 @@
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
...
...
@@ -46,6 +48,8 @@ class NaiveExecutor {
void
CleanFeedFetchOps
();
void
EnableMKLDNN
(
const
ProgramDesc
&
program
);
protected:
void
CreateVariables
(
const
ProgramDesc
&
desc
,
Scope
*
scope
,
int
block_id
);
...
...
paddle/fluid/framework/op_desc.cc
浏览文件 @
5cedfb60
...
...
@@ -50,6 +50,27 @@ class CompileTimeInferShapeContext : public InferShapeContext {
const
std
::
vector
<
std
::
string
>
&
Outputs
(
const
std
::
string
&
name
)
const
override
;
void
ShareDim
(
const
std
::
string
&
in
,
const
std
::
string
&
out
,
size_t
i
=
0
,
size_t
j
=
0
)
override
{
PADDLE_ENFORCE_LT
(
i
,
Inputs
(
in
).
size
());
PADDLE_ENFORCE_LT
(
j
,
Outputs
(
out
).
size
());
const
std
::
string
&
input_n
=
Inputs
(
in
)[
i
];
const
std
::
string
&
output_n
=
Outputs
(
out
)[
j
];
PADDLE_ENFORCE
(
input_n
!=
framework
::
kEmptyVarName
,
"The %s[%d] is @EMPTY@"
,
in
,
i
);
PADDLE_ENFORCE
(
output_n
!=
framework
::
kEmptyVarName
,
"The %s[%d] is @EMPTY@"
,
out
,
j
);
auto
*
in_var
=
block_
.
FindVarRecursive
(
input_n
);
auto
*
out_var
=
block_
.
FindVarRecursive
(
output_n
);
PADDLE_ENFORCE
(
in_var
->
GetType
()
==
out_var
->
GetType
(),
"The type of %s and %s is not the same."
,
input_n
,
output_n
);
SetDim
(
output_n
,
GetDim
(
input_n
));
}
void
ShareLoD
(
const
std
::
string
&
in
,
const
std
::
string
&
out
,
size_t
i
=
0
,
size_t
j
=
0
)
const
override
{
PADDLE_ENFORCE_LT
(
i
,
Inputs
(
in
).
size
());
...
...
paddle/fluid/framework/op_proto_maker.cc
浏览文件 @
5cedfb60
...
...
@@ -132,9 +132,7 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
AddAttr
<
std
::
string
>
(
OpNamescopeAttrName
(),
"Operator name with namesope."
)
.
SetDefault
(
""
);
AddAttr
<
std
::
vector
<
std
::
string
>>
(
OpCreationCallstackAttrName
(),
"Callstack for Op Creatation."
)
.
SetDefault
({});
Validate
();
}
...
...
paddle/fluid/framework/op_proto_maker.h
浏览文件 @
5cedfb60
...
...
@@ -46,7 +46,6 @@ class OpProtoAndCheckerMaker {
static
const
char
*
OpRoleAttrName
()
{
return
"op_role"
;
}
static
const
char
*
OpRoleVarAttrName
()
{
return
"op_role_var"
;
}
static
const
char
*
OpNamescopeAttrName
()
{
return
"op_namescope"
;
}
static
const
char
*
OpCreationCallstackAttrName
()
{
return
"op_callstack"
;
}
void
operator
()(
proto
::
OpProto
*
proto
,
OpAttrChecker
*
attr_checker
);
...
...
paddle/fluid/framework/operator.cc
浏览文件 @
5cedfb60
...
...
@@ -14,17 +14,15 @@ limitations under the License. */
#define GLOG_NO_ABBREVIATED_SEVERITIES
#define GOOGLE_GLOG_DLL_DECL
#include "paddle/fluid/framework/operator.h"
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <algorithm>
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op
_proto_make
r.h"
#include "paddle/fluid/framework/op
erato
r.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
...
...
@@ -142,54 +140,19 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
void
OperatorBase
::
Run
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
{
try
{
if
(
VLOG_IS_ON
(
4
))
{
VLOG
(
4
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
}
if
(
platform
::
is_gpu_place
(
place
))
{
VLOG
(
4
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
if
(
platform
::
is_gpu_place
(
place
))
{
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW
(
"Cannot run operator on place %s"
,
place
);
PADDLE_THROW
(
"Cannot run operator on place %s"
,
place
);
#else
auto
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
place
).
device
;
platform
::
SetDeviceId
(
dev_id
);
auto
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
place
).
device
;
platform
::
SetDeviceId
(
dev_id
);
#endif
}
if
(
platform
::
IsProfileEnabled
())
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
RecordEvent
record_event
(
Type
(),
pool
.
Get
(
place
));
}
RunImpl
(
scope
,
place
);
if
(
VLOG_IS_ON
(
3
))
{
VLOG
(
3
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
}
}
catch
(
platform
::
EnforceNotMet
exception
)
{
if
(
Attrs
().
count
(
"sub_block"
)
!=
0
)
{
throw
exception
;
}
auto
&
callstack
=
Attr
<
std
::
vector
<
std
::
string
>>
(
OpProtoAndCheckerMaker
::
OpCreationCallstackAttrName
());
if
(
callstack
.
empty
())
{
throw
exception
;
}
std
::
ostringstream
sout
;
sout
<<
"Invoke operator "
<<
Type
()
<<
" error.
\n
"
;
sout
<<
"Python Callstacks:
\n
"
;
for
(
auto
&
line
:
callstack
)
{
sout
<<
line
;
}
sout
<<
"C++ Callstacks:
\n
"
;
sout
<<
exception
.
err_str_
;
exception
.
err_str_
=
sout
.
str
();
throw
exception
;
}
catch
(...)
{
std
::
rethrow_exception
(
std
::
current_exception
());
}
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
RecordEvent
record_event
(
Type
(),
pool
.
Get
(
place
));
RunImpl
(
scope
,
place
);
VLOG
(
3
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
}
bool
OperatorBase
::
HasInputs
(
const
std
::
string
&
name
)
const
{
...
...
@@ -217,7 +180,7 @@ const std::vector<std::string>& OperatorBase::Inputs(
}
bool
OperatorBase
::
HasOutputs
(
const
std
::
string
&
name
)
const
{
if
(
outputs_
.
end
()
!=
outputs_
.
find
(
name
))
{
if
(
outputs_
.
find
(
name
)
!=
outputs_
.
end
(
))
{
return
true
;
}
else
{
return
false
;
...
...
@@ -579,13 +542,45 @@ class RuntimeInferShapeContext : public InferShapeContext {
return
op_
.
Outputs
(
name
);
}
void
Share
LoD
(
const
std
::
string
&
in
,
const
std
::
string
&
out
,
size_t
i
=
0
,
size_t
j
=
0
)
const
override
{
void
Share
Dim
(
const
std
::
string
&
in
,
const
std
::
string
&
out
,
size_t
i
=
0
,
size_t
j
=
0
)
override
{
PADDLE_ENFORCE_LT
(
i
,
Inputs
(
in
).
size
());
PADDLE_ENFORCE_LT
(
j
,
Outputs
(
out
).
size
());
Variable
*
in_var
=
scope_
.
FindVar
(
Inputs
(
in
)[
i
]);
Variable
*
out_var
=
scope_
.
FindVar
(
Outputs
(
out
)[
j
]);
const
std
::
string
&
input_n
=
Inputs
(
in
)[
i
];
const
std
::
string
&
output_n
=
Outputs
(
out
)[
j
];
Variable
*
in_var
=
scope_
.
FindVar
(
input_n
);
Variable
*
out_var
=
scope_
.
FindVar
(
output_n
);
PADDLE_ENFORCE
(
in_var
->
Type
()
==
out_var
->
Type
(),
"The type of %s and %s is not the same."
,
output_n
,
GetDim
(
input_n
));
if
(
in_var
->
IsType
<
framework
::
SelectedRows
>
())
{
auto
&
in_sele_rows
=
in_var
->
Get
<
framework
::
SelectedRows
>
();
auto
out_sele_rows
=
out_var
->
GetMutable
<
framework
::
SelectedRows
>
();
out_sele_rows
->
mutable_value
()
->
Resize
(
in_sele_rows
.
value
().
dims
());
out_sele_rows
->
set_rows
(
in_sele_rows
.
rows
());
out_sele_rows
->
set_height
(
in_sele_rows
.
height
());
}
else
if
(
in_var
->
IsType
<
framework
::
LoDTensor
>
())
{
auto
&
in_lod_tensor
=
in_var
->
Get
<
framework
::
LoDTensor
>
();
auto
*
out_lod_tensor
=
out_var
->
GetMutable
<
framework
::
LoDTensor
>
();
out_lod_tensor
->
Resize
(
in_lod_tensor
.
dims
());
}
else
{
PADDLE_THROW
(
"Currently, the input type of ShareDim only can be LoDTensor "
"or SelectedRows."
);
}
}
void
ShareLoD
(
const
std
::
string
&
in
,
const
std
::
string
&
out
,
size_t
i
=
0
,
size_t
j
=
0
)
const
override
{
const
std
::
vector
<
std
::
string
>&
inputs
=
Inputs
(
in
);
const
std
::
vector
<
std
::
string
>&
outputs
=
Outputs
(
out
);
PADDLE_ENFORCE_LT
(
i
,
inputs
.
size
());
PADDLE_ENFORCE_LT
(
j
,
outputs
.
size
());
Variable
*
in_var
=
scope_
.
FindVar
(
inputs
.
at
(
i
));
if
(
!
in_var
->
IsType
<
LoDTensor
>
())
return
;
Variable
*
out_var
=
scope_
.
FindVar
(
outputs
.
at
(
j
));
PADDLE_ENFORCE
(
out_var
->
IsType
<
LoDTensor
>
(),
"The %d-th output of Output(%s) must be LoDTensor."
,
j
,
out
);
auto
in_tensor
=
in_var
->
Get
<
LoDTensor
>
();
...
...
@@ -613,20 +608,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
out_tensor
->
set_layout
(
in_tensor
.
layout
());
}
void
ShareLayout
(
const
std
::
string
&
in
,
const
std
::
string
&
out
,
size_t
i
=
0
,
size_t
j
=
0
)
const
{
PADDLE_ENFORCE_LT
(
i
,
Inputs
(
in
).
size
());
PADDLE_ENFORCE_LT
(
j
,
Outputs
(
out
).
size
());
Variable
*
in_var
=
scope_
.
FindVar
(
Inputs
(
in
)[
i
]);
Variable
*
out_var
=
scope_
.
FindVar
(
Outputs
(
out
)[
j
]);
if
(
!
in_var
->
IsType
<
LoDTensor
>
())
return
;
PADDLE_ENFORCE
(
out_var
->
IsType
<
LoDTensor
>
(),
"The %d-th output of Output(%s) must be LoDTensor."
,
j
,
out
);
auto
in_tensor
=
in_var
->
Get
<
LoDTensor
>
();
auto
*
out_tensor
=
out_var
->
GetMutable
<
LoDTensor
>
();
out_tensor
->
set_layout
(
in_tensor
.
layout
());
}
bool
IsRuntime
()
const
override
{
return
true
;
}
protected:
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
5cedfb60
...
...
@@ -250,6 +250,13 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
#ifdef PADDLE_WITH_CUDA
if
(
!
gcs_
.
empty
())
{
ResetReferenceCount
();
for
(
auto
&
pair
:
cur_ref_cnts_
)
{
auto
&
name_map
=
*
(
pair
.
second
);
for
(
auto
&
fetch_name
:
fetch_tensors
)
{
name_map
.
erase
(
fetch_name
);
}
name_map
.
erase
(
fetched_var_name
);
}
}
#endif
auto
fetch_data
=
member_
->
executor_
->
Run
(
fetch_tensors
);
...
...
paddle/fluid/framework/rw_lock.h
浏览文件 @
5cedfb60
...
...
@@ -46,6 +46,7 @@ struct RWLock {
private:
pthread_rwlock_t
lock_
;
};
// TODO(paddle-dev): Support RWLock for WIN32 for correctness.
#else
// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
// In windows, rw_lock seems like a hack. Use empty object and do nothing.
...
...
paddle/fluid/framework/shape_inference.cc
浏览文件 @
5cedfb60
...
...
@@ -46,16 +46,6 @@ std::vector<DDim> InferShapeContext::GetReaderDims(
return
this
->
GetRepeatedDims
(
arg_names
[
0
]);
}
void
InferShapeContext
::
ShareLoDs
(
const
std
::
string
&
in
,
const
std
::
string
&
out
)
const
{
PADDLE_ENFORCE_EQ
(
Inputs
(
in
).
size
(),
Outputs
(
out
).
size
(),
"The number of arguments in %s and %s is not equal."
,
in
,
out
);
for
(
size_t
i
=
0
;
i
<
in
.
size
();
++
i
)
{
ShareLoD
(
in
,
out
,
i
,
i
);
}
}
DDim
InferShapeContext
::
GetInputsElementDim
(
const
std
::
string
&
name
,
int
idx
)
const
{
const
std
::
vector
<
std
::
string
>
&
names
=
Inputs
(
name
);
...
...
paddle/fluid/framework/shape_inference.h
浏览文件 @
5cedfb60
...
...
@@ -56,7 +56,8 @@ class InferShapeContext {
virtual
const
std
::
vector
<
std
::
string
>
&
Outputs
(
const
std
::
string
&
name
)
const
=
0
;
void
ShareLoDs
(
const
std
::
string
&
in
,
const
std
::
string
&
out
)
const
;
virtual
void
ShareDim
(
const
std
::
string
&
in
,
const
std
::
string
&
out
,
size_t
i
=
0
,
size_t
j
=
0
)
=
0
;
virtual
void
ShareLoD
(
const
std
::
string
&
in
,
const
std
::
string
&
out
,
size_t
i
=
0
,
size_t
j
=
0
)
const
=
0
;
...
...
paddle/fluid/framework/tensor_util.cc
浏览文件 @
5cedfb60
...
...
@@ -165,10 +165,12 @@ inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
}
template
<
typename
Predicate
>
struct
AnyVisitor
:
public
boost
::
static_visitor
<
bool
>
{
class
AnyVisitor
:
public
boost
::
static_visitor
<
bool
>
{
private:
const
framework
::
Tensor
&
tensor_
;
Predicate
predicate_
;
public:
AnyVisitor
(
const
framework
::
Tensor
&
tensor
,
Predicate
predicate
)
:
tensor_
(
tensor
),
predicate_
(
std
::
move
(
predicate
))
{}
...
...
@@ -206,6 +208,27 @@ struct AnyVisitor : public boost::static_visitor<bool> {
}
};
template
<
typename
Predicate
>
class
AnyOutVisitor
:
public
boost
::
static_visitor
<>
{
private:
const
framework
::
Tensor
&
tensor_
;
mutable
framework
::
Tensor
*
out_
;
Predicate
predicate_
;
public:
AnyOutVisitor
(
const
framework
::
Tensor
&
tensor
,
Predicate
predicate
,
framework
::
Tensor
*
out
)
:
tensor_
(
tensor
),
out_
(
out
),
predicate_
(
std
::
move
(
predicate
))
{}
template
<
typename
Place
>
void
operator
()(
const
Place
&
place
)
const
{
auto
*
ctx
=
platform
::
DeviceContextPool
::
Instance
().
GetByPlace
(
place
);
out_
->
Resize
({
1
});
out_
->
mutable_data
<
bool
>
(
place
);
AnyImpl
(
predicate_
,
tensor_
,
*
ctx
,
out_
);
}
};
template
<
typename
Predicate
>
inline
bool
Any
(
const
framework
::
Tensor
&
tensor
,
Predicate
predicate
)
{
AnyVisitor
<
Predicate
>
visitor
(
tensor
,
predicate
);
...
...
@@ -213,6 +236,14 @@ inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
return
platform
::
VisitPlace
(
place
,
visitor
);
}
template
<
typename
Predicate
>
inline
void
Any
(
const
framework
::
Tensor
&
tensor
,
Predicate
predicate
,
framework
::
Tensor
*
out
)
{
AnyOutVisitor
<
Predicate
>
visitor
(
tensor
,
predicate
,
out
);
auto
place
=
tensor
.
place
();
platform
::
VisitPlace
(
place
,
visitor
);
}
struct
ContainsNANPredicate
{
template
<
typename
T
>
auto
operator
()(
const
T
&
eigen_vec
)
const
...
...
@@ -227,6 +258,12 @@ bool TensorContainsNAN(const framework::Tensor& tensor) {
return
Any
(
tensor
,
predicate
);
}
void
TensorContainsNAN
(
const
framework
::
Tensor
&
tensor
,
framework
::
Tensor
*
out
)
{
ContainsNANPredicate
predicate
;
Any
(
tensor
,
predicate
,
out
);
}
struct
ContainsInfPredicate
{
template
<
typename
T
>
auto
operator
()(
const
T
&
eigen_vec
)
const
...
...
@@ -241,6 +278,71 @@ bool TensorContainsInf(const framework::Tensor& tensor) {
return
Any
(
tensor
,
predicate
);
}
void
TensorContainsInf
(
const
framework
::
Tensor
&
tensor
,
framework
::
Tensor
*
out
)
{
ContainsInfPredicate
predicate
;
Any
(
tensor
,
predicate
,
out
);
}
// NOTE(dzhwinter):
// Isfinite need a AllVisitor to loop through all the elements.
// We choose two cuda call instead of one allvisitor. The AllVisitor
// should be implemented if the performance hurts.
bool
TensorIsfinite
(
const
framework
::
Tensor
&
tensor
)
{
ContainsInfPredicate
pred_inf
;
ContainsNANPredicate
pred_nan
;
return
!
Any
(
tensor
,
pred_inf
)
&&
!
Any
(
tensor
,
pred_nan
);
}
#ifdef PADDLE_WITH_CUDA
template
<
typename
T
>
static
inline
void
__global__
BothFalse
(
const
T
*
cmp
,
T
*
out
)
{
out
[
0
]
=
(
!
cmp
[
0
])
&&
(
!
out
[
0
]);
}
#endif
struct
BothFalseVisitor
:
public
boost
::
static_visitor
<>
{
const
framework
::
Tensor
&
in_
;
mutable
framework
::
Tensor
*
out_
;
BothFalseVisitor
(
const
framework
::
Tensor
&
in
,
framework
::
Tensor
*
out
)
:
in_
(
in
),
out_
(
out
)
{}
template
<
typename
Place
>
void
operator
()(
const
Place
&
place
)
const
{
VisitorImpl
(
place
);
}
void
VisitorImpl
(
const
platform
::
CUDAPlace
&
gpu
)
const
{
#ifdef PADDLE_WITH_CUDA
auto
*
ctx
=
platform
::
DeviceContextPool
::
Instance
().
GetByPlace
(
gpu
);
BothFalse
<
bool
><<<
1
,
1
,
0
,
ctx
->
stream
()
>>>
(
in_
.
data
<
bool
>
(),
out_
->
mutable_data
<
bool
>
(
gpu
));
#endif
}
void
VisitorImpl
(
const
platform
::
CPUPlace
&
cpu
)
const
{
bool
lhs
=
!
in_
.
data
<
bool
>
()[
0
];
bool
rhs
=
!
out_
->
mutable_data
<
bool
>
(
cpu
)[
0
];
out_
->
mutable_data
<
bool
>
(
cpu
)[
0
]
=
lhs
&&
rhs
;
}
void
VisitorImpl
(
const
platform
::
CUDAPinnedPlace
&
cpu
/* equals to cpu*/
)
const
{
bool
lhs
=
!
in_
.
data
<
bool
>
()[
0
];
bool
rhs
=
!
out_
->
mutable_data
<
bool
>
(
cpu
)[
0
];
out_
->
mutable_data
<
bool
>
(
cpu
)[
0
]
=
lhs
&&
rhs
;
}
};
void
TensorIsfinite
(
const
framework
::
Tensor
&
tensor
,
framework
::
Tensor
*
out
)
{
framework
::
Tensor
tmp
;
TensorContainsInf
(
tensor
,
&
tmp
);
TensorContainsNAN
(
tensor
,
out
);
BothFalseVisitor
visitor
(
tmp
,
out
);
auto
place
=
tensor
.
place
();
platform
::
VisitPlace
(
place
,
visitor
);
}
void
TensorToStream
(
std
::
ostream
&
os
,
const
Tensor
&
tensor
,
const
platform
::
DeviceContext
&
dev_ctx
)
{
{
// the 1st field, uint32_t version
...
...
paddle/fluid/framework/tensor_util.h
浏览文件 @
5cedfb60
...
...
@@ -57,8 +57,15 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
template
<
typename
T
>
void
TesnorToVector
(
const
Tensor
&
src
,
std
::
vector
<
T
>*
dst
);
// copy the result bool to cpu
bool
TensorContainsNAN
(
const
framework
::
Tensor
&
tensor
);
bool
TensorContainsInf
(
const
framework
::
Tensor
&
tensor
);
bool
TensorIsfinite
(
const
framework
::
Tensor
&
tensor
);
// store the result bool in gpu tensor, async operation. Faster than above ones.
void
TensorContainsNAN
(
const
framework
::
Tensor
&
tensor
,
framework
::
Tensor
*
out
);
void
TensorContainsInf
(
const
framework
::
Tensor
&
tensor
,
framework
::
Tensor
*
out
);
void
TensorIsfinite
(
const
framework
::
Tensor
&
tensor
,
framework
::
Tensor
*
out
);
void
TensorToStream
(
std
::
ostream
&
os
,
const
Tensor
&
tensor
,
const
platform
::
DeviceContext
&
dev_ctx
);
...
...
paddle/fluid/framework/tensor_util_test.cc
浏览文件 @
5cedfb60
...
...
@@ -36,7 +36,7 @@ TEST(TensorCopy, Tensor) {
TensorCopy
(
src_tensor
,
*
cpu_place
,
&
dst_tensor
);
const
int
*
dst_ptr
=
dst_tensor
.
data
<
int
>
();
ASSER
T_NE
(
src_ptr
,
dst_ptr
);
EXPEC
T_NE
(
src_ptr
,
dst_ptr
);
for
(
size_t
i
=
0
;
i
<
9
;
++
i
)
{
EXPECT_EQ
(
src_ptr
[
i
],
dst_ptr
[
i
]);
}
...
...
@@ -47,7 +47,7 @@ TEST(TensorCopy, Tensor) {
TensorCopy
(
slice_tensor
,
*
cpu_place
,
&
dst_tensor
);
const
int
*
slice_ptr
=
slice_tensor
.
data
<
int
>
();
dst_ptr
=
dst_tensor
.
data
<
int
>
();
ASSER
T_NE
(
dst_ptr
,
slice_ptr
);
EXPEC
T_NE
(
dst_ptr
,
slice_ptr
);
for
(
size_t
i
=
0
;
i
<
3
;
++
i
)
{
EXPECT_EQ
(
dst_ptr
[
i
],
slice_ptr
[
i
]);
}
...
...
@@ -77,7 +77,7 @@ TEST(TensorCopy, Tensor) {
// Sync before Compare Tensors
gpu_ctx
.
Wait
();
const
int
*
dst_ptr
=
dst_tensor
.
data
<
int
>
();
ASSER
T_NE
(
src_ptr
,
dst_ptr
);
EXPEC
T_NE
(
src_ptr
,
dst_ptr
);
for
(
size_t
i
=
0
;
i
<
9
;
++
i
)
{
EXPECT_EQ
(
src_ptr
[
i
],
dst_ptr
[
i
]);
}
...
...
@@ -94,7 +94,7 @@ TEST(TensorCopy, Tensor) {
gpu_ctx
.
Wait
();
const
int
*
slice_ptr
=
slice_tensor
.
data
<
int
>
();
dst_ptr
=
dst_tensor
.
data
<
int
>
();
ASSER
T_NE
(
dst_ptr
,
slice_ptr
);
EXPEC
T_NE
(
dst_ptr
,
slice_ptr
);
for
(
size_t
i
=
0
;
i
<
3
;
++
i
)
{
EXPECT_EQ
(
dst_ptr
[
i
],
slice_ptr
[
i
]);
}
...
...
@@ -117,7 +117,7 @@ TEST(TensorFromVector, Tensor) {
// Compare Tensors
const
int
*
cpu_ptr
=
cpu_tensor
.
data
<
int
>
();
const
int
*
src_ptr
=
src_vec
.
data
();
ASSER
T_NE
(
src_ptr
,
cpu_ptr
);
EXPEC
T_NE
(
src_ptr
,
cpu_ptr
);
for
(
size_t
i
=
0
;
i
<
9
;
++
i
)
{
EXPECT_EQ
(
src_ptr
[
i
],
cpu_ptr
[
i
]);
}
...
...
@@ -127,7 +127,7 @@ TEST(TensorFromVector, Tensor) {
paddle
::
framework
::
TensorFromVector
<
int
>
(
src_vec
,
&
cpu_tensor
);
cpu_ptr
=
cpu_tensor
.
data
<
int
>
();
src_ptr
=
src_vec
.
data
();
ASSER
T_NE
(
src_ptr
,
cpu_ptr
);
EXPEC
T_NE
(
src_ptr
,
cpu_ptr
);
for
(
size_t
i
=
0
;
i
<
5
;
++
i
)
{
EXPECT_EQ
(
src_ptr
[
i
],
cpu_ptr
[
i
]);
}
...
...
@@ -161,8 +161,8 @@ TEST(TensorFromVector, Tensor) {
const
int
*
src_ptr
=
src_vec
.
data
();
const
int
*
cpu_ptr
=
cpu_tensor
.
data
<
int
>
();
const
int
*
dst_ptr
=
dst_tensor
.
data
<
int
>
();
ASSER
T_NE
(
src_ptr
,
cpu_ptr
);
ASSER
T_NE
(
src_ptr
,
dst_ptr
);
EXPEC
T_NE
(
src_ptr
,
cpu_ptr
);
EXPEC
T_NE
(
src_ptr
,
dst_ptr
);
for
(
size_t
i
=
0
;
i
<
9
;
++
i
)
{
EXPECT_EQ
(
src_ptr
[
i
],
cpu_ptr
[
i
]);
EXPECT_EQ
(
src_ptr
[
i
],
dst_ptr
[
i
]);
...
...
@@ -181,8 +181,8 @@ TEST(TensorFromVector, Tensor) {
src_ptr
=
src_vec
.
data
();
cpu_ptr
=
cpu_tensor
.
data
<
int
>
();
dst_ptr
=
dst_tensor
.
data
<
int
>
();
ASSER
T_NE
(
src_ptr
,
cpu_ptr
);
ASSER
T_NE
(
src_ptr
,
dst_ptr
);
EXPEC
T_NE
(
src_ptr
,
cpu_ptr
);
EXPEC
T_NE
(
src_ptr
,
dst_ptr
);
for
(
size_t
i
=
0
;
i
<
5
;
++
i
)
{
EXPECT_EQ
(
src_ptr
[
i
],
cpu_ptr
[
i
]);
EXPECT_EQ
(
src_ptr
[
i
],
dst_ptr
[
i
]);
...
...
@@ -235,9 +235,9 @@ TEST(TensorContainsNAN, CPU) {
buf
[
0
]
=
0.0
;
buf
[
1
]
=
NAN
;
buf
[
2
]
=
0.0
;
ASSER
T_TRUE
(
paddle
::
framework
::
TensorContainsNAN
(
src
));
EXPEC
T_TRUE
(
paddle
::
framework
::
TensorContainsNAN
(
src
));
buf
[
1
]
=
0.0
;
ASSER
T_FALSE
(
paddle
::
framework
::
TensorContainsNAN
(
src
));
EXPEC
T_FALSE
(
paddle
::
framework
::
TensorContainsNAN
(
src
));
}
{
...
...
@@ -248,9 +248,9 @@ TEST(TensorContainsNAN, CPU) {
buf
[
0
]
=
0.0
;
buf
[
1
].
x
=
0x7fff
;
buf
[
2
]
=
0.0
;
ASSER
T_TRUE
(
paddle
::
framework
::
TensorContainsNAN
(
src
));
EXPEC
T_TRUE
(
paddle
::
framework
::
TensorContainsNAN
(
src
));
buf
[
1
]
=
0.0
;
ASSER
T_FALSE
(
paddle
::
framework
::
TensorContainsNAN
(
src
));
EXPEC
T_FALSE
(
paddle
::
framework
::
TensorContainsNAN
(
src
));
}
}
...
...
@@ -261,9 +261,9 @@ TEST(TensorContainsInf, CPU) {
buf
[
0
]
=
1.0
;
buf
[
1
]
=
INFINITY
;
buf
[
2
]
=
0.0
;
ASSER
T_TRUE
(
paddle
::
framework
::
TensorContainsInf
(
src
));
EXPEC
T_TRUE
(
paddle
::
framework
::
TensorContainsInf
(
src
));
buf
[
1
]
=
1.0
;
ASSER
T_FALSE
(
paddle
::
framework
::
TensorContainsInf
(
src
));
EXPEC
T_FALSE
(
paddle
::
framework
::
TensorContainsInf
(
src
));
}
{
...
...
@@ -274,9 +274,55 @@ TEST(TensorContainsInf, CPU) {
buf
[
0
]
=
1.0
;
buf
[
1
].
x
=
0x7c00
;
buf
[
2
]
=
0.0
;
ASSER
T_TRUE
(
paddle
::
framework
::
TensorContainsInf
(
src
));
EXPEC
T_TRUE
(
paddle
::
framework
::
TensorContainsInf
(
src
));
buf
[
1
]
=
1.0
;
ASSERT_FALSE
(
paddle
::
framework
::
TensorContainsInf
(
src
));
EXPECT_FALSE
(
paddle
::
framework
::
TensorContainsInf
(
src
));
}
}
TEST
(
TensorIsfinite
,
CPU
)
{
{
paddle
::
framework
::
Tensor
src
,
out
;
double
*
buf
=
src
.
mutable_data
<
double
>
({
3
},
paddle
::
platform
::
CPUPlace
());
buf
[
0
]
=
1.0
;
buf
[
1
]
=
INFINITY
;
buf
[
2
]
=
0.0
;
paddle
::
framework
::
TensorIsfinite
(
src
,
&
out
);
EXPECT_EQ
(
out
.
data
<
bool
>
()[
0
],
false
);
buf
[
1
]
=
1.0
;
paddle
::
framework
::
TensorIsfinite
(
src
,
&
out
);
EXPECT_EQ
(
out
.
data
<
bool
>
()[
0
],
true
);
}
{
paddle
::
framework
::
Tensor
src
,
out
;
double
*
buf
=
src
.
mutable_data
<
double
>
({
3
},
paddle
::
platform
::
CPUPlace
());
buf
[
0
]
=
1.0
;
buf
[
1
]
=
NAN
;
buf
[
2
]
=
0.0
;
paddle
::
framework
::
TensorIsfinite
(
src
,
&
out
);
EXPECT_EQ
(
out
.
data
<
bool
>
()[
0
],
false
);
buf
[
1
]
=
1.0
;
paddle
::
framework
::
TensorIsfinite
(
src
,
&
out
);
EXPECT_EQ
(
out
.
data
<
bool
>
()[
0
],
true
);
}
{
paddle
::
framework
::
Tensor
src
,
out
;
paddle
::
platform
::
float16
*
buf
=
src
.
mutable_data
<
paddle
::
platform
::
float16
>
(
{
3
},
paddle
::
platform
::
CPUPlace
());
buf
[
0
]
=
1.0
;
buf
[
1
].
x
=
0x7c00
;
buf
[
2
]
=
0.0
;
paddle
::
framework
::
TensorIsfinite
(
src
,
&
out
);
EXPECT_EQ
(
out
.
data
<
bool
>
()[
0
],
false
);
buf
[
1
]
=
1.0
;
paddle
::
framework
::
TensorIsfinite
(
src
,
&
out
);
EXPECT_EQ
(
out
.
data
<
bool
>
()[
0
],
true
);
buf
[
1
].
x
=
0x7fff
;
paddle
::
framework
::
TensorIsfinite
(
src
,
&
out
);
EXPECT_EQ
(
out
.
data
<
bool
>
()[
0
],
false
);
}
}
...
...
@@ -299,9 +345,9 @@ TEST(Tensor, FromAndToStream) {
TensorFromStream
(
iss
,
&
dst_tensor
,
cpu_ctx
);
int
*
dst_ptr
=
dst_tensor
.
mutable_data
<
int
>
(
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
ASSER
T_EQ
(
dst_ptr
[
i
],
array
[
i
]);
EXPEC
T_EQ
(
dst_ptr
[
i
],
array
[
i
]);
}
ASSER
T_EQ
(
dst_tensor
.
dims
(),
src_tensor
.
dims
());
EXPEC
T_EQ
(
dst_tensor
.
dims
(),
src_tensor
.
dims
());
delete
place
;
}
#ifdef PADDLE_WITH_CUDA
...
...
@@ -323,7 +369,7 @@ TEST(Tensor, FromAndToStream) {
int
*
dst_ptr
=
dst_tensor
.
mutable_data
<
int
>
(
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
6
;
++
i
)
{
ASSER
T_EQ
(
dst_ptr
[
i
],
array
[
i
]);
EXPEC
T_EQ
(
dst_ptr
[
i
],
array
[
i
]);
}
delete
gpu_place
;
}
...
...
paddle/fluid/framework/tensor_util_test.cu
浏览文件 @
5cedfb60
...
...
@@ -27,9 +27,9 @@ static __global__ void FillNAN(float* buf) {
}
static
__global__
void
FillInf
(
float
*
buf
)
{
buf
[
0
]
=
0.0
;
buf
[
1
]
=
INFINITY
;
buf
[
2
]
=
0.
5
;
buf
[
0
]
=
INFINITY
;
buf
[
1
]
=
0.1
;
buf
[
2
]
=
0.
2
;
}
static
__global__
void
FillNAN
(
platform
::
float16
*
buf
)
{
...
...
@@ -44,6 +44,18 @@ static __global__ void FillInf(platform::float16* buf) {
buf
[
2
]
=
0.5
;
}
static
__global__
void
FillFinite
(
float
*
buf
)
{
buf
[
0
]
=
0.0
;
buf
[
1
]
=
0.1
;
buf
[
2
]
=
0.2
;
}
static
__global__
void
FillFinite
(
platform
::
float16
*
buf
)
{
buf
[
0
]
=
0.0
;
buf
[
1
]
=
0.1
;
buf
[
2
]
=
0.2
;
}
TEST
(
TensorContainsNAN
,
GPU
)
{
paddle
::
platform
::
CUDAPlace
gpu
(
0
);
auto
&
pool
=
paddle
::
platform
::
DeviceContextPool
::
Instance
();
...
...
@@ -86,5 +98,163 @@ TEST(TensorContainsInf, GPU) {
}
}
TEST
(
TensorIsfinite
,
GPU
)
{
paddle
::
platform
::
CUDAPlace
gpu
(
0
);
using
paddle
::
platform
::
float16
;
auto
&
pool
=
paddle
::
platform
::
DeviceContextPool
::
Instance
();
auto
*
cuda_ctx
=
pool
.
GetByPlace
(
gpu
);
// contains inf
{
Tensor
tensor
;
float
*
buf
=
tensor
.
mutable_data
<
float
>
({
3
},
gpu
);
FillInf
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
EXPECT_TRUE
(
!
TensorIsfinite
(
tensor
));
}
{
Tensor
tensor
;
float16
*
buf
=
tensor
.
mutable_data
<
float16
>
({
3
},
gpu
);
FillInf
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
EXPECT_TRUE
(
!
TensorIsfinite
(
tensor
));
}
// contains nan
{
Tensor
tensor
;
float
*
buf
=
tensor
.
mutable_data
<
float
>
({
3
},
gpu
);
FillNAN
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
EXPECT_TRUE
(
!
TensorIsfinite
(
tensor
));
}
{
Tensor
tensor
;
float16
*
buf
=
tensor
.
mutable_data
<
float16
>
({
3
},
gpu
);
FillNAN
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
EXPECT_TRUE
(
!
TensorIsfinite
(
tensor
));
}
// all element are finite
{
Tensor
tensor
;
float
*
buf
=
tensor
.
mutable_data
<
float
>
({
3
},
gpu
);
FillFinite
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
EXPECT_TRUE
(
TensorIsfinite
(
tensor
));
}
{
Tensor
tensor
;
float16
*
buf
=
tensor
.
mutable_data
<
float16
>
({
3
},
gpu
);
FillFinite
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
EXPECT_TRUE
(
TensorIsfinite
(
tensor
));
}
}
TEST
(
TensorContainsInf
,
GPUWithoutWait
)
{
paddle
::
platform
::
CUDAPlace
gpu
(
0
);
auto
&
pool
=
paddle
::
platform
::
DeviceContextPool
::
Instance
();
auto
*
cuda_ctx
=
pool
.
GetByPlace
(
gpu
);
{
Tensor
tensor
,
out
;
float
*
buf
=
tensor
.
mutable_data
<
float
>
({
3
},
gpu
);
FillInf
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
TensorContainsInf
(
tensor
,
&
out
);
platform
::
CPUPlace
cpu
;
Tensor
tmp
;
TensorCopy
(
out
,
cpu
,
*
cuda_ctx
,
&
tmp
);
cuda_ctx
->
Wait
();
ASSERT_EQ
(
tmp
.
data
<
bool
>
()[
0
],
true
);
}
{
Tensor
tensor
,
out
;
paddle
::
platform
::
float16
*
buf
=
tensor
.
mutable_data
<
paddle
::
platform
::
float16
>
({
3
},
gpu
);
FillInf
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
TensorContainsInf
(
tensor
,
&
out
);
platform
::
CPUPlace
cpu
;
Tensor
tmp
;
TensorCopy
(
out
,
cpu
,
*
cuda_ctx
,
&
tmp
);
cuda_ctx
->
Wait
();
ASSERT_EQ
(
tmp
.
data
<
bool
>
()[
0
],
true
);
}
}
TEST
(
TensorContainsNAN
,
GPUWithoutWait
)
{
paddle
::
platform
::
CUDAPlace
gpu
(
0
);
auto
&
pool
=
paddle
::
platform
::
DeviceContextPool
::
Instance
();
auto
*
cuda_ctx
=
pool
.
GetByPlace
(
gpu
);
{
Tensor
tensor
,
out
;
float
*
buf
=
tensor
.
mutable_data
<
float
>
({
3
},
gpu
);
FillNAN
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
TensorContainsNAN
(
tensor
,
&
out
);
platform
::
CPUPlace
cpu
;
Tensor
tmp
;
TensorCopy
(
out
,
cpu
,
*
cuda_ctx
,
&
tmp
);
cuda_ctx
->
Wait
();
ASSERT_EQ
(
tmp
.
data
<
bool
>
()[
0
],
true
);
}
{
Tensor
tensor
,
out
;
paddle
::
platform
::
float16
*
buf
=
tensor
.
mutable_data
<
paddle
::
platform
::
float16
>
({
3
},
gpu
);
FillNAN
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
TensorContainsNAN
(
tensor
,
&
out
);
platform
::
CPUPlace
cpu
;
Tensor
tmp
;
TensorCopy
(
out
,
cpu
,
*
cuda_ctx
,
&
tmp
);
cuda_ctx
->
Wait
();
ASSERT_EQ
(
tmp
.
data
<
bool
>
()[
0
],
true
);
}
}
TEST
(
TensorIsfinite
,
GPUWithoutWait
)
{
paddle
::
platform
::
CUDAPlace
gpu
(
0
);
auto
&
pool
=
paddle
::
platform
::
DeviceContextPool
::
Instance
();
auto
*
cuda_ctx
=
pool
.
GetByPlace
(
gpu
);
{
Tensor
tensor
,
out
;
float
*
buf
=
tensor
.
mutable_data
<
float
>
({
3
},
gpu
);
FillInf
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
TensorIsfinite
(
tensor
,
&
out
);
platform
::
CPUPlace
cpu
;
Tensor
tmp
;
TensorCopy
(
out
,
cpu
,
*
cuda_ctx
,
&
tmp
);
cuda_ctx
->
Wait
();
EXPECT_EQ
(
tmp
.
data
<
bool
>
()[
0
],
false
);
}
{
Tensor
tensor
,
out
;
float
*
buf
=
tensor
.
mutable_data
<
float
>
({
3
},
gpu
);
FillNAN
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
TensorIsfinite
(
tensor
,
&
out
);
platform
::
CPUPlace
cpu
;
Tensor
tmp
;
TensorCopy
(
out
,
cpu
,
*
cuda_ctx
,
&
tmp
);
cuda_ctx
->
Wait
();
EXPECT_EQ
(
tmp
.
data
<
bool
>
()[
0
],
false
);
}
{
Tensor
tensor
,
out
;
float
*
buf
=
tensor
.
mutable_data
<
float
>
({
3
},
gpu
);
FillFinite
<<<
1
,
1
,
0
,
cuda_ctx
->
stream
()
>>>
(
buf
);
cuda_ctx
->
Wait
();
TensorIsfinite
(
tensor
,
&
out
);
platform
::
CPUPlace
cpu
;
Tensor
tmp
;
TensorCopy
(
out
,
cpu
,
*
cuda_ctx
,
&
tmp
);
cuda_ctx
->
Wait
();
EXPECT_EQ
(
tmp
.
data
<
bool
>
()[
0
],
true
);
}
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/inference/CMakeLists.txt
浏览文件 @
5cedfb60
...
...
@@ -19,8 +19,19 @@ cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
add_subdirectory
(
api
)
set
(
STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor
)
set
(
SHARED_INFERENCE_SRCS
io.cc
${
CMAKE_CURRENT_SOURCE_DIR
}
/api/api.cc
${
CMAKE_CURRENT_SOURCE_DIR
}
/api/api_impl.cc
${
CMAKE_CURRENT_SOURCE_DIR
}
/api/analysis_predictor.cc
${
CMAKE_CURRENT_SOURCE_DIR
}
/api/details/zero_copy_tensor.cc
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
set
(
STATIC_INFERENCE_APIS
${
STATIC_INFERENCE_APIS
}
paddle_inference_tensorrt_subgraph_engine
)
set
(
SHARED_INFERENCE_SRCS
${
SHARED_INFERENCE_SRCS
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/api/api_tensorrt_subgraph_engine.cc
)
endif
()
# Create static library
cc_library
(
paddle_fluid DEPS
${
fluid_modules
}
paddle_fluid_api paddle_inference_api analysis_predictor
)
cc_library
(
paddle_fluid DEPS
${
fluid_modules
}
${
STATIC_INFERENCE_APIS
}
zero_copy_tensor
)
if
(
NOT APPLE
)
# TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
set
(
LINK_FLAGS
"-Wl,--retain-symbols-file
${
CMAKE_CURRENT_SOURCE_DIR
}
/paddle_fluid.sym"
)
...
...
@@ -28,9 +39,7 @@ if(NOT APPLE)
endif
()
# Create shared library
cc_library
(
paddle_fluid_shared SHARED
SRCS io.cc
${
CMAKE_CURRENT_SOURCE_DIR
}
/api/api.cc
${
CMAKE_CURRENT_SOURCE_DIR
}
/api/api_impl.cc
${
CMAKE_CURRENT_SOURCE_DIR
}
/api/analysis_predictor.cc
cc_library
(
paddle_fluid_shared SHARED SRCS
${
SHARED_INFERENCE_SRCS
}
DEPS
${
fluid_modules
}
paddle_fluid_api
)
set_target_properties
(
paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid
)
...
...
paddle/fluid/inference/analysis/CMakeLists.txt
浏览文件 @
5cedfb60
...
...
@@ -20,8 +20,6 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test
(
test_dot SRCS dot_tester.cc DEPS analysis
)
cc_binary
(
inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests
)
function
(
inference_analysis_test TARGET
)
if
(
WITH_TESTING
)
set
(
options
""
)
...
...
paddle/fluid/inference/analysis/analyzer.cc
浏览文件 @
5cedfb60
...
...
@@ -70,7 +70,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
auto
trt_teller
=
[
&
](
const
Node
*
node
)
{
std
::
unordered_set
<
std
::
string
>
teller_set
(
{
"mul"
,
"conv2d"
,
"pool2d"
,
"relu"
,
"softmax"
,
"sigmoid"
,
"depthwise_conv2d"
,
"batch_norm"
,
"concat"
,
"tanh"
,
"depthwise_conv2d"
,
"batch_norm"
,
"concat"
,
"tanh"
,
"pad"
,
"elementwise_add"
,
"dropout"
});
if
(
!
node
->
IsFunction
())
return
false
;
...
...
paddle/fluid/inference/analysis/analyzer.h
浏览文件 @
5cedfb60
...
...
@@ -64,15 +64,17 @@ class Analyzer : public OrderedRegistry<PassManager> {
// larger fusion.
const
std
::
vector
<
std
::
string
>
all_ir_passes_
{{
// Manual update the passes here.
"infer_clean_graph_pass"
,
//
"attention_lstm_fuse_pass"
,
//
"embedding_fc_lstm_fuse_pass"
,
//
"fc_lstm_fuse_pass"
,
//
"mul_lstm_fuse_pass"
,
//
"fc_gru_fuse_pass"
,
//
"mul_gru_fuse_pass"
,
//
"seq_concat_fc_fuse_pass"
,
//
"fc_fuse_pass"
,
//
"infer_clean_graph_pass"
,
//
"attention_lstm_fuse_pass"
,
//
"embedding_fc_lstm_fuse_pass"
,
//
"fc_lstm_fuse_pass"
,
//
"mul_lstm_fuse_pass"
,
//
"fc_gru_fuse_pass"
,
//
"mul_gru_fuse_pass"
,
//
"seq_concat_fc_fuse_pass"
,
//
"fc_fuse_pass"
,
//
"conv_bn_fuse_pass"
,
//
"conv_eltwiseadd_bn_fuse_pass"
,
//
#ifdef PADDLE_WITH_MKLDNN
"conv_relu_mkldnn_fuse_pass"
,
//
#endif
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
5cedfb60
...
...
@@ -31,7 +31,6 @@ function(inference_api_test TARGET_NAME)
set
(
multiValueArgs ARGS
)
cmake_parse_arguments
(
inference_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests
)
cc_test
(
${
TARGET_NAME
}
SRCS
${
inference_test_SRC
}
DEPS
"
${
inference_deps
}
"
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
5cedfb60
...
...
@@ -24,7 +24,6 @@
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/profiler.h"
...
...
@@ -72,6 +71,11 @@ bool AnalysisPredictor::Init(
}
else
{
inference_program_
=
program
;
}
if
(
config_
.
_use_mkldnn
)
{
executor_
->
EnableMKLDNN
(
*
inference_program_
);
}
executor_
->
Prepare
(
scope_
.
get
(),
*
inference_program_
,
0
,
config_
.
use_feed_fetch_ops
);
...
...
@@ -93,6 +97,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
LOG
(
ERROR
)
<<
"fail to set feed"
;
return
false
;
}
// Run the inference program
// if share variables, we need not create variables
executor_
->
Run
();
...
...
paddle/fluid/inference/api/api_impl.cc
浏览文件 @
5cedfb60
...
...
@@ -23,7 +23,6 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool
(
profile
,
false
,
"Turn on profiler for fluid"
);
...
...
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
浏览文件 @
5cedfb60
...
...
@@ -185,3 +185,4 @@ USE_TRT_CONVERTER(softmax);
USE_TRT_CONVERTER
(
batch_norm
);
USE_TRT_CONVERTER
(
concat
);
USE_TRT_CONVERTER
(
dropout
);
USE_TRT_CONVERTER
(
pad
);
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
浏览文件 @
5cedfb60
...
...
@@ -3,6 +3,7 @@ project(cpp_inference_demo CXX C)
option
(
WITH_MKL
"Compile demo with MKL/OpenBlas support, default use MKL."
ON
)
option
(
WITH_GPU
"Compile demo with GPU/CPU, default use CPU."
OFF
)
option
(
WITH_STATIC_LIB
"Compile demo with static/shared library, default use static."
ON
)
option
(
USE_TENSORRT
"Compile demo with TensorRT."
OFF
)
macro
(
safe_set_static_flag
)
foreach
(
flag_var
...
...
@@ -60,6 +61,13 @@ endif(NOT WIN32)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/boost"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/eigen3"
)
if
(
NOT WIN32
)
if
(
USE_TENSORRT AND WITH_GPU
)
include_directories
(
"
${
TENSORRT_INCLUDE_DIR
}
"
)
link_directories
(
"
${
TENSORRT_LIB_DIR
}
"
)
endif
()
endif
(
NOT WIN32
)
if
(
NOT WIN32
)
link_directories
(
"
${
PADDLE_LIB
}
/third_party/install/snappy/lib"
)
link_directories
(
"
${
PADDLE_LIB
}
/third_party/install/snappystream/lib"
)
...
...
@@ -112,6 +120,10 @@ endif(NOT WIN32)
if
(
WITH_GPU
)
if
(
NOT WIN32
)
if
(
USE_TENSORRT
)
set
(
DEPS
${
DEPS
}
${
TENSORRT_LIB_DIR
}
/libnvinfer
${
CMAKE_STATIC_LIBRARY_SUFFIX
}
)
set
(
DEPS
${
DEPS
}
${
TENSORRT_LIB_DIR
}
/libnvinfer_plugin
${
CMAKE_STATIC_LIBRARY_SUFFIX
}
)
endif
()
set
(
DEPS
${
DEPS
}
${
CUDA_LIB
}
/libcudart
${
CMAKE_SHARED_LIBRARY_SUFFIX
}
)
else
()
set
(
DEPS
${
DEPS
}
${
CUDA_LIB
}
/cudart
${
CMAKE_STATIC_LIBRARY_SUFFIX
}
)
...
...
paddle/fluid/inference/api/demo_ci/run.sh
浏览文件 @
5cedfb60
...
...
@@ -3,6 +3,9 @@ PADDLE_ROOT=$1
TURN_ON_MKL
=
$2
# use MKL or Openblas
TEST_GPU_CPU
=
$3
# test both GPU/CPU mode or only CPU mode
DATA_DIR
=
$4
# dataset
TENSORRT_INCLUDE_DIR
=
$5
# TensorRT header file dir, defalut to /usr/local/TensorRT/include
TENSORRT_LIB_DIR
=
$6
# TensorRT lib file dir, default to /usr/local/TensorRT/lib
cd
`
dirname
$0
`
current_dir
=
`
pwd
`
if
[
$2
==
ON
]
;
then
...
...
@@ -16,6 +19,11 @@ else
use_gpu_list
=
'false'
fi
USE_TENSORRT
=
OFF
if
[
[
-d
"
$TENSORRT_INCLUDE_DIR
"
]
-a
[
-d
"
$TENSORRT_LIB_DIR
"
]
]
;
then
USE_TENSORRT
=
ON
fi
PREFIX
=
inference-vis-demos%2F
URL_ROOT
=
http://paddlemodels.cdn.bcebos.com/
${
PREFIX
}
...
...
@@ -86,5 +94,23 @@ for WITH_STATIC_LIB in ON OFF; do
fi
done
done
# --------tensorrt mobilenet------
if
[
$USE_TENSORRT
==
ON
-a
$TEST_GPU_CPU
==
ON
]
;
then
rm
-rf
*
cmake ..
-DPADDLE_LIB
=
${
PADDLE_ROOT
}
/build/fluid_install_dir/
\
-DWITH_MKL
=
$TURN_ON_MKL
\
-DDEMO_NAME
=
trt_mobilenet_demo
\
-DWITH_GPU
=
$TEST_GPU_CPU
\
-DWITH_STATIC_LIB
=
$WITH_STATIC_LIB
\
-DUSE_TENSORRT
=
$USE_TENSORRT
\
-DTENSORRT_INCLUDE_DIR
=
$TENSORRT_INCLUDE_DIR
\
-DTENSORRT_LIB_DIR
=
$TENSORRT_LIB_DIR
make
-j
./trt_mobilenet_demo
\
--modeldir
=
$DATA_DIR
/mobilenet/model
\
--data
=
$DATA_DIR
/mobilenet/data.txt
\
--refer
=
$DATA_DIR
/mobilenet/result.txt
fi
done
set
+x
paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
浏览文件 @
5cedfb60
...
...
@@ -22,8 +22,8 @@ limitations under the License. */
#include <algorithm>
#include <memory>
#include <thread> //NOLINT
#include "paddle/fluid/inference/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
DEFINE_string
(
dirname
,
""
,
"Directory of the inference model."
);
DEFINE_bool
(
use_gpu
,
false
,
"Whether use gpu."
);
...
...
@@ -62,17 +62,17 @@ void Main(bool use_gpu) {
CHECK
(
predictor
->
Run
(
slots
,
&
outputs
));
//# 4. Get output.
PADDLE_ENFORCE
(
outputs
.
size
(),
1UL
);
CHECK_EQ
(
outputs
.
size
(),
1UL
);
// Check the output buffer size and result of each tid.
PADDLE_ENFORCE
(
outputs
.
front
().
data
.
length
(),
33168UL
);
CHECK_EQ
(
outputs
.
front
().
data
.
length
(),
33168UL
);
float
result
[
5
]
=
{
0.00129761
,
0.00151112
,
0.000423564
,
0.00108815
,
0.000932706
};
const
size_t
num_elements
=
outputs
.
front
().
data
.
length
()
/
sizeof
(
float
);
// The outputs' buffers are in CPU memory.
for
(
size_t
i
=
0
;
i
<
std
::
min
(
static_cast
<
size_t
>
(
5
),
num_elements
);
i
++
)
{
PADDLE_ENFORCE
(
static_cast
<
float
*>
(
outputs
.
front
().
data
.
data
())
[
i
],
result
[
i
]
);
CHECK_NEAR
(
static_cast
<
float
*>
(
outputs
.
front
().
data
.
data
())[
i
],
result
[
i
],
0.001
);
}
}
}
...
...
@@ -108,9 +108,9 @@ void MainThreads(int num_threads, bool use_gpu) {
CHECK
(
predictor
->
Run
(
inputs
,
&
outputs
));
// 4. Get output.
PADDLE_ENFORCE
(
outputs
.
size
(),
1UL
);
CHECK_EQ
(
outputs
.
size
(),
1UL
);
// Check the output buffer size and result of each tid.
PADDLE_ENFORCE
(
outputs
.
front
().
data
.
length
(),
33168UL
);
CHECK_EQ
(
outputs
.
front
().
data
.
length
(),
33168UL
);
float
result
[
5
]
=
{
0.00129761
,
0.00151112
,
0.000423564
,
0.00108815
,
0.000932706
};
const
size_t
num_elements
=
...
...
@@ -118,8 +118,8 @@ void MainThreads(int num_threads, bool use_gpu) {
// The outputs' buffers are in CPU memory.
for
(
size_t
i
=
0
;
i
<
std
::
min
(
static_cast
<
size_t
>
(
5
),
num_elements
);
i
++
)
{
PADDLE_ENFORCE
(
static_cast
<
float
*>
(
outputs
.
front
().
data
.
data
())[
i
],
result
[
i
]
);
CHECK_NEAR
(
static_cast
<
float
*>
(
outputs
.
front
().
data
.
data
())[
i
],
result
[
i
],
0.001
);
}
}
});
...
...
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
0 → 100644
浏览文件 @
5cedfb60
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains demo of mobilenet for tensorrt.
*/
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files.
#include "paddle/fluid/inference/demo_ci/utils.h"
DECLARE_double
(
fraction_of_gpu_memory_to_use
);
DEFINE_string
(
modeldir
,
""
,
"Directory of the inference model."
);
DEFINE_string
(
refer
,
""
,
"path to reference result for comparison."
);
DEFINE_string
(
data
,
""
,
"path of data; each line is a record, format is "
"'<space splitted floats as data>
\t
<space splitted ints as shape'"
);
namespace
paddle
{
namespace
demo
{
/*
* Use the tensorrt fluid engine to inference the demo.
*/
void
Main
()
{
std
::
unique_ptr
<
PaddlePredictor
>
predictor
;
paddle
::
contrib
::
MixedRTConfig
config
;
config
.
param_file
=
FLAGS_modeldir
+
"/__params__"
;
config
.
prog_file
=
FLAGS_modeldir
+
"/__model__"
;
config
.
use_gpu
=
true
;
config
.
device
=
0
;
config
.
max_batch_size
=
1
;
config
.
fraction_of_gpu_memory
=
0.1
;
// set by yourself
predictor
=
CreatePaddlePredictor
<
paddle
::
contrib
::
MixedRTConfig
>
(
config
);
VLOG
(
3
)
<<
"begin to process data"
;
// Just a single batch of data.
std
::
string
line
;
std
::
ifstream
file
(
FLAGS_data
);
std
::
getline
(
file
,
line
);
auto
record
=
ProcessALine
(
line
);
file
.
close
();
// Inference.
PaddleTensor
input
;
input
.
shape
=
record
.
shape
;
input
.
data
=
PaddleBuf
(
record
.
data
.
data
(),
record
.
data
.
size
()
*
sizeof
(
float
));
input
.
dtype
=
PaddleDType
::
FLOAT32
;
VLOG
(
3
)
<<
"run executor"
;
std
::
vector
<
PaddleTensor
>
output
;
predictor
->
Run
({
input
},
&
output
,
1
);
VLOG
(
3
)
<<
"output.size "
<<
output
.
size
();
auto
&
tensor
=
output
.
front
();
VLOG
(
3
)
<<
"output: "
<<
SummaryTensor
(
tensor
);
// compare with reference result
CheckOutput
(
FLAGS_refer
,
tensor
);
}
}
// namespace demo
}
// namespace paddle
int
main
(
int
argc
,
char
**
argv
)
{
google
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
true
);
paddle
::
demo
::
Main
();
return
0
;
}
paddle/fluid/inference/api/demo_ci/utils.h
浏览文件 @
5cedfb60
...
...
@@ -14,6 +14,8 @@
#pragma once
#include <algorithm>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "paddle/fluid/inference/paddle_inference_api.h"
...
...
@@ -21,6 +23,11 @@
namespace
paddle
{
namespace
demo
{
struct
Record
{
std
::
vector
<
float
>
data
;
std
::
vector
<
int32_t
>
shape
;
};
static
void
split
(
const
std
::
string
&
str
,
char
sep
,
std
::
vector
<
std
::
string
>*
pieces
)
{
pieces
->
clear
();
...
...
@@ -39,6 +46,58 @@ static void split(const std::string& str, char sep,
}
}
Record
ProcessALine
(
const
std
::
string
&
line
)
{
VLOG
(
3
)
<<
"process a line"
;
std
::
vector
<
std
::
string
>
columns
;
split
(
line
,
'\t'
,
&
columns
);
CHECK_EQ
(
columns
.
size
(),
2UL
)
<<
"data format error, should be <data>
\t
<shape>"
;
Record
record
;
std
::
vector
<
std
::
string
>
data_strs
;
split
(
columns
[
0
],
' '
,
&
data_strs
);
for
(
auto
&
d
:
data_strs
)
{
record
.
data
.
push_back
(
std
::
stof
(
d
));
}
std
::
vector
<
std
::
string
>
shape_strs
;
split
(
columns
[
1
],
' '
,
&
shape_strs
);
for
(
auto
&
s
:
shape_strs
)
{
record
.
shape
.
push_back
(
std
::
stoi
(
s
));
}
VLOG
(
3
)
<<
"data size "
<<
record
.
data
.
size
();
VLOG
(
3
)
<<
"data shape size "
<<
record
.
shape
.
size
();
return
record
;
}
void
CheckOutput
(
const
std
::
string
&
referfile
,
const
PaddleTensor
&
output
)
{
std
::
string
line
;
std
::
ifstream
file
(
referfile
);
std
::
getline
(
file
,
line
);
auto
refer
=
ProcessALine
(
line
);
file
.
close
();
size_t
numel
=
output
.
data
.
length
()
/
PaddleDtypeSize
(
output
.
dtype
);
VLOG
(
3
)
<<
"predictor output numel "
<<
numel
;
VLOG
(
3
)
<<
"reference output numel "
<<
refer
.
data
.
size
();
CHECK_EQ
(
numel
,
refer
.
data
.
size
());
switch
(
output
.
dtype
)
{
case
PaddleDType
::
INT64
:
{
for
(
size_t
i
=
0
;
i
<
numel
;
++
i
)
{
CHECK_EQ
(
static_cast
<
int64_t
*>
(
output
.
data
.
data
())[
i
],
refer
.
data
[
i
]);
}
break
;
}
case
PaddleDType
::
FLOAT32
:
for
(
size_t
i
=
0
;
i
<
numel
;
++
i
)
{
CHECK_LT
(
fabs
(
static_cast
<
float
*>
(
output
.
data
.
data
())[
i
]
-
refer
.
data
[
i
]),
1e-5
);
}
break
;
}
}
/*
* Get a summary of a PaddleTensor content.
*/
...
...
paddle/fluid/inference/api/demo_ci/vis_demo.cc
浏览文件 @
5cedfb60
...
...
@@ -17,11 +17,8 @@ limitations under the License. */
*/
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <fstream>
#include <iostream>
#include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files.
#include "paddle/fluid/inference/demo_ci/utils.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double
(
fraction_of_gpu_memory_to_use
);
...
...
@@ -37,70 +34,11 @@ DEFINE_bool(use_gpu, false, "Whether use gpu.");
namespace
paddle
{
namespace
demo
{
struct
Record
{
std
::
vector
<
float
>
data
;
std
::
vector
<
int32_t
>
shape
;
};
void
split
(
const
std
::
string
&
str
,
char
sep
,
std
::
vector
<
std
::
string
>*
pieces
);
Record
ProcessALine
(
const
std
::
string
&
line
)
{
VLOG
(
3
)
<<
"process a line"
;
std
::
vector
<
std
::
string
>
columns
;
split
(
line
,
'\t'
,
&
columns
);
CHECK_EQ
(
columns
.
size
(),
2UL
)
<<
"data format error, should be <data>
\t
<shape>"
;
Record
record
;
std
::
vector
<
std
::
string
>
data_strs
;
split
(
columns
[
0
],
' '
,
&
data_strs
);
for
(
auto
&
d
:
data_strs
)
{
record
.
data
.
push_back
(
std
::
stof
(
d
));
}
std
::
vector
<
std
::
string
>
shape_strs
;
split
(
columns
[
1
],
' '
,
&
shape_strs
);
for
(
auto
&
s
:
shape_strs
)
{
record
.
shape
.
push_back
(
std
::
stoi
(
s
));
}
VLOG
(
3
)
<<
"data size "
<<
record
.
data
.
size
();
VLOG
(
3
)
<<
"data shape size "
<<
record
.
shape
.
size
();
return
record
;
}
void
CheckOutput
(
const
std
::
string
&
referfile
,
const
PaddleTensor
&
output
)
{
std
::
string
line
;
std
::
ifstream
file
(
referfile
);
std
::
getline
(
file
,
line
);
auto
refer
=
ProcessALine
(
line
);
file
.
close
();
size_t
numel
=
output
.
data
.
length
()
/
PaddleDtypeSize
(
output
.
dtype
);
VLOG
(
3
)
<<
"predictor output numel "
<<
numel
;
VLOG
(
3
)
<<
"reference output numel "
<<
refer
.
data
.
size
();
PADDLE_ENFORCE_EQ
(
numel
,
refer
.
data
.
size
());
switch
(
output
.
dtype
)
{
case
PaddleDType
::
INT64
:
{
for
(
size_t
i
=
0
;
i
<
numel
;
++
i
)
{
PADDLE_ENFORCE_EQ
(
static_cast
<
int64_t
*>
(
output
.
data
.
data
())[
i
],
refer
.
data
[
i
]);
}
break
;
}
case
PaddleDType
::
FLOAT32
:
for
(
size_t
i
=
0
;
i
<
numel
;
++
i
)
{
PADDLE_ENFORCE_LT
(
fabs
(
static_cast
<
float
*>
(
output
.
data
.
data
())[
i
]
-
refer
.
data
[
i
]),
1e-5
);
}
break
;
}
}
/*
* Use the native fluid engine to inference the demo.
*/
void
Main
(
bool
use_gpu
)
{
std
::
unique_ptr
<
PaddlePredictor
>
predictor
;
NativeConfig
config
;
config
.
param_file
=
FLAGS_modeldir
+
"/__params__"
;
config
.
prog_file
=
FLAGS_modeldir
+
"/__model__"
;
...
...
@@ -111,7 +49,7 @@ void Main(bool use_gpu) {
}
VLOG
(
3
)
<<
"init predictor"
;
auto
predictor
=
predictor
=
CreatePaddlePredictor
<
NativeConfig
,
PaddleEngineKind
::
kNative
>
(
config
);
VLOG
(
3
)
<<
"begin to process data"
;
...
...
@@ -131,7 +69,7 @@ void Main(bool use_gpu) {
VLOG
(
3
)
<<
"run executor"
;
std
::
vector
<
PaddleTensor
>
output
;
predictor
->
Run
({
input
},
&
output
);
predictor
->
Run
({
input
},
&
output
,
1
);
VLOG
(
3
)
<<
"output.size "
<<
output
.
size
();
auto
&
tensor
=
output
.
front
();
...
...
@@ -146,9 +84,10 @@ void Main(bool use_gpu) {
int
main
(
int
argc
,
char
**
argv
)
{
google
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
true
);
paddle
::
demo
::
Main
(
false
/* use_gpu*/
);
if
(
FLAGS_use_gpu
)
{
paddle
::
demo
::
Main
(
true
/*use_gpu*/
);
}
else
{
paddle
::
demo
::
Main
(
false
/*use_gpu*/
);
}
return
0
;
}
paddle/fluid/inference/api/helper.h
浏览文件 @
5cedfb60
...
...
@@ -16,19 +16,34 @@
#include <glog/logging.h>
#include <sys/time.h>
#include <
algorithm>
#include <
chrono> // NOLINT
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h"
#include "paddle/fluid/string/printf.h"
#include "paddle_inference_api.h"
namespace
paddle
{
namespace
inference
{
// Timer for timer
class
Timer
{
public:
std
::
chrono
::
high_resolution_clock
::
time_point
start
;
std
::
chrono
::
high_resolution_clock
::
time_point
startu
;
void
tic
()
{
start
=
std
::
chrono
::
high_resolution_clock
::
now
();
}
double
toc
()
{
startu
=
std
::
chrono
::
high_resolution_clock
::
now
();
std
::
chrono
::
duration
<
double
>
time_span
=
std
::
chrono
::
duration_cast
<
std
::
chrono
::
duration
<
double
>>
(
startu
-
start
);
double
used_time_ms
=
static_cast
<
double
>
(
time_span
.
count
())
*
1000.0
;
return
used_time_ms
;
}
};
static
void
split
(
const
std
::
string
&
str
,
char
sep
,
std
::
vector
<
std
::
string
>
*
pieces
)
{
pieces
->
clear
();
...
...
@@ -154,127 +169,5 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
}
}
template
<
typename
T
>
std
::
string
LoDTensorSummary
(
const
framework
::
LoDTensor
&
tensor
)
{
std
::
stringstream
ss
;
ss
<<
"
\n
---- tensor ---"
<<
'\n'
;
ss
<<
"lod: ["
;
for
(
const
auto
&
level
:
tensor
.
lod
())
{
ss
<<
"[ "
;
for
(
auto
i
:
level
)
{
ss
<<
i
<<
", "
;
}
ss
<<
"]"
;
}
ss
<<
"]
\n
"
;
ss
<<
"shape: ["
;
int
size
=
1
;
for
(
int
i
=
0
;
i
<
tensor
.
dims
().
size
();
i
++
)
{
int
dim
=
tensor
.
dims
()[
i
];
ss
<<
dim
<<
", "
;
size
*=
dim
;
}
ss
<<
"]
\n
"
;
ss
<<
"data: "
;
for
(
int
i
=
0
;
i
<
std
::
min
(
20
,
size
);
i
++
)
{
ss
<<
tensor
.
data
<
T
>
()[
i
]
<<
" "
;
}
ss
<<
"
\n
"
;
return
ss
.
str
();
}
static
bool
CompareLoD
(
const
framework
::
LoD
&
a
,
const
framework
::
LoD
&
b
)
{
if
(
a
.
size
()
!=
b
.
size
())
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"lod size not match %d != %d"
,
a
.
size
(),
b
.
size
());
return
false
;
}
for
(
size_t
i
=
0
;
i
<
a
.
size
();
i
++
)
{
auto
&
al
=
a
[
i
];
auto
&
bl
=
b
[
i
];
if
(
al
.
size
()
!=
bl
.
size
())
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"level size %d != %d"
,
al
.
size
(),
bl
.
size
());
return
false
;
}
}
return
true
;
}
static
bool
CompareShape
(
const
std
::
vector
<
int64_t
>
&
a
,
const
std
::
vector
<
int64_t
>
&
b
)
{
if
(
a
.
size
()
!=
b
.
size
())
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"shape size not match %d != %d"
,
a
.
size
(),
b
.
size
());
return
false
;
}
for
(
size_t
i
=
0
;
i
<
a
.
size
();
i
++
)
{
if
(
a
[
i
]
!=
b
[
i
])
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"shape %d-th element not match %d != %d"
,
i
,
a
[
i
],
b
[
i
]);
return
false
;
}
}
return
true
;
}
static
bool
CompareTensorData
(
const
framework
::
LoDTensor
&
a
,
const
framework
::
LoDTensor
&
b
)
{
auto
a_shape
=
framework
::
vectorize
(
a
.
dims
());
auto
b_shape
=
framework
::
vectorize
(
b
.
dims
());
size_t
a_size
=
std
::
accumulate
(
a_shape
.
begin
(),
a_shape
.
end
(),
1
,
[](
int
a
,
int
b
)
{
return
a
*
b
;
});
size_t
b_size
=
std
::
accumulate
(
b_shape
.
begin
(),
b_shape
.
end
(),
1
,
[](
int
a
,
int
b
)
{
return
a
*
b
;
});
if
(
a_size
!=
b_size
)
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"tensor data size not match, %d != %d"
,
a_size
,
b_size
);
}
for
(
size_t
i
=
0
;
i
<
a_size
;
i
++
)
{
if
(
a
.
type
()
==
typeid
(
float
))
{
const
auto
*
a_data
=
a
.
data
<
float
>
();
const
auto
*
b_data
=
b
.
data
<
float
>
();
if
(
std
::
abs
(
a_data
[
i
]
-
b_data
[
i
])
>
1e-3
)
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"tensor data %d-th element not match, %f != %f"
,
i
,
a_data
[
i
],
b_data
[
i
]);
return
false
;
}
}
else
if
(
a
.
type
()
==
typeid
(
int64_t
))
{
const
auto
*
a_data
=
a
.
data
<
int64_t
>
();
const
auto
*
b_data
=
b
.
data
<
int64_t
>
();
if
(
std
::
abs
(
a_data
[
i
]
-
b_data
[
i
])
>
1e-3
)
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"tensor data %d-th element not match, %f != %f"
,
i
,
a_data
[
i
],
b_data
[
i
]);
return
false
;
}
}
}
return
true
;
}
static
bool
CompareTensor
(
const
framework
::
LoDTensor
&
a
,
const
framework
::
LoDTensor
&
b
)
{
if
(
!
CompareLoD
(
a
.
lod
(),
b
.
lod
()))
{
return
false
;
}
if
(
!
CompareShape
(
framework
::
vectorize
(
a
.
dims
()),
framework
::
vectorize
(
b
.
dims
())))
{
return
false
;
}
if
(
!
CompareTensorData
(
a
,
b
))
{
return
false
;
}
return
true
;
}
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/api/paddle_inference_api.h
浏览文件 @
5cedfb60
...
...
@@ -268,9 +268,8 @@ struct AnalysisConfig : public NativeConfig {
// NOT stable yet.
bool
use_feed_fetch_ops
{
true
};
// NOTE this is just for internal development, please not use it. NOT
// stable
// yet.
// NOTE this is just for internal development, please not use it.
// NOT stable yet.
bool
_use_mkldnn
{
false
};
};
...
...
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
浏览文件 @
5cedfb60
# Add TRT tests
nv_library
(
tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
pad_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry
)
nv_test
(
test_op_converter SRCS test_op_converter.cc DEPS
...
...
@@ -26,6 +26,8 @@ nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine batch_norm_op SERIAL
)
nv_test
(
test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine concat_op SERIAL
)
nv_test
(
test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine dropout_op SERIAL
)
nv_test
(
test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine pad_op SERIAL
)
paddle/fluid/inference/tensorrt/convert/pad_op.cc
0 → 100644
浏览文件 @
5cedfb60
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
/*
* PadOp.
*/
class
PadOpConverter
:
public
OpConverter
{
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
VLOG
(
4
)
<<
"convert a fluid transpose op to tensorrt tranpose layer"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
// Declare inputs
auto
*
input
=
engine_
->
GetITensor
(
op_desc
.
Input
(
"X"
)[
0
]);
const
std
::
vector
<
int
>
paddings
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"paddings"
));
const
float
pad_value
=
boost
::
get
<
float
>
(
op_desc
.
GetAttr
(
"pad_value"
));
nvinfer1
::
Dims
input_shape
=
input
->
getDimensions
();
int
nbDims
=
input_shape
.
nbDims
;
int
pad_size
=
static_cast
<
int
>
(
paddings
.
size
());
PADDLE_ENFORCE_GE
(
nbDims
,
2
);
PADDLE_ENFORCE_EQ
((
nbDims
+
1
)
*
2
,
pad_size
);
PADDLE_ENFORCE
(
pad_value
==
0.0
,
"The pad layer of TRT only support zero."
);
nvinfer1
::
DimsHW
pre_pad
(
paddings
[
pad_size
-
4
],
paddings
[
pad_size
-
2
]);
nvinfer1
::
DimsHW
post_pad
(
paddings
[
pad_size
-
3
],
paddings
[
pad_size
-
1
]);
auto
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Padding
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
input
),
pre_pad
,
post_pad
);
PADDLE_ENFORCE
(
layer
!=
nullptr
);
auto
output_name
=
op_desc
.
Output
(
"Out"
)[
0
];
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
layer
->
setName
((
"scale (Output: "
+
output_name
+
")"
).
c_str
());
layer
->
getOutput
(
0
)
->
setName
(
output_name
.
c_str
());
if
(
test_mode
)
{
// the test framework can not determine which is the
// output, so place the declaration inside.
engine_
->
DeclareOutput
(
output_name
);
}
}
};
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
REGISTER_TRT_OP_CONVERTER
(
pad
,
PadOpConverter
);
paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
0 → 100644
浏览文件 @
5cedfb60
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
TEST
(
PadConverter
,
main
)
{
framework
::
Scope
scope
;
std
::
unordered_set
<
std
::
string
>
parameters
;
TRTConvertValidation
validator
(
10
,
parameters
,
scope
,
1000
);
validator
.
DeclInputVar
(
"pad-X"
,
nvinfer1
::
Dims3
(
3
,
2
,
2
));
validator
.
DeclOutputVar
(
"pad-Out"
,
nvinfer1
::
Dims3
(
3
,
3
,
5
));
// Prepare Op description
framework
::
OpDesc
desc
;
desc
.
SetType
(
"pad"
);
desc
.
SetInput
(
"X"
,
{
"pad-X"
});
desc
.
SetOutput
(
"Out"
,
{
"pad-Out"
});
std
::
vector
<
int
>
paddings
=
{
0
,
0
,
0
,
0
,
0
,
1
,
1
,
2
};
float
pad_value
=
0.0
;
desc
.
SetAttr
(
"paddings"
,
paddings
);
desc
.
SetAttr
(
"pad_value"
,
pad_value
);
LOG
(
INFO
)
<<
"set OP"
;
validator
.
SetOp
(
*
desc
.
Proto
());
LOG
(
INFO
)
<<
"execute"
;
validator
.
Execute
(
2
);
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
USE_OP
(
pad
);
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
5cedfb60
...
...
@@ -70,6 +70,14 @@ if (NOT EXISTS ${OCR_INSTALL_DIR})
endif
()
inference_analysis_api_test
(
test_analyzer_ocr
${
OCR_INSTALL_DIR
}
analyzer_vis_tester.cc
)
# resnet50
set
(
RESNET50_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/resnet50"
)
if
(
NOT EXISTS
${
RESNET50_INSTALL_DIR
}
)
inference_download_and_uncompress
(
${
RESNET50_INSTALL_DIR
}
${
INFERENCE_URL
}
"resnet50_model.tar.gz"
)
endif
()
inference_analysis_test
(
test_analyzer_resnet50 SRCS analyzer_resnet50_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
RESNET50_INSTALL_DIR
}
/model
)
# anakin
if
(
WITH_ANAKIN AND WITH_MKL
)
# only needed in CI
# anakin rnn1
...
...
paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
浏览文件 @
5cedfb60
...
...
@@ -22,7 +22,6 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/timer.h"
#include "utils/logger/logger.h"
DEFINE_string
(
model
,
""
,
"Directory of the inference model."
);
...
...
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
0 → 100644
浏览文件 @
5cedfb60
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
void
SetConfig
(
AnalysisConfig
*
cfg
)
{
cfg
->
param_file
=
FLAGS_infer_model
+
"/params"
;
cfg
->
prog_file
=
FLAGS_infer_model
+
"/model"
;
cfg
->
use_gpu
=
false
;
cfg
->
device
=
0
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
specify_input_name
=
true
;
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
PADDLE_ENFORCE_EQ
(
FLAGS_test_all_data
,
0
,
"Only have single batch of data."
);
PaddleTensor
input
;
// channel=3, height/width=318
std
::
vector
<
int
>
shape
({
FLAGS_batch_size
,
3
,
318
,
318
});
input
.
shape
=
shape
;
input
.
dtype
=
PaddleDType
::
FLOAT32
;
// fill input data, for profile easily, do not use random data here.
size_t
size
=
FLAGS_batch_size
*
3
*
318
*
318
;
input
.
data
.
Resize
(
size
*
sizeof
(
float
));
float
*
input_data
=
static_cast
<
float
*>
(
input
.
data
.
data
());
for
(
size_t
i
=
0
;
i
<
size
;
i
++
)
{
*
(
input_data
+
i
)
=
static_cast
<
float
>
(
i
)
/
size
;
}
std
::
vector
<
PaddleTensor
>
input_slots
;
input_slots
.
assign
({
input
});
(
*
inputs
).
emplace_back
(
input_slots
);
}
// Easy for profiling independently.
TEST
(
Analyzer_resnet50
,
profile
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
PADDLE_ENFORCE_EQ
(
outputs
.
size
(),
1UL
);
size_t
size
=
GetSize
(
outputs
[
0
]);
// output is a 512-dimension feature
EXPECT_EQ
(
size
,
512
*
FLAGS_batch_size
);
}
}
// Check the fuse status
TEST
(
Analyzer_resnet50
,
fuse_statis
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
int
num_ops
;
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
fuse_statis
=
GetFuseStatis
(
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
()),
&
num_ops
);
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
1
);
}
// Compare result of NativeConfig and AnalysisConfig
TEST
(
Analyzer_resnet50
,
compare
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
cfg
,
input_slots_all
);
}
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
浏览文件 @
5cedfb60
...
...
@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_bool
(
with_precision_check
,
true
,
"turn on test"
);
...
...
@@ -271,10 +270,11 @@ TEST(Analyzer_rnn1, multi_thread) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
TestPrediction
(
cfg
,
input_slots_all
,
&
outputs
,
4
/* multi_thread */
);
}
bool
CompareTensors
(
framework
::
Scope
&
a_scope
,
framework
::
Scope
&
b_scope
,
bool
CompareTensors
(
const
framework
::
Scope
&
a_scope
,
const
framework
::
Scope
&
b_scope
,
const
std
::
vector
<
std
::
string
>
&
tensors
)
{
for
(
auto
&
x
:
tensors
)
{
auto
*
a_var
=
a_scope
.
FindVar
(
x
);
...
...
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
浏览文件 @
5cedfb60
...
...
@@ -61,8 +61,6 @@ void SetConfig(AnalysisConfig *cfg) {
cfg
->
ir_passes
.
push_back
(
"fc_gru_fuse_pass"
);
#ifdef PADDLE_WITH_MKLDNN
cfg
->
_use_mkldnn
=
true
;
// disable mkldnn fuse since it should have some bugs
cfg
->
ir_passes
.
push_back
(
"conv_relu_mkldnn_fuse_pass"
);
#endif
}
...
...
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
5cedfb60
...
...
@@ -15,6 +15,7 @@
#pragma once
#include <gtest/gtest.h>
#include <algorithm>
#include <string>
#include <thread> // NOLINT
#include <vector>
...
...
@@ -182,5 +183,127 @@ void CompareNativeAndAnalysis(
CompareResult
(
analysis_outputs
,
native_outputs
);
}
template
<
typename
T
>
std
::
string
LoDTensorSummary
(
const
framework
::
LoDTensor
&
tensor
)
{
std
::
stringstream
ss
;
ss
<<
"
\n
---- tensor ---"
<<
'\n'
;
ss
<<
"lod: ["
;
for
(
const
auto
&
level
:
tensor
.
lod
())
{
ss
<<
"[ "
;
for
(
auto
i
:
level
)
{
ss
<<
i
<<
", "
;
}
ss
<<
"]"
;
}
ss
<<
"]
\n
"
;
ss
<<
"shape: ["
;
int
size
=
1
;
for
(
int
i
=
0
;
i
<
tensor
.
dims
().
size
();
i
++
)
{
int
dim
=
tensor
.
dims
()[
i
];
ss
<<
dim
<<
", "
;
size
*=
dim
;
}
ss
<<
"]
\n
"
;
ss
<<
"data: "
;
for
(
int
i
=
0
;
i
<
std
::
min
(
20
,
size
);
i
++
)
{
ss
<<
tensor
.
data
<
T
>
()[
i
]
<<
" "
;
}
ss
<<
"
\n
"
;
return
ss
.
str
();
}
static
bool
CompareLoD
(
const
framework
::
LoD
&
a
,
const
framework
::
LoD
&
b
)
{
if
(
a
.
size
()
!=
b
.
size
())
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"lod size not match %d != %d"
,
a
.
size
(),
b
.
size
());
return
false
;
}
for
(
size_t
i
=
0
;
i
<
a
.
size
();
i
++
)
{
auto
&
al
=
a
[
i
];
auto
&
bl
=
b
[
i
];
if
(
al
.
size
()
!=
bl
.
size
())
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"level size %d != %d"
,
al
.
size
(),
bl
.
size
());
return
false
;
}
}
return
true
;
}
static
bool
CompareShape
(
const
std
::
vector
<
int64_t
>
&
a
,
const
std
::
vector
<
int64_t
>
&
b
)
{
if
(
a
.
size
()
!=
b
.
size
())
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"shape size not match %d != %d"
,
a
.
size
(),
b
.
size
());
return
false
;
}
for
(
size_t
i
=
0
;
i
<
a
.
size
();
i
++
)
{
if
(
a
[
i
]
!=
b
[
i
])
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"shape %d-th element not match %d != %d"
,
i
,
a
[
i
],
b
[
i
]);
return
false
;
}
}
return
true
;
}
static
bool
CompareTensorData
(
const
framework
::
LoDTensor
&
a
,
const
framework
::
LoDTensor
&
b
)
{
auto
a_shape
=
framework
::
vectorize
(
a
.
dims
());
auto
b_shape
=
framework
::
vectorize
(
b
.
dims
());
size_t
a_size
=
std
::
accumulate
(
a_shape
.
begin
(),
a_shape
.
end
(),
1
,
[](
int
a
,
int
b
)
{
return
a
*
b
;
});
size_t
b_size
=
std
::
accumulate
(
b_shape
.
begin
(),
b_shape
.
end
(),
1
,
[](
int
a
,
int
b
)
{
return
a
*
b
;
});
if
(
a_size
!=
b_size
)
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"tensor data size not match, %d != %d"
,
a_size
,
b_size
);
}
for
(
size_t
i
=
0
;
i
<
a_size
;
i
++
)
{
if
(
a
.
type
()
==
typeid
(
float
))
{
const
auto
*
a_data
=
a
.
data
<
float
>
();
const
auto
*
b_data
=
b
.
data
<
float
>
();
if
(
std
::
abs
(
a_data
[
i
]
-
b_data
[
i
])
>
1e-3
)
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"tensor data %d-th element not match, %f != %f"
,
i
,
a_data
[
i
],
b_data
[
i
]);
return
false
;
}
}
else
if
(
a
.
type
()
==
typeid
(
int64_t
))
{
const
auto
*
a_data
=
a
.
data
<
int64_t
>
();
const
auto
*
b_data
=
b
.
data
<
int64_t
>
();
if
(
std
::
abs
(
a_data
[
i
]
-
b_data
[
i
])
>
1e-3
)
{
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"tensor data %d-th element not match, %f != %f"
,
i
,
a_data
[
i
],
b_data
[
i
]);
return
false
;
}
}
}
return
true
;
}
static
bool
CompareTensor
(
const
framework
::
LoDTensor
&
a
,
const
framework
::
LoDTensor
&
b
)
{
if
(
!
CompareLoD
(
a
.
lod
(),
b
.
lod
()))
{
return
false
;
}
if
(
!
CompareShape
(
framework
::
vectorize
(
a
.
dims
()),
framework
::
vectorize
(
b
.
dims
())))
{
return
false
;
}
if
(
!
CompareTensorData
(
a
,
b
))
{
return
false
;
}
return
true
;
}
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/book/CMakeLists.txt
浏览文件 @
5cedfb60
...
...
@@ -4,7 +4,6 @@ function(inference_test TARGET_NAME)
set
(
multiValueArgs ARGS
)
cmake_parse_arguments
(
inference_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests
)
set
(
arg_list
""
)
if
(
inference_test_ARGS
)
foreach
(
arg
${
inference_test_ARGS
}
)
...
...
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
5cedfb60
...
...
@@ -82,10 +82,11 @@ function(op_library TARGET)
if
(
${
cc_srcs_len
}
EQUAL 0
)
message
(
FATAL_ERROR
"The op library
${
TARGET
}
should contains at least one .cc file"
)
endif
()
#remove windows unsupported op
if
(
WIN32
)
foreach
(
windows_unsupport_op
"nccl_op"
"gen_nccl_id_op"
"warpctc_op"
)
# remove windows unsupported op, because windows has no nccl, no warpctc such ops.
foreach
(
windows_unsupport_op
"nccl_op"
"gen_nccl_id_op"
"warpctc_op"
"hierarchical_sigmoid_op"
"crf_decoding_op"
"select_op"
"lstmp_op"
"gru_op"
"fusion_gru_op"
"lstm_op"
"fusion_lstm_op"
"cumsum_op"
"channel_send_op"
"channel_create_op"
"channel_close_op"
"channel_recv_op"
)
if
(
"
${
TARGET
}
"
STREQUAL
"
${
windows_unsupport_op
}
"
)
return
()
endif
()
...
...
@@ -229,7 +230,7 @@ if(WITH_DISTRIBUTE)
op_library
(
${
dist_op
}
DEPS
${
DISTRIBUTE_DEPS
}
)
set_source_files_properties
(
${
dist_op
}
.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
endforeach
()
#set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
#cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
# listen_and_serv_op sum_op executor SERIAL)
...
...
@@ -267,6 +268,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
else
()
set
(
DEPS_OPS
${
DEPS_OPS
}
tensorrt_engine_op
)
endif
()
op_library
(
clip_by_norm_op DEPS selected_rows_functor selected_rows
)
op_library
(
sum_op DEPS selected_rows_functor
)
op_library
(
sgd_op DEPS selected_rows_functor
)
op_library
(
print_op DEPS lod_tensor
)
...
...
@@ -281,10 +283,12 @@ op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
op_library
(
max_sequence_len_op DEPS lod_rank_table
)
op_library
(
sequence_conv_op DEPS context_project
)
op_library
(
sequence_pool_op DEPS sequence_pooling
)
if
(
NOT WIN32
)
op_library
(
lstm_op DEPS sequence2batch lstm_compute
)
op_library
(
hierarchical_sigmoid_op DEPS matrix_bit_code
)
op_library
(
lstmp_op DEPS sequence2batch lstm_compute
)
op_library
(
gru_op DEPS sequence2batch gru_compute
)
endif
(
NOT WIN32
)
op_library
(
recurrent_op DEPS executor
)
op_library
(
warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale
)
op_library
(
cos_sim_op DEPS cos_sim_functor
)
...
...
@@ -297,7 +301,6 @@ op_library(sequence_pad_op DEPS sequence_padding)
op_library
(
unstack_op DEPS stack_op
)
op_library
(
fake_quantize_op DEPS memory
)
op_library
(
fusion_lstm_op DEPS cpu_lstm_compute
)
if
(
WITH_GPU
)
op_library
(
conv_op DEPS vol2col depthwise_conv im2col
)
op_library
(
layer_norm_op DEPS cub
)
...
...
paddle/fluid/operators/activation_op.cc
浏览文件 @
5cedfb60
...
...
@@ -80,7 +80,7 @@ class ActivationOp : public framework::OperatorWithKernel {
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
ctx
->
S
etOutputDim
(
"Out"
,
ctx
->
GetInputDim
(
"X"
)
);
ctx
->
S
hareDim
(
"X"
,
/*->*/
"Out"
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
...
...
@@ -91,12 +91,26 @@ class ActivationOp : public framework::OperatorWithKernel {
}
};
class
ActivationOpInferVarType
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
auto
x_name
=
op_desc
.
Input
(
"X"
)[
0
];
auto
out_name
=
op_desc
.
Output
(
"Out"
)[
0
];
auto
&
x
=
block
->
FindRecursiveOrCreateVar
(
x_name
);
auto
&
out
=
block
->
FindRecursiveOrCreateVar
(
out_name
);
out
.
SetType
(
x
.
GetType
());
out
.
SetDataType
(
x
.
GetDataType
());
}
};
class
ActivationOpGrad
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
ctx
->
GetInputDim
(
"Out"
));
ctx
->
ShareDim
(
"Out"
,
framework
::
GradVarName
(
"X"
));
ctx
->
ShareLoD
(
"Out"
,
framework
::
GradVarName
(
"X"
));
}
protected:
...
...
@@ -525,12 +539,14 @@ namespace ops = paddle::operators;
#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \
REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \
::paddle::operators::OP_NAME##OpMaker, \
::paddle::operators::ActivationOpInferVarType, \
::paddle::operators::OP_NAME##GradMaker); \
REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad)
#define REGISTER_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \
REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \
::paddle::operators::OP_NAME##OpMaker, \
::paddle::operators::ActivationOpInferVarType, \
::paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad)
...
...
paddle/fluid/operators/adadelta_op.cc
浏览文件 @
5cedfb60
...
...
@@ -18,6 +18,7 @@ namespace paddle {
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
class
AdadeltaOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
...
...
@@ -31,6 +32,16 @@ class AdadeltaOp : public framework::OperatorWithKernel {
"Input(AvgSquaredGrad) of AdadeltaOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"AvgSquaredUpdate"
),
"Input(AvgSquaredUpdate) of AdadeltaOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Param"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Param"
).
front
(),
ctx
->
GetInputsVarType
(
"Param"
).
front
());
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Grad"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Grad"
).
front
(),
ctx
->
GetInputsVarType
(
"Grad"
).
front
());
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
"Output(ParamOut) of AdadeltaOp should not be null."
);
...
...
@@ -56,6 +67,7 @@ class AdadeltaOp : public framework::OperatorWithKernel {
ctx
->
SetOutputDim
(
"AvgSquaredGradOut"
,
param_dim
);
ctx
->
SetOutputDim
(
"AvgSquaredUpdateOut"
,
param_dim
);
}
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
input_data_type
=
...
...
paddle/fluid/operators/adadelta_op.h
浏览文件 @
5cedfb60
...
...
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
class
AdadeltaOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
const
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
PADDLE_ENFORCE
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Grad"
).
front
(),
grad_var
->
Type
().
name
());
auto
param_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
avg_squared_grad_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"AvgSquaredGradOut"
);
...
...
paddle/fluid/operators/adagrad_op.h
浏览文件 @
5cedfb60
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
...
...
@@ -21,25 +22,31 @@ namespace operators {
template
<
typename
DeviceContext
,
typename
T
>
struct
SparseAdagradFunctor
{
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
SelectedRows
&
grad
,
const
framework
::
Tensor
&
learning_rate
,
T
epsilon
,
framework
::
Tensor
*
moment
,
framework
::
Tensor
*
param
);
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
SelectedRows
&
grad
,
const
framework
::
Tensor
&
learning_rate
,
T
epsilon
,
framework
::
Tensor
*
moment
,
framework
::
Tensor
*
param
);
};
template
<
typename
DeviceContext
,
typename
T
>
class
AdagradOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
param_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
*
moment_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"MomentOut"
);
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
auto
*
param_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
*
moment_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"MomentOut"
);
param_out_tensor
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
moment_out_tensor
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
epsilon
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"epsilon"
));
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
if
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
())
{
auto
param
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
));
...
...
@@ -47,16 +54,16 @@ class AdagradOpKernel : public framework::OpKernel<T> {
*
ctx
.
Input
<
framework
::
Tensor
>
(
"Grad"
));
auto
moment
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
ctx
.
Input
<
framework
::
Tensor
>
(
"Moment"
));
auto
*
learning_rate
=
ctx
.
Input
<
framework
::
Tensor
>
(
"LearningRate"
);
auto
*
learning_rate
=
ctx
.
Input
<
framework
::
Tensor
>
(
"LearningRate"
);
auto
param_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
param_out_tensor
);
auto
moment_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
moment_out_tensor
);
auto
*
place
=
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
auto
*
place
=
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
moment_out
.
device
(
*
place
)
=
moment
+
grad
*
grad
;
Eigen
::
DSizes
<
int
,
1
>
m_dsize
(
moment_out_tensor
->
numel
());
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()))
{
auto
*
lr
=
learning_rate
->
data
<
T
>
();
auto
*
lr
=
learning_rate
->
data
<
T
>
();
param_out
.
device
(
*
place
)
=
param
-
lr
[
0
]
*
grad
/
(
moment_out
.
sqrt
()
+
epsilon
);
}
else
{
...
...
@@ -66,10 +73,10 @@ class AdagradOpKernel : public framework::OpKernel<T> {
lr
.
broadcast
(
m_dsize
)
*
grad
/
(
moment_out
.
sqrt
()
+
epsilon
);
}
}
else
if
(
grad_var
->
IsType
<
framework
::
SelectedRows
>
())
{
auto
*
param_tensor
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
auto
*
param_tensor
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
PADDLE_ENFORCE_EQ
(
param_tensor
,
param_out_tensor
);
auto
*
moment_tensor
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Moment"
);
auto
*
moment_tensor
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Moment"
);
PADDLE_ENFORCE_EQ
(
moment_tensor
,
moment_out_tensor
);
SparseAdagradFunctor
<
DeviceContext
,
T
>
functor
;
...
...
paddle/fluid/operators/adam_op.h
浏览文件 @
5cedfb60
...
...
@@ -231,6 +231,12 @@ template <typename DeviceContext, typename T>
class
AdamOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
using
paddle
::
framework
::
LoDTensor
;
using
paddle
::
operators
::
detail
::
Ref
;
...
...
paddle/fluid/operators/adamax_op.cc
浏览文件 @
5cedfb60
...
...
@@ -35,6 +35,16 @@ class AdamaxOp : public framework::OperatorWithKernel {
"Input(LearningRate) of AdamaxOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Beta1Pow"
),
"Input(Beta1Pow) of AdamaxOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Param"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Param"
).
front
(),
ctx
->
GetInputsVarType
(
"Param"
).
front
());
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Grad"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Grad"
).
front
(),
ctx
->
GetInputsVarType
(
"Grad"
).
front
());
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
"Output(ParamOut) of AdamaxOp should not be null."
);
...
...
paddle/fluid/operators/adamax_op.h
浏览文件 @
5cedfb60
...
...
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
class
AdamaxOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
const
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
PADDLE_ENFORCE
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Grad"
).
front
(),
grad_var
->
Type
().
name
());
auto
param_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
moment_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"MomentOut"
);
auto
inf_norm_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"InfNormOut"
);
...
...
paddle/fluid/operators/argsort_op.cc
浏览文件 @
5cedfb60
...
...
@@ -42,8 +42,8 @@ class ArgsortOp : public framework::OperatorWithKernel {
"-rank(Input(X)) (%d)."
,
axis
,
num_dims
);
ctx
->
S
etOutputDim
(
"Out"
,
in_dims
);
ctx
->
S
etOutputDim
(
"Indices"
,
in_dims
);
ctx
->
S
hareDim
(
"X"
,
"Out"
);
ctx
->
S
hareDim
(
"X"
,
"Indices"
);
ctx
->
ShareLoD
(
"X"
,
"Out"
);
ctx
->
ShareLoD
(
"X"
,
"Indices"
);
}
...
...
paddle/fluid/operators/clip_by_norm_op.h
浏览文件 @
5cedfb60
...
...
@@ -16,12 +16,15 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/transform.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
SelectedRows
=
framework
::
SelectedRows
;
template
<
typename
T
,
int
MajorType
=
Eigen
::
RowMajor
,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenVector
=
framework
::
EigenVector
<
T
,
MajorType
,
IndexType
>
;
...
...
@@ -31,9 +34,40 @@ class ClipByNormKernel : public framework::OpKernel<T> {
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
max_norm
=
context
.
Attr
<
T
>
(
"max_norm"
);
auto
*
input
=
context
.
Input
<
Tensor
>
(
"X"
);
auto
*
output
=
context
.
Output
<
Tensor
>
(
"Out"
);
output
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
in_var
=
context
.
InputVar
(
"X"
);
Tensor
*
output
=
nullptr
;
const
Tensor
*
input
=
nullptr
;
if
(
in_var
->
IsType
<
framework
::
LoDTensor
>
())
{
input
=
context
.
Input
<
Tensor
>
(
"X"
);
output
=
context
.
Output
<
Tensor
>
(
"Out"
);
output
->
mutable_data
<
T
>
(
context
.
GetPlace
());
}
else
if
(
in_var
->
IsType
<
SelectedRows
>
())
{
auto
*
x
=
context
.
Input
<
SelectedRows
>
(
"X"
);
// merge ids in selected rows first
math
::
scatter
::
MergeAdd
<
DeviceContext
,
T
>
merge_func
;
SelectedRows
*
merged_input
=
const_cast
<
framework
::
Scope
&>
(
context
.
scope
())
.
Var
()
->
GetMutable
<
SelectedRows
>
();
merge_func
(
context
.
template
device_context
<
DeviceContext
>(),
*
x
,
merged_input
);
input
=
&
(
merged_input
->
value
());
SelectedRows
*
output_selected_rows
=
context
.
Output
<
SelectedRows
>
(
"Out"
);
output_selected_rows
->
set_rows
(
merged_input
->
rows
());
output_selected_rows
->
set_height
(
merged_input
->
height
());
output
=
output_selected_rows
->
mutable_value
();
output
->
Resize
(
merged_input
->
value
().
dims
());
output
->
mutable_data
<
T
>
(
context
.
GetPlace
());
}
else
{
PADDLE_THROW
(
"Unexpected branch, input variable type is %s"
,
in_var
->
Type
().
name
());
}
PADDLE_ENFORCE_NOT_NULL
(
input
);
auto
x
=
EigenVector
<
T
>::
Flatten
(
*
input
);
auto
out
=
EigenVector
<
T
>::
Flatten
(
*
output
);
...
...
paddle/fluid/operators/conv_shift_op.cc
浏览文件 @
5cedfb60
...
...
@@ -44,7 +44,7 @@ class ConvShiftOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_LE
(
y_dims
[
1
],
x_dims
[
1
],
"The 2nd dimension of Input(Y) should be less than or "
"equal to the 2nd dimension of Input(X)."
);
ctx
->
S
etOutputDim
(
"Out"
,
x_dims
);
ctx
->
S
hareDim
(
"X"
,
/*->*/
"Out"
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
};
...
...
paddle/fluid/operators/cub_reduce.h
浏览文件 @
5cedfb60
...
...
@@ -22,6 +22,7 @@
#include <cub/cub.cuh> // NOLINT
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -293,7 +294,12 @@ void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
}
auto
x_data
=
x
.
data
<
Tx
>
();
auto
y_data
=
y
->
mutable_data
<
Ty
>
(
x
.
place
());
if
(
reduce_num
==
1
)
return
;
if
(
reduce_num
==
1
)
{
auto
out_dims
=
y
->
dims
();
framework
::
TensorCopy
(
x
,
y
->
place
(),
y
);
y
->
Resize
(
out_dims
);
return
;
}
#define CUB_BLOCK_DIM_CASE(block_dim) \
case block_dim: { \
...
...
paddle/fluid/operators/decayed_adagrad_op.cc
浏览文件 @
5cedfb60
...
...
@@ -32,6 +32,16 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"LearningRate"
),
"Input(LearningRate) of DecayedAdagradOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Param"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Param"
).
front
(),
ctx
->
GetInputsVarType
(
"Param"
).
front
());
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Grad"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Grad"
).
front
(),
ctx
->
GetInputsVarType
(
"Grad"
).
front
());
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
"Output(ParamOut) of DecayedAdagradOp should not be null."
);
...
...
paddle/fluid/operators/decayed_adagrad_op.h
浏览文件 @
5cedfb60
...
...
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
class
DecayedAdagradOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
const
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
PADDLE_ENFORCE
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Grad"
).
front
(),
grad_var
->
Type
().
name
());
auto
param_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
moment_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"MomentOut"
);
...
...
paddle/fluid/operators/detection/roi_perspective_transform_op.cc
浏览文件 @
5cedfb60
...
...
@@ -104,7 +104,6 @@ bool in_quad(T x, T y, T roi_x[], T roi_y[]) {
* a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1)
* a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1)
* a33 = 1
*
*/
template
<
typename
T
>
void
get_transform_matrix
(
const
int
transformed_width
,
...
...
@@ -260,8 +259,8 @@ class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
roi2image
.
Resize
({
rois_num
});
int
*
roi2image_data
=
roi2image
.
mutable_data
<
int
>
(
ctx
.
GetPlace
());
auto
lod
=
rois
->
lod
().
back
();
for
(
in
t
i
=
0
;
i
<
lod
.
size
()
-
1
;
++
i
)
{
for
(
in
t
j
=
lod
[
i
];
j
<
lod
[
i
+
1
];
++
j
)
{
for
(
size_
t
i
=
0
;
i
<
lod
.
size
()
-
1
;
++
i
)
{
for
(
size_
t
j
=
lod
[
i
];
j
<
lod
[
i
+
1
];
++
j
)
{
roi2image_data
[
j
]
=
i
;
}
}
...
...
@@ -393,8 +392,8 @@ class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> {
roi2image
.
Resize
({
rois_num
});
int
*
roi2image_data
=
roi2image
.
mutable_data
<
int
>
(
ctx
.
GetPlace
());
auto
lod
=
rois
->
lod
().
back
();
for
(
in
t
i
=
0
;
i
<
lod
.
size
()
-
1
;
++
i
)
{
for
(
in
t
j
=
lod
[
i
];
j
<
lod
[
i
+
1
];
++
j
)
{
for
(
size_
t
i
=
0
;
i
<
lod
.
size
()
-
1
;
++
i
)
{
for
(
size_
t
j
=
lod
[
i
];
j
<
lod
[
i
+
1
];
++
j
)
{
roi2image_data
[
j
]
=
i
;
}
}
...
...
@@ -404,7 +403,7 @@ class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> {
for
(
int
in_h
=
0
;
in_h
<
in_height
;
++
in_h
)
{
for
(
int
in_w
=
0
;
in_w
<
in_width
;
++
in_w
)
{
T
gradient
=
0.0
;
for
(
in
t
roi_idx
=
lod
[
n
];
roi_idx
<
lod
[
n
+
1
];
++
roi_idx
)
{
for
(
size_
t
roi_idx
=
lod
[
n
];
roi_idx
<
lod
[
n
+
1
];
++
roi_idx
)
{
const
T
*
rois
=
rois_data
+
roi_idx
*
8
;
T
roi_x
[
4
];
T
roi_y
[
4
];
...
...
paddle/fluid/operators/detection/roi_perspective_transform_op.cu
浏览文件 @
5cedfb60
...
...
@@ -345,8 +345,8 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
roi2image
.
Resize
({
rois_num
});
int
*
roi2image_data
=
roi2image
.
mutable_data
<
int
>
(
platform
::
CPUPlace
());
auto
lod
=
rois
->
lod
().
back
();
for
(
in
t
i
=
0
;
i
<
lod
.
size
()
-
1
;
++
i
)
{
for
(
in
t
j
=
lod
[
i
];
j
<
lod
[
i
+
1
];
++
j
)
{
for
(
size_
t
i
=
0
;
i
<
lod
.
size
()
-
1
;
++
i
)
{
for
(
size_
t
j
=
lod
[
i
];
j
<
lod
[
i
+
1
];
++
j
)
{
roi2image_data
[
j
]
=
i
;
}
}
...
...
@@ -432,7 +432,7 @@ __global__ void RoiTransformGradKernel(
T
gradient
=
0.0
;
// Accumulate gradient over all RoIs that interpolated this element
for
(
in
t
roi_idx
=
lod
[
n
];
roi_idx
<
lod
[
n
+
1
];
++
roi_idx
)
{
for
(
size_
t
roi_idx
=
lod
[
n
];
roi_idx
<
lod
[
n
+
1
];
++
roi_idx
)
{
const
T
*
rois
=
rois_data
+
roi_idx
*
8
;
T
roi_x
[
4
];
T
roi_y
[
4
];
...
...
paddle/fluid/operators/elementwise_op.h
浏览文件 @
5cedfb60
...
...
@@ -41,7 +41,8 @@ class ElementwiseOp : public framework::OperatorWithKernel {
auto
y_dim
=
ctx
->
GetInputDim
(
"Y"
);
PADDLE_ENFORCE_GE
(
x_dim
.
size
(),
y_dim
.
size
(),
"Rank of first input must >= rank of second input."
);
ctx
->
SetOutputDim
(
"Out"
,
x_dim
);
ctx
->
ShareDim
(
"X"
,
/*->*/
"Out"
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
...
...
@@ -70,6 +71,7 @@ class ElementwiseOpInferVarType : public framework::VarTypeInference {
auto
&
x
=
block
->
FindRecursiveOrCreateVar
(
x_name
);
auto
&
out
=
block
->
FindRecursiveOrCreateVar
(
out_name
);
out
.
SetType
(
x
.
GetType
());
out
.
SetDataType
(
x
.
GetDataType
());
}
};
...
...
@@ -89,7 +91,7 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr
<
bool
>
(
"use_mkldnn"
,
"(bool, default false). Used by MKLDNN."
)
.
SetDefault
(
false
);
AddComment
(
string
::
Sprintf
(
R"DOC(
Limited
Elementwise %s Operator
Elementwise %s Operator
The equation is:
...
...
@@ -157,10 +159,12 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
auto
x_grad_name
=
framework
::
GradVarName
(
"X"
);
auto
y_grad_name
=
framework
::
GradVarName
(
"Y"
);
if
(
ctx
->
HasOutput
(
x_grad_name
))
{
ctx
->
SetOutputDim
(
x_grad_name
,
x_dims
);
ctx
->
ShareDim
(
"X"
,
/*->*/
x_grad_name
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
x_grad_name
);
}
if
(
ctx
->
HasOutput
(
y_grad_name
))
{
ctx
->
SetOutputDim
(
y_grad_name
,
y_dims
);
ctx
->
ShareDim
(
"Y"
,
/*->*/
y_grad_name
);
ctx
->
ShareLoD
(
"Y"
,
/*->*/
y_grad_name
);
}
}
...
...
@@ -193,14 +197,15 @@ class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
auto
x_grad_name
=
framework
::
GradVarName
(
"X"
);
if
(
ctx
->
HasOutput
(
x_grad_name
))
{
auto
out_dims
=
ctx
->
GetInputDim
(
framework
::
GradVarName
(
"Out"
)
);
ctx
->
S
etOutputDim
(
x_grad_name
,
out_dims
);
ctx
->
ShareDim
(
framework
::
GradVarName
(
"Out"
),
/*->*/
x_grad_name
);
ctx
->
S
hareLoD
(
framework
::
GradVarName
(
"Out"
),
/*->*/
x_grad_name
);
}
auto
y_grad_name
=
framework
::
GradVarName
(
"Y"
);
if
(
ctx
->
HasOutput
(
y_grad_name
))
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Y"
),
"Input(Y) should not be null"
);
auto
y_dims
=
ctx
->
GetInputDim
(
"Y"
);
ctx
->
SetOutputDim
(
y_grad_name
,
y_dims
);
ctx
->
ShareDim
(
"Y"
,
/*->*/
y_grad_name
);
ctx
->
ShareLoD
(
"Y"
,
/*->*/
y_grad_name
);
}
}
};
...
...
paddle/fluid/operators/fake_dequantize_op.cc
浏览文件 @
5cedfb60
...
...
@@ -48,7 +48,8 @@ class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
"Input(X) of FakeDequantizeMaxAbsOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of FakeDequantizeMaxAbsOp should not be null."
);
ctx
->
SetOutputDim
(
"Out"
,
ctx
->
GetInputDim
(
"X"
));
ctx
->
ShareDim
(
"X"
,
/*->*/
"Out"
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
};
...
...
paddle/fluid/operators/ftrl_op.cc
浏览文件 @
5cedfb60
...
...
@@ -34,6 +34,16 @@ class FTRLOp : public framework::OperatorWithKernel {
"Input(Grad) of FTRL should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"LearningRate"
),
"Input(LearningRate) of FTRL should not be null."
);
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Param"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Param"
).
front
(),
ctx
->
GetInputsVarType
(
"Param"
).
front
());
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Grad"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Grad"
).
front
(),
ctx
->
GetInputsVarType
(
"Grad"
).
front
());
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
"Output(ParamOut) of FTRL should not be null."
);
...
...
paddle/fluid/operators/ftrl_op.h
浏览文件 @
5cedfb60
...
...
@@ -28,6 +28,17 @@ template <typename DeviceContext, typename T>
class
FTRLOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
const
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
PADDLE_ENFORCE
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Grad"
).
front
(),
grad_var
->
Type
().
name
());
auto
*
param_out
=
ctx
.
Output
<
Tensor
>
(
"ParamOut"
);
auto
*
sq_accum_out
=
ctx
.
Output
<
Tensor
>
(
"SquaredAccumOut"
);
auto
*
lin_accum_out
=
ctx
.
Output
<
Tensor
>
(
"LinearAccumOut"
);
...
...
paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
浏览文件 @
5cedfb60
...
...
@@ -93,11 +93,7 @@ void FusedEmbeddingFCLSTMOp::InferShape(
ctx
->
SetOutputDim
(
"Cell"
,
out_dims
);
ctx
->
ShareLoD
(
"Ids"
,
"Hidden"
);
ctx
->
ShareLoD
(
"Ids"
,
"Cell"
);
int
xx_width
;
if
(
ctx
->
Attrs
().
Get
<
bool
>
(
"use_seq"
))
{
xx_width
=
wh_dims
[
1
];
}
else
{
xx_width
=
x_dims
[
1
]
>
wh_dims
[
1
]
?
wh_dims
[
1
]
:
x_dims
[
1
];
if
(
!
ctx
->
Attrs
().
Get
<
bool
>
(
"use_seq"
))
{
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"BatchedInput"
),
"Assert only one Output(BatchedInput) of LSTM."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"BatchedHidden"
),
...
...
@@ -112,7 +108,7 @@ void FusedEmbeddingFCLSTMOp::InferShape(
ctx
->
SetOutputDim
(
"BatchedHidden"
,
out_dims
);
ctx
->
SetOutputDim
(
"BatchedCell"
,
out_dims
);
}
ctx
->
SetOutputDim
(
"XX"
,
{
x_dims
[
0
],
xx_width
});
ctx
->
SetOutputDim
(
"XX"
,
{
x_dims
[
0
],
wh_dims
[
1
]
});
ctx
->
ShareLoD
(
"Ids"
,
"XX"
);
}
...
...
@@ -435,8 +431,6 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
INIT_VEC_FUNC
INIT_BASE_INPUT_DATAS
// std::cout << "===> Batch Compute" << std::endl;
auto
*
reordered_h0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedH0"
);
auto
*
reordered_c0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedC0"
);
auto
*
batched_input
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedInput"
);
...
...
paddle/fluid/operators/fusion_gru_op.cc
浏览文件 @
5cedfb60
...
...
@@ -290,12 +290,13 @@ class FusionGRUKernel : public framework::OpKernel<T> {
void
BatchCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
using
DeviceContext
=
paddle
::
platform
::
CPUDeviceContext
;
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
if
(
x
->
lod
()[
0
].
size
()
==
2
)
{
xx
->
Resize
({
total_T
,
D3
});
SeqCompute
(
ctx
);
return
;
}
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
INIT_VEC_FUNC
auto
*
reordered_h0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedH0"
);
...
...
paddle/fluid/operators/fusion_lstm_op.cc
浏览文件 @
5cedfb60
...
...
@@ -432,11 +432,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
void
BatchCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
using
DeviceContext
=
platform
::
CPUDeviceContext
;
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
if
(
x
->
lod
()[
0
].
size
()
==
2
)
{
xx
->
Resize
({
x_dims
[
0
],
D4
});
SeqCompute
(
ctx
);
return
;
}
INIT_BASE_SIZES
INIT_VEC_FUNC
INIT_BASE_INPUT_DATAS
...
...
paddle/fluid/operators/isfinite_op.cc
0 → 100644
浏览文件 @
5cedfb60
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/isfinite_op.h"
#include <string>
#include <vector>
namespace
paddle
{
namespace
operators
{
class
OverflowOp
:
public
framework
::
OperatorWithKernel
{
public:
OverflowOp
(
const
std
::
string
&
type
,
const
framework
::
VariableNameMap
&
inputs
,
const
framework
::
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
)
:
OperatorWithKernel
(
type
,
inputs
,
outputs
,
attrs
)
{}
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInputs
(
"X"
),
"Inputs(X) should not be null"
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of OverflowOp should not be null."
);
ctx
->
SetOutputDim
(
"Out"
,
{
1
});
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
int
dtype
=
-
1
;
auto
*
x_var
=
ctx
.
InputVar
(
"X"
);
if
(
x_var
->
IsType
<
framework
::
LoDTensor
>
())
{
dtype
=
framework
::
ToDataType
(
x_var
->
Get
<
framework
::
LoDTensor
>
().
type
());
}
else
if
(
x_var
->
IsType
<
framework
::
SelectedRows
>
())
{
dtype
=
framework
::
ToDataType
(
x_var
->
Get
<
framework
::
SelectedRows
>
().
value
().
type
());
}
else
{
PADDLE_THROW
(
"Cannot find the input data type by all input data"
);
}
return
framework
::
OpKernelType
(
framework
::
proto
::
VarType
::
Type
(
dtype
),
ctx
.
GetPlace
());
}
};
class
OverflowOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"(Tensor) The input tensors of overflow operator."
);
AddOutput
(
"Out"
,
"(Tensor) 1-dim tensor, contains a bool scalar. The output "
"tensor of overflow operator."
);
AddComment
(
string
::
Sprintf
(
R"DOC(
Overflow operator.
$$Out = any(X)$$
If any X contains Inf or Nan, the Out will generate a indicator.
Out = Inf if any X contains Inf,
Out = Nan if any X contains Nan,
Out = 0 if no Inf/Nan detected.
If X contains both Inf/Nan, it will return the first indicator it meeted.
)DOC"
,
GetName
(),
GetComments
()));
}
protected:
virtual
std
::
string
GetName
()
const
=
0
;
virtual
std
::
string
GetComments
()
const
=
0
;
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
#define REGISTER_OP_MAKER(op_type, comment) \
namespace paddle { \
namespace operators { \
class _##op_type##OverflowOpMaker \
: public ::paddle::operators::OverflowOpMaker { \
protected: \
std::string GetName() const { return #op_type; } \
std::string GetComments() const { return comment; } \
}; \
} \
} \
REGISTER_OPERATOR(op_type, ops::OverflowOp, \
ops::_##op_type##OverflowOpMaker, \
paddle::framework::EmptyGradOpMaker)
#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor) \
REGISTER_OP_CPU_KERNEL( \
op_type, ops::OverflowKernel<paddle::platform::CPUDeviceContext, int, \
ops::functor>, \
ops::OverflowKernel<paddle::platform::CPUDeviceContext, float, \
ops::functor>, \
ops::OverflowKernel<paddle::platform::CPUDeviceContext, double, \
ops::functor>);
REGISTER_OP_MAKER
(
isinf
,
"isinf(X)"
);
REGISTER_OP_MAKER
(
isnan
,
"isnan(X)"
);
REGISTER_OP_MAKER
(
isfinite
,
"isfinite(X)"
);
FOR_EACH_KERNEL_FUNCTOR
(
REGISTER_OVERFLOW_CPU_KERNEL
);
paddle/fluid/operators/isfinite_op.cu
0 → 100644
浏览文件 @
5cedfb60
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/isfinite_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
#define REGISTER_OVERFLOW_CUDA_KERNEL(op_type, functor) \
REGISTER_OP_CUDA_KERNEL( \
op_type, ops::OverflowKernel<paddle::platform::CUDADeviceContext, int, \
ops::functor>, \
ops::OverflowKernel<paddle::platform::CUDADeviceContext, float, \
ops::functor>, \
ops::OverflowKernel<paddle::platform::CUDADeviceContext, double, \
ops::functor>, \
ops::OverflowKernel<paddle::platform::CUDADeviceContext, plat::float16, \
ops::functor>);
FOR_EACH_KERNEL_FUNCTOR
(
REGISTER_OVERFLOW_CUDA_KERNEL
);
paddle/fluid/operators/isfinite_op.h
0 → 100644
浏览文件 @
5cedfb60
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/transform.h"
namespace
paddle
{
namespace
operators
{
struct
InfinityFunctor
{
void
operator
()(
const
framework
::
Tensor
&
tensor
,
framework
::
Tensor
*
out
)
{
framework
::
TensorContainsInf
(
tensor
,
out
);
}
};
struct
NANFunctor
{
void
operator
()(
const
framework
::
Tensor
&
tensor
,
framework
::
Tensor
*
out
)
{
framework
::
TensorContainsNAN
(
tensor
,
out
);
}
};
struct
IsfiniteFunctor
{
void
operator
()(
const
framework
::
Tensor
&
tensor
,
framework
::
Tensor
*
out
)
{
framework
::
TensorIsfinite
(
tensor
,
out
);
}
};
template
<
typename
DeviceContext
,
typename
T
,
typename
Functor
>
class
OverflowKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
virtual
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
auto
*
x
=
ctx
.
InputVar
(
"X"
);
auto
*
out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Out"
);
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
Functor
functor
;
if
(
x
->
IsType
<
framework
::
LoDTensor
>
())
{
auto
*
in
=
ctx
.
Input
<
framework
::
Tensor
>
(
"X"
);
functor
(
*
in
,
out
);
}
else
if
(
x
->
IsType
<
framework
::
SelectedRows
>
())
{
auto
&
in
=
ctx
.
Input
<
framework
::
SelectedRows
>
(
"X"
)
->
value
();
functor
(
in
,
out
);
}
else
{
PADDLE_THROW
(
"Unsupported input type."
);
}
}
};
}
// namespace operators
}
// namespace paddle
#define FOR_EACH_KERNEL_FUNCTOR(__macro) \
__macro(isinf, InfinityFunctor); \
__macro(isnan, NANFunctor); \
__macro(isfinite, IsfiniteFunctor);
paddle/fluid/operators/lookup_table_op.cc
浏览文件 @
5cedfb60
...
...
@@ -137,6 +137,7 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
<<
" is set to LoDTensor"
;
block
->
Var
(
out_var_name
)
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
}
block
->
Var
(
out_var_name
)
->
SetDataType
(
block
->
Var
(
"W"
)
->
GetDataType
());
}
};
...
...
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
5cedfb60
...
...
@@ -3,8 +3,8 @@ add_subdirectory(detail)
endif
(
NOT WIN32
)
function
(
math_library TARGET
)
# math_library is a function to create math library.
# The interface is the same as cc_library.
# math_library is a function to create math library.
# The interface is the same as cc_library.
# But it handle split GPU/CPU code and link some common library.
set
(
cc_srcs
)
set
(
cu_srcs
)
...
...
@@ -53,7 +53,7 @@ cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
math_library
(
math_function DEPS blas
)
math_library
(
maxouting
)
math_library
(
pooling
)
math_library
(
selected_rows_functor DEPS selected_rows math_function
)
math_library
(
selected_rows_functor DEPS selected_rows math_function
blas
)
math_library
(
sequence2batch
)
math_library
(
sequence_padding
)
math_library
(
sequence_pooling DEPS math_function
)
...
...
paddle/fluid/operators/math/cpu_lstm_compute.cc
浏览文件 @
5cedfb60
...
...
@@ -13,6 +13,31 @@ limitations under the License. */
namespace
paddle
{
namespace
operators
{
namespace
math
{}
// namespace math
namespace
math
{
#ifdef __AVX__
template
<
>
void
lstm_compute_ctht
<
float
>
(
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
float
*
ht
)
{
namespace
act
=
detail
::
forward
::
avx
;
// gates: W_ch, W_ih, W_fh, W_oh
__m256
c
,
i
,
f
,
o
;
c
=
_mm256_loadu_ps
(
gates
);
i
=
_mm256_loadu_ps
(
gates
+
8
);
f
=
_mm256_loadu_ps
(
gates
+
16
);
o
=
_mm256_loadu_ps
(
gates
+
24
);
/* C_t = C_t-1 * fgated + cand_gated * igated*/
c
=
_mm256_mul_ps
(
act
::
Tanh
(
c
),
act
::
Sigmoid
(
i
));
i
=
_mm256_loadu_ps
(
ct_1
);
f
=
_mm256_mul_ps
(
i
,
act
::
Sigmoid
(
f
));
f
=
_mm256_add_ps
(
c
,
f
);
_mm256_storeu_ps
(
ct
,
f
);
/* H_t = act_cell(C_t) * ogated */
o
=
_mm256_mul_ps
(
act
::
Tanh
(
f
),
act
::
Sigmoid
(
o
));
_mm256_storeu_ps
(
ht
,
o
);
}
#endif
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/cpu_lstm_compute.h
浏览文件 @
5cedfb60
...
...
@@ -48,32 +48,15 @@ namespace forward {
namespace
avx
{
__m256
Sigmoid
(
const
__m256
a
);
__m256
Tanh
(
const
__m256
a
);
}
// namespace avx
}
// namespace forward
}
// namespace detail
template
<
>
void
lstm_compute_ctht
<
float
>
(
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
float
*
ht
)
{
namespace
act
=
detail
::
forward
::
avx
;
// gates: W_ch, W_ih, W_fh, W_oh
__m256
c
,
i
,
f
,
o
;
c
=
_mm256_loadu_ps
(
gates
);
i
=
_mm256_loadu_ps
(
gates
+
8
);
f
=
_mm256_loadu_ps
(
gates
+
16
);
o
=
_mm256_loadu_ps
(
gates
+
24
);
float
*
ht
);
/* C_t = C_t-1 * fgated + cand_gated * igated*/
c
=
_mm256_mul_ps
(
act
::
Tanh
(
c
),
act
::
Sigmoid
(
i
));
i
=
_mm256_loadu_ps
(
ct_1
);
f
=
_mm256_mul_ps
(
i
,
act
::
Sigmoid
(
f
));
f
=
_mm256_add_ps
(
c
,
f
);
_mm256_storeu_ps
(
ct
,
f
);
/* H_t = act_cell(C_t) * ogated */
o
=
_mm256_mul_ps
(
act
::
Tanh
(
f
),
act
::
Sigmoid
(
o
));
_mm256_storeu_ps
(
ht
,
o
);
}
#endif
}
// namespace math
...
...
paddle/fluid/operators/math/depthwise_conv.cu
浏览文件 @
5cedfb60
...
...
@@ -46,17 +46,20 @@ __forceinline__ __device__ unsigned warp_id() {
return
ret
;
}
#define ARG_DEFINE_KernelDepthwiseConv \
const T *const input_data, const T *const filter_data, const int batch_size, \
const int output_channels, const int output_height, \
const int output_width, const int input_channels, \
const int input_height, const int input_width, \
const int filter_multiplier, const int filter_height, \
const int filter_width, const int stride_height, const int stride_width, \
const int padding_height, const int padding_width, \
const int dilate_height, const int dilate_width, T *const output_data
// A Cuda kernel to compute the depthwise convolution forward pass
// in NCHW format.
template
<
typename
T
>
__device__
__inline__
void
KernelDepthwiseConv
(
const
T
*
const
input_data
,
const
T
*
const
filter_data
,
const
int
batch_size
,
const
int
output_channels
,
const
int
output_height
,
const
int
output_width
,
const
int
input_channels
,
const
int
input_height
,
const
int
input_width
,
const
int
filter_multiplier
,
const
int
filter_height
,
const
int
filter_width
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_height
,
const
int
padding_width
,
const
int
dilate_height
,
const
int
dilate_width
,
T
*
const
output_data
)
{
__device__
__inline__
void
KernelDepthwiseConv
(
ARG_DEFINE_KernelDepthwiseConv
)
{
for
(
int
w_out
=
threadIdx
.
x
;
w_out
<
output_width
;
w_out
+=
blockDim
.
x
)
{
for
(
int
h_out
=
threadIdx
.
y
;
h_out
<
output_height
;
h_out
+=
blockDim
.
y
)
{
const
int
batch
=
blockIdx
.
y
;
...
...
@@ -97,42 +100,105 @@ __device__ __inline__ void KernelDepthwiseConv(
}
}
template
<
typename
T
,
int
c_filter_multiplier
,
int
c_stride
>
__global__
void
KernelDepthwiseConvSp
(
const
T
*
const
input_data
,
const
T
*
const
filter_data
,
const
int
batch_size
,
const
int
output_channels
,
const
int
output_height
,
const
int
output_width
,
const
int
input_channels
,
const
int
input_height
,
const
int
input_width
,
const
int
filter_multiplier
,
const
int
filter_height
,
const
int
filter_width
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_height
,
const
int
padding_width
,
const
int
dilate_height
,
const
int
dilate_width
,
T
*
const
output_data
)
{
if
(
c_filter_multiplier
==
0
)
KernelDepthwiseConv
<
T
>
(
input_data
,
filter_data
,
batch_size
,
output_channels
,
output_height
,
output_width
,
input_channels
,
input_height
,
input_width
,
filter_multiplier
,
filter_height
,
filter_width
,
stride_height
,
stride_width
,
padding_height
,
padding_width
,
dilate_height
,
dilate_width
,
output_data
);
template
<
typename
T
,
int
c_filter
>
__device__
__inline__
void
KernelDepthwiseConvCFilter
(
ARG_DEFINE_KernelDepthwiseConv
)
{
const
int
kWeghtSize
=
c_filter
*
c_filter
;
T
r_weight
[
kWeghtSize
];
const
int
batch
=
blockIdx
.
y
;
const
int
c_out
=
blockIdx
.
x
;
const
T
*
weight
=
filter_data
+
c_out
*
c_filter
*
c_filter
;
for
(
int
i
=
0
;
i
<
c_filter
*
c_filter
;
i
++
)
r_weight
[
i
]
=
weight
[
i
];
else
KernelDepthwiseConv
<
T
>
(
input_data
,
filter_data
,
batch_size
,
output_channels
,
output_height
,
output_width
,
input_channels
,
input_height
,
input_width
,
c_filter_multiplier
,
filter_height
,
filter_height
,
c_stride
,
c_stride
,
padding_height
,
padding_width
,
dilate_height
,
dilate_width
,
output_data
);
for
(
int
w_out
=
threadIdx
.
x
;
w_out
<
output_width
;
w_out
+=
blockDim
.
x
)
{
for
(
int
h_out
=
threadIdx
.
y
;
h_out
<
output_height
;
h_out
+=
blockDim
.
y
)
{
const
int
batch
=
blockIdx
.
y
;
const
int
c_out
=
blockIdx
.
x
;
const
int
c_in
=
c_out
/
filter_multiplier
;
T
value
=
0
;
const
int
h_in_start
=
-
padding_height
+
h_out
*
stride_height
;
const
int
w_in_start
=
-
padding_width
+
w_out
*
stride_width
;
const
int
h_in_end
=
h_in_start
+
c_filter
*
dilate_height
;
const
int
w_in_end
=
w_in_start
+
c_filter
*
dilate_width
;
const
int
in_offset
=
((
batch
*
input_channels
+
c_in
)
*
input_height
)
*
input_width
;
const
int
h_end
=
h_in_end
<
input_height
?
h_in_end
:
input_height
;
const
int
w_end
=
w_in_end
<
input_width
?
w_in_end
:
input_width
;
const
int
h_start
=
h_in_start
>
0
?
h_in_start
:
0
;
const
int
w_start
=
w_in_start
>
0
?
w_in_start
:
0
;
for
(
int
h_in
=
h_in_start
,
h_f
=
0
;
h_f
<
c_filter
;
h_in
+=
dilate_height
,
h_f
++
)
{
for
(
int
w_in
=
w_in_start
,
w_f
=
0
;
w_f
<
c_filter
;
w_in
+=
dilate_width
,
w_f
++
)
{
if
(
h_in
>=
0
&&
h_in
<
input_height
&&
w_in
>=
0
&&
w_in
<
input_width
)
{
const
int
offset
=
in_offset
+
h_in
*
input_width
+
w_in
;
value
+=
r_weight
[
h_f
*
c_filter
+
w_f
]
*
input_data
[
offset
];
}
}
}
int
index
=
((
batch
*
gridDim
.
x
+
c_out
)
*
output_height
+
h_out
)
*
output_width
+
w_out
;
output_data
[
index
]
=
value
;
}
}
}
template
<
typename
T
,
int
c_filter_multiplier
,
int
c_stride
,
int
c_filter
>
__global__
void
KernelDepthwiseConvSp
(
ARG_DEFINE_KernelDepthwiseConv
)
{
if
(
c_filter_multiplier
==
0
)
{
if
(
c_filter
==
-
1
)
KernelDepthwiseConv
<
T
>
(
input_data
,
filter_data
,
batch_size
,
output_channels
,
output_height
,
output_width
,
input_channels
,
input_height
,
input_width
,
filter_multiplier
,
filter_height
,
filter_width
,
stride_height
,
stride_width
,
padding_height
,
padding_width
,
dilate_height
,
dilate_width
,
output_data
);
else
KernelDepthwiseConvCFilter
<
T
,
c_filter
>
(
input_data
,
filter_data
,
batch_size
,
output_channels
,
output_height
,
output_width
,
input_channels
,
input_height
,
input_width
,
filter_multiplier
,
filter_height
,
filter_width
,
stride_height
,
stride_width
,
padding_height
,
padding_width
,
dilate_height
,
dilate_width
,
output_data
);
}
else
{
if
(
c_filter
==
-
1
)
KernelDepthwiseConv
<
T
>
(
input_data
,
filter_data
,
batch_size
,
output_channels
,
output_height
,
output_width
,
input_channels
,
input_height
,
input_width
,
c_filter_multiplier
,
filter_height
,
filter_height
,
c_stride
,
c_stride
,
padding_height
,
padding_width
,
dilate_height
,
dilate_width
,
output_data
);
else
KernelDepthwiseConvCFilter
<
T
,
c_filter
>
(
input_data
,
filter_data
,
batch_size
,
output_channels
,
output_height
,
output_width
,
input_channels
,
input_height
,
input_width
,
c_filter_multiplier
,
filter_height
,
filter_height
,
c_stride
,
c_stride
,
padding_height
,
padding_width
,
dilate_height
,
dilate_width
,
output_data
);
}
}
// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
#define ARG_DEFINE_KernelDepthwiseConvInputGrad \
const T *const output_grad_data, const T *const filter_data, \
const int batch_size, const int output_channels, \
const int output_height, const int output_width, \
const int input_channels, const int input_height, const int input_width, \
const int filter_multiplier, const int filter_height, \
const int filter_width, const int stride_height, const int stride_width, \
const int padding_height, const int padding_width, \
const int dilate_height, const int dilate_width, \
T *const input_grad_data
template
<
typename
T
>
__device__
__inline__
void
KernelDepthwiseConvInputGrad
(
const
T
*
const
output_grad_data
,
const
T
*
const
filter_data
,
const
int
batch_size
,
const
int
output_channels
,
const
int
output_height
,
const
int
output_width
,
const
int
input_channels
,
const
int
input_height
,
const
int
input_width
,
const
int
filter_multiplier
,
const
int
filter_height
,
const
int
filter_width
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_height
,
const
int
padding_width
,
const
int
dilate_height
,
const
int
dilate_width
,
T
*
const
input_grad_data
)
{
ARG_DEFINE_KernelDepthwiseConvInputGrad
)
{
for
(
int
w_in
=
threadIdx
.
x
;
w_in
<
input_width
;
w_in
+=
blockDim
.
x
)
{
for
(
int
h_in
=
threadIdx
.
y
;
h_in
<
input_height
;
h_in
+=
blockDim
.
y
)
{
const
int
batch
=
blockIdx
.
y
;
...
...
@@ -184,15 +250,67 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad(
}
}
template
<
typename
T
,
int
c_filter_multiplier
,
int
c_stride
>
template
<
typename
T
,
int
c_filter
,
int
c_filter_multiplier
>
__device__
__inline__
void
KernelDepthwiseConvInputGradCFilter
(
ARG_DEFINE_KernelDepthwiseConvInputGrad
)
{
const
int
kWeghtSize
=
c_filter
*
c_filter
*
c_filter_multiplier
+
1
;
T
r_weight
[
kWeghtSize
];
const
int
batch
=
blockIdx
.
y
;
const
int
c_in
=
blockIdx
.
x
;
for
(
int
c_i
=
0
;
c_i
<
filter_multiplier
;
c_i
++
)
{
int
c_out
=
c_in
*
filter_multiplier
+
c_i
;
const
T
*
weight
=
filter_data
+
c_out
*
c_filter
*
c_filter
;
for
(
int
i
=
0
;
i
<
c_filter
*
c_filter
;
i
++
)
r_weight
[
i
+
c_i
*
c_filter
*
c_filter
]
=
weight
[
c_filter
*
c_filter
-
i
-
1
];
}
for
(
int
w_in
=
threadIdx
.
x
;
w_in
<
input_width
;
w_in
+=
blockDim
.
x
)
{
for
(
int
h_in
=
threadIdx
.
y
;
h_in
<
input_height
;
h_in
+=
blockDim
.
y
)
{
const
int
batch
=
blockIdx
.
y
;
const
int
c_in
=
blockIdx
.
x
;
int
h_out_start
=
h_in
-
(
c_filter
-
1
)
*
dilate_height
+
padding_height
;
int
w_out_start
=
w_in
-
(
c_filter
-
1
)
*
dilate_width
+
padding_width
;
T
value
=
0
;
for
(
int
c_i
=
0
;
c_i
<
filter_multiplier
;
c_i
++
)
{
int
c_out
=
c_in
*
filter_multiplier
+
c_i
;
for
(
int
h_out
=
h_out_start
,
h_f
=
0
;
h_f
<
c_filter
;
h_out
+=
dilate_height
,
h_f
++
)
{
for
(
int
w_out
=
w_out_start
,
w_f
=
0
;
w_f
<
c_filter
;
w_out
+=
dilate_width
,
w_f
++
)
{
int
s_h_out
=
h_out
/
stride_height
;
int
s_w_out
=
w_out
/
stride_width
;
if
(
h_out
%
stride_height
==
0
&&
w_out
%
stride_width
==
0
&&
s_h_out
>=
0
&&
s_h_out
<
output_height
&&
s_w_out
>=
0
&&
s_w_out
<
output_width
)
{
const
int
output_grad_offset
=
((
batch
*
output_channels
+
c_out
)
*
output_height
+
s_h_out
)
*
output_width
+
s_w_out
;
value
+=
output_grad_data
[
output_grad_offset
]
*
r_weight
[
h_f
*
c_filter
+
w_f
+
c_i
*
c_filter
*
c_filter
];
}
}
}
}
int
index
=
((
batch
*
gridDim
.
x
+
c_in
)
*
input_height
+
h_in
)
*
input_width
+
w_in
;
input_grad_data
[
index
]
=
value
;
}
}
}
template
<
typename
T
,
int
c_filter_multiplier
,
int
c_stride
,
int
c_filter
>
__global__
void
KernelDepthwiseConvInputGradSp
(
const
T
*
const
output_grad_data
,
const
T
*
const
filter_data
,
const
int
batch_size
,
const
int
output_channels
,
const
int
output_height
,
const
int
output_width
,
const
int
input_channels
,
const
int
input_height
,
const
int
input_width
,
const
int
filter_multiplier
,
const
int
filter_height
,
const
int
filter_width
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_height
,
const
int
padding_width
,
const
int
dilate_height
,
const
int
dilate_width
,
T
*
const
input_grad_data
)
{
ARG_DEFINE_KernelDepthwiseConvInputGrad
)
{
if
(
c_filter_multiplier
==
0
)
KernelDepthwiseConvInputGrad
<
T
>
(
output_grad_data
,
filter_data
,
batch_size
,
output_channels
,
...
...
@@ -200,13 +318,20 @@ __global__ void KernelDepthwiseConvInputGradSp(
filter_multiplier
,
filter_height
,
filter_width
,
stride_height
,
stride_width
,
padding_height
,
padding_width
,
dilate_height
,
dilate_width
,
input_grad_data
);
else
else
if
(
c_filter
==
-
1
)
KernelDepthwiseConvInputGrad
<
T
>
(
output_grad_data
,
filter_data
,
batch_size
,
output_channels
,
output_height
,
output_width
,
input_channels
,
input_height
,
input_width
,
c_filter_multiplier
,
filter_height
,
filter_width
,
c_stride
,
c_stride
,
padding_height
,
padding_width
,
dilate_height
,
dilate_width
,
input_grad_data
);
else
KernelDepthwiseConvInputGradCFilter
<
T
,
c_filter
,
c_filter_multiplier
>
(
output_grad_data
,
filter_data
,
batch_size
,
output_channels
,
output_height
,
output_width
,
input_channels
,
input_height
,
input_width
,
c_filter_multiplier
,
filter_height
,
filter_width
,
c_stride
,
c_stride
,
padding_height
,
padding_width
,
dilate_height
,
dilate_width
,
input_grad_data
);
}
// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
...
...
@@ -325,12 +450,14 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
dim3
threads
(
std
::
min
(
output_width
,
thread
),
blocks
,
1
);
dim3
grid
(
output_channels
,
batch_size
,
1
);
int
filter_multiplier
=
output_channels
/
input_channels
;
#define check_case(c_filter_multiplier, c_stride
)
\
#define check_case(c_filter_multiplier, c_stride
, c_filter)
\
if (c_filter_multiplier == 0 || \
filter_multiplier == c_filter_multiplier && \
stride_height == stride_width && stride_height == c_stride) { \
KernelDepthwiseConvSp<T, c_filter_multiplier, \
c_stride><<<grid, threads, 0, context.stream()>>>( \
stride_height == stride_width && stride_height == c_stride && \
(ksize_height == ksize_width && ksize_height == c_filter || \
c_filter == -1)) { \
KernelDepthwiseConvSp<T, c_filter_multiplier, c_stride, \
c_filter><<<grid, threads, 0, context.stream()>>>( \
input_data, filter_data, batch_size, output_channels, output_height, \
output_width, input_channels, input_height, input_width, \
filter_multiplier, ksize_height, ksize_width, stride_height, \
...
...
@@ -338,11 +465,17 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
dilate_width, output_data); \
return; \
}
check_case
(
1
,
1
);
check_case
(
1
,
2
);
// NOTE(liangdun): 0,0 for other case
// add other case if needed, e.g. check_case(2^n,1)
check_case
(
0
,
0
);
check_case
(
1
,
1
,
3
);
check_case
(
1
,
1
,
5
);
check_case
(
1
,
1
,
-
1
);
check_case
(
1
,
2
,
3
);
check_case
(
1
,
2
,
5
);
check_case
(
1
,
2
,
-
1
);
check_case
(
0
,
0
,
3
);
check_case
(
0
,
0
,
5
);
check_case
(
0
,
0
,
-
1
);
// NOTE(liangdun): 0,0 for other case
// add other case if needed, e.g. check_case(2^n,1)
#undef check_case
}
};
...
...
@@ -384,13 +517,15 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
dim3
grid
(
input_channels
,
batch_size
,
1
);
int
filter_multiplier
=
output_channels
/
input_channels
;
#define check_case(c_filter_multiplier, c_stride
)
\
#define check_case(c_filter_multiplier, c_stride
, c_filter)
\
if (c_filter_multiplier == 0 || \
filter_multiplier == c_filter_multiplier && \
stride_height == stride_width && stride_height == c_stride) { \
stride_height == stride_width && stride_height == c_stride && \
(ksize_height == ksize_width && ksize_height == c_filter || \
c_filter == -1)) { \
KernelDepthwiseConvInputGradSp< \
T, c_filter_multiplier,
\
c_
stride
><<<grid, threads, 0, context.stream()>>>( \
T, c_filter_multiplier,
c_stride,
\
c_
filter
><<<grid, threads, 0, context.stream()>>>( \
output_grad_data, filter_data, batch_size, output_channels, \
output_height, output_width, input_channels, input_height, \
input_width, filter_multiplier, ksize_height, ksize_width, \
...
...
@@ -398,11 +533,21 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
dilate_height, dilate_width, input_grad_data); \
return; \
}
check_case
(
1
,
1
);
check_case
(
1
,
2
);
// NOTE(liangdun): 0,0 for other case
// add other case if needed, e.g. check_case(2^n,1)
check_case
(
0
,
0
);
check_case
(
1
,
1
,
3
);
check_case
(
1
,
1
,
5
);
check_case
(
1
,
1
,
-
1
);
check_case
(
1
,
2
,
3
);
check_case
(
1
,
2
,
5
);
check_case
(
1
,
2
,
-
1
);
check_case
(
2
,
1
,
3
);
check_case
(
2
,
1
,
5
);
check_case
(
2
,
1
,
-
1
);
check_case
(
2
,
2
,
3
);
check_case
(
2
,
2
,
5
);
check_case
(
2
,
2
,
-
1
);
check_case
(
0
,
0
,
-
1
);
// NOTE(liangdun): 0,0 for other case
// add other case if needed, e.g. check_case(2^n,1)
#undef check_case
}
};
...
...
paddle/fluid/operators/math/math_function.cc
浏览文件 @
5cedfb60
...
...
@@ -13,6 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#endif
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
...
...
paddle/fluid/operators/math/math_function.h
浏览文件 @
5cedfb60
...
...
@@ -13,18 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
// remove typedef in openblas
#undef FLOAT
#undef INT
#undef SIZE
#endif
#include <cmath>
#include <vector>
...
...
paddle/fluid/operators/math/selected_rows_functor.cc
浏览文件 @
5cedfb60
...
...
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <map>
#include <set>
#include <vector>
#include "paddle/fluid/operators/math/
math_function
.h"
#include "paddle/fluid/operators/math/
blas
.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace
paddle
{
...
...
@@ -150,6 +151,45 @@ template struct SelectedRowsAddTo<platform::CPUDeviceContext, double>;
template
struct
SelectedRowsAddTo
<
platform
::
CPUDeviceContext
,
int
>;
template
struct
SelectedRowsAddTo
<
platform
::
CPUDeviceContext
,
int64_t
>;
template
<
typename
T
>
struct
SelectedRowsSumTo
<
platform
::
CPUDeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
std
::
vector
<
framework
::
SelectedRows
*>&
input1
,
const
std
::
vector
<
int64_t
>&
input2_offsets
,
framework
::
SelectedRows
*
input2
)
{
// Ensure all selected rows have the same height
size_t
size
=
0u
;
for
(
auto
iter
=
input1
.
begin
();
iter
!=
input1
.
end
();
++
iter
)
{
auto
&
in_rows
=
(
*
iter
)
->
rows
();
size
+=
in_rows
.
end
()
-
in_rows
.
begin
();
auto
in1_height
=
(
*
iter
)
->
height
();
PADDLE_ENFORCE_EQ
(
in1_height
,
input2
->
height
());
}
// concat rows
std
::
vector
<
int64_t
>
in2_rows
;
in2_rows
.
reserve
(
in2_rows
.
size
()
+
size
);
for
(
auto
iter
=
input1
.
begin
();
iter
!=
input1
.
end
();
++
iter
)
{
const
framework
::
Vector
<
int64_t
>&
in_rows
=
(
*
iter
)
->
rows
();
in2_rows
.
insert
(
in2_rows
.
end
(),
in_rows
.
begin
(),
in_rows
.
end
());
}
input2
->
set_rows
(
in2_rows
);
auto
*
in2_value
=
input2
->
mutable_value
();
auto
*
in2_data
=
in2_value
->
data
<
T
>
();
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
context
);
size_t
offset
=
0u
;
for
(
size_t
i
=
0u
;
i
!=
input1
.
size
();
++
i
)
{
auto
&
in_value
=
input1
[
i
]
->
value
();
const
auto
*
in_data
=
in_value
.
data
<
T
>
();
offset
+=
input2_offsets
[
i
];
blas
.
VCOPY
(
in_value
.
numel
(),
in_data
,
in2_data
+
offset
);
}
}
};
template
struct
SelectedRowsSumTo
<
platform
::
CPUDeviceContext
,
float
>;
template
struct
SelectedRowsSumTo
<
platform
::
CPUDeviceContext
,
double
>;
template
<
typename
T
>
struct
SelectedRowsAddToTensor
<
platform
::
CPUDeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
...
...
@@ -207,35 +247,45 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
const
framework
::
SelectedRows
&
input
,
framework
::
SelectedRows
*
output
)
{
framework
::
SelectedRows
&
out
=
*
output
;
auto
input_rows
=
input
.
rows
();
std
::
set
<
int64_t
>
row_set
(
input_rows
.
begin
(),
input_rows
.
end
());
std
::
vector
<
int64_t
>
merge_rows
(
row_set
.
begin
(),
row_set
.
end
());
std
::
vector
<
int64_t
>
input_rows
(
input
.
rows
());
auto
input_width
=
input
.
value
().
dims
()[
1
];
out
.
set_rows
(
merge_rows
);
std
::
map
<
int64_t
,
std
::
vector
<
int64_t
>>
merge_row_map
;
for
(
size_t
i
=
0
;
i
<
input_rows
.
size
();
++
i
)
{
merge_row_map
[
input_rows
[
i
]].
push_back
(
i
);
}
std
::
vector
<
int64_t
>
merge_rows
(
merge_row_map
.
size
());
size_t
idx
=
0
;
int64_t
input_width
=
input
.
value
().
dims
()[
1
];
out
.
set_height
(
input
.
height
());
out
.
mutable_value
()
->
mutable_data
<
T
>
(
T
*
out_data
=
out
.
mutable_value
()
->
mutable_data
<
T
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
merge_rows
.
size
()),
input_width
}),
context
.
GetPlace
());
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
,
out
.
mutable_value
(),
0.0
);
auto
*
out_data
=
out
.
mutable_value
()
->
data
<
T
>
();
auto
*
input_data
=
input
.
value
().
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
input_rows
.
size
();
i
++
)
{
size_t
out_i
=
FindPos
(
merge_rows
,
input_rows
[
i
]);
for
(
int64_t
j
=
0
;
j
<
input_width
;
j
++
)
{
out_data
[
out_i
*
input_width
+
j
]
+=
input_data
[
i
*
input_width
+
j
];
const
T
*
in_data
=
input
.
value
().
data
<
T
>
();
for
(
auto
&
row_pair
:
merge_row_map
)
{
auto
*
out_ptr
=
out_data
+
idx
*
input_width
;
auto
&
rows
=
row_pair
.
second
;
merge_rows
[
idx
]
=
row_pair
.
first
;
++
idx
;
// rows.size() is always larger than 0
std
::
memcpy
(
out_ptr
,
in_data
+
rows
[
0
]
*
input_width
,
sizeof
(
T
)
*
input_width
);
for
(
size_t
i
=
1
;
i
<
rows
.
size
();
++
i
)
{
auto
*
in_ptr
=
in_data
+
rows
[
i
]
*
input_width
;
for
(
int64_t
j
=
0
;
j
<
input_width
;
++
j
)
{
out_ptr
[
j
]
+=
in_ptr
[
j
];
}
}
}
out
.
set_rows
(
merge_rows
);
}
};
template
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
float
>;
template
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
double
>;
template
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
int
>;
template
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
int64_t
>;
...
...
paddle/fluid/operators/math/selected_rows_functor.h
浏览文件 @
5cedfb60
...
...
@@ -12,8 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#define INLINE_FOR2(sizei, sizej) \
...
...
@@ -49,6 +55,15 @@ struct SelectedRowsAddTo {
const
int64_t
input2_offset
,
framework
::
SelectedRows
*
input2
);
};
// input2 = [all input in input1] + input2
template
<
typename
DeviceContext
,
typename
T
>
struct
SelectedRowsSumTo
{
void
operator
()(
const
DeviceContext
&
context
,
const
std
::
vector
<
framework
::
SelectedRows
*>&
input1
,
const
std
::
vector
<
int64_t
>&
input2_offsets
,
framework
::
SelectedRows
*
input2
);
};
// input2 = input1 + input2
template
<
typename
DeviceContext
,
typename
T
>
struct
SelectedRowsAddToTensor
{
...
...
@@ -70,6 +85,104 @@ struct MergeAdd {
framework
::
SelectedRows
*
output
);
};
template
<
>
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
float
>
{
framework
::
SelectedRows
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
SelectedRows
&
input
)
{
framework
::
SelectedRows
out
;
(
*
this
)(
context
,
input
,
&
out
);
return
out
;
}
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
SelectedRows
&
input
,
framework
::
SelectedRows
*
output
)
{
framework
::
SelectedRows
&
out
=
*
output
;
std
::
vector
<
int64_t
>
input_rows
(
input
.
rows
());
std
::
map
<
int64_t
,
std
::
vector
<
int64_t
>>
merge_row_map
;
for
(
size_t
i
=
0
;
i
<
input_rows
.
size
();
++
i
)
{
merge_row_map
[
input_rows
[
i
]].
push_back
(
i
);
}
std
::
vector
<
int64_t
>
merge_rows
(
merge_row_map
.
size
());
size_t
idx
=
0
;
int64_t
input_width
=
input
.
value
().
dims
()[
1
];
out
.
set_height
(
input
.
height
());
auto
*
out_data
=
out
.
mutable_value
()
->
mutable_data
<
float
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
merge_rows
.
size
()),
input_width
}),
context
.
GetPlace
());
auto
*
in_data
=
input
.
value
().
data
<
float
>
();
auto
blas
=
GetBlas
<
platform
::
CPUDeviceContext
,
float
>
(
context
);
for
(
auto
&
row_pair
:
merge_row_map
)
{
auto
*
out_ptr
=
out_data
+
idx
*
input_width
;
auto
&
rows
=
row_pair
.
second
;
merge_rows
[
idx
]
=
row_pair
.
first
;
++
idx
;
// rows.size() is always larger than 0
blas
.
VCOPY
(
input_width
,
in_data
+
rows
[
0
]
*
input_width
,
out_ptr
);
for
(
size_t
i
=
1
;
i
<
rows
.
size
();
++
i
)
{
blas
.
AXPY
(
input_width
,
1.
,
in_data
+
rows
[
i
]
*
input_width
,
out_ptr
);
}
}
out
.
set_rows
(
merge_rows
);
}
};
template
<
>
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
double
>
{
framework
::
SelectedRows
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
SelectedRows
&
input
)
{
framework
::
SelectedRows
out
;
(
*
this
)(
context
,
input
,
&
out
);
return
out
;
}
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
SelectedRows
&
input
,
framework
::
SelectedRows
*
output
)
{
framework
::
SelectedRows
&
out
=
*
output
;
std
::
vector
<
int64_t
>
input_rows
(
input
.
rows
());
std
::
map
<
int64_t
,
std
::
vector
<
int64_t
>>
merge_row_map
;
for
(
size_t
i
=
0
;
i
<
input_rows
.
size
();
++
i
)
{
merge_row_map
[
input_rows
[
i
]].
push_back
(
i
);
}
std
::
vector
<
int64_t
>
merge_rows
(
merge_row_map
.
size
());
size_t
idx
=
0
;
int64_t
input_width
=
input
.
value
().
dims
()[
1
];
out
.
set_height
(
input
.
height
());
auto
*
out_data
=
out
.
mutable_value
()
->
mutable_data
<
double
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
merge_rows
.
size
()),
input_width
}),
context
.
GetPlace
());
auto
*
in_data
=
input
.
value
().
data
<
double
>
();
auto
blas
=
GetBlas
<
platform
::
CPUDeviceContext
,
double
>
(
context
);
for
(
auto
&
row_pair
:
merge_row_map
)
{
auto
*
out_ptr
=
out_data
+
idx
*
input_width
;
auto
&
rows
=
row_pair
.
second
;
merge_rows
[
idx
]
=
row_pair
.
first
;
++
idx
;
// rows.size() is always larger than 0
blas
.
VCOPY
(
input_width
,
in_data
+
rows
[
0
]
*
input_width
,
out_ptr
);
for
(
size_t
i
=
1
;
i
<
rows
.
size
();
++
i
)
{
blas
.
AXPY
(
input_width
,
1.
,
in_data
+
rows
[
i
]
*
input_width
,
out_ptr
);
}
}
out
.
set_rows
(
merge_rows
);
}
};
template
<
typename
DeviceContext
,
typename
T
>
struct
Add
{
framework
::
SelectedRows
operator
()(
const
DeviceContext
&
context
,
...
...
paddle/fluid/operators/math/selected_rows_functor_test.cc
浏览文件 @
5cedfb60
...
...
@@ -219,3 +219,174 @@ TEST(selected_rows_functor, cpu_add_to) {
// row9: 2.0 + 3.0
EXPECT_EQ
(
tensor1_data
[
9
*
row_numel
+
6
],
5.0
);
}
TEST
(
selected_rows_functor
,
cpu_merge_add_float
)
{
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
paddle
::
operators
::
math
::
SetConstant
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
functor
;
int64_t
height
=
10
;
int64_t
row_numel
=
10
;
std
::
vector
<
int64_t
>
rows
{
0
,
4
,
4
,
7
};
std
::
unique_ptr
<
paddle
::
framework
::
SelectedRows
>
selected_rows
{
new
paddle
::
framework
::
SelectedRows
(
rows
,
height
)};
auto
*
in_value
=
selected_rows
->
mutable_value
();
in_value
->
mutable_data
<
float
>
(
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
rows
.
size
()),
row_numel
}),
cpu_place
);
functor
(
ctx
,
in_value
,
1.0
);
std
::
unique_ptr
<
paddle
::
framework
::
SelectedRows
>
output
{
new
paddle
::
framework
::
SelectedRows
()};
paddle
::
operators
::
math
::
scatter
::
MergeAdd
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
merge_add_functor
;
merge_add_functor
(
ctx
,
*
selected_rows
,
output
.
get
());
auto
out_height
=
output
->
height
();
EXPECT_EQ
(
out_height
,
height
);
auto
&
out_rows
=
output
->
rows
();
EXPECT_EQ
(
out_rows
[
0
],
0
);
EXPECT_EQ
(
out_rows
[
1
],
4
);
EXPECT_EQ
(
out_rows
[
2
],
7
);
auto
*
out_data
=
output
->
value
().
data
<
float
>
();
EXPECT_EQ
(
out_data
[
0
*
row_numel
],
1.0
);
EXPECT_EQ
(
out_data
[
1
*
row_numel
],
2.0
);
EXPECT_EQ
(
out_data
[
2
*
row_numel
],
1.0
);
}
TEST
(
selected_rows_functor
,
cpu_merge_add_int
)
{
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
paddle
::
operators
::
math
::
SetConstant
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
functor
;
int64_t
height
=
10
;
int64_t
row_numel
=
10
;
std
::
vector
<
int64_t
>
rows
{
0
,
4
,
4
,
7
};
std
::
unique_ptr
<
paddle
::
framework
::
SelectedRows
>
selected_rows
{
new
paddle
::
framework
::
SelectedRows
(
rows
,
height
)};
auto
*
in_value
=
selected_rows
->
mutable_value
();
in_value
->
mutable_data
<
int
>
(
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
rows
.
size
()),
row_numel
}),
cpu_place
);
functor
(
ctx
,
in_value
,
1
);
std
::
unique_ptr
<
paddle
::
framework
::
SelectedRows
>
output
{
new
paddle
::
framework
::
SelectedRows
()};
paddle
::
operators
::
math
::
scatter
::
MergeAdd
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
merge_add_functor
;
merge_add_functor
(
ctx
,
*
selected_rows
,
output
.
get
());
auto
out_height
=
output
->
height
();
EXPECT_EQ
(
out_height
,
height
);
auto
&
out_rows
=
output
->
rows
();
EXPECT_EQ
(
out_rows
[
0
],
0
);
EXPECT_EQ
(
out_rows
[
1
],
4
);
EXPECT_EQ
(
out_rows
[
2
],
7
);
auto
*
out_data
=
output
->
value
().
data
<
int
>
();
EXPECT_EQ
(
out_data
[
0
*
row_numel
],
1
);
EXPECT_EQ
(
out_data
[
1
*
row_numel
],
2
);
EXPECT_EQ
(
out_data
[
2
*
row_numel
],
1
);
}
TEST
(
selected_rows_functor
,
cpu_sum_to
)
{
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
paddle
::
operators
::
math
::
SetConstant
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
functor
;
int64_t
height
=
10
;
int64_t
row_numel
=
10
;
std
::
vector
<
int64_t
>
rows1
{
0
,
4
,
7
};
std
::
unique_ptr
<
paddle
::
framework
::
SelectedRows
>
selected_rows1
{
new
paddle
::
framework
::
SelectedRows
(
rows1
,
height
)};
auto
*
in1_value
=
selected_rows1
->
mutable_value
();
in1_value
->
mutable_data
<
float
>
(
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
rows1
.
size
()),
row_numel
}),
cpu_place
);
functor
(
ctx
,
in1_value
,
1.0
);
std
::
vector
<
int64_t
>
rows2
{
0
,
5
,
7
,
9
};
std
::
unique_ptr
<
paddle
::
framework
::
SelectedRows
>
selected_rows2
{
new
paddle
::
framework
::
SelectedRows
(
rows2
,
height
)};
auto
*
in2_value
=
selected_rows2
->
mutable_value
();
in2_value
->
mutable_data
<
float
>
(
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
rows2
.
size
()),
row_numel
}),
cpu_place
);
functor
(
ctx
,
in2_value
,
2.0
);
std
::
unique_ptr
<
paddle
::
framework
::
SelectedRows
>
output
{
new
paddle
::
framework
::
SelectedRows
()};
output
->
set_height
(
height
);
auto
*
out_value
=
output
->
mutable_value
();
// simplely concat two SelectedRows
out_value
->
mutable_data
<
float
>
(
paddle
::
framework
::
make_ddim
({
7
,
10
}),
cpu_place
);
paddle
::
operators
::
math
::
SelectedRowsSumTo
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
sum_to_functor
;
sum_to_functor
(
ctx
,
std
::
vector
<
paddle
::
framework
::
SelectedRows
*>
(
{
selected_rows1
.
get
(),
selected_rows2
.
get
()}),
std
::
vector
<
int64_t
>
({
0
,
in1_value
->
numel
()}),
output
.
get
());
auto
out_height
=
output
->
height
();
EXPECT_EQ
(
out_height
,
height
);
auto
&
out_rows
=
output
->
rows
();
// input1 rows
EXPECT_EQ
(
out_rows
[
0
],
0
);
EXPECT_EQ
(
out_rows
[
1
],
4
);
EXPECT_EQ
(
out_rows
[
2
],
7
);
// input2 rows
EXPECT_EQ
(
out_rows
[
3
],
0
);
EXPECT_EQ
(
out_rows
[
4
],
5
);
EXPECT_EQ
(
out_rows
[
5
],
7
);
EXPECT_EQ
(
out_rows
[
6
],
9
);
auto
*
out_data
=
output
->
value
().
data
<
float
>
();
// input1 value
EXPECT_EQ
(
out_data
[
0
*
row_numel
+
0
],
1.0
);
EXPECT_EQ
(
out_data
[
0
*
row_numel
+
8
],
1.0
);
EXPECT_EQ
(
out_data
[
1
*
row_numel
+
1
],
1.0
);
EXPECT_EQ
(
out_data
[
2
*
row_numel
+
6
],
1.0
);
// input2 value
EXPECT_EQ
(
out_data
[
3
*
row_numel
+
3
],
2.0
);
EXPECT_EQ
(
out_data
[
3
*
row_numel
+
8
],
2.0
);
EXPECT_EQ
(
out_data
[
4
*
row_numel
+
4
],
2.0
);
EXPECT_EQ
(
out_data
[
5
*
row_numel
+
7
],
2.0
);
EXPECT_EQ
(
out_data
[
6
*
row_numel
+
9
],
2.0
);
std
::
unique_ptr
<
paddle
::
framework
::
Tensor
>
tensor1
{
new
paddle
::
framework
::
Tensor
()};
tensor1
->
mutable_data
<
float
>
(
paddle
::
framework
::
make_ddim
({
height
,
row_numel
}),
cpu_place
);
functor
(
ctx
,
tensor1
.
get
(),
3.0
);
paddle
::
operators
::
math
::
SelectedRowsAddToTensor
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
add_to_tensor_functor
;
add_to_tensor_functor
(
ctx
,
*
output
,
tensor1
.
get
());
auto
*
tensor1_data
=
tensor1
->
data
<
float
>
();
// row0: 1.0 + 2.0 + 3.0
EXPECT_EQ
(
tensor1_data
[
0
*
row_numel
+
0
],
6.0
);
// row1: 3.0
EXPECT_EQ
(
tensor1_data
[
1
*
row_numel
+
1
],
3.0
);
// row4 : 1.0 + 3.0
EXPECT_EQ
(
tensor1_data
[
4
*
row_numel
+
6
],
4.0
);
// row5: 2.0 + 3.0
EXPECT_EQ
(
tensor1_data
[
5
*
row_numel
+
7
],
5.0
);
// row6: 3.0
EXPECT_EQ
(
tensor1_data
[
6
*
row_numel
+
1
],
3.0
);
// row7: 1.0 + 2.0 + 3.0
EXPECT_EQ
(
tensor1_data
[
7
*
row_numel
+
3
],
6.0
);
// row9: 2.0 + 3.0
EXPECT_EQ
(
tensor1_data
[
9
*
row_numel
+
6
],
5.0
);
}
paddle/fluid/operators/momentum_op.cc
浏览文件 @
5cedfb60
...
...
@@ -33,6 +33,11 @@ class MomentumOp : public framework::OperatorWithKernel {
"Input(velocity) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"LearningRate"
),
"Input(LearningRate) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Param"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Param"
).
front
(),
ctx
->
GetInputsVarType
(
"Param"
).
front
());
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
"Output(ParamOut) of Momentum should not be null."
);
...
...
paddle/fluid/operators/momentum_op.cu
浏览文件 @
5cedfb60
...
...
@@ -46,6 +46,17 @@ template <typename T>
class
MomentumOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
const
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
PADDLE_ENFORCE
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Grad"
).
front
(),
grad_var
->
Type
().
name
());
auto
param_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"VelocityOut"
);
auto
param
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
...
...
paddle/fluid/operators/momentum_op.h
浏览文件 @
5cedfb60
...
...
@@ -23,6 +23,12 @@ template <typename T>
class
MomentumOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
auto
param_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"VelocityOut"
);
auto
param
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
...
...
paddle/fluid/operators/prelu_op.cc
浏览文件 @
5cedfb60
...
...
@@ -49,7 +49,7 @@ class PReluOp : public framework::OperatorWithKernel {
}
else
{
PADDLE_THROW
(
"Unkown mode %s"
,
mode
);
}
ctx
->
S
etOutputDim
(
"Out"
,
x_dim
);
ctx
->
S
hareDim
(
"X"
,
/*->*/
"Out"
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
...
...
paddle/fluid/operators/rmsprop_op.cc
浏览文件 @
5cedfb60
...
...
@@ -32,6 +32,11 @@ class RmspropOp : public framework::OperatorWithKernel {
"Input(Grad) of RmspropOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Moment"
),
"Input(Moment) of RmspropOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Param"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Param"
).
front
(),
ctx
->
GetInputsVarType
(
"Param"
).
front
());
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
"Output(param_out) of RmspropOp should not be null."
);
...
...
paddle/fluid/operators/rnn_memory_helper_op.cc
浏览文件 @
5cedfb60
...
...
@@ -54,7 +54,7 @@ class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase {
"Input(X) of rnn_memory_helper op should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output of rnn_memory_helper op should not be null."
);
ctx
->
S
etOutputDim
(
"Out"
,
ctx
->
GetInputDim
(
"X"
)
);
ctx
->
S
hareDim
(
"X"
,
/*->*/
"Out"
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
};
...
...
paddle/fluid/operators/sequence_conv_op.cc
浏览文件 @
5cedfb60
...
...
@@ -90,8 +90,8 @@ class SequenceConvGradOp : public framework::OperatorWithKernel {
ctx
->
GetInputDim
(
"PaddingData"
));
}
if
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"X"
)))
{
ctx
->
S
etOutputDim
(
framework
::
GradVarName
(
"X"
),
ctx
->
GetInputDim
(
"X"
));
ctx
->
ShareLoD
(
"X"
,
framework
::
GradVarName
(
"X"
));
ctx
->
S
hareDim
(
"X"
,
/*->*/
framework
::
GradVarName
(
"X"
));
ctx
->
ShareLoD
(
"X"
,
/*->*/
framework
::
GradVarName
(
"X"
));
}
if
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Filter"
)))
{
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Filter"
),
...
...
paddle/fluid/operators/sequence_erase_op.cc
浏览文件 @
5cedfb60
...
...
@@ -24,9 +24,9 @@ class SequenceEraseOp : public framework::OperatorWithKernel {
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of SequenceErase
Op
should not be null."
);
"Input(X) of SequenceErase
operator
should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of SequenceErase
Op
should not be null."
);
"Output(Out) of SequenceErase
operator
should not be null."
);
auto
x_dims
=
ctx
->
GetInputDim
(
"X"
);
PADDLE_ENFORCE
(
x_dims
.
size
()
==
2
&&
x_dims
[
1
]
==
1
,
"Input(X) of SequenceEraseOp should be a 2-D LoDTensor "
...
...
paddle/fluid/operators/sequence_pool_op.cc
浏览文件 @
5cedfb60
...
...
@@ -102,8 +102,9 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
for
(
int64_t
i
=
1
;
i
<
og_dims
.
size
();
++
i
)
{
PADDLE_ENFORCE_EQ
(
og_dims
[
i
],
x_dims
[
i
],
"The dimension mismatch."
);
}
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
x_dims
);
ctx
->
ShareLoD
(
"X"
,
framework
::
GradVarName
(
"X"
));
ctx
->
ShareDim
(
"X"
,
/*->*/
framework
::
GradVarName
(
"X"
));
ctx
->
ShareLoD
(
"X"
,
/*->*/
framework
::
GradVarName
(
"X"
));
}
protected:
...
...
paddle/fluid/operators/sequence_reshape_op.cc
浏览文件 @
5cedfb60
...
...
@@ -92,7 +92,7 @@ class SequenceReshapeGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of SequenceReshapeGradOp should not be null."
);
ctx
->
S
etOutputDim
(
framework
::
GradVarName
(
"X"
),
ctx
->
GetInputDim
(
"X"
));
ctx
->
S
hareDim
(
"X"
,
/*->*/
framework
::
GradVarName
(
"X"
));
ctx
->
ShareLoD
(
"X"
,
/*->*/
framework
::
GradVarName
(
"X"
));
}
};
...
...
paddle/fluid/operators/sequence_softmax_op.cc
浏览文件 @
5cedfb60
...
...
@@ -27,7 +27,8 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {
"Input(X) of SequenceSoftmaxOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of SequenceSoftmaxOp should not be null."
);
ctx
->
SetOutputDim
(
"Out"
,
ctx
->
GetInputDim
(
"X"
));
ctx
->
ShareDim
(
"X"
,
/*->*/
"Out"
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
...
...
paddle/fluid/operators/sgd_op.cc
浏览文件 @
5cedfb60
...
...
@@ -21,7 +21,7 @@ class SGDOp : public framework::OperatorWithKernel {
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Param"
),
"Input(Param) of SGDOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Grad"
),
...
...
@@ -42,7 +42,7 @@ class SGDOp : public framework::OperatorWithKernel {
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
"Param"
));
return
framework
::
OpKernelType
(
data_type
,
ctx
.
device_context
());
}
...
...
@@ -50,17 +50,20 @@ class SGDOp : public framework::OperatorWithKernel {
class
SGDOpInferVarType
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
auto
input_var
=
op_desc
.
Input
(
"Param"
)[
0
];
for
(
auto
&
out_var
:
op_desc
.
Output
(
"ParamOut"
))
{
if
(
block
->
FindRecursiveOrCreateVar
(
input_var
).
GetType
()
==
framework
::
proto
::
VarType
::
SELECTED_ROWS
)
{
block
->
FindRecursiveOrCreateVar
(
out_var
).
SetType
(
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
}
else
{
block
->
FindRecursiveOrCreateVar
(
out_var
).
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
auto
input_var_n
=
op_desc
.
Input
(
"Param"
)[
0
];
auto
in_var_type
=
block
->
FindRecursiveOrCreateVar
(
input_var_n
).
GetType
();
PADDLE_ENFORCE
(
in_var_type
==
framework
::
proto
::
VarType
::
SELECTED_ROWS
||
in_var_type
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input Var's type should be LoDtensor or SelectedRows,"
" but the received var(%s)'s type is %s"
,
input_var_n
,
in_var_type
);
for
(
auto
&
out_var_n
:
op_desc
.
Output
(
"ParamOut"
))
{
auto
&
out_var
=
block
->
FindRecursiveOrCreateVar
(
out_var_n
);
if
(
out_var
.
GetType
()
!=
in_var_type
)
{
out_var
.
SetType
(
in_var_type
);
}
}
}
...
...
paddle/fluid/operators/sgd_op.cu
浏览文件 @
5cedfb60
...
...
@@ -56,6 +56,12 @@ template <typename T>
class
SGDOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
auto
*
param
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
auto
*
param_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
*
learning_rate
=
ctx
.
Input
<
framework
::
Tensor
>
(
"LearningRate"
);
...
...
paddle/fluid/operators/shrink_rnn_memory_op.cc
浏览文件 @
5cedfb60
...
...
@@ -151,9 +151,9 @@ class ShrinkRNNMemoryGradInferShape : public framework::InferShapeBase {
void
operator
()(
framework
::
InferShapeContext
*
context
)
const
override
{
PADDLE_ENFORCE
(
context
->
HasInput
(
"X"
));
PADDLE_ENFORCE
(
context
->
HasOutput
(
framework
::
GradVarName
(
"X"
)));
context
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
context
->
GetInputDim
(
"X"
));
context
->
ShareLoD
(
"X"
,
framework
::
GradVarName
(
"X"
));
context
->
ShareDim
(
"X"
,
/*->*/
framework
::
GradVarName
(
"X"
));
context
->
ShareLoD
(
"X"
,
/*->*/
framework
::
GradVarName
(
"X"
));
}
};
...
...
paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
浏览文件 @
5cedfb60
...
...
@@ -40,7 +40,7 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
"The 2nd dimension of Input(X) and Input(Label) should "
"be equal."
);
ctx
->
S
etOutputDim
(
"Out"
,
x_dims
);
ctx
->
S
hareDim
(
"X"
,
/*->*/
"Out"
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
};
...
...
paddle/fluid/operators/tensorrt_engine_op.h
浏览文件 @
5cedfb60
...
...
@@ -34,7 +34,7 @@ namespace operators {
using
FluidDT
=
framework
::
proto
::
VarType_Type
;
using
TRT_DT
=
nvinfer1
::
DataType
;
namespace
{
// NOLINT
namespace
{
TRT_DT
FluidDataType2TRT
(
FluidDT
type
)
{
switch
(
type
)
{
...
...
paddle/fluid/operators/top_k_op.cc
浏览文件 @
5cedfb60
...
...
@@ -30,8 +30,6 @@ class TopkOp : public framework::OperatorWithKernel {
"Output(Indices) of TopkOp should not be null."
);
auto
input_dims
=
ctx
->
GetInputDim
(
"X"
);
PADDLE_ENFORCE_EQ
(
input_dims
.
size
(),
2
,
"Rank of TopK op's input must be 2."
);
const
int
k
=
static_cast
<
int
>
(
ctx
->
Attrs
().
Get
<
int
>
(
"k"
));
PADDLE_ENFORCE_GE
(
k
,
1
,
"k must >= 1"
);
...
...
paddle/fluid/operators/top_k_op.cu
浏览文件 @
5cedfb60
...
...
@@ -256,36 +256,65 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
* 3. go to the second setp, until one thread's topk value is null;
* 4. go to the first setp, until get the topk value.
*/
template
<
typename
T
,
int
MaxLength
,
int
BlockSize
>
__global__
void
KeMatrixTopK
(
T
*
output
,
int
output_stride
,
int64_t
*
indices
,
const
T
*
src
,
int
lds
,
int
dim
,
int
k
)
{
const
T
*
src
,
int
lds
,
int
dim
,
int
k
,
int
grid_dim
,
int
num
)
{
__shared__
Pair
<
T
>
sh_topk
[
BlockSize
];
__shared__
int
maxid
[
BlockSize
/
2
];
const
int
tid
=
threadIdx
.
x
;
const
int
warp
=
threadIdx
.
x
/
32
;
output
+=
blockIdx
.
x
*
output_stride
;
indices
+=
blockIdx
.
x
*
k
;
Pair
<
T
>
topk
[
MaxLength
];
int
beam
=
MaxLength
;
Pair
<
T
>
max
;
bool
is_empty
=
false
;
bool
firststep
=
true
;
const
int
bid
=
blockIdx
.
x
;
for
(
int
i
=
bid
;
i
<
num
;
i
+=
grid_dim
)
{
output
+=
i
*
output_stride
;
indices
+=
i
*
k
;
Pair
<
T
>
topk
[
MaxLength
];
int
beam
=
MaxLength
;
Pair
<
T
>
max
;
bool
is_empty
=
false
;
bool
firststep
=
true
;
for
(
int
k
=
0
;
k
<
MaxLength
;
k
++
)
{
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
}
while
(
k
)
{
ThreadGetTopK
<
T
,
MaxLength
,
BlockSize
>
(
topk
,
&
beam
,
k
,
src
+
i
*
lds
,
&
firststep
,
&
is_empty
,
&
max
,
dim
,
tid
);
for
(
int
k
=
0
;
k
<
MaxLength
;
k
++
)
{
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
sh_topk
[
tid
]
=
topk
[
0
];
BlockReduce
<
T
,
MaxLength
,
BlockSize
>
(
sh_topk
,
maxid
,
topk
,
&
output
,
&
indices
,
&
beam
,
&
k
,
tid
,
warp
);
}
}
while
(
k
)
{
ThreadGetTopK
<
T
,
MaxLength
,
BlockSize
>
(
topk
,
&
beam
,
k
,
src
+
blockIdx
.
x
*
lds
,
&
firststep
,
&
is_empty
,
&
max
,
dim
,
tid
);
sh_topk
[
tid
]
=
topk
[
0
];
BlockReduce
<
T
,
MaxLength
,
BlockSize
>
(
sh_topk
,
maxid
,
topk
,
&
output
,
&
indices
,
&
beam
,
&
k
,
tid
,
warp
);
}
inline
static
int
GetDesiredBlockDim
(
int
dim
)
{
if
(
dim
>
128
)
{
return
256
;
}
else
if
(
dim
>
64
)
{
return
128
;
}
else
if
(
dim
>
32
)
{
return
64
;
}
else
{
return
32
;
}
}
#define FIXED_BLOCK_DIM_BASE(dim, ...) \
case (dim): { \
constexpr auto kBlockDim = (dim); \
__VA_ARGS__; \
} break
#define FIXED_BLOCK_DIM(...) \
FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
template
<
typename
T
>
class
TopkOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
...
...
@@ -310,18 +339,26 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
// NOTE: pass lds and dim same to input width.
// NOTE: old matrix implementation of stride is different to eigen.
// TODO(typhoonzero): refine this kernel.
dim3
threads
(
256
,
1
);
dim3
grid
(
input_height
,
1
);
KeMatrixTopK
<
T
,
5
,
256
><<<
grid
,
threads
,
0
,
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
.
device_context
())
.
stream
()
>>>
(
output_data
,
output
->
dims
()[
1
],
indices_data
,
input_data
,
input_width
,
input_width
,
static_cast
<
int
>
(
k
));
const
int
kMaxHeight
=
2048
;
int
gridx
=
input_height
<
kMaxHeight
?
input_height
:
kMaxHeight
;
auto
&
dev_ctx
=
ctx
.
cuda_device_context
();
switch
(
GetDesiredBlockDim
(
input_width
))
{
FIXED_BLOCK_DIM
(
KeMatrixTopK
<
T
,
5
,
kBlockDim
><<<
gridx
,
kBlockDim
,
0
,
dev_ctx
.
stream
()
>>>
(
output_data
,
output
->
dims
()[
1
],
indices_data
,
input_data
,
input_width
,
input_width
,
static_cast
<
int
>
(
k
),
gridx
,
input_height
));
default:
PADDLE_THROW
(
"Error"
);
}
}
};
#undef FIXED_BLOCK_DIM_BASE
#undef FIXED_BLOCK_DIM
}
// namespace operators
}
// namespace paddle
...
...
paddle/fluid/operators/truncated_gaussian_random_op.cc
浏览文件 @
5cedfb60
...
...
@@ -148,7 +148,7 @@ struct TruncatedNormal {
T
operator
()(
T
value
)
const
{
auto
p
=
a_normal_cdf
+
(
b_normal_cdf
-
a_normal_cdf
)
*
value
;
return
(
std
::
sqrt
(
2.0
)
*
Erfinv
(
2
*
p
-
1
)
+
mean
)
*
std
;
return
std
::
sqrt
(
2.0
)
*
Erfinv
(
2
*
p
-
1
)
*
std
+
mean
;
}
};
...
...
paddle/fluid/operators/truncated_gaussian_random_op.cu
浏览文件 @
5cedfb60
...
...
@@ -42,7 +42,7 @@ struct TruncatedNormal {
rng
.
discard
(
n
);
T
value
=
dist
(
rng
);
auto
p
=
a_normal_cdf
+
(
b_normal_cdf
-
a_normal_cdf
)
*
value
;
return
(
std
::
sqrt
(
2.0
)
*
erfinvf
(
2
*
p
-
1
)
+
mean
)
*
std
;
return
std
::
sqrt
(
2.0
)
*
erfinvf
(
2
*
p
-
1
)
*
std
+
mean
;
}
};
...
...
@@ -52,6 +52,7 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
tensor
=
context
.
Output
<
framework
::
Tensor
>
(
"Out"
);
T
*
data
=
tensor
->
mutable_data
<
T
>
(
context
.
GetPlace
());
unsigned
int
seed
=
static_cast
<
unsigned
int
>
(
context
.
Attr
<
int
>
(
"seed"
));
if
(
seed
==
0
)
{
std
::
random_device
rd
;
...
...
paddle/fluid/operators/uniform_random_op.cc
浏览文件 @
5cedfb60
...
...
@@ -23,14 +23,14 @@ namespace operators {
template
<
typename
T
>
class
CPUUniformRandomKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
framework
::
Tensor
*
tensor
=
nullptr
;
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
framework
::
Tensor
*
tensor
=
nullptr
;
auto
out_var
=
ctx
.
OutputVar
(
"Out"
);
if
(
out_var
->
IsType
<
framework
::
LoDTensor
>
())
{
tensor
=
out_var
->
GetMutable
<
framework
::
LoDTensor
>
();
}
else
if
(
out_var
->
IsType
<
framework
::
SelectedRows
>
())
{
auto
shape
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"shape"
);
auto
*
selected_rows
=
out_var
->
GetMutable
<
framework
::
SelectedRows
>
();
auto
*
selected_rows
=
out_var
->
GetMutable
<
framework
::
SelectedRows
>
();
tensor
=
selected_rows
->
mutable_value
();
tensor
->
Resize
(
framework
::
make_ddim
(
shape
));
selected_rows
->
mutable_rows
()
->
reserve
(
shape
[
0
]);
...
...
@@ -39,7 +39,7 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
"uniform_random_op's output only"
"supports SelectedRows and LoDTensor"
);
}
T
*
data
=
tensor
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
data
=
tensor
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
unsigned
int
seed
=
static_cast
<
unsigned
int
>
(
ctx
.
Attr
<
int
>
(
"seed"
));
std
::
minstd_rand
engine
;
if
(
seed
==
0
)
{
...
...
@@ -60,14 +60,14 @@ class UniformRandomOp : public framework::OperatorWithKernel {
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of UniformRandomOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
Attrs
().
Get
<
float
>
(
"min"
)
<
ctx
->
Attrs
().
Get
<
float
>
(
"max"
),
"uniform_random's min must less then max"
);
auto
&
shape
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"shape"
);
auto
&
shape
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"shape"
);
std
::
vector
<
int64_t
>
temp
;
temp
.
reserve
(
shape
.
size
());
for
(
auto
dim
:
shape
)
{
...
...
@@ -78,7 +78,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
ctx
.
Attr
<
int
>
(
"dtype"
)),
ctx
.
GetPlace
());
...
...
@@ -112,17 +112,17 @@ uniform distribution. The random result is in set [min, max].
class
UniformRandomOpVarTypeInference
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
auto
out_var_name
=
op_desc
.
Output
(
"Out"
).
front
();
if
(
block
->
FindRecursiveOrCreateVar
(
out_var_name
).
GetType
()
==
framework
::
proto
::
VarType
::
SELECTED_ROWS
)
{
block
->
FindRecursiveOrCreateVar
(
out_var_name
)
.
SetType
(
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
}
else
{
block
->
FindRecursiveOrCreateVar
(
out_var_name
)
.
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
auto
var_data_type
=
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
boost
::
get
<
int
>
(
op_desc
.
GetAttr
(
"dtype"
)));
auto
out_var
=
block
->
FindRecursiveOrCreateVar
(
out_var_name
);
if
(
out_var
.
GetType
()
!=
framework
::
proto
::
VarType
::
SELECTED_ROWS
)
{
out_var
.
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
}
out_var
.
SetDataType
(
var_data_type
);
}
};
...
...
paddle/fluid/operators/while_op.cc
浏览文件 @
5cedfb60
...
...
@@ -224,10 +224,12 @@ class WhileGradOp : public framework::OperatorBase {
if
(
cur_scope_iter
==
step_scopes
->
rbegin
())
{
auto
*
var
=
(
*
cur_scope_iter
)
->
FindVar
(
inside_grad_name
);
PADDLE_ENFORCE_NOT_NULL
(
var
,
"Can not find var %s"
,
inside_grad_name
);
PADDLE_ENFORCE
(
var
->
IsType
<
framework
::
LoDTensorArray
>
()
||
var
->
IsType
<
LoDTensor
>
(),
"Currently the type of var only can be LoDTensorArray "
"or LoDTensor."
);
PADDLE_ENFORCE
(
var
->
IsType
<
framework
::
LoDTensorArray
>
()
||
var
->
IsType
<
LoDTensor
>
(),
"Currently the type of var only can be LoDTensorArray, "
"or LoDTensor, but the received var[%s] is %s."
,
inside_grad_name
,
var
->
Type
().
name
());
if
(
var
->
IsType
<
LoDTensor
>
())
{
auto
&
inside_tensor
=
var
->
Get
<
framework
::
LoDTensor
>
();
...
...
paddle/fluid/platform/gpu_info.cc
浏览文件 @
5cedfb60
...
...
@@ -20,8 +20,11 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
DEFINE_double
(
fraction_of_gpu_memory_to_use
,
0.92
,
"Default use 92% of GPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc"
);
"Allocate a trunk of gpu memory that is this fraction of the "
"total gpu memory size. Future memory usage will be allocated "
"from the trunk. If the trunk doesn't have enough gpu memory, "
"additional trunks of the same size will be requested from gpu "
"until the gpu has no memory left for another trunk."
);
namespace
paddle
{
namespace
platform
{
...
...
paddle/fluid/platform/profiler.cc
浏览文件 @
5cedfb60
...
...
@@ -276,7 +276,7 @@ struct EventItem {
// Print results
void
PrintProfiler
(
const
std
::
vector
<
std
::
vector
<
EventItem
>>&
events_table
,
const
std
::
string
&
sorted_domain
,
const
size_t
name_width
,
const
size_t
data_width
,
double
total
)
{
const
size_t
data_width
,
bool
merge_thread
)
{
// Output header information
std
::
cout
<<
"
\n
------------------------->"
<<
" Profiling Report "
...
...
@@ -292,6 +292,10 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
PADDLE_THROW
(
"Invalid profiler state"
,
g_state
);
}
if
(
merge_thread
)
{
std
::
cout
<<
"Note! This Report merge all thread info into one."
<<
std
::
endl
;
}
std
::
cout
<<
"Place: "
<<
place
<<
std
::
endl
;
std
::
cout
<<
"Time unit: ms"
<<
std
::
endl
;
std
::
cout
<<
"Sorted by "
<<
sorted_domain
...
...
@@ -312,8 +316,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
<<
std
::
setw
(
data_width
)
<<
event_item
.
min_time
<<
std
::
setw
(
data_width
)
<<
event_item
.
max_time
<<
std
::
setw
(
data_width
)
<<
event_item
.
ave_time
<<
std
::
setw
(
data_width
)
<<
event_item
.
total_time
/
total
<<
std
::
endl
;
<<
std
::
setw
(
data_width
)
<<
event_item
.
ratio
<<
std
::
endl
;
}
}
std
::
cout
<<
std
::
endl
;
...
...
@@ -321,8 +324,10 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
// Parse the event list and output the profiling report
void
ParseEvents
(
const
std
::
vector
<
std
::
vector
<
Event
>>&
events
,
bool
merge_thread
,
EventSortingKey
sorted_by
=
EventSortingKey
::
kDefault
)
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
if
(
merge_thread
&&
events
.
size
()
<
2
)
return
;
std
::
string
sorted_domain
;
std
::
function
<
bool
(
const
EventItem
&
,
const
EventItem
&
)
>
sorted_func
;
...
...
@@ -361,34 +366,55 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
sorted_domain
=
"event first end time"
;
}
const
std
::
vector
<
std
::
vector
<
Event
>>*
analyze_events
;
std
::
vector
<
std
::
vector
<
Event
>>
merged_events_list
;
if
(
merge_thread
)
{
std
::
vector
<
Event
>
merged_events
;
for
(
int
i
=
0
;
i
<
events
.
size
();
++
i
)
{
for
(
int
j
=
0
;
j
<
events
[
i
].
size
();
++
j
)
{
merged_events
.
push_back
(
events
[
i
][
j
]);
}
}
merged_events_list
.
push_back
(
merged_events
);
analyze_events
=
&
merged_events_list
;
}
else
{
analyze_events
=
&
events
;
}
std
::
vector
<
std
::
vector
<
EventItem
>>
events_table
;
size_t
max_name_width
=
0
;
double
total
=
0.
;
// the total time
for
(
size_t
i
=
0
;
i
<
events
.
size
();
i
++
)
{
for
(
size_t
i
=
0
;
i
<
(
*
analyze_events
).
size
();
i
++
)
{
double
total
=
0.
;
// the total time in one thread
std
::
list
<
Event
>
pushed_events
;
std
::
vector
<
EventItem
>
event_items
;
std
::
unordered_map
<
std
::
string
,
int
>
event_idx
;
for
(
size_t
j
=
0
;
j
<
events
[
i
].
size
();
j
++
)
{
if
(
events
[
i
][
j
].
type
()
==
EventType
::
kPushRange
)
{
pushed_events
.
push_back
(
events
[
i
][
j
]);
}
else
if
(
events
[
i
][
j
].
type
()
==
EventType
::
kPopRange
)
{
for
(
size_t
j
=
0
;
j
<
(
*
analyze_events
)
[
i
].
size
();
j
++
)
{
if
(
(
*
analyze_events
)
[
i
][
j
].
type
()
==
EventType
::
kPushRange
)
{
pushed_events
.
push_back
(
(
*
analyze_events
)
[
i
][
j
]);
}
else
if
(
(
*
analyze_events
)
[
i
][
j
].
type
()
==
EventType
::
kPopRange
)
{
std
::
list
<
Event
>::
reverse_iterator
rit
=
pushed_events
.
rbegin
();
while
(
rit
!=
pushed_events
.
rend
()
&&
rit
->
name
()
!=
events
[
i
][
j
].
name
())
{
rit
->
name
()
!=
(
*
analyze_events
)
[
i
][
j
].
name
())
{
++
rit
;
}
if
(
rit
!=
pushed_events
.
rend
())
{
double
event_time
=
(
g_state
==
ProfilerState
::
kCUDA
||
g_state
==
ProfilerState
::
kAll
)
?
rit
->
CudaElapsedMs
(
events
[
i
][
j
])
:
rit
->
CpuElapsedMs
(
events
[
i
][
j
]);
?
rit
->
CudaElapsedMs
(
(
*
analyze_events
)
[
i
][
j
])
:
rit
->
CpuElapsedMs
(
(
*
analyze_events
)
[
i
][
j
]);
total
+=
event_time
;
std
::
string
event_name
=
"thread"
+
std
::
to_string
(
rit
->
thread_id
())
+
"::"
+
rit
->
name
();
max_name_width
=
std
::
max
(
max_name_width
,
event_name
.
size
());
std
::
string
event_name
;
if
(
merge_thread
)
{
event_name
=
rit
->
name
();
max_name_width
=
std
::
max
(
max_name_width
,
event_name
.
size
());
}
else
{
event_name
=
"thread"
+
std
::
to_string
(
rit
->
thread_id
())
+
"::"
+
rit
->
name
();
max_name_width
=
std
::
max
(
max_name_width
,
event_name
.
size
());
}
if
(
event_idx
.
find
(
event_name
)
==
event_idx
.
end
())
{
event_idx
[
event_name
]
=
event_items
.
size
();
...
...
@@ -413,7 +439,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
pushed_events
.
erase
((
++
rit
).
base
());
}
else
{
LOG
(
WARNING
)
<<
"Cannot find the push marker of event
\'
"
<<
events
[
i
][
j
].
name
()
<<
(
*
analyze_events
)
[
i
][
j
].
name
()
<<
"
\'
, which will be ignored in profiling report."
;
}
}
...
...
@@ -421,6 +447,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
// average time
for
(
auto
&
item
:
event_items
)
{
item
.
ave_time
=
item
.
total_time
/
item
.
calls
;
item
.
ratio
=
item
.
total_time
/
total
;
}
// sort
if
(
sorted_by
!=
EventSortingKey
::
kDefault
)
{
...
...
@@ -438,7 +465,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
}
// Print report
PrintProfiler
(
events_table
,
sorted_domain
,
max_name_width
+
4
,
12
,
total
);
PrintProfiler
(
events_table
,
sorted_domain
,
max_name_width
+
4
,
12
,
merge_thread
);
}
void
DisableProfiler
(
EventSortingKey
sorted_key
,
...
...
@@ -449,7 +477,8 @@ void DisableProfiler(EventSortingKey sorted_key,
Mark
(
"_stop_profiler_"
,
nullptr
);
std
::
vector
<
std
::
vector
<
Event
>>
all_events
=
GetAllEvents
();
ParseEvents
(
all_events
,
sorted_key
);
ParseEvents
(
all_events
,
true
,
sorted_key
);
ParseEvents
(
all_events
,
false
,
sorted_key
);
ResetProfiler
();
DeviceTracer
*
tracer
=
GetDeviceTracer
();
if
(
tracer
->
IsEnabled
())
{
...
...
paddle/fluid/pybind/const_value.cc
浏览文件 @
5cedfb60
...
...
@@ -48,9 +48,6 @@ void BindConstValue(pybind11::module* m) {
op_proto_and_checker_maker
.
def
(
"kOpNameScopeAttrName"
,
framework
::
OpProtoAndCheckerMaker
::
OpNamescopeAttrName
);
op_proto_and_checker_maker
.
def
(
"kOpCreationCallstackAttrName"
,
framework
::
OpProtoAndCheckerMaker
::
OpCreationCallstackAttrName
);
}
}
// namespace pybind
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
5cedfb60
...
...
@@ -157,7 +157,50 @@ PYBIND11_PLUGIN(core) {
.
def
(
"_get_double_element"
,
TensorGetElement
<
double
>
)
.
def
(
"_dtype"
,
[](
Tensor
&
self
)
{
return
ToDataType
(
self
.
type
());
});
py
::
class_
<
LoDTensor
,
Tensor
>
(
m
,
"LoDTensor"
)
py
::
class_
<
LoDTensor
,
Tensor
>
(
m
,
"LoDTensor"
,
R"DOC(
LoDTensor is a Tensor with optional LoD information.
np.array(lod_tensor) can convert LoDTensor to numpy array.
lod_tensor.lod() can retrieve the LoD information.
LoD is short for Level of Details and is usually used for varied sequence
length. You can skip the following comment if you don't need optional LoD.
For example:
A LoDTensor X can look like the example below. It contains 2 sequences.
The first has length 2 and the second has length 3, as described by x.lod.
The first tensor dimension 6=2+3 is calculated from LoD if it's available.
It means the total number of sequence element. In X, each element has 2
columns, hence [6, 2].
x.lod = [[2, 3]]
x.data = [[1, 2], [3, 4],
[5, 6], [7, 8], [9, 10], [11, 12]]
x.shape = [6, 2]
LoD can have multiple levels (for example, a paragraph can have multiple
sentences and a sentence can have multiple words). In the following
LodTensor Y, the lod_level is 2. It means there are 2 sequence, the
first sequence length is 2 (has 2 sub-sequences), the second one's
length is 1. The first sequence's 2 sub-sequences have length 2 and 2,
respectively. And the second sequence's 1 sub-sequence has length 3.
y.lod = [[2 1], [2 2 3]]
y.shape = [2+2+3, ...]
Note:
In above description, LoD is length-based. In Paddle internal
implementation, lod is offset-based. Hence, internally,
y.lod is represented as [[0, 2, 3], [0, 2, 4, 7]] (length-based
equivlent would be [[2-0, 3-2], [2-0, 4-2, 7-4]]).
Sometimes LoD is called recursive_sequence_length to be more
self-explanatory. In this case, it must be length-based. Due to history
reasons. when LoD is called lod in public API, it might be offset-based.
Users should be careful about it.
)DOC"
)
.
def_buffer
(
[](
Tensor
&
self
)
->
py
::
buffer_info
{
return
CastToPyBuffer
(
self
);
})
.
def
(
"__init__"
,
...
...
@@ -620,26 +663,58 @@ All parameter, weight, gradient are variables in Paddle.
// -- python binds for parallel executor.
py
::
class_
<
ParallelExecutor
>
pe
(
m
,
"ParallelExecutor"
);
py
::
class_
<
ExecutionStrategy
>
exec_strategy
(
pe
,
"ExecutionStrategy"
);
py
::
class_
<
ExecutionStrategy
>
exec_strategy
(
pe
,
"ExecutionStrategy"
,
R"DOC(
ExecutionStrategy allows the user to more preciously control how to run
the program in ParallelExecutor by setting the property.
Examples:
.. code-block:: python
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = 4
train_exe = fluid.ParallelExecutor(use_cuda=True,
loss_name=loss.name,
exec_strategy=exec_strategy)
train_loss, = train_exe.run([loss.name], feed=feed_dict)
)DOC"
);
exec_strategy
.
def
(
py
::
init
())
.
def_property
(
"num_threads"
,
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
num_threads_
;
},
[](
ExecutionStrategy
&
self
,
size_t
num_threads
)
{
self
.
num_threads_
=
num_threads
;
})
},
R"DOC(The type is INT, num_threads represents the size of thread pool that
used to run the operators of the current program in ParallelExecutor.
If :math:`num\_threads=1`, all the operators will execute one by one,
but the order maybe difference between iterations.
If it is not set, it will be set in ParallelExecutor according to the
device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU,
:math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor.
if it is not set, ParallelExecutor will get the cpu count by calling
`multiprocessing.cpu_count()`. Default 0.)DOC"
)
.
def_property
(
"use_cuda"
,
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
use_cuda_
;
},
[](
ExecutionStrategy
&
self
,
bool
use_cuda
)
{
self
.
use_cuda_
=
use_cuda
;
})
})
// FIXME(chengduo): Doesn't add doc for 'use_cuda', use_cuda may
// make user confuse, because ParallelExecutor has a parameter named
// 'use_cuda' too, in current implementation, ParallelExecutor's
// 'use_cuda' will rewrite ExecutionStrategy's 'use_cuda'.
.
def_property
(
"allow_op_delay"
,
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
allow_op_delay_
;
},
[](
ExecutionStrategy
&
self
,
bool
allow_op_delay
)
{
self
.
allow_op_delay_
=
allow_op_delay
;
})
},
R"DOC(The type is BOOL, allow_op_delay represents whether to delay the
communication operators to run, it may make the execution faster.
Note that in some models, allow_op_delay may cause program hang. Default False.)DOC"
)
.
def_property
(
"num_iteration_per_drop_scope"
,
[](
const
ExecutionStrategy
&
self
)
{
...
...
@@ -647,7 +722,19 @@ All parameter, weight, gradient are variables in Paddle.
},
[](
ExecutionStrategy
&
self
,
size_t
num_iteration_per_drop_scope
)
{
self
.
num_iteration_per_drop_scope_
=
num_iteration_per_drop_scope
;
});
},
R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
many iterations to clean up the temp variables which
is generated during execution. It may make the execution faster,
because the temp variable's shape maybe the same between two iterations. Default 100.
NOTES:
1. If you fetch data when calling the 'run', the ParallelExecutor
will clean up the temp variables at the end of the current iteration.
2. In some NLP model, it may cause the GPU memory is insufficient,
in this case, you should reduce `num_iteration_per_drop_scope`.
)DOC"
);
exec_strategy
.
def_property
(
"use_experimental_executor"
,
[](
const
ExecutionStrategy
&
self
)
{
...
...
@@ -658,7 +745,22 @@ All parameter, weight, gradient are variables in Paddle.
:
ExecutionStrategy
::
kDefault
;
});
py
::
class_
<
BuildStrategy
>
build_strategy
(
pe
,
"BuildStrategy"
);
py
::
class_
<
BuildStrategy
>
build_strategy
(
pe
,
"BuildStrategy"
,
R"DOC(
BuildStrategy allows the user to more preciously control how to
build the SSA Graph in ParallelExecutor by setting the property.
Examples:
.. code-block:: python
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
train_exe = fluid.ParallelExecutor(use_cuda=True,
loss_name=loss.name,
build_strategy=build_strategy)
train_loss, = train_exe.run([loss.name], feed=feed_dict)
)DOC"
);
py
::
enum_
<
BuildStrategy
::
ReduceStrategy
>
(
build_strategy
,
"ReduceStrategy"
)
.
value
(
"Reduce"
,
BuildStrategy
::
ReduceStrategy
::
kReduce
)
...
...
@@ -676,31 +778,51 @@ All parameter, weight, gradient are variables in Paddle.
[](
const
BuildStrategy
&
self
)
{
return
self
.
reduce_
;
},
[](
BuildStrategy
&
self
,
BuildStrategy
::
ReduceStrategy
strategy
)
{
self
.
reduce_
=
strategy
;
})
},
R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor,
'AllReduce' and 'Reduce'. If you want that all the parameters'
optimization are done on all devices independently, you should choose 'AllReduce';
if you choose 'Reduce', all the parameters' optimization will be evenly distributed
to different devices, and then broadcast the optimized parameter to other devices.
In some models, `Reduce` is faster. Default 'AllReduce'. )DOC"
)
.
def_property
(
"gradient_scale_strategy"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
gradient_scale_
;
},
[](
BuildStrategy
&
self
,
BuildStrategy
::
GradientScaleStrategy
strategy
)
{
self
.
gradient_scale_
=
strategy
;
})
},
R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in
ParallelExecutor, 'CoeffNumDevice', 'One' and 'Customized'. By default,
ParallelExecutor sets the :math:`loss@grad` according to the number of devices.
If you want to customize :math:`loss@grad`, you can choose 'Customized'.
Default 'CoeffNumDevice'.)DOC"
)
.
def_property
(
"debug_graphviz_path"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
debug_graphviz_path_
;
},
[](
BuildStrategy
&
self
,
const
std
::
string
&
path
)
{
self
.
debug_graphviz_path_
=
path
;
})
},
R"DOC(The type is STR, debug_graphviz_path indicate the path that
writing the SSA Graph to file in the form of graphviz, you.
It is useful for debugging. Default "")DOC"
)
.
def_property
(
"enable_data_balance"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
enable_data_balance_
;
},
[](
BuildStrategy
&
self
,
bool
b
)
{
self
.
enable_data_balance_
=
b
;
})
.
def_property
(
"fuse_elewise_add_act_ops"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
fuse_elewise_add_act_ops_
;
},
[](
BuildStrategy
&
self
,
bool
b
)
{
self
.
fuse_elewise_add_act_ops_
=
b
;
})
[](
BuildStrategy
&
self
,
bool
b
)
{
self
.
enable_data_balance_
=
b
;
})
// FIXME(chengudo): enable_data_balance seems not important
.
def_property
(
"fuse_elewise_add_act_ops"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
fuse_elewise_add_act_ops_
;
},
[](
BuildStrategy
&
self
,
bool
b
)
{
self
.
fuse_elewise_add_act_ops_
=
b
;
},
R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
to fuse elementwise_add_op and activation_op,
it may make the execution faster. Default False)DOC"
)
.
def
(
"_create_passes_from_strategy"
,
[](
BuildStrategy
&
self
)
->
std
::
shared_ptr
<
ir
::
PassBuilder
>
{
return
self
.
CreatePassesFromStrategy
();
...
...
paddle/fluid/train/CMakeLists.txt
浏览文件 @
5cedfb60
...
...
@@ -4,7 +4,6 @@ function(train_test TARGET_NAME)
set
(
multiValueArgs ARGS
)
cmake_parse_arguments
(
train_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests
)
set
(
arg_list
""
)
if
(
train_test_ARGS
)
foreach
(
arg
${
train_test_ARGS
}
)
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
5cedfb60
...
...
@@ -395,10 +395,11 @@ EOF
ctest
--output-on-failure
-j
$1
# make install should also be test when unittest
make
install
-j
8
pip
install
${
INSTALL_PREFIX
:-
/paddle/build
}
/opt/paddle/share/wheels/
*
.whl
pip
install
--user
${
INSTALL_PREFIX
:-
/paddle/build
}
/opt/paddle/share/wheels/
*
.whl
if
[[
${
WITH_FLUID_ONLY
:-
OFF
}
==
"OFF"
]]
;
then
paddle version
fi
pip uninstall
-y
paddlepaddle
fi
}
...
...
@@ -597,9 +598,9 @@ EOF
EOF
if
[[
${
WITH_GPU
}
==
"ON"
]]
;
then
NCCL_DEPS
=
"apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda
${
CUDA_MAJOR
}
libnccl-dev=2.2.13-1+cuda
${
CUDA_MAJOR
}
&&
"
NCCL_DEPS
=
"apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda
${
CUDA_MAJOR
}
libnccl-dev=2.2.13-1+cuda
${
CUDA_MAJOR
}
|| true
"
else
NCCL_DEPS
=
""
NCCL_DEPS
=
"
true
"
fi
if
[[
${
WITH_FLUID_ONLY
:-
OFF
}
==
"OFF"
]]
;
then
...
...
@@ -613,9 +614,8 @@ EOF
cat
>>
${
PADDLE_ROOT
}
/build/Dockerfile
<<
EOF
ADD python/dist/*.whl /
# run paddle version to install python packages first
RUN apt-get update &&
\
${
NCCL_DEPS
}
\
apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip &&
\
RUN apt-get update &&
${
NCCL_DEPS
}
RUN apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip &&
\
pip install /*.whl; apt-get install -f -y &&
\
apt-get clean -y &&
\
rm -f /*.whl &&
\
...
...
@@ -683,7 +683,7 @@ function test_fluid_inference_lib() {
========================================
EOF
cd
${
PADDLE_ROOT
}
/paddle/fluid/inference/api/demo_ci
./run.sh
${
PADDLE_ROOT
}
${
WITH_MKL
:-
ON
}
${
WITH_GPU
:-
OFF
}
${
INFERENCE_DEMO_INSTALL_DIR
}
./run.sh
${
PADDLE_ROOT
}
${
WITH_MKL
:-
ON
}
${
WITH_GPU
:-
OFF
}
${
INFERENCE_DEMO_INSTALL_DIR
}
${
TENSORRT_INCLUDE_DIR
:-
/usr/local/TensorRT/include
}
${
TENSORRT_LIB_DIR
:-
/usr/local/TensorRT/lib
}
./clean.sh
fi
}
...
...
python/CMakeLists.txt
浏览文件 @
5cedfb60
...
...
@@ -60,7 +60,7 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND env
${
py_env
}
${
PYTHON_EXECUTABLE
}
setup.py bdist_wheel
COMMAND
${
CMAKE_COMMAND
}
-E touch
${
PADDLE_PYTHON_BUILD_DIR
}
/.timestamp
COMMAND
${
CMAKE_COMMAND
}
-E remove_directory
${
PADDLE_PYTHON_BUILD_DIR
}
/lib-python
COMMAND
${
CMAKE_COMMAND
}
-E copy_directory
${
PADDLE_PYTHON_BUILD_DIR
}
/lib*
${
PADDLE_PYTHON_BUILD_DIR
}
/lib-python
COMMAND
${
CMAKE_COMMAND
}
-E copy_directory
${
PADDLE_PYTHON_BUILD_DIR
}
/lib
.
*
${
PADDLE_PYTHON_BUILD_DIR
}
/lib-python
DEPENDS gen_proto_py copy_paddle_pybind
${
FLUID_CORE
}
framework_py_proto profiler_py_proto
${
PY_FILES
}
${
external_project_dependencies
}
${
COPY_PADDLE_MASTER
}
)
set
(
paddle_python_deps
${
PADDLE_PYTHON_BUILD_DIR
}
/.timestamp
${
MKL_DEPENDS
}
)
...
...
python/paddle/dataset/flowers.py
浏览文件 @
5cedfb60
...
...
@@ -35,16 +35,15 @@ import itertools
import
functools
from
.common
import
download
import
tarfile
import
six
import
scipy.io
as
scio
from
paddle.dataset.image
import
*
from
paddle.reader
import
*
from
paddle
import
compat
as
cpt
import
os
import
numpy
as
np
from
multiprocessing
import
cpu_count
import
six
from
six.moves
import
cPickle
as
pickle
from
six.moves
import
zip
__all__
=
[
'train'
,
'test'
,
'valid'
]
DATA_URL
=
'http://paddlemodels.cdn.bcebos.com/flowers/102flowers.tgz'
...
...
@@ -126,9 +125,11 @@ def reader_creator(data_file,
batch
=
pickle
.
load
(
f
)
else
:
batch
=
pickle
.
load
(
f
,
encoding
=
'bytes'
)
if
six
.
PY3
:
batch
=
cpt
.
to_text
(
batch
)
data
=
batch
[
'data'
]
labels
=
batch
[
'label'
]
for
sample
,
label
in
zip
(
data
,
batch
[
'label'
]):
for
sample
,
label
in
six
.
moves
.
zip
(
data
,
batch
[
'label'
]):
yield
sample
,
int
(
label
)
-
1
if
not
cycle
:
break
...
...
python/paddle/fluid/framework.py
浏览文件 @
5cedfb60
...
...
@@ -18,7 +18,6 @@ import collections
import
contextlib
import
re
import
six
import
traceback
import
numpy
as
np
...
...
@@ -35,8 +34,6 @@ except ImportError as e:
except
Exception
as
e
:
raise
e
from
.
import
unique_name
import
os
PADDLE_ON_MODEL_CE
=
os
.
environ
.
get
(
'PADDLE_ON_MODEL_CE'
,
None
)
is
not
None
__all__
=
[
'Program'
,
...
...
@@ -490,8 +487,7 @@ class OpProtoHolder(object):
return
{
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
(),
core
.
op_proto_and_checker_maker
.
kOpRoleVarAttrName
(),
core
.
op_proto_and_checker_maker
.
kOpNameScopeAttrName
(),
core
.
op_proto_and_checker_maker
.
kOpCreationCallstackAttrName
()
core
.
op_proto_and_checker_maker
.
kOpNameScopeAttrName
()
}
...
...
@@ -573,11 +569,6 @@ class Operator(object):
if
role_var_name
in
op_attrs
and
len
(
op_attrs
[
role_var_name
])
==
0
:
del
op_attrs
[
role_var_name
]
if
not
PADDLE_ON_MODEL_CE
:
callstack_var_name
=
op_maker
.
kOpCreationCallstackAttrName
()
op_attrs
[
callstack_var_name
]
=
list
(
reversed
(
traceback
.
format_stack
()))[
1
:]
if
len
(
self
.
desc
.
type
())
!=
0
:
return
if
type
is
None
:
...
...
python/paddle/fluid/layers/control_flow.py
浏览文件 @
5cedfb60
...
...
@@ -1570,6 +1570,10 @@ class DynamicRNN(object):
The dynamic RNN can mark multiple variables as its output. Use `drnn()` to
get the output sequence.
NOTES:
Currently it is not supported that setting is_sparse to True of any
layers within DynamicRNN.
"""
BEFORE_RNN
=
0
IN_RNN
=
1
...
...
python/paddle/fluid/layers/io.py
浏览文件 @
5cedfb60
...
...
@@ -55,7 +55,11 @@ def data(name,
Args:
name(str): The name/alias of the function
shape(list): Tuple declaring the shape.
append_batch_size(bool): Whether or not to append the data as a batch.
append_batch_size(bool):
1. If true, it prepends -1 to the shape.
For example if shape=[1], the resulting shape is [-1, 1].
2. If shape contains -1, such as shape=[1, -1],
append_batch_size will be enforced to be be False (ineffective).
dtype(int|float): The type of data : float32, float_16, int etc
type(VarType): The output type. By default it is LOD_TENSOR.
lod_level(int): The LoD Level. 0 means the input data is not a sequence.
...
...
python/paddle/fluid/layers/ops.py
浏览文件 @
5cedfb60
...
...
@@ -14,6 +14,8 @@
from
__future__
import
print_function
from
.layer_function_generator
import
generate_layer_fn
,
generate_layer_fn_noattr
from
..
import
core
from
..framework
import
convert_np_dtype_to_dtype_
__activations_noattr__
=
[
'sigmoid'
,
...
...
@@ -58,8 +60,11 @@ _uniform_random_ = generate_layer_fn('uniform_random')
def
uniform_random
(
shape
,
dtype
=
None
,
min
=
None
,
max
=
None
,
seed
=
None
):
locals_var
=
locals
().
keys
()
if
not
isinstance
(
dtype
,
core
.
VarDesc
.
VarType
):
dtype
=
convert_np_dtype_to_dtype_
(
dtype
)
kwargs
=
dict
()
for
name
in
locals
()
:
for
name
in
locals
_var
:
val
=
locals
()[
name
]
if
val
is
not
None
:
kwargs
[
name
]
=
val
...
...
@@ -78,8 +83,9 @@ _hard_shrink_ = generate_layer_fn('hard_shrink')
def
hard_shrink
(
x
,
threshold
=
None
):
locals_var
=
locals
().
keys
()
kwargs
=
dict
()
for
name
in
locals
()
:
for
name
in
locals
_var
:
val
=
locals
()[
name
]
if
val
is
not
None
:
kwargs
[
name
]
=
val
...
...
@@ -99,12 +105,12 @@ _cum_sum_ = generate_layer_fn('cumsum')
def
cumsum
(
x
,
axis
=
None
,
exclusive
=
None
,
reverse
=
None
):
locals_var
=
locals
().
keys
()
kwargs
=
dict
()
for
name
in
locals
()
:
for
name
in
locals
_var
:
val
=
locals
()[
name
]
if
val
is
not
None
:
kwargs
[
name
]
=
val
return
_cum_sum_
(
**
kwargs
)
...
...
@@ -121,8 +127,9 @@ _thresholded_relu_ = generate_layer_fn('thresholded_relu')
def
thresholded_relu
(
x
,
threshold
=
None
):
locals_var
=
locals
().
keys
()
kwargs
=
dict
()
for
name
in
locals
()
:
for
name
in
locals
_var
:
val
=
locals
()[
name
]
if
val
is
not
None
:
kwargs
[
name
]
=
val
...
...
python/paddle/fluid/layers/tensor.py
浏览文件 @
5cedfb60
...
...
@@ -24,21 +24,10 @@ from .layer_function_generator import templatedoc
import
numpy
__all__
=
[
'create_tensor'
,
'create_parameter'
,
'create_global_var'
,
'cast'
,
'concat'
,
'sums'
,
'assign'
,
'fill_constant_batch_size_like'
,
'fill_constant'
,
'argmin'
,
'argmax'
,
'argsort'
,
'ones'
,
'zeros'
,
'reverse'
,
'create_tensor'
,
'create_parameter'
,
'create_global_var'
,
'cast'
,
'concat'
,
'sums'
,
'assign'
,
'fill_constant_batch_size_like'
,
'fill_constant'
,
'argmin'
,
'argmax'
,
'argsort'
,
'ones'
,
'zeros'
,
'reverse'
,
'has_inf'
,
'has_nan'
,
'isfinite'
]
...
...
@@ -111,7 +100,7 @@ def create_global_var(shape,
force_cpu
=
False
,
name
=
None
):
"""
Create a new
variabl
e in the global block(block 0).
Create a new
tensor variable with valu
e in the global block(block 0).
Args:
shape(list[int]): shape of the variable
...
...
@@ -652,3 +641,52 @@ def load_combine(out, file_path):
inputs
=
{},
output
=
{
"Out"
:
out
},
args
=
{
"file_path"
:
file_path
})
def
has_inf
(
x
):
"""
Test if any of x contains an infinity number
Args:
x(variable): The Tensor/LoDTensor to be checked.
Returns:
Variable: The tensor variable storing the output, only a bool value.
"""
helper
=
LayerHelper
(
"isinf"
,
**
locals
())
out
=
helper
.
create_tmp_variable
(
dtype
=
x
.
dtype
)
helper
.
append_op
(
type
=
"isinf"
,
inputs
=
{
"X"
:
x
},
outputs
=
{
"Out"
:
out
})
return
out
def
has_nan
(
x
):
"""
Test if any of x contains a NAN
Args:
x(variable): The Tensor/LoDTensor to be checked.
Returns:
Variable: The tensor variable storing the output, only a bool value.
"""
helper
=
LayerHelper
(
"isnan"
,
**
locals
())
out
=
helper
.
create_tmp_variable
(
dtype
=
x
.
dtype
)
helper
.
append_op
(
type
=
"isnan"
,
inputs
=
{
"X"
:
x
},
outputs
=
{
"Out"
:
out
})
return
out
def
isfinite
(
x
):
"""
Test if any of x contains an infinity/NAN number. If all the elements are finite,
returns true, else false.
Args:
x(variable): The Tensor/LoDTensor to be checked.
Returns:
Variable: The tensor variable storing the output, contains a bool value.
"""
helper
=
LayerHelper
(
"isfinite"
,
**
locals
())
out
=
helper
.
create_tmp_variable
(
dtype
=
x
.
dtype
)
helper
.
append_op
(
type
=
"isfinite"
,
inputs
=
{
"X"
:
x
},
outputs
=
{
"Out"
:
out
})
return
out
python/paddle/fluid/lod_tensor.py
浏览文件 @
5cedfb60
...
...
@@ -74,7 +74,7 @@ def create_lod_tensor(data, recursive_seq_lens, place):
assert
[
new_recursive_seq_lens
]
==
recursive_seq_lens
,
"data and recursive_seq_lens do not match"
flattened_data
=
np
.
concatenate
(
data
,
axis
=
0
)
.
astype
(
"int64"
)
flattened_data
=
np
.
concatenate
(
data
,
axis
=
0
)
flattened_data
=
flattened_data
.
reshape
([
len
(
flattened_data
),
1
])
return
create_lod_tensor
(
flattened_data
,
recursive_seq_lens
,
place
)
elif
isinstance
(
data
,
np
.
ndarray
):
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
5cedfb60
...
...
@@ -659,6 +659,9 @@ class AdamaxOptimizer(Optimizer):
optimizer = fluid.optimizer.Adamax(learning_rate=0.2)
optimizer.minimize(cost)
Notes:
Currently, AdamaxOptimizer doesn't support sparse parameter optimization.
"""
_moment_acc_str
=
"moment"
_inf_norm_acc_str
=
"inf_norm"
...
...
@@ -778,6 +781,9 @@ class DecayedAdagradOptimizer(Optimizer):
optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2)
optimizer.minimize(cost)
Notes:
Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.
"""
_moment_acc_str
=
"moment"
...
...
@@ -858,6 +864,9 @@ class AdadeltaOptimizer(Optimizer):
optimizer = fluid.optimizer.Adadelta(
learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
_, params_grads = optimizer.minimize(cost)
Notes:
Currently, AdadeltaOptimizer doesn't support sparse parameter optimization.
"""
_avg_squared_grad_acc_str
=
"_avg_squared_grad"
...
...
@@ -1126,6 +1135,9 @@ class FtrlOptimizer(Optimizer):
optimizer = fluid.optimizer.Ftrl(0.0001)
_, params_grads = optimizer.minimize(cost)
Notes:
Currently, FtrlOptimizer doesn't support sparse parameter optimization.
"""
_squared_acc_str
=
"squared"
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
5cedfb60
...
...
@@ -31,15 +31,32 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy
class
ParallelExecutor
(
object
):
"""
ParallelExecutor can run program in parallel.
ParallelExecutor is designed for data parallelism, which focuses on distributing
the data across different nodes and every node operates on the data in parallel.
If you use ParallelExecutor to run the current program on GPU, the node means GPU
device, and ParallelExecutor will get the available GPU device automatically on
the current machine. If you use ParallelExecutor to run the current program on CPU,
the node means the CPU device, and you can specify the CPU device number by adding
'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable
is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number
of CPUs in the system.
Args:
use_cuda (bool): Whether to use CUDA or not.
loss_name (str): The loss name must set in training. Default None.
main_program (Program): The program that need to run, if not provided,
then default_main_program will be used. Default None.
share_vars_from(ParallelExecutor): If provi
ed
, it will share variables
share_vars_from(ParallelExecutor): If provi
de
, it will share variables
from the specified ParallelExecutor. Default None.
exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run
the program in ParallelExecutor, for example how many threads are used to
execute the program, how many iterations to clean up the temp variables
which is generated during execution. For more information, please refer
to fluid.ExecutionStrategy. Default None.
build_strategy(BuildStrategy): build_strategy is used to control how to
build the SSA Graph in ParallelExecutor by setting the property,
for example reduce_strategy, gradient_scale_strategy. For more information,
please refer to fluid.BuildStrategy. Default None.
num_trainers(int): If greater than 1, NCCL will be initialized with
multiple rank of nodes, each node should have same number of GPUs.
Distributed training will be enabled then. Default 1.
...
...
python/paddle/fluid/tests/CMakeLists.txt
浏览文件 @
5cedfb60
set
(
PYTHON_TESTS_DIR
${
CMAKE_CURRENT_BINARY_DIR
}
CACHE PATH
"python tests directory"
)
file
(
GLOB TEST_OPS RELATIVE
"
${
CMAKE_CURRENT_SOURCE_DIR
}
"
"test_*.py"
)
string
(
REPLACE
".py"
""
TEST_OPS
"
${
TEST_OPS
}
"
)
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
5cedfb60
...
...
@@ -76,11 +76,13 @@ if(WITH_DISTRIBUTE)
if
(
NOT APPLE
)
set_tests_properties
(
test_dist_mnist PROPERTIES TIMEOUT 200
)
set_tests_properties
(
test_dist_word2vec PROPERTIES TIMEOUT 200
)
py_test_modules
(
test_dist_se_resnext MODULES test_dist_se_resnext SERIAL
)
py_test_modules
(
test_dist_se_resnext MODULES test_dist_se_resnext
)
set_tests_properties
(
test_dist_se_resnext PROPERTIES TIMEOUT 1000
)
# TODO: fix this test
#py_test_modules(test_dist_transformer MODULES test_dist_transformer)
#set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
endif
(
NOT APPLE
)
py_test_modules
(
test_dist_transpiler MODULES test_dist_transpiler
)
#FIXME(gongwb): random fails.
#py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
endif
()
py_test_modules
(
test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL
)
py_test_modules
(
test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL
)
...
...
python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
浏览文件 @
5cedfb60
...
...
@@ -18,6 +18,9 @@ import unittest
import
numpy
as
np
from
op_test
import
OpTest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
class
TestClipByNormOp
(
OpTest
):
def
setUp
(
self
):
...
...
@@ -62,5 +65,59 @@ class TestCase3(TestClipByNormOp):
self
.
max_norm
=
1.0
class
TestClipByNormOpWithSelectedRows
(
OpTest
):
def
check_with_place
(
self
,
place
):
self
.
config_test_case
()
scope
=
core
.
Scope
()
# set input
x_selected_rows
=
scope
.
var
(
'X'
).
get_selected_rows
()
x_selected_rows
.
set_rows
(
self
.
grad_rows
)
x_tensor
=
x_selected_rows
.
get_tensor
()
x_np
=
np
.
random
.
random
(
self
.
grad_shape
).
astype
(
"float32"
)
x_np
[
np
.
abs
(
x_np
)
<
self
.
max_relative_error
]
=
0.5
x_tensor
.
set
(
x_np
,
place
)
# set output
out_selected_rows
=
scope
.
var
(
'Out'
).
get_selected_rows
()
# run clip_by_norm_op
clip_by_norm_op
=
fluid
.
op
.
Operator
(
"clip_by_norm"
,
max_norm
=
self
.
max_norm
,
X
=
'X'
,
Out
=
'Out'
)
clip_by_norm_op
.
run
(
scope
,
place
)
# check output
self
.
assertEqual
(
out_selected_rows
.
rows
(),
self
.
grad_clipped_rows
)
out_tensor
=
out_selected_rows
.
get_tensor
()
y_np
=
np
.
zeros
(
self
.
grad_clipped_shape
)
y_np
[
0
]
=
np
.
sum
(
x_np
[
0
:
2
])
y_np
[
1
]
=
x_np
[
2
]
y_np
[
2
]
=
x_np
[
3
]
norm
=
np
.
sqrt
(
np
.
sum
(
np
.
square
(
y_np
)))
if
norm
>
self
.
max_norm
:
output
=
self
.
max_norm
*
y_np
/
norm
else
:
output
=
y_np
self
.
assertTrue
(
np
.
allclose
(
np
.
array
(
out_tensor
),
output
,
atol
=
1e-5
,
equal_nan
=
False
))
def
test_clip_by_norm_with_selected_ros
(
self
):
places
=
[
core
.
CPUPlace
()]
if
core
.
is_compiled_with_cuda
():
places
.
append
(
core
.
CUDAPlace
(
0
))
for
place
in
places
:
self
.
check_with_place
(
place
)
def
config_test_case
(
self
):
self
.
max_norm
=
1.0
self
.
max_relative_error
=
0.006
self
.
grad_shape
=
(
4
,
1
)
self
.
grad_clipped_shape
=
(
3
,
1
)
self
.
grad_rows
=
[
0
,
0
,
1
,
2
]
self
.
grad_clipped_rows
=
[
0
,
1
,
2
]
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
浏览文件 @
5cedfb60
...
...
@@ -16,6 +16,8 @@ from __future__ import print_function
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
import
paddle.fluid.core
as
core
from
paddle.fluid.op
import
Operator
class
ElementwiseMulOp
(
OpTest
):
...
...
@@ -115,5 +117,56 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
}
class
TestElementWiseMulSelectedRows
(
OpTest
):
def
setUp
(
self
):
self
.
rows
=
[
0
,
1
,
2
,
3
,
4
,
5
,
6
]
self
.
feature
=
12
self
.
height
=
100
self
.
input_shape
=
(
len
(
self
.
rows
),
self
.
feature
)
def
prepare_input
(
self
,
scope
,
place
):
self
.
input
=
{
"X"
:
np
.
random
.
random
(
self
.
input_shape
).
astype
(
"float32"
),
"Y"
:
np
.
random
.
random
(
self
.
input_shape
).
astype
(
"float32"
)
}
def
init_input
(
in_name
):
x_selected_rows
=
scope
.
var
(
in_name
).
get_selected_rows
()
x_selected_rows
.
set_height
(
self
.
height
)
x_selected_rows
.
set_rows
(
self
.
rows
)
x_array
=
self
.
input
[
in_name
]
x_tensor
=
x_selected_rows
.
get_tensor
()
x_tensor
.
set
(
x_array
,
place
)
init_input
(
"X"
)
init_input
(
"Y"
)
def
create_out_selected_row
(
self
,
scope
):
return
scope
.
var
(
'Out'
).
get_selected_rows
()
def
check_result
(
self
,
out_selected_rows
):
assert
out_selected_rows
.
height
()
==
self
.
height
assert
out_selected_rows
.
rows
()
==
self
.
rows
out_tensor
=
np
.
array
(
out_selected_rows
.
get_tensor
())
assert
out_tensor
.
shape
==
self
.
input_shape
def
check_with_place
(
self
,
place
):
scope
=
core
.
Scope
()
self
.
prepare_input
(
scope
,
place
)
out_selected_rows
=
self
.
create_out_selected_row
(
scope
)
out_selected_rows
.
set_height
(
0
)
out_selected_rows
.
set_rows
([])
elementwise_mul
=
Operator
(
"elementwise_mul"
,
X
=
'X'
,
Y
=
'Y'
,
Out
=
'Out'
)
elementwise_mul
.
run
(
scope
,
place
)
self
.
check_result
(
out_selected_rows
)
def
test_elewisemul_with_selected_rows_input
(
self
):
places
=
[
core
.
CPUPlace
()]
for
place
in
places
:
self
.
check_with_place
(
place
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py
0 → 100644
浏览文件 @
5cedfb60
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
from
test_lstm_op
import
lstm
,
ACTIVATION
def
fc
(
x
,
w
,
b
):
return
np
.
dot
(
x
,
w
)
+
b
def
fused_embedded_fc_lstm
(
ids
,
# T x 1
lod
,
# 1 x N
embeddings
=
None
,
# Dict_size x M
wx
=
None
,
# M x 4D
bx
=
None
,
# 1 x 4D
h0
=
None
,
# N x D
c0
=
None
,
# N x D
w_h
=
None
,
# D x 4D
w_b
=
None
,
# 1 x 4D
w_c
=
None
,
# 1 x 3D
is_reverse
=
False
,
act_gate
=
None
,
act_cell
=
None
,
act_cand
=
None
):
# Make a lookup for embeddings and pass result into lstm reference
T
=
ids
.
shape
[
0
]
M
=
embeddings
.
shape
[
1
]
x
=
embeddings
[
ids
].
reshape
([
T
,
M
])
return
lstm
(
fc
(
x
,
wx
,
bx
),
lod
,
h0
,
c0
,
w_h
,
w_b
,
w_c
,
is_reverse
,
act_gate
,
act_cell
,
act_cand
)
class
TestFusionLSTMOp
(
OpTest
):
def
set_conf
(
self
):
pass
def
setUp
(
self
):
self
.
op_type
=
'fused_embedding_fc_lstm'
self
.
lod
=
[[
2
,
3
,
5
,
4
]]
self
.
M
=
8
# Embedding size
self
.
D
=
16
# Hidden size
self
.
dict_size
=
18
self
.
has_initial_state
=
False
self
.
use_peepholes
=
False
self
.
is_reverse
=
False
self
.
act_gate
=
'sigmoid'
self
.
act_cell
=
'tanh'
self
.
act_cand
=
'tanh'
self
.
set_conf
()
T
=
sum
(
self
.
lod
[
0
])
bs
=
len
(
self
.
lod
[
0
])
# this is the weight of fc
wx
=
np
.
random
.
normal
(
size
=
(
self
.
M
,
4
*
self
.
D
)).
astype
(
'float32'
)
# this is the bias of fc
bx
=
np
.
random
.
normal
(
size
=
(
1
,
4
*
self
.
D
)).
astype
(
'float32'
)
if
self
.
use_peepholes
:
b
=
np
.
random
.
normal
(
size
=
(
1
,
7
*
self
.
D
)).
astype
(
'float32'
)
else
:
b
=
np
.
random
.
normal
(
size
=
(
1
,
4
*
self
.
D
)).
astype
(
'float32'
)
w_b
=
np
.
copy
(
b
[:,
0
:
4
*
self
.
D
])
w_c
=
b
[:,
4
*
self
.
D
:]
if
self
.
use_peepholes
else
None
# low is 0 , high is voc_size - 1
ids
=
np
.
random
.
randint
(
low
=
0
,
high
=
self
.
dict_size
-
1
,
size
=
(
T
,
1
)).
astype
(
"int64"
)
# embeddings as they were trained , so each entry is of M size
embeddings
=
np
.
random
.
random
(
(
self
.
dict_size
,
self
.
M
)).
astype
(
"float32"
)
# multiply embeddings via Weights
fc_embeddings
=
np
.
dot
(
embeddings
,
wx
)
# bias should be manually added into the bias of this fused embedding fc LSTM
b
[
0
,
0
:
4
*
self
.
D
]
+=
bx
[
0
,
:]
combined_biases
=
b
[:,
0
:
4
*
self
.
D
]
# So let broadcast it , so they can be added
ones
=
np
.
ones
([
self
.
dict_size
,
1
])
broadcasted_biases
=
np
.
dot
(
ones
,
combined_biases
)
# Sum biases with Wx*embeddings
fc_embeddings
+=
broadcasted_biases
if
self
.
has_initial_state
:
h0
=
np
.
random
.
normal
(
size
=
(
bs
,
self
.
D
)).
astype
(
'float32'
)
c0
=
np
.
random
.
normal
(
size
=
(
bs
,
self
.
D
)).
astype
(
'float32'
)
else
:
h0
=
np
.
zeros
((
bs
,
self
.
D
)).
astype
(
'float32'
)
c0
=
np
.
zeros
((
bs
,
self
.
D
)).
astype
(
'float32'
)
wh
=
np
.
random
.
normal
(
size
=
(
self
.
D
,
4
*
self
.
D
)).
astype
(
'float32'
)
h
,
c
=
fused_embedded_fc_lstm
(
ids
,
self
.
lod
,
embeddings
,
wx
,
bx
,
h0
,
c0
,
wh
,
w_b
,
w_c
,
self
.
is_reverse
,
ACTIVATION
[
self
.
act_gate
],
ACTIVATION
[
self
.
act_cell
],
ACTIVATION
[
self
.
act_cand
])
self
.
inputs
=
{
'Ids'
:
(
ids
,
self
.
lod
),
'Embeddings'
:
fc_embeddings
,
'WeightH'
:
wh
,
'Bias'
:
b
}
if
self
.
has_initial_state
:
self
.
inputs
[
'H0'
]
=
h0
self
.
inputs
[
'C0'
]
=
c0
self
.
outputs
=
{
'Hidden'
:
(
h
,
self
.
lod
),
'Cell'
:
(
c
,
self
.
lod
),
}
self
.
attrs
=
{
'use_peepholes'
:
self
.
use_peepholes
,
'is_reverse'
:
self
.
is_reverse
,
'gate_activation'
:
self
.
act_gate
,
'cell_activation'
:
self
.
act_cell
,
'candidate_activation'
:
self
.
act_cand
}
def
test_check_output
(
self
):
for
use_seq
in
{
True
,
False
}:
self
.
attrs
[
'use_seq'
]
=
use_seq
self
.
check_output
()
class
TestFusionLSTMOpInit
(
TestFusionLSTMOp
):
def
set_conf
(
self
):
self
.
has_initial_state
=
True
class
TestFusionLSTMOpReverse
(
TestFusionLSTMOp
):
def
set_conf
(
self
):
self
.
is_reverse
=
True
class
TestFusionLSTMOpInitReverse
(
TestFusionLSTMOp
):
def
set_conf
(
self
):
self
.
has_initial_state
=
True
self
.
is_reverse
=
True
class
TestFusionLSTMOpMD1
(
TestFusionLSTMOp
):
def
set_conf
(
self
):
self
.
M
=
36
self
.
D
=
8
class
TestFusionLSTMOpMD2
(
TestFusionLSTMOp
):
def
set_conf
(
self
):
self
.
M
=
8
self
.
D
=
8
class
TestFusionLSTMOpMD3
(
TestFusionLSTMOp
):
def
set_conf
(
self
):
self
.
M
=
15
self
.
D
=
3
class
TestFusionLSTMOpBS1
(
TestFusionLSTMOp
):
def
set_conf
(
self
):
self
.
lod
=
[[
3
]]
self
.
D
=
16
class
TestFusionLSTMOpPeepholes
(
TestFusionLSTMOp
):
def
set_conf
(
self
):
self
.
use_peepholes
=
True
class
TestFusionLSTMOpPeepholesInit
(
TestFusionLSTMOp
):
def
set_conf
(
self
):
self
.
use_peepholes
=
True
self
.
has_initial_state
=
True
class
TestFusionLSTMOpPeepholesReverse
(
TestFusionLSTMOp
):
def
set_conf
(
self
):
self
.
use_peepholes
=
True
self
.
is_reverse
=
True
class
TestFusionLSTMOpPeepholesInitReverse
(
TestFusionLSTMOp
):
def
set_conf
(
self
):
self
.
use_peepholes
=
True
self
.
has_initial_state
=
True
self
.
is_reverse
=
True
class
TestFusionLSTMOpPeepholesBS1
(
TestFusionLSTMOp
):
def
set_conf
(
self
):
self
.
use_peepholes
=
True
self
.
lod
=
[[
2
]]
self
.
D
=
8
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_isfinite_op.py
0 → 100644
浏览文件 @
5cedfb60
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
class
TestInf
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"isinf"
self
.
dtype
=
np
.
float32
self
.
init_dtype
()
x
=
np
.
random
.
uniform
(
0.1
,
1
,
[
11
,
17
]).
astype
(
self
.
dtype
)
x
[
0
]
=
np
.
inf
x
[
-
1
]
=
np
.
inf
self
.
inputs
=
{
'X'
:
x
}
self
.
outputs
=
{
'Out'
:
np
.
array
(
True
).
astype
(
self
.
dtype
)}
def
init_dtype
(
self
):
pass
def
test_output
(
self
):
self
.
check_output
()
class
TestFP16Inf
(
TestInf
):
def
init_dtype
(
self
):
self
.
dtype
=
np
.
float16
class
TestNAN
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"isnan"
self
.
dtype
=
np
.
float32
self
.
init_dtype
()
x
=
np
.
random
.
uniform
(
0.1
,
1
,
[
11
,
17
]).
astype
(
self
.
dtype
)
x
[
0
]
=
np
.
nan
x
[
-
1
]
=
np
.
nan
self
.
inputs
=
{
'X'
:
x
}
self
.
outputs
=
{
'Out'
:
np
.
array
(
True
).
astype
(
self
.
dtype
)}
def
init_dtype
(
self
):
pass
def
test_output
(
self
):
self
.
check_output
()
class
TestFP16NAN
(
TestNAN
):
def
init_dtype
(
self
):
self
.
dtype
=
np
.
float16
class
TestIsfinite
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"isfinite"
self
.
dtype
=
np
.
float32
self
.
init_dtype
()
x
=
np
.
random
.
uniform
(
0.1
,
1
,
[
11
,
17
]).
astype
(
self
.
dtype
)
x
[
0
]
=
np
.
inf
x
[
-
1
]
=
np
.
nan
out
=
np
.
isinf
(
x
)
|
np
.
isnan
(
x
)
self
.
inputs
=
{
'X'
:
x
}
self
.
outputs
=
{
'Out'
:
np
.
array
(
False
).
astype
(
self
.
dtype
)}
def
init_dtype
(
self
):
pass
def
test_output
(
self
):
self
.
check_output
()
class
TestFP16Isfinite
(
TestIsfinite
):
def
init_dtype
(
self
):
self
.
dtype
=
np
.
float16
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_operator_desc.py
浏览文件 @
5cedfb60
...
...
@@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase):
set
(
mul_op
.
attr_names
),
set
([
"x_num_col_dims"
,
"y_num_col_dims"
,
"op_role"
,
"op_role_var"
,
"op_namescope"
,
"op_callstack"
"op_namescope"
]))
self
.
assertEqual
(
mul_op
.
has_attr
(
"x_num_col_dims"
),
True
)
self
.
assertEqual
(
mul_op
.
attr_type
(
"x_num_col_dims"
),
core
.
AttrType
.
INT
)
...
...
python/paddle/fluid/tests/unittests/test_reduce_op.py
浏览文件 @
5cedfb60
...
...
@@ -243,5 +243,87 @@ class TestKeepDimReduceSumMultiAxises(OpTest):
self
.
check_grad
([
'X'
],
'Out'
)
class
TestReduceSumWithDimOne
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"reduce_sum"
self
.
inputs
=
{
'X'
:
np
.
random
.
random
((
10
,
1
,
1
)).
astype
(
"float64"
)}
self
.
attrs
=
{
'dim'
:
[
1
,
2
],
'keep_dim'
:
True
}
self
.
outputs
=
{
'Out'
:
self
.
inputs
[
'X'
].
sum
(
axis
=
tuple
(
self
.
attrs
[
'dim'
]),
keepdims
=
True
)
}
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
self
.
check_grad
([
'X'
],
'Out'
)
class
TestReduceSumWithNumelOne
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"reduce_sum"
self
.
inputs
=
{
'X'
:
np
.
random
.
random
((
1
,
1
)).
astype
(
"float64"
)}
self
.
attrs
=
{
'dim'
:
[
1
],
'keep_dim'
:
False
}
self
.
outputs
=
{
'Out'
:
self
.
inputs
[
'X'
].
sum
(
axis
=
tuple
(
self
.
attrs
[
'dim'
]),
keepdims
=
False
)
}
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
self
.
check_grad
([
'X'
],
'Out'
)
class
TestReduceMeanWithDimOne
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"reduce_mean"
self
.
inputs
=
{
'X'
:
np
.
random
.
random
((
10
,
1
,
1
)).
astype
(
"float64"
)}
self
.
attrs
=
{
'dim'
:
[
1
],
'keep_dim'
:
False
}
self
.
outputs
=
{
'Out'
:
self
.
inputs
[
'X'
].
mean
(
axis
=
tuple
(
self
.
attrs
[
'dim'
]),
keepdims
=
False
)
}
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
self
.
check_grad
([
'X'
],
'Out'
)
class
TestReduceMeanWithNumelOne
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"reduce_mean"
self
.
inputs
=
{
'X'
:
np
.
random
.
random
((
1
,
1
)).
astype
(
"float64"
)}
self
.
attrs
=
{
'dim'
:
[
1
],
'keep_dim'
:
True
}
self
.
outputs
=
{
'Out'
:
self
.
inputs
[
'X'
].
mean
(
axis
=
tuple
(
self
.
attrs
[
'dim'
]),
keepdims
=
True
)
}
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
self
.
check_grad
([
'X'
],
'Out'
)
class
TestReduceAll
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"reduce_sum"
self
.
inputs
=
{
'X'
:
np
.
random
.
random
((
1
,
1
,
1
)).
astype
(
"float64"
)}
self
.
attrs
=
{
'reduce_all'
:
True
,
'keep_dim'
:
False
}
self
.
outputs
=
{
'Out'
:
self
.
inputs
[
'X'
].
sum
()}
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
self
.
check_grad
([
'X'
],
'Out'
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_rmsprop_op.py
浏览文件 @
5cedfb60
...
...
@@ -88,7 +88,7 @@ class TestBase(unittest.TestCase):
self
.
epsilon
=
epsilon
self
.
decay
=
0.9
self
.
momentum
=
0.
0
self
.
momentum
=
0.
1
self
.
centered
=
centered
self
.
ms_out
=
self
.
decay
*
self
.
mean_square
+
(
1
-
self
.
decay
...
...
@@ -170,8 +170,11 @@ class TestRmspropOp(TestBase):
rmsprop_op
.
run
(
self
.
scope
,
self
.
place
)
self
.
check
(
np
.
array
(
self
.
mean_square_tensor
),
self
.
ms_out
,
self
.
place
,
self
.
mean_square_name
)
np
.
array
(
self
.
mean_square_tensor
),
self
.
ms_out
,
self
.
place
,
self
.
mean_square_name
,
atol
=
atol
)
self
.
check
(
np
.
array
(
self
.
moment_tensor
),
self
.
moment_out
,
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
5cedfb60
...
...
@@ -788,7 +788,8 @@ in a single call.")
tuple: (main_program, startup_program), of type "Program"
"""
pserver_prog
=
self
.
get_pserver_program
(
endpoint
)
pserver_startup
=
self
.
get_startup_program
(
endpoint
)
pserver_startup
=
self
.
get_startup_program
(
endpoint
,
pserver_program
=
pserver_prog
)
return
pserver_prog
,
pserver_startup
def
get_startup_program
(
self
,
...
...
python/paddle/fluid/transpiler/inference_transpiler.py
浏览文件 @
5cedfb60
...
...
@@ -124,7 +124,7 @@ class InferenceTranspiler(object):
next_op
=
self
.
block
.
ops
[
i
+
1
]
if
next_op
.
type
==
'relu'
:
# modify bnorm OP to include relu
current_op
.
set_attr
(
"fuse_relu"
,
True
)
current_op
.
_
set_attr
(
"fuse_relu"
,
True
)
# remove relu OP
self
.
block
.
_remove_op
(
i
+
1
)
i
=
i
+
1
...
...
@@ -454,7 +454,7 @@ class InferenceTranspiler(object):
:type eltwise_op: Operator
'''
conv_op
.
set_attr
(
"fuse_eltwise"
,
True
)
conv_op
.
_
set_attr
(
"fuse_eltwise"
,
True
)
self
.
input_map
[
conv_op
.
output
(
"Output"
)[
0
]]
=
eltwise_op
.
input
(
"Y"
)[
0
]
self
.
input_map
[
eltwise_op
.
output
(
"Out"
)[
0
]]
=
eltwise_op
.
input
(
"Y"
)[
0
]
...
...
python/paddle/reader/decorator.py
浏览文件 @
5cedfb60
...
...
@@ -15,7 +15,7 @@
__all__
=
[
'map_readers'
,
'buffered'
,
'compose'
,
'chain'
,
'shuffle'
,
'ComposeNotAligned'
,
'firstn'
,
'xmap_readers'
,
'PipeReader'
,
'multiprocess_reader'
'multiprocess_reader'
,
'Fake'
]
from
threading
import
Thread
...
...
@@ -504,3 +504,39 @@ class PipeReader:
yield
decomp_buff
else
:
break
class
Fake
(
object
):
"""
fake reader will cache the first data it read and yield it out for data_num times.
It is used to cache a data from real reader and use it for speed testing.
:param reader: the origin reader
:param data_num: times that this reader will yield data.
:return: a fake reader.
Examples:
.. code-block:: python
def reader():
for i in range(10):
yield i
fake_reader = Fake()(reader, 100)
"""
def
__init__
(
self
):
self
.
data
=
None
self
.
yield_num
=
0
def
__call__
(
self
,
reader
,
data_num
):
def
fake_reader
():
if
self
.
data
is
None
:
self
.
data
=
next
(
reader
())
while
self
.
yield_num
<
data_num
:
yield
self
.
data
self
.
yield_num
+=
1
self
.
yield_num
=
0
return
fake_reader
python/paddle/reader/tests/decorator_test.py
浏览文件 @
5cedfb60
...
...
@@ -203,5 +203,21 @@ class TestMultiProcessReader(unittest.TestCase):
self
.
reader_test
(
use_pipe
=
True
)
class
TestFakeReader
(
unittest
.
TestCase
):
def
test_fake_reader
(
self
):
def
reader
():
for
i
in
range
(
10
):
yield
i
data_num
=
100
fake_reader
=
paddle
.
reader
.
Fake
()(
reader
,
data_num
)
for
_
in
range
(
10
):
i
=
0
for
data
in
fake_reader
():
self
.
assertEqual
(
data
,
0
)
i
+=
1
self
.
assertEqual
(
i
,
data_num
)
if
__name__
==
'__main__'
:
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录