Repository: BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit eac5a0aa
Authored Jan 11, 2019 by sneaxiy

Merge develop
test=develop

Parents: 594dc4d8, 869f3a9d
Showing 132 changed files with 4,848 additions and 707 deletions.
Changed files (+added / -deleted lines):

cmake/FindJeMalloc.cmake  +7 -0
cmake/cuda.cmake  +15 -1
cmake/external/boost.cmake  +2 -5
cmake/external/gflags.cmake  +9 -0
cmake/external/mkldnn.cmake  +1 -1
cmake/external/mklml.cmake  +16 -18
cmake/external/ngraph.cmake  +7 -12
cmake/generic.cmake  +11 -10
paddle/fluid/API.spec  +22 -0
paddle/fluid/framework/details/CMakeLists.txt  +1 -1
paddle/fluid/framework/details/build_strategy.cc  +1 -0
paddle/fluid/framework/details/eager_deletion_op_handle.cc  +6 -8
paddle/fluid/framework/details/multi_devices_graph_pass.cc  +1 -1
paddle/fluid/framework/ir/CMakeLists.txt  +14 -0
paddle/fluid/framework/ir/graph.h  +0 -1
paddle/fluid/framework/ir/graph_pattern_detector.cc  +63 -0
paddle/fluid/framework/ir/graph_pattern_detector.h  +15 -0
paddle/fluid/framework/ir/lock_free_optimize_pass.cc  +358 -0
paddle/fluid/framework/ir/lock_free_optimize_pass.h  +130 -0
paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc  +214 -0
paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h  +52 -0
paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc  +198 -0
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc  +148 -0
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h  +38 -0
paddle/fluid/framework/ngraph_bridge.cc  +3 -0
paddle/fluid/framework/parallel_executor.cc  +4 -4
paddle/fluid/framework/parallel_executor.h  +1 -2
paddle/fluid/framework/scope.cc  +3 -2
paddle/fluid/framework/var_type_traits.cc  +5 -3
paddle/fluid/framework/var_type_traits.h  +2 -2
paddle/fluid/framework/var_type_traits_test.cc  +5 -4
paddle/fluid/imperative/layer.h  +1 -0
paddle/fluid/inference/analysis/analyzer_tester.cc  +2 -2
paddle/fluid/inference/analysis/argument.h  +0 -2
paddle/fluid/inference/analysis/ir_pass_manager.cc  +0 -10
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt  +11 -7
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc  +5 -3
paddle/fluid/inference/analysis/passes/CMakeLists.txt  +1 -0
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc  +0 -23
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h  +0 -2
paddle/fluid/inference/api/analysis_config.cc  +1 -0
paddle/fluid/inference/api/analysis_predictor.h  +5 -2
paddle/fluid/inference/api/api_impl.h  +0 -1
paddle/fluid/inference/api/demo_ci/CMakeLists.txt  +4 -4
paddle/fluid/inference/api/demo_ci/run.sh  +4 -0
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc  +2 -2
paddle/fluid/inference/api/helper.h  +8 -3
paddle/fluid/inference/api/paddle_analysis_config.h  +96 -7
paddle/fluid/inference/api/paddle_api.h  +105 -72
paddle/fluid/inference/api/paddle_pass_builder.h  +27 -15
paddle/fluid/inference/tensorrt/CMakeLists.txt  +1 -0
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc  +31 -6
paddle/fluid/inference/tensorrt/op_teller.cc  +49 -0
paddle/fluid/inference/tensorrt/op_teller.h  +68 -0
paddle/fluid/inference/tests/api/CMakeLists.txt  +5 -5
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc  +3 -3
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc  +111 -18
paddle/fluid/inference/tests/api/config_printer.h  +1 -1
paddle/fluid/inference/tests/api/tester_helper.h  +10 -10
paddle/fluid/inference/tests/api/trt_models_tester.cc  +3 -15
paddle/fluid/inference/utils/CMakeLists.txt  +0 -3
paddle/fluid/operators/conv_cudnn_op.cu.cc  +15 -0
paddle/fluid/operators/conv_mkldnn_op.cc  +141 -31
paddle/fluid/operators/elementwise/elementwise_sub_op.cu  +5 -0
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc  +194 -0
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h  +142 -0
paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc  +134 -0
paddle/fluid/operators/fused/fusion_seqpool_concat_op.h  +41 -0
paddle/fluid/operators/jit/benchmark.cc  +25 -2
paddle/fluid/operators/jit/gen/CMakeLists.txt  +1 -0
paddle/fluid/operators/jit/gen/seqpool.cc  +85 -0
paddle/fluid/operators/jit/gen/seqpool.h  +214 -0
paddle/fluid/operators/jit/helper.cc  +15 -0
paddle/fluid/operators/jit/helper.h  +6 -0
paddle/fluid/operators/jit/kernel_base.h  +23 -0
paddle/fluid/operators/jit/kernel_key.cc  +7 -0
paddle/fluid/operators/jit/more/mkl/CMakeLists.txt  +1 -0
paddle/fluid/operators/jit/more/mkl/mkl.cc  +31 -0
paddle/fluid/operators/jit/more/mkl/mkl.h  +26 -0
paddle/fluid/operators/jit/refer/CMakeLists.txt  +1 -0
paddle/fluid/operators/jit/refer/refer.cc  +2 -0
paddle/fluid/operators/jit/refer/refer.h  +24 -0
paddle/fluid/operators/jit/test.cc  +49 -0
paddle/fluid/operators/math/CMakeLists.txt  +1 -1
paddle/fluid/operators/math/selected_rows_functor.cc  +4 -0
paddle/fluid/operators/math/sequence_pooling.cc  +21 -11
paddle/fluid/operators/math/softmax.h  +1 -0
paddle/fluid/operators/ngraph/ngraph_ops.h  +2 -0
paddle/fluid/operators/ngraph/ops/binary_unnary_op.h  +0 -2
paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h  +59 -0
paddle/fluid/operators/ngraph/ops/fill_constant_op.h  +0 -2
paddle/fluid/operators/ngraph/ops/mean_op.h  +66 -0
paddle/fluid/operators/ngraph/ops/mul_op.h  +0 -2
paddle/fluid/operators/ngraph/ops/scale_op.h  +39 -0
paddle/fluid/operators/ngraph/ops/top_k_op.h  +0 -2
paddle/fluid/operators/softmax_with_cross_entropy_op.cu  +36 -28
paddle/fluid/operators/sum_op.cc  +9 -1
paddle/fluid/platform/cuda_helper_test.cu  +3 -0
paddle/fluid/platform/device_context.cc  +14 -16
paddle/fluid/platform/float16.h  +1 -1
paddle/fluid/platform/float16_test.cu  +4 -2
paddle/fluid/platform/mkldnn_reuse.h  +37 -12
paddle/fluid/pybind/CMakeLists.txt  +3 -4
paddle/fluid/pybind/ir.cc  +103 -0
paddle/fluid/pybind/ir.h  +25 -0
paddle/fluid/pybind/pybind.cc  +11 -3
python/paddle/dataset/mnist.py  +43 -48
python/paddle/fluid/__init__.py  +1 -1
python/paddle/fluid/compiler.py  +204 -0
python/paddle/fluid/data_feeder.py  +18 -12
python/paddle/fluid/executor.py  +137 -24
python/paddle/fluid/optimizer.py  +143 -83
python/paddle/fluid/parallel_executor.py  +3 -4
python/paddle/fluid/tests/test_data_feeder.py  +6 -0
python/paddle/fluid/tests/unittests/dist_ctr.py  +9 -1
python/paddle/fluid/tests/unittests/dist_se_resnext.py  +0 -1
python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py  +31 -0
python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py  +40 -0
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py  +13 -20
python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py  +153 -15
python/paddle/fluid/tests/unittests/test_dist_base.py  +11 -12
python/paddle/fluid/tests/unittests/test_dist_ctr.py  +14 -1
python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py  +51 -0
python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py  +118 -0
python/paddle/fluid/tests/unittests/test_ir_graph.py  +146 -0
python/paddle/fluid/tests/unittests/test_optimizer.py  +56 -14
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py  +10 -5
python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py  +13 -13
python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py  +8 -7
python/paddle/fluid/tests/unittests/test_seq_pool.py  +27 -22
python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py  +55 -5
python/paddle/fluid/transpiler/distribute_transpiler.py  +11 -8
cmake/FindJeMalloc.cmake

@@ -19,3 +19,10 @@ find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALL
mark_as_advanced(JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR)

if (JEMALLOC_FOUND)
    add_library(jemalloc::jemalloc UNKNOWN IMPORTED)
    set_target_properties(jemalloc::jemalloc PROPERTIES
        IMPORTED_LOCATION ${JEMALLOC_LIBRARIES}
        INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}")
endif()
cmake/cuda.cmake

@@ -5,6 +5,8 @@ endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")

######################################################################################
# A function for automatic detection of GPUs installed  (if autodetection is enabled)

@@ -59,7 +61,7 @@ endfunction()
# select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable)
  # List of arch names
  set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual")
  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
  set(archs_name_default "All")
  if(NOT CMAKE_CROSSCOMPILING)
    list(APPEND archs_names "Auto")

@@ -93,6 +95,8 @@ function(select_nvcc_arch_flags out_variable)
    set(cuda_arch_bin "60 61")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
    set(cuda_arch_bin "70")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
    set(cuda_arch_bin "75")
  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
    set(cuda_arch_bin ${paddle_known_gpu_archs})
  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")

@@ -153,6 +157,16 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
  # warning for now.
  list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
  add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs9})
  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
  add_definitions("-DPADDLE_CUDA_BINVER=\"90\"")
elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
  set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
  add_definitions("-DPADDLE_CUDA_BINVER=\"100\"")
endif()
include_directories(${CUDA_INCLUDE_DIRS})
cmake/external/boost.cmake

@@ -23,11 +23,8 @@ set(BOOST_PROJECT "extern_boost")
# checked that the devtools package of CentOS 6 installs boost 1.41.0.
# So we use 1.41.0 here.
set(BOOST_VER "1.41.0")
if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
    message(STATUS "use pre defined download url")
    set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
    set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
endif()
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
cmake/external/gflags.cmake

@@ -63,6 +63,15 @@ ADD_DEPENDENCIES(gflags extern_gflags)
LIST(APPEND external_project_dependencies gflags)

# On Windows (including MinGW), the Shlwapi library is used by gflags if available.
if (WIN32)
  include(CheckIncludeFileCXX)
  check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI)
  if (HAVE_SHLWAPI)
    set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib)
  endif(HAVE_SHLWAPI)
endif (WIN32)

IF(WITH_C_API)
  INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
  IF(ANDROID)
cmake/external/mkldnn.cmake

@@ -55,7 +55,7 @@ ExternalProject_Add(
    ${MKLDNN_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
    GIT_REPOSITORY      "https://github.com/intel/mkl-dnn.git"
    GIT_TAG             "830a10059a018cd2634d94195140cf2d8790a75a"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
cmake/external/mklml.cmake

@@ -16,6 +16,12 @@ IF(NOT ${WITH_MKLML})
  return()
ENDIF(NOT ${WITH_MKLML})

IF(APPLE)
    MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
    SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
    return()
ENDIF()

INCLUDE(ExternalProject)
SET(MKLML_DST_DIR       "mklml")
SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")

@@ -23,32 +29,24 @@ SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
SET(MKLML_ROOT          ${MKLML_INSTALL_DIR})
SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
if(WIN32)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
SET(TIME_VERSION "2019.0.1.20181227")
IF(WIN32)
    SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
    SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
    SET(MKLML_LIB             ${MKLML_LIB_DIR}/mklml.lib)
    SET(MKLML_IOMP_LIB        ${MKLML_LIB_DIR}/libiomp5md.lib)
    SET(MKLML_SHARED_LIB      ${MKLML_LIB_DIR}/mklml.dll)
    SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
else()
ELSE()
    SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
    SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
    SET(MKLML_LIB             ${MKLML_LIB_DIR}/libmklml_intel.so)
    SET(MKLML_IOMP_LIB        ${MKLML_LIB_DIR}/libiomp5.so)
    SET(MKLML_SHARED_LIB      ${MKLML_LIB_DIR}/libmklml_intel.so)
    SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
endif()
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
    MESSAGE(STATUS "use pre defined download url")
    if(WIN32)
        SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE)
        SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
    elseif(APPLE)
        SET(MKLML_VER "mklml_mac_2019.0.1.20180928" CACHE STRING "" FORCE)
        SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
    else()
        SET(MKLML_VER "mklml_lnx_2019.0.1.20180928" CACHE STRING "" FORCE)
        SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
    ENDIF()
endif()
ENDIF()

SET(MKLML_PROJECT "extern_mklml")
MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
cmake/external/ngraph.cmake

@@ -37,14 +37,18 @@ INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject)

SET(NGRAPH_PROJECT         "extern_ngraph")
SET(NGRAPH_GIT_TAG         "08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9")
SET(NGRAPH_GIT_TAG         "20bd8bbc79ae3a81c57313846a2be7313e5d1dab")
SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
SET(NGRAPH_LIB_DIR         ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
SET(NGRAPH_SHARED_LIB_NAME libngraph.so)
SET(NGRAPH_CPU_LIB_NAME    libcpu_backend.so)
SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    SET(NGRAPH_TBB_LIB_NAME libtbb_debug.so.2)
else()
    SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
endif()
SET(NGRAPH_GIT_REPO        "https://github.com/NervanaSystems/ngraph.git")
SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})

@@ -66,16 +70,7 @@ ExternalProject_Add(
        CMAKE_ARGS      -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
        CMAKE_ARGS      -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
        CMAKE_ARGS      -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib)

# Workaround for nGraph expecting mklml to be in mkldnn install directory.
ExternalProject_Add_Step(
    ${NGRAPH_PROJECT}
    PrepareMKL
    COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so
    COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so
    DEPENDEES download
    DEPENDERS configure
        CMAKE_ARGS      -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib)

add_dependencies(ngraph ${NGRAPH_PROJECT})
cmake/generic.cmake

@@ -117,7 +117,7 @@ function(common_link TARGET_NAME)
  endif()

  if(WITH_JEMALLOC)
    target_link_libraries(${TARGET_NAME} ${JEMALLOC_LIBRARIES})
    target_link_libraries(${TARGET_NAME} jemalloc::jemalloc)
  endif()
endfunction()

@@ -359,6 +359,8 @@ function(cc_binary TARGET_NAME)
    add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
    common_link(${TARGET_NAME})
  endif()
  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
  target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
endfunction(cc_binary)

function(cc_test TARGET_NAME)

@@ -367,18 +369,15 @@ function(cc_test TARGET_NAME)
    set(oneValueArgs "")
    set(multiValueArgs SRCS DEPS ARGS)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
    if(WIN32)
      list(APPEND win32_deps shlwapi)
      if("${cc_test_DEPS};" MATCHES "python;")
        list(REMOVE_ITEM cc_test_DEPS python)
        list(APPEND win32_deps ${PYTHON_LIBRARIES})
        target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
      endif()
    endif(WIN32)
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    if(WIN32)
      target_link_libraries(${TARGET_NAME} ${win32_deps})
    endif(WIN32)
    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main lod_tensor memory gtest gflags glog)
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    common_link(${TARGET_NAME})
    add_test(NAME ${TARGET_NAME}

@@ -451,7 +450,8 @@ function(nv_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules})
    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    common_link(${TARGET_NAME})
    add_test(${TARGET_NAME} ${TARGET_NAME})

@@ -538,7 +538,8 @@ function(hip_test TARGET_NAME)
    endif()
    add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags ${os_dependency_modules})
    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
    common_link(${TARGET_NAME})
    add_test(${TARGET_NAME} ${TARGET_NAME})
paddle/fluid/API.spec

@@ -405,28 +405,50 @@ paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None
paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.optimizer.SGDOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.SGDOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdamaxOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.DecayedAdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None))
paddle.fluid.optimizer.FtrlOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.FtrlOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.RMSPropOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None))
paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.LarsMomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
paddle/fluid/framework/details/CMakeLists.txt

@@ -94,4 +94,4 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
            graph_viz_pass multi_devices_graph_pass
            multi_devices_graph_print_pass multi_devices_graph_check_pass
            fuse_elewise_add_act_pass multi_batch_merge_pass
            memory_optimize_pass)
            memory_optimize_pass lock_free_optimize_pass)
paddle/fluid/framework/details/build_strategy.cc

@@ -232,3 +232,4 @@ USE_PASS(analysis_var_pass);
USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(lock_free_optimize_pass);
paddle/fluid/framework/details/eager_deletion_op_handle.cc

@@ -25,6 +25,8 @@ namespace paddle {
namespace framework {
namespace details {

static const std::string kEagerDeletionOpName{"eager_deletion"};  // NOLINT

EagerDeletionOpHandle::EagerDeletionOpHandle(
    ir::Node *node, const Scope *scope, const platform::Place &place,
    const std::unordered_set<std::string> &var_names, GarbageCollector *gc,

@@ -59,20 +61,15 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
#endif
}

std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }
std::string EagerDeletionOpHandle::Name() const { return kEagerDeletionOpName; }

void EagerDeletionOpHandle::RunImpl() {
#ifdef PADDLE_WITH_CUDA
  platform::RecordEvent record_event(Name(), dev_ctx_);
#else
  platform::RecordEvent record_event(Name(), nullptr);
#endif
  platform::RecordEvent event(kEagerDeletionOpName, nullptr);
  Scope *exec_scope = nullptr;
  std::deque<std::shared_ptr<memory::Allocation>> garbages;
  for (auto &name : var_names_) {
    auto it = ref_cnts_->find(name);
    // Var not found, not reference count has not decreased to 0
    // Reference count has not decreased to 0
    if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
      continue;
    }

@@ -81,6 +78,7 @@ void EagerDeletionOpHandle::RunImpl() {
      exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
    }

    // Var not found
    auto *var = exec_scope->FindVar(name);
    if (var == nullptr) {
      continue;
paddle/fluid/framework/details/multi_devices_graph_pass.cc

@@ -226,7 +226,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
   * Only variables should be the leaves of graph.
   */
  AddOutputToLeafOps(&result);
  result.Erase<GraphOps>(kGraphOps);
  result.Erase(kGraphOps);
  return graph;
}
paddle/fluid/framework/ir/CMakeLists.txt

@@ -31,6 +31,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base)
pass_library(lock_free_optimize_pass base)
pass_library(fc_fuse_pass inference)
pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference)

@@ -41,11 +42,23 @@ pass_library(seq_concat_fc_fuse_pass inference)
pass_library(multi_batch_merge_pass base)
pass_library(conv_bn_fuse_pass inference)
pass_library(seqconv_eltadd_relu_fuse_pass inference)
pass_library(seqpool_concat_fuse_pass inference)
pass_library(is_test_pass base)
pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference)
pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference)
pass_library(transpose_flatten_concat_fuse_pass inference)

# There may be many transpose-flatten structures in a model, and the output of
# these structures will be used as inputs to the concat Op. This pattern will
# be detected by our pass. The index here represents the number of structures in the
# pattern. We use index 3 ~ 6, because these quantities of structures are
# common in the models.
foreach (index RANGE 3 6)
  file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
endforeach()

if(WITH_MKLDNN)
    pass_library(mkldnn_placement_pass base)
    pass_library(depthwise_conv_mkldnn_pass base)

@@ -67,6 +80,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN)
    cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
paddle/fluid/framework/ir/graph.h

@@ -109,7 +109,6 @@ class Graph {
    attr_dels_[attr_name] = []() {};
  }

  template <typename AttrType>
  void Erase(const std::string &attr_name) {
    PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph",
                   attr_name);
paddle/fluid/framework/ir/graph_pattern_detector.cc

@@ -1306,6 +1306,69 @@ PDNode *patterns::ConvAffineChannel::operator()(
  return ac_out_var;
}

// a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a
// b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b
// ...
// z -> transpose_op(n) -> transpose_out_z -> flatten_op(n) -> flatten_out_z
// flatten_out_a -> concat_op  flatten_out_b -> concat_op ... flatten_out_z ->
// concat_op
PDNode *patterns::TransposeFlattenConcat::operator()(
    std::vector<PDNode *> conv_in, int times) {
  // The times represents the repeat times of the
  // {trans, trans_out, flatten, flatten_out}
  const int kNumFields = 4;
  const int kTransOutOffset = 1;
  const int kFlattenOffset = 2;
  const int kFlattenOutOffset = 3;
  std::vector<PDNode *> nodes;

  for (int i = 0; i < times; i++) {
    nodes.push_back(
        pattern->NewNode(GetNodeName("transpose" + std::to_string(i)))
            ->assert_is_op("transpose2"));
    nodes.push_back(
        pattern->NewNode(GetNodeName("transpose_out" + std::to_string(i)))
            ->assert_is_op_output("transpose2")
            ->assert_is_op_input("flatten2", "X")
            ->AsIntermediate());
    nodes.push_back(pattern->NewNode(GetNodeName("flatten" + std::to_string(i)))
                        ->assert_is_op("flatten2"));
    nodes.push_back(
        pattern->NewNode(GetNodeName("flatten_out" + std::to_string(i)))
            ->assert_is_op_output("flatten2")
            ->assert_is_op_nth_input("concat", "X", i)
            ->AsIntermediate());
  }

  auto concat_op = pattern->NewNode(GetNodeName("concat"))
                       ->assert_is_op("concat")
                       ->assert_op_has_n_inputs("concat", times);
  auto concat_out = pattern->NewNode(GetNodeName("concat_out"))
                        ->assert_is_op_output("concat")
                        ->AsOutput();

  std::vector<PDNode *> flatten_outs;
  for (int i = 0; i < times; i++) {
    conv_in[i]->AsInput();
    // trans
    nodes[i * kNumFields]->LinksFrom({conv_in[i]});
    // trans_out
    nodes[i * kNumFields + kTransOutOffset]->LinksFrom({nodes[i * kNumFields]});
    // flatten
    nodes[i * kNumFields + kFlattenOffset]->LinksFrom(
        {nodes[i * kNumFields + kTransOutOffset]});
    // flatten_out
    nodes[i * kNumFields + kFlattenOutOffset]->LinksFrom(
        {nodes[i * kNumFields + kFlattenOffset]});
    flatten_outs.push_back(nodes[i * kNumFields + kFlattenOutOffset]);
  }

  concat_op->LinksFrom(flatten_outs).LinksTo({concat_out});
  return concat_out;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/graph_pattern_detector.h

@@ -766,6 +766,21 @@ struct ConvAffineChannel : public PatternBase {
  PATTERN_DECL_NODE(ac_out);  // Out
};

struct TransposeFlattenConcat : public PatternBase {
  TransposeFlattenConcat(PDPattern *pattern, const std::string &name_scope)
      : PatternBase(pattern, name_scope, "transpose_flatten_concat") {}

  PDNode *operator()(std::vector<PDNode *> conv_inputs, int times);

  std::string GetNodeName(const std::string &op_type) {
    return PDNodeName(name_scope_, repr_, id_, op_type);
  }

  PDNode *GetPDNode(const std::string &op_type) {
    return pattern->RetrieveNode(GetNodeName(op_type));
  }
};

}  // namespace patterns

// Link two ir::Nodes from each other.
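
Note: the new patterns::TransposeFlattenConcat helper above is meant to be driven by GraphPatternDetector from the new transpose_flatten_concat_fuse_pass.cc, whose body is not visible in this part of the diff. The sketch below is illustrative only; it assumes the usual GraphPatternDetector workflow of this codebase, and the input-node setup, the name_scope value, and the handler body are hypothetical, not the actual pass implementation.

// Illustrative sketch (assumed to live in namespace paddle::framework::ir):
// match `times` repetitions of transpose2 -> flatten2 feeding one concat.
void RunTransposeFlattenConcatFuse(ir::Graph *graph, int times) {
  const std::string name_scope = "transpose_flatten_concat_fuse";  // hypothetical
  GraphPatternDetector gpd;
  auto *pattern = gpd.mutable_pattern();

  // One input PDNode per transpose2 branch (assumed setup).
  std::vector<PDNode *> input_nodes;
  for (int i = 0; i < times; ++i) {
    input_nodes.push_back(pattern->NewNode("x" + std::to_string(i))
                              ->assert_is_op_input("transpose2")
                              ->AsInput());
  }

  patterns::TransposeFlattenConcat fuse_pattern(pattern, name_scope);
  fuse_pattern(input_nodes, times);

  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                     ir::Graph *g) {
    // Retrieve matched nodes through the helper's GetPDNode(), then build the
    // fused op and erase the intermediate nodes (details omitted here).
    ir::Node *concat_out = subgraph.at(fuse_pattern.GetPDNode("concat_out"));
    VLOG(3) << "matched concat output: " << concat_out->Name();
  };
  gpd(graph, handler);
}
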
paddle/fluid/framework/ir/lock_free_optimize_pass.cc (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/ir/lock_free_optimize_pass.h"

#include <string>
#include <unordered_set>
#include <vector>

#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {
namespace ir {

const char kSumGradOpName[] = "sum";
// TODO(minqiyang): only support sgd at current time, please add
// other optimizers later.
const char kOptimizerType[] = "sgd";

std::unique_ptr<ir::Graph> LockFreeOptimizePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  PADDLE_ENFORCE(graph.get());

  // We could collect all weights' name from SGD, where
  // W1 <- SGD(W0, Grad0)
  std::unordered_set<std::string> weight_var_set;
  for (auto* node : graph->Nodes()) {
    if (IsOpNamed(node, kOptimizerType)) {
      auto& param_out_vars = node->Op()->Output("ParamOut");
      PADDLE_ENFORCE(param_out_vars.size() == 1u);
      weight_var_set.insert(param_out_vars[0]);
    }
  }

  // find all grad's merge op via weight name, where
  // Grad0 <- SUM(Grad1, Grad2, Grad3 ...)
  std::unordered_set<ir::Node*> grad_sum_op_set;
  for (ir::Node* node : graph->Nodes()) {
    if (IsOpNamed(node, kSumGradOpName)) {
      for (ir::Node* output : node->outputs) {
        // strip the last grad suffix @GRAD
        std::string var_name = output->Name();
        const std::string suffix(kGradVarSuffix);
        if (var_name != suffix && var_name.size() > suffix.size() &&
            var_name.substr(var_name.size() - suffix.size()) == suffix) {
          // if so then strip them off
          var_name = var_name.substr(0, var_name.size() - suffix.size());
          if (weight_var_set.find(var_name) != weight_var_set.end()) {
            grad_sum_op_set.insert(node);
            break;
          }
        }
      }
    }
  }

  // get the forward op and backward op pairs, where
  // out <- forward(X, W)
  // Grad1 <- backward(out, X')
  // Grad0 <- SUM(Grad1, Grad2, Grad3 ...)
  // W0 <- SGD(W1, Grad0)
  for (ir::Node* node : grad_sum_op_set) {
    for (ir::Node* merged_grad_var : node->outputs) {
      // find the optimizers connected with sum op
      if (IsVarNameEndsWith(merged_grad_var, kGradVarSuffix) &&
          merged_grad_var->outputs.size() == 1u) {
        ir::Node* opt_node = merged_grad_var->outputs[0];
        VLOG(3) << "Found opt node " << opt_node->Name();

        // find the backward op connected with sum op
        for (ir::Node* unmerged_grad_var : node->inputs) {
          if (IsVarNameContains(unmerged_grad_var, kGradVarSuffix) &&
              unmerged_grad_var->inputs.size() == 1u) {
            ir::Node* backward_op = unmerged_grad_var->inputs[0];
            VLOG(3) << "Found backward_op " << backward_op->Name();

            // find the forward op related to the backward op
            ir::Node* forward_op =
                FindForwardOpViaBackwardOp(graph.get(), backward_op);
            VLOG(3) << "Found forward_op " << forward_op->Name();
            PADDLE_ENFORCE(forward_op);

            Node* new_optimizer_node = CreateNewSGDNode(
                graph.get(), forward_op, backward_op, node, opt_node);
            PADDLE_ENFORCE(new_optimizer_node);
          }
        }
      }
    }
  }

  // Remove the sum_op and its' outputs and connected Optimizers
  for (Node* sum_op : grad_sum_op_set) {
    for (Node* sum_op_output : sum_op->outputs) {
      for (Node* optimize_op : sum_op_output->outputs) {
        if (optimize_op->NodeType() == Node::Type::kOperation &&
            optimize_op->Name() == kOptimizerType) {
          VLOG(3) << "remove optimize_op: " << optimize_op->Name() << "_"
                  << optimize_op->id();
          graph->RemoveNode(optimize_op);
        }
      }
      VLOG(3) << "remove sum_op_output: " << sum_op_output->Name() << "_"
              << sum_op_output->id();
      graph->RemoveNode(sum_op_output);
    }
    VLOG(3) << "remove sum_op: " << sum_op->Name() << "_" << sum_op->id();
    graph->RemoveNode(sum_op);
  }

  for (auto* node : graph->Nodes()) {
    for (Node* output_node : node->outputs) {
      if (output_node->Name() == "sgd") {
        VLOG(3) << "Node link to SGD: " << node->Name() << "_" << node->id()
                << " --> " << output_node->Name() << "_" << output_node->id();
        for (Node* input_node : node->inputs) {
          VLOG(3) << "SGD Input link: " << input_node->Name() << "_"
                  << input_node->id() << " --> " << node->Name() << "_"
                  << node->id();
        }
      }
    }
  }

  return graph;
}

ir::Node* LockFreeOptimizePass::CreateNewSGDNode(
    ir::Graph* graph, ir::Node* forward_node, ir::Node* backward_node,
    ir::Node* grad_sum_node, ir::Node* optimize_node) const {
  PADDLE_ENFORCE(graph);
  PADDLE_ENFORCE(forward_node);
  PADDLE_ENFORCE(backward_node);
  PADDLE_ENFORCE(grad_sum_node);
  PADDLE_ENFORCE(optimize_node);

  // find the grad var node between the grad sum node and backward_node
  std::vector<ir::Node*> grad_vars =
      FindConnectedNode(backward_node, grad_sum_node);
  ir::Node* grad_node = nullptr;
  for (ir::Node* node : grad_vars) {
    if (!ir::IsControlDepVar(*node)) {
      grad_node = node;
    }
  }
  PADDLE_ENFORCE(grad_node);

  // create a new SGD node
  OpDesc* old_desc = optimize_node->Op();
  // keep with the same block between new optimizer and the old one
  OpDesc new_desc(*old_desc, old_desc->Block());
  new_desc.SetInput("Param", old_desc->Input("Param"));
  new_desc.SetInput("LearningRate", old_desc->Input("LearningRate"));
  new_desc.SetInput("Grad", std::vector<std::string>({grad_node->Name()}));
  new_desc.SetOutput("ParamOut", old_desc->Output("ParamOut"));

  std::vector<std::string> op_role_vars = boost::get<std::vector<std::string>>(
      new_desc.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName()));
  // replace the second op role var, because the grad name was
  // changed in new optimizer
  op_role_vars.pop_back();
  op_role_vars.push_back(grad_node->Name());
  new_desc.SetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(),
                   op_role_vars);
  new_desc.SetType(kOptimizerType);

  // set backward op's op role var, this will be used to
  // set device_id in multi_device_pass
  backward_node->Op()->SetAttr(
      framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), op_role_vars);
  // backward_node->Op()->SetAttr(
  // framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), {});

  // keep with the same output nodes between new optimizer and the
  // old one
  Node* sgd_node = graph->CreateOpNode(&new_desc);

  // change all outputs of the optimize_node to the new one
  ReplaceAllDownstreamNode(optimize_node, sgd_node);

  // find connected node between forward node and optimize node
  // and replace the optimize node to new sgd node
  std::vector<ir::Node*> forward_opt_connected_nodes =
      FindConnectedNode(forward_node, optimize_node);
  for (ir::Node* node : forward_opt_connected_nodes) {
    ReplaceUpstreamNode(node, optimize_node, sgd_node);
  }

  // find connected node between backward node and optimize node
  // and replace the optimize node to new sgd node
  std::vector<ir::Node*> backward_opt_connected_nodes =
      FindConnectedNode(backward_node, optimize_node);
  for (ir::Node* node : backward_opt_connected_nodes) {
    ReplaceUpstreamNode(node, optimize_node, sgd_node);
  }

  // SGD must have only one param and LR in
  PADDLE_ENFORCE(old_desc->Input("LearningRate").size() == 1u);
  PADDLE_ENFORCE(old_desc->Input("Param").size() == 1u);

  // LR and weight nodes should be copied
  for (Node* upstream_node : optimize_node->inputs) {
    if (upstream_node->Name() == old_desc->Input("LearningRate")[0] ||
        upstream_node->Name() == old_desc->Input("Param")[0]) {
      ReplaceUpstreamNode(upstream_node, optimize_node, sgd_node);
    }
  }

  VLOG(3) << "Create new opt node" << sgd_node->Name() << "_" << sgd_node->id();

  return sgd_node;
}

std::vector<ir::Node*> LockFreeOptimizePass::FindConnectedNode(
    ir::Node* upstream_node, ir::Node* downstream_node) const {
  std::vector<ir::Node*> result;
  for (ir::Node* out_node : upstream_node->outputs) {
    for (ir::Node* in_node : downstream_node->inputs) {
      if (in_node == out_node) {
        result.push_back(in_node);
      }
    }
  }

  return result;
}

void LockFreeOptimizePass::ReplaceUpstreamNode(
    ir::Node* upstream_node, ir::Node* old_optimizer_node,
    ir::Node* new_optimizer_node) const {
  PADDLE_ENFORCE(upstream_node);
  PADDLE_ENFORCE(old_optimizer_node);
  PADDLE_ENFORCE(new_optimizer_node);

  // Remove the old_optimizer_node from upstream_node's outputs vector
  auto& output_node_vec = upstream_node->outputs;
  for (auto output_node_iter = output_node_vec.begin();
       output_node_iter != output_node_vec.end();) {
    if (*output_node_iter == old_optimizer_node) {
      output_node_vec.erase(output_node_iter);
      break;
    } else {
      ++output_node_iter;
    }
  }

  // Add the new_optimizer_node to upstream_node's outputs vector
  output_node_vec.emplace_back(new_optimizer_node);
  new_optimizer_node->inputs.emplace_back(upstream_node);
}

void LockFreeOptimizePass::ReplaceAllDownstreamNode(
    ir::Node* old_optimizer_node, ir::Node* new_optimizer_node) const {
  PADDLE_ENFORCE(old_optimizer_node);
  PADDLE_ENFORCE(new_optimizer_node);

  for (ir::Node* downstream_node : old_optimizer_node->outputs) {
    // Remove the old_optimizer_node from downstream_node's inputs vector
    auto& input_node_vec = downstream_node->inputs;
    for (auto input_node_iter = input_node_vec.begin();
         input_node_iter != input_node_vec.end();) {
      if (*input_node_iter == old_optimizer_node) {
        input_node_vec.erase(input_node_iter);
        break;
      } else {
        ++input_node_iter;
      }
    }

    // Add the new_optimizer_node to downstream_node's inputs vector
    input_node_vec.emplace_back(new_optimizer_node);
    new_optimizer_node->outputs.emplace_back(downstream_node);
  }
}

ir::Node* LockFreeOptimizePass::FindForwardOpViaBackwardOp(
    ir::Graph* graph, ir::Node* backward_node) const {
  PADDLE_ENFORCE(graph);
  PADDLE_ENFORCE(backward_node);

  // strip the suffix _grad of backward_node's name
  std::string forward_op_name = backward_node->Name();
  const std::string suffix("_grad");
  if (forward_op_name != suffix && forward_op_name.size() > suffix.size() &&
      forward_op_name.substr(forward_op_name.size() - suffix.size()) ==
          suffix) {
    // if so then strip them off
    forward_op_name =
        forward_op_name.substr(0, forward_op_name.size() - suffix.size());
  } else {
    LOG(WARNING) << "Illegal backward node's name " << backward_node->Name()
                 << " id " << backward_node->id();
    return nullptr;
  }

  for (ir::Node* node : graph->Nodes()) {
    if (node->Name() == forward_op_name) {
      if (node->outputs.size() == 0u) {
        // if forward_node has no output, then it has NO grad op
        continue;
      }

      // check whether all inputs of the backward_op that ends_with @GRAD
      // comes from the output of forward_op is the input of the backward_op
      bool is_related_forward_node = true;
      for (ir::Node* backward_input : backward_node->inputs) {
        if (IsVarNameEndsWith(backward_input, kGradVarSuffix)) {
          bool meets_correct_output = false;
          for (ir::Node* forward_output : node->outputs) {
            if (forward_output->Name() + kGradVarSuffix ==
                backward_input->Name()) {
              meets_correct_output = true;
              break;
            }
          }

          if (!meets_correct_output) {
            is_related_forward_node = false;
            break;
          }
        }
      }

      if (is_related_forward_node) {
        return node;
      }
    }
  }

  return nullptr;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(lock_free_optimize_pass,
              paddle::framework::ir::LockFreeOptimizePass);
paddle/fluid/framework/ir/lock_free_optimize_pass.h
0 → 100644

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
#define PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_

#include <string>
#include <vector>

#include <boost/algorithm/string/predicate.hpp>

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace ir {

class Node;

/*
 * Remove the sum op of all gradients of the backward op.
 * And remove the dependencies of the optimizer related to the
 * same backward op.
 *
 * Before this pass:
 *
 * forward_op1 forward_op2
 *     |            |
 *  grad_op1    grad_op2
 *        \      /
 *         \    /
 *        sum_op
 *           |
 *         sgd_op
 *
 * After this pass:
 * forward_op1 forward_op2
 *     |            |
 *  grad_op1    grad_op2
 *     |            |
 *  sgd_op1      sgd_op2
 *
 * sgd_op1 and sgd_op2 will update the same weight, which holds the same
 * memory, so we can benefit from the acceleration.
 */
class LockFreeOptimizePass : public Pass {
 public:
  virtual ~LockFreeOptimizePass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;

 private:
  // Create a new sgd node via current optimizer node
  ir::Node* CreateNewSGDNode(ir::Graph* graph, ir::Node* forward_node,
                             ir::Node* backward_node, ir::Node* grad_sum_node,
                             ir::Node* optimize_node) const;

  // Replace the input weight's optimizers
  void ReplaceUpstreamNode(ir::Node* upstream_node,
                           ir::Node* old_optimizer_node,
                           ir::Node* new_optimizer_node) const;

  // Replace the output weight's optimizers
  void ReplaceAllDownstreamNode(ir::Node* old_optimizer_node,
                                ir::Node* new_optimizer_node) const;

  // Find all weight variables in graph
  bool FindAllWeightVars(ir::Graph* graph) const;

  // Find the forward_op node via the backward_op node
  ir::Node* FindForwardOpViaBackwardOp(ir::Graph* graph,
                                       ir::Node* backward_node) const;

  std::vector<ir::Node*> FindConnectedNode(ir::Node* upstream_node,
                                           ir::Node* downstream_node) const;

  inline bool IsOpNamed(ir::Node* node, const std::string& name) const {
    PADDLE_ENFORCE(node);

    return node->NodeType() == Node::Type::kOperation && node->Name() == name;
  }

  inline bool IsVarNamed(ir::Node* node, const std::string& name) const {
    PADDLE_ENFORCE(node);

    return node->NodeType() == Node::Type::kVariable && node->Name() == name;
  }

  inline bool IsVarNameEndsWith(ir::Node* node,
                                const std::string& name) const {
    PADDLE_ENFORCE(node);

    return node->NodeType() == Node::Type::kVariable &&
           boost::algorithm::ends_with(node->Name(), name);
  }

  inline bool IsVarNameContains(ir::Node* node,
                                const std::string& name) const {
    PADDLE_ENFORCE(node);

    return node->NodeType() == Node::Type::kVariable &&
           node->Name().find(name) != std::string::npos;
  }

  inline bool IsControlDepFrom(ir::Node* ctrl_dep_node, ir::Node* node) const {
    PADDLE_ENFORCE(ctrl_dep_node);
    PADDLE_ENFORCE(node);

    return IsControlDepVar(*ctrl_dep_node) &&
           ctrl_dep_node->inputs.size() >= 1u &&
           ctrl_dep_node->inputs[0] == node;
  }
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
#endif // PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
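The class comment above shows the rewrite this pass performs. As a quick orientation, a minimal sketch of driving the registered pass over a graph might look as follows, assuming the pass library is linked in (with `USE_PASS(lock_free_optimize_pass)` declared somewhere) and `program` is an existing ProgramDesc; this helper is illustrative only and not part of the patch:

// Minimal sketch, not part of the patch: fetch the registered pass and apply it.
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

std::unique_ptr<paddle::framework::ir::Graph> RunLockFreeOptimize(
    const paddle::framework::ProgramDesc &program) {
  using paddle::framework::ir::Graph;
  using paddle::framework::ir::PassRegistry;
  std::unique_ptr<Graph> graph(new Graph(program));
  auto pass = PassRegistry::Instance().Get("lock_free_optimize_pass");
  // Rewrites sum_op + sgd_op into one sgd op per gradient, as in the diagram above.
  return pass->Apply(std::move(graph));
}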
paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
0 → 100644

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"

#define MAX_CONCAT_INPUTS 200

namespace paddle {
namespace framework {
namespace ir {

PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
                                  const std::string& name_scope,
                                  int num_inputs) {
  auto is_concat_op_with_inputs = [](Node* x, int num) -> bool {
    return x && x->IsOp() && x->Op()->Type() == "concat" &&
           x->Op()->Input("X").size() == static_cast<size_t>(num);
  };

  auto is_nth_input_var_of_concat = [=](Node* x, int idx) -> bool {
    return x && x->IsVar() && VarLinksToOp(x, "concat") &&
           x->outputs.size() == 1 && IsNthInput(x, x->outputs[0], "X", idx) &&
           is_concat_op_with_inputs(x->outputs[0], num_inputs);
  };

  auto is_seqpool_op_with_pootype_of_nth_input_of_concat = [=](
      Node* x, const std::string& type, int idx) -> bool {
    bool this_is_seqpool_op =
        x && x->IsOp() && x->Op()->Type() == "sequence_pool" &&
        x->Op()->HasAttr("pooltype") &&
        boost::get<std::string>(x->Op()->GetAttr("pooltype")) == type &&
        x->outputs.size() == 2;  // seqpool should only have 2 outputs
    bool satisfied_all = this_is_seqpool_op;
    if (this_is_seqpool_op) {
      // Only one output of seqpool_op is nth_input_var of concat,
      // the other one should be unused empty var.
      if (is_nth_input_var_of_concat(x->outputs[0], idx)) {
        satisfied_all = satisfied_all && x->outputs[1]->IsVar() &&
                        x->outputs[1]->outputs.empty();
      } else {
        satisfied_all =
            satisfied_all && is_nth_input_var_of_concat(x->outputs[1], idx) &&
            x->outputs[0]->IsVar() && x->outputs[0]->outputs.size() == 0;
      }
    }
    return satisfied_all;
  };

  auto* concat_op = pattern->NewNode(
      [=](Node* x) { return is_concat_op_with_inputs(x, num_inputs); },
      name_scope + "/concat_op");
  concat_op->assert_op_attr<int>("axis", 1);

  auto* concat_out_var = pattern->NewNode(
      [=](Node* x) {
        return x && x->IsVar() && VarLinksFromOp(x, "concat") &&
               x->inputs.size() == 1 &&
               is_concat_op_with_inputs(x->inputs[0], num_inputs);
      },
      name_scope + "/concat_out_var");
  concat_out_var->assert_is_only_output_of_op("concat");

  std::vector<PDNode*> seqpool_ops_input_var(num_inputs);
  std::vector<PDNode*> seqpool_ops_output_var(num_inputs);
  std::vector<PDNode*> seqpool_ops_output_unused_var(num_inputs);
  std::vector<PDNode*> seqpool_ops(num_inputs);

  for (int i = 0; i < num_inputs; ++i) {
    seqpool_ops_output_var[i] = pattern->NewNode(
        [=](Node* x) {
          return x && x->IsVar() && is_nth_input_var_of_concat(x, i) &&
                 x->inputs.size() == 1 &&
                 is_seqpool_op_with_pootype_of_nth_input_of_concat(
                     x->inputs[0], "SUM", i);
        },
        name_scope + "/sequence_pool_out_" + std::to_string(i));

    seqpool_ops_output_unused_var[i] = pattern->NewNode(
        [=](Node* x) {
          return x && x->IsVar() && x->inputs.size() == 1 &&
                 x->outputs.size() == 0 &&
                 is_seqpool_op_with_pootype_of_nth_input_of_concat(
                     x->inputs[0], "SUM", i);
        },
        name_scope + "/sequence_pool_unused_out_" + std::to_string(i));

    seqpool_ops[i] = pattern->NewNode(
        [=](Node* x) {
          return x && x->IsOp() &&
                 is_seqpool_op_with_pootype_of_nth_input_of_concat(x, "SUM",
                                                                   i);
        },
        name_scope + "/sequence_pool_op_" + std::to_string(i));

    seqpool_ops_input_var[i] = pattern->NewNode(
        [=](Node* x) {
          bool basic = x && x->IsVar() && x->outputs.size() >= 1;
          bool next_is_fine = false;
          for (auto* o : x->outputs) {
            if (is_seqpool_op_with_pootype_of_nth_input_of_concat(o, "SUM",
                                                                  i)) {
              next_is_fine = true;
              break;
            }
          }
          return basic && next_is_fine;
        },
        name_scope + "/sequence_pool_in_" + std::to_string(i));

    // Links
    seqpool_ops[i]
        ->LinksFrom({seqpool_ops_input_var[i]})
        .LinksTo({seqpool_ops_output_var[i],
                  seqpool_ops_output_unused_var[i]});
  }
  concat_op->LinksFrom(seqpool_ops_output_var).LinksTo({concat_out_var});
  return concat_out_var;
}

int BuildFusion(Graph* graph, const std::string& name_scope, int num_inputs) {
  GraphPatternDetector gpd;
  auto* pattern = gpd.mutable_pattern();
  BuildSeqPoolConcatPattern(pattern, name_scope, num_inputs);

  auto retrieve_node = [](const std::string& name,
                          const GraphPatternDetector::subgraph_t& subgraph,
                          const PDPattern& pat) -> Node* {
    PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)),
                   "pattern has no Node called %s", name.c_str());
    Node* p = subgraph.at(pat.RetrieveNode(name));
    PADDLE_ENFORCE_NOT_NULL(p, "subgraph has no node %s", name.c_str());
    return p;
  };

  int fusion_count{0};
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    VLOG(4) << "handle SeqPool Concat fuse";
    std::vector<std::string> input_names(num_inputs);
    std::vector<Node*> input_vars(num_inputs);
    auto& fused_pattern = gpd.pattern();
    for (int i = 0; i < num_inputs; ++i) {
      input_vars[i] =
          retrieve_node(name_scope + "/sequence_pool_in_" + std::to_string(i),
                        subgraph, fused_pattern);
      input_names[i] = input_vars[i]->Name();
    }
    auto* concat_op =
        retrieve_node(name_scope + "/concat_op", subgraph, fused_pattern);
    auto* concat_out_var =
        retrieve_node(name_scope + "/concat_out_var", subgraph, fused_pattern);
    auto* seqpool_op0 = retrieve_node(name_scope + "/sequence_pool_op_0",
                                      subgraph, fused_pattern);

    // Create New OpDesc
    OpDesc op_desc;
    op_desc.SetType("fusion_seqpool_concat");
    op_desc.SetInput("X", input_names);
    op_desc.SetAttr("pooltype", seqpool_op0->Op()->GetAttr("pooltype"));
    op_desc.SetAttr("axis", concat_op->Op()->GetAttr("axis"));
    op_desc.SetOutput("Out", {concat_out_var->Name()});
    auto* op = graph->CreateOpNode(&op_desc);
    for (size_t i = 0; i < input_vars.size(); ++i) {
      IR_NODE_LINK_TO(input_vars[i], op);
    }
    IR_NODE_LINK_TO(op, concat_out_var);

    std::unordered_set<const Node*> marked_nodes;
    for (auto& item : subgraph) {
      marked_nodes.insert(item.second);
    }
    for (size_t i = 0; i < input_vars.size(); ++i) {
      marked_nodes.erase(input_vars[i]);
    }
    marked_nodes.erase(concat_out_var);
    GraphSafeRemoveNodes(graph, marked_nodes);
    ++fusion_count;
  };

  gpd(graph, handler);
  return fusion_count;
}

std::unique_ptr<ir::Graph> SeqPoolConcatFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  FusePassBase::Init(name_scope_, graph.get());
  int fusion_count = 0;
  for (int i = MAX_CONCAT_INPUTS; i > 0; --i) {
    fusion_count +=
        BuildFusion(graph.get(), name_scope_ + "/" + std::to_string(i), i);
  }
  AddStatis(fusion_count);

  return graph;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(seqpool_concat_fuse_pass,
              paddle::framework::ir::SeqPoolConcatFusePass);
paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
0 → 100644

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#pragma once

#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

namespace paddle {
namespace framework {
namespace ir {

/**
 * Fuse SequencePool (currently only with the sum pooltype) and Concat;
 *
 * Before fuse:
 *    |        |              |
 * seq_pool, seq_pool, ... seq_pool
 *    \        |      ...    /
 *            concat
 *              |
 * After fuse:
 *    \        |             /
 *      FusionSeqPoolConcat
 *              |
 */
class SeqPoolConcatFusePass : public FusePassBase {
 public:
  virtual ~SeqPoolConcatFusePass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;

  const std::string name_scope_{"seqpool_concat_fuse"};
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
0 → 100644

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"

namespace paddle {
namespace framework {
namespace ir {

void SetOp(ProgramDesc* prog, const std::string& type,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs) {
  auto* op = prog->MutableBlock(0)->AppendOp();
  op->SetType(type);
  if (type == "sequence_pool") {
    op->SetInput("X", {inputs[0]});
    std::string pooltype = "SUM";
    op->SetAttr("pooltype", pooltype);
    op->SetOutput("MaxIndex", {outputs[0]});
    op->SetOutput("Out", {outputs[1]});
  } else if (type == "concat") {
    op->SetInput("X", inputs);
    op->SetAttr("axis", 1);
    op->SetOutput("Out", {outputs[0]});
  } else {
    op->SetInput("X", inputs);
    op->SetOutput("Out", outputs);
  }
  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
              static_cast<int>(OpRole::kForward));
}

int CountOpType(const ir::Graph* graph,
                const std::string& op_type = "fusion_seqpool_concat") {
  int count = 0;
  for (auto* node : graph->Nodes()) {
    if (node->IsOp() && node->Op()->Type() == op_type) {
      ++count;
    }
  }
  return count;
}

std::unique_ptr<ir::Graph> GetNumNodesOfBeforeAfter(
    std::unique_ptr<ir::Graph> graph, int* before, int* after,
    const std::string& pass_type = "seqpool_concat_fuse_pass") {
  auto pass = PassRegistry::Instance().Get(pass_type);
  *before = graph->Nodes().size();
  graph = pass->Apply(std::move(graph));
  *after = graph->Nodes().size();
  return graph;
}

/*
 * Before fuse:
 *    a         b         c
 *    |         |         |
 *   op1       op2       op3
 *   / \       / \       / \
 *  d   e     f   g     h   i
 *       \        |        /
 *             concat
 *                |
 *                j
 * Type of op1, op2 and op3 are sequence_pool, with "SUM" pooltype attr
 *
 * After fuse:
 *    a         b         c
 *     \        |        /
 *    fusion_seqpool_concat
 *               |
 *               j
 */
TEST(SeqPoolConcatFusePass, basic) {
  ProgramDesc prog;
  for (auto& v : std::vector<std::string>(
           {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"})) {
    auto* var = prog.MutableBlock(0)->Var(v);
    var->SetType(proto::VarType::LOD_TENSOR);
  }

  SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
        std::vector<std::string>({"d", "e"}));
  SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
        std::vector<std::string>({"f", "g"}));
  SetOp(&prog, "sequence_pool", std::vector<std::string>({"c"}),
        std::vector<std::string>({"h", "i"}));
  SetOp(&prog, "concat", std::vector<std::string>({"e", "g", "i"}),
        std::vector<std::string>({"j"}));

  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  int before, after;
  graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
  // Remove 10 Nodes: op1, op2, op3, d, e, f, g, h, i, concat_op
  // Add 1 Node: fusion_seqpool_concat
  EXPECT_EQ(after, before - 9);
  EXPECT_EQ(CountOpType(graph.get()), 1);
}

/*
 * Before fuse:
 *    a               b
 *    |              / \
 *   op1           op2  op3
 *   / \           / \    \
 *  c   d         e   f    g
 *       \           /
 *         concat
 *            |
 *            h
 * Type of op1 and op2 are sequence_pool, with "SUM" pooltype attr
 *
 * After fuse:
 *    a               b
 *     \             / \
 *   fusion_seqpool_concat  op3
 *            |              |
 *            h              g
 */
TEST(SeqPoolConcatFusePass, advanced) {
  ProgramDesc prog;
  for (auto& v :
       std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h"})) {
    auto* var = prog.MutableBlock(0)->Var(v);
    var->SetType(proto::VarType::LOD_TENSOR);
  }

  SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
        std::vector<std::string>({"c", "d"}));
  SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
        std::vector<std::string>({"e", "f"}));
  SetOp(&prog, "op3", std::vector<std::string>({"b"}),
        std::vector<std::string>({"g"}));
  SetOp(&prog, "concat", std::vector<std::string>({"d", "f"}),
        std::vector<std::string>({"h"}));

  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  int before, after;
  graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
  // Remove 7 Nodes: op1, op2, c, d, e, f, concat_op
  // Add 1 Node: fusion_seqpool_concat
  EXPECT_EQ(after, before - 6);
  EXPECT_EQ(CountOpType(graph.get()), 1);
}

ProgramDesc BuildProgramDesc(int num_inputs_of_concat) {
  ProgramDesc prog;
  auto new_var = [&](const std::string& name) {
    auto* var = prog.MutableBlock(0)->Var(name);
    var->SetType(proto::VarType::LOD_TENSOR);
  };
  std::vector<std::string> concat_inputs;
  for (int i = 0; i < num_inputs_of_concat; ++i) {
    std::string prefix = "seqpool_op_" + std::to_string(i);
    new_var(prefix + "in");
    new_var(prefix + "out");
    new_var(prefix + "out_unused");
    SetOp(&prog, "sequence_pool", std::vector<std::string>({prefix + "in"}),
          std::vector<std::string>({prefix + "out", prefix + "out_unused"}));
    concat_inputs.push_back(prefix + "out");
  }
  SetOp(&prog, "concat", concat_inputs,
        std::vector<std::string>({"concat_out"}));
  return prog;
}

// test more inputs of concat
TEST(SeqPoolConcatFusePass, more_inputs) {
  for (int num : {1, 2, 10}) {
    ProgramDesc prog = BuildProgramDesc(num);
    std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
    int before, after;
    graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
    // Remove Nodes: n * (seqpool_op, out, out_unused), and concat_op
    // Add Node: fusion_seqpool_concat op
    EXPECT_EQ(after, before - num * 3);
    EXPECT_EQ(CountOpType(graph.get()), 1);
  }
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

USE_PASS(seqpool_concat_fuse_pass);
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
0 → 100644

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <string>
#include <vector>

#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h"

namespace paddle {
namespace framework {
namespace ir {

template <int times>
std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  const std::string pattern_name =
      "transpose_flatten" + std::to_string(times) + "_concat_fuse";
  FusePassBase::Init(pattern_name, graph.get());

  GraphPatternDetector gpd;
  std::vector<PDNode*> input_nodes;
  for (int i = 0; i < times; i++) {
    input_nodes.push_back(gpd.mutable_pattern()
                              ->NewNode("x" + std::to_string(i))
                              ->assert_is_op_input("transpose2", "X")
                              ->AsInput());
  }

  patterns::TransposeFlattenConcat pattern(gpd.mutable_pattern(),
                                           pattern_name);
  pattern(input_nodes, times);

  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    const int kNumFields = 5;
    const int kTransOffset = 1;
    const int kTransOutOffset = 2;
    const int kFlattenOffset = 3;
    const int kFlattenOutOffset = 4;
    std::vector<Node*> nodes;

    for (int i = 0; i < times; i++) {
      PADDLE_ENFORCE(
          subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
      PADDLE_ENFORCE(
          subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))));
      PADDLE_ENFORCE(
          subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))));
      PADDLE_ENFORCE(
          subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))));
      PADDLE_ENFORCE(subgraph.at(input_nodes[i]));

      nodes.push_back(subgraph.at(input_nodes[i]));
      nodes.push_back(
          subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
      nodes.push_back(
          subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))));
      nodes.push_back(
          subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))));
      nodes.push_back(
          subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))));
    }

    Node* concat_op = subgraph.at(pattern.GetPDNode("concat"));
    Node* concat_out = subgraph.at(pattern.GetPDNode("concat_out"));
    std::vector<std::string> input_names;
    std::vector<int> trans_axis = boost::get<std::vector<int>>(
        nodes[kTransOffset]->Op()->GetAttr("axis"));
    int flatten_axis =
        boost::get<int>(nodes[kFlattenOffset]->Op()->GetAttr("axis"));
    int concat_axis = boost::get<int>(concat_op->Op()->GetAttr("axis"));
    std::string output_name = concat_out->Name();

    for (int i = 0; i < times; i++) {
      input_names.push_back(nodes[i * kNumFields]->Name());
    }

    framework::OpDesc new_op_desc;
    new_op_desc.SetType("fusion_transpose_flatten_concat");
    new_op_desc.SetInput("X", input_names);
    new_op_desc.SetAttr("trans_axis", trans_axis);
    new_op_desc.SetAttr("flatten_axis", flatten_axis);
    new_op_desc.SetAttr("concat_axis", concat_axis);
    new_op_desc.SetOutput("Out", {output_name});
    new_op_desc.Flush();

    // Create a new node for the fused op.
    auto* new_conv_op = graph->CreateOpNode(&new_op_desc);

    std::unordered_set<const Node*> delete_nodes;

    for (int i = 0; i < times; i++) {
      nodes[i * kNumFields]->outputs.push_back(new_conv_op);
      new_conv_op->inputs.push_back(nodes[i * kNumFields]);
      delete_nodes.insert(nodes[i * kNumFields + kTransOffset]);
      delete_nodes.insert(nodes[i * kNumFields + kTransOutOffset]);
      delete_nodes.insert(nodes[i * kNumFields + kFlattenOffset]);
      delete_nodes.insert(nodes[i * kNumFields + kFlattenOutOffset]);
    }

    delete_nodes.insert(concat_op);

    new_conv_op->outputs.push_back(concat_out);
    concat_out->inputs.push_back(new_conv_op);

    // Delete the unneeded nodes.
    GraphSafeRemoveNodes(graph.get(), delete_nodes);
  };

  gpd(graph.get(), handler);
  return graph;
}

template class TransposeFlattenConcatFusePass<1>;
template class TransposeFlattenConcatFusePass<3>;
template class TransposeFlattenConcatFusePass<4>;
template class TransposeFlattenConcatFusePass<5>;
template class TransposeFlattenConcatFusePass<6>;

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(transpose_flatten_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<1>);

REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<3>);

REGISTER_PASS(transpose_flatten4_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<4>);

REGISTER_PASS(transpose_flatten5_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<5>);

REGISTER_PASS(transpose_flatten6_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<6>);
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
0 → 100644

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

namespace paddle {
namespace framework {
namespace ir {

// There may be many transpose-flatten structures in a model, and the output of
// these structures will be used as inputs to the concat Op. This pattern will
// be detected by our pass. The times here represents the repeat times of this
// structure.
template <int times>
class TransposeFlattenConcatFusePass : public FusePassBase {
 public:
  virtual ~TransposeFlattenConcatFusePass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
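Since the pass is registered once per supported branch count, a caller has to enable the variant that matches its model. Below is a hedged sketch reusing the `PaddlePassBuilder::AppendPass` interface that appears later in this commit; `builder` is a placeholder for whatever pass builder the predictor exposes, and which variant actually fires depends on how many transpose+flatten branches feed the concat:

// Hedged sketch, not part of the patch: enable the registered variants.
void AddTransposeFlattenConcatFuses(paddle::PaddlePassBuilder *builder) {
  builder->AppendPass("transpose_flatten_concat_fuse_pass");   // times == 1
  builder->AppendPass("transpose_flatten3_concat_fuse_pass");  // times == 3
  builder->AppendPass("transpose_flatten4_concat_fuse_pass");  // times == 4
  builder->AppendPass("transpose_flatten5_concat_fuse_pass");  // times == 5
  builder->AppendPass("transpose_flatten6_concat_fuse_pass");  // times == 6
}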
paddle/fluid/framework/ngraph_bridge.cc
...
@@ -32,8 +32,11 @@ std::map<std::string,
                        std::string, std::shared_ptr<ngraph::Node>>>)>>
    NgraphBridge::NG_NODE_MAP = {
        {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode},
        {"mean", paddle::operators::ngraphs::BuildMeanNode},
        {"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode},
        {"mul", paddle::operators::ngraphs::BuildMulNode},
        {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
        {"scale", paddle::operators::ngraphs::BuildScaleNode},
        {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
        {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>},
        {"top_k", paddle::operators::ngraphs::BuildTopKNode}};
...
paddle/fluid/framework/parallel_executor.cc
...
@@ -193,15 +193,14 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
     Scope *scope, const std::vector<Scope *> &local_scopes,
-    const ExecutionStrategy &exec_strategy,
-    const BuildStrategy &build_strategy, size_t num_trainers,
-    size_t trainer_id)
+    const ExecutionStrategy &exec_strategy,
+    const BuildStrategy &build_strategy)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
   member_->build_strategy_ = build_strategy;
   member_->use_all_reduce_ =
       build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
-  member_->nranks_ = num_trainers * places.size();
+  member_->nranks_ = build_strategy.num_trainers_ * places.size();

   if (!member_->use_all_reduce_) {
     PADDLE_ENFORCE(places.size() > 1,
...
@@ -253,7 +252,8 @@ ParallelExecutor::ParallelExecutor(
     }
     member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
-        member_->places_, nccl_id, num_trainers, trainer_id));
+        member_->places_, nccl_id, build_strategy.num_trainers_,
+        build_strategy.trainer_id_));
 #else
     PADDLE_THROW("Not compiled with CUDA");
 #endif
...
paddle/fluid/framework/parallel_executor.h
...
@@ -50,8 +50,7 @@ class ParallelExecutor {
                    const std::string &loss_var_name, Scope *scope,
                    const std::vector<Scope *> &local_scopes,
                    const ExecutionStrategy &exec_strategy,
-                   const BuildStrategy &build_strategy,
-                   size_t num_trainers = 1, size_t trainer_id = 0);
+                   const BuildStrategy &build_strategy);

   ~ParallelExecutor();
...
...
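With this signature change, trainer information travels inside `BuildStrategy` rather than as extra constructor arguments. A hedged sketch of the adjusted call site, using the `num_trainers_` and `trainer_id_` fields referenced in the parallel_executor.cc hunk above; the surrounding variables (`places`, `bcast_vars`, etc.) are placeholders for a caller's existing state:

// Hedged sketch, not part of the patch: multi-trainer setup after the change.
paddle::framework::BuildStrategy build_strategy;
build_strategy.num_trainers_ = 2;  // formerly the `num_trainers` argument
build_strategy.trainer_id_ = 0;    // formerly the `trainer_id` argument
paddle::framework::ParallelExecutor executor(
    places, bcast_vars, main_program, loss_var_name, scope, local_scopes,
    exec_strategy, build_strategy);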
paddle/fluid/framework/scope.cc
...
@@ -87,11 +87,12 @@ Variable* Scope::Var(const std::string& name) {
 }

 Variable* Scope::Var(std::string* name) {
-  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
-  SCOPE_VARS_WRITER_LOCK
+  auto new_name = std::to_string(reinterpret_cast<uintptr_t>(this)) + "." +
+                  std::to_string(vars_.size());
   if (name != nullptr) {
     *name = new_name;
   }
+  SCOPE_VARS_WRITER_LOCK
   return VarInternal(new_name);
 }
...
paddle/fluid/framework/var_type_traits.cc
...
@@ -105,13 +105,15 @@ struct VarIdToTypeIndexMapHolder {
 }  // namespace detail

-const std::type_index &ToTypeIndex(int var_id) {
+const std::type_index &VarTraitIdToTypeIndex(int var_id) {
   return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id);
 }

-const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); }
+const char *ToTypeName(int var_id) {
+  return VarTraitIdToTypeIndex(var_id).name();
+}

-int ToTypeId(const std::type_index &type) {
+int TypeIndexToVarTraitId(const std::type_index &type) {
   return detail::VarIdToTypeIndexMapHolder::ToTypeId(type);
 }
...
paddle/fluid/framework/var_type_traits.h
...
@@ -66,8 +66,8 @@ namespace paddle {
 namespace framework {

 const char *ToTypeName(int var_id);
-const std::type_index &ToTypeIndex(int var_id);
-int ToTypeId(const std::type_index &type);
+const std::type_index &VarTraitIdToTypeIndex(int var_id);
+int TypeIndexToVarTraitId(const std::type_index &type);

 namespace detail {
...
paddle/fluid/framework/var_type_traits_test.cc
...
@@ -45,10 +45,11 @@ struct TypeIndexChecker {
     constexpr auto kId = VarTypeTrait<Type>::kId;
     std::type_index actual_type(typeid(Type));
     EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name()));
-    EXPECT_EQ(ToTypeIndex(kId), actual_type);
-    EXPECT_EQ(ToTypeId(actual_type), kId);
-    EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type);
-    EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId);
+    EXPECT_EQ(VarTraitIdToTypeIndex(kId), actual_type);
+    EXPECT_EQ(TypeIndexToVarTraitId(actual_type), kId);
+    EXPECT_EQ(VarTraitIdToTypeIndex(TypeIndexToVarTraitId(actual_type)),
+              actual_type);
+    EXPECT_EQ(TypeIndexToVarTraitId(VarTraitIdToTypeIndex(kId)), kId);
     EXPECT_TRUE(var_id_set->count(kId) == 0);               // NOLINT
     EXPECT_TRUE(type_index_set->count(actual_type) == 0);   // NOLINT
...
paddle/fluid/imperative/layer.h
...
@@ -77,6 +77,7 @@ class PreparedOp {
   framework::OperatorWithKernel::OpKernelFunc func;
   platform::DeviceContext* dev_ctx;
 };
+class OpBase;

 class VarBase {
...
paddle/fluid/inference/analysis/analyzer_tester.cc
...
@@ -80,8 +80,8 @@ void TestWord2vecPrediction(const std::string& model_path) {
        i++) {
     LOG(INFO) << "data: "
               << static_cast<float*>(outputs.front().data.data())[i]
               << " result: " << result[i];
-    PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
-                   result[i]);
+    EXPECT_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
+                1e-3);
   }
 }
...
paddle/fluid/inference/analysis/argument.h
...
@@ -123,8 +123,6 @@ struct Argument {
   DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
   DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
   DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
-  DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller,
-                      std::function<bool(const framework::ir::Node*)>);
   DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
...
paddle/fluid/inference/analysis/ir_pass_manager.cc
...
@@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument,
   for (const std::string &pass_name : passes) {
     auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);

-    // Set some pass attributes.
-    if (pass_name == "ir_analysis_pass") {
-      pass->Set("tensorrt_node_teller",
-                new SubgraphDetector::NodeInsideSubgraphTeller(
-                    argument->tensorrt_node_teller()));
-    }
     if (pass_name == "graph_viz_pass") {
       std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
                                   (pre_pass.empty() ? "origin" : pre_pass) +
...
@@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument,
     }
     if (pass_name == "tensorrt_subgraph_pass") {
-      PADDLE_ENFORCE(argument->tensorrt_node_teller_valid());
-      pass->SetNotOwned("tensorrt_node_teller",
-                        argument->tensorrt_node_teller_ptr());
       pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
       pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
       pass->Set("min_subgraph_size",
...
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
 cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
-cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector)
-set(analysis_deps ${analysis_deps}
-    subgraph_detector tensorrt_subgraph_pass
-    CACHE INTERNAL "")
-set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
-file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
-set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
+if (TENSORRT_FOUND)
+    cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
+    set(analysis_deps ${analysis_deps}
+        subgraph_detector tensorrt_subgraph_pass
+        CACHE INTERNAL "")
+    set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
+    file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
+    set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
+endif()
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
...
@@ -20,6 +20,7 @@
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
 #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
+#include "paddle/fluid/inference/tensorrt/op_teller.h"

 namespace paddle {
 namespace inference {
...
@@ -35,8 +36,10 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
     std::unique_ptr<framework::ir::Graph> graph) const {
   framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());

-  auto teller =
-      Get<SubgraphDetector::NodeInsideSubgraphTeller>("tensorrt_node_teller");
+  auto teller = [](const framework::ir::Node *node) {
+    if (!node->IsOp() || !node->Op()) return false;
+    return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
+  };

   SubGraphFuser fuser(graph.get(), teller,
                       Get<int>("min_subgraph_size") /*min subgraph size*/);
...
@@ -232,7 +235,6 @@ std::vector<std::string> ExtractParameters(
 REGISTER_PASS(tensorrt_subgraph_pass,
               paddle::inference::analysis::TensorRtSubgraphPass)
-    .RequirePassAttr("tensorrt_node_teller")
     .RequirePassAttr("max_batch_size")
     .RequirePassAttr("workspace_size")
     .RequirePassAttr("min_subgraph_size");
paddle/fluid/inference/analysis/passes/CMakeLists.txt
...
@@ -7,4 +7,5 @@ set(analysis_deps ${analysis_deps}
         ir_graph_build_pass
         ir_analysis_pass
         analysis_passes
+        subgraph_detector
         CACHE INTERNAL "")
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
...
@@ -27,9 +27,6 @@ namespace analysis {
 void IrAnalysisComposePass::RunImpl(Argument *argument) {
   ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
-  if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
-    InitTensorRTAttrs(argument);
-  }
   ApplyIrPasses(argument);
   CollectFusionStatis(argument);
 }
...
@@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const {
   return "ir-analysis-compose-pass";
 }

-void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
-  if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
-    LOG(INFO) << "Initing TensorRT pass";
-    argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) {
-      std::unordered_set<std::string> teller_set(
-          {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
-           "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-           "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
-           "conv2d_transpose", "leaky_relu"});
-      if (!node->IsOp()) return false;
-
-      if (teller_set.count(node->Op()->Type())) {
-        return true;
-      } else {
-        return false;
-      }
-    });
-  }
-}
-
 void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
   std::vector<std::string> passes({
       "ir_graph_build_pass", "ir_analysis_pass",
...
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
...
@@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass {
   std::string repr() const override;

  private:
-  void InitTensorRTAttrs(Argument *argument);
-
   void ApplyIrPasses(Argument *argument);

   void CollectFusionStatis(Argument *argument);
...
paddle/fluid/inference/api/analysis_config.cc
...
@@ -127,6 +127,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
   use_tensorrt_ = true;
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
+  Update();
 }

 void contrib::AnalysisConfig::Update() {
...
paddle/fluid/inference/api/analysis_predictor.h
...
@@ -35,8 +35,11 @@ using framework::proto::ProgramDesc;
 using framework::NaiveExecutor;
 using contrib::AnalysisConfig;

-/* This predictor is based on the original native predictor with IR and Analysis
- * support. It will optimize IR and Parameters in the runtime.
+/** \brief This predictor is based on the original native predictor with IR and
+ * Analysis support.
+ *
+ * It will optimize IR and Parameters in the runtime.
+ *
  * TODO(Superjomn) Replace the Navive predictor?
  */
 class AnalysisPredictor : public PaddlePredictor {
...
paddle/fluid/inference/api/api_impl.h
...
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
...
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
...
@@ -92,10 +92,10 @@ if(WITH_MKL)
   if(NOT WIN32)
     set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
                  ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
-  else(WIN32)
+  else()
     set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml${CMAKE_SHARED_LIBRARY_SUFFIX}
                  ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX})
-  endif(WIN32)
+  endif()
   set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
   if(EXISTS ${MKLDNN_PATH})
     include_directories("${MKLDNN_PATH}/include")
...
@@ -128,8 +128,8 @@ else()
                ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
                ${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash
                snappystream ${EXTERNAL_LIB})
-  # NOTE(dzhwinter) shlwapi is deprecated.
-  set(DEPS ${DEPS} libcmt shlwapi)
+  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+  set(DEPS ${DEPS} libcmt ${os_dependency_modules})
 endif(NOT WIN32)

 if(WITH_GPU)
...
paddle/fluid/inference/api/demo_ci/run.sh
...
@@ -116,6 +116,10 @@
         --modeldir=$DATA_DIR/mobilenet/model \
         --data=$DATA_DIR/mobilenet/data.txt \
         --refer=$DATA_DIR/mobilenet/result.txt
+      if [ $? -ne 0 ]; then
+        echo "trt demo trt_mobilenet_demo runs fail."
+        exit 1
+      fi
     fi
   done
 set +x
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
...
@@ -38,8 +38,8 @@ void Main() {
   std::unique_ptr<PaddlePredictor> predictor;
   paddle::contrib::AnalysisConfig config;
   config.EnableUseGpu(100, 0);
-  config.SetModel(FLAGS_modeldir + "/__params__",
-                  FLAGS_modeldir + "/__model__");
+  config.SetModel(FLAGS_modeldir + "/__model__",
+                  FLAGS_modeldir + "/__params__");
   config.EnableTensorRtEngine();
   predictor = CreatePaddlePredictor(config);
...
paddle/fluid/inference/api/helper.h
...
@@ -204,11 +204,14 @@ static std::string DescribeTensor(const PaddleTensor &tensor) {
     os << to_string(l) << "; ";
   }
   os << "\n";
-  os << " - data: ";
+  os << " - memory length: " << tensor.data.length();
+  os << "\n";
+  os << " - data: ";
   int dim = VecReduceToInt(tensor.shape);
+  float *pdata = static_cast<float *>(tensor.data.data());
   for (int i = 0; i < dim; i++) {
-    os << static_cast<float *>(tensor.data.data())[i] << " ";
+    os << pdata[i] << " ";
   }
   os << '\n';
   return os.str();
...
@@ -224,10 +227,12 @@ static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) {
     os << to_string(l) << "; ";
   }
   os << "\n";
-  os << " - data: ";
   PaddlePlace place;
   int size;
   const auto *data = tensor.data<float>(&place, &size);
+  os << " - numel: " << size;
+  os << "\n";
+  os << " - data: ";
   for (int i = 0; i < size; i++) {
     os << data[i] << " ";
   }
...
paddle/fluid/inference/api/paddle_analysis_config.h
...
@@ -19,6 +19,8 @@
 #include <unordered_set>
 #include <vector>

 /*! \file */
+// Here we include some header files with relative paths, for that in deploy,
+// the abstract path of this header file will be changed.
 #include "paddle_api.h"  // NOLINT
...
@@ -41,49 +43,125 @@ struct AnalysisConfig {
   explicit AnalysisConfig(const std::string& prog_file,
                           const std::string& params_file);

-  // Model path related.
+  /** Set model with a directory.
+   */
   void SetModel(const std::string& model_dir) { model_dir_ = model_dir; }
+  /** Set model with two specific pathes for program and parameters.
+   */
   void SetModel(const std::string& prog_file_path,
                 const std::string& params_file_path);
+  /** Set program file path.
+   */
   void SetProgFile(const std::string& x) { prog_file_ = x; }
+  /** Set parameter composed file path.
+   */
   void SetParamsFile(const std::string& x) { params_file_ = x; }
+  /** Get the model directory path.
+   */
   const std::string& model_dir() const { return model_dir_; }
+  /** Get the program file path.
+   */
   const std::string& prog_file() const { return prog_file_; }
+  /** Get the composed parameters file.
+   */
   const std::string& params_file() const { return params_file_; }

-  // GPU related.
+  /**
+   * \brief Turn on GPU.
+   * @param memory_pool_init_size_mb initial size of the GPU memory pool in MB.
+   * @param device_id the GPU card to use (default is 0).
+   */
   void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0);
+  /** Turn off the GPU.
+   */
   void DisableGpu();
+  /** A bool state telling whether the GPU is turned on.
+   */
   bool use_gpu() const { return use_gpu_; }
+  /** Get the GPU device id.
+   */
   int gpu_device_id() const { return device_id_; }
+  /** Get the initial size in MB of the GPU memory pool.
+   */
   int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; }
+  /** Get the proportion of the initial memory pool size compared to the device.
+   */
   float fraction_of_gpu_memory_for_pool() const;

-  // Determine whether to perform graph optimization.
+  /** \brief Control whether to perform IR graph optimization.
+   *
+   * If turned off, the AnalysisConfig will act just like a NativeConfig.
+   */
   void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; }
+  /** A boolean state tell whether the ir graph optimization is actived.
+   */
   bool ir_optim() const { return enable_ir_optim_; }

+  /** \brief INTERNAL Determine whether to use the feed and fetch operators.
+   * Just for internal development, not stable yet.
+   * When ZeroCopyTensor is used, this should turned off.
+   */
   void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; }
+  /** A boolean state telling whether to use the feed and fetch operators.
+   */
   bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; }

+  /** \brief Control whether to specify the inputs' names.
+   *
+   * The PaddleTensor type has a `name` member, assign it with the corresponding
+   * variable name. This is used only when the input PaddleTensors passed to the
+   * `PaddlePredictor.Run(...)` cannot follow the order in the training phase.
+   */
   void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; }
+  /** A boolean state tell whether the input PaddleTensor names specified should
+   * be used to reorder the inputs in `PaddlePredictor.Run(...)`.
+   */
   bool specify_input_name() const { return specify_input_name_; }

+  /**
+   * \brief Turn on the TensorRT engine.
+   *
+   * The TensorRT engine will accelerate some subgraphes in the original Fluid
+   * computation graph. In some models such as TensorRT50, GoogleNet and so on,
+   * it gains significant performance acceleration.
+   *
+   * @param workspace_size the memory size(in byte) used for TensorRT workspace.
+   * @param max_batch_size the maximum batch size of this prediction task,
+   * better set as small as possible, or performance loss.
+   * @param min_subgrpah_size the minimum TensorRT subgraph size needed, if a
+   * subgraph is less than this, it will not transfer to TensorRT engine.
+   */
   void EnableTensorRtEngine(int workspace_size = 1 << 20,
                             int max_batch_size = 1, int min_subgraph_size = 3);
+  /** A boolean state telling whether the TensorRT engine is used.
+   */
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }

+  /** Control whther to debug IR graph analysis phase.
+   */
   void SwitchIrDebug(int x = true) { ir_debug_ = x; }

+  /** Turn on MKLDNN.
+   */
   void EnableMKLDNN();
+  /** A boolean state telling whether to use the MKLDNN.
+   */
   bool mkldnn_enabled() const { return use_mkldnn_; }

-  // Set and get the number of cpu math library threads.
+  /** Set and get the number of cpu math library threads.
+   */
   void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads);
+  /** An int state telling how many threads are used in the CPU math library.
+   */
   int cpu_math_library_num_threads() const {
     return cpu_math_library_num_threads_;
   }

+  /** Transform the AnalysisConfig to NativeConfig.
+   */
   NativeConfig ToNativeConfig() const {
     NativeConfig config;
     config.model_dir = model_dir_;
...
@@ -95,19 +173,30 @@ struct AnalysisConfig {
     config.specify_input_name = specify_input_name_;
     return config;
   }
+  /** Specify the operator type list to use MKLDNN acceleration.
+   * @param op_list the operator type list.
+   */
   void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
     mkldnn_enabled_op_types_ = op_list;
   }

-  // Specify the memory buffer of program and parameter
+  /** Specify the memory buffer of program and parameter
+   * @param prog_buffer the memory buffer of program.
+   * @param prog_buffer_size the size of the data.
+   * @param params_buffer the memory buffer of the composed parameters file.
+   * @param params_buffer_size the size of the commposed parameters data.
+   */
   void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
-                      const char* program_buffer, size_t program_buffer_size);
+                      const char* params_buffer, size_t params_buffer_size);
+  /** A boolean state telling whether the model is set from the CPU memory.
+   */
   bool model_from_memory() const { return model_from_memory_; }

   friend class ::paddle::AnalysisPredictor;

-  // NOTE just for developer, not an official API, easily to be broken.
-  // Get a pass builder for customize the passes in IR analysis phase.
+  /** NOTE just for developer, not an official API, easily to be broken.
+   * Get a pass builder for customize the passes in IR analysis phase.
+   */
   PassStrategy* pass_builder() const;

  protected:
...
...
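Taken together, the newly documented methods describe the usual configuration flow. A hedged usage sketch consistent with these comments and with the trt_mobilenet_demo change above; the model paths and numeric values are placeholders:

// Hedged sketch, not part of the patch: typical AnalysisConfig setup.
paddle::contrib::AnalysisConfig config;
config.SetModel("/path/to/__model__", "/path/to/__params__");
config.EnableUseGpu(100 /*memory_pool_init_size_mb*/, 0 /*device_id*/);
config.EnableTensorRtEngine(1 << 20 /*workspace_size*/, 1 /*max_batch_size*/,
                            3 /*min_subgraph_size*/);
config.SwitchIrOptim(true);
auto predictor = paddle::CreatePaddlePredictor(config);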
paddle/fluid/inference/api/paddle_api.h
浏览文件 @
eac5a0aa
...
...
@@ -13,61 +13,76 @@
// limitations under the License.
#pragma once
/*! \file paddle_api.h
*/
#include <cassert>
#include <memory>
#include <string>
#include <vector>
/*! \namespace paddle
*/
namespace
paddle
{
// Data type.
/** paddle data type.
*/
enum
PaddleDType
{
FLOAT32
,
INT64
,
// TODO(Superjomn) support more data types if needed.
};
/*
* Memory menage for PaddleTensor.
* The PaddleBuf holds a buffer for data input or output. The memory can be
* allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
* should be reused for better performance.
/**
*\brief Memory menager for PaddleTensor.
*
* For user allocated memory, the following API can be used:
* - PaddleBuf(void* data, size_t length) to set an external memory by
* specifying
* the memory address and length.
* - Reset(void* data, size_t length) to reset the PaddleBuf with an external
* memory.
* ATTENTION, for user allocated memory, deallocation should be done by users
* externally after the program finished. The PaddleBuf won't do any allocation
* or deallocation.
*The PaddleBuf holds a buffer for data input or output. The memory can be
*allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
*should be reused for better performance.
*
* To have the PaddleBuf allocate and manage the memory:
* - PaddleBuf(size_t length) will allocate a memory of size `length`.
* - Resize(size_t length) resize the memory to no less than `length`, ATTENTION
* if the allocated memory is larger than `length`, nothing will done.
*For user allocated memory, the following API can be used:
*- PaddleBuf(void* data, size_t length) to set an external memory by
*specifying
* the memory address and length.
*- Reset(void* data, size_t length) to reset the PaddleBuf with an external
*memory.
*ATTENTION, for user allocated memory, deallocation should be done by users
*externally after the program finished. The PaddleBuf won't do any allocation
*or deallocation.
*
*To have the PaddleBuf allocate and manage the memory:
*- PaddleBuf(size_t length) will allocate a memory of size `length`.
*- Resize(size_t length) resize the memory to no less than `length`, ATTENTION
* if the allocated memory is larger than `length`, nothing will done.
*/
class
PaddleBuf
{
public:
// PaddleBuf allocate memory internally, and manage it.
/** PaddleBuf allocate memory internally, and manage it.
*/
explicit
PaddleBuf
(
size_t
length
)
:
data_
(
new
char
[
length
]),
length_
(
length
),
memory_owned_
(
true
)
{}
// Set external memory, the PaddleBuf won't manage it.
/** Set external memory, the PaddleBuf won't manage it.
*/
PaddleBuf
(
void
*
data
,
size_t
length
)
:
data_
(
data
),
length_
(
length
),
memory_owned_
{
false
}
{}
// Copy only available when memory is managed externally.
/** Copy only available when memory is managed externally.
*/
explicit
PaddleBuf
(
const
PaddleBuf
&
);
// Resize the memory.
/** Resize the memory.
*/
void
Resize
(
size_t
length
);
// Reset to external memory, with address and length set.
/** Reset to external memory, with address and length set.
*/
void
Reset
(
void
*
data
,
size_t
length
);
// Tell whether the buffer is empty.
/** Tell whether the buffer is empty.
*/
bool
empty
()
const
{
return
length_
==
0
;
}
// Get the memory address.
/** Get the memory address.
*/
void
*
data
()
const
{
return
data_
;
}
// Get the memory length.
/** Get the memory length.
*/
size_t
length
()
const
{
return
length_
;
}
~
PaddleBuf
()
{
Free
();
}
...
...
@@ -83,7 +98,8 @@ class PaddleBuf {
bool
memory_owned_
{
true
};
};
// Basic input and output data structure for PaddlePredictor.
/** Basic input and output data structure for PaddlePredictor.
*/
struct
PaddleTensor
{
PaddleTensor
()
=
default
;
std
::
string
name
;
// variable name.
...
...
@@ -94,19 +110,23 @@ struct PaddleTensor {
};
enum
class
PaddlePlace
{
kUNK
=
-
1
,
kCPU
,
kGPU
};
// Tensor without copy, currently only supports AnalysisPredictor.
/** Tensor without copy, currently only supports AnalysisPredictor.
*/
class
ZeroCopyTensor
{
public:
void
Reshape
(
const
std
::
vector
<
int
>&
shape
);
// Get the memory in CPU or GPU with specific data type, should Reshape first
// to tell the data size.
// Once can directly call this data to feed the data.
// This is for write the input tensor.
/** Get the memory in CPU or GPU with specific data type, should Reshape first
* to tell the data size.
* Once can directly call this data to feed the data.
* This is for write the input tensor.
*/
template
<
typename
T
>
T
*
mutable_data
(
PaddlePlace
place
);
// Get the memory directly, will return the place and memory size by pointer.
// This is for reading the output tensor.
/** Get the memory directly, will return the place and element size by
* pointer.
* This is for reading the output tensor.
*/
template
<
typename
T
>
T
*
data
(
PaddlePlace
*
place
,
int
*
size
)
const
;
...
...
@@ -128,8 +148,7 @@ class ZeroCopyTensor {
void
*
scope_
{
nullptr
};
};
/*
* A simple Inference API for Paddle.
/** A simple Inference API for Paddle.
*/
class
PaddlePredictor
{
public:
...
...
@@ -138,18 +157,20 @@ class PaddlePredictor {
PaddlePredictor
(
const
PaddlePredictor
&
)
=
delete
;
PaddlePredictor
&
operator
=
(
const
PaddlePredictor
&
)
=
delete
;
// Predict an record.
// The caller should be responsible for allocating and releasing the memory of
// `inputs`. `inputs` should be available until Run returns. Caller should be
// responsible for the output tensor's buffer, either allocated or passed from
// outside.
/** Predict an record.
* The caller should be responsible for allocating and releasing the memory of
* `inputs`. `inputs` should be available until Run returns. Caller should be
* responsible for the output tensor's buffer, either allocated or passed from
* outside.
*/
virtual
bool
Run
(
const
std
::
vector
<
PaddleTensor
>&
inputs
,
std
::
vector
<
PaddleTensor
>*
output_data
,
int
batch_size
=
-
1
)
=
0
;
// Zero copy input and output optimization.
// Get the input or output tensors, and operate on their memory directly,
// without copy.
/** Zero copy input and output optimization.
* Get the input or output tensors, and operate on their memory directly,
* without copy.
*/
virtual
std
::
unique_ptr
<
ZeroCopyTensor
>
GetInputTensor
(
const
std
::
string
&
name
)
{
return
nullptr
;
...
...
@@ -160,16 +181,19 @@ class PaddlePredictor {
}
virtual
bool
ZeroCopyRun
()
{
return
false
;
}
// Clone a predictor that share the model weights, the Cloned predictor should
// be thread-safe.
/** Clone a predictor that share the model weights, the Cloned predictor
* should be thread-safe.
*/
virtual
std
::
unique_ptr
<
PaddlePredictor
>
Clone
()
=
0
;
// Destroy the Predictor.
/** Destroy the Predictor.
*/
virtual
~
PaddlePredictor
()
=
default
;
// The common configs for all the predictors.
/** The common configs for all the predictors.
*/
struct
Config
{
std
::
string
model_dir
;
// path to the model directory.
std
::
string
model_dir
;
/*!< path to the model directory. */
};
};
...
...
@@ -177,17 +201,21 @@ struct NativeConfig : public PaddlePredictor::Config {
   // GPU related fields.
   bool use_gpu{false};
   int device{0};
-  float fraction_of_gpu_memory{-1.f};  // Change to a float in (0,1] if needed.
+  float fraction_of_gpu_memory{-1.f};  /*!< Change to a float in (0,1] if needed. */

   // Specify the exact path of program and parameter files.
   std::string prog_file;
   std::string param_file;

-  // Specify the variable's name of each input if input tensors don't follow the
-  // `feeds` and `fetches` of the phase `save_inference_model`.
+  /** Specify the variable's name of each input if input tensors don't follow
+   * the `feeds` and `fetches` of the phase `save_inference_model`.
+   */
   bool specify_input_name{false};

-  // Set and get the number of cpu math library threads.
+  /** Set and get the number of cpu math library threads. */
   void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
     cpu_math_library_num_threads_ = cpu_math_library_num_threads;
   }
...
...
@@ -201,28 +229,33 @@ struct NativeConfig : public PaddlePredictor::Config {
   int cpu_math_library_num_threads_{1};
 };

-// A factory to help create different predictors.
-//
-// Usage:
-//
-// NativeConfig config;
-// ... // change the configs.
-// auto native_predictor = CreatePaddlePredictor(config);
-//
-// FOR EXTENSION DEVELOPER:
-// Different predictors are designated by config type. Similar configs can be
-// merged, but there shouldn't be a huge config containing different fields for
-// more than one kind of predictors.
+/*! \fn std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&
+ * config);
+ *
+ * \brief A factory to help create different predictors.
+ *
+ * Usage:
+ *
+ * NativeConfig config;
+ * ... // change the configs.
+ * auto native_predictor = CreatePaddlePredictor(config);
+ *
+ * FOR EXTENSION DEVELOPER:
+ * Different predictors are designated by config type. Similar configs can be
+ * merged, but there shouldn't be a huge config containing different fields for
+ * more than one kind of predictors.
+ */
 template <typename ConfigT>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
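The usage outlined in the comment above, expanded into a small sketch (the model path is a placeholder):

    NativeConfig config;
    config.model_dir = "./my_model";  // hypothetical directory
    config.use_gpu = false;
    auto predictor = CreatePaddlePredictor<NativeConfig>(config);

    std::vector<PaddleTensor> inputs, outputs;
    // ... fill `inputs` ...
    predictor->Run(inputs, &outputs);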
-// NOTE The following APIs are too trivial, we will discard it in the following
-// versions.
+/** NOTE The following APIs are too trivial, we will discard them in the
+ * following versions.
+ */
 enum class PaddleEngineKind {
-  kNative = 0,         // Use the native Fluid facility.
-  kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
-  kAnalysis,           // More optimization.
-  kAnakin              // Use Anakin for inference, not mature yet.
+  kNative = 0,         /*!< Use the native Fluid facility. */
+  kAutoMixedTensorRT,  /*!< Automatically mix Fluid with TensorRT. */
+  kAnalysis,           /*!< More optimization. */
+  kAnakin              /*!< Use Anakin for inference, not mature yet. */
 };

 template <typename ConfigT, PaddleEngineKind engine>
...
paddle/fluid/inference/api/paddle_pass_builder.h
...
...
@@ -18,30 +18,39 @@
 #include <string>
 #include <vector>

+/*! \file */

+/*! \namespace paddle */
 namespace paddle {

 /*
  * This is a pass builder based on string. It is part of inference API.
  */
 class PaddlePassBuilder {
  public:
   explicit PaddlePassBuilder(const std::vector<std::string>& passes)
       : passes_(passes) {}
   /** Append a pass to the end of the passes. */
   void AppendPass(const std::string& pass_type);

   /** Insert a pass to a specific position.
    * @param idx the position to insert.
    * @param pass_type the pass key.
    */
   void InsertPass(size_t idx, const std::string& pass_type);

-  // Delete the `idx`-th pass.
+  /** Delete the `idx`-th pass. */
   void DeletePass(size_t idx);

-  // Delete all the passes that has type `pass_type`.
+  /** Delete all the passes that have type `pass_type`. */
   void DeletePass(const std::string& pass_type);

-  // Visualize the computation graph after each pass by generating a DOT
-  // language file, one can draw them with the Graphviz toolkit.
+  /** Visualize the computation graph after each pass by generating a DOT
+   * language file; one can draw them with the Graphviz toolkit.
+   */
   void TurnOnDebug();

-  // Human-readible information.
+  /** Human-readable information. */
   std::string DebugString();

   const std::vector<std::string>& AllPasses() const { return passes_; }
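A brief sketch of driving the string-based builder; the pass names are taken from the lists further down in this header, and the particular sequence is only illustrative:

    PaddlePassBuilder builder({"infer_clean_graph_pass"});
    builder.AppendPass("fc_fuse_pass");
    builder.InsertPass(1, "seqpool_concat_fuse_pass");
    builder.DeletePass("fc_fuse_pass");
    builder.TurnOnDebug();  // dump a DOT file after each pass
    std::string report = builder.DebugString();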
...
...
@@ -50,16 +59,16 @@ class PaddlePassBuilder {
   std::vector<std::string> passes_;
 };

-/*
- * Pass strategy to help control the IR passes.
+/** Pass strategy to help control the IR passes.
  */
 class PassStrategy : public PaddlePassBuilder {
  public:
   explicit PassStrategy(const std::vector<std::string>& passes)
       : PaddlePassBuilder(passes) {}

-  // The MKLDNN control exists in both CPU and GPU mode, because there can be
-  // still some CPU kernels running in CPU mode.
+  /** The MKLDNN control exists in both CPU and GPU mode, because there can
+   * still be some CPU kernels running in CPU mode.
+   */
   virtual void EnableMKLDNN() = 0;

   bool use_gpu() const { return use_gpu_; }
...
...
@@ -70,8 +79,7 @@ class PassStrategy : public PaddlePassBuilder {
   bool use_gpu_{false};
 };

-/*
- * The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
+/** The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
  */
 class CpuPassStrategy : public PassStrategy {
  public:
...
...
@@ -81,6 +89,7 @@ class CpuPassStrategy : public PassStrategy {
     passes_.assign({"infer_clean_graph_pass",         //
                     "attention_lstm_fuse_pass",       //
+                    "seqpool_concat_fuse_pass",       //
                     "seqconv_eltadd_relu_fuse_pass",  //
                     // "embedding_fc_lstm_fuse_pass", //
                     "fc_lstm_fuse_pass",              //
...
...
@@ -117,8 +126,7 @@ class CpuPassStrategy : public PassStrategy {
   CpuPassStrategy(const CpuPassStrategy& other)
       : PassStrategy(other.passes_) {}
 };

-/*
- * The GPU passes strategy, it is used in
+/** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
  */
 class GpuPassStrategy : public PassStrategy {
  public:
...
...
@@ -133,6 +141,10 @@ class GpuPassStrategy : public PassStrategy {
"conv_elementwise_add_fuse_pass"
,
//
});
for
(
int
i
=
6
;
i
>=
3
;
i
--
)
{
passes_
.
push_back
(
"transpose_flatten"
+
std
::
to_string
(
i
)
+
"_concat_fuse_pass"
);
}
use_gpu_
=
true
;
}
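The loop above appends four concrete passes, in this order: transpose_flatten6_concat_fuse_pass, transpose_flatten5_concat_fuse_pass, transpose_flatten4_concat_fuse_pass and transpose_flatten3_concat_fuse_pass.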
...
...
paddle/fluid/inference/tensorrt/CMakeLists.txt
nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
add_subdirectory(plugin)
...
...
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
...
...
@@ -39,6 +39,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
                   const framework::Scope& scope, bool test_mode) override {
     // Here the two nullptr looks strange, that's because the
     // framework::OpDesc's constructor is strange.
+    nvinfer1::ILayer* layer = nullptr;
     framework::OpDesc op_desc(op, nullptr);
     VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
...
...
@@ -98,13 +99,21 @@ class ElementwiseWeightOpConverter : public OpConverter {
                                          0};
     TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
                                          0};

-    nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER(
-        engine_, Scale, *const_cast<nvinfer1::ITensor*>(X), scale_mode,
-        shift_weights.get(), scale_weights.get(), power_weights.get());
+    if (op_type_ == "add") {
+      nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Scale, *X, scale_mode, shift_weights.get(),
+          scale_weights.get(), power_weights.get());
+      layer = scale_layer;
+    } else if (op_type_ == "mul") {
+      nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Scale, *X, scale_mode, scale_weights.get(),
+          shift_weights.get(), power_weights.get());
+      layer = scale_layer;
+    }

     auto output_name = op_desc.Output("Out")[0];
-    layer->setName(("elementwise_add (Output: " + output_name + ")").c_str());
+    layer->setName(
+        ("elementwise_" + op_type_ + " (Output: " + output_name + ")").c_str());
     layer->getOutput(0)->setName(output_name.c_str());
     engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
     engine_->SetITensor(output_name, layer->getOutput(0));
...
...
@@ -113,6 +122,9 @@ class ElementwiseWeightOpConverter : public OpConverter {
       engine_->DeclareOutput(output_name);
     }
   }

+ protected:
+  std::string op_type_;
 };

 class ElementwiseTensorOpConverter : public OpConverter {
...
...
@@ -188,6 +200,16 @@ const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
     {"max", nvinfer1::ElementWiseOperation::kMAX},
 };

+class ElementwiseWeightAddOpConverter : public ElementwiseWeightOpConverter {
+ public:
+  ElementwiseWeightAddOpConverter() { op_type_ = "add"; }
+};
+
+class ElementwiseWeightMulOpConverter : public ElementwiseWeightOpConverter {
+ public:
+  ElementwiseWeightMulOpConverter() { op_type_ = "mul"; }
+};
+
 class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
  public:
   ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
...
...
@@ -227,7 +249,10 @@ class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
 }  // namespace inference
 }  // namespace paddle

-REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightAddOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_mul_weight, ElementwiseWeightMulOpConverter);
 REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor, ElementwiseTensorAddOpConverter);
...
...
paddle/fluid/inference/tensorrt/op_teller.cc
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/op_teller.h"
namespace paddle {
namespace inference {
namespace tensorrt {

// Just tell by the op_types.
struct SimpleOpTypeSetTeller : public Teller {
  SimpleOpTypeSetTeller() {}

  bool operator()(const std::string& op_type,
                  const framework::OpDesc& desc) override {
    return teller_set.count(op_type);
  }

 private:
  std::unordered_set<std::string> teller_set{
      {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
       "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
       "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
       "conv2d_transpose", "leaky_relu"}};
};

bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
  for (auto& teller : tellers_) {
    if ((*teller)(op_type, desc)) return true;
  }
  return false;
}

OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); }

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/tensorrt/op_teller.h
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
namespace paddle {
namespace inference {
namespace tensorrt {

/*
 * Single Op teller definition.
 * One can override this and define a more complex tell logic, considering more
 * issues such as op_desc.
 */
struct Teller {
  virtual bool operator()(const std::string& op_type,
                          const framework::OpDesc& desc) = 0;

  virtual ~Teller() = default;
};

/*
 * A real example:
 *
 * struct SomeTeller : public Teller {
 *   bool operator()(const std::string& op_type,
 *                   const framework::OpDesc& desc) override {
 *     return op_type == "fc" && desc.Inputs().size() == 2;
 *   }
 * };
 */

/*
 * class OpTeller helps to tell whether a fluid
 * operator can be transformed to a TensorRT layer.
 */
class OpTeller {
 public:
  static OpTeller& Global() {
    static std::unique_ptr<OpTeller> x(new OpTeller);
    return *x;
  }

  bool Tell(const std::string& op_type, const framework::OpDesc& desc);

 private:
  OpTeller();

 private:
  std::vector<std::unique_ptr<Teller>> tellers_;
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
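A hedged usage sketch of the teller introduced in these two files; op_desc stands for a framework::OpDesc taken from the program being analyzed:

    using paddle::inference::tensorrt::OpTeller;
    // True only if some registered Teller accepts this op type.
    bool convertible = OpTeller::Global().Tell("relu", op_desc);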
paddle/fluid/inference/tests/api/CMakeLists.txt
...
...
@@ -41,7 +41,7 @@ endfunction()
 if(NOT APPLE AND WITH_MKLML)
     set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
     download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
-    inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc)
+    inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc SERIAL)
 else()
     # TODO: fix this test on MACOS and OPENBLAS, the reason is that
     # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
...
...
@@ -56,14 +56,14 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
 # normal DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
 download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc)
+inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL)

 # small DAM
 set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
 download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
 inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc
         EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1)
+        ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1 SERIAL)

 # chinese_ner
 set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
...
...
@@ -111,11 +111,11 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose
 # resnet50
 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
-    "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
+    "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL)

 # mobilenet with depthwise_conv op
 inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
-    "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
+    "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL)

 # anakin
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
...
...
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
...
...
@@ -283,7 +283,7 @@ TEST(Analyzer_rnn1, multi_thread) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, 4 /* multi_thread */);
+                 input_slots_all, &outputs, 2 /* multi_thread */);
 }
// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
...
...
@@ -351,10 +351,10 @@ TEST(Analyzer_rnn1, ZeroCopy) {
   ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
   LOG(INFO) << "native output " << DescribeTensor(native_outputs.front());

-  int output_size{0};
+  int output_size{0};  // this is the number of elements not memory size
   auto *zero_copy_data = output_tensor->data<float>(&place, &output_size);
   auto *native_data = static_cast<float *>(native_outputs.front().data.data());
-  for (size_t i = 0; i < output_size / sizeof(float); i++) {
+  for (int i = 0; i < output_size; i++) {
     EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3);
   }
 }
...
...
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
...
...
@@ -121,14 +121,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
   }
 }

-void SetConfig(AnalysisConfig *cfg) {
-  cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
-  cfg->DisableGpu();
-  cfg->SwitchSpecifyInputNames();
-  cfg->pass_builder()->TurnOnDebug();
-  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
-}
-
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
   DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
   std::vector<PaddleTensor> input_slots;
...
...
@@ -141,15 +133,22 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
   }
 }

+void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
+  cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
+  cfg->DisableGpu();
+  cfg->SwitchSpecifyInputNames();
+  cfg->pass_builder()->TurnOnDebug();
+  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
+  if (use_mkldnn) {
+    cfg->EnableMKLDNN();
+  }
+}
+
 void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
-  SetConfig(&cfg);
-  if (use_mkldnn) {
-    cfg.EnableMKLDNN();
-  }
+  SetConfig(&cfg, use_mkldnn);
   std::vector<PaddleTensor> outputs;
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
...
...
@@ -169,16 +168,110 @@ TEST(Analyzer_seq_pool1, compare) {
                  reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                  input_slots_all);
 }

-// Check the fuse status
-TEST(Analyzer_seq_pool1, fuse_statis) {
+// Compare Deterministic result
+TEST(Analyzer_seq_pool1, compare_determine) {
   AnalysisConfig cfg;
   SetConfig(&cfg);

+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                       input_slots_all);
+}
+
+void analysis_fuse_statis(bool use_zerocopy) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  cfg.SwitchUseFeedFetchOps(!use_zerocopy);
   int num_ops;
   auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  auto fuse_statis = GetFuseStatis(
-      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
+  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
   ASSERT_TRUE(fuse_statis.count("fc_fuse"));
   ASSERT_EQ(fuse_statis.at("fc_fuse"), 10);
   ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse"));
   EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2);
   LOG(INFO) << "num_ops: " << num_ops;
-  EXPECT_EQ(num_ops, 349);
+  EXPECT_EQ(num_ops, 195);
 }

+// Check the fuse status
+TEST(Analyzer_seq_pool1, fuse_statis) { analysis_fuse_statis(false); }
+void PrepareZeroCopyInputs(
+    const std::unique_ptr<PaddlePredictor> &predictor,
+    std::vector<std::unique_ptr<ZeroCopyTensor>> *inputs) {
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  // only feed one batch
+  const auto &one_batch = data.NextBatch();
+  inputs->clear();
+  for (size_t i = 0; i < one_batch.size(); ++i) {
+    auto &slot = one_batch[i];
+    auto tensor = predictor->GetInputTensor(slot.name + "_embed");
+    tensor->Reshape(slot.shape);
+    tensor->SetLoD({slot.lod});
+    ZeroCopyTensorAssignData<float>(tensor.get(), slot.data);
+    inputs->emplace_back(std::move(tensor));
+  }
+}
+// return the output values
+std::vector<float> zerocopy_profile(int repeat_times) {
+  AnalysisConfig config;
+  SetConfig(&config);
+  config.SwitchUseFeedFetchOps(false);
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
+  PrepareZeroCopyInputs(predictor, &inputs);
+  auto output_tensor = predictor->GetOutputTensor("reduce_sum_0.tmp_0");
+  Timer timer;
+  LOG(INFO) << "Warm up run...";
+  timer.tic();
+  predictor->ZeroCopyRun();
+  PrintTime(FLAGS_batch_size, 1, 1, 0, timer.toc(), 1);
+  if (FLAGS_profile) {
+    paddle::platform::ResetProfiler();
+  }
+  LOG(INFO) << "Run " << repeat_times << " times...";
+  timer.tic();
+  for (int i = 0; i < repeat_times; i++) {
+    predictor->ZeroCopyRun();
+  }
+  PrintTime(FLAGS_batch_size, repeat_times, 1, 0, timer.toc() / repeat_times, 1);
+
+  VLOG(3) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor);
+  PaddlePlace place;
+  int output_size{0};
+  auto *pdata = output_tensor->data<float>(&place, &output_size);
+  std::vector<float> res(output_size);
+  for (int i = 0; i < output_size; ++i) {
+    res[i] = pdata[i];
+  }
+  return res;
+}
+TEST(Analyzer_seq_pool1, zerocopy_profile) { zerocopy_profile(FLAGS_repeat); }
+
+TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { analysis_fuse_statis(true); }
+
+TEST(Analyzer_seq_pool1, zerocopy_compare_native) {
+  AnalysisConfig config;
+  SetConfig(&config);
+  config.SwitchUseFeedFetchOps(true);
+  auto predictor = CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
+  std::vector<PaddleTensor> native_outputs;
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  ASSERT_TRUE(predictor->Run(input_slots_all[0], &native_outputs));
+  EXPECT_EQ(native_outputs.size(), 1UL);
+
+  auto zerocopy_output = zerocopy_profile(1);
+  EXPECT_EQ(zerocopy_output.size() * sizeof(float),
+            native_outputs.front().data.length());
+  auto *native_data = static_cast<float *>(native_outputs.front().data.data());
+  for (size_t i = 0; i < zerocopy_output.size(); ++i) {
+    EXPECT_NEAR(zerocopy_output[i], native_data[i], 1e-3);
+  }
+}

 }  // namespace analysis
...
...
paddle/fluid/inference/tests/api/config_printer.h
...
...
@@ -62,7 +62,7 @@ std::ostream &operator<<(std::ostream &os,
                          const contrib::AnalysisConfig &config) {
   os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
   num_spaces++;
-  os << *reinterpret_cast<const NativeConfig *>(&config);
+  os << config.ToNativeConfig();
   if (!config.model_from_memory()) {
     os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n";
     os << GenSpaces(num_spaces) << "param_file: " << config.params_file()
...
...
paddle/fluid/inference/tests/api/tester_helper.h
...
...
@@ -54,11 +54,13 @@ namespace paddle {
 namespace inference {

 void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
+  const auto *analysis_config =
+      reinterpret_cast<const contrib::AnalysisConfig *>(config);
   if (use_analysis) {
-    LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config);
+    LOG(INFO) << *analysis_config;
     return;
   }
-  LOG(INFO) << *reinterpret_cast<const NativeConfig *>(config);
+  LOG(INFO) << analysis_config->ToNativeConfig();
 }

 void CompareResult(const std::vector<PaddleTensor> &outputs,
...
...
@@ -96,12 +98,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
 std::unique_ptr<PaddlePredictor> CreateTestPredictor(
     const PaddlePredictor::Config *config, bool use_analysis = true) {
+  const auto *analysis_config =
+      reinterpret_cast<const contrib::AnalysisConfig *>(config);
   if (use_analysis) {
-    return CreatePaddlePredictor<contrib::AnalysisConfig>(
-        *(reinterpret_cast<const contrib::AnalysisConfig *>(config)));
+    return CreatePaddlePredictor<contrib::AnalysisConfig>(*analysis_config);
   }
-  return CreatePaddlePredictor<NativeConfig>(
-      *(reinterpret_cast<const NativeConfig *>(config)));
+  auto native_config = analysis_config->ToNativeConfig();
+  return CreatePaddlePredictor<NativeConfig>(native_config);
 }

 size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
...
...
@@ -328,10 +331,7 @@ void CompareNativeAndAnalysis(
     const std::vector<std::vector<PaddleTensor>> &inputs) {
   PrintConfig(config, true);
   std::vector<PaddleTensor> native_outputs, analysis_outputs;
-  const auto *analysis_config =
-      reinterpret_cast<const contrib::AnalysisConfig *>(config);
-  auto native_config = analysis_config->ToNativeConfig();
-  TestOneThreadPrediction(&native_config, inputs, &native_outputs, false);
+  TestOneThreadPrediction(config, inputs, &native_outputs, false);
   TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
   CompareResult(analysis_outputs, native_outputs);
 }
...
...
paddle/fluid/inference/tests/api/trt_models_tester.cc
...
...
@@ -99,24 +99,12 @@ void compare(std::string model_dir, bool use_tensorrt) {
     SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
   }

-  std::vector<PaddleTensor> native_outputs;
-  NativeConfig native_config;
-  SetConfig<NativeConfig>(&native_config, model_dir, true, false,
-                          FLAGS_batch_size);
-  TestOneThreadPrediction(
-      reinterpret_cast<PaddlePredictor::Config *>(&native_config), inputs_all,
-      &native_outputs, false);
-  std::vector<PaddleTensor> analysis_outputs;
   contrib::AnalysisConfig analysis_config;
   analysis_config.EnableUseGpu(50, 0);
   SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true,
                                      use_tensorrt, FLAGS_batch_size);
-  TestOneThreadPrediction(
-      reinterpret_cast<PaddlePredictor::Config *>(&analysis_config),
-      inputs_all, &analysis_outputs, true);
-  CompareResult(native_outputs, analysis_outputs);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&analysis_config),
+      inputs_all);
 }

 TEST(TensorRT_mobilenet, compare) {
...
...
paddle/fluid/inference/utils/CMakeLists.txt
...
...
@@ -2,6 +2,3 @@ cc_library(benchmark SRCS benchmark.cc DEPS enforce)
 cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
 cc_binary(visualizer SRCS visualizer.cc DEPS analysis
           paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
-if(WIN32)
-  target_link_libraries(visualizer shlwapi)
-endif(WIN32)
paddle/fluid/operators/conv_cudnn_op.cu.cc
...
...
@@ -297,6 +297,21 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()), groups);

+#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
+    // Enable Tensor Core for cudnn backward
+    if (dev_ctx.GetComputeCapability() >= 70 &&
+        std::type_index(typeid(T)) ==
+            std::type_index(typeid(platform::float16))) {
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
+      VLOG(5) << "use cudnn_tensor_op_math for backward";
+    } else {
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+      VLOG(5) << "NOT use cudnn_tensor_op_math for backward";
+    }
+#endif
+
     int input_channels = input->dims()[1];
     int input_height, input_width, input_depth;
     if (input->dims().size() == 5) {
...
...
paddle/fluid/operators/conv_mkldnn_op.cc
...
...
@@ -318,8 +318,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     int groups = ctx.Attr<int>("groups");
     bool fuse_relu = ctx.Attr<bool>("fuse_relu");
     bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
+    bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
+    if (fuse_residual_conn) {
+      PADDLE_ENFORCE(force_fp32_output != true,
+                     "residual fusion does not support force output with fp32");
+    }
     bool is_conv3d = strides.size() == 3U;
     // TODO(tpatejko): add support for dilation
...
...
@@ -329,6 +335,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                     dilations[2] == 1
               : dilations.size() == 2 && dilations[0] == 1 &&
                     dilations[1] == 1,
         "dilation in convolution is not implemented yet");

+    PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently");
+
     const T* input_data = input->data<T>();
...
...
@@ -340,17 +347,35 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     GetWeightsTz(weights_tz, g, is_conv3d);
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());

-    mkldnn::memory::data_type src_dt =
-        paddle::framework::ToMKLDNNDataType(input->type());
+    auto dst_dt = fuse_relu ? paddle::framework::ToMKLDNNDataType(
+                                  framework::DataTypeTrait<uint8_t>::DataType)
+                            : paddle::framework::ToMKLDNNDataType(
+                                  framework::DataTypeTrait<int8_t>::DataType);
+    if (force_fp32_output) {
+      dst_dt = paddle::framework::ToMKLDNNDataType(
+          framework::DataTypeTrait<float>::DataType);
+    }
+    if (fuse_residual_conn) {
+      auto residual = ctx.Input<Tensor>("ResidualData");
+      auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type());
+      if (dst_dt != residual_dt) dst_dt = residual_dt;
+    }

     // Get unique name for storing MKLDNN primitives
     std::string key;
     key.reserve(MaxKeyLength);
+    mkldnn::memory::data_type src_dt =
+        paddle::framework::ToMKLDNNDataType(input->type());
     platform::ConvMKLDNNHandler::AppendKey(
         &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt,
-        input->format(), ctx.op().Output("Output"));
+        input->format(), fuse_relu, fuse_residual_conn,
+        ctx.op().Output("Output"));
     const std::string key_conv_pd = key + "@conv_pd";

+    bool need_s8_to_u8 = false;
     std::shared_ptr<mkldnn::convolution_forward> conv_p = nullptr;
     std::shared_ptr<mkldnn::memory> src_memory_p = nullptr;
     std::shared_ptr<mkldnn::memory> user_src_memory_p = nullptr;
...
...
@@ -365,14 +390,20 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto src_key = key + "@src_mem_p";
     auto user_src_key = key + "@user_src_mem_p";
     auto src_reorder_key = key + "@src_mem_preorder_p";
+    auto residual_reorder_key = key + "@residual_data_mem_preorder_p";
     conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
         dev_ctx.GetBlob(prim_key));
     if (conv_p == nullptr || !is_test) {
       const K* filter_data = filter->data<K>();
       auto scale_in_data = ctx.Attr<float>("Scale_in");
+      auto scale_in_eltwise_data = ctx.Attr<float>("Scale_in_eltwise");
       auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
       auto scale_out_data =
           force_fp32_output ? 1.0f : ctx.Attr<float>("Scale_out");
+      float sum_scale =
+          fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f;

       bool is_multi_channel = scale_weights_data.size() > 1;
...
...
@@ -413,15 +444,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format);
       auto weights_md = platform::MKLDNNMemDesc(
           weights_tz, memory::data_type::s8, chosen_memory_format);
-      auto dst_dt = force_fp32_output
-                        ? paddle::framework::ToMKLDNNDataType(
-                              framework::DataTypeTrait<float>::DataType)
-                        : paddle::framework::ToMKLDNNDataType(
-                              framework::DataTypeTrait<int8_t>::DataType);
       auto dst_md = platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format);
       // create a conv primitive descriptor and save it for usage in backward
       if (bias) {
         bias_tz = paddle::framework::vectorize2int(bias->dims());
...
...
@@ -429,11 +454,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                                        memory::format::x);
         conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
                                        strides, paddings, mkldnn_engine,
-                                       output_shift_scale, is_test);
+                                       fuse_relu, fuse_residual_conn,
+                                       output_shift_scale, sum_scale, is_test);
       } else {
         conv_pd =
             ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
-                                 mkldnn_engine, output_shift_scale, is_test);
+                                 mkldnn_engine, fuse_relu, fuse_residual_conn,
+                                 output_shift_scale, sum_scale, is_test);
       }
       // Save conv_pd/src_memory/weights_memory for backward pass
       dev_ctx.SetBlob(key_conv_pd, conv_pd);
...
...
@@ -458,8 +485,46 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           user_weights_memory_p, pipeline, is_test, true, scale_weights_data,
           mask_reorder);

-      if (!force_fp32_output) {
-        dst_memory_p = platform::SetDstMemory<int8_t>(ctx, output, handler);
+      if (fuse_residual_conn) {
+        auto residual_param = ctx.Input<Tensor>("ResidualData");
+        PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
+                          "Output and elementwise parameter need to have the "
+                          "same dimension sizes");
+        auto residual_dt =
+            paddle::framework::ToMKLDNNDataType(residual_param->type());
+        if (residual_param->format() != handler->GetDstFormat()) {
+          auto residual_data_tz =
+              paddle::framework::vectorize2int(residual_param->dims());
+          auto user_residual_md = platform::MKLDNNMemDesc(
+              residual_data_tz, residual_dt, residual_param->format());
+          if (residual_dt == mkldnn::memory::data_type::u8) {
+            dst_memory_p = platform::SetDstMemory<uint8_t>(
+                ctx, output, residual_param, user_residual_md, handler,
+                &pipeline);
+          } else {
+            need_s8_to_u8 = fuse_relu;
+            dst_memory_p = platform::SetDstMemory<int8_t>(
+                ctx, output, residual_param, user_residual_md, handler,
+                &pipeline);
+          }
+        } else {
+          output->ShareDataWith(*residual_param);
+          if (residual_dt == mkldnn::memory::data_type::u8) {
+            dst_memory_p =
+                platform::SetDstMemory<uint8_t>(ctx, output, handler);
+          } else {
+            need_s8_to_u8 = fuse_relu;
+            dst_memory_p = platform::SetDstMemory<int8_t>(ctx, output, handler);
+          }
+        }
+      } else if (!force_fp32_output) {
+        if (fuse_relu) {
+          dst_memory_p = platform::SetDstMemory<uint8_t>(ctx, output, handler);
+        } else {
+          dst_memory_p = platform::SetDstMemory<int8_t>(ctx, output, handler);
+        }
       } else {
         dst_memory_p = platform::SetDstMemory<float>(ctx, output, handler);
       }
...
...
@@ -467,11 +532,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       // create convolution op primitive
       auto scale_bias_key = key + "@scale_bias";
       if (bias) {
-        const float* bias_data = bias->data<float>();
+        const K* bias_data = bias->data<K>();
         auto user_bias_md = platform::MKLDNNMemDesc(
-            {bias_tz}, platform::MKLDNNGetDataType<float>(), memory::format::x);
+            {bias_tz}, platform::MKLDNNGetDataType<K>(), memory::format::x);
         auto user_bias_memory_p = handler->AcquireBiasMemory(
-            user_bias_md, to_void_cast<float>(bias_data));
+            user_bias_md, to_void_cast<K>(bias_data));
         std::shared_ptr<mkldnn::memory> bias_memory_p;
         int mask_reorder = is_multi_channel ? 1 << 0 : 1;
         int count =
...
...
@@ -517,21 +582,51 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx,
                                                     mkldnn_engine, key));
     }
-    if (!force_fp32_output) {
-      dst_memory_p =
-          platform::SetDstMemoryHandler<int8_t>(ctx, output, handler);
+    if (fuse_residual_conn) {
+      auto residual_param = ctx.Input<Tensor>("ResidualData");
+      auto residual_dt =
+          paddle::framework::ToMKLDNNDataType(residual_param->type());
+      output->ShareDataWith(*residual_param);
+      if (residual_dt == mkldnn::memory::data_type::u8) {
+        platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler,
+                                               &dst_memory_p);
+      } else {
+        platform::SetDstMemoryHandler<int8_t>(ctx, output, handler,
+                                              &dst_memory_p);
+      }
+    } else if (!force_fp32_output) {
+      if (fuse_relu) {
+        platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler,
+                                               &dst_memory_p);
+      } else {
+        platform::SetDstMemoryHandler<int8_t>(ctx, output, handler,
+                                              &dst_memory_p);
+      }
     } else {
-      dst_memory_p =
-          platform::SetDstMemoryHandler<float>(ctx, output, handler);
+      platform::SetDstMemoryHandler<float>(ctx, output, handler,
+                                           &dst_memory_p);
     }

     if (src_memory_reorder_p) {
       pipeline.push_back(*src_memory_reorder_p);
     }
+
+    auto residual_reorder_p = std::static_pointer_cast<mkldnn::memory>(
+        dev_ctx.GetBlob(residual_reorder_key));
+    if (residual_reorder_p) {
+      pipeline.push_back(*residual_reorder_p);
+    }
+
     pipeline.push_back(*conv_p);
   }

   // push primitive to stream and wait until it's executed
   stream(stream::kind::eager).submit(pipeline).wait();

+  if (need_s8_to_u8) {
+    output->mutable_data<uint8_t>(ctx.GetPlace());
+  }
+
   output->set_layout(DataLayout::kMKLDNN);
   output->set_format(GetMKLDNNFormat(*dst_memory_p));
 }
...
...
@@ -563,11 +658,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   }

   mkldnn::primitive_attr CreatePostOps(
-      const std::vector<float> output_shift_scale) const {
+      bool fuse_relu, bool fuse_residual_conn,
+      const std::vector<float> output_shift_scale, float sum_scale) const {
     mkldnn::primitive_attr conv_attr;
     mkldnn::post_ops post_operations;
     int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0;
     conv_attr.set_output_scales(mask, output_shift_scale);
+    if (fuse_residual_conn) {
+      post_operations.append_sum(sum_scale);
+    }
+    if (fuse_relu) {
+      constexpr float scale = 1.0f;
+      constexpr float negative_slope = 0.0f;
+      constexpr float placeholder = 1.0f;  // beta
+      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
+                                     negative_slope, placeholder);
+    }
     conv_attr.set_post_ops(post_operations);
     return conv_attr;
   }
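A reading of the post-op chain built above (an interpretation of the code, not a statement of the MKLDNN contract): the post operations are appended in order, so with both fusions enabled the convolution result is first accumulated into the residual data with weight sum_scale, and the ReLU is then applied to that accumulated value.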
...
...
@@ -600,9 +706,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
                        const memory::desc& dst, const std::vector<int>& strides,
                        const std::vector<int>& paddings,
-                       const mkldnn::engine& engine,
+                       const mkldnn::engine& engine, const bool fuse_relu,
+                       const bool fuse_residual_conn,
                        const std::vector<float> output_shift_scale,
-                       bool is_test) const {
+                       const float sum_scale, bool is_test) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};
...
...
@@ -613,7 +720,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims,
         padding_dims, padding_dims, mkldnn::padding_kind::zero);

-    mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale);
+    mkldnn::primitive_attr conv_attr = CreatePostOps(
+        fuse_relu, fuse_residual_conn, output_shift_scale, sum_scale);

     auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
         conv_desc, conv_attr, engine);
...
...
@@ -652,9 +760,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                        const memory::desc& bias, const memory::desc& dst,
                        const std::vector<int>& strides,
                        const std::vector<int>& paddings,
-                       const mkldnn::engine& engine,
+                       const mkldnn::engine& engine, const bool fuse_relu,
+                       const bool fuse_residual_conn,
                        const std::vector<float> output_shift_scale,
-                       bool is_test) const {
+                       const float sum_scale, bool is_test) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};
...
...
@@ -665,7 +774,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         propagation, mkldnn::convolution_direct, src, weights, bias, dst,
         stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);

-    mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale);
+    mkldnn::primitive_attr conv_attr = CreatePostOps(
+        fuse_relu, fuse_residual_conn, output_shift_scale, sum_scale);

    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
        conv_desc, conv_attr, engine);
...
...
@@ -868,7 +978,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
       input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p));
     }
     stream(stream::kind::eager).submit(pipeline).wait();
   }  // Compute()
 };
 }  // namespace operators
...
...
paddle/fluid/operators/elementwise/elementwise_sub_op.cu
...
...
@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
+#include "paddle/fluid/platform/float16.h"

 namespace ops = paddle::operators;

 REGISTER_OP_CUDA_KERNEL(
     elementwise_sub,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
+                              paddle::platform::float16>,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_sub_grad,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::float16>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
...
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h"
#include "paddle/fluid/framework/var_type_inference.h"
namespace paddle {
namespace operators {

class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("W"),
                   "Input W of FusedEmbeddingSeqPoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Ids"),
                   "Input Ids of FusedEmbeddingSeqPoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output of FusedEmbeddingSeqPoolOp should not be null.");

    auto table_dims = ctx->GetInputDim("W");
    auto ids_dims = ctx->GetInputDim("Ids");
    const std::string& combiner = ctx->Attrs().Get<std::string>("combiner");

    PADDLE_ENFORCE_EQ(table_dims.size(), 2);
    PADDLE_ENFORCE_GE(ids_dims.size(), 1,
                      "The dim size of the 'Ids' tensor must greater than 1.");
    PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1,
                      "The last dimension of the 'Ids' tensor must be 1.");
    // we only support sum now
    PADDLE_ENFORCE_EQ(combiner, "sum");

    int64_t last_dim = table_dims[1];
    for (int i = 1; i != ids_dims.size(); ++i) {
      last_dim *= ids_dims[i];
    }

    if (ctx->IsRuntime()) {
      framework::Variable* ids_var =
          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Ids")[0]);
      const auto& ids_lod = ids_var->Get<LoDTensor>().lod();

      // in run time, the LoD of ids must be 1
      PADDLE_ENFORCE(ids_lod.size(), 1u,
                     "The LoD level of Input(Ids) must be 1");
      PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty");

      int64_t batch_size = ids_lod[0].size() - 1;

      // in run time, the shape from Ids -> output
      // should be [seq_length, 1] -> [batch_size, embedding_size]
      ctx->SetOutputDim("Out", framework::make_ddim({batch_size, last_dim}));
    } else {
      // in compile time, the lod level of ids must be 1
      framework::VarDesc* ids_desc =
          boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Ids")[0]);
      PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1);

      // in compile time, the shape from Ids -> output
      // should be [-1, 1] -> [-1, embedding_size]
      ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim}));
    }
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
    return framework::OpKernelType(data_type, ctx.device_context());
  }
};

class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("W",
             "(Tensor) The input represents embedding tensors, "
             "which is a learnable parameter.");
    AddInput("Ids",
             "An input with type int32 or int64 "
             "contains the ids to be looked up in W. "
             "The last dimension size must be 1.");
    AddOutput("Out", "The lookup results, which have the same type as W.");
    AddAttr<std::string>("combiner",
                         "(string, default sum) "
                         "A string specifying the reduction op. Currently sum "
                         "are supported, sum computes the weighted sum of the "
                         "embedding results for each row.")
        .SetDefault("sum");
    // NOTE(minqiyang): grad_inplace is an temporal attribute,
    // please do NOT set this attribute in python layer.
    AddAttr<bool>("grad_inplace",
                  "(boolean, default false) "
                  "If the grad op reuse the input's variable.")
        .SetDefault(false);
    AddAttr<bool>("is_sparse",
                  "(boolean, default false) "
                  "Sparse update.")
        .SetDefault(false);
    AddComment(R"DOC(
FusedEmbeddingSeqPool Operator.

Computes embeddings for the given ids and weights.

This operator is used to perform lookups on the parameter W,
then computes the weighted sum of the lookups results for each row
and concatenated into a dense tensor.

The input Ids should carry the LoD (Level of Details) information.
And the output will change the LoD information with input Ids.

)DOC");
  }
};

class FusedEmbeddingSeqPoolOpGradDescMaker
    : public framework::DefaultGradOpDescMaker<true> {
  using ::paddle::framework::DefaultGradOpDescMaker<true>::DefaultGradOpDescMaker;

 protected:
  virtual std::string GradOpType() const {
    return "fused_embedding_seq_pool_grad";
  }
};

class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    auto table_dims = ctx->GetInputDim("W");
    ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
    return framework::OpKernelType(data_type, ctx.device_context());
  }
};

class FusedEmbeddingSeqPoolOpGradVarTypeInference
    : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
                  framework::BlockDesc* block) const override {
    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
    auto attr = op_desc.GetAttr("is_sparse");
    bool is_sparse = boost::get<bool>(attr);
    if (is_sparse) {
      VLOG(3) << "fused_embedding_seq_pool_grad op "
              << framework::GradVarName("W") << " is set to SelectedRows";
      block->Var(out_var_name)
          ->SetType(framework::proto::VarType::SELECTED_ROWS);
    } else {
      VLOG(3) << "fused_embedding_seq_pool_grad op "
              << framework::GradVarName("W") << " is set to LoDTensor";
      block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
    }
    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(fused_embedding_seq_pool, ops::FusedEmbeddingSeqPoolOp,
                  ops::FusedEmbeddingSeqPoolOpGradDescMaker,
                  ops::FusedEmbeddingSeqPoolOpMaker);
REGISTER_OPERATOR(fused_embedding_seq_pool_grad, ops::FusedEmbeddingSeqPoolOpGrad,
                  ops::FusedEmbeddingSeqPoolOpGradVarTypeInference);

REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool,
                       ops::FusedEmbeddingSeqPoolKernel<float>,
                       ops::FusedEmbeddingSeqPoolKernel<double>);
REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool_grad,
                       ops::FusedEmbeddingSeqPoolGradKernel<float>,
                       ops::FusedEmbeddingSeqPoolGradKernel<double>);
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim;

template <typename T>
struct EmbeddingVSumFunctor {
  void operator()(const framework::ExecutionContext &context,
                  const LoDTensor *table_t, const LoDTensor *ids_t,
                  LoDTensor *output_t) {
    auto *table = table_t->data<T>();
    int64_t row_number = table_t->dims()[0];
    int64_t row_width = table_t->dims()[1];
    int64_t last_dim = output_t->dims()[1];
    const int64_t *ids = ids_t->data<int64_t>();
    auto ids_lod = ids_t->lod()[0];
    int64_t ids_count = ids_t->numel() / ids_lod.back();

    auto *output = output_t->mutable_data<T>(context.GetPlace());

    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
    for (int64_t i = 0; i != ids_lod.size() - 1; ++i) {
      size_t begin = ids_lod[i] * ids_count;
      for (int64_t j = 0; j != ids_count; ++j) {
        PADDLE_ENFORCE_LT(ids[begin], row_number);
        PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i);
        blas.VCOPY(row_width, table + ids[begin + j] * row_width,
                   output + i * last_dim + j * row_width);
      }

      for (int64_t r = (ids_lod[i] + 1) * ids_count;
           r < ids_lod[i + 1] * ids_count; ++r) {
        PADDLE_ENFORCE_LT(ids[r], row_number);
        PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i);
        blas.AXPY(row_width, 1., table + ids[r] * row_width,
                  output + i * last_dim + (r % ids_count) * row_width);
      }
    }
  }
};
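// A small worked illustration of the pooling above (numbers invented): with
// row_width = 2, ids LoD {0, 2, 3} and ids = {4, 7, 1}, ids_count is 1, so
// output row 0 is table[4] + table[7] (one VCOPY followed by one AXPY) and
// output row 1 is just table[1].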
template <typename T>
class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    const LoDTensor *ids_t = context.Input<LoDTensor>("Ids");  // int tensor
    LoDTensor *output_t = context.Output<LoDTensor>("Out");    // float tensor
    const LoDTensor *table_var = context.Input<LoDTensor>("W");
    const std::string &combiner_type = context.Attr<std::string>("combiner");

    if (combiner_type == "sum") {
      EmbeddingVSumFunctor<T> functor;
      functor(context, table_var, ids_t, output_t);
    }
  }
};

template <typename T>
class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto *table_var = context.InputVar("W");
    DDim table_dim;
    if (table_var->IsType<LoDTensor>()) {
      table_dim = context.Input<LoDTensor>("W")->dims();
    } else if (table_var->IsType<SelectedRows>()) {
      auto *table_t = context.Input<SelectedRows>("W");
      table_dim = table_t->value().dims();
    } else {
      PADDLE_THROW(
          "The parameter W of a LookupTable "
          "must be either LoDTensor or SelectedRows");
    }

    bool is_sparse = context.Attr<bool>("is_sparse");
    // Since paddings are not trainable and fixed in forward, the gradient of
    // paddings makes no sense and we don't deal with it in backward.
    if (is_sparse) {
      auto *ids = context.Input<LoDTensor>("Ids");
      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));

      auto *ids_data = ids->data<int64_t>();
      int64_t ids_num = ids->numel();
      auto lod = ids->lod()[0];
      int64_t row_width = d_output->dims()[1];

      framework::Vector<int64_t> *new_rows = d_table->mutable_rows();
      new_rows->resize(ids_num);
      std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t));

      auto *d_table_value = d_table->mutable_value();
      d_table_value->Resize({ids_num, table_dim[1]});
      T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
      const T *d_output_data = d_output->data<T>();

      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
      for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
        int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
        int64_t in_offset = lod[i] * row_width;
        const T *out_pos = d_output_data + i * row_width;
        T *in_pos = d_table_data + in_offset;
        for (int r = 0; r != h; ++r) {
          blas.VCOPY(row_width, out_pos, in_pos + r * row_width);
        }
      }
    } else {
      LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now";
    }
  }
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_seqpool_concat_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/operators/jit/kernels.h"
namespace paddle {
namespace operators {

void FusionSeqPoolConcatOp::InferShape(
    framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
                    "Inputs(X) of FusionSeqPoolConcatOp should not be empty.");
  PADDLE_ENFORCE(ctx->HasOutput("Out"),
                 "Output(Out) of FusionSeqPoolConcatOp should not be null.");
  int axis = ctx->Attrs().Get<int>("axis");
  PADDLE_ENFORCE_EQ(axis, 1,
                    "FusionSeqPoolConcatOp only supports concat axis=1 yet.");
  auto ins_dims = ctx->GetInputsDim("X");
  const size_t n = ins_dims.size();
  PADDLE_ENFORCE_GT(n, 0UL, "Input tensors count should > 0.");
  if (n == 1) {
    LOG(WARNING) << "Only have one input, may waste memory";
  }

  // The output height should be confirmed in Compute,
  // since input lod is not accessible here.
  PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2UL,
                    "The dims size of first input should be 2.");
  ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast<int>(n)});
}

framework::OpKernelType FusionSeqPoolConcatOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  return framework::OpKernelType(
      framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]), ctx.GetPlace());
}

void FusionSeqPoolConcatOpMaker::Make() {
  AddInput("X", "(LoDTensor) Input tensors of this operator.").AsDuplicable();
  AddOutput("Out", "(LoDTensor) Output tensor of concat operator.");
  AddAttr<std::string>("pooltype",
                       "(string, default 'SUM') some of the pooling "
                       "pooltype of SequencePoolOp.")
      .SetDefault("SUM")
      .InEnum({"AVERAGE", "SUM", "SQRT"});
  AddAttr<int>("axis",
               "The axis along which the input tensors will be concatenated. "
               "Only supports concat axis=1 yet.")
      .SetDefault(1);
  AddComment(R"DOC(
Fusion Sequence Pool of pooltype(sum, average and sqrt) and Concat Operator.
)DOC");
}

template <typename T>
class FusionSeqPoolConcatKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto ins = ctx.MultiInput<LoDTensor>("X");
    auto* out = ctx.Output<LoDTensor>("Out");
    std::string pooltype = ctx.Attr<std::string>("pooltype");
    auto x0_lod = ins[0]->lod();
    auto x0_dims = ins[0]->dims();
    auto y_dims = out->dims();
    size_t bs = x0_lod[0].size() - 1;
    out->Resize({static_cast<int64_t>(bs), y_dims[1]});
    framework::LoD y_lod(1);
    y_lod[0].resize(bs + 1);
    for (size_t i = 0; i <= bs; ++i) {
      y_lod[0][i] = i;
    }
    out->set_lod(y_lod);
    auto place = ctx.GetPlace();
    T* y_data = out->mutable_data<T>(place);

    int w = ins[0]->numel() / x0_dims[0];
    PADDLE_ENFORCE_EQ(y_dims[1] % w, 0,
                      "The output of dims[1] should be dividable of w");
    jit::seq_pool_attr_t attr(w, jit::SeqPoolType::kSum);
    if (pooltype == "AVERAGE") {
      attr.type = jit::SeqPoolType::kAvg;
    } else if (pooltype == "SQRT") {
      attr.type = jit::SeqPoolType::kSqrt;
    }
    auto seqpool =
        jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
            attr);
    size_t n = ins.size();
    size_t dst_step_size = n * w;
    for (size_t i = 0; i < n; ++i) {
      auto x_dims = ins[i]->dims();
      auto x_lod = ins[i]->lod()[0];
      const T* src = ins[i]->data<T>();
      T* dst = y_data + i * w;
      PADDLE_ENFORCE_EQ(static_cast<int>(ins[i]->numel() / x_dims[0]), w,
                        "Width of all inputs should be equal.");
      PADDLE_ENFORCE_EQ(x_lod.size(), bs + 1,
                        "Batchsize of all inputs should be equal.");
      for (size_t j = 0; j < bs; ++j) {
        attr.h = static_cast<int>(x_lod[j + 1] - x_lod[j]);
        seqpool(src, dst, &attr);
        dst += dst_step_size;
        src += attr.h * attr.w;
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(fusion_seqpool_concat, ops::FusionSeqPoolConcatOp,
                  ops::FusionSeqPoolConcatOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);

REGISTER_OP_CPU_KERNEL(fusion_seqpool_concat,
                       ops::FusionSeqPoolConcatKernel<float>,
                       ops::FusionSeqPoolConcatKernel<double>);
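A hedged sketch of the shapes the fused kernel above produces; the sizes here are illustrative only, not from the source:

// X[0]: [5, 4] with lod {0, 2, 5}   ->  bs = 2, w = 4
// X[1]: [7, 4] with lod {0, 3, 7}   ->  same batch size, same width (enforced above)
// Each input is pooled to one row per sequence and written with stride
// dst_step_size = n * w, so Out is [2, 2 * 4] with lod {0, 1, 2}.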
paddle/fluid/operators/fused/fusion_seqpool_concat_op.h
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {

using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;

class FusionSeqPoolConcatOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};

class FusionSeqPoolConcatOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override;
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/jit/benchmark.cc
...
...
@@ -52,11 +52,11 @@ struct BenchFunc {
    for (int i = 0; i < FLAGS_burning; ++i) {
      tgt(args...);
    }
    auto start = paddle::platform::PosixInNsec() / 1e-3;
    auto start = paddle::platform::PosixInNsec() * 1e-3;
    for (int i = 0; i < FLAGS_repeat; ++i) {
      tgt(args...);
    }
    auto end = paddle::platform::PosixInNsec() / 1e-3;
    auto end = paddle::platform::PosixInNsec() * 1e-3;
    return static_cast<double>(end - start) / FLAGS_repeat;
  }
};
...
...
@@ -190,6 +190,26 @@ void BenchGRUKernel() {
  }
}

template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchSeqPoolKernel() {
  std::vector<jit::SeqPoolType> pool_types = {
      jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
  for (auto type : pool_types) {
    for (int w : TestSizes()) {
      jit::seq_pool_attr_t attr(w, type);
      for (int h : TestSizes()) {
        attr.h = h;
        std::vector<T> x(h * w), y(w);
        RandomVec<T>(h * w, x.data(), -2.f, 2.f);
        const T* x_data = x.data();
        T* y_data = y.data();
        BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data,
                                                            y_data, &attr);
      }
    }
  }
}
// Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...]
// Options:
...
...
@@ -228,4 +248,7 @@ int main(int argc, char* argv[]) {
  BenchGRUKernel<jit::kGRUH1, T, PlaceType>();
  BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
  BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();

  // seq pool function
  BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>();
}
paddle/fluid/operators/jit/gen/CMakeLists.txt
...
...
@@ -26,3 +26,4 @@ USE_JITKERNEL_GEN(kGRUH1)
USE_JITKERNEL_GEN(kGRUHtPart1)
USE_JITKERNEL_GEN(kGRUHtPart2)
USE_JITKERNEL_GEN(kNCHW16CMulNC)
USE_JITKERNEL_GEN(kSeqPool)
paddle/fluid/operators/jit/gen/seqpool.cc
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/seqpool.h"
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {

void SeqPoolJitCode::genCode() {
  constexpr int block = YMM_FLOAT_BLOCK;
  constexpr int max_num_regs = 8;
  const int num_block = w_ / block;
  const int num_groups = num_block / max_num_regs;
  int rest_num_regs = num_block % max_num_regs;
  mov(reg32_int_h, dword[param_attr]);
  if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
    mov(reg_tmp, reinterpret_cast<size_t>(exp_float_consts));
    vmovups(xmm_t(1), ptr[reg_tmp + OFFSET_EXP_ONE]);
    mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
    fild(dword[param_attr]);
    fstp(dword[reg_tmp]);
    vmovss(xmm_t(0), ptr[reg_tmp]);
    if (type_ == SeqPoolType::kSqrt) {
      vsqrtps(xmm_t(0), xmm_t(0));
    }
    vdivps(xmm_t(1), xmm_t(1), xmm_t(0));
    vmovss(ptr[reg_tmp], xmm_t(1));
  }
  const int group_len = max_num_regs * block * sizeof(float);
  for (int g = 0; g < num_groups; ++g) {
    pool_height<ymm_t>(g * group_len, block, max_num_regs);
  }
  if (rest_num_regs > 0) {
    pool_height<ymm_t>(num_groups * group_len, block, rest_num_regs);
  }
  // part of rest_w * height
  const int rest = w_ % block;
  pool_height_of_rest_width(rest, (w_ - rest) * sizeof(float), max_num_regs);
  ret();
}

class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
 public:
  bool UseMe(const seq_pool_attr_t& attr) const override {
    return platform::MayIUse(platform::avx);
  }
  size_t CodeSize(const seq_pool_attr_t& attr) const override {
    return 96 + ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) *
                     4 /* load, mul and save */ +
                 256) *
                    8;
  }
  std::unique_ptr<GenBase> CreateJitCode(
      const seq_pool_attr_t& attr) const override {
    PADDLE_ENFORCE_GT(attr.w, 0);
    PADDLE_ENFORCE_GT(attr.h, 0);
    return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
  }
};

}  // namespace gen
}  // namespace jit
}  // namespace operators
}  // namespace paddle

namespace gen = paddle::operators::jit::gen;

REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator);
paddle/fluid/operators/jit/gen/seqpool.h
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {

class SeqPoolJitCode : public JitCode {
 public:
  explicit SeqPoolJitCode(const seq_pool_attr_t& attr,
                          size_t code_size = 256 * 1024,
                          void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) {
    if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg ||
          type_ == SeqPoolType::kSqrt)) {
      LOG(FATAL) << "Only support sum pool yet ";
    }
    fp_h_[0] = 1.f;
    this->genCode();
  }

  virtual const char* name() const {
    std::string base = "SeqPoolJitCode";
    if (type_ == SeqPoolType::kSum) {
      base += "_Sum";
    } else if (type_ == SeqPoolType::kAvg) {
      base += "_Avg";
    } else if (type_ == SeqPoolType::kSqrt) {
      base += "_Sqrt";
    }
    base += ("_W" + std::to_string(w_));
    return base.c_str();
  }
  void genCode() override;

 protected:
  template <typename JMM>
  void pool_height(int w_offset, int block, int max_num_regs) {
    int offset = w_offset;
    for (int i = 0; i < max_num_regs; ++i) {
      vmovups(JMM(i), ptr[param_src + offset]);
      offset += sizeof(float) * block;
    }
    cmp(reg32_int_h, 1);
    Label l_next_h, l_h_done;
    jle(l_h_done, T_NEAR);
    mov(reg_h_i, 1);
    mov(reg_tmp, param_src);
    add(reg_tmp, w_ * sizeof(float) + w_offset);
    L(l_next_h);
    {
      mov(reg_ptr_src_i, reg_tmp);
      for (int i = 0; i < max_num_regs; ++i) {
        vmovups(JMM(i + max_num_regs), ptr[reg_ptr_src_i]);
        // sum anyway
        vaddps(JMM(i), JMM(i), JMM(i + max_num_regs));
        add(reg_ptr_src_i, sizeof(float) * block);
      }
      inc(reg_h_i);
      add(reg_tmp, w_ * sizeof(float));
      cmp(reg_h_i, reg32_int_h);
      jl(l_next_h, T_NEAR);
    }
    L(l_h_done);
    // save right now
    if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
      mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
      vbroadcastss(JMM(max_num_regs), ptr[reg_tmp]);
    }
    offset = w_offset;
    for (int i = 0; i < max_num_regs; ++i) {
      if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
        vmulps(JMM(i), JMM(i), JMM(max_num_regs));
      }
      vmovups(ptr[param_dst + offset], JMM(i));
      offset += sizeof(float) * block;
    }
  }

  void pool_height_of_rest_width(int rest, int w_offset, int max_num_regs) {
    const int rest_used_num_regs = load_rest(rest, w_offset, 0);
    const bool has_block4 = rest / 4 > 0;
    const bool has_block2 = (rest % 4) / 2 > 0;
    const bool has_block1 = (rest % 2) == 1;
    cmp(reg32_int_h, 1);
    Label l_next_h, l_h_done;
    jle(l_h_done, T_NEAR);
    mov(reg_h_i, 1);
    mov(reg_tmp, param_src);
    add(reg_tmp, w_ * sizeof(float) + w_offset);
    L(l_next_h);
    {
      int reg_idx = 0;
      mov(reg_ptr_src_i, reg_tmp);
      if (has_block4) {
        vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
        add(reg_ptr_src_i, sizeof(float) * 4);
        reg_idx++;
      }
      if (has_block2) {
        vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
        add(reg_ptr_src_i, sizeof(float) * 2);
        reg_idx++;
      }
      if (has_block1) {
        vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
        reg_idx++;
      }
      PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs,
                        "All heights should use same regs");
      for (int i = 0; i < reg_idx; ++i) {
        vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
      }
      inc(reg_h_i);
      add(reg_tmp, w_ * sizeof(float));
      cmp(reg_h_i, reg32_int_h);
      jl(l_next_h, T_NEAR);
    }
    L(l_h_done);
    // save right now
    if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
      mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
      vbroadcastss(xmm_t(max_num_regs), ptr[reg_tmp]);
      for (int i = 0; i < rest_used_num_regs; ++i) {
        vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs));
      }
    }
    save_rest(rest, w_offset);
  }

  // return the number of used regs, use start from reg 0
  int load_rest(int rest, int w_offset, const int num_shift_regs,
                const int reg_start = 0) {
    const bool has_block4 = rest / 4 > 0;
    const bool has_block2 = (rest % 4) / 2 > 0;
    const bool has_block1 = (rest % 2) == 1;
    int reg_idx = reg_start;
    if (has_block4) {
      vmovups(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
      w_offset += sizeof(float) * 4;
      reg_idx++;
    }
    if (has_block2) {
      vmovq(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
      w_offset += sizeof(float) * 2;
      reg_idx++;
    }
    if (has_block1) {
      vmovss(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
      reg_idx++;
    }
    return reg_idx;
  }

  // use reg start from 0
  void save_rest(int rest, int w_offset, int reg_start = 0) {
    const bool has_block4 = rest / 4 > 0;
    const bool has_block2 = (rest % 4) / 2 > 0;
    const bool has_block1 = (rest % 2) == 1;
    int reg_idx = reg_start;
    if (has_block4) {
      vmovups(ptr[param_dst + w_offset], xmm_t(reg_idx));
      w_offset += sizeof(float) * 4;
      reg_idx++;
    }
    if (has_block2) {
      vmovq(ptr[param_dst + w_offset], xmm_t(reg_idx));
      w_offset += sizeof(float) * 2;
      reg_idx++;
    }
    if (has_block1) {
      vmovss(ptr[param_dst + w_offset], xmm_t(reg_idx));
    }
  }

 private:
  float ALIGN32_BEG fp_h_[1] ALIGN32_END;
  int w_;
  SeqPoolType type_;
  reg64_t param_src{abi_param1};
  reg64_t param_dst{abi_param2};
  reg64_t param_attr{abi_param3};
  reg64_t reg_tmp{rax};

  reg32_t reg32_int_h{r8d};
  reg32_t reg32_fp_h{r9d};

  reg64_t reg_h_i{r10};
  reg64_t reg_ptr_src_i{r11};
};

}  // namespace gen
}  // namespace jit
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/jit/helper.cc
...
...
@@ -26,6 +26,7 @@ namespace jit {
const char* to_string(KernelType kt) {
  switch (kt) {
    ONE_CASE(kNone);
    ONE_CASE(kVMul);
    ONE_CASE(kVAdd);
    ONE_CASE(kVAddRelu);
...
...
@@ -45,12 +46,26 @@ const char* to_string(KernelType kt) {
    ONE_CASE(kCRFDecoding);
    ONE_CASE(kLayerNorm);
    ONE_CASE(kNCHW16CMulNC);
    ONE_CASE(kSeqPool);
    default:
      PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
      return "NOT JITKernel";
  }
  return nullptr;
}

const char* to_string(SeqPoolType tp) {
  switch (tp) {
    ONE_CASE(kNonePoolType);
    ONE_CASE(kSum);
    ONE_CASE(kAvg);
    ONE_CASE(kSqrt);
    default:
      PADDLE_THROW("Not support type: %d, or forget to add it.", tp);
      return "NOT PoolType";
  }
  return nullptr;
}
#undef ONE_CASE

KernelType to_kerneltype(const std::string& act) {
...
...
paddle/fluid/operators/jit/helper.h
...
...
@@ -119,6 +119,7 @@ typename KernelTuples::func_type Get(
}

const char* to_string(KernelType kt);
const char* to_string(SeqPoolType kt);

KernelType to_kerneltype(const std::string& act);
...
...
@@ -134,6 +135,11 @@ inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) {
     << "],act_cand[" << to_string(attr.act_cand) << "]";
  return os;
}

inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) {
  os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type["
     << to_string(attr.type) << "]";
  return os;
}

}  // namespace jit
}  // namespace operators
...
...
paddle/fluid/operators/jit/kernel_base.h
...
...
@@ -41,8 +41,16 @@ typedef enum {
  kCRFDecoding,
  kLayerNorm,
  kNCHW16CMulNC,
  kSeqPool,
} KernelType;

typedef enum {
  kNonePoolType = 0,
  kSum = 1,
  kAvg,
  kSqrt,
} SeqPoolType;

template <typename T>
struct XYZNTuples {
  typedef T data_type;
...
...
@@ -112,6 +120,21 @@ struct GRUTuples {
  typedef void (*func_type)(gru_t*, const gru_attr_t*);
};

typedef struct seq_pool_attr_s {
  int h, w;  // h should always be the first one
  SeqPoolType type;
  seq_pool_attr_s() = default;
  explicit seq_pool_attr_s(int width, SeqPoolType pool_type, int height = 1)
      : h(height), w(width), type(pool_type) {}
} seq_pool_attr_t;

template <typename T>
struct SeqPoolTuples {
  typedef T data_type;
  typedef seq_pool_attr_t attr_type;
  typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
};

template <typename T>
struct CRFDecodingTuples {
  typedef T data_type;
...
...
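A short usage sketch of seq_pool_attr_t with the jit kernel cache, mirroring how the fused operator above uses it (float data and a CPU place are assumed; the width/height values and the src/dst pointers are made up for illustration):

jit::seq_pool_attr_t attr(64, jit::SeqPoolType::kAvg);  // width 64, h defaults to 1
auto seqpool =
    jit::Get<jit::kSeqPool, jit::SeqPoolTuples<float>, platform::CPUPlace>(attr);
attr.h = 10;               // set per sequence before each call
seqpool(src, dst, &attr);  // pools a 10 x 64 float block into 64 outputs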
paddle/fluid/operators/jit/kernel_key.cc
...
...
@@ -42,6 +42,13 @@ size_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
         (static_cast<int>(attr.act_cand) << act_type_shift);
}

template <>
size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
  size_t key = attr.w;
  constexpr int pool_type_shift = 3;
  return (key << pool_type_shift) + static_cast<int>(attr.type);
}

}  // namespace jit
}  // namespace operators
}  // namespace paddle
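The key above simply packs the width and the pool type into one integer; a worked example (the values are made up):

// key = (attr.w << pool_type_shift) + static_cast<int>(attr.type)
// w = 64, type = kAvg (= 2)  ->  (64 << 3) + 2 = 514
// Distinct widths or pool types therefore map to distinct jit code keys,
// as long as the SeqPoolType enum stays below 2^3 = 8.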
paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
...
...
@@ -9,3 +9,4 @@ USE_JITKERNEL_MORE(kVScal, mkl)
USE_JITKERNEL_MORE(kVExp, mkl)
USE_JITKERNEL_MORE(kVSigmoid, mkl)
USE_JITKERNEL_MORE(kVTanh, mkl)
USE_JITKERNEL_MORE(kSeqPool, mkl)
paddle/fluid/operators/jit/more/mkl/mkl.cc
...
...
@@ -72,6 +72,26 @@ void VExp<double>(const double* x, double* y, int n) {
  platform::dynload::vdExp(n, x, y);
}

template <>
void VCopy<float>(const float* x, float* y, int n) {
  platform::dynload::cblas_scopy(n, x, 1, y, 1);
}

template <>
void VCopy<double>(const double* x, double* y, int n) {
  platform::dynload::cblas_dcopy(n, x, 1, y, 1);
}

template <>
void VAXPY<float>(float a, const float* x, float* y, int n) {
  platform::dynload::cblas_saxpy(n, a, x, 1, y, 1);
}

template <>
void VAXPY<double>(double a, const double* x, double* y, int n) {
  platform::dynload::cblas_daxpy(n, a, x, 1, y, 1);
}

// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
template <>
bool VMulKernel<float>::UseMe(const int& d) const {
...
...
@@ -103,6 +123,16 @@ bool VTanhKernel<float>::UseMe(const int& d) const {
  return d > 7;
}

template <>
bool SeqPoolKernel<float>::UseMe(const seq_pool_attr_t& attr) const {
  return true;
}

template <>
bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
  return true;
}

#define AWALYS_USE_ME_WITH_DOUBLE(func)                  \
  template <>                                            \
  bool func##Kernel<double>::UseMe(const int& d) const { \
...
...
@@ -135,5 +165,6 @@ REGISTER_MKL_KERNEL(kVScal, VScal);
REGISTER_MKL_KERNEL(kVExp, VExp);
REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
REGISTER_MKL_KERNEL(kVTanh, VTanh);
REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
#undef REGISTER_MKL_KERNEL
paddle/fluid/operators/jit/more/mkl/mkl.h
...
...
@@ -14,6 +14,7 @@
#pragma once
#include <cmath>
#include <type_traits>
#include "paddle/fluid/operators/jit/kernel_base.h"
...
...
@@ -35,6 +36,12 @@ void VScal(const T* a, const T* x, T* y, int n);
template <typename T>
void VExp(const T* x, T* y, int n);

template <typename T>
void VCopy(const T* x, T* y, int n);

template <typename T>
void VAXPY(T a, const T* x, T* y, int n);

template <typename T>
void VSigmoid(const T* x, T* y, int n) {
  const T min = SIGMOID_THRESHOLD_MIN;
...
...
@@ -60,6 +67,23 @@ void VTanh(const T* x, T* y, int n) {
  }
}

template <typename T>
void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
  VCopy<T>(x, y, attr->w);
  for (int h = 1; h != attr->h; ++h) {
    VAXPY<T>(static_cast<T>(1), x + h * attr->w, y, attr->w);
  }
  if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) {
    T scalar = static_cast<T>(1);
    if (attr->type == SeqPoolType::kAvg) {
      scalar = scalar / static_cast<T>(attr->h);
    } else {
      scalar = scalar / std::sqrt(static_cast<T>(attr->h));
    }
    VScal<T>(&scalar, y, y, attr->w);
  }
}

#define DECLARE_MKL_KERNEL(name, tuples)                             \
  template <typename T>                                              \
  class name##Kernel : public KernelMore<tuples<T>> {                \
...
...
@@ -81,6 +105,8 @@ DECLARE_MKL_KERNEL(VExp, XYNTuples);
DECLARE_MKL_KERNEL(VSigmoid, XYNTuples);
DECLARE_MKL_KERNEL(VTanh, XYNTuples);

DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);

#undef DECLARE_MKL_KERNEL

}  // namespace mkl
...
...
paddle/fluid/operators/jit/refer/CMakeLists.txt
...
...
@@ -26,3 +26,4 @@ USE_JITKERNEL_REFER(kGRUHtPart2)
USE_JITKERNEL_REFER(kCRFDecoding)
USE_JITKERNEL_REFER(kLayerNorm)
USE_JITKERNEL_REFER(kNCHW16CMulNC)
USE_JITKERNEL_REFER(kSeqPool)
paddle/fluid/operators/jit/refer/refer.cc
...
...
@@ -47,4 +47,6 @@ REGISTER_REFER_KERNEL(kLayerNorm, LayerNorm);
REGISTER_REFER_KERNEL(kNCHW16CMulNC, NCHW16CMulNC);

REGISTER_REFER_KERNEL(kSeqPool, SeqPool);
#undef REGISTER_REFER_KERNEL
paddle/fluid/operators/jit/refer/refer.h
...
...
@@ -332,6 +332,28 @@ void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) {
  }
}

template <typename T>
void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
  for (int w = 0; w < attr->w; ++w) {
    const T* src = x + w;
    T* dst = y + w;
    *dst = static_cast<T>(0);
    for (int h = 0; h < attr->h; ++h) {
      *dst = *dst + *src;
      src += attr->w;
    }
  }
  if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) {
    T scalar = static_cast<T>(1);
    if (attr->type == SeqPoolType::kAvg) {
      scalar = scalar / static_cast<T>(attr->h);
    } else {
      scalar = scalar / std::sqrt(static_cast<T>(attr->h));
    }
    VScal<T>(&scalar, y, y, attr->w);
  }
}

#define DECLARE_REFER_KERNEL(name, tuples)             \
  template <typename T>                                 \
  class name##Kernel : public ReferKernel<tuples<T>> {  \
...
...
@@ -370,6 +392,8 @@ DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples);
DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples);

DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples);

#undef DECLARE_REFER_KERNEL

}  // namespace refer
...
...
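A standalone sketch mirroring the reference SeqPool above (plain C++, not the registered kernel; the function name is illustrative only):

#include <cmath>
#include <vector>
// Column-wise pooling over an h x w row-major block, as in refer::SeqPool.
std::vector<float> seq_pool_ref(const std::vector<float>& x, int h, int w,
                                bool avg, bool use_sqrt) {
  std::vector<float> y(w, 0.f);
  for (int i = 0; i < h; ++i)                  // sum over the h rows
    for (int j = 0; j < w; ++j) y[j] += x[i * w + j];
  float scalar = 1.f;
  if (avg) scalar /= static_cast<float>(h);
  if (use_sqrt) scalar /= std::sqrt(static_cast<float>(h));
  for (auto& v : y) v *= scalar;               // no-op for plain sum pooling
  return y;
}
// e.g. x = {1,2,3, 4,5,6}, h = 2, w = 3: sum -> {5,7,9}, avg -> {2.5,3.5,4.5}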
paddle/fluid/operators/jit/test.cc
...
...
@@ -211,6 +211,24 @@ struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
  }
};

template <typename T>
struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>,
                         std::vector<T>> {
  void operator()(const typename jit::SeqPoolTuples<T>::func_type tgt,
                  const std::vector<T>& x, const std::vector<T>& yref,
                  const typename jit::SeqPoolTuples<T>::attr_type& attr) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(x.size() % yref.size(), 0);
    int w = yref.size();
    std::vector<T> y(w);
    const T* x_data = x.data();
    const T* yref_data = yref.data();
    T* y_data = y.data();
    tgt(x_data, y_data, &attr);
    ExpectEQ<T>(y_data, yref_data, w);
  }
};

template <paddle::operators::jit::KernelType KT, typename KernelTuples,
          typename PlaceType, typename... Args>
void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
...
...
@@ -415,6 +433,31 @@ void TestGRUKernel() {
  }
}

template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestSeqPoolKernel() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  std::vector<jit::SeqPoolType> pool_types = {
      jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
  for (auto type : pool_types) {
    for (int w : TestSizes()) {
      jit::seq_pool_attr_t attr(w, type);
      for (int h : TestSizes()) {
        attr.h = h;
        auto ref = jit::GetRefer<KT, jit::SeqPoolTuples<T>>();
        EXPECT_TRUE(ref != nullptr);
        std::vector<T> x(h * w), yref(w);
        RandomVec<T>(h * w, x.data(), -2.f, 2.f);
        const T* x_data = x.data();
        T* yref_data = yref.data();
        ref(x_data, yref_data, &attr);
        VLOG(10) << attr;
        TestAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType, std::vector<T>,
                     std::vector<T>>(attr, x, yref, attr);
      }
    }
  }
}

template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestNCHW16CMulNCKernel() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
...
...
@@ -569,6 +612,12 @@ TEST(JITKernel, kGRUHtPart2) {
  TestGRUKernel<jit::kGRUHtPart2, double, paddle::platform::CPUPlace>();
}

TEST(JITKernel, kSeqPool) {
  namespace jit = paddle::operators::jit;
  TestSeqPoolKernel<jit::kSeqPool, float, paddle::platform::CPUPlace>();
  TestSeqPoolKernel<jit::kSeqPool, double, paddle::platform::CPUPlace>();
}

TEST(JITKernel, kNCHW16CMulNC) {
  namespace jit = paddle::operators::jit;
  TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, float,
...
...
paddle/fluid/operators/math/CMakeLists.txt
...
...
@@ -51,7 +51,7 @@ math_library(pooling)
math_library(selected_rows_functor DEPS selected_rows math_function blas)
math_library(sequence2batch)
math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function)
math_library(sequence_pooling DEPS math_function jit_kernel_helper)
math_library(sequence_scale)
math_library(softmax DEPS math_function)
...
...
paddle/fluid/operators/math/selected_rows_functor.cc
...
...
@@ -195,6 +195,10 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::SelectedRows& input1,
                  framework::Tensor* input2) {
    if (UNLIKELY(input1.rows().size() == 0)) {
      LOG(WARNING) << "input selected rows is empty!";
      return;
    }
    auto in1_height = input1.height();
    auto in2_dims = input2->dims();
    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
...
...
paddle/fluid/operators/math/sequence_pooling.cc
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_pooling.h"
...
...
@@ -239,15 +240,33 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
      last_pool(context, input, output);
      return;
    }
    if (pooltype == "FIRST") {
      math::FirstSeqPoolFunctor<T> first_pool;
      first_pool(context, input, output);
      return;
    }

    auto lod = input.lod()[0];
    if (pooltype == "SUM") {
      auto place = context.GetPlace();
      PADDLE_ENFORCE(platform::is_cpu_place(place));
      const T* src = input.data<T>();
      T* dst = output->mutable_data<T>(place);
      jit::seq_pool_attr_t attr(
          static_cast<int>(input.numel() / input.dims()[0]),
          jit::SeqPoolType::kSum);
      auto seqpool =
          jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
              attr);
      for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
        attr.h = static_cast<int>(lod[i + 1] - lod[i]);
        seqpool(src, dst, &attr);
        dst += attr.w;
        src += attr.h * attr.w;
      }
      return;
    }
    auto& place = *context.eigen_device();
    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
      Tensor in_t =
          input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
...
...
@@ -258,15 +277,6 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
      auto out_e = EigenVector<T>::Flatten(out_t);
      if (pooltype == "AVERAGE") {
        out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
      } else if (pooltype == "SUM") {
        if (h > 0) {
          const T* in_data = in_t.data<T>();
          T* out_data = out_t.mutable_data<T>(context.GetPlace());
          blas.VCOPY(w, in_data, out_data);
          for (int64_t r = 1; r != h; ++r) {
            blas.AXPY(w, 1., in_data + r * w, out_data);
          }
        }
      } else if (pooltype == "SQRT") {
        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
                              std::sqrt(static_cast<T>(h));
...
...
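How the SUM fast path above walks the LoD, with made-up sizes for illustration:

// input: [6, 3] with lod[0] = {0, 2, 6}  ->  attr.w = 6 * 3 / 6 = 3
// i = 0: attr.h = 2, pools rows 0..1 into output row 0; then src += 2 * 3, dst += 3
// i = 1: attr.h = 4, pools rows 2..5 into output row 1
// output: a [2, 3] tensor of per-sequence sums.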
paddle/fluid/operators/math/softmax.h
...
...
@@ -49,6 +49,7 @@ class SoftmaxGradCUDNNFunctor {
                  const framework::Tensor* Y, const framework::Tensor* y_grad,
                  framework::Tensor* x_grad);
};
#endif

}  // namespace math
...
paddle/fluid/operators/ngraph/ngraph_ops.h
...
...
@@ -23,5 +23,7 @@ limitations under the License. */
#include "ops/binary_unnary_op.h"
#include "ops/fill_constant_op.h"
#include "ops/mean_op.h"
#include "ops/mul_op.h"
#include "ops/scale_op.h"
#include "ops/top_k_op.h"
paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
...
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
...
...
@@ -48,4 +47,3 @@ static void BuildUnaryNode(
}  // namespace ngraphs
}  // namespace operators
}  // namespace paddle
#endif
paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h
0 → 100644
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {

template <typename T>
std::shared_ptr<ngraph::Node> ElementwiseScalar(
    float scale, std::shared_ptr<ngraph::Node> node) {
  auto node_shape = node->get_shape();
  auto scale_const = ngraph::op::Constant::create(node->get_element_type(),
                                                  node_shape, {scale});
  return std::make_shared<T>(scale_const, node);
}

template <typename T>
std::shared_ptr<ngraph::Node> ElementwiseScalar(
    std::shared_ptr<ngraph::Node> scale_1d,
    std::shared_ptr<ngraph::Node> node) {
  auto scale_shape = scale_1d->get_shape();
  PADDLE_ENFORCE_EQ(scale_shape.size(), 1, "Supporting 1d scale node");
  PADDLE_ENFORCE_EQ(scale_shape.at(0), 1, "scale 1d in in shape {1}");

  auto node_shape = node->get_shape();
  ngraph::AxisSet axis_set;
  for (size_t i = 0; i < node_shape.size(); ++i) {
    axis_set.insert(i);
  }
  node_shape.push_back(1);

  auto scale_bcast =
      std::make_shared<ngraph::op::Broadcast>(scale_1d, node_shape, axis_set);

  auto scale_reshape =
      paddle::platform::NgReshaper(scale_bcast, node->get_shape());

  return std::make_shared<T>(scale_reshape, node);
}
}  // namespace ngraphs
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/ngraph/ops/fill_constant_op.h
...
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
...
...
@@ -58,4 +57,3 @@ void BuildFillConstantNode(
}  // namespace ngraphs
}  // namespace operators
}  // namespace paddle
#endif
paddle/fluid/operators/ngraph/ops/mean_op.h
0 → 100644
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {

void BuildMeanNode(
    const std::shared_ptr<paddle::framework::OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
  ngraph::AxisSet axes;
  for (size_t i = 0; i < input->get_shape().size(); ++i) {
    axes.insert(i);
  }

  auto mean = ngraph::builder::mean(input, axes);
  auto mean_1d = std::make_shared<ngraph::op::Reshape>(
      mean, ngraph::AxisVector{}, ngraph::Shape{1});
  paddle::platform::SetOutputNode(op, "Out", mean_1d, ngb_node_map);
}

void BuildMeanGradNode(
    const std::shared_ptr<paddle::framework::OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
  auto og = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
  auto x_shape = x->get_shape();
  float x_size = std::accumulate(std::begin(x_shape), std::end(x_shape), 1,
                                 std::multiplies<float>());
  auto node_const = ngraph::op::Constant::create(og->get_element_type(),
                                                 ngraph::Shape{1}, {x_size});
  auto node_div = std::make_shared<ngraph::op::Divide>(og, node_const);

  auto result = ElementwiseScalar<ngraph::op::Add>(
      og / node_const,
      ngraph::op::Constant::create(og->get_element_type(), x_shape, {0}));
  paddle::platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map);
}
}  // namespace ngraphs
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/ngraph/ops/mul_op.h
...
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
...
...
@@ -131,4 +130,3 @@ static void BuildMulGradNode(
}  // namespace ngraphs
}  // namespace operators
}  // namespace paddle
#endif
paddle/fluid/operators/ngraph/ops/scale_op.h
0 → 100644
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {

void BuildScaleNode(
    const std::shared_ptr<paddle::framework::OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
  float scale = op_attrs.Get<float>("scale");
  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
  auto out = ElementwiseScalar<ngraph::op::Multiply>(scale, x);
  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
}
}  // namespace ngraphs
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/ngraph/ops/top_k_op.h
...
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
...
...
@@ -48,4 +47,3 @@ void BuildTopKNode(
}  // namespace ngraphs
}  // namespace operators
}  // namespace paddle
#endif
paddle/fluid/operators/softmax_with_cross_entropy_op.cu
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -58,12 +55,24 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
}  // namespace

static __device__ __forceinline__ float real_exp(float x) { return expf(x); }
static __device__ __forceinline__ double real_exp(double x) { return exp(x); }
static __device__ __forceinline__ float real_log(float x) {
static __device__ __forceinline__ platform::float16 exp_on_device(
    platform::float16 x) {
  return ::Eigen::numext::exp(x);
}
static __device__ __forceinline__ float exp_on_device(float x) {
  return expf(x);
}
static __device__ __forceinline__ double exp_on_device(double x) {
  return exp(x);
}
static __device__ __forceinline__ platform::float16 log_on_device(
    platform::float16 x) {
  return math::TolerableValue<platform::float16>()(::Eigen::numext::log(x));
}
static __device__ __forceinline__ float log_on_device(float x) {
  return math::TolerableValue<float>()(logf(x));
}
static __device__ __forceinline__ double real_log(double x) {
static __device__ __forceinline__ double log_on_device(double x) {
  return math::TolerableValue<double>()(log(x));
}
...
...
@@ -72,25 +81,20 @@ static __device__ __forceinline__ double real_log(double x) {
/*
Supposing the x is `logits` and y is `labels`, the equations are as
follows:
cross\_entropy_i = \sum_{j}[- y_i_j * log({e^{x_i_j}/\sum_{j}e^{x_i_j}})]
= \sum_{j}[- y_i_j * log({e^{x_i_j - max_i}/\sum_{j}e^{x_i_j-max_i}})]
= \sum_{j}[-y_i_j * (x_i_j - max_i - log\sum_{j}e^{x_i_j - max_i})]
= \sum_{j}[-y_i_j * (x_i_j - max_i - logDiffMaxSum_i)]
= \sum_{j}(-y_i_j * tmp_i_j)
softmax_i_j = e^{tmp_i_j}
where:
max_i = \max_{j}{x_i_j}
logDiffMaxSum_i = log\sum_{j}e^{x_i_j - max_i}
tmp_i_j = x_i_j - max_i - logDiffMaxSum_i
Therefore, the calculation can be separated into 3 steps:
Step 1: row-wise operation to calculate max_i
Step 2: row-wise operation to calculate logDiffMaxSum_i
Step 3: calculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i
To save memory, we can share memory among max_i, logDiffMaxSum_i and
cross\_entropy_i.
In this way, the 3 steps should be changed to:
...
...
@@ -134,7 +138,8 @@ static __global__ void RowReductionForMax(const T* logits_data, T* max_data,
  cur_max = BlockReduce<T, BlockDim>(temp_storage).Reduce(cur_max, cub::Max());
  if (threadIdx.x == 0) {
    max_data[blockIdx.x] = cur_max < -64 ? -64 : cur_max;
    max_data[blockIdx.x] =
        cur_max < static_cast<T>(-64) ? static_cast<T>(-64) : cur_max;
  }
}
...
...
@@ -151,17 +156,17 @@ static __global__ void RowReductionForDiffMaxSum(const T* logits_data,
  auto block_max = max_data[blockIdx.x];

  softmax[beg_idx] = logits_data[beg_idx] - block_max;
  T diff_max_sum = real_exp(softmax[beg_idx]);
  T diff_max_sum = exp_on_device(softmax[beg_idx]);
  auto idx = beg_idx + BlockDim;
  while (idx < end_idx) {
    softmax[idx] = logits_data[idx] - block_max;
    diff_max_sum += real_exp(softmax[idx]);
    diff_max_sum += exp_on_device(softmax[idx]);
    idx += BlockDim;
  }

  diff_max_sum =
      BlockReduce<T, BlockDim>(temp_storage).Reduce(diff_max_sum, cub::Sum());
  if (threadIdx.x == 0) max_data[blockIdx.x] = real_log(diff_max_sum);
  if (threadIdx.x == 0) max_data[blockIdx.x] = log_on_device(diff_max_sum);

  if (!CalculateLogSoftmax) return;
  __syncthreads();
...
...
@@ -188,12 +193,12 @@ static __global__ void RowReductionForSoftmaxAndCrossEntropy(
// log_diff_max_sum shares memory with loss
  auto block_log_diff_max_sum = loss_data[blockIdx.x];
  auto tmp = softmax[beg_idx] - block_log_diff_max_sum;
  softmax[beg_idx] = real_exp(tmp);
  softmax[beg_idx] = exp_on_device(tmp);
  auto loss = -labels_data[beg_idx] * tmp;
  beg_idx += BlockDim;
  while (beg_idx < end_idx) {
    tmp = softmax[beg_idx] - block_log_diff_max_sum;
    softmax[beg_idx] = real_exp(tmp);
    softmax[beg_idx] = exp_on_device(tmp);
    loss -= (labels_data[beg_idx] * tmp);
    beg_idx += BlockDim;
  }
...
...
@@ -218,10 +223,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor {
    auto row_idx = idx / feature_size_;
    auto col_idx = idx % feature_size_;
    if (col_idx != labels_[row_idx]) {
      log_softmax_[idx] = real_exp(log_softmax_[idx]);
      log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
    } else {
      auto softmax = log_softmax_[idx];
      log_softmax_[idx] = real_exp(softmax);
      log_softmax_[idx] = exp_on_device(softmax);
      loss_[row_idx] = -softmax;
    }
  }
...
...
@@ -253,10 +258,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx {
    auto row_idx = idx / feature_size_;
    auto col_idx = idx % feature_size_;
    if (col_idx != labels_[row_idx] || col_idx == ignore_idx_) {
      log_softmax_[idx] = real_exp(log_softmax_[idx]);
      log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
    } else {
      auto softmax = log_softmax_[idx];
      log_softmax_[idx] = real_exp(softmax);
      log_softmax_[idx] = exp_on_device(softmax);
      loss_[row_idx] = -softmax;
    }
  }
...
...
@@ -464,9 +469,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy,
                        ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
                        ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad,
                        ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
                        ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(
    softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
    ops::SoftmaxWithCrossEntropyCUDAKernel<paddle::platform::float16>,
    ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(
    softmax_with_cross_entropy_grad,
    ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
    ops::SoftmaxWithCrossEntropyGradCUDAKernel<paddle::platform::float16>,
    ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
paddle/fluid/operators/sum_op.cc
...
...
@@ -41,7 +41,9 @@ class SumOp : public framework::OperatorWithKernel {
      return;  // skip runtime infershape when is tensor array;
    }

    auto x_var_types = ctx->GetInputsVarType("X");
    auto x_dims = ctx->GetInputsDim("X");
    size_t N = x_dims.size();
    PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
    if (N == 1) {
...
...
@@ -49,7 +51,13 @@ class SumOp : public framework::OperatorWithKernel {
    }

    framework::DDim in_dim({0});
    for (auto& x_dim : x_dims) {
    for (size_t i = 0; i < x_dims.size(); ++i) {
      auto& x_dim = x_dims[i];
      // x_dim.size() == 1 means the real dim of selected rows is [0]
      if (x_var_types[i] == framework::proto::VarType::SELECTED_ROWS &&
          x_dim.size() == 1) {
        continue;
      }
      if (framework::product(x_dim) == 0) {
        continue;
      }
...
...
paddle/fluid/platform/cuda_helper_test.cu
...
...
@@ -15,6 +15,9 @@
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#ifdef _WIN32
#include <numeric>
#endif
#include <random>
#define PADDLE_CUDA_FP16
...
...
paddle/fluid/platform/device_context.cc
...
...
@@ -92,26 +92,24 @@ platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
    const platform::Place& place, const cudaStream_t& stream) {
  PADDLE_ENFORCE(platform::is_gpu_place(place));
  auto place_stream = std::make_pair(place, stream);
  {
    std::unique_lock<std::mutex> lock(mtx_);
    if (!device_allocator_.count(place_stream)) {
      device_allocator_[place_stream].reset(new TemporaryAllocator(place));
      device_allocator_[place_stream]->SetCallback([stream]() {
        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
        PADDLE_ENFORCE(cudaGetLastError());
      });
    }
  std::unique_lock<std::mutex> lock(mtx_);
  auto it = device_allocator_.find(place_stream);
  if (it == device_allocator_.end()) {
    auto tmp_allocator = new TemporaryAllocator(place);
    tmp_allocator->SetCallback([stream]() {
      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
      PADDLE_ENFORCE(cudaGetLastError());
    });
    device_allocator_[place_stream].reset(tmp_allocator);
    return *tmp_allocator;
  } else {
    return *it->second;
  }
  return *device_allocator_.at(place_stream);
}

template <>
platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
    const platform::CUDADeviceContext& dev_ctx) {
  auto place_stream = std::make_pair(dev_ctx.GetPlace(), dev_ctx.stream());
  if (device_allocator_.count(place_stream)) {
    return *device_allocator_.at(place_stream);
  }
  return Get(dev_ctx.GetPlace(), dev_ctx.stream());
}
#endif
...
...
@@ -292,7 +290,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
  if (dynload::HasCUDNN()) {
    auto local_cudnn_version = cudnn_dso_ver / 100;
    auto compile_cudnn_version = CUDNN_VERSION / 100;
    if (local_cuda_version < compile_cuda_version) {
    if (local_cudnn_version < compile_cudnn_version) {
      LOG_FIRST_N(WARNING, 1) << "WARNING: device: " << place_.device
                              << ". The installed Paddle is compiled with CUDNN "
...
...
@@ -325,7 +323,7 @@ Place CUDADeviceContext::GetPlace() const { return place_; }
void CUDADeviceContext::Wait() const {
  auto& allocator =
      DeviceTemporaryAllocator::Instance().Get<CUDADeviceContext>(*this);
  allocator.Release([=]() {
  allocator.Release([this]() {
    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
    PADDLE_ENFORCE(cudaGetLastError());
  });
...
...
paddle/fluid/platform/float16.h
...
...
@@ -59,7 +59,7 @@ limitations under the License. */
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) /*do nothing*/
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif

namespace paddle {
...
...
paddle/fluid/platform/float16_test.cu
...
...
@@ -271,11 +271,13 @@ TEST(float16, isinf) {
  float16 b = float16(INFINITY);
  // underflow to 0
  float16 native_a(5e-40f);
  // overflow to inf
  float16 native_b(5e40f);
  EXPECT_EQ(std::isinf(a), true);
  EXPECT_EQ(std::isinf(b), true);
#ifndef _WIN32
  // overflow to inf
  float16 native_b(5e40f);
  EXPECT_EQ(std::isinf(native_b), true);
#endif
  EXPECT_EQ(native_a, float16(0));
}
...
...
paddle/fluid/platform/mkldnn_reuse.h
...
...
@@ -210,20 +210,25 @@ class MKLDNNHandler {
    dst_memory.reset(
        new mkldnn::memory(*dst_pd, to_void_cast<T>(output_data)));
  }

  static void AppendKey(std::string* key,
                        const mkldnn::memory::dims& input_dims,
                        const mkldnn::memory::dims& weights_dims,
                        const std::vector<int>& strides,
                        const std::vector<int>& paddings,
                        const std::vector<int>& dilations, const int& groups,
                        const mkldnn::memory::data_type& type,
                        const mkldnn::memory::format& format,
                        const std::string& suffix) {
  static void AppendKey(std::string* key,
                        const mkldnn::memory::dims& input_dims,
                        const mkldnn::memory::dims& weights_dims,
                        const std::vector<int>& strides,
                        const std::vector<int>& paddings,
                        const std::vector<int>& dilations, const int& groups,
                        const mkldnn::memory::data_type& srcdt,
                        const mkldnn::memory::format& format, const bool& relu,
                        const bool& residual, const std::string& suffix) {
    AppendKeyDims(key, input_dims);
    AppendKeyDims(key, weights_dims);
    AppendKeyVec(key, strides);
    AppendKeyVec(key, paddings);
    AppendKeyVec(key, dilations);
    AppendKey(key, std::to_string(groups));
    AppendKey(key, std::to_string(type));
    AppendKey(key, std::to_string(srcdt));
    AppendKey(key, std::to_string(format));
    AppendKey(key, std::to_string(relu));
    AppendKey(key, std::to_string(residual));
    AppendKey(key, suffix);
  }
...
...
@@ -662,15 +667,35 @@ static std::shared_ptr<mkldnn::memory> SetDstMemory(
}

template <typename T>
static std::shared_ptr<mkldnn::memory> SetDstMemoryHandler(
static std::shared_ptr<mkldnn::memory> SetDstMemory(
    const framework::ExecutionContext& ctx, framework::Tensor* output,
    const std::shared_ptr<ConvMKLDNNHandler>& handler) {
    const framework::Tensor* residual_param,
    const mkldnn::memory::desc& user_residual_md,
    const std::shared_ptr<ConvMKLDNNHandler>& handler,
    std::vector<mkldnn::primitive>* pipeline) {
  const T* residual_param_data = residual_param->data<T>();
  PADDLE_ENFORCE(residual_param_data != nullptr,
                 "Provide data if you want MKLDNN conv+elementwise_add fusion");
  std::shared_ptr<mkldnn::memory> user_residual_memory_p =
      handler->AcquireResidualDataMemory(user_residual_md,
                                         to_void_cast<T>(residual_param_data));
  T* output_data = output->mutable_data<T>(ctx.GetPlace());
  std::shared_ptr<mkldnn::memory> dst_memory_p =
      handler->AcquireDstMemoryFromResidualDataMemory(
          user_residual_memory_p, to_void_cast<T>(output_data), *pipeline);
  return dst_memory_p;
}

template <typename T>
static void SetDstMemoryHandler(
    const framework::ExecutionContext& ctx, framework::Tensor* output,
    const std::shared_ptr<ConvMKLDNNHandler>& handler,
    std::shared_ptr<mkldnn::memory>* dst_memory_p) {
  T* output_data = output->mutable_data<T>(
      ctx.GetPlace(), ::paddle::memory::Allocator::kDefault,
      handler->GetDstMemorySize());
  std::shared_ptr<mkldnn::memory> dst_memory_p;
  dst_memory_p->set_data_handle(to_void_cast<T>(output_data));
  return dst_memory_p;
  (*dst_memory_p)->set_data_handle(to_void_cast<T>(output_data));
}

}  // namespace platform
}  // namespace paddle
paddle/fluid/pybind/CMakeLists.txt
...
...
@@ -3,7 +3,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune fe
if(WITH_PYTHON)
  list(APPEND PYBIND_DEPS py_func_op)
endif()
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc)
if(WITH_PYTHON)
  if(WITH_AMD_GPU)
...
...
@@ -21,9 +21,8 @@ if(WITH_PYTHON)
  endif(NOT APPLE AND NOT ANDROID AND NOT WIN32)
  endif(WITH_AMD_GPU)
  if(WIN32)
    target_link_libraries(paddle_pybind shlwapi)
  endif(WIN32)
  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
  target_link_libraries(paddle_pybind ${os_dependency_modules})
  cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python)
endif(WITH_PYTHON)
paddle/fluid/pybind/ir.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/pybind/ir.h"
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "pybind11/stl.h"
namespace py = pybind11;
using paddle::framework::ir::Graph;
using paddle::framework::ir::Node;
using paddle::framework::OpDesc;
using paddle::framework::ProgramDesc;
using paddle::framework::VarDesc;
using pybind11::return_value_policy;

namespace paddle {
namespace pybind {
void BindGraph(py::module *m) {
  py::class_<Graph, std::shared_ptr<Graph>>(
      *m, "Graph",
      "The graph is a Directed Acyclic Single Static Assignment Graph, see "
      "`paddle::ir::Graph` for details.")
      .def(py::init<const ProgramDesc &>())
      .def("has", &Graph::Has)
      .def("get_int", &Graph::Get<int>)
      .def("get_float", &Graph::Get<float>)
      .def("get_double", &Graph::Get<double>)
      .def("get_string", &Graph::Get<std::string>)
      .def("set", [](Graph &self, const std::string &attr_name, int attr) {
        return self.Set(attr_name, new int(attr));
      })
      .def("set",
           [](Graph &self, const std::string &attr_name,
              const std::string &attr) {
             return self.Set(attr_name, new std::string(attr));
           })
      .def("set", [](Graph &self, const std::string &attr_name, float attr) {
        return self.Set(attr_name, new float(attr));
      })
      .def("set", [](Graph &self, const std::string &attr_name, double attr) {
        return self.Set(attr_name, new double(attr));
      })
      .def("erase", &Graph::Erase)
      .def("nodes", &Graph::Nodes, return_value_policy::reference)
      .def("create_var_node",
           [](Graph &self, VarDesc &var_desc) {
             return self.CreateVarNode(&var_desc);
           },
           return_value_policy::reference)
      .def("create_op_node",
           [](Graph &self, OpDesc &op_desc) {
             return self.CreateOpNode(&op_desc);
           },
           return_value_policy::reference)
      .def("create_control_dep_var", &Graph::CreateControlDepVar,
           return_value_policy::reference)
      .def("create_empty_node", &Graph::CreateEmptyNode,
           return_value_policy::reference)
      .def("release_nodes", &Graph::ReleaseNodes)
      .def("remove_node",
           [](Graph &self, Node &node) { return self.RemoveNode(&node); })
      .def("retrieve_node", &Graph::RetrieveNode,
           return_value_policy::reference)
      .def("resolve_hazard", &Graph::ResolveHazard);
}

void BindNode(py::module *m) {
  py::class_<Node> node(*m, "Node");
  node.def("name", &Node::Name)
      .def("node_type", &Node::NodeType)
      .def("var", &Node::Var)
      .def("op", &Node::Op)
      .def("id", &Node::id)
      .def("is_op", &Node::IsOp)
      .def("is_var", &Node::IsVar)
      .def("is_ctrl_var", &Node::IsCtrlVar)
      .def_readwrite("inputs", &Node::inputs)
      .def_readwrite("outputs", &Node::outputs);

  py::enum_<Node::Type>(node, "Type")
      .value("Operation", Node::Type::kOperation)
      .value("Variable", Node::Type::kVariable)
      .export_values();
}
}  // namespace pybind
}  // namespace paddle
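For orientation, the following is a minimal sketch of how these new bindings can be driven from Python once BindGraph/BindNode are registered on paddle.fluid.core (see pybind.cc below); the network layers and attribute name are placeholders, not part of this diff:

import paddle.fluid as fluid
from paddle.fluid import core

prog = fluid.Program()
with fluid.program_guard(prog):
    x = fluid.layers.data(name="x", shape=[1], dtype="float32")
    y = fluid.layers.fc(input=x, size=1)

graph = core.Graph(prog.desc)          # py::init<const ProgramDesc &>
graph.set("depth", 3)                  # stored as a new int attribute on the graph
assert graph.has("depth") and graph.get_int("depth") == 3
for node in graph.nodes():             # node set returned by reference
    if node.is_op():
        print(node.name(), node.node_type() == core.Node.Type.Operation)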
paddle/fluid/pybind/ir.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/pybind11.h>
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace pybind {
void BindGraph(pybind11::module *m);
void BindNode(pybind11::module *m);
}  // namespace pybind
}  // namespace paddle
paddle/fluid/pybind/pybind.cc
...
...
@@ -49,6 +49,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h" // NOLINT
#include "paddle/fluid/pybind/recordio.h"
...
...
@@ -775,7 +776,12 @@ All parameter, weight, gradient are variables in Paddle.
           })
      .def("set_int", [](ir::Pass &self, const std::string &name,
                         int val) { self.Set<const int>(name, new int(val)); })
      .def("type", &ir::Pass::Type);
      .def("type", &ir::Pass::Type)
      .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
        std::unique_ptr<ir::Graph> origin_graph(graph.get());
        auto optim_graph = self.Apply(std::move(origin_graph));
        graph.reset(optim_graph.release());
      });

  py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
      m, "PassBuilder");
...
...
@@ -1019,8 +1025,7 @@ All parameter, weight, gradient are variables in Paddle.
  pe.def(py::init<const std::vector<platform::Place> &,
                  const std::unordered_set<std::string> &, const ProgramDesc &,
                  const std::string &, Scope *, std::vector<Scope *> &,
                  const ExecutionStrategy &, const BuildStrategy &, size_t,
                  size_t>())
                  const ExecutionStrategy &, const BuildStrategy &>())
// NOTE: even we return a vec<Scope*>* to Python use reference policy.
// We still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope*
...
...
@@ -1043,6 +1048,9 @@ All parameter, weight, gradient are variables in Paddle.
  BindRecordIOWriter(&m);
  BindAsyncExecutor(&m);
  BindGraph(&m);
  BindNode(&m);
}
}  // namespace pybind
}  // namespace paddle
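A rough sketch of the new Pass.apply binding seen from Python. It assumes the existing PassBuilder plumbing on BuildStrategy (a private API used by fluid's pass-builder tests); the pass name and attribute are illustrative only and are not added by this diff:

import paddle.fluid as fluid
from paddle.fluid import core

graph = core.Graph(fluid.default_main_program().desc)  # ir::Graph from the ProgramDesc

build_strategy = fluid.BuildStrategy()
pass_builder = build_strategy._finalize_strategy_and_create_passes()
viz_pass = pass_builder.append_pass("graph_viz_pass")   # illustrative pass name
viz_pass.set("graph_viz_path", "/tmp/train.dot")
viz_pass.apply(graph)   # the new binding rewrites `graph` in place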
python/paddle/dataset/mnist.py
...
...
@@ -21,10 +21,9 @@ parse training set and test set into paddle reader creators.
from __future__ import print_function

import paddle.dataset.common
import subprocess
import gzip
import numpy
import platform
import tempfile
import struct
from six.moves import range

__all__ = ['train', 'test', 'convert']
...
...
@@ -41,51 +40,47 @@ TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
def reader_creator(image_filename, label_filename, buffer_size):
    def reader():
        if platform.system() == 'Darwin':
            zcat_cmd = 'gzcat'
        elif platform.system() == 'Linux':
            zcat_cmd = 'zcat'
        else:
            raise NotImplementedError()

        # According to http://stackoverflow.com/a/38061619/724872, we
        # cannot use standard package gzip here.
        tmp_image_file = tempfile.TemporaryFile(prefix='paddle_dataset')
        m = subprocess.Popen(
            [zcat_cmd, image_filename], stdout=tmp_image_file).communicate()
        tmp_image_file.seek(16)  # skip some magic bytes

        # Python3 will not take stdout as file
        tmp_label_file = tempfile.TemporaryFile(prefix='paddle_dataset')
        l = subprocess.Popen(
            [zcat_cmd, label_filename], stdout=tmp_label_file).communicate()
        tmp_label_file.seek(8)  # skip some magic bytes

        try:  # reader could be break.
            while True:
                labels = numpy.fromfile(
                    tmp_label_file, 'ubyte', count=buffer_size).astype("int")

                if labels.size != buffer_size:
                    break  # numpy.fromfile returns empty slice after EOF.

                images = numpy.fromfile(
                    tmp_image_file, 'ubyte', count=buffer_size * 28 *
                    28).reshape((buffer_size, 28 * 28)).astype('float32')

                images = images / 255.0 * 2.0 - 1.0

                for i in range(buffer_size):
                    yield images[i, :], int(labels[i])
        finally:
            try:
                m.terminate()
            except:
                pass
            try:
                l.terminate()
            except:
                pass
        with gzip.GzipFile(image_filename, 'rb') as image_file:
            img_buf = image_file.read()
            with gzip.GzipFile(label_filename, 'rb') as label_file:
                lab_buf = label_file.read()

                step_label = 0

                offset_img = 0
                # read from Big-endian
                # get file info from magic byte
                # image file : 16B
                magic_byte_img = '>IIII'
                magic_img, image_num, rows, cols = struct.unpack_from(
                    magic_byte_img, img_buf, offset_img)
                offset_img += struct.calcsize(magic_byte_img)

                offset_lab = 0
                # label file : 8B
                magic_byte_lab = '>II'
                magic_lab, label_num = struct.unpack_from(magic_byte_lab,
                                                          lab_buf, offset_lab)
                offset_lab += struct.calcsize(magic_byte_lab)

                while True:
                    if step_label >= label_num:
                        break
                    fmt_label = '>' + str(buffer_size) + 'B'
                    labels = struct.unpack_from(fmt_label, lab_buf, offset_lab)
                    offset_lab += struct.calcsize(fmt_label)
                    step_label += buffer_size

                    fmt_images = '>' + str(buffer_size * rows * cols) + 'B'
                    images_temp = struct.unpack_from(fmt_images, img_buf,
                                                     offset_img)
                    images = numpy.reshape(images_temp, (
                        buffer_size, rows * cols)).astype('float32')
                    offset_img += struct.calcsize(fmt_images)

                    images = images / 255.0 * 2.0 - 1.0
                    for i in range(buffer_size):
                        yield images[i, :], int(labels[i])

    return reader
...
...
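The rewritten reader above parses the IDX headers with struct instead of shelling out to zcat. A minimal, self-contained sketch of that header parsing (the file name is a placeholder for an already downloaded MNIST archive):

import gzip
import struct

# The gzipped IDX image file begins with a 16-byte big-endian header:
# magic number, image count, rows, cols (four unsigned 32-bit integers).
with gzip.GzipFile('train-images-idx3-ubyte.gz', 'rb') as f:  # placeholder path
    header = f.read(struct.calcsize('>IIII'))
magic, num, rows, cols = struct.unpack_from('>IIII', header, 0)
print(magic, num, rows, cols)   # for MNIST: 2051, 60000, 28, 28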
python/paddle/fluid/__init__.py
...
...
@@ -156,7 +156,7 @@ def __bootstrap__():
            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
            'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
            'cudnn_exhaustive_search_times', 'sync_nccl_allreduce'
            'sync_nccl_allreduce'
        ]
    core.init_gflags([sys.argv[0]] +
...
...
python/paddle/fluid/compiler.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import multiprocessing
import os
import six
import sys
from .. import compat as cpt

from . import core

ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
BuildStrategy = core.ParallelExecutor.BuildStrategy


def _place_obj(place):
    p = core.Place()
    p.set_place(place)
    return p


class CompiledProgram(object):
    """
    Compiles a Program for execution.

    1. Users first create the program with layers.
    2. Optionally, users use CompiledProgram to optimize the program before run.
    3. The original program or CompiledProgram is run by executor.

    The CompiledProgram is used to transform a program for various
    optimizations, for example.
      * Pre-compute some logic once so that each run is faster.
      * Transform the program so that it can run in multiple devices.
      * TODO: transform the program for optimized inference or distributed
              training.

    Example:
        .. code-block:: python
          place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
          exe = fluid.Executor(place)
          exe.run(startup)
          compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
              loss_name=loss.name)
          for i in range(5):
              test_loss, = exe.run(compiled_prog,
                                   feed=feed_dict,
                                   fetch_list=[loss.name])

    Args:
        program: Program instance that contains the model logic.
    """

    def __init__(self, program):
        self._program = program
        self._scope = None
        self._place = None
        self._executor = None
        self._compiled = False
        self._is_data_parallel = False

    def with_data_parallel(self,
                           loss_name=None,
                           build_strategy=None,
                           exec_strategy=None,
                           share_vars_from=None):
        """Configs the program to run in data parallel way.

        Args:
            loss_name (str): The loss name must set in training. Default None.
            build_strategy(BuildStrategy): build_strategy is used to
                build the graph so it can run on multiple devices/cores with
                optimized topology.
                For more information, please refer to fluid.BuildStrategy.
                Default None.
            exec_strategy(ExecutionStrategy): exec_strategy is used to
                to select the a way to execute the graph, for example how many
                threads are used, how many iterations to clean up the temp
                variables. For more information, please refer
                to fluid.ExecutionStrategy. Default None.
            share_vars_from(CompiledProgram): If provide, this CompiledProgram
                will share variables from `share_vars_from`. `share_vars_from`
                must be run by the executor before this CompiledProgram so that
                vars are ready.
        Returns:
            self
        """
        assert not self._is_data_parallel, "Already compiled with parallel."
        self._is_data_parallel = True
        self._build_strategy = build_strategy
        self._exec_strategy = exec_strategy
        self._loss_name = loss_name
        self._share_vars_from = share_vars_from
        if self._exec_strategy is None:
            self._exec_strategy = ExecutionStrategy()
        if self._build_strategy is None:
            self._build_strategy = BuildStrategy()
        return self

    def _with_distributed(self):
        raise NotImplementedError()

    def _with_inference_optimize(self):
        raise NotImplementedError()

    def _compile_data_parallel(self):
        if self._share_vars_from:
            if self._scope:
                sys.stderr.write("share_vars_from is set, scope is ignored.\n")
            if not self._share_vars_from._is_data_parallel:
                raise ValueError("share_vars_from is not data parallel. Cannot "
                                 "share vars from it.")
            if self._share_vars_from._executor is None:
                raise ValueError(
                    "share_vars_from is not compiled and run, so there is no "
                    "var to share.")
            self._local_scopes = self._share_vars_from._executor.local_scopes()
        else:
            self._local_scopes = []

        self._exec_strategy.use_cuda = isinstance(self._place, core.CUDAPlace)
        if self._exec_strategy.use_cuda:
            gpus_env = os.getenv("FLAGS_selected_gpus")
            if gpus_env:
                gpus = [int(s) for s in gpus_env.split(",")]
            else:
                gpus = [
                    i for i in six.moves.range(core.get_cuda_device_count())
                ]
            self._places = [core.CUDAPlace(i) for i in gpus]
        else:
            cpu_num = int(
                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
            self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
        assert self._places, "no place for execution"

        if self._exec_strategy.num_threads == 0:
            if self._exec_strategy.use_cuda:
                # Experiments on se-resnext shows that too many threads hurt
                # performance. Worth tunning for other models in the future.
                self._exec_strategy.num_threads = len(self._places) * 4
            else:
                cpu_num = int(
                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
                self._exec_strategy.num_threads = cpu_num * 2

        trainers_endpoints = self._program._trainers_endpoints
        if self._build_strategy.num_trainers > 1 and trainers_endpoints:
            assert self._build_strategy.num_trainers == len(
                trainers_endpoints), "num_trainers == len(end_points)"
            self._build_strategy.trainers_endpoints = trainers_endpoints

        self._persistable_vars = set([
            cpt.to_text(v.name)
            for v in [
                var for var in self._program.list_vars()
                if var.persistable and var.type != core.VarDesc.VarType.RAW
            ]
        ])

        places = list(map(_place_obj, self._places))
        return core.ParallelExecutor(
            places, self._persistable_vars, self._program.desc,
            cpt.to_text(self._loss_name)
            if self._loss_name else six.u(''), self._scope, self._local_scopes,
            self._exec_strategy, self._build_strategy)

    def _compile(self, scope, place):
        """Compile the program based on the configs.

        Args:
            scope: The variables (resources) that are associated with
               this compiled program.
            place: The location that the compiled program will be run on.

        Returns:
            self
        """
        if self._compiled:
            if scope and self._scope != scope:
                raise ValueError("Cannot compile with different scope")
            if place and self._place != place:
                raise ValueError("Cannot compile with different place")
            return self
        self._compiled = True

        self._scope = scope
        self._place = place
        if self._is_data_parallel:
            self._executor = self._compile_data_parallel()
        else:
            p = _place_obj(self._place)
            self._executor = core.Executor(p)
        return self
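A rough, self-contained sketch of the share_vars_from flow described in the docstring above; the layer sizes and random data are arbitrary, and the training CompiledProgram is run once first so its local scopes exist:

import numpy
import paddle.fluid as fluid
from paddle.fluid import compiler

train_prog, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(train_prog, startup):
    x = fluid.layers.data(name='x', shape=[4], dtype='float32')
    loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))
    test_prog = train_prog.clone(for_test=True)
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup)

train_exe = compiler.CompiledProgram(train_prog).with_data_parallel(
    loss_name=loss.name)
feed = {'x': numpy.random.random((8, 4)).astype('float32')}
exe.run(train_exe, feed=feed, fetch_list=[loss.name])

# The test-side CompiledProgram reuses the variables already created by
# `train_exe` instead of allocating its own copies.
test_exe = compiler.CompiledProgram(test_prog).with_data_parallel(
    share_vars_from=train_exe)
exe.run(test_exe, feed=feed, fetch_list=[loss.name])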
python/paddle/fluid/data_feeder.py
...
...
@@ -71,10 +71,25 @@ class DataToLoDTensorConverter(object):
            for each_data in data:
                self._feed_impl_(each_data, lod[1:], lod_level - 1)

    def _check_shape(self, shape):
        for s1, s2 in zip(self.shape, shape):
            if s1 != s2 and s1 >= 0 and s2 >= 0:
                raise ValueError(
                    "Shape not match. What is defined in data layer is {}, but receive {}".
                    format(self.shape, shape))

    def done(self):
        arr = numpy.array(self.data, dtype=self.dtype)
        if self.shape and len(arr.shape) != len(self.shape):
            arr = arr.reshape(self.shape)
        if self.shape:
            if len(arr.shape) != len(self.shape):
                try:
                    arr = arr.reshape(self.shape)
                except ValueError:
                    raise ValueError(
                        "Reshape error. What is defined in data layer is {}, but receive {}"
                        .format(self.shape, arr.shape))
            else:
                self._check_shape(arr.shape)
        t = core.LoDTensor()
        t.set(arr, self.place)
        if self.lod_level > 0:
...
...
@@ -152,17 +167,8 @@ class DataFeeder(object):
                raise TypeError("Feed list should contain a list of variable")
            self.feed_dtypes.append(each_var.dtype)
            self.feed_names.append(each_var.name)
            shape = each_var.shape
            batch_size_dim = -1
            for i, s in enumerate(shape):
                if s < 0:
                    batch_size_dim = i
                    break
            if batch_size_dim == -1:
                raise ValueError("Variable {0} must has a batch size dimension",
                                 each_var.name)
            self.feed_lod_level.append(each_var.lod_level)
            self.feed_shapes.append(shape)
            self.feed_shapes.append(each_var.shape)

        self.place = place
...
...
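A short sketch of the behaviour the new _check_shape path above enables (mirroring the updated test_data_feeder case further down): a sample whose width disagrees with the declared data layer now raises ValueError instead of being silently reshaped.

import paddle.fluid as fluid

img = fluid.layers.data(name='image', shape=[784])
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())

feeder.feed([([0] * 784, [9])])      # matches the declared width, accepted
try:
    feeder.feed([([0] * 783, [9])])  # 783 != 784 triggers _check_shape
except ValueError as e:
    print(e)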
python/paddle/fluid/executor.py
...
...
@@ -14,11 +14,15 @@
from __future__ import print_function

import os
import multiprocessing
import numpy as np
import contextlib
import six
from .framework import Program, default_main_program, Variable
from . import core
from . import compiler
from .. import compat as cpt

__all__ = ['Executor', 'global_scope', 'scope_guard']
...
...
@@ -204,20 +208,20 @@ def _fetch_var(name, scope=None, return_numpy=True):
    return tensor


def _get_program_cache_key(feed, fetch_list):
    feed_var_names = list(feed.keys())

def _to_name_str(var):
    if isinstance(var, Variable):
        return var.desc.name()
    elif isinstance(var, str):
        return var
    elif isinstance(var, six.string_types):
        return str(var)
    else:
        raise TypeError(str(var) + " should be Variable or str")

    def to_name_str(var):
        if isinstance(var, Variable):
            return var.desc.name()
        elif isinstance(var, str):
            return var
        elif isinstance(var, six.string_types):
            return str(var)
        else:
            raise TypeError(str(var) + " should be Variable or str")

    fetch_var_names = list(map(to_name_str, fetch_list))


def _get_program_cache_key(feed, fetch_list):
    feed_var_names = list(feed.keys())
    fetch_var_names = list(map(_to_name_str, fetch_list))

    return str(feed_var_names + fetch_var_names)
...
...
@@ -266,6 +270,29 @@ class Executor(object):
But the global scope variables will be persistent through different runs.
All of ops in program will be running in sequence.
Example:
.. code-block:: python
# First create the Executor.
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# Run the startup program once and only once.
# Not need to optimize/compile the startup program.
exe.run(fluid.default_startup_program())
# Run the main program directly without compile.
loss, = exe.run(fluid.default_main_program(),
feed=feed_dict,
fetch_list=[loss.name])
# Or, compiled the program and run. See `CompiledProgram` for more detail.
compiled_prog = compiler.CompiledProgram(
fluid.default_main_program()).with_data_parallel(
loss_name=loss.name)
loss, = exe.run(compiled_prog,
feed=feed_dict,
fetch_list=[loss.name])
Args:
place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
...
...
@@ -275,11 +302,8 @@ class Executor(object):
    def __init__(self, place):
        self.place = place
        p = core.Place()
        p.set_place(place)
        self.executor = core.Executor(p)
        self.program_caches = dict()
        self.executor = None
        self._closed = False

    def _get_program_cache(self, program_cache_key):
...
...
@@ -361,6 +385,7 @@ class Executor(object):
You can no long use this executor after calling this method.
For the distributed training, this method would free the resource on PServers related to
the current Trainer.
TODO(panyx0718): Why ParallelExecutor doesn't have close?
Example:
>>> cpu = core.CPUPlace()
...
...
@@ -368,10 +393,55 @@ class Executor(object):
>>> ...
>>> exe.close()
"""
        if not self._closed:
        if not self._closed and self.executor:
            self.executor.close()
            self._closed = True

    def _run_parallel(self, scope, feed, fetch_list, fetch_var_name,
                      return_numpy):
        if isinstance(feed, dict):
            feed_tensor_dict = dict()
            for feed_name in feed:
                feed_tensor = feed[feed_name]
                if not isinstance(feed_tensor, core.LoDTensor):
                    feed_tensor = core.LoDTensor()
                    # always set to CPU place, since the tensor need to be splitted
                    # it is fast in CPU
                    feed_tensor.set(feed[feed_name], core.CPUPlace())
                feed_tensor_dict[feed_name] = feed_tensor

            self.executor.feed_and_split_tensor_into_local_scopes(
                feed_tensor_dict)
        elif isinstance(feed, list) or isinstance(feed, tuple):
            if len(feed) != len(self._places):
                raise ValueError(
                    "Feed a list of tensor, the list should be the same size as places"
                )

            res = list()
            for i, each in enumerate(feed):
                if not isinstance(each, dict):
                    raise TypeError(
                        "Each element of feed list should be a dict")
                res_dict = dict()
                for feed_name in each:
                    tensor = each[feed_name]
                    if not isinstance(tensor, core.LoDTensor):
                        tmp = core.LoDTensor()
                        tmp.set(tensor, self._places[i])
                        tensor = tmp
                    res_dict[feed_name] = tensor
                res.append(res_dict)
            self.executor.feed_tensors_into_local_scopes(res)

        fetch_var_names = list(map(_to_name_str, fetch_list))
        self.executor.run(fetch_var_names, fetch_var_name)
        arr = scope.find_var(fetch_var_name).get_lod_tensor_array()

        if return_numpy:
            return as_numpy(arr)

        return [arr[i] for i in range(len(arr))]

    def run(self,
            program=None,
            feed=None,
...
...
@@ -391,8 +461,9 @@ class Executor(object):
operators in the program but not only the operators dependent by the fetch_list
Args:
program(Program): the program that need to run, if not provied, then default_main_program will be used.
feed(dict): feed variable map, e.g. {"image": ImageData, "label": LableData}
program(Program|CompiledProgram): the program that need to run,
if not provided, then default_main_program will be used.
feed(dict): feed variable map, e.g. {"image": ImageData, "label": LabelData}
fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list.
feed_var_name(str): the name for the input variable of feed Operator.
fetch_var_name(str): the name for the output variable of fetch Operator.
...
...
@@ -428,14 +499,59 @@ class Executor(object):
        if self._closed:
            raise RuntimeError("Attempted to use a closed Executor")

        if scope is None:
            scope = global_scope()
        if fetch_list is None:
            fetch_list = []

        compiled = isinstance(program, compiler.CompiledProgram)
        # For backward compatibility, run directly.
        if not compiled:
            if not self.executor:
                p = core.Place()
                p.set_place(self.place)
                self.executor = core.Executor(p)
            return self._run(
                program,
                feed=feed,
                fetch_list=fetch_list,
                feed_var_name=feed_var_name,
                fetch_var_name=fetch_var_name,
                scope=scope,
                return_numpy=return_numpy,
                use_program_cache=use_program_cache)

        program._compile(scope, self.place)
        self.executor = program._executor
        if program._is_data_parallel:
            return self._run_parallel(
                scope=scope,
                feed=feed,
                fetch_list=fetch_list,
                fetch_var_name=fetch_var_name,
                return_numpy=return_numpy)
        else:
            # TODO(panyx0718): Can compile program to optimize executor
            # performance.
            return self._run(
                program._program,
                feed=feed,
                fetch_list=fetch_list,
                feed_var_name=feed_var_name,
                fetch_var_name=fetch_var_name,
                scope=scope,
                return_numpy=return_numpy,
                use_program_cache=use_program_cache)

    def _run(self, program, feed, fetch_list, feed_var_name, fetch_var_name,
             scope, return_numpy, use_program_cache):

        if feed is None:
            feed = {}
        if not isinstance(feed, dict):
            raise TypeError(
                "feed requires dict as its Parameter. But you passed in %s" %
                (type(feed)))
        if fetch_list is None:
            fetch_list = []
        if program is None:
            program = default_main_program()
...
...
@@ -444,9 +560,6 @@ class Executor(object):
"Executor requires Program as its Parameter. But you passed in %s"
%
(
type
(
program
)))
if
scope
is
None
:
scope
=
global_scope
()
cache_key
=
_get_program_cache_key
(
feed
,
fetch_list
)
if
use_program_cache
:
cached_program
=
self
.
_get_program_cache
(
cache_key
)
...
...
python/paddle/fluid/optimizer.py
#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
...
@@ -195,22 +195,18 @@ class Optimizer(object):
                format(name, param.name))
        return self._accumulators[name][param.name]

    def _create_optimization_pass(self,
                                  parameters_and_grads,
                                  loss,
                                  startup_program=None):
    def _create_optimization_pass(self, parameters_and_grads):
        """Add optimization operators to update gradients to variables.

        Args:
          loss(Variable): the target that this optimization is for.
          parameters_and_grads(list(tuple(Variable, Variable))):
            a list of (variable, gradient) pair to update.
          a list of (variable, gradient) pair to update.

        Returns:
          return_op_list: a list of operators that will complete one step of
            optimization. This will include parameter update ops, global step
            update ops and any other custom ops required by subclasses to manage
            their internal state.
          optimization. This will include parameter update ops, global step
          update ops and any other custom ops required by subclasses to manage
          their internal state.
        """
        # This is a default implementation of create_optimization_pass that
        # can be shared by most optimizers. This implementation assumes that
...
...
@@ -219,37 +215,33 @@ class Optimizer(object):
# _create_accumulators method if it needs to create accumulators
# for parameters and extend _finish_update method to add custom ops.
# Create any accumulators
        program = loss.block.program
        self._dtype = loss.dtype
        with program_guard(program, startup_program):
            global_block = framework.default_main_program().global_block()
            start = len(global_block.ops)
            self.helper = LayerHelper(self.__class__.__name__)
            self._create_accumulators(loss.block,
                                      [p[0] for p in parameters_and_grads])
            self._create_global_learning_rate()

            optimize_ops = []
            for param_and_grad in parameters_and_grads:
                if param_and_grad[1] is None:
                    continue
                with param_and_grad[0].block.program._optimized_guard(
                        param_and_grad), name_scope("optimizer"):
                    if param_and_grad[0].trainable is True:
                        optimize_op = self._append_optimize_op(loss.block,
                                                               param_and_grad)
                        optimize_ops.append(optimize_op)

            # Get custom finish ops for subclasses
            # FIXME: Need to fix this once we figure out how to handle dependencies
            self._finish_update(loss.block, parameters_and_grads)

            end = len(global_block.ops)
            return global_block._slice_ops(start, end)

    def _process_distribute_lookuptable(self, param_grads, loss,
                                        startup_program):
        # Allways called under program_guard use global block as loss block
        global_block = framework.default_main_program().global_block()
        start = len(global_block.ops)
        self.helper = LayerHelper(self.__class__.__name__)
        self._create_accumulators(global_block,
                                  [p[0] for p in parameters_and_grads])
        self._create_global_learning_rate()

        optimize_ops = []
        for param_and_grad in parameters_and_grads:
            if param_and_grad[1] is None:
                continue
            with param_and_grad[0].block.program._optimized_guard(
                    param_and_grad), name_scope("optimizer"):
                if param_and_grad[0].trainable is True:
                    optimize_op = self._append_optimize_op(global_block,
                                                           param_and_grad)
                    optimize_ops.append(optimize_op)

        # Get custom finish ops for subclasses
        # FIXME: Need to fix this once we figure out how to handle dependencies
        self._finish_update(global_block, parameters_and_grads)

        end = len(global_block.ops)
        return global_block._slice_ops(start, end)

    def _process_distribute_lookuptable(self, param_grads):
"""
Because distribute lookup table only support SGD optimizer for now, not support
other optimizer and regularization, so we should find the table parameter out,
...
...
@@ -259,7 +251,8 @@ class Optimizer(object):
:param loss: the loss variable.
:param startup_program: the startup program
"""
        program = loss.block.program
        program = framework.default_main_program()
        global_block = framework.default_main_program().global_block()
        table_name = find_distributed_lookup_table(program)
        table_param = None
        table_grad = None
...
...
@@ -275,38 +268,121 @@ class Optimizer(object):
                new_param_grads.append((p, g))
        sgd_op = None
        if table_param is not None:
            with program_guard(program, startup_program):
                param_and_grad = [table_param, table_grad]
                with table_param.block.program._optimized_guard(param_and_grad), \
                        framework.name_scope("optimizer"):
                    self._create_global_learning_rate()
                    # create the optimize op
                    sgd_op = loss.block.append_op(
                        type='sgd',
                        inputs={
                            "Param": table_param,
                            "Grad": table_grad,
                            "LearningRate": self._create_param_lr(param_and_grad)
                        },
                        outputs={"ParamOut": param_and_grad[0]})
            param_and_grad = [table_param, table_grad]
            with table_param.block.program._optimized_guard(param_and_grad), \
                    framework.name_scope("optimizer"):
                self._create_global_learning_rate()
                # create the optimize op
                sgd_op = global_block.append_op(
                    type='sgd',
                    inputs={
                        "Param": table_param,
                        "Grad": table_grad,
                        "LearningRate": self._create_param_lr(param_and_grad)
                    },
                    outputs={"ParamOut": param_and_grad[0]})
        return new_param_grads, (table_param, table_grad), sgd_op

    def backward(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None,
                 callbacks=None):
        """
        First part of `minimize`, do auto-diff to append backward ops for
        the current program.

        Args:
            loss (Variable): loss variable to run optimizations.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            parameter_list (list): list of Variables to update.
            no_grad_set (set|None): set of Variables should be ignored.
            callbacks (list|None): list of callables to run when appending backward
                operator for one parameter.

        Return:
            list: list of (param, grad) pair, grad is the output of backward.

        Examples:
            See examples in `apply_gradients`.
        """
        if callbacks is None:
            callbacks = [error_clip_callback]
        else:
            assert (isinstance(callbacks, list))
            callbacks.append(error_clip_callback)
        return append_backward(loss, parameter_list, no_grad_set, callbacks)

    def apply_gradients(self, params_grads):
        """
        Second part of `minimize`, appending optimization operators for
        given `params_grads` pairs.

        Args:
            params_grads (list): list of (param, grad) pair to do optimization.

        Returns:
            list: A list of operators appended to the current program.

        Examples:
            .. code-block:: python

                loss = network()
                optimizer = fluid.optimizer.SGD(learning_rate=0.1)
                params_grads = optimizer.backward(loss)
                # you may append operations for params_grads here
                # ...
                optimizer.apply_gradients(params_grads)
        """
        params_grads = sorted(params_grads, key=lambda x: x[0].name)

        params_grads, table_param_and_grad, table_optimize_op = \
            self._process_distribute_lookuptable(params_grads)

        params_grads = append_gradient_clip_ops(params_grads)

        # Add regularization if any
        params_grads = append_regularization_ops(params_grads,
                                                 self.regularization)

        optimize_ops = self._create_optimization_pass(params_grads)
        if table_optimize_op is not None:
            optimize_ops.append(table_optimize_op)
            params_grads.append(table_param_and_grad)

        return optimize_ops

    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        """Add operations to minimize `loss` by updating `parameter_list`.
        """
        Add operations to minimize `loss` by updating `parameter_list`.

        This method combines interface `append_backward()` and
        `create_optimization_pass()` into one.
        This method combines interface `backward()` and
        `apply_gradients()` into one.

        Args:
            loss (Variable): loss variable to run optimizations.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            parameter_list (list): list of Variables to update.
            no_grad_set (set|None): set of Variables should be ignored.

        Returns:
            tuple: (optimize_ops, params_grads) which are, list of operators appended;
            and list of (param, grad) Variables pair for optimization.
        """
        self._dtype = loss.dtype
        program = loss.block.program
        optimize_ops = []
        if imperative_base.enabled():
            if parameter_list is not None:
                params_grads = parameter_list
            else:
                program = loss.block.program
                parameters = program.global_block().all_parameters()
                params_grads = []
                for param in parameters:
...
...
@@ -317,29 +393,13 @@ class Optimizer(object):
                        stop_gradient=True)
                    grad_var._value = param._ivar.grad_value
                    params_grads.append((param, grad_var))
            optimize_ops = self._create_optimization_pass(params_grads, loss,
                                                          startup_program)
            with program_guard(program, startup_program):
                optimize_ops = self._create_optimization_pass(params_grads)
        else:
            params_grads = append_backward(loss, parameter_list, no_grad_set,
                                           [error_clip_callback])

            params_grads = sorted(params_grads, key=lambda x: x[0].name)

            params_grads, table_param_and_grad, table_optimize_op = \
                self._process_distribute_lookuptable(params_grads, loss,
                                                     startup_program)

            params_grads = append_gradient_clip_ops(params_grads)

            # Add regularization if any
            params_grads = append_regularization_ops(params_grads,
                                                     self.regularization)

            optimize_ops = self._create_optimization_pass(params_grads, loss,
                                                          startup_program)
            if table_optimize_op is not None:
                optimize_ops.append(table_optimize_op)
                params_grads.append(table_param_and_grad)
            with program_guard(program, startup_program):
                params_grads = self.backward(loss, startup_program,
                                             parameter_list, no_grad_set)
                optimize_ops = self.apply_gradients(params_grads)

        return optimize_ops, params_grads
...
...
python/paddle/fluid/parallel_executor.py
...
...
@@ -181,9 +181,8 @@ class ParallelExecutor(object):
        # step7: init ParallelExecutor
        self.executor = core.ParallelExecutor(
            places, persistable_vars, main.desc,
            cpt.to_text(loss_name)
            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
            build_strategy, num_trainers, trainer_id)
            cpt.to_text(loss_name)
            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
            build_strategy)

        self.scope = scope
...
...
@@ -294,7 +293,7 @@ class ParallelExecutor(object):
                res.append(res_dict)
            self.executor.feed_tensors_into_local_scopes(res)

        fetch_var_name = '@FETCHED_VAR_NAME@'
        fetch_var_name = 'fetch'
        self.executor.run(fetch_list, fetch_var_name)
        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
...
...
python/paddle/fluid/tests/test_data_feeder.py
...
...
@@ -30,6 +30,12 @@ class TestDataFeeder(unittest.TestCase):
        self.assertEqual(result['image'].recursive_sequence_lengths(), [])
        self.assertEqual(result['label'].recursive_sequence_lengths(), [])

        try:
            result = feeder.feed([([0] * 783, [9]), ([1] * 783, [1])])
            self.assertTrue(False)
        except ValueError:
            self.assertTrue(True)

    def test_lod_level_1_converter(self):
        # lod_level = 1
        # each sentence has a different number of words
...
...
python/paddle/fluid/tests/unittests/dist_ctr.py
...
...
@@ -31,6 +31,7 @@ fluid.default_main_program().random_seed = 1
class TestDistCTR2x2(TestDistRunnerBase):
    def get_model(self, batch_size=2):
        dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta()
        """ network definition """
        dnn_data = fluid.layers.data(
...
...
@@ -97,7 +98,14 @@ class TestDistCTR2x2(TestDistRunnerBase):
        inference_program = paddle.fluid.default_main_program().clone()

        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
        regularization = None
        use_l2_decay = bool(os.getenv('USE_L2_DECAY', 0))
        if use_l2_decay:
            regularization = fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=1e-1)
        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001,
                                            regularization=regularization)
        sgd_optimizer.minimize(avg_cost)

        dataset = dist_ctr_reader.Dataset()
...
...
python/paddle/fluid/tests/unittests/dist_se_resnext.py
...
...
@@ -235,7 +235,6 @@ class DistSeResneXt2x2(TestDistRunnerBase):
        bd = [step * e for e in epochs]
        base_lr = 0.1
        lr = []
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]

        optimizer = fluid.optimizer.Momentum(
...
...
python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp, TestFP16MeanOp


class TestNGRAPHMeanOp(TestMeanOp):
    def setUp(self):
        super(TestNGRAPHMeanOp, self).setUp()


class TestNGRAPHFP16MeanOp(TestFP16MeanOp):
    def setUp(self):
        super(TestNGRAPHFP16MeanOp, self).setUp()


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows, TestScaleFp16Op, TestScaleFp16OpSelectedRows


class TestNGRAPHScaleOp(TestScaleOp):
    def init_dtype_type(self):
        pass


class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows):
    def init_dtype_type(self):
        pass


class TestNGRAPHScaleFp16Op(TestScaleFp16Op):
    def init_dtype_type(self):
        pass


class TestNGRAPHScaleFp16OpSelectedRows(TestScaleFp16OpSelectedRows):
    def init_dtype_type(self):
        pass


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
...
...
@@ -19,6 +19,7 @@ import os
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import compiler
import time
import numpy as np
import math
...
...
@@ -44,15 +45,8 @@ class TestParallelExecutorBase(unittest.TestCase):
                     optimizer=fluid.optimizer.Adam,
                     use_fast_executor=False,
                     enable_sequential_execution=False):
        def run_executor(exe, feed, fetch_list, program=None):
            if isinstance(exe, fluid.ParallelExecutor):
                res = exe.run(fetch_list=fetch_list, feed=feed)
            elif isinstance(exe, fluid.Executor):
                if program is None:
                    program = fluid.default_main_program()
                res = exe.run(program=program, feed=feed, fetch_list=fetch_list)
            else:
                raise ValueError('Unkown type exe')

        def run_executor(exe, binary, feed, fetch_list):
            res = exe.run(binary, feed=feed, fetch_list=fetch_list)
            return res

        main = fluid.Program()
...
...
@@ -72,8 +66,8 @@ class TestParallelExecutorBase(unittest.TestCase):
            fluid.memory_optimize(main)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        startup_exe = fluid.Executor(place)
        startup_exe.run(startup)
        exe = fluid.Executor(place)
        exe.run(startup)
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.allow_op_delay = allow_op_delay
        if use_fast_executor:
...
...
@@ -86,15 +80,13 @@ class TestParallelExecutorBase(unittest.TestCase):
            build_strategy.enable_sequential_execution = enable_sequential_execution
        if use_cuda and core.is_compiled_with_cuda():
            build_strategy.remove_unnecessary_lock = True

        if use_parallel_executor:
            exe = fluid.ParallelExecutor(
                use_cuda,
            binary = compiler.CompiledProgram(main).with_data_parallel(
                loss_name=loss.name,
                exec_strategy=exec_strategy,
                build_strategy=build_strategy)
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)
        else:
            exe = fluid.Executor(place=place)
            binary = compiler.CompiledProgram(main)

        if batch_size is not None:
            batch_size *= fluid.core.get_cuda_device_count(
...
...
@@ -102,13 +94,14 @@ class TestParallelExecutorBase(unittest.TestCase):
                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
        begin = time.time()
        first_loss, = run_executor(
            exe=exe, feed=feed_dict, fetch_list=[loss.name])
            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])

        for i in range(iter):
            run_executor(exe=exe, feed=feed_dict, fetch_list=[])
            run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[])

        last_loss, = run_executor(
            exe=exe, feed=feed_dict, fetch_list=[loss.name])
            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
        end = time.time()

        if batch_size is not None:
...
...
python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py
...
...
@@ -25,6 +25,15 @@ from test_conv2d_op import conv2d_forward_naive, TestConv2dOp
def conv2d_forward_refer(input, filter, group, conv_param):
    out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group,
                                                          conv_param)
    size = [in_n, out_c, out_h, out_w]
    return format_reorder(out, size)


def format_reorder(out, size):
    in_n = size[0]
    out_h = size[2]
    out_w = size[3]
    out_c = size[1]
    out_tmp = np.zeros((in_n, out_h, out_w, out_c))
    for n in range(in_n):
        for i in range(out_h):
...
...
@@ -47,7 +56,9 @@ class TestConv2dInt8Op(TestConv2dOp):
        self.init_group()
        self.init_dilation()
        self.init_test_case()
        self.init_dtype()
        self.init_fuse_relu()
        self.init_fuse_residual()
        self.init_data_type()

        conv2d_param = {'stride': self.stride,
...
...
@@ -78,7 +89,24 @@ class TestConv2dInt8Op(TestConv2dOp):
                np.round((input_shift) * self.scale_in).astype(np.int32),
                filter_int, self.groups,
                conv2d_param).astype(np.float32) * scale_output_shift
            output = np.round(output1 - output2).astype(self.dsttype)
            if self.fuse_residual:
                input_residual = np.random.randint(
                    -5, 5, self.input_residual_size).astype(self.srctype)
                output_tmp = np.round(output1 - output2 + format_reorder(
                    input_residual, self.input_residual_size).astype(
                        self.srctype) * (self.scale_out / self.scale_in_eltwise))
                if self.fuse_relu:
                    output = np.maximum(output_tmp, 0).astype(self.dsttype)
                else:
                    output = output_tmp.astype(self.dsttype)
            else:
                if self.fuse_relu:
                    output = np.maximum(np.round(output1 - output2),
                                        0).astype(self.dsttype)
                else:
                    output = np.round(output1 - output2).astype(self.dsttype)
        else:
            filter_int = np.round(filter *
                                  self.scale_weights[0]).astype(np.int32)
...
...
@@ -87,13 +115,35 @@ class TestConv2dInt8Op(TestConv2dOp):
            output1 = conv2d_forward_refer(
                input.astype(np.int32), filter_int, self.groups,
                conv2d_param).astype(np.float32)
            output = np.round(output1 * scale_output_shift).astype(self.dsttype)
            if self.fuse_residual:
                input_residual = np.random.randint(
                    0, 10, self.input_residual_size).astype(self.srctype)
                output_tmp = np.round(output1 * (self.scale_out / (
                    self.scale_in * self.scale_weights[0])) + format_reorder(
                        input_residual, self.input_residual_size).astype(
                            np.int32) * (self.scale_out / self.scale_in_eltwise))
                output_tmp2 = np.round(output1 * (
                    self.scale_out / (self.scale_in * self.scale_weights[0])))
                if self.fuse_relu:
                    output = np.maximum(output_tmp, 0).astype(self.dsttype)
                else:
                    output = output_tmp.astype(self.dsttype)
            else:
                if self.fuse_relu:
                    output = np.maximum(output_tmp2, 0).astype(self.dsttype)
                else:
                    output = output_tmp2.astype(self.dsttype)

        self.inputs = {
            'Input':
            OpTest.np_dtype_to_fluid_dtype(input.astype(self.srctype)),
            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
        }
        if self.fuse_residual:
            self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
                input_residual)

        self.attrs = {
            'strides': self.stride,
            'paddings': self.pad,
...
...
@@ -106,6 +156,9 @@ class TestConv2dInt8Op(TestConv2dOp):
            'Scale_in': self.scale_in,
            'Scale_out': self.scale_out,
            'Scale_weights': self.scale_weights,
            'Scale_in_eltwise': self.scale_in_eltwise,
            'fuse_relu': self.fuse_relu,
            'fuse_residual_connection': self.fuse_residual
        }
        self.outputs = {'Output': output}
...
...
@@ -123,18 +176,27 @@ class TestConv2dInt8Op(TestConv2dOp):
    def init_test_case(self):
        TestConv2dOp.init_test_case(self)
        self.input_size = [1, 1, 5, 5]  # NCHW
        f_c = self.input_size[1] // self.groups
        self.filter_size = [1, f_c, 3, 3]
        self.input_residual_size = [1, 2, 3, 3]
        self.filter_size = [2, f_c, 3, 3]
        self.scale_in = 1.0
        self.scale_out = 0.5
        self.scale_weights = [10.0]
        self.scale_in_eltwise = 0.6

    def init_dtype(self):
    def init_data_type(self):
        self.srctype = np.uint8
        self.dsttype = np.int8

    def init_fuse_relu(self):
        self.fuse_relu = True

    def init_fuse_residual(self):
        self.fuse_residual = True


#--------------------test conv2d u8 in and s8 out--------------------
#--------------------test conv2d u8 in and u8 out with residual fuse--------------------


class TestConv2d(TestConv2dInt8Op):
...
...
@@ -142,18 +204,21 @@ class TestConv2d(TestConv2dInt8Op):
        self.pad = [0, 0]
        self.stride = [1, 1]
        self.input_size = [2, 3, 5, 5]  # NCHW
        self.input_residual_size = [2, 6, 3, 3]
        assert np.mod(self.input_size[1], self.groups) == 0
        f_c = self.input_size[1] // self.groups
        self.filter_size = [6, f_c, 3, 3]
        self.scale_in = 1.0
        self.scale_out = 0.5
        self.scale_weights = [10.0]
        self.scale_in_eltwise = 0.6


class TestWithPad(TestConv2d):
    def init_test_case(self):
        TestConv2d.init_test_case(self)
        self.pad = [1, 1]
        self.input_residual_size = [2, 6, 5, 5]


class TestWithGroup(TestConv2d):
...
...
@@ -166,12 +231,14 @@ class TestWithStride(TestConv2dInt8Op):
        self.pad = [1, 1]
        self.stride = [2, 2]
        self.input_size = [2, 3, 6, 6]
        self.input_residual_size = [2, 6, 3, 3]
        assert np.mod(self.input_size[1], self.groups) == 0
        f_c = self.input_size[1] // self.groups
        self.filter_size = [6, f_c, 3, 3]
        self.scale_in = 1.0
        self.scale_out = 0.8
        self.scale_weights = [10.0]
        self.scale_in_eltwise = 0.5


class TestWith1x1(TestConv2dInt8Op):
...
...
@@ -179,12 +246,14 @@ class TestWith1x1(TestConv2dInt8Op):
        self.pad = [0, 0]
        self.stride = [1, 1]
        self.input_size = [1, 3, 5, 5]
        self.input_residual_size = [1, 6, 5, 5]
        assert np.mod(self.input_size[1], self.groups) == 0
        f_c = self.input_size[1] // self.groups
        self.filter_size = [6, f_c, 1, 1]
        self.scale_in = 1.0
        self.scale_out = 0.5
        self.scale_weights = [12.0]
        self.scale_in_eltwise = 0.5


class TestWithInput1x1Filter1x1(TestConv2dInt8Op):
...
...
@@ -192,29 +261,98 @@ class TestWithInput1x1Filter1x1(TestConv2dInt8Op):
        self.pad = [0, 0]
        self.stride = [1, 1]
        self.input_size = [2, 3, 1, 1]
        self.input_residual_size = [2, 6, 1, 1]
        assert np.mod(self.input_size[1], self.groups) == 0
        f_c = self.input_size[1] // self.groups
        self.filter_size = [6, f_c, 1, 1]
        self.scale_in = 1.0
        self.scale_out = 0.5
        self.scale_weights = [10.0]
        self.scale_in_eltwise = 0.8

    def init_group(self):
        self.groups = 3


#--------------------test conv2d s8 in and s8 out--------------------
def init_data_type_with_fusion(self, input_dt, fuse_relu, fuse_residual):
    self.srctype = input_dt
    self.dsttype = np.uint8 if fuse_relu else np.int8

    def init_fuse_relu(self):
        self.fuse_relu = fuse_relu

    def init_fuse_residual(self):
        self.fuse_residual = fuse_residual


def create_test_int8_class(parent):
    class TestInt8Case(parent):
        def init_dtype(self):
            self.srctype = np.int8
            self.dsttype = np.int8

    cls_name = "{0}_{1}".format(parent.__name__, "s8s8")
    TestInt8Case.__name__ = cls_name
    globals()[cls_name] = TestInt8Case

    #--------------------test conv2d s8 in and u8 out--------------------
    class TestS8U8Case(parent):
        def init_data_type(self):
            init_data_type_with_fusion(self, np.int8, True, False)

    #--------------------test conv2d s8 in and s8 out--------------------
    class TestS8S8Case(parent):
        def init_data_type(self):
            init_data_type_with_fusion(self, np.int8, False, False)

    #--------------------test conv2d u8 in and s8 out--------------------
    class TestU8S8Case(parent):
        def init_data_type(self):
            init_data_type_with_fusion(self, np.uint8, False, False)

    #--------------------test conv2d u8 in and u8 out without residual fuse--------------------
    class TestU8U8Case(parent):
        def init_data_type(self):
            init_data_type_with_fusion(self, np.uint8, True, False)

    #--------------------test conv2d s8 in and u8 out with residual fuse--------------------
    class TestS8U8ResCase(parent):
        def init_data_type(self):
            init_data_type_with_fusion(self, np.int8, True, True)

    #--------------------test conv2d s8 in and s8 out with residual fuse--------------------
    class TestS8S8ResCase(parent):
        def init_data_type(self):
            init_data_type_with_fusion(self, np.int8, False, True)

    #--------------------test conv2d u8 in and s8 out with residual fuse--------------------
    class TestU8S8ResCase(parent):
        def init_data_type(self):
            init_data_type_with_fusion(self, np.uint8, False, True)

    cls_name_s8u8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "1")
    cls_name_s8s8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "0")
    cls_name_u8s8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "0")
    cls_name_u8u8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "1")
    cls_name_s8u8_re_1 = "{0}_relu_{1}_residual_{2}".format(parent.__name__,
                                                            "1", "1")
    cls_name_s8s8_re_1 = "{0}_relu_{1}_residual_{2}".format(parent.__name__,
                                                            "0", "1")
    cls_name_u8s8_re_1 = "{0}_relu_{1}_residual_{2}".format(parent.__name__,
                                                            "0", "1")
    TestS8U8Case.__name__ = cls_name_s8u8
    TestS8S8Case.__name__ = cls_name_s8s8
    TestU8S8Case.__name__ = cls_name_u8s8
    TestU8U8Case.__name__ = cls_name_u8u8
    TestS8U8ResCase.__name__ = cls_name_s8u8_re_1
    TestS8S8ResCase.__name__ = cls_name_s8s8_re_1
    TestU8S8ResCase.__name__ = cls_name_u8s8_re_1
    globals()[cls_name_s8u8] = TestS8U8Case
    globals()[cls_name_s8s8] = TestS8S8Case
    globals()[cls_name_u8s8] = TestU8S8Case
    globals()[cls_name_u8u8] = TestU8U8Case
    globals()[cls_name_s8u8_re_1] = TestS8U8ResCase
    globals()[cls_name_s8s8_re_1] = TestS8S8ResCase
    globals()[cls_name_u8s8_re_1] = TestU8S8ResCase


create_test_int8_class(TestConv2dInt8Op)
...
...
python/paddle/fluid/tests/unittests/test_dist_base.py
...
...
@@ -26,6 +26,7 @@ import pickle
import numpy as np

import paddle.fluid as fluid
from paddle.fluid import compiler

RUN_STEP = 10
DEFAULT_BATCH_SIZE = 2
...
...
@@ -104,8 +105,8 @@ class TestDistRunnerBase(object):
        else:
            place = fluid.CPUPlace()

        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())

        strategy = fluid.ExecutionStrategy()
        strategy.num_threads = 1
...
...
@@ -125,19 +126,16 @@ class TestDistRunnerBase(object):
            mypass.set_int("num_repeats", args.batch_merge_repeat)

        if args.update_method == "nccl2":
            num_trainers = len(args.endpoints.split(","))
            trainer_id = args.trainer_id
            build_stra.num_trainers = len(args.endpoints.split(","))
            build_stra.trainer_id = args.trainer_id
        else:
            num_trainers = 1
            trainer_id = 0
            build_stra.num_trainers = 1
            build_stra.trainer_id = 0

        exe = fluid.ParallelExecutor(
            args.use_cuda,
        binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
            loss_name=avg_cost.name,
            exec_strategy=strategy,
            build_strategy=build_stra,
            num_trainers=num_trainers,
            trainer_id=trainer_id)
            exec_strategy=strategy)

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
...
@@ -160,7 +158,8 @@ class TestDistRunnerBase(object):
out_losses
=
[]
for
_
in
six
.
moves
.
xrange
(
RUN_STEP
):
loss
,
=
exe
.
run
(
fetch_list
=
[
avg_cost
.
name
],
loss
,
=
exe
.
run
(
binary
,
fetch_list
=
[
avg_cost
.
name
],
feed
=
feeder
.
feed
(
get_data
()))
out_losses
.
append
(
loss
[
0
])
if
six
.
PY2
:
...
...
python/paddle/fluid/tests/unittests/test_dist_ctr.py
...
...
@@ -18,7 +18,6 @@ import unittest
from test_dist_base import TestDistBase


# FIXME(tangwei): sum op can not handle when inputs is empty.
class TestDistCTR2x2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
...
...
@@ -28,5 +27,19 @@ class TestDistCTR2x2(TestDistBase):
        self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)


class TestDistCTRWithL2Decay2x2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
        self._enforce_place = "CPU"

    def test_dist_ctr(self):
        need_envs = {"USE_L2_DECAY": "1"}
        self.check_with_place(
            "dist_ctr.py",
            delta=1e-7,
            check_error_log=False,
            need_envs=need_envs)


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
from op_test import OpTest
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid.op import Operator
import paddle.compat as cpt


class TestFusedEmbeddingSeqPoolOp(OpTest):
    def setUp(self):
        self.op_type = "fused_embedding_seq_pool"
        self.emb_size = 2
        table = np.random.random((17, self.emb_size)).astype("float32")
        ids = np.array([[[4], [3]], [[4], [3]], [[2], [1]],
                        [[16], [1]]]).astype("int64")
        merged_ids = np.array([4, 2, 16]).astype("int64")
        ids_expand = np.expand_dims(ids, axis=1)
        self.lod = [[3, 1]]
        self.attrs = {'is_sparse': True}
        self.inputs = {'W': table, 'Ids': (ids_expand, self.lod)}
        self.outputs = {
            'Out': np.reshape(
                np.array([
                    table[[4, 3]] + table[[4, 3]] + table[[2, 1]],
                    table[[16, 1]]
                ]), [len(self.lod[0]), 2 * self.emb_size])
        }

    def test_check_output(self):
        self.check_output()


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
from op_test import OpTest
from test_reorder_lod_tensor import convert_to_offset
from test_seq_pool import compute_seqpool_sum, compute_seqpool_avg, \
    compute_seqpool_sqrt


class TestFusionSeqPoolConcatOp(OpTest):
    def setUp(self):
        self.w = 11
        self.lods = [[[2, 3, 5]], [[1, 5, 2]]]
        self.set_conf()
        self.set_pooltype()
        self.op_type = 'fusion_seqpool_concat'
        self.axis = 1
        bs = len(self.lods[0][0])
        inputs = []
        outs = []
        i = 0
        for lod in self.lods:
            assert bs == len(lod[0]), 'All lod size should be equal'
            x = np.random.uniform(0.1, 1,
                                  [sum(lod[0]), self.w]).astype('float32')
            offset = convert_to_offset(lod)
            out = np.zeros((bs, self.w)).astype('float32')
            if self.pooltype == "SUM":
                compute_seqpool_sum(x, offset, out)
            elif self.pooltype == "AVERAGE":
                compute_seqpool_avg(x, offset, out)
            elif self.pooltype == "SQRT":
                compute_seqpool_sqrt(x, offset, out)
            else:
                raise Exception("Unsupported pool type!")
            inputs.append(('x_{0}'.format(i), (x, lod)))
            outs.append(out)
            i = i + 1

        self.inputs = {'X': inputs}
        self.outputs = {'Out': np.concatenate(outs, axis=self.axis)}
        self.attrs = {
            'pooltype': self.pooltype,
            'axis': self.axis,
        }

    def set_pooltype(self):
        self.pooltype = "SUM"

    def set_conf(self):
        pass

    def test_check_output(self):
        self.check_output()


class TestFusionSeqPoolConcatOpCase1(TestFusionSeqPoolConcatOp):
    def set_conf(self):
        self.lods = [[[1]]]


class TestFusionSeqPoolConcatOpCase2(TestFusionSeqPoolConcatOp):
    def set_conf(self):
        self.lods = [[[1]], [[1]], [[1]]]


class TestFusionSeqPoolConcatOpCase3(TestFusionSeqPoolConcatOp):
    def set_conf(self):
        self.lods = [[[1, 3, 4, 6]]]
        self.w = 10


class TestFusionSeqPoolConcatOpCase4(TestFusionSeqPoolConcatOp):
    def set_conf(self):
        self.lods = [[[2, 13, 4]], [[1, 1, 1]], [[5, 3, 1]], [[9, 10, 3]]]
        self.w = 3


## test avg pool and sqrt
def create_test_avg_sqrt_class(parent):
    class TestSeqPoolAvgCase(parent):
        def set_pooltype(self):
            self.pooltype = "AVERAGE"

    class TestSeqPoolSqrtCase(parent):
        def set_pooltype(self):
            self.pooltype = "SQRT"

    cls_name_avg = "{0}_{1}".format(parent.__name__, "avg")
    cls_name_sqrt = "{0}_{1}".format(parent.__name__, "sqrt")
    TestSeqPoolAvgCase.__name__ = cls_name_avg
    TestSeqPoolSqrtCase.__name__ = cls_name_sqrt
    globals()[cls_name_avg] = TestSeqPoolAvgCase
    globals()[cls_name_sqrt] = TestSeqPoolSqrtCase


create_test_avg_sqrt_class(TestFusionSeqPoolConcatOp)
create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase1)
create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase2)
create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase3)
create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase4)

if __name__ == '__main__':
    unittest.main()
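The `create_test_avg_sqrt_class` helper above is a common unittest parameterization trick: subclass an existing case, override a single knob, give the subclass a distinct name, and register it in `globals()` so test discovery picks it up. A minimal, self-contained sketch of the same idea (the names here are illustrative only, not part of the Paddle test suite):

import unittest


class BaseCase(unittest.TestCase):
    value = 1

    def test_positive(self):
        self.assertGreater(self.value, 0)


def create_variant(parent, value):
    # build a renamed subclass and publish it so unittest discovery finds it
    name = "{0}_value{1}".format(parent.__name__, value)
    globals()[name] = type(name, (parent,), {"value": value})


create_variant(BaseCase, 2)
create_variant(BaseCase, 3)

if __name__ == "__main__":
    unittest.main()  # runs BaseCase plus both generated variants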
python/paddle/fluid/tests/unittests/test_ir_graph.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
import six
from paddle import fluid


class TestIRGraph(unittest.TestCase):
    """
    TODO(fc500110): `resolve_hazard` api will be tested when it can be used.
    """

    def test_nodes(self):
        graph = build_graph()
        self.assertTrue(
            {node.name() for node in graph.nodes()} == {"x1", "x2", "out",
                                                        "sum"})

    def test_has_set_get(self):
        graph = build_graph()
        for attr_name in ["int", "float", "string"]:
            self.assertFalse(graph.has(attr_name))
        graph.set("int", 1)
        graph.set("float", 0.5)
        graph.set("string", "string")
        for attr_name in ["int", "float", "string"]:
            self.assertTrue(graph.has(attr_name))
        self.assertTrue(graph.get_int("int") == 1)
        self.assertTrue(graph.get_float("float") == 0.5)
        self.assertTrue(graph.get_string("string") == "string")

    def test_erase(self):
        graph = build_graph()
        graph.set("test", 0)
        self.assertTrue(graph.has("test"))
        graph.erase("test")
        self.assertFalse(graph.has("test"))

    def test_create_var_node(self):
        prog = fluid.core.ProgramDesc()
        block = prog.block(0)
        shape = [10, 20]
        x1 = block.var(six.b("x1"))
        x1.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
        x1.set_shape(shape)
        graph = fluid.core.Graph(prog)
        node = graph.create_var_node(x1)
        self.assertTrue(node.node_type() == fluid.core.Node.Type.Variable)

    def test_create_op_node(self):
        prog = fluid.core.ProgramDesc()
        block = prog.block(0)
        sum_op_desc = block.append_op()
        graph = fluid.core.Graph(prog)
        node = graph.create_op_node(sum_op_desc)
        self.assertTrue(node.node_type() == fluid.core.Node.Type.Operation)

    def test_create_control_dep_var(self):
        graph = build_graph()
        name = "__control_var@{}".format(len(graph.nodes()))
        node = graph.create_control_dep_var()
        self.assertTrue(node.name() == name)

    def test_create_empty_node(self):
        prog = fluid.core.ProgramDesc()
        graph = fluid.core.Graph(prog)
        n1 = graph.create_empty_node('x', fluid.core.Node.Type.Operation)
        self.assertTrue(n1.name() == 'x')
        n2 = graph.create_empty_node('y', fluid.core.Node.Type.Variable)
        self.assertTrue(n2.name() == 'y')

    def test_release_nodes(self):
        graph = build_graph()
        nodes = graph.release_nodes()
        self.assertTrue(len(graph.nodes()) == 0)
        self.assertTrue(
            {node.name() for node in nodes} == {"x1", "x2", "out", "sum"})

    def test_remove_node(self):
        graph = build_graph()
        nodes = graph.nodes()
        for node in nodes:
            if node.name() == "sum":
                break
        self.assertTrue(
            {node.name() for node in nodes} == {"x1", "x2", "out", "sum"})
        nodes.remove(node)
        self.assertTrue({node.name() for node in nodes} == {"x1", "x2", "out"})

    def test_retrieve_node(self):
        graph = build_graph()
        nodes = []
        for i in range(len(graph.nodes())):
            nodes.append(graph.retrieve_node(i))

        for node in nodes:
            self.assertTrue(node in graph.nodes())

    def resolve_hazard(self):
        pass


def build_graph():
    prog = fluid.core.ProgramDesc()
    block = prog.block(0)

    shape = [10, 20]

    # prepare input/output
    x1 = block.var(six.b("x1"))
    x1.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
    x1.set_shape(shape)
    x2 = block.var(six.b("x2"))
    x2.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
    x2.set_shape(shape)

    out = block.var(six.b("out"))
    out.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)

    sum_op_desc = block.append_op()
    sum_op_desc.set_type("sum")
    sum_op_desc.set_input("X", ["x1", "x2"])
    sum_op_desc.set_output("Out", ["out"])

    sum_op_desc.check_attrs()
    sum_op_desc.infer_shape(block)
    graph = fluid.core.Graph(prog)
    return graph


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_optimizer.py
...
...
@@ -61,6 +61,48 @@ class TestOptimizer(unittest.TestCase):
        self.assertEqual([op.type for op in opts], ["sgd"])


class TestOptimizerBackwardApplygrad(unittest.TestCase):
    def test_sgd_optimizer(self):
        def check_sgd_optimizer(optimizer_attr):
            init_program = framework.Program()
            program = framework.Program()
            block = program.global_block()
            mul_x = block.create_parameter(
                dtype="float32",
                shape=[5, 10],
                lod_level=0,
                name="mul.x",
                optimize_attr=optimizer_attr)
            mul_y = block.create_var(
                dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
            mul_out = block.create_var(
                dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
            mean_out = block.create_var(
                dtype="float32", shape=[1], lod_level=0, name="mean.out")
            block.append_op(
                type="mul",
                inputs={"X": mul_x,
                        "Y": mul_y},
                outputs={"Out": mul_out},
                attrs={"x_num_col_dims": 1})
            block.append_op(
                type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
            sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
            with framework.program_guard(program, init_program):
                p_g = sgd_optimizer.backward(mean_out)
                opts = sgd_optimizer.apply_gradients(p_g)
            return opts

        opts = check_sgd_optimizer({'learning_rate': 1.1})
        self.assertEqual(len(opts), 3)
        self.assertEqual([op.type for op in opts],
                         ["fill_constant", "elementwise_mul", "sgd"])

        opts = check_sgd_optimizer({'learning_rate': 1.0})
        self.assertEqual(len(opts), 1)
        self.assertEqual([op.type for op in opts], ["sgd"])


class TestMomentumOptimizer(unittest.TestCase):
    class MockMomentum(optimizer.MomentumOptimizer):
        def get_accumulators(self):
...
...
@@ -99,8 +141,8 @@ class TestMomentumOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-       opts = momentum_optimizer._create_optimization_pass(params_grads, mul_out, init_program)
+       with framework.program_guard(program, init_program):
+           opts = momentum_optimizer.apply_gradients(params_grads)
        self.assertEqual(len(opts), 3)
        sgd_op = opts[-1]
        self.assertEqual([op.type for op in opts],
...
...
@@ -153,8 +195,8 @@ class TestMomentumOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-       opts = momentum_optimizer._create_optimization_pass(params_grads, mul_out, init_program)
+       with framework.program_guard(program, init_program):
+           opts = momentum_optimizer.apply_gradients(params_grads)
        self.assertEqual(len(opts), 3)
        sgd_op = opts[-1]
        self.assertEqual([op.type for op in opts],
...
...
@@ -216,8 +258,8 @@ class TestAdagradOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
-       opts = adagrad_optimizer._create_optimization_pass(params_grads, mul_out, init_program)
+       with framework.program_guard(program, init_program):
+           opts = adagrad_optimizer.apply_gradients(params_grads)
        self.assertEqual(len(opts), 3)
        self.assertEqual([op.type for op in opts],
                         ["fill_constant", "elementwise_mul", "adagrad"])
...
...
@@ -280,8 +322,8 @@ class TestAdamOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
-       opts = adam_optimizer._create_optimization_pass(params_grads, mul_out, init_program)
+       with framework.program_guard(program, init_program):
+           opts = adam_optimizer.apply_gradients(params_grads)
        self.assertEqual(len(opts), 5)
        self.assertEqual(
            [op.type for op in opts],
...
...
@@ -347,8 +389,8 @@ class TestAdamaxOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
-       opts = adamax_optimizer._create_optimization_pass(params_grads, mul_out, init_program)
+       with framework.program_guard(program, init_program):
+           opts = adamax_optimizer.apply_gradients(params_grads)
        self.assertEqual(len(opts), 4)
        self.assertEqual(
            [op.type for op in opts],
...
...
@@ -411,8 +453,8 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
-       opts = decayed_adagrad_optimizer._create_optimization_pass(params_grads, mul_out, init_program)
+       with framework.program_guard(program, init_program):
+           opts = decayed_adagrad_optimizer.apply_gradients(params_grads)
        self.assertEqual(len(opts), 3)
        self.assertEqual(
            [op.type for op in opts],
...
...
@@ -477,8 +519,8 @@ class TestFtrlOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
-       opts = ftrl_optimizer._create_optimization_pass(params_grads, mul_out, init_program)
+       with framework.program_guard(program, init_program):
+           opts = ftrl_optimizer.apply_gradients(params_grads)
        self.assertEqual(len(opts), 3)
        self.assertEqual([op.type for op in opts],
                         ["fill_constant", "elementwise_mul", "ftrl"])
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
...
...
@@ -74,7 +74,11 @@ class TestMNIST(TestParallelExecutorBase):
        label = np.ones(shape=[32, 1], dtype='int64')
        return img, label

-   def _compare_reduce_and_allreduce(self, model, use_cuda):
+   def _compare_reduce_and_allreduce(self, model, use_cuda, delta1=1e-6,
+                                     delta2=1e-4):
        if use_cuda and not core.is_compiled_with_cuda():
            return
...
...
@@ -95,9 +99,9 @@ class TestMNIST(TestParallelExecutorBase):
            use_reduce=True)

        for loss in zip(all_reduce_first_loss, reduce_first_loss):
-           self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
+           self.assertAlmostEqual(loss[0], loss[1], delta=delta1)

        for loss in zip(all_reduce_last_loss, reduce_last_loss):
-           self.assertAlmostEqual(loss[0], loss[1], delta=1e-4)
+           self.assertAlmostEqual(loss[0], loss[1], delta=delta2)

    # simple_fc
    def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
...
...
@@ -174,8 +178,9 @@ class TestMNIST(TestParallelExecutorBase):
        self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor)

    def test_batchnorm_fc_with_new_strategy(self):
-       # FIXME(zcd): close this test temporally.
-       # self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
+       # NOTE: the computation result of nccl_reduce is non-deterministic,
+       # related issue: https://github.com/NVIDIA/nccl/issues/157
+       self._compare_reduce_and_allreduce(fc_with_batchnorm, True, 1e-5, 1e-3)
        self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
...
...
@@ -15,6 +15,7 @@
from __future__ import print_function

import paddle.fluid as fluid
+from paddle.fluid import compiler
import paddle.fluid.core as core
import numpy as np
import unittest
...
...
@@ -61,22 +62,21 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
        exe.run(startup)
        feed_dict = {'image': image, 'label': label}

-       train_exe = fluid.ParallelExecutor(
-           use_cuda=use_cuda,
-           loss_name=loss.name, main_program=main, build_strategy=build_strategy)
+       train_cp = compiler.CompiledProgram(main).with_data_parallel(
+           loss_name=loss.name, build_strategy=build_strategy)
-       test_exe = fluid.ParallelExecutor(
-           use_cuda=use_cuda,
-           main_program=test_program,
-           share_vars_from=train_exe,
-           build_strategy=build_strategy)
+       test_cp = compiler.CompiledProgram(test_program).with_data_parallel(
+           loss_name=loss.name,
+           build_strategy=build_strategy,
+           share_vars_from=train_cp)

        for i in range(5):
-           test_loss, = test_exe.run([loss.name], feed=feed_dict)
-           train_loss, = train_exe.run([loss.name], feed=feed_dict)
+           exe.run(train_cp, feed=feed_dict, fetch_list=[loss.name])
+           test_loss, = exe.run(test_cp, feed=feed_dict, fetch_list=[loss.name])
+           train_loss, = exe.run(train_cp, feed=feed_dict, fetch_list=[loss.name])

        avg_test_loss_val = np.array(test_loss).mean()
        if math.isnan(float(avg_test_loss_val)):
...
...
python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
...
...
@@ -22,6 +22,14 @@ import numpy
import functools


def convert_to_offset(lod):
    offset = [[0] for i in lod]
    for i, level in enumerate(lod):
        for seq_len in level:
            offset[i].append(offset[i][-1] + seq_len)
    return offset


class TestReorderLoDTensor(unittest.TestCase):
    num_seq = 5
    # [name, shape, lod_level] pair indicating data info of source and target
...
...
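A quick worked example of the helper added above (values chosen only for illustration, assuming `convert_to_offset` from this module is in scope): a length-based LoD such as `[[2, 3, 5]]` becomes the cumulative offsets `[[0, 2, 5, 10]]`, so sequence `i` occupies rows `offset[0][i]:offset[0][i + 1]` of the packed tensor.

import numpy as np

lod = [[2, 3, 5]]                 # three sequences of length 2, 3 and 5
offset = convert_to_offset(lod)   # -> [[0, 2, 5, 10]]

x = np.arange(10 * 4, dtype='float32').reshape(10, 4)  # 10 packed rows
seqs = [x[offset[0][i]:offset[0][i + 1]] for i in range(len(lod[0]))]
assert [s.shape[0] for s in seqs] == lod[0]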
@@ -91,13 +99,6 @@ class TestReorderLoDTensor(unittest.TestCase):
                self.inputs[desc[0]] = tensor

    def reorder(self):
-       def convert_to_offset(lod):
-           offset_lod = [[0] for i in lod]
-           for i, level in enumerate(lod):
-               for seq_len in level:
-                   offset_lod[i].append(offset_lod[i][-1] + seq_len)
-           return offset_lod
-
        level = 0
        # compute the rank_table according to ref_lod
        ref_lod = self.data[self.data_desc[1][0]][1][level]
...
...
python/paddle/fluid/tests/unittests/test_seq_pool.py
...
...
@@ -17,33 +17,43 @@ from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
+from test_reorder_lod_tensor import convert_to_offset


-class TestSeqAvgPool(OpTest):
-    def convert_to_offset(self, lod):
-        offset = [[0] for i in lod]
-        for i, level in enumerate(lod):
-            for seq_len in level:
-                offset[i].append(offset[i][-1] + seq_len)
-        return offset
+def compute_seqpool_sum(x, offset, out):
+    for i in range(len(offset[0]) - 1):
+        sub_x = x[offset[0][i]:offset[0][i + 1], :]
+        out[i] = sub_x.sum(axis=0)
+
+
+def compute_seqpool_avg(x, offset, out):
+    for i in range(len(offset[0]) - 1):
+        sub_x = x[offset[0][i]:offset[0][i + 1], :]
+        out[i] = sub_x.mean(axis=0)
+
+
+def compute_seqpool_sqrt(x, offset, out):
+    for i in range(len(offset[0]) - 1):
+        sub_x = x[offset[0][i]:offset[0][i + 1], :]
+        seq_len = offset[0][i + 1] - offset[0][i]
+        out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len)
+
+
+class TestSeqAvgPool(OpTest):
    def set_data(self):
        self.op_type = 'sequence_pool'
        # one level, batch size is 4
        x = np.random.uniform(0.1, 1, [11, 23]).astype('float32')
        lod = [[11]]
        self.inputs = {'X': (x, lod)}
-       offset = self.convert_to_offset(lod)
+       offset = convert_to_offset(lod)

        out = np.zeros((len(lod[0]), 23)).astype('float32')
        self.outputs = {'Out': out}
        return x, offset, out

    def compute(self, x, offset, out):
        self.attrs = {'pooltype': "AVERAGE"}
-       for i in range(len(offset[0]) - 1):
-           sub_x = x[offset[0][i]:offset[0][i + 1], :]
-           out[i] = sub_x.mean(axis=0)
+       compute_seqpool_avg(x, offset, out)

    def setUp(self):
        x, offset, out = self.set_data()
...
...
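A tiny worked example of the three reference helpers added above (illustrative values only, assuming the helpers are in scope): for two sequences of lengths 2 and 1, SUM, AVERAGE and SQRT pooling differ only in how each per-sequence sum is scaled.

import numpy as np

x = np.array([[1., 1.], [3., 3.], [5., 5.]], dtype='float32')
offset = [[0, 2, 3]]                    # sequence 1 -> rows 0:2, sequence 2 -> row 2:3
out = np.zeros((2, 2), dtype='float32')

compute_seqpool_sum(x, offset, out)     # out rows: [4, 4] and [5, 5]
compute_seqpool_avg(x, offset, out)     # out rows: [2, 2] and [5, 5]
compute_seqpool_sqrt(x, offset, out)    # out rows: [4, 4] / sqrt(2) and [5, 5] / sqrt(1)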
@@ -62,9 +72,7 @@ class TestSeqAvgPool(OpTest):
class TestSeqSumPool(TestSeqAvgPool):
    def compute(self, x, offset, out):
        self.attrs = {'pooltype': "SUM"}
-       for i in range(len(offset[0]) - 1):
-           sub_x = x[offset[0][i]:offset[0][i + 1], :]
-           out[i] = sub_x.sum(axis=0)
+       compute_seqpool_sum(x, offset, out)


class TestSeqMaxPool(TestSeqAvgPool):
...
...
@@ -72,7 +80,7 @@ class TestSeqMaxPool(TestSeqAvgPool):
        self.op_type = 'sequence_pool'
        x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
        lod = [[13]]
-       offset = self.convert_to_offset(lod)
+       offset = convert_to_offset(lod)
        for i in range(len(offset[0]) - 1):
            l = offset[0][i + 1] - offset[0][i]
            x[offset[0][i] + np.random.randint(l), :] += 2.0
...
...
@@ -93,10 +101,7 @@ class TestSeqMaxPool(TestSeqAvgPool):
class TestSeqSqrtPool(TestSeqAvgPool):
    def compute(self, x, offset, out):
        self.attrs = {'pooltype': "SQRT"}
-       for i in range(len(offset[0]) - 1):
-           sub_x = x[offset[0][i]:offset[0][i + 1], :]
-           seq_len = offset[0][i + 1] - offset[0][i]
-           out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len)
+       compute_seqpool_sqrt(x, offset, out)


class TestSeqLastPool(TestSeqAvgPool):
...
...
@@ -122,7 +127,7 @@ class TestSeqAvgPool2D(TestSeqAvgPool):
        x = np.random.uniform(0.1, 1, [13, 3, 17]).astype('float32')
        lod = [[4, 1, 3, 5]]
        self.inputs = {'X': (x, lod)}
-       offset = self.convert_to_offset(lod)
+       offset = convert_to_offset(lod)

        out = np.zeros((4, 3, 17)).astype('float32')
        self.outputs = {'Out': out}
...
...
@@ -167,7 +172,7 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
        x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32')
        lod = [[4, 1, 3, 5]]
        self.inputs = {'X': (x, lod)}
-       offset = self.convert_to_offset(lod)
+       offset = convert_to_offset(lod)
        for i in range(len(offset[0]) - 1):
            l = offset[0][i + 1] - offset[0][i]
            x[offset[0][i] + np.random.randint(l), :] += 1.0
...
...
python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
...
...
@@ -28,6 +28,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
    def initParams(self):
        self.numeric_stable_mode = False
+       self.dtype = np.float64

    def setUp(self):
        self.initParams()
...
...
@@ -36,19 +37,19 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
        class_num = 37

        logits = np.random.uniform(0.1, 1.0,
-                                  [batch_size, class_num]).astype("float64")
+                                  [batch_size, class_num]).astype(self.dtype)
        softmax = np.apply_along_axis(stable_softmax, 1, logits)
        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")

        cross_entropy = np.asmatrix(
            [[-np.log(softmax[i][labels[i][0]])]
             for i in range(softmax.shape[0])],
-           dtype="float64")
+           dtype=self.dtype)

        self.inputs = {"Logits": logits, "Label": labels}
        self.outputs = {
-           "Softmax": softmax.astype("float64"),
-           "Loss": cross_entropy.astype("float64")
+           "Softmax": softmax.astype(self.dtype),
+           "Loss": cross_entropy.astype(self.dtype)
        }
        self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
...
...
@@ -56,7 +57,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
        self.check_output()

    def test_check_grad(self):
-       self.check_grad(["Logits"], "Loss")
+       self.check_grad(["Logits"], "Loss", max_relative_error=0.05)


class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
...
...
@@ -64,6 +65,55 @@ class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
        self.numeric_stable_mode = True


class TestSoftmaxWithCrossEntropyOpFp16(TestSoftmaxWithCrossEntropyOp):
    def initParams(self):
        self.numeric_stable_mode = False
        self.dtype = np.float16

    def setUp(self):
        self.initParams()
        self.op_type = "softmax_with_cross_entropy"
        batch_size = 41
        class_num = 37

        # NOTE: numpy float16 have very low accuracy, use float32 for numpy check.
        logits = np.random.uniform(0.1, 1.0,
                                   [batch_size, class_num]).astype(np.float32)
        softmax = np.apply_along_axis(stable_softmax, 1, logits)
        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")

        cross_entropy = np.asmatrix(
            [[-np.log(softmax[i][labels[i][0]])]
             for i in range(softmax.shape[0])],
            dtype=np.float32)

        self.inputs = {
            "Logits": logits.astype(self.dtype).view(np.uint16),
            "Label": labels
        }
        self.outputs = {
            "Softmax": softmax.astype(self.dtype),
            "Loss": cross_entropy.astype(self.dtype)
        }
        self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}

    def test_check_output(self):
        self.check_output(atol=1e-2)

    def test_check_grad(self):
        self.check_grad(["Logits"], "Loss", max_relative_error=0.1)


class TestSoftmaxWithCrossEntropyOpNoCudnnFp16(TestSoftmaxWithCrossEntropyOpFp16):
    def initParams(self):
        self.numeric_stable_mode = True
        self.dtype = np.float16

    def test_check_grad(self):
        self.check_grad(["Logits"], "Loss", max_relative_error=0.1)


class TestSoftmaxWithCrossEntropyOp2(OpTest):
    """
    Test softmax with cross entropy operator with soft labels.
...
...
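One detail worth calling out in the new fp16 test above: the reference values are computed in float32 because half precision is too coarse for a NumPy oracle, and the float16 logits are handed over as their raw bit pattern via `.view(np.uint16)`, which reinterprets the same 16-bit storage without converting values. A small standalone illustration of that reinterpretation (plain NumPy, independent of Paddle):

import numpy as np

x = np.array([1.0, 0.5], dtype=np.float32)
half = x.astype(np.float16)   # converts values to half precision
bits = half.view(np.uint16)   # same bytes, now typed as unsigned 16-bit ints
print(bits)                   # e.g. [15360 14336] -- the IEEE-754 half bit patterns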
python/paddle/fluid/transpiler/distribute_transpiler.py
...
...
@@ -752,12 +752,6 @@ class DistributeTranspiler(object):
            elif op not in lr_ops:
                self._append_pserver_non_opt_ops(block, op)

-       def __op_have_grad_input__(op):
-           for varname in op.input_arg_names:
-               if varname.find("@GRAD") >= 0:
-                   return varname
-           return ""
-
        def __clone_lr_op_sub_block__(op, program, lr_block):
            if not op.has_attr('sub_block'):
                return
...
...
@@ -808,7 +802,7 @@ class DistributeTranspiler(object):
        merged_var = None
        for _, op in enumerate(self.optimize_ops):
            # find the origin grad var before clipping/L2Decay,
-           # merged_var should be the input var name of L2Decay buil
+           # merged_var should be the input var name of L2Decay
            grad_varname_for_block = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
            if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name:
...
...
@@ -1684,7 +1678,16 @@ class DistributeTranspiler(object):
                if self.config.enable_dc_asgd:
                    new_inputs[key] = dc
                else:
-                   new_inputs[key] = merged_var
+                   # Note!! This is for l2decay on sparse gradient, because it will create a new tensor for
+                   # decayed gradient but not inplace modify the origin one
+                   origin_grad_name = opt_op.input(key)[0]
+                   if core.kNewGradSuffix(
+                   ) in origin_grad_name and pserver_block.has_var(origin_grad_name):
+                       new_grad = pserver_block.var(origin_grad_name)
+                       new_inputs[key] = new_grad
+                   else:
+                       new_inputs[key] = merged_var
            elif key == "Param":
                param_block = _get_param_block(opt_op)
                if not param_block:
...
...
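The new branch above prefers the regularizer-created (decayed) gradient tensor whenever the pserver block already holds a variable whose name carries the new-gradient suffix, and falls back to the merged gradient otherwise. A minimal plain-Python sketch of that selection logic, with hypothetical names standing in for the transpiler's objects:

def pick_gradient_input(origin_grad_name, merged_var, block_vars, new_grad_suffix):
    # mirrors the branch above: use the decayed gradient created for sparse
    # L2 decay if the pserver block has it, otherwise keep the merged gradient
    if new_grad_suffix in origin_grad_name and origin_grad_name in block_vars:
        return block_vars[origin_grad_name]
    return merged_var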