Commit eac5a0aa — Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Authored: Jan 11, 2019, by sneaxiy
Parents: 594dc4d8, 869f3a9d

Merge develop
test=develop
Showing 132 changed files with 4,848 additions and 707 deletions (+4848 −707).
Changed files (additions / deletions):

cmake/FindJeMalloc.cmake  +7 −0
cmake/cuda.cmake  +15 −1
cmake/external/boost.cmake  +2 −5
cmake/external/gflags.cmake  +9 −0
cmake/external/mkldnn.cmake  +1 −1
cmake/external/mklml.cmake  +16 −18
cmake/external/ngraph.cmake  +7 −12
cmake/generic.cmake  +11 −10
paddle/fluid/API.spec  +22 −0
paddle/fluid/framework/details/CMakeLists.txt  +1 −1
paddle/fluid/framework/details/build_strategy.cc  +1 −0
paddle/fluid/framework/details/eager_deletion_op_handle.cc  +6 −8
paddle/fluid/framework/details/multi_devices_graph_pass.cc  +1 −1
paddle/fluid/framework/ir/CMakeLists.txt  +14 −0
paddle/fluid/framework/ir/graph.h  +0 −1
paddle/fluid/framework/ir/graph_pattern_detector.cc  +63 −0
paddle/fluid/framework/ir/graph_pattern_detector.h  +15 −0
paddle/fluid/framework/ir/lock_free_optimize_pass.cc  +358 −0
paddle/fluid/framework/ir/lock_free_optimize_pass.h  +130 −0
paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc  +214 −0
paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h  +52 −0
paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc  +198 −0
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc  +148 −0
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h  +38 −0
paddle/fluid/framework/ngraph_bridge.cc  +3 −0
paddle/fluid/framework/parallel_executor.cc  +4 −4
paddle/fluid/framework/parallel_executor.h  +1 −2
paddle/fluid/framework/scope.cc  +3 −2
paddle/fluid/framework/var_type_traits.cc  +5 −3
paddle/fluid/framework/var_type_traits.h  +2 −2
paddle/fluid/framework/var_type_traits_test.cc  +5 −4
paddle/fluid/imperative/layer.h  +1 −0
paddle/fluid/inference/analysis/analyzer_tester.cc  +2 −2
paddle/fluid/inference/analysis/argument.h  +0 −2
paddle/fluid/inference/analysis/ir_pass_manager.cc  +0 −10
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt  +11 −7
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc  +5 −3
paddle/fluid/inference/analysis/passes/CMakeLists.txt  +1 −0
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc  +0 −23
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h  +0 −2
paddle/fluid/inference/api/analysis_config.cc  +1 −0
paddle/fluid/inference/api/analysis_predictor.h  +5 −2
paddle/fluid/inference/api/api_impl.h  +0 −1
paddle/fluid/inference/api/demo_ci/CMakeLists.txt  +4 −4
paddle/fluid/inference/api/demo_ci/run.sh  +4 −0
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc  +2 −2
paddle/fluid/inference/api/helper.h  +8 −3
paddle/fluid/inference/api/paddle_analysis_config.h  +96 −7
paddle/fluid/inference/api/paddle_api.h  +105 −72
paddle/fluid/inference/api/paddle_pass_builder.h  +27 −15
paddle/fluid/inference/tensorrt/CMakeLists.txt  +1 −0
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc  +31 −6
paddle/fluid/inference/tensorrt/op_teller.cc  +49 −0
paddle/fluid/inference/tensorrt/op_teller.h  +68 −0
paddle/fluid/inference/tests/api/CMakeLists.txt  +5 −5
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc  +3 −3
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc  +111 −18
paddle/fluid/inference/tests/api/config_printer.h  +1 −1
paddle/fluid/inference/tests/api/tester_helper.h  +10 −10
paddle/fluid/inference/tests/api/trt_models_tester.cc  +3 −15
paddle/fluid/inference/utils/CMakeLists.txt  +0 −3
paddle/fluid/operators/conv_cudnn_op.cu.cc  +15 −0
paddle/fluid/operators/conv_mkldnn_op.cc  +141 −31
paddle/fluid/operators/elementwise/elementwise_sub_op.cu  +5 −0
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc  +194 −0
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h  +142 −0
paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc  +134 −0
paddle/fluid/operators/fused/fusion_seqpool_concat_op.h  +41 −0
paddle/fluid/operators/jit/benchmark.cc  +25 −2
paddle/fluid/operators/jit/gen/CMakeLists.txt  +1 −0
paddle/fluid/operators/jit/gen/seqpool.cc  +85 −0
paddle/fluid/operators/jit/gen/seqpool.h  +214 −0
paddle/fluid/operators/jit/helper.cc  +15 −0
paddle/fluid/operators/jit/helper.h  +6 −0
paddle/fluid/operators/jit/kernel_base.h  +23 −0
paddle/fluid/operators/jit/kernel_key.cc  +7 −0
paddle/fluid/operators/jit/more/mkl/CMakeLists.txt  +1 −0
paddle/fluid/operators/jit/more/mkl/mkl.cc  +31 −0
paddle/fluid/operators/jit/more/mkl/mkl.h  +26 −0
paddle/fluid/operators/jit/refer/CMakeLists.txt  +1 −0
paddle/fluid/operators/jit/refer/refer.cc  +2 −0
paddle/fluid/operators/jit/refer/refer.h  +24 −0
paddle/fluid/operators/jit/test.cc  +49 −0
paddle/fluid/operators/math/CMakeLists.txt  +1 −1
paddle/fluid/operators/math/selected_rows_functor.cc  +4 −0
paddle/fluid/operators/math/sequence_pooling.cc  +21 −11
paddle/fluid/operators/math/softmax.h  +1 −0
paddle/fluid/operators/ngraph/ngraph_ops.h  +2 −0
paddle/fluid/operators/ngraph/ops/binary_unnary_op.h  +0 −2
paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h  +59 −0
paddle/fluid/operators/ngraph/ops/fill_constant_op.h  +0 −2
paddle/fluid/operators/ngraph/ops/mean_op.h  +66 −0
paddle/fluid/operators/ngraph/ops/mul_op.h  +0 −2
paddle/fluid/operators/ngraph/ops/scale_op.h  +39 −0
paddle/fluid/operators/ngraph/ops/top_k_op.h  +0 −2
paddle/fluid/operators/softmax_with_cross_entropy_op.cu  +36 −28
paddle/fluid/operators/sum_op.cc  +9 −1
paddle/fluid/platform/cuda_helper_test.cu  +3 −0
paddle/fluid/platform/device_context.cc  +14 −16
paddle/fluid/platform/float16.h  +1 −1
paddle/fluid/platform/float16_test.cu  +4 −2
paddle/fluid/platform/mkldnn_reuse.h  +37 −12
paddle/fluid/pybind/CMakeLists.txt  +3 −4
paddle/fluid/pybind/ir.cc  +103 −0
paddle/fluid/pybind/ir.h  +25 −0
paddle/fluid/pybind/pybind.cc  +11 −3
python/paddle/dataset/mnist.py  +43 −48
python/paddle/fluid/__init__.py  +1 −1
python/paddle/fluid/compiler.py  +204 −0
python/paddle/fluid/data_feeder.py  +18 −12
python/paddle/fluid/executor.py  +137 −24
python/paddle/fluid/optimizer.py  +143 −83
python/paddle/fluid/parallel_executor.py  +3 −4
python/paddle/fluid/tests/test_data_feeder.py  +6 −0
python/paddle/fluid/tests/unittests/dist_ctr.py  +9 −1
python/paddle/fluid/tests/unittests/dist_se_resnext.py  +0 −1
python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py  +31 −0
python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py  +40 −0
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py  +13 −20
python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py  +153 −15
python/paddle/fluid/tests/unittests/test_dist_base.py  +11 −12
python/paddle/fluid/tests/unittests/test_dist_ctr.py  +14 −1
python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py  +51 −0
python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py  +118 −0
python/paddle/fluid/tests/unittests/test_ir_graph.py  +146 −0
python/paddle/fluid/tests/unittests/test_optimizer.py  +56 −14
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py  +10 −5
python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py  +13 −13
python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py  +8 −7
python/paddle/fluid/tests/unittests/test_seq_pool.py  +27 −22
python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py  +55 −5
python/paddle/fluid/transpiler/distribute_transpiler.py  +11 −8
cmake/FindJeMalloc.cmake (+7 −0)

@@ -19,3 +19,10 @@ find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALL
 mark_as_advanced(JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR)
+
+if (JEMALLOC_FOUND)
+    add_library(jemalloc::jemalloc UNKNOWN IMPORTED)
+    set_target_properties(jemalloc::jemalloc PROPERTIES
+        IMPORTED_LOCATION ${JEMALLOC_LIBRARIES}
+        INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}")
+endif()
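Why this matters for the rest of the commit: cmake/generic.cmake below switches common_link() from the raw ${JEMALLOC_LIBRARIES} variable to this imported target, so the library location and its headers now propagate transitively. A minimal consumer sketch, assuming a hypothetical target my_tool and that this module is reachable on CMAKE_MODULE_PATH:

    # find_package() in MODULE mode dispatches to FindJeMalloc.cmake.
    find_package(JeMalloc)
    add_executable(my_tool main.cc)
    if(JEMALLOC_FOUND)
      # Linking the imported target also propagates INTERFACE_INCLUDE_DIRECTORIES,
      # so no separate include_directories(${JEMALLOC_INCLUDE_DIR}) is needed.
      target_link_libraries(my_tool jemalloc::jemalloc)
    endif()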
cmake/cuda.cmake (+15 −1)

@@ -5,6 +5,8 @@ endif()
 set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
 set(paddle_known_gpu_archs7 "30 35 50 52")
 set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
+set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
+set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
 
 ######################################################################################
 # A function for automatic detection of GPUs installed (if autodetection is enabled)
@@ -59,7 +61,7 @@ endfunction()
 # select_nvcc_arch_flags(out_variable)
 function(select_nvcc_arch_flags out_variable)
   # List of arch names
-  set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual")
+  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
   set(archs_name_default "All")
   if(NOT CMAKE_CROSSCOMPILING)
     list(APPEND archs_names "Auto")
@@ -93,6 +95,8 @@ function(select_nvcc_arch_flags out_variable)
     set(cuda_arch_bin "60 61")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
     set(cuda_arch_bin "70")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
+    set(cuda_arch_bin "75")
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
     set(cuda_arch_bin ${paddle_known_gpu_archs})
   elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
@@ -153,6 +157,16 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
   # warning for now.
   list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
   add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
+elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs9})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+  add_definitions("-DPADDLE_CUDA_BINVER=\"90\"")
+elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+  add_definitions("-DPADDLE_CUDA_BINVER=\"100\"")
 endif()
 
 include_directories(${CUDA_INCLUDE_DIRS})
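Taken together, the additions let a Turing card be targeted either explicitly or via the new CUDA 10 default list. A configure-time sketch (assumed invocation, not part of this diff):

    # cmake .. -DCUDA_ARCH_NAME=Turing
    # takes the new branch above:
    #   set(cuda_arch_bin "75")
    # which select_nvcc_arch_flags() expands into the usual NVCC form,
    # -gencode arch=compute_75,code=sm_75. With CUDA_ARCH_NAME=All under
    # CUDA 10, paddle_known_gpu_archs10 ("30 35 50 52 60 61 70 75") is used.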
cmake/external/boost.cmake (+2 −5)

@@ -23,11 +23,8 @@ set(BOOST_PROJECT "extern_boost")
 # checked that the devtools package of CentOS 6 installs boost 1.41.0.
 # So we use 1.41.0 here.
 set(BOOST_VER   "1.41.0")
-if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
-  message(STATUS "use pre defined download url")
-  set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
-  set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
-endif()
+set(BOOST_TAR   "boost_1_41_0" CACHE STRING "" FORCE)
+set(BOOST_URL   "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
 
 MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
cmake/external/gflags.cmake (+9 −0)

@@ -63,6 +63,15 @@ ADD_DEPENDENCIES(gflags extern_gflags)
 LIST(APPEND external_project_dependencies gflags)
 
+# On Windows (including MinGW), the Shlwapi library is used by gflags if available.
+if (WIN32)
+  include(CheckIncludeFileCXX)
+  check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI)
+  if (HAVE_SHLWAPI)
+    set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib)
+  endif(HAVE_SHLWAPI)
+endif (WIN32)
+
 IF(WITH_C_API)
   INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
   IF(ANDROID)
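The global property set here is the write side of a small protocol; the read side appears in the cmake/generic.cmake hunks later in this commit. A sketch of the consumer, following the pattern used there:

    # Any module recorded in OS_DEPENDENCY_MODULES (shlwapi.lib on Windows,
    # when shlwapi.h is available) is appended to each target's link line.
    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
    target_link_libraries(${TARGET_NAME} ${os_dependency_modules})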
cmake/external/mkldnn.cmake (+1 −1)

@@ -55,7 +55,7 @@ ExternalProject_Add(
     ${MKLDNN_PROJECT}
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS             ${MKLDNN_DEPENDS}
-    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
+    GIT_REPOSITORY      "https://github.com/intel/mkl-dnn.git"
     GIT_TAG             "830a10059a018cd2634d94195140cf2d8790a75a"
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
cmake/external/mklml.cmake (+16 −18)

@@ -16,6 +16,12 @@ IF(NOT ${WITH_MKLML})
   return()
 ENDIF(NOT ${WITH_MKLML})
 
+IF(APPLE)
+  MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
+  SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
+  return()
+ENDIF()
+
 INCLUDE(ExternalProject)
 SET(MKLML_DST_DIR       "mklml")
 SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
@@ -23,32 +29,24 @@ SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
 SET(MKLML_ROOT          ${MKLML_INSTALL_DIR})
 SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
 SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
-if(WIN32)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
+SET(TIME_VERSION "2019.0.1.20181227")
+IF(WIN32)
+  SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
+  SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
   SET(MKLML_LIB          ${MKLML_LIB_DIR}/mklml.lib)
   SET(MKLML_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5md.lib)
   SET(MKLML_SHARED_LIB   ${MKLML_LIB_DIR}/mklml.dll)
   SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
-else()
+ELSE()
+  SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
+  SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
   SET(MKLML_LIB          ${MKLML_LIB_DIR}/libmklml_intel.so)
   SET(MKLML_IOMP_LIB     ${MKLML_LIB_DIR}/libiomp5.so)
   SET(MKLML_SHARED_LIB   ${MKLML_LIB_DIR}/libmklml_intel.so)
   SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
-endif()
-
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
-
-IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  if(WIN32)
-    SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE)
-    SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
-  elseif(APPLE)
-    SET(MKLML_VER "mklml_mac_2019.0.1.20180928" CACHE STRING "" FORCE)
-    SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
-  else()
-    SET(MKLML_VER "mklml_lnx_2019.0.1.20180928" CACHE STRING "" FORCE)
-    SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
-  ENDIF()
-endif()
+ENDIF()
 
 SET(MKLML_PROJECT       "extern_mklml")
 MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
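The net effect: the platform-specific MKLML package is now pinned unconditionally from TIME_VERSION, rather than only when MKLML_VER/MKLML_URL were undefined, and macOS bails out early instead of downloading a mac package. Substituting the version above, the Linux branch resolves to:

    SET(MKLML_VER "mklml_lnx_2019.0.1.20181227" CACHE STRING "" FORCE)
    SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/mklml_lnx_2019.0.1.20181227.tgz" CACHE STRING "" FORCE)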
cmake/external/ngraph.cmake (+7 −12)

@@ -37,14 +37,18 @@ INCLUDE(GNUInstallDirs)
 INCLUDE(ExternalProject)
 
 SET(NGRAPH_PROJECT         "extern_ngraph")
-SET(NGRAPH_GIT_TAG         "08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9")
+SET(NGRAPH_GIT_TAG         "20bd8bbc79ae3a81c57313846a2be7313e5d1dab")
 SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
 SET(NGRAPH_LIB_DIR         ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
 SET(NGRAPH_SHARED_LIB_NAME libngraph.so)
 SET(NGRAPH_CPU_LIB_NAME    libcpu_backend.so)
-SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    SET(NGRAPH_TBB_LIB_NAME libtbb_debug.so.2)
+else()
+    SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
+endif()
 SET(NGRAPH_GIT_REPO        "https://github.com/NervanaSystems/ngraph.git")
 SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
 SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
@@ -66,16 +70,7 @@ ExternalProject_Add(
     CMAKE_ARGS          -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
     CMAKE_ARGS          -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
-    CMAKE_ARGS          -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib)
-
-# Workaround for nGraph expecting mklml to be in mkldnn install directory.
-ExternalProject_Add_Step(
-    ${NGRAPH_PROJECT}
-    PrepareMKL
-    COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so
-    COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so
-    DEPENDEES download
-    DEPENDERS configure
-)
+    CMAKE_ARGS          -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
+    CMAKE_ARGS          -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib)
 
 add_dependencies(ngraph ${NGRAPH_PROJECT})
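Passing -DMKLML_LIB_DIR directly is what makes the PrepareMKL symlink step removable: nGraph can now locate mklml without it being staged inside the mkldnn install tree. The Debug/Release switch above feeds the TBB library path the same way the other library names do; a sketch, where NGRAPH_TBB_LIB is a hypothetical composed path (the real consumer lives elsewhere in the file):

    # Debug builds resolve to .../lib/libtbb_debug.so.2, Release builds
    # to .../lib/libtbb.so.2, using the name chosen by the branch above.
    SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})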
cmake/generic.cmake (+11 −10)

@@ -117,7 +117,7 @@ function(common_link TARGET_NAME)
   endif()
 
   if (WITH_JEMALLOC)
-    target_link_libraries(${TARGET_NAME} ${JEMALLOC_LIBRARIES})
+    target_link_libraries(${TARGET_NAME} jemalloc::jemalloc)
   endif()
 endfunction()
@@ -359,6 +359,8 @@ function(cc_binary TARGET_NAME)
     add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
     common_link(${TARGET_NAME})
   endif()
+  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+  target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
 endfunction(cc_binary)
 
 function(cc_test TARGET_NAME)
@@ -367,18 +369,15 @@ function(cc_test TARGET_NAME)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    add_executable(${TARGET_NAME} ${cc_test_SRCS})
     if(WIN32)
+      list(APPEND win32_deps shlwapi)
       if("${cc_test_DEPS};" MATCHES "python;")
         list(REMOVE_ITEM cc_test_DEPS python)
-        target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
+        list(APPEND win32_deps ${PYTHON_LIBRARIES})
       endif()
     endif(WIN32)
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    add_executable(${TARGET_NAME} ${cc_test_SRCS})
+    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    if(WIN32)
+      target_link_libraries(${TARGET_NAME} ${win32_deps})
+    endif(WIN32)
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     common_link(${TARGET_NAME})
     add_test(NAME ${TARGET_NAME}
@@ -451,7 +450,8 @@ function(nv_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules})
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
@@ -538,7 +538,8 @@ function(hip_test TARGET_NAME)
     endif()
     add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
     set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
-    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags ${os_dependency_modules})
     add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
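For test authors the call syntax is unchanged; shlwapi and any other OS_DEPENDENCY_MODULES entries are now linked in automatically. A usage sketch with a hypothetical target name:

    # SRCS/DEPS exactly as before; Windows-specific link deps are handled
    # inside cc_test() itself via the global property.
    cc_test(my_pass_test SRCS my_pass_test.cc DEPS graph_pattern_detector)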
paddle/fluid/API.spec (+22 −0)

@@ -405,28 +405,50 @@ paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None
 paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
 paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
 paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.optimizer.SGDOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.SGDOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
+paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
+paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
+paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
+paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.AdamaxOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.DecayedAdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None))
+paddle.fluid.optimizer.FtrlOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.FtrlOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None))
+paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.RMSPropOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None))
+paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
 paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None))
+paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.LarsMomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
paddle/fluid/framework/details/CMakeLists.txt (+1 −1)

@@ -94,4 +94,4 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
         graph_viz_pass multi_devices_graph_pass
         multi_devices_graph_print_pass multi_devices_graph_check_pass
         fuse_elewise_add_act_pass multi_batch_merge_pass
-        memory_optimize_pass)
+        memory_optimize_pass lock_free_optimize_pass)
paddle/fluid/framework/details/build_strategy.cc (+1 −0)

@@ -232,3 +232,4 @@ USE_PASS(analysis_var_pass);
 USE_PASS(sequential_execution_pass);
 USE_PASS(all_reduce_deps_pass);
 USE_PASS(modify_op_lock_and_record_event_pass);
+USE_PASS(lock_free_optimize_pass);
paddle/fluid/framework/details/eager_deletion_op_handle.cc (+6 −8)

@@ -25,6 +25,8 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+static const std::string kEagerDeletionOpName{"eager_deletion"};  // NOLINT
+
 EagerDeletionOpHandle::EagerDeletionOpHandle(
     ir::Node *node, const Scope *scope, const platform::Place &place,
     const std::unordered_set<std::string> &var_names, GarbageCollector *gc,
@@ -59,20 +61,15 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
 #endif
 }
 
-std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }
+std::string EagerDeletionOpHandle::Name() const { return kEagerDeletionOpName; }
 
 void EagerDeletionOpHandle::RunImpl() {
-#ifdef PADDLE_WITH_CUDA
-  platform::RecordEvent record_event(Name(), dev_ctx_);
-#else
-  platform::RecordEvent record_event(Name(), nullptr);
-#endif
-
+  platform::RecordEvent event(kEagerDeletionOpName, nullptr);
   Scope *exec_scope = nullptr;
   std::deque<std::shared_ptr<memory::Allocation>> garbages;
   for (auto &name : var_names_) {
     auto it = ref_cnts_->find(name);
-    // Var not found, not reference count has not decreased to 0
+    // Reference count has not decreased to 0
     if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
       continue;
     }
@@ -81,6 +78,7 @@ void EagerDeletionOpHandle::RunImpl() {
       exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
     }
 
+    // Var not found
     auto *var = exec_scope->FindVar(name);
     if (var == nullptr) {
       continue;
paddle/fluid/framework/details/multi_devices_graph_pass.cc (+1 −1)

@@ -226,7 +226,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
    * Only variables should be the leaves of graph.
    */
   AddOutputToLeafOps(&result);
-  result.Erase<GraphOps>(kGraphOps);
+  result.Erase(kGraphOps);
   return graph;
 }
paddle/fluid/framework/ir/CMakeLists.txt (+14 −0)

@@ -31,6 +31,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
+pass_library(lock_free_optimize_pass base)
 pass_library(fc_fuse_pass inference)
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
@@ -41,11 +42,23 @@ pass_library(seq_concat_fc_fuse_pass inference)
 pass_library(multi_batch_merge_pass base)
 pass_library(conv_bn_fuse_pass inference)
 pass_library(seqconv_eltadd_relu_fuse_pass inference)
+pass_library(seqpool_concat_fuse_pass inference)
 pass_library(is_test_pass base)
 pass_library(conv_elementwise_add_act_fuse_pass inference)
 pass_library(conv_elementwise_add2_act_fuse_pass inference)
 pass_library(conv_elementwise_add_fuse_pass inference)
 pass_library(conv_affine_channel_fuse_pass inference)
+pass_library(transpose_flatten_concat_fuse_pass inference)
+# There may be many transpose-flatten structures in a model, and the output of
+# these structures will be used as inputs to the concat Op. This pattern will
+# be detected by our pass. The index here represents the number of structures in the
+# pattern. We use index 3 ~ 6, because these quantities of structures are
+# common in the models.
+foreach (index RANGE 3 6)
+  file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
+endforeach()
 if(WITH_MKLDNN)
   pass_library(mkldnn_placement_pass base)
   pass_library(depthwise_conv_mkldnn_pass base)
@@ -67,6 +80,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
 cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
+cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
 if (WITH_MKLDNN)
   cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
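For reference, expanding the foreach() range shows the four registration lines appended to ${pass_file}:

    # Generated content (C++ source emitted into ${pass_file}):
    #   USE_PASS(transpose_flatten3_concat_fuse_pass);
    #   USE_PASS(transpose_flatten4_concat_fuse_pass);
    #   USE_PASS(transpose_flatten5_concat_fuse_pass);
    #   USE_PASS(transpose_flatten6_concat_fuse_pass);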
paddle/fluid/framework/ir/graph.h (+0 −1)

@@ -109,7 +109,6 @@ class Graph {
     attr_dels_[attr_name] = []() {};
   }
 
-  template <typename AttrType>
   void Erase(const std::string &attr_name) {
     PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph",
                    attr_name);
paddle/fluid/framework/ir/graph_pattern_detector.cc (+63 −0)

@@ -1306,6 +1306,69 @@ PDNode *patterns::ConvAffineChannel::operator()(
   return ac_out_var;
 }
 
+// a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a
+// b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b
+// ...
+// z -> transpose_op(n) -> transpose_out_z -> flatten_op(n) -> flatten_out_z
+// flatten_out_a -> concat_op  flatten_out_b -> concat_op ... flatten_out_z ->
+// concat_op
+PDNode *patterns::TransposeFlattenConcat::operator()(
+    std::vector<PDNode *> conv_in, int times) {
+  // The times represents the repeat times of the
+  // {trans, trans_out, flatten, flatten_out}
+  const int kNumFields = 4;
+  const int kTransOutOffset = 1;
+  const int kFlattenOffset = 2;
+  const int kFlattenOutOffset = 3;
+  std::vector<PDNode *> nodes;
+
+  for (int i = 0; i < times; i++) {
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("transpose" + std::to_string(i)))
+            ->assert_is_op("transpose2"));
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("transpose_out" + std::to_string(i)))
+            ->assert_is_op_output("transpose2")
+            ->assert_is_op_input("flatten2", "X")
+            ->AsIntermediate());
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("flatten" + std::to_string(i)))
+            ->assert_is_op("flatten2"));
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("flatten_out" + std::to_string(i)))
+            ->assert_is_op_output("flatten2")
+            ->assert_is_op_nth_input("concat", "X", i)
+            ->AsIntermediate());
+  }
+
+  auto concat_op = pattern->NewNode(GetNodeName("concat"))
+                       ->assert_is_op("concat")
+                       ->assert_op_has_n_inputs("concat", times);
+  auto concat_out = pattern->NewNode(GetNodeName("concat_out"))
+                        ->assert_is_op_output("concat")
+                        ->AsOutput();
+
+  std::vector<PDNode *> flatten_outs;
+  for (int i = 0; i < times; i++) {
+    conv_in[i]->AsInput();
+    // trans
+    nodes[i * kNumFields]->LinksFrom({conv_in[i]});
+    // trans_out
+    nodes[i * kNumFields + kTransOutOffset]->LinksFrom({nodes[i * kNumFields]});
+    // flatten
+    nodes[i * kNumFields + kFlattenOffset]->LinksFrom(
+        {nodes[i * kNumFields + kTransOutOffset]});
+    // flatten_out
+    nodes[i * kNumFields + kFlattenOutOffset]->LinksFrom(
+        {nodes[i * kNumFields + kFlattenOffset]});
+    flatten_outs.push_back(nodes[i * kNumFields + kFlattenOutOffset]);
+  }
+
+  concat_op->LinksFrom(flatten_outs).LinksTo({concat_out});
+  return concat_out;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
paddle/fluid/framework/ir/graph_pattern_detector.h
View file @ eac5a0aa
...
@@ -766,6 +766,21 @@ struct ConvAffineChannel : public PatternBase {
  PATTERN_DECL_NODE(ac_out);  // Out
};

struct TransposeFlattenConcat : public PatternBase {
  TransposeFlattenConcat(PDPattern *pattern, const std::string &name_scope)
      : PatternBase(pattern, name_scope, "transpose_flatten_concat") {}

  PDNode *operator()(std::vector<PDNode *> conv_inputs, int times);

  std::string GetNodeName(const std::string &op_type) {
    return PDNodeName(name_scope_, repr_, id_, op_type);
  }

  PDNode *GetPDNode(const std::string &op_type) {
    return pattern->RetrieveNode(GetNodeName(op_type));
  }
};

}  // namespace patterns

// Link two ir::Nodes from each other.
...
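For orientation, a minimal sketch of how a fuse pass typically drives a PatternBase such as TransposeFlattenConcat; this mirrors the TransposeFlattenConcatFusePass later in this diff, and the repeat count of 2 and the scope name "demo_scope" are illustrative only:

    // Sketch, not upstream code: declare the pattern inputs, instantiate the
    // pattern, then register a rewrite handler with the detector.
    GraphPatternDetector gpd;
    std::vector<PDNode *> inputs;
    for (int i = 0; i < 2; i++) {
      inputs.push_back(gpd.mutable_pattern()
                           ->NewNode("x" + std::to_string(i))
                           ->assert_is_op_input("transpose2", "X")
                           ->AsInput());
    }
    patterns::TransposeFlattenConcat pattern(gpd.mutable_pattern(), "demo_scope");
    pattern(inputs, 2);
    gpd(graph.get(), [&](const GraphPatternDetector::subgraph_t &subgraph,
                         Graph *g) {
      // rewrite the matched transpose -> flatten -> concat subgraph here
    });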
paddle/fluid/framework/ir/lock_free_optimize_pass.cc
0 → 100644
View file @ eac5a0aa
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/ir/lock_free_optimize_pass.h"

#include <string>
#include <unordered_set>
#include <vector>

#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {
namespace ir {

const char kSumGradOpName[] = "sum";
// TODO(minqiyang): only support sgd at current time, please add
// other optimizers later.
const char kOptimizerType[] = "sgd";

std::unique_ptr<ir::Graph> LockFreeOptimizePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  PADDLE_ENFORCE(graph.get());

  // We could collect all weights' name from SGD, where
  // W1 <- SGD(W0, Grad0)
  std::unordered_set<std::string> weight_var_set;
  for (auto* node : graph->Nodes()) {
    if (IsOpNamed(node, kOptimizerType)) {
      auto& param_out_vars = node->Op()->Output("ParamOut");
      PADDLE_ENFORCE(param_out_vars.size() == 1u);
      weight_var_set.insert(param_out_vars[0]);
    }
  }

  // find all grad's merge op via weight name, where
  // Grad0 <- SUM(Grad1, Grad2, Grad3 ...)
  std::unordered_set<ir::Node*> grad_sum_op_set;
  for (ir::Node* node : graph->Nodes()) {
    if (IsOpNamed(node, kSumGradOpName)) {
      for (ir::Node* output : node->outputs) {
        // strip the last grad suffix @GRAD
        std::string var_name = output->Name();
        const std::string suffix(kGradVarSuffix);
        if (var_name != suffix && var_name.size() > suffix.size() &&
            var_name.substr(var_name.size() - suffix.size()) == suffix) {
          // if so then strip them off
          var_name = var_name.substr(0, var_name.size() - suffix.size());
          if (weight_var_set.find(var_name) != weight_var_set.end()) {
            grad_sum_op_set.insert(node);
            break;
          }
        }
      }
    }
  }

  // get the forward op and backward op pairs, where
  // out <- forward(X, W)
  // Grad1 <- backward(out, X')
  // Grad0 <- SUM(Grad1, Grad2, Grad3 ...)
  // W0 <- SGD(W1, Grad0)
  for (ir::Node* node : grad_sum_op_set) {
    for (ir::Node* merged_grad_var : node->outputs) {
      // find the optimizers connected with sum op
      if (IsVarNameEndsWith(merged_grad_var, kGradVarSuffix) &&
          merged_grad_var->outputs.size() == 1u) {
        ir::Node* opt_node = merged_grad_var->outputs[0];
        VLOG(3) << "Found opt node " << opt_node->Name();

        // find the backward op connected with sum op
        for (ir::Node* unmerged_grad_var : node->inputs) {
          if (IsVarNameContains(unmerged_grad_var, kGradVarSuffix) &&
              unmerged_grad_var->inputs.size() == 1u) {
            ir::Node* backward_op = unmerged_grad_var->inputs[0];
            VLOG(3) << "Found backward_op " << backward_op->Name();

            // find the forward op related to the backward op
            ir::Node* forward_op =
                FindForwardOpViaBackwardOp(graph.get(), backward_op);
            PADDLE_ENFORCE(forward_op);
            VLOG(3) << "Found forward_op " << forward_op->Name();

            Node* new_optimizer_node = CreateNewSGDNode(
                graph.get(), forward_op, backward_op, node, opt_node);
            PADDLE_ENFORCE(new_optimizer_node);
          }
        }
      }
    }
  }

  // Remove the sum_op and its outputs and connected Optimizers
  for (Node* sum_op : grad_sum_op_set) {
    for (Node* sum_op_output : sum_op->outputs) {
      for (Node* optimize_op : sum_op_output->outputs) {
        if (optimize_op->NodeType() == Node::Type::kOperation &&
            optimize_op->Name() == kOptimizerType) {
          VLOG(3) << "remove optimize_op: " << optimize_op->Name() << "_"
                  << optimize_op->id();
          graph->RemoveNode(optimize_op);
        }
      }
      VLOG(3) << "remove sum_op_output: " << sum_op_output->Name() << "_"
              << sum_op_output->id();
      graph->RemoveNode(sum_op_output);
    }
    VLOG(3) << "remove sum_op: " << sum_op->Name() << "_" << sum_op->id();
    graph->RemoveNode(sum_op);
  }

  for (auto* node : graph->Nodes()) {
    for (Node* output_node : node->outputs) {
      if (output_node->Name() == "sgd") {
        VLOG(3) << "Node link to SGD: " << node->Name() << "_" << node->id()
                << " --> " << output_node->Name() << "_" << output_node->id();
        for (Node* input_node : node->inputs) {
          VLOG(3) << "SGD Input link: " << input_node->Name() << "_"
                  << input_node->id() << " --> " << node->Name() << "_"
                  << node->id();
        }
      }
    }
  }

  return graph;
}

ir::Node* LockFreeOptimizePass::CreateNewSGDNode(
    ir::Graph* graph, ir::Node* forward_node, ir::Node* backward_node,
    ir::Node* grad_sum_node, ir::Node* optimize_node) const {
  PADDLE_ENFORCE(graph);
  PADDLE_ENFORCE(forward_node);
  PADDLE_ENFORCE(backward_node);
  PADDLE_ENFORCE(grad_sum_node);
  PADDLE_ENFORCE(optimize_node);

  // find the grad var node between the grad sum node and backward_node
  std::vector<ir::Node*> grad_vars =
      FindConnectedNode(backward_node, grad_sum_node);
  ir::Node* grad_node = nullptr;
  for (ir::Node* node : grad_vars) {
    if (!ir::IsControlDepVar(*node)) {
      grad_node = node;
    }
  }
  PADDLE_ENFORCE(grad_node);

  // create a new SGD node
  OpDesc* old_desc = optimize_node->Op();
  // keep with the same block between new optimizer and the old one
  OpDesc new_desc(*old_desc, old_desc->Block());
  new_desc.SetInput("Param", old_desc->Input("Param"));
  new_desc.SetInput("LearningRate", old_desc->Input("LearningRate"));
  new_desc.SetInput("Grad", std::vector<std::string>({grad_node->Name()}));
  new_desc.SetOutput("ParamOut", old_desc->Output("ParamOut"));

  std::vector<std::string> op_role_vars = boost::get<std::vector<std::string>>(
      new_desc.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName()));
  // replace the second op role var, because the grad name was
  // changed in new optimizer
  op_role_vars.pop_back();
  op_role_vars.push_back(grad_node->Name());
  new_desc.SetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(),
                   op_role_vars);
  new_desc.SetType(kOptimizerType);

  // set backward op's op role var, this will be used to
  // set device_id in multi_device_pass
  backward_node->Op()->SetAttr(
      framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), op_role_vars);
  // backward_node->Op()->SetAttr(
  //     framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), {});

  // keep with the same output nodes between new optimizer and the
  // old one
  Node* sgd_node = graph->CreateOpNode(&new_desc);

  // change all outputs of the optimize_node to the new one
  ReplaceAllDownstreamNode(optimize_node, sgd_node);

  // find connected node between forward node and optimize node
  // and replace the optimize node to new sgd node
  std::vector<ir::Node*> forward_opt_connected_nodes =
      FindConnectedNode(forward_node, optimize_node);
  for (ir::Node* node : forward_opt_connected_nodes) {
    ReplaceUpstreamNode(node, optimize_node, sgd_node);
  }

  // find connected node between backward node and optimize node
  // and replace the optimize node to new sgd node
  std::vector<ir::Node*> backward_opt_connected_nodes =
      FindConnectedNode(backward_node, optimize_node);
  for (ir::Node* node : backward_opt_connected_nodes) {
    ReplaceUpstreamNode(node, optimize_node, sgd_node);
  }

  // SGD must have only one param and LR in
  PADDLE_ENFORCE(old_desc->Input("LearningRate").size() == 1u);
  PADDLE_ENFORCE(old_desc->Input("Param").size() == 1u);

  // LR and weight nodes should be copied
  for (Node* upstream_node : optimize_node->inputs) {
    if (upstream_node->Name() == old_desc->Input("LearningRate")[0] ||
        upstream_node->Name() == old_desc->Input("Param")[0]) {
      ReplaceUpstreamNode(upstream_node, optimize_node, sgd_node);
    }
  }

  VLOG(3) << "Create new opt node " << sgd_node->Name() << "_"
          << sgd_node->id();

  return sgd_node;
}

std::vector<ir::Node*> LockFreeOptimizePass::FindConnectedNode(
    ir::Node* upstream_node, ir::Node* downstream_node) const {
  std::vector<ir::Node*> result;
  for (ir::Node* out_node : upstream_node->outputs) {
    for (ir::Node* in_node : downstream_node->inputs) {
      if (in_node == out_node) {
        result.push_back(in_node);
      }
    }
  }

  return result;
}

void LockFreeOptimizePass::ReplaceUpstreamNode(
    ir::Node* upstream_node, ir::Node* old_optimizer_node,
    ir::Node* new_optimizer_node) const {
  PADDLE_ENFORCE(upstream_node);
  PADDLE_ENFORCE(old_optimizer_node);
  PADDLE_ENFORCE(new_optimizer_node);

  // Remove the old_optimizer_node from upstream_node's outputs vector
  auto& output_node_vec = upstream_node->outputs;
  for (auto output_node_iter = output_node_vec.begin();
       output_node_iter != output_node_vec.end();) {
    if (*output_node_iter == old_optimizer_node) {
      output_node_vec.erase(output_node_iter);
      break;
    } else {
      ++output_node_iter;
    }
  }

  // Add the new_optimizer_node to upstream_node's outputs vector
  output_node_vec.emplace_back(new_optimizer_node);
  new_optimizer_node->inputs.emplace_back(upstream_node);
}

void LockFreeOptimizePass::ReplaceAllDownstreamNode(
    ir::Node* old_optimizer_node, ir::Node* new_optimizer_node) const {
  PADDLE_ENFORCE(old_optimizer_node);
  PADDLE_ENFORCE(new_optimizer_node);

  for (ir::Node* downstream_node : old_optimizer_node->outputs) {
    // Remove the old_optimizer_node from downstream_node's inputs vector
    auto& input_node_vec = downstream_node->inputs;
    for (auto input_node_iter = input_node_vec.begin();
         input_node_iter != input_node_vec.end();) {
      if (*input_node_iter == old_optimizer_node) {
        input_node_vec.erase(input_node_iter);
        break;
      } else {
        ++input_node_iter;
      }
    }

    // Add the new_optimizer_node to downstream_node's inputs vector
    input_node_vec.emplace_back(new_optimizer_node);
    new_optimizer_node->outputs.emplace_back(downstream_node);
  }
}

ir::Node* LockFreeOptimizePass::FindForwardOpViaBackwardOp(
    ir::Graph* graph, ir::Node* backward_node) const {
  PADDLE_ENFORCE(graph);
  PADDLE_ENFORCE(backward_node);

  // strip the suffix _grad of backward_node's name
  std::string forward_op_name = backward_node->Name();
  const std::string suffix("_grad");
  if (forward_op_name != suffix && forward_op_name.size() > suffix.size() &&
      forward_op_name.substr(forward_op_name.size() - suffix.size()) ==
          suffix) {
    // if so then strip them off
    forward_op_name =
        forward_op_name.substr(0, forward_op_name.size() - suffix.size());
  } else {
    LOG(WARNING) << "Illegal backward node's name " << backward_node->Name()
                 << " id " << backward_node->id();
    return nullptr;
  }

  for (ir::Node* node : graph->Nodes()) {
    if (node->Name() == forward_op_name) {
      if (node->outputs.size() == 0u) {
        // if forward_node has no output, then it has NO grad op
        continue;
      }

      // check whether all inputs of the backward_op that end with @GRAD
      // come from the outputs of this forward_op
      bool is_related_forward_node = true;
      for (ir::Node* backward_input : backward_node->inputs) {
        if (IsVarNameEndsWith(backward_input, kGradVarSuffix)) {
          bool meets_correct_output = false;
          for (ir::Node* forward_output : node->outputs) {
            if (forward_output->Name() + kGradVarSuffix ==
                backward_input->Name()) {
              meets_correct_output = true;
              break;
            }
          }

          if (!meets_correct_output) {
            is_related_forward_node = false;
            break;
          }
        }
      }

      if (is_related_forward_node) {
        return node;
      }
    }
  }

  return nullptr;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(lock_free_optimize_pass,
              paddle::framework::ir::LockFreeOptimizePass);
paddle/fluid/framework/ir/lock_free_optimize_pass.h
0 → 100644
View file @ eac5a0aa
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
#define PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_

#include <string>
#include <vector>

#include <boost/algorithm/string/predicate.hpp>

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace ir {

class Node;

/*
 * Remove the sum op of all gradients of the backward op.
 * And remove the dependencies of the optimizer related to the
 * same backward op.
 *
 * Before this pass:
 *
 * forward_op1   forward_op2
 *     |             |
 *  grad_op1      grad_op2
 *      \           /
 *       \         /
 *         sum_op
 *           |
 *         sgd_op
 *
 * After this pass:
 *
 * forward_op1   forward_op2
 *     |             |
 *  grad_op1      grad_op2
 *     |             |
 *  sgd_op1       sgd_op2
 *
 * sgd_op1 and sgd_op2 will update the same weight which holds the same
 * memory, so we could benefit from the acceleration
 */
class LockFreeOptimizePass : public Pass {
 public:
  virtual ~LockFreeOptimizePass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;

 private:
  // Create a new sgd node via current optimizer node
  ir::Node* CreateNewSGDNode(ir::Graph* graph, ir::Node* forward_node,
                             ir::Node* backward_node, ir::Node* grad_sum_node,
                             ir::Node* optimize_node) const;

  // Replace the input weight's optimizers
  void ReplaceUpstreamNode(ir::Node* upstream_node,
                           ir::Node* old_optimizer_node,
                           ir::Node* new_optimizer_node) const;

  // Replace the output weight's optimizers
  void ReplaceAllDownstreamNode(ir::Node* old_optimizer_node,
                                ir::Node* new_optimizer_node) const;

  // Find all weight variables in graph
  bool FindAllWeightVars(ir::Graph* graph) const;

  // Find the forward_op node via the backward_op node
  ir::Node* FindForwardOpViaBackwardOp(ir::Graph* graph,
                                       ir::Node* backward_node) const;

  std::vector<ir::Node*> FindConnectedNode(ir::Node* upstream_node,
                                           ir::Node* downstream_node) const;

  inline bool IsOpNamed(ir::Node* node, const std::string& name) const {
    PADDLE_ENFORCE(node);
    return node->NodeType() == Node::Type::kOperation && node->Name() == name;
  }

  inline bool IsVarNamed(ir::Node* node, const std::string& name) const {
    PADDLE_ENFORCE(node);
    return node->NodeType() == Node::Type::kVariable && node->Name() == name;
  }

  inline bool IsVarNameEndsWith(ir::Node* node,
                                const std::string& name) const {
    PADDLE_ENFORCE(node);
    return node->NodeType() == Node::Type::kVariable &&
           boost::algorithm::ends_with(node->Name(), name);
  }

  inline bool IsVarNameContains(ir::Node* node,
                                const std::string& name) const {
    PADDLE_ENFORCE(node);
    return node->NodeType() == Node::Type::kVariable &&
           node->Name().find(name) != std::string::npos;
  }

  inline bool IsControlDepFrom(ir::Node* ctrl_dep_node, ir::Node* node) const {
    PADDLE_ENFORCE(ctrl_dep_node);
    PADDLE_ENFORCE(node);
    return IsControlDepVar(*ctrl_dep_node) &&
           ctrl_dep_node->inputs.size() >= 1u &&
           ctrl_dep_node->inputs[0] == node;
  }
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle

#endif  // PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
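A minimal sketch of applying the pass registered above (the pass name comes from the REGISTER_PASS call; the construction of program_desc is assumed and not shown):

    // Sketch only: fetch the registered pass and run it over a graph.
    auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
        "lock_free_optimize_pass");
    std::unique_ptr<paddle::framework::ir::Graph> graph(
        new paddle::framework::ir::Graph(program_desc));  // program_desc assumed
    graph = pass->Apply(std::move(graph));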
paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc
0 → 100644
View file @ eac5a0aa
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"

#define MAX_CONCAT_INPUTS 200

namespace paddle {
namespace framework {
namespace ir {

PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
                                  const std::string& name_scope,
                                  int num_inputs) {
  auto is_concat_op_with_inputs = [](Node* x, int num) -> bool {
    return x && x->IsOp() && x->Op()->Type() == "concat" &&
           x->Op()->Input("X").size() == static_cast<size_t>(num);
  };

  auto is_nth_input_var_of_concat = [=](Node* x, int idx) -> bool {
    return x && x->IsVar() && VarLinksToOp(x, "concat") &&
           x->outputs.size() == 1 && IsNthInput(x, x->outputs[0], "X", idx) &&
           is_concat_op_with_inputs(x->outputs[0], num_inputs);
  };

  auto is_seqpool_op_with_pooltype_of_nth_input_of_concat =
      [=](Node* x, const std::string& type, int idx) -> bool {
    bool this_is_seqpool_op =
        x && x->IsOp() && x->Op()->Type() == "sequence_pool" &&
        x->Op()->HasAttr("pooltype") &&
        boost::get<std::string>(x->Op()->GetAttr("pooltype")) == type &&
        x->outputs.size() == 2;  // seqpool should only have 2 outputs
    bool satisfied_all = this_is_seqpool_op;
    if (this_is_seqpool_op) {
      // Only one output of seqpool_op is nth_input_var of concat,
      // the other one should be unused empty var.
      if (is_nth_input_var_of_concat(x->outputs[0], idx)) {
        satisfied_all = satisfied_all && x->outputs[1]->IsVar() &&
                        x->outputs[1]->outputs.empty();
      } else {
        satisfied_all =
            satisfied_all && is_nth_input_var_of_concat(x->outputs[1], idx) &&
            x->outputs[0]->IsVar() && x->outputs[0]->outputs.size() == 0;
      }
    }
    return satisfied_all;
  };

  auto* concat_op = pattern->NewNode(
      [=](Node* x) { return is_concat_op_with_inputs(x, num_inputs); },
      name_scope + "/concat_op");
  concat_op->assert_op_attr<int>("axis", 1);

  auto* concat_out_var = pattern->NewNode(
      [=](Node* x) {
        return x && x->IsVar() && VarLinksFromOp(x, "concat") &&
               x->inputs.size() == 1 &&
               is_concat_op_with_inputs(x->inputs[0], num_inputs);
      },
      name_scope + "/concat_out_var");
  concat_out_var->assert_is_only_output_of_op("concat");

  std::vector<PDNode*> seqpool_ops_input_var(num_inputs);
  std::vector<PDNode*> seqpool_ops_output_var(num_inputs);
  std::vector<PDNode*> seqpool_ops_output_unused_var(num_inputs);
  std::vector<PDNode*> seqpool_ops(num_inputs);

  for (int i = 0; i < num_inputs; ++i) {
    seqpool_ops_output_var[i] = pattern->NewNode(
        [=](Node* x) {
          return x && x->IsVar() && is_nth_input_var_of_concat(x, i) &&
                 x->inputs.size() == 1 &&
                 is_seqpool_op_with_pooltype_of_nth_input_of_concat(
                     x->inputs[0], "SUM", i);
        },
        name_scope + "/sequence_pool_out_" + std::to_string(i));

    seqpool_ops_output_unused_var[i] = pattern->NewNode(
        [=](Node* x) {
          return x && x->IsVar() && x->inputs.size() == 1 &&
                 x->outputs.size() == 0 &&
                 is_seqpool_op_with_pooltype_of_nth_input_of_concat(
                     x->inputs[0], "SUM", i);
        },
        name_scope + "/sequence_pool_unused_out_" + std::to_string(i));

    seqpool_ops[i] = pattern->NewNode(
        [=](Node* x) {
          return x && x->IsOp() &&
                 is_seqpool_op_with_pooltype_of_nth_input_of_concat(x, "SUM",
                                                                    i);
        },
        name_scope + "/sequence_pool_op_" + std::to_string(i));

    seqpool_ops_input_var[i] = pattern->NewNode(
        [=](Node* x) {
          bool basic = x && x->IsVar() && x->outputs.size() >= 1;
          bool next_is_fine = false;
          for (auto* o : x->outputs) {
            if (is_seqpool_op_with_pooltype_of_nth_input_of_concat(o, "SUM",
                                                                   i)) {
              next_is_fine = true;
              break;
            }
          }
          return basic && next_is_fine;
        },
        name_scope + "/sequence_pool_in_" + std::to_string(i));

    // Links
    seqpool_ops[i]
        ->LinksFrom({seqpool_ops_input_var[i]})
        .LinksTo({seqpool_ops_output_var[i], seqpool_ops_output_unused_var[i]});
  }
  concat_op->LinksFrom(seqpool_ops_output_var).LinksTo({concat_out_var});
  return concat_out_var;
}

int BuildFusion(Graph* graph, const std::string& name_scope, int num_inputs) {
  GraphPatternDetector gpd;
  auto* pattern = gpd.mutable_pattern();
  BuildSeqPoolConcatPattern(pattern, name_scope, num_inputs);

  auto retrieve_node = [](const std::string& name,
                          const GraphPatternDetector::subgraph_t& subgraph,
                          const PDPattern& pat) -> Node* {
    PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)),
                   "pattern has no Node called %s", name.c_str());
    Node* p = subgraph.at(pat.RetrieveNode(name));
    PADDLE_ENFORCE_NOT_NULL(p, "subgraph has no node %s", name.c_str());
    return p;
  };

  int fusion_count{0};
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    VLOG(4) << "handle SeqPool Concat fuse";
    std::vector<std::string> input_names(num_inputs);
    std::vector<Node*> input_vars(num_inputs);
    auto& fused_pattern = gpd.pattern();
    for (int i = 0; i < num_inputs; ++i) {
      input_vars[i] =
          retrieve_node(name_scope + "/sequence_pool_in_" + std::to_string(i),
                        subgraph, fused_pattern);
      input_names[i] = input_vars[i]->Name();
    }
    auto* concat_op =
        retrieve_node(name_scope + "/concat_op", subgraph, fused_pattern);
    auto* concat_out_var =
        retrieve_node(name_scope + "/concat_out_var", subgraph, fused_pattern);
    auto* seqpool_op0 = retrieve_node(name_scope + "/sequence_pool_op_0",
                                      subgraph, fused_pattern);

    // Create New OpDesc
    OpDesc op_desc;
    op_desc.SetType("fusion_seqpool_concat");
    op_desc.SetInput("X", input_names);
    op_desc.SetAttr("pooltype", seqpool_op0->Op()->GetAttr("pooltype"));
    op_desc.SetAttr("axis", concat_op->Op()->GetAttr("axis"));
    op_desc.SetOutput("Out", {concat_out_var->Name()});
    auto* op = graph->CreateOpNode(&op_desc);
    for (size_t i = 0; i < input_vars.size(); ++i) {
      IR_NODE_LINK_TO(input_vars[i], op);
    }
    IR_NODE_LINK_TO(op, concat_out_var);

    std::unordered_set<const Node*> marked_nodes;
    for (auto& item : subgraph) {
      marked_nodes.insert(item.second);
    }
    for (size_t i = 0; i < input_vars.size(); ++i) {
      marked_nodes.erase(input_vars[i]);
    }
    marked_nodes.erase(concat_out_var);
    GraphSafeRemoveNodes(graph, marked_nodes);
    ++fusion_count;
  };

  gpd(graph, handler);
  return fusion_count;
}

std::unique_ptr<ir::Graph> SeqPoolConcatFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  FusePassBase::Init(name_scope_, graph.get());
  int fusion_count = 0;
  for (int i = MAX_CONCAT_INPUTS; i > 0; --i) {
    fusion_count +=
        BuildFusion(graph.get(), name_scope_ + "/" + std::to_string(i), i);
  }
  AddStatis(fusion_count);

  return graph;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(seqpool_concat_fuse_pass,
              paddle::framework::ir::SeqPoolConcatFusePass);
paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
0 → 100644
View file @ eac5a0aa
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#pragma once

#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

namespace paddle {
namespace framework {
namespace ir {

/**
 * Fuse SequencePool (only with SUM pooltype for now) and Concat;
 *
 * Before fuse:
 *      |         |           |
 *   seq_pool, seq_pool, ... seq_pool
 *      \         |   ...    /
 *             concat
 *               |
 * After fuse:
 *      \         |          /
 *       FusionSeqPoolConcat
 *               |
 */
class SeqPoolConcatFusePass : public FusePassBase {
 public:
  virtual ~SeqPoolConcatFusePass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;

  const std::string name_scope_{"seqpool_concat_fuse"};
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
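Node accounting for one full match, worked out from the pattern above (illustrative, but it agrees with what the tester below asserts): with n concat inputs the pass removes n sequence_pool ops, n pooled-out vars, n unused-out vars, and the concat op, i.e. 3n + 1 nodes, and adds a single fusion_seqpool_concat op.

    // Net change per fused match: the graph shrinks by 3n nodes, which is
    // exactly the tester's EXPECT_EQ(after, before - num * 3).
    int ExpectedNodeCountAfterFuse(int before, int n) {
      return before - (3 * n + 1) + 1;  // == before - 3 * n
    }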
paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
0 → 100644
View file @ eac5a0aa
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"

namespace paddle {
namespace framework {
namespace ir {

void SetOp(ProgramDesc* prog, const std::string& type,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs) {
  auto* op = prog->MutableBlock(0)->AppendOp();
  op->SetType(type);
  if (type == "sequence_pool") {
    op->SetInput("X", {inputs[0]});
    std::string pooltype = "SUM";
    op->SetAttr("pooltype", pooltype);
    op->SetOutput("MaxIndex", {outputs[0]});
    op->SetOutput("Out", {outputs[1]});
  } else if (type == "concat") {
    op->SetInput("X", inputs);
    op->SetAttr("axis", 1);
    op->SetOutput("Out", {outputs[0]});
  } else {
    op->SetInput("X", inputs);
    op->SetOutput("Out", outputs);
  }
  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
              static_cast<int>(OpRole::kForward));
}

int CountOpType(const ir::Graph* graph,
                const std::string& op_type = "fusion_seqpool_concat") {
  int count = 0;
  for (auto* node : graph->Nodes()) {
    if (node->IsOp() && node->Op()->Type() == op_type) {
      ++count;
    }
  }
  return count;
}

std::unique_ptr<ir::Graph> GetNumNodesOfBeforeAfter(
    std::unique_ptr<ir::Graph> graph, int* before, int* after,
    const std::string& pass_type = "seqpool_concat_fuse_pass") {
  auto pass = PassRegistry::Instance().Get(pass_type);
  *before = graph->Nodes().size();
  graph = pass->Apply(std::move(graph));
  *after = graph->Nodes().size();
  return graph;
}

/*
 * Before fuse:
 *    a       b       c
 *    |       |       |
 *   op1     op2     op3
 *   / \     / \     / \
 *  d   e   f   g   h   i
 *       \      |      /
 *          concat
 *            |
 *            j
 * Type of op1, op2 and op3 are sequence_pool, with "SUM" pooltype attr
 *
 * After fuse:
 *    a       b       c
 *     \      |      /
 *   fusion_seqpool_concat
 *            |
 *            j
 */
TEST(SeqPoolConcatFusePass, basic) {
  ProgramDesc prog;
  for (auto& v : std::vector<std::string>(
           {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"})) {
    auto* var = prog.MutableBlock(0)->Var(v);
    var->SetType(proto::VarType::LOD_TENSOR);
  }

  SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
        std::vector<std::string>({"d", "e"}));
  SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
        std::vector<std::string>({"f", "g"}));
  SetOp(&prog, "sequence_pool", std::vector<std::string>({"c"}),
        std::vector<std::string>({"h", "i"}));
  SetOp(&prog, "concat", std::vector<std::string>({"e", "g", "i"}),
        std::vector<std::string>({"j"}));

  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  int before, after;
  graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
  // Remove 10 Nodes: op1, op2, op3, d, e, f, g, h, i, concat_op
  // Add 1 Node: fusion_seqpool_concat
  EXPECT_EQ(after, before - 9);
  EXPECT_EQ(CountOpType(graph.get()), 1);
}

/*
 * Before fuse:
 *    a           b
 *    |          / \
 *   op1      op2   op3
 *   / \      / \     \
 *  c   d    e   f     g
 *       \      /
 *       concat
 *          |
 *          h
 * Type of op1 and op2 are sequence_pool, with "SUM" pooltype attr
 *
 * After fuse:
 *    a            b
 *     \          / \
 *  fusion_seqpool_concat  op3
 *          |               |
 *          h               g
 */
TEST(SeqPoolConcatFusePass, advanced) {
  ProgramDesc prog;
  for (auto& v :
       std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h"})) {
    auto* var = prog.MutableBlock(0)->Var(v);
    var->SetType(proto::VarType::LOD_TENSOR);
  }

  SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
        std::vector<std::string>({"c", "d"}));
  SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
        std::vector<std::string>({"e", "f"}));
  SetOp(&prog, "op3", std::vector<std::string>({"b"}),
        std::vector<std::string>({"g"}));
  SetOp(&prog, "concat", std::vector<std::string>({"d", "f"}),
        std::vector<std::string>({"h"}));

  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  int before, after;
  graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
  // Remove 7 Nodes: op1, op2, c, d, e, f, concat_op
  // Add 1 Node: fusion_seqpool_concat
  EXPECT_EQ(after, before - 6);
  EXPECT_EQ(CountOpType(graph.get()), 1);
}

ProgramDesc BuildProgramDesc(int num_inputs_of_concat) {
  ProgramDesc prog;
  auto new_var = [&](const std::string& name) {
    auto* var = prog.MutableBlock(0)->Var(name);
    var->SetType(proto::VarType::LOD_TENSOR);
  };
  std::vector<std::string> concat_inputs;
  for (int i = 0; i < num_inputs_of_concat; ++i) {
    std::string prefix = "seqpool_op_" + std::to_string(i);
    new_var(prefix + "in");
    new_var(prefix + "out");
    new_var(prefix + "out_unused");
    SetOp(&prog, "sequence_pool", std::vector<std::string>({prefix + "in"}),
          std::vector<std::string>({prefix + "out", prefix + "out_unused"}));
    concat_inputs.push_back(prefix + "out");
  }
  SetOp(&prog, "concat", concat_inputs,
        std::vector<std::string>({"concat_out"}));
  return prog;
}

// test more inputs of concat
TEST(SeqPoolConcatFusePass, more_inputs) {
  for (int num : {1, 2, 10}) {
    ProgramDesc prog = BuildProgramDesc(num);
    std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
    int before, after;
    graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
    // Remove Nodes: n * (seqpool_op, out, out_unused), and concat_op
    // Add Node: fusion_seqpool_concat op
    EXPECT_EQ(after, before - num * 3);
    EXPECT_EQ(CountOpType(graph.get()), 1);
  }
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

USE_PASS(seqpool_concat_fuse_pass);
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
0 → 100644
View file @ eac5a0aa
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <string>
#include <vector>

#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h"

namespace paddle {
namespace framework {
namespace ir {

template <int times>
std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  const std::string pattern_name =
      "transpose_flatten" + std::to_string(times) + "_concat_fuse";
  FusePassBase::Init(pattern_name, graph.get());

  GraphPatternDetector gpd;
  std::vector<PDNode *> input_nodes;
  for (int i = 0; i < times; i++) {
    input_nodes.push_back(gpd.mutable_pattern()
                              ->NewNode("x" + std::to_string(i))
                              ->assert_is_op_input("transpose2", "X")
                              ->AsInput());
  }

  patterns::TransposeFlattenConcat pattern(gpd.mutable_pattern(),
                                           pattern_name);
  pattern(input_nodes, times);

  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                     Graph *g) {
    const int kNumFields = 5;
    const int kTransOffset = 1;
    const int kTransOutOffset = 2;
    const int kFlattenOffset = 3;
    const int kFlattenOutOffset = 4;
    std::vector<Node *> nodes;

    for (int i = 0; i < times; i++) {
      PADDLE_ENFORCE(
          subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
      PADDLE_ENFORCE(
          subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))));
      PADDLE_ENFORCE(
          subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))));
      PADDLE_ENFORCE(
          subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))));
      PADDLE_ENFORCE(subgraph.at(input_nodes[i]));

      nodes.push_back(subgraph.at(input_nodes[i]));
      nodes.push_back(
          subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
      nodes.push_back(
          subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))));
      nodes.push_back(
          subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))));
      nodes.push_back(
          subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))));
    }

    Node *concat_op = subgraph.at(pattern.GetPDNode("concat"));
    Node *concat_out = subgraph.at(pattern.GetPDNode("concat_out"));
    std::vector<std::string> input_names;
    std::vector<int> trans_axis = boost::get<std::vector<int>>(
        nodes[kTransOffset]->Op()->GetAttr("axis"));
    int flatten_axis =
        boost::get<int>(nodes[kFlattenOffset]->Op()->GetAttr("axis"));
    int concat_axis = boost::get<int>(concat_op->Op()->GetAttr("axis"));
    std::string output_name = concat_out->Name();

    for (int i = 0; i < times; i++) {
      input_names.push_back(nodes[i * kNumFields]->Name());
    }

    framework::OpDesc new_op_desc;
    new_op_desc.SetType("fusion_transpose_flatten_concat");
    new_op_desc.SetInput("X", input_names);
    new_op_desc.SetAttr("trans_axis", trans_axis);
    new_op_desc.SetAttr("flatten_axis", flatten_axis);
    new_op_desc.SetAttr("concat_axis", concat_axis);
    new_op_desc.SetOutput("Out", {output_name});
    new_op_desc.Flush();

    // Create a new node for the fused op.
    auto *new_conv_op = graph->CreateOpNode(&new_op_desc);

    std::unordered_set<const Node *> delete_nodes;

    for (int i = 0; i < times; i++) {
      nodes[i * kNumFields]->outputs.push_back(new_conv_op);
      new_conv_op->inputs.push_back(nodes[i * kNumFields]);
      delete_nodes.insert(nodes[i * kNumFields + kTransOffset]);
      delete_nodes.insert(nodes[i * kNumFields + kTransOutOffset]);
      delete_nodes.insert(nodes[i * kNumFields + kFlattenOffset]);
      delete_nodes.insert(nodes[i * kNumFields + kFlattenOutOffset]);
    }

    delete_nodes.insert(concat_op);

    new_conv_op->outputs.push_back(concat_out);
    concat_out->inputs.push_back(new_conv_op);

    // Delete the unneeded nodes.
    GraphSafeRemoveNodes(graph.get(), delete_nodes);
  };

  gpd(graph.get(), handler);
  return graph;
}

template class TransposeFlattenConcatFusePass<1>;
template class TransposeFlattenConcatFusePass<3>;
template class TransposeFlattenConcatFusePass<4>;
template class TransposeFlattenConcatFusePass<5>;
template class TransposeFlattenConcatFusePass<6>;

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(transpose_flatten_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<1>);
REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<3>);
REGISTER_PASS(transpose_flatten4_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<4>);
REGISTER_PASS(transpose_flatten5_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<5>);
REGISTER_PASS(transpose_flatten6_concat_fuse_pass,
              paddle::framework::ir::TransposeFlattenConcatFusePass<6>);
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
0 → 100644
View file @ eac5a0aa
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

namespace paddle {
namespace framework {
namespace ir {

// There may be many transpose-flatten structures in a model, and the output of
// these structures will be used as inputs to the concat Op. This pattern will
// be detected by our pass. The times here represents the repeat times of this
// structure.
template <int times>
class TransposeFlattenConcatFusePass : public FusePassBase {
 public:
  virtual ~TransposeFlattenConcatFusePass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
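Since the template parameter fixes the branch count at compile time, each count is registered as its own pass; a hedged sketch of picking the matching registered variant (pass names taken from the REGISTER_PASS calls above; the count of 4 is illustrative):

    // Sketch only: select the registered pass for a given repeat count.
    int times = 4;
    std::string pass_name =
        (times == 1) ? "transpose_flatten_concat_fuse_pass"
                     : "transpose_flatten" + std::to_string(times) +
                           "_concat_fuse_pass";
    auto pass = paddle::framework::ir::PassRegistry::Instance().Get(pass_name);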
paddle/fluid/framework/ngraph_bridge.cc
View file @ eac5a0aa
...
@@ -32,8 +32,11 @@ std::map<std::string,
                           std::string, std::shared_ptr<ngraph::Node>>>)>>
     NgraphBridge::NG_NODE_MAP = {
         {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode},
+        {"mean", paddle::operators::ngraphs::BuildMeanNode},
+        {"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode},
         {"mul", paddle::operators::ngraphs::BuildMulNode},
         {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
+        {"scale", paddle::operators::ngraphs::BuildScaleNode},
         {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
         {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>},
         {"top_k", paddle::operators::ngraphs::BuildTopKNode}};
...
paddle/fluid/framework/parallel_executor.cc
View file @ eac5a0aa
...
@@ -193,15 +193,14 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
     Scope *scope, const std::vector<Scope *> &local_scopes,
-    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
-    size_t num_trainers, size_t trainer_id)
+    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
   member_->build_strategy_ = build_strategy;
   member_->use_all_reduce_ =
       build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
-  member_->nranks_ = num_trainers * places.size();
+  member_->nranks_ = build_strategy.num_trainers_ * places.size();
   if (!member_->use_all_reduce_) {
     PADDLE_ENFORCE(places.size() > 1,
...
@@ -253,7 +252,8 @@ ParallelExecutor::ParallelExecutor(
   }
   member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
-      member_->places_, nccl_id, num_trainers, trainer_id));
+      member_->places_, nccl_id, build_strategy.num_trainers_,
+      build_strategy.trainer_id_));
 #else
   PADDLE_THROW("Not compiled with CUDA");
 #endif
...
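With this change the trainer topology rides on BuildStrategy instead of trailing constructor arguments; a hedged sketch of the call-site migration (field names from the diff above, other constructor arguments elided, values illustrative):

    // Before: ParallelExecutor(..., exec_strategy, build_strategy,
    //                          /*num_trainers=*/2, /*trainer_id=*/0);
    // After: carry the same values on the strategy object instead.
    paddle::framework::details::BuildStrategy build_strategy;
    build_strategy.num_trainers_ = 2;
    build_strategy.trainer_id_ = 0;
    // ParallelExecutor(..., exec_strategy, build_strategy);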
paddle/fluid/framework/parallel_executor.h
View file @ eac5a0aa
...
@@ -50,8 +50,7 @@ class ParallelExecutor {
                             const std::string &loss_var_name, Scope *scope,
                             const std::vector<Scope *> &local_scopes,
                             const ExecutionStrategy &exec_strategy,
-                            const BuildStrategy &build_strategy,
-                            size_t num_trainers = 1, size_t trainer_id = 0);
+                            const BuildStrategy &build_strategy);
 
   ~ParallelExecutor();
...
paddle/fluid/framework/scope.cc
View file @ eac5a0aa
...
@@ -87,11 +87,12 @@ Variable* Scope::Var(const std::string& name) {
 }
 
 Variable* Scope::Var(std::string* name) {
-  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
-  SCOPE_VARS_WRITER_LOCK
+  auto new_name = std::to_string(reinterpret_cast<uintptr_t>(this)) + "." +
+                  std::to_string(vars_.size());
   if (name != nullptr) {
     *name = new_name;
   }
+  SCOPE_VARS_WRITER_LOCK
   return VarInternal(new_name);
 }
...
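A minimal usage sketch of this overload (the generated name now follows the std::to_string format above; the exact value is illustrative):

    // Sketch only: create an auto-named variable and read its name back.
    paddle::framework::Scope scope;
    std::string generated_name;
    auto* var = scope.Var(&generated_name);  // e.g. "140234651572224.0"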
paddle/fluid/framework/var_type_traits.cc
View file @ eac5a0aa
...
@@ -105,13 +105,15 @@ struct VarIdToTypeIndexMapHolder {
 }  // namespace detail
 
-const std::type_index &ToTypeIndex(int var_id) {
+const std::type_index &VarTraitIdToTypeIndex(int var_id) {
   return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id);
 }
 
-const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); }
+const char *ToTypeName(int var_id) {
+  return VarTraitIdToTypeIndex(var_id).name();
+}
 
-int ToTypeId(const std::type_index &type) {
+int TypeIndexToVarTraitId(const std::type_index &type) {
   return detail::VarIdToTypeIndexMapHolder::ToTypeId(type);
 }
...
paddle/fluid/framework/var_type_traits.h
View file @ eac5a0aa
...
@@ -66,8 +66,8 @@ namespace paddle {
 namespace framework {
 
 const char *ToTypeName(int var_id);
-const std::type_index &ToTypeIndex(int var_id);
-int ToTypeId(const std::type_index &type);
+const std::type_index &VarTraitIdToTypeIndex(int var_id);
+int TypeIndexToVarTraitId(const std::type_index &type);
 
 namespace detail {
...
paddle/fluid/framework/var_type_traits_test.cc
View file @ eac5a0aa
...
@@ -45,10 +45,11 @@ struct TypeIndexChecker {
     constexpr auto kId = VarTypeTrait<Type>::kId;
     std::type_index actual_type(typeid(Type));
     EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name()));
-    EXPECT_EQ(ToTypeIndex(kId), actual_type);
-    EXPECT_EQ(ToTypeId(actual_type), kId);
-    EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type);
-    EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId);
+    EXPECT_EQ(VarTraitIdToTypeIndex(kId), actual_type);
+    EXPECT_EQ(TypeIndexToVarTraitId(actual_type), kId);
+    EXPECT_EQ(VarTraitIdToTypeIndex(TypeIndexToVarTraitId(actual_type)),
+              actual_type);
+    EXPECT_EQ(TypeIndexToVarTraitId(VarTraitIdToTypeIndex(kId)), kId);
     EXPECT_TRUE(var_id_set->count(kId) == 0);               // NOLINT
     EXPECT_TRUE(type_index_set->count(actual_type) == 0);   // NOLINT
...
paddle/fluid/imperative/layer.h
View file @ eac5a0aa
...
@@ -77,6 +77,7 @@ class PreparedOp {
   framework::OperatorWithKernel::OpKernelFunc func;
   platform::DeviceContext* dev_ctx;
 };
 
+class OpBase;
 
 class VarBase {
...
paddle/fluid/inference/analysis/analyzer_tester.cc
View file @ eac5a0aa
...
@@ -80,8 +80,8 @@ void TestWord2vecPrediction(const std::string& model_path) {
        i++) {
     LOG(INFO) << "data: "
               << static_cast<float *>(outputs.front().data.data())[i]
               << " result: " << result[i];
-    PADDLE_ENFORCE(static_cast<float *>(outputs.front().data.data())[i],
-                   result[i]);
+    EXPECT_NEAR(static_cast<float *>(outputs.front().data.data())[i],
+                result[i], 1e-3);
   }
 }
...
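Switching from PADDLE_ENFORCE to EXPECT_NEAR turns a hard runtime abort into a proper gtest floating-point comparison with an absolute tolerance; illustrative semantics:

    // EXPECT_NEAR(a, b, tol) passes iff |a - b| <= tol; the tol = 1e-3 used
    // above is an absolute tolerance suited to float32 inference outputs.
    EXPECT_NEAR(0.1234f, 0.1241f, 1e-3);  // passes: difference is 7e-4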
paddle/fluid/inference/analysis/argument.h
View file @ eac5a0aa
...
@@ -123,8 +123,6 @@ struct Argument {
   DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
   DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
   DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
-  DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller,
-                      std::function<bool(const framework::ir::Node*)>);
   DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
...
paddle/fluid/inference/analysis/ir_pass_manager.cc
View file @ eac5a0aa
...
@@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument,
   for (const std::string &pass_name : passes) {
     auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
 
-    // Set some pass attributes.
-    if (pass_name == "ir_analysis_pass") {
-      pass->Set("tensorrt_node_teller",
-                new SubgraphDetector::NodeInsideSubgraphTeller(
-                    argument->tensorrt_node_teller()));
-    }
-
     if (pass_name == "graph_viz_pass") {
       std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
                                   (pre_pass.empty() ? "origin" : pre_pass) +
...
@@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument,
     }
 
     if (pass_name == "tensorrt_subgraph_pass") {
-      PADDLE_ENFORCE(argument->tensorrt_node_teller_valid());
-      pass->SetNotOwned("tensorrt_node_teller",
-                        argument->tensorrt_node_teller_ptr());
       pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
       pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
       pass->Set("min_subgraph_size",
...
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
View file @ eac5a0aa
 cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
-cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector)
-set(analysis_deps ${analysis_deps}
-    subgraph_detector tensorrt_subgraph_pass
-    CACHE INTERNAL "")
-set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
-file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
-set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
+if (TENSORRT_FOUND)
+  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
+  set(analysis_deps ${analysis_deps}
+      subgraph_detector tensorrt_subgraph_pass
+      CACHE INTERNAL "")
+  set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
+  file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
+  set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
+endif()
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
View file @ eac5a0aa
...
@@ -20,6 +20,7 @@
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
 #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
+#include "paddle/fluid/inference/tensorrt/op_teller.h"
 
 namespace paddle {
 namespace inference {
...
@@ -35,8 +36,10 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
     std::unique_ptr<framework::ir::Graph> graph) const {
   framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
 
-  auto teller =
-      Get<SubgraphDetector::NodeInsideSubgraphTeller>("tensorrt_node_teller");
+  auto teller = [](const framework::ir::Node *node) {
+    if (!node->IsOp() || !node->Op()) return false;
+    return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
+  };
 
   SubGraphFuser fuser(graph.get(), teller,
                       Get<int>("min_subgraph_size") /*min subgraph size*/);
...
@@ -232,7 +235,6 @@ std::vector<std::string> ExtractParameters(
 REGISTER_PASS(tensorrt_subgraph_pass,
               paddle::inference::analysis::TensorRtSubgraphPass)
-    .RequirePassAttr("tensorrt_node_teller")
     .RequirePassAttr("max_batch_size")
     .RequirePassAttr("workspace_size")
     .RequirePassAttr("min_subgraph_size");
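The convertibility check thus moves from a pass attribute to the OpTeller singleton; an illustrative stand-alone query using the same call as the teller lambda above (op_desc is an assumed framework::OpDesc for the node under consideration):

    // Sketch only: ask the global teller whether one op can go to TensorRT.
    bool convertible =
        paddle::inference::tensorrt::OpTeller::Global().Tell("relu", op_desc);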
paddle/fluid/inference/analysis/passes/CMakeLists.txt
View file @ eac5a0aa
...
@@ -7,4 +7,5 @@ set(analysis_deps ${analysis_deps}
         ir_graph_build_pass
         ir_analysis_pass
         analysis_passes
+        subgraph_detector
         CACHE INTERNAL "")
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
View file @ eac5a0aa
...
@@ -27,9 +27,6 @@ namespace analysis {
 void IrAnalysisComposePass::RunImpl(Argument *argument) {
   ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
-  if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
-    InitTensorRTAttrs(argument);
-  }
   ApplyIrPasses(argument);
   CollectFusionStatis(argument);
 }
...
@@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const {
   return "ir-analysis-compose-pass";
 }
 
-void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
-  if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
-    LOG(INFO) << "Initing TensorRT pass";
-    argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) {
-      std::unordered_set<std::string> teller_set(
-          {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
-           "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-           "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
-           "conv2d_transpose", "leaky_relu"});
-      if (!node->IsOp()) return false;
-      if (teller_set.count(node->Op()->Type())) {
-        return true;
-      } else {
-        return false;
-      }
-    });
-  }
-}
 
 void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
   std::vector<std::string> passes({
       "ir_graph_build_pass", "ir_analysis_pass",
...
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
@@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass {
   std::string repr() const override;

  private:
-  void InitTensorRTAttrs(Argument *argument);
-
   void ApplyIrPasses(Argument *argument);

   void CollectFusionStatis(Argument *argument);
...
paddle/fluid/inference/api/analysis_config.cc
@@ -127,6 +127,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
   use_tensorrt_ = true;
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
+  Update();
 }

 void contrib::AnalysisConfig::Update() {
...
paddle/fluid/inference/api/analysis_predictor.h
@@ -35,8 +35,11 @@ using framework::proto::ProgramDesc;
 using framework::NaiveExecutor;
 using contrib::AnalysisConfig;

-/* This predictor is based on the original native predictor with IR and Analysis
- * support. It will optimize IR and Parameters in the runtime.
+/** \brief This predictor is based on the original native predictor with IR and
+ * Analysis support.
+ *
+ * It will optimize IR and Parameters in the runtime.
+ *
  * TODO(Superjomn) Replace the Navive predictor?
  */
 class AnalysisPredictor : public PaddlePredictor {
...
paddle/fluid/inference/api/api_impl.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
-
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
...
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -92,10 +92,10 @@ if(WITH_MKL)
   if(NOT WIN32)
     set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
         ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
-  else(WIN32)
+  else()
     set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml${CMAKE_SHARED_LIBRARY_SUFFIX}
         ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX})
-  endif(WIN32)
+  endif()
   set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
   if(EXISTS ${MKLDNN_PATH})
     include_directories("${MKLDNN_PATH}/include")
...
@@ -128,8 +128,8 @@ else()
       ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
       ${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash
       snappystream ${EXTERNAL_LIB})
-  # NOTE(dzhwinter) shlwapi is deprecated.
-  set(DEPS ${DEPS} libcmt shlwapi)
+  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+  set(DEPS ${DEPS} libcmt ${os_dependency_modules})
 endif(NOT WIN32)

 if(WITH_GPU)
...
paddle/fluid/inference/api/demo_ci/run.sh
@@ -116,6 +116,10 @@
         --modeldir=$DATA_DIR/mobilenet/model \
         --data=$DATA_DIR/mobilenet/data.txt \
         --refer=$DATA_DIR/mobilenet/result.txt
+        if [ $? -ne 0 ]; then
+            echo "trt demo trt_mobilenet_demo runs fail."
+            exit 1
+        fi
     fi
 done
 set +x
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
@@ -38,8 +38,8 @@ void Main() {
   std::unique_ptr<PaddlePredictor> predictor;
   paddle::contrib::AnalysisConfig config;
   config.EnableUseGpu(100, 0);
-  config.SetModel(FLAGS_modeldir + "/__params__",
-                  FLAGS_modeldir + "/__model__");
+  config.SetModel(FLAGS_modeldir + "/__model__",
+                  FLAGS_modeldir + "/__params__");
   config.EnableTensorRtEngine();
   predictor = CreatePaddlePredictor(config);
...
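The demo fix above only swaps argument order: `SetModel` takes the program file first and the parameters file second, so passing `__params__` first loaded the wrong files. A hedged sketch of the corrected call sequence (paths and pool sizes are placeholders):

```cpp
#include <memory>
#include <string>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

std::unique_ptr<paddle::PaddlePredictor> MakeTrtPredictor(
    const std::string &modeldir) {
  paddle::contrib::AnalysisConfig config;
  config.EnableUseGpu(100 /*memory_pool_init_size_mb*/, 0 /*device_id*/);
  // Program description first, combined parameters file second.
  config.SetModel(modeldir + "/__model__", modeldir + "/__params__");
  config.EnableTensorRtEngine();  // defaults: 1<<20 workspace, max batch 1
  return paddle::CreatePaddlePredictor(config);
}
```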
paddle/fluid/inference/api/helper.h
@@ -204,11 +204,14 @@ static std::string DescribeTensor(const PaddleTensor &tensor) {
     os << to_string(l) << "; ";
   }
   os << "\n";
-  os << " - data: ";
+  os << " - memory length: " << tensor.data.length();
+  os << "\n";

+  os << " - data: ";
   int dim = VecReduceToInt(tensor.shape);
+  float *pdata = static_cast<float *>(tensor.data.data());
   for (int i = 0; i < dim; i++) {
-    os << static_cast<float *>(tensor.data.data())[i] << " ";
+    os << pdata[i] << " ";
   }
   os << '\n';
   return os.str();
...
@@ -224,10 +227,12 @@ static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) {
     os << to_string(l) << "; ";
   }
   os << "\n";
-  os << " - data: ";
   PaddlePlace place;
   int size;
   const auto *data = tensor.data<float>(&place, &size);
+  os << " - numel: " << size;
+  os << "\n";
+  os << " - data: ";
   for (int i = 0; i < size; i++) {
     os << data[i] << " ";
   }
...
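Note that the helper now prints `numel` from the same `size` out-parameter that `ZeroCopyTensor::data<T>()` fills: it is an element count, not a byte count. A minimal sketch of reading an output tensor under that convention (predictor creation is elided; `predictor` and the tensor name are assumptions):

```cpp
#include <string>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

float SumOutput(paddle::PaddlePredictor *predictor, const std::string &name) {
  auto tensor = predictor->GetOutputTensor(name);
  paddle::PaddlePlace place;
  int numel = 0;  // filled with the number of elements, not bytes
  const float *data = tensor->data<float>(&place, &numel);
  float sum = 0.f;
  for (int i = 0; i < numel; ++i) sum += data[i];
  return sum;
}
```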
paddle/fluid/inference/api/paddle_analysis_config.h
@@ -19,6 +19,8 @@
 #include <unordered_set>
 #include <vector>

+/*! \file */
+
 // Here we include some header files with relative paths, for that in deploy,
 // the abstract path of this header file will be changed.
 #include "paddle_api.h"  // NOLINT
...
@@ -41,49 +43,125 @@ struct AnalysisConfig {
   explicit AnalysisConfig(const std::string& prog_file,
                           const std::string& params_file);

-  // Model path related.
+  /** Set model with a directory.
+   */
   void SetModel(const std::string& model_dir) { model_dir_ = model_dir; }
+  /** Set model with two specific pathes for program and parameters.
+   */
   void SetModel(const std::string& prog_file_path,
                 const std::string& params_file_path);
+  /** Set program file path.
+   */
   void SetProgFile(const std::string& x) { prog_file_ = x; }
+  /** Set parameter composed file path.
+   */
   void SetParamsFile(const std::string& x) { params_file_ = x; }
+  /** Get the model directory path.
+   */
   const std::string& model_dir() const { return model_dir_; }
+  /** Get the program file path.
+   */
   const std::string& prog_file() const { return prog_file_; }
+  /** Get the composed parameters file.
+   */
   const std::string& params_file() const { return params_file_; }

   // GPU related.
+  /**
+   * \brief Turn on GPU.
+   * @param memory_pool_init_size_mb initial size of the GPU memory pool in MB.
+   * @param device_id the GPU card to use (default is 0).
+   */
   void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0);
+  /** Turn off the GPU.
+   */
   void DisableGpu();
+  /** A bool state telling whether the GPU is turned on.
+   */
   bool use_gpu() const { return use_gpu_; }
+  /** Get the GPU device id.
+   */
   int gpu_device_id() const { return device_id_; }
+  /** Get the initial size in MB of the GPU memory pool.
+   */
   int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; }
+  /** Get the proportion of the initial memory pool size compared to the device.
+   */
   float fraction_of_gpu_memory_for_pool() const;

-  // Determine whether to perform graph optimization.
+  /** \brief Control whether to perform IR graph optimization.
+   *
+   * If turned off, the AnalysisConfig will act just like a NativeConfig.
+   */
   void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; }
+  /** A boolean state tell whether the ir graph optimization is actived.
+   */
   bool ir_optim() const { return enable_ir_optim_; }

+  /** \brief INTERNAL Determine whether to use the feed and fetch operators.
+   * Just for internal development, not stable yet.
+   * When ZeroCopyTensor is used, this should turned off.
+   */
   void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; }
+  /** A boolean state telling whether to use the feed and fetch operators.
+   */
   bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; }

+  /** \brief Control whether to specify the inputs' names.
+   *
+   * The PaddleTensor type has a `name` member, assign it with the corresponding
+   * variable name. This is used only when the input PaddleTensors passed to the
+   * `PaddlePredictor.Run(...)` cannot follow the order in the training phase.
+   */
   void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; }
+  /** A boolean state tell whether the input PaddleTensor names specified should
+   * be used to reorder the inputs in `PaddlePredictor.Run(...)`.
+   */
   bool specify_input_name() const { return specify_input_name_; }

+  /**
+   * \brief Turn on the TensorRT engine.
+   *
+   * The TensorRT engine will accelerate some subgraphes in the original Fluid
+   * computation graph. In some models such as TensorRT50, GoogleNet and so on,
+   * it gains significant performance acceleration.
+   *
+   * @param workspace_size the memory size(in byte) used for TensorRT workspace.
+   * @param max_batch_size the maximum batch size of this prediction task,
+   * better set as small as possible, or performance loss.
+   * @param min_subgrpah_size the minimum TensorRT subgraph size needed, if a
+   * subgraph is less than this, it will not transfer to TensorRT engine.
+   */
   void EnableTensorRtEngine(int workspace_size = 1 << 20,
                             int max_batch_size = 1, int min_subgraph_size = 3);
+  /** A boolean state telling whether the TensorRT engine is used.
+   */
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }

+  /** Control whther to debug IR graph analysis phase.
+   */
   void SwitchIrDebug(int x = true) { ir_debug_ = x; }

+  /** Turn on MKLDNN.
+   */
   void EnableMKLDNN();
+  /** A boolean state telling whether to use the MKLDNN.
+   */
   bool mkldnn_enabled() const { return use_mkldnn_; }

-  // Set and get the number of cpu math library threads.
+  /** Set and get the number of cpu math library threads.
+   */
   void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads);
+  /** An int state telling how many threads are used in the CPU math library.
+   */
   int cpu_math_library_num_threads() const {
     return cpu_math_library_num_threads_;
   }

+  /** Transform the AnalysisConfig to NativeConfig.
+   */
   NativeConfig ToNativeConfig() const {
     NativeConfig config;
     config.model_dir = model_dir_;
...
@@ -95,19 +173,30 @@ struct AnalysisConfig {
     config.specify_input_name = specify_input_name_;
     return config;
   }
+  /** Specify the operator type list to use MKLDNN acceleration.
+   * @param op_list the operator type list.
+   */
   void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
     mkldnn_enabled_op_types_ = op_list;
   }

-  // Specify the memory buffer of program and parameter
+  /** Specify the memory buffer of program and parameter
+   * @param prog_buffer the memory buffer of program.
+   * @param prog_buffer_size the size of the data.
+   * @param params_buffer the memory buffer of the composed parameters file.
+   * @param params_buffer_size the size of the commposed parameters data.
+   */
   void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
-                      const char* program_buffer, size_t program_buffer_size);
+                      const char* params_buffer, size_t params_buffer_size);
+  /** A boolean state telling whether the model is set from the CPU memory.
+   */
   bool model_from_memory() const { return model_from_memory_; }

   friend class ::paddle::AnalysisPredictor;

-  // NOTE just for developer, not an official API, easily to be broken.
-  // Get a pass builder for customize the passes in IR analysis phase.
+  /** NOTE just for developer, not an official API, easily to be broken.
+   * Get a pass builder for customize the passes in IR analysis phase.
+   */
   PassStrategy* pass_builder() const;

  protected:
...
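Pulling the documented setters together, a hedged sketch of a CPU-side configuration (paths and thread counts are placeholders), ending with the documented downgrade to `NativeConfig`:

```cpp
#include <string>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

paddle::NativeConfig AsNative(const std::string &model_dir) {
  paddle::contrib::AnalysisConfig config;
  config.SetModel(model_dir);             // directory form of SetModel
  config.DisableGpu();
  config.SetCpuMathLibraryNumThreads(4);  // CPU math library threads
  config.SwitchIrOptim(true);             // keep IR graph optimization on
  // The same settings can be carried over to a NativeConfig when the
  // analysis pipeline is not wanted.
  return config.ToNativeConfig();
}
```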
paddle/fluid/inference/api/paddle_api.h
@@ -13,61 +13,76 @@
 // limitations under the License.
 #pragma once

+/*! \file paddle_api.h
+ */
+
 #include <cassert>
 #include <memory>
 #include <string>
 #include <vector>

+/*! \namespace paddle
+ */
 namespace paddle {

-// Data type.
+/** paddle data type.
+ */
 enum PaddleDType {
   FLOAT32,
   INT64,
   // TODO(Superjomn) support more data types if needed.
 };

-/*
- * Memory menage for PaddleTensor.
- * The PaddleBuf holds a buffer for data input or output. The memory can be
- * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
- * should be reused for better performance.
- *
- * For user allocated memory, the following API can be used:
- * - PaddleBuf(void* data, size_t length) to set an external memory by
- * specifying
- * the memory address and length.
- * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
- * memory.
- * ATTENTION, for user allocated memory, deallocation should be done by users
- * externally after the program finished. The PaddleBuf won't do any allocation
- * or deallocation.
- *
- * To have the PaddleBuf allocate and manage the memory:
- * - PaddleBuf(size_t length) will allocate a memory of size `length`.
- * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION
- * if the allocated memory is larger than `length`, nothing will done.
+/**
+ *\brief Memory menager for PaddleTensor.
+ *
+ *The PaddleBuf holds a buffer for data input or output. The memory can be
+ *allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
+ *should be reused for better performance.
+ *
+ *For user allocated memory, the following API can be used:
+ *- PaddleBuf(void* data, size_t length) to set an external memory by
+ *specifying
+ * the memory address and length.
+ *- Reset(void* data, size_t length) to reset the PaddleBuf with an external
+ *memory.
+ *ATTENTION, for user allocated memory, deallocation should be done by users
+ *externally after the program finished. The PaddleBuf won't do any allocation
+ *or deallocation.
+ *
+ *To have the PaddleBuf allocate and manage the memory:
+ *- PaddleBuf(size_t length) will allocate a memory of size `length`.
+ *- Resize(size_t length) resize the memory to no less than `length`, ATTENTION
+ * if the allocated memory is larger than `length`, nothing will done.
 */
 class PaddleBuf {
  public:
-  // PaddleBuf allocate memory internally, and manage it.
+  /** PaddleBuf allocate memory internally, and manage it.
+   */
   explicit PaddleBuf(size_t length)
       : data_(new char[length]), length_(length), memory_owned_(true) {}
-  // Set external memory, the PaddleBuf won't manage it.
+  /** Set external memory, the PaddleBuf won't manage it.
+   */
   PaddleBuf(void* data, size_t length)
       : data_(data), length_(length), memory_owned_{false} {}
-  // Copy only available when memory is managed externally.
+  /** Copy only available when memory is managed externally.
+   */
   explicit PaddleBuf(const PaddleBuf&);

-  // Resize the memory.
+  /** Resize the memory.
+   */
   void Resize(size_t length);
-  // Reset to external memory, with address and length set.
+  /** Reset to external memory, with address and length set.
+   */
   void Reset(void* data, size_t length);
-  // Tell whether the buffer is empty.
+  /** Tell whether the buffer is empty.
+   */
   bool empty() const { return length_ == 0; }
-  // Get the memory address.
+  /** Get the memory address.
+   */
   void* data() const { return data_; }
-  // Get the memory length.
+  /** Get the memory length.
+   */
   size_t length() const { return length_; }

   ~PaddleBuf() { Free(); }
...
@@ -83,7 +98,8 @@ class PaddleBuf {
   bool memory_owned_{true};
 };

-// Basic input and output data structure for PaddlePredictor.
+/** Basic input and output data structure for PaddlePredictor.
+ */
 struct PaddleTensor {
   PaddleTensor() = default;
   std::string name;  // variable name.
...
@@ -94,19 +110,23 @@ struct PaddleTensor {
 };

 enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
-// Tensor without copy, currently only supports AnalysisPredictor.
+/** Tensor without copy, currently only supports AnalysisPredictor.
+ */
 class ZeroCopyTensor {
  public:
   void Reshape(const std::vector<int>& shape);

-  // Get the memory in CPU or GPU with specific data type, should Reshape first
-  // to tell the data size.
-  // Once can directly call this data to feed the data.
-  // This is for write the input tensor.
+  /** Get the memory in CPU or GPU with specific data type, should Reshape first
+   * to tell the data size.
+   * Once can directly call this data to feed the data.
+   * This is for write the input tensor.
+   */
   template <typename T>
   T* mutable_data(PaddlePlace place);
-  // Get the memory directly, will return the place and memory size by pointer.
-  // This is for reading the output tensor.
+  /** Get the memory directly, will return the place and element size by
+   * pointer.
+   * This is for reading the output tensor.
+   */
   template <typename T>
   T* data(PaddlePlace* place, int* size) const;
...
@@ -128,8 +148,7 @@ class ZeroCopyTensor {
   void* scope_{nullptr};
 };

-/*
- * A simple Inference API for Paddle.
+/** A simple Inference API for Paddle.
 */
 class PaddlePredictor {
  public:
...
@@ -138,18 +157,20 @@ class PaddlePredictor {
   PaddlePredictor(const PaddlePredictor&) = delete;
   PaddlePredictor& operator=(const PaddlePredictor&) = delete;

-  // Predict an record.
-  // The caller should be responsible for allocating and releasing the memory of
-  // `inputs`. `inputs` should be available until Run returns. Caller should be
-  // responsible for the output tensor's buffer, either allocated or passed from
-  // outside.
+  /** Predict an record.
+   * The caller should be responsible for allocating and releasing the memory of
+   * `inputs`. `inputs` should be available until Run returns. Caller should be
+   * responsible for the output tensor's buffer, either allocated or passed from
+   * outside.
+   */
   virtual bool Run(const std::vector<PaddleTensor>& inputs,
                    std::vector<PaddleTensor>* output_data,
                    int batch_size = -1) = 0;

-  // Zero copy input and output optimization.
-  // Get the input or output tensors, and operate on their memory directly,
-  // without copy.
+  /** Zero copy input and output optimization.
+   * Get the input or output tensors, and operate on their memory directly,
+   * without copy.
+   */
   virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
       const std::string& name) {
     return nullptr;
...
@@ -160,16 +181,19 @@ class PaddlePredictor {
   }
   virtual bool ZeroCopyRun() { return false; }

-  // Clone a predictor that share the model weights, the Cloned predictor should
-  // be thread-safe.
+  /** Clone a predictor that share the model weights, the Cloned predictor
+   * should be thread-safe.
+   */
   virtual std::unique_ptr<PaddlePredictor> Clone() = 0;

-  // Destroy the Predictor.
+  /** Destroy the Predictor.
+   */
   virtual ~PaddlePredictor() = default;

-  // The common configs for all the predictors.
+  /** The common configs for all the predictors.
+   */
   struct Config {
-    std::string model_dir;  // path to the model directory.
+    std::string model_dir; /*!< path to the model directory. */
   };
 };
...
@@ -177,17 +201,21 @@ struct NativeConfig : public PaddlePredictor::Config {
   // GPU related fields.
   bool use_gpu{false};
   int device{0};
-  float fraction_of_gpu_memory{-1.f};  // Change to a float in (0,1] if needed.
+  float fraction_of_gpu_memory{-1.f}; /*!< Change to a float in (0,1] if needed. */

   // Specify the exact path of program and parameter files.
   std::string prog_file;
   std::string param_file;

-  // Specify the variable's name of each input if input tensors don't follow the
-  // `feeds` and `fetches` of the phase `save_inference_model`.
+  /** Specify the variable's name of each input if input tensors don't follow
+   * the
+   * `feeds` and `fetches` of the phase `save_inference_model`.
+   */
   bool specify_input_name{false};

-  // Set and get the number of cpu math library threads.
+  /** Set and get the number of cpu math library threads.
+   */
   void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
     cpu_math_library_num_threads_ = cpu_math_library_num_threads;
   }
...
@@ -201,28 +229,33 @@ struct NativeConfig : public PaddlePredictor::Config {
   int cpu_math_library_num_threads_{1};
 };

-// A factory to help create different predictors.
-//
-// Usage:
-//
-// NativeConfig config;
-// ... // change the configs.
-// auto native_predictor = CreatePaddlePredictor(config);
-//
-// FOR EXTENSION DEVELOPER:
-// Different predictors are designated by config type. Similar configs can be
-// merged, but there shouldn't be a huge config containing different fields for
-// more than one kind of predictors.
+/*! \fn std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&
+ * config);
+ *
+ * \brief A factory to help create different predictors.
+ *
+ * Usage:
+ *
+ * NativeConfig config;
+ * ... // change the configs.
+ * auto native_predictor = CreatePaddlePredictor(config);
+ *
+ * FOR EXTENSION DEVELOPER:
+ * Different predictors are designated by config type. Similar configs can be
+ * merged, but there shouldn't be a huge config containing different fields for
+ * more than one kind of predictors.
+ */
 template <typename ConfigT>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);

-// NOTE The following APIs are too trivial, we will discard it in the following
-// versions.
+/** NOTE The following APIs are too trivial, we will discard it in the following
+ * versions.
+ */
 enum class PaddleEngineKind {
-  kNative = 0,         // Use the native Fluid facility.
-  kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
-  kAnalysis,           // More optimization.
-  kAnakin              // Use Anakin for inference, not mature yet.
+  kNative = 0,        /*!< Use the native Fluid facility. */
+  kAutoMixedTensorRT, /*!< Automatically mix Fluid with TensorRT. */
+  kAnalysis,          /*!< More optimization. */
+  kAnakin             /*!< Use Anakin for inference, not mature yet. */
 };

 template <typename ConfigT, PaddleEngineKind engine>
...
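The ownership rules spelled out in the PaddleBuf comment are easy to get wrong, so here is a small sketch of both modes (self-managed vs external memory), matching the two constructors documented above:

```cpp
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void BufModes() {
  // 1) Buffer owns its memory: allocated now, freed in ~PaddleBuf().
  paddle::PaddleBuf owned(4 * sizeof(float));

  // 2) External memory: PaddleBuf only records the pointer; the vector
  //    must outlive the buffer and is freed by its own destructor.
  std::vector<float> backing{1.f, 2.f, 3.f, 4.f};
  paddle::PaddleBuf external(backing.data(), backing.size() * sizeof(float));

  // Resize on an owned buffer grows it to at least the requested length;
  // if the buffer is already large enough, nothing happens.
  owned.Resize(16 * sizeof(float));
}
```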
paddle/fluid/inference/api/paddle_pass_builder.h
@@ -18,30 +18,39 @@
 #include <string>
 #include <vector>

+/*! \file */
+
+/*! \namespace paddle */
 namespace paddle {
+
 /*
  * This is a pass builder based on string. It is part of inference API.
 */
 class PaddlePassBuilder {
  public:
   explicit PaddlePassBuilder(const std::vector<std::string> &passes)
       : passes_(passes) {}

+  /** Append a pass to the end of the passes. */
   void AppendPass(const std::string &pass_type);

+  /** Insert a pass to a specific position.
+   * @param idx the position to insert.
+   * @param pass_type the pass key.
+   */
   void InsertPass(size_t idx, const std::string &pass_type);

-  // Delete the `idx`-th pass.
+  /** Delete the `idx`-th pass. */
   void DeletePass(size_t idx);

-  // Delete all the passes that has type `pass_type`.
+  /** Delete all the passes that has type `pass_type`. */
   void DeletePass(const std::string &pass_type);

-  // Visualize the computation graph after each pass by generating a DOT
-  // language file, one can draw them with the Graphviz toolkit.
+  /** Visualize the computation graph after each pass by generating a DOT
+   * language file, one can draw them with the Graphviz toolkit.
+   */
   void TurnOnDebug();

-  // Human-readible information.
+  /** Human-readible information. */
   std::string DebugString();

   const std::vector<std::string> &AllPasses() const { return passes_; }
...
@@ -50,16 +59,16 @@ class PaddlePassBuilder {
   std::vector<std::string> passes_;
 };

-/*
- * Pass strategy to help control the IR passes.
+/**Pass strategy to help control the IR passes.
 */
 class PassStrategy : public PaddlePassBuilder {
  public:
   explicit PassStrategy(const std::vector<std::string> &passes)
       : PaddlePassBuilder(passes) {}

-  // The MKLDNN control exists in both CPU and GPU mode, because there can be
-  // still some CPU kernels running in CPU mode.
+  /** The MKLDNN control exists in both CPU and GPU mode, because there can be
+   * still some CPU kernels running in CPU mode.
+   */
   virtual void EnableMKLDNN() = 0;

   bool use_gpu() const { return use_gpu_; }
...
@@ -70,8 +79,7 @@ class PassStrategy : public PaddlePassBuilder {
   bool use_gpu_{false};
 };

-/*
- * The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
+/** The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
 */
 class CpuPassStrategy : public PassStrategy {
  public:
...
@@ -81,6 +89,7 @@ class CpuPassStrategy : public PassStrategy {
     passes_.assign({
         "infer_clean_graph_pass",         //
         "attention_lstm_fuse_pass",       //
+        "seqpool_concat_fuse_pass",       //
         "seqconv_eltadd_relu_fuse_pass",  //
         // "embedding_fc_lstm_fuse_pass", //
         "fc_lstm_fuse_pass",              //
...
@@ -117,8 +126,7 @@ class CpuPassStrategy : public PassStrategy {
   CpuPassStrategy(const CpuPassStrategy &other)
       : PassStrategy(other.passes_) {}
 };

-/*
- * The GPU passes strategy, it is used in
+/** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
 */
 class GpuPassStrategy : public PassStrategy {
  public:
...
@@ -133,6 +141,10 @@ class GpuPassStrategy : public PassStrategy {
         "conv_elementwise_add_fuse_pass",  //
     });

+    for (int i = 6; i >= 3; i--) {
+      passes_.push_back("transpose_flatten" + std::to_string(i) +
+                        "_concat_fuse_pass");
+    }
     use_gpu_ = true;
   }
...
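Because the pass list is plain strings, tests and deployments can edit it through `pass_builder()`. A hedged sketch using pass names that appear in the list above:

```cpp
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void TunePasses(paddle::contrib::AnalysisConfig *config) {
  paddle::PassStrategy *builder = config->pass_builder();
  // Drop a fusion that misbehaves for a particular model...
  builder->DeletePass("seqpool_concat_fuse_pass");
  // ...and dump a DOT file after every remaining pass for inspection.
  builder->TurnOnDebug();
}
```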
paddle/fluid/inference/tensorrt/CMakeLists.txt
 nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
+nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
 nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
 add_subdirectory(plugin)
...
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -39,6 +39,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
                   const framework::Scope& scope, bool test_mode) override {
     // Here the two nullptr looks strange, that's because the
     // framework::OpDesc's constructor is strange.
+    nvinfer1::ILayer* layer = nullptr;
     framework::OpDesc op_desc(op, nullptr);
     VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
...
@@ -98,13 +99,21 @@ class ElementwiseWeightOpConverter : public OpConverter {
                                          0};
     TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
                                          0};
-
-    nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER(
-        engine_, Scale, *const_cast<nvinfer1::ITensor*>(X), scale_mode,
-        shift_weights.get(), scale_weights.get(), power_weights.get());
+    if (op_type_ == "add") {
+      nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Scale, *X, scale_mode, shift_weights.get(),
+          scale_weights.get(), power_weights.get());
+      layer = scale_layer;
+    } else if (op_type_ == "mul") {
+      nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Scale, *X, scale_mode, scale_weights.get(),
+          shift_weights.get(), power_weights.get());
+      layer = scale_layer;
+    }

     auto output_name = op_desc.Output("Out")[0];
-    layer->setName(("elementwise_add (Output: " + output_name + ")").c_str());
+    layer->setName(
+        ("elementwise_" + op_type_ + " (Output: " + output_name + ")").c_str());
     layer->getOutput(0)->setName(output_name.c_str());
     engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
     engine_->SetITensor(output_name, layer->getOutput(0));
...
@@ -113,6 +122,9 @@ class ElementwiseWeightOpConverter : public OpConverter {
       engine_->DeclareOutput(output_name);
     }
   }
+
+ protected:
+  std::string op_type_;
 };

 class ElementwiseTensorOpConverter : public OpConverter {
...
@@ -188,6 +200,16 @@ const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
         {"max", nvinfer1::ElementWiseOperation::kMAX},
 };

+class ElementwiseWeightAddOpConverter : public ElementwiseWeightOpConverter {
+ public:
+  ElementwiseWeightAddOpConverter() { op_type_ = "add"; }
+};
+
+class ElementwiseWeightMulOpConverter : public ElementwiseWeightOpConverter {
+ public:
+  ElementwiseWeightMulOpConverter() { op_type_ = "mul"; }
+};
+
 class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
  public:
   ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
...
@@ -227,7 +249,10 @@ class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
 }  // namespace inference
 }  // namespace paddle

-REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_add_weight,
+                          ElementwiseWeightAddOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_mul_weight,
+                          ElementwiseWeightMulOpConverter);

 REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor,
                           ElementwiseTensorAddOpConverter);
...
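The add/mul split above works because TensorRT's `IScaleLayer` computes `out = (in * scale + shift)^power` elementwise: binding the weights to `shift` gives an add, binding them to `scale` gives a mul, and an empty `power` tensor behaves as power 1. A plain-C++ sketch of that formula, just to make the weight binding concrete:

```cpp
#include <cmath>
#include <cstdio>

// Elementwise semantics of nvinfer1::IScaleLayer for a single value.
float ScaleLayer(float in, float scale, float shift, float power) {
  return std::pow(in * scale + shift, power);
}

int main() {
  float x = 3.f, w = 2.f;
  // elementwise_add: weights go into shift; scale stays 1, power stays 1.
  std::printf("add: %g\n", ScaleLayer(x, 1.f, w, 1.f));  // prints 5
  // elementwise_mul: weights go into scale; shift stays 0, power stays 1.
  std::printf("mul: %g\n", ScaleLayer(x, w, 0.f, 1.f));  // prints 6
}
```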
paddle/fluid/inference/tensorrt/op_teller.cc
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/tensorrt/op_teller.h"

namespace paddle {
namespace inference {
namespace tensorrt {

// Just tell by the op_types.
struct SimpleOpTypeSetTeller : public Teller {
  SimpleOpTypeSetTeller() {}

  bool operator()(const std::string& op_type,
                  const framework::OpDesc& desc) override {
    return teller_set.count(op_type);
  }

 private:
  std::unordered_set<std::string> teller_set{
      {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
       "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
       "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
       "conv2d_transpose", "leaky_relu"}};
};

bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
  for (auto& teller : tellers_) {
    if ((*teller)(op_type, desc)) return true;
  }
  return false;
}

OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); }

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/tensorrt/op_teller.h
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"

namespace paddle {
namespace inference {
namespace tensorrt {

/*
 * Single Op teller definition.
 * One can override this and define a more complex tell logic, considerring more
 * issues such as op_desc.
 */
struct Teller {
  virtual bool operator()(const std::string& op_type,
                          const framework::OpDesc& desc) = 0;

  virtual ~Teller() = default;
};
/*
 * A real example:
 *
 * struct SomeTeller : public Teller {
 *   bool operator()(const std::string& op_type,
 *                   const framework::OpDesc& desc) override {
 *     return op_type == "fc" && desc.Inputs().size() == 2;
 *   }
 * };
 */

/*
 * class OpTeller helps to tell whether a fluid
 * operator can be transformed to a TensorRT layer.
 */
class OpTeller {
 public:
  static OpTeller& Global() {
    static std::unique_ptr<OpTeller> x(new OpTeller);
    return *x;
  }

  bool Tell(const std::string& op_type, const framework::OpDesc& desc);

 private:
  OpTeller();

 private:
  std::vector<std::unique_ptr<Teller>> tellers_;
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
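Usage mirrors the subgraph pass above: grab the singleton and ask per op. A minimal hedged sketch (the `OpDesc` here is default-constructed just to satisfy the signature; the bundled `SimpleOpTypeSetTeller` only looks at the type string):

```cpp
#include <iostream>
#include "paddle/fluid/inference/tensorrt/op_teller.h"

int main() {
  paddle::framework::OpDesc desc;  // unused by the simple set-based teller
  auto &teller = paddle::inference::tensorrt::OpTeller::Global();
  std::cout << teller.Tell("conv2d", desc) << "\n";  // 1: in the whitelist
  std::cout << teller.Tell("while", desc) << "\n";   // 0: not convertible
}
```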
paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -41,7 +41,7 @@ endfunction()
 if(NOT APPLE AND WITH_MKLML)
     set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
     download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
-    inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc)
+    inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc SERIAL)
 else()
     # TODO: fix this test on MACOS and OPENBLAS, the reason is that
     # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
...
@@ -56,14 +56,14 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
 # normal DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
 download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc)
+inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL)

 # small DAM
 set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
 download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
 inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc
         EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1)
+        ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1 SERIAL)

 # chinese_ner
 set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
...
@@ -111,11 +111,11 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose
 # resnet50
 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
-    "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
+    "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL)

 # mobilenet with depthwise_conv op
 inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
-    "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
+    "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL)

 # anakin
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
...
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -283,7 +283,7 @@ TEST(Analyzer_rnn1, multi_thread) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, 4 /* multi_thread */);
+                 input_slots_all, &outputs, 2 /* multi_thread */);
 }

 // Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
...
@@ -351,10 +351,10 @@ TEST(Analyzer_rnn1, ZeroCopy) {
   ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
   LOG(INFO) << "native output " << DescribeTensor(native_outputs.front());

   int output_size{0};
+  // this is the number of elements not memory size
   auto *zero_copy_data = output_tensor->data<float>(&place, &output_size);
   auto *native_data = static_cast<float *>(native_outputs.front().data.data());
-  for (size_t i = 0; i < output_size / sizeof(float); i++) {
+  for (int i = 0; i < output_size; i++) {
     EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3);
   }
 }
...
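The loop fix above matters: `ZeroCopyTensor::data<float>()` reports the element count, so dividing by `sizeof(float)` walked only a quarter of the output. A tiny arithmetic sketch of the difference in loop bounds:

```cpp
#include <cstdio>

int main() {
  int output_size = 100;  // elements, as reported by data<float>(&place, &size)
  // Old loop bound (wrong once size means elements, not bytes):
  std::printf("old bound: %zu\n", output_size / sizeof(float));  // 25
  // New loop bound:
  std::printf("new bound: %d\n", output_size);                   // 100
}
```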
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
浏览文件 @
eac5a0aa
...
@@ -121,14 +121,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
...
@@ -121,14 +121,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
}
}
}
}
void
SetConfig
(
AnalysisConfig
*
cfg
)
{
cfg
->
SetModel
(
FLAGS_infer_model
+
"/model"
,
FLAGS_infer_model
+
"/params"
);
cfg
->
DisableGpu
();
cfg
->
SwitchSpecifyInputNames
();
cfg
->
pass_builder
()
->
TurnOnDebug
();
cfg
->
SetCpuMathLibraryNumThreads
(
FLAGS_paddle_num_threads
);
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
DataRecord
data
(
FLAGS_infer_data
,
FLAGS_batch_size
);
DataRecord
data
(
FLAGS_infer_data
,
FLAGS_batch_size
);
std
::
vector
<
PaddleTensor
>
input_slots
;
std
::
vector
<
PaddleTensor
>
input_slots
;
...
@@ -141,15 +133,22 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
...
@@ -141,15 +133,22 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
}
}
}
}
void
SetConfig
(
AnalysisConfig
*
cfg
,
bool
use_mkldnn
=
false
)
{
cfg
->
SetModel
(
FLAGS_infer_model
+
"/model"
,
FLAGS_infer_model
+
"/params"
);
cfg
->
DisableGpu
();
cfg
->
SwitchSpecifyInputNames
();
cfg
->
pass_builder
()
->
TurnOnDebug
();
cfg
->
SetCpuMathLibraryNumThreads
(
FLAGS_paddle_num_threads
);
if
(
use_mkldnn
)
{
cfg
->
EnableMKLDNN
();
}
}
void
profile
(
bool
use_mkldnn
=
false
)
{
void
profile
(
bool
use_mkldnn
=
false
)
{
AnalysisConfig
cfg
;
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
SetConfig
(
&
cfg
,
use_mkldnn
);
if
(
use_mkldnn
)
{
cfg
.
EnableMKLDNN
();
}
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
SetInput
(
&
input_slots_all
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
...
@@ -169,16 +168,110 @@ TEST(Analyzer_seq_pool1, compare) {
...
@@ -169,16 +168,110 @@ TEST(Analyzer_seq_pool1, compare) {
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
}
// Check the fuse status
// Compare Deterministic result
TEST
(
Analyzer_seq_pool1
,
fuse_statis
)
{
TEST
(
Analyzer_seq_pool1
,
compare_determine
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
CompareDeterministic
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
void
analysis_fuse_statis
(
bool
use_zerocopy
)
{
AnalysisConfig
cfg
;
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
SetConfig
(
&
cfg
);
cfg
.
SwitchUseFeedFetchOps
(
!
use_zerocopy
);
int
num_ops
;
int
num_ops
;
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
fuse_statis
=
GetFuseStatis
(
auto
fuse_statis
=
GetFuseStatis
(
predictor
.
get
(),
&
num_ops
);
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
()),
&
num_ops
);
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
ASSERT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
10
);
ASSERT_TRUE
(
fuse_statis
.
count
(
"seqpool_concat_fuse"
));
EXPECT_EQ
(
fuse_statis
.
at
(
"seqpool_concat_fuse"
),
2
);
LOG
(
INFO
)
<<
"num_ops: "
<<
num_ops
;
LOG
(
INFO
)
<<
"num_ops: "
<<
num_ops
;
EXPECT_EQ
(
num_ops
,
349
);
EXPECT_EQ
(
num_ops
,
195
);
}
// Check the fuse status
TEST
(
Analyzer_seq_pool1
,
fuse_statis
)
{
analysis_fuse_statis
(
false
);
}
void
PrepareZeroCopyInputs
(
const
std
::
unique_ptr
<
PaddlePredictor
>
&
predictor
,
std
::
vector
<
std
::
unique_ptr
<
ZeroCopyTensor
>>
*
inputs
)
{
DataRecord
data
(
FLAGS_infer_data
,
FLAGS_batch_size
);
// only feed one batch
const
auto
&
one_batch
=
data
.
NextBatch
();
inputs
->
clear
();
for
(
size_t
i
=
0
;
i
<
one_batch
.
size
();
++
i
)
{
auto
&
slot
=
one_batch
[
i
];
auto
tensor
=
predictor
->
GetInputTensor
(
slot
.
name
+
"_embed"
);
tensor
->
Reshape
(
slot
.
shape
);
tensor
->
SetLoD
({
slot
.
lod
});
ZeroCopyTensorAssignData
<
float
>
(
tensor
.
get
(),
slot
.
data
);
inputs
->
emplace_back
(
std
::
move
(
tensor
));
}
}
// return the output values
std
::
vector
<
float
>
zerocopy_profile
(
int
repeat_times
)
{
AnalysisConfig
config
;
SetConfig
(
&
config
);
config
.
SwitchUseFeedFetchOps
(
false
);
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
config
);
std
::
vector
<
std
::
unique_ptr
<
ZeroCopyTensor
>>
inputs
;
PrepareZeroCopyInputs
(
predictor
,
&
inputs
);
auto
output_tensor
=
predictor
->
GetOutputTensor
(
"reduce_sum_0.tmp_0"
);
Timer
timer
;
LOG
(
INFO
)
<<
"Warm up run..."
;
timer
.
tic
();
predictor
->
ZeroCopyRun
();
PrintTime
(
FLAGS_batch_size
,
1
,
1
,
0
,
timer
.
toc
(),
1
);
if
(
FLAGS_profile
)
{
paddle
::
platform
::
ResetProfiler
();
}
LOG
(
INFO
)
<<
"Run "
<<
repeat_times
<<
" times..."
;
timer
.
tic
();
for
(
int
i
=
0
;
i
<
repeat_times
;
i
++
)
{
predictor
->
ZeroCopyRun
();
}
PrintTime
(
FLAGS_batch_size
,
repeat_times
,
1
,
0
,
timer
.
toc
()
/
repeat_times
,
1
);
VLOG
(
3
)
<<
"ZeroCopy output: "
<<
DescribeZeroCopyTensor
(
*
output_tensor
);
PaddlePlace
place
;
int
output_size
{
0
};
auto
*
pdata
=
output_tensor
->
data
<
float
>
(
&
place
,
&
output_size
);
std
::
vector
<
float
>
res
(
output_size
);
for
(
int
i
=
0
;
i
<
output_size
;
++
i
)
{
res
[
i
]
=
pdata
[
i
];
}
return
res
;
}
TEST
(
Analyzer_seq_pool1
,
zerocopy_profile
)
{
zerocopy_profile
(
FLAGS_repeat
);
}
TEST
(
Analyzer_seq_pool1
,
zerocopy_fuse_statis
)
{
analysis_fuse_statis
(
true
);
}
TEST(Analyzer_seq_pool1, zerocopy_compare_native) {
  AnalysisConfig config;
  SetConfig(&config);
  config.SwitchUseFeedFetchOps(true);
  auto predictor =
      CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
  std::vector<PaddleTensor> native_outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  ASSERT_TRUE(predictor->Run(input_slots_all[0], &native_outputs));
  EXPECT_EQ(native_outputs.size(), 1UL);

  auto zerocopy_output = zerocopy_profile(1);
  EXPECT_EQ(zerocopy_output.size() * sizeof(float),
            native_outputs.front().data.length());
  auto *native_data = static_cast<float *>(native_outputs.front().data.data());
  for (size_t i = 0; i < zerocopy_output.size(); ++i) {
    EXPECT_NEAR(zerocopy_output[i], native_data[i], 1e-3);
  }
}

}  // namespace analysis
paddle/fluid/inference/tests/api/config_printer.h
@@ -62,7 +62,7 @@ std::ostream &operator<<(std::ostream &os,
                          const contrib::AnalysisConfig &config) {
   os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
   num_spaces++;
-  os << *reinterpret_cast<const NativeConfig *>(&config);
+  os << config.ToNativeConfig();
   if (!config.model_from_memory()) {
     os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n";
     os << GenSpaces(num_spaces) << "param_file: " << config.params_file()
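The pattern in this hunk (and in the tester_helper.h changes below) replaces a reinterpret_cast between the two config types with an explicit conversion. The cast was only safe while AnalysisConfig happened to be layout-compatible with NativeConfig; ToNativeConfig() copies fields instead. A minimal sketch of the idea, using hypothetical field names rather than the real AnalysisConfig members:

  #include <string>

  // Hypothetical stand-ins for the two config types, illustrating why an
  // explicit conversion beats reinterpret_cast: no object-layout assumptions.
  struct NativeCfg {
    std::string model_dir, prog_file, param_file;
    bool use_gpu{false};
  };

  struct AnalysisCfg {
    std::string model_dir, prog_file, param_file;
    bool use_gpu{false};
    bool ir_optim{true};  // analysis-only option, unknown to NativeCfg

    NativeCfg ToNative() const {
      NativeCfg n;
      n.model_dir = model_dir;   // copy only the fields the native
      n.prog_file = prog_file;   // predictor understands
      n.param_file = param_file;
      n.use_gpu = use_gpu;
      return n;
    }
  };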
paddle/fluid/inference/tests/api/tester_helper.h
@@ -54,11 +54,13 @@ namespace paddle {
 namespace inference {

 void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
+  const auto *analysis_config =
+      reinterpret_cast<const contrib::AnalysisConfig *>(config);
   if (use_analysis) {
-    LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config);
+    LOG(INFO) << *analysis_config;
     return;
   }
-  LOG(INFO) << *reinterpret_cast<const NativeConfig *>(config);
+  LOG(INFO) << analysis_config->ToNativeConfig();
 }

 void CompareResult(const std::vector<PaddleTensor> &outputs,

@@ -96,12 +98,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
 std::unique_ptr<PaddlePredictor> CreateTestPredictor(
     const PaddlePredictor::Config *config, bool use_analysis = true) {
+  const auto *analysis_config =
+      reinterpret_cast<const contrib::AnalysisConfig *>(config);
   if (use_analysis) {
-    return CreatePaddlePredictor<contrib::AnalysisConfig>(
-        *(reinterpret_cast<const contrib::AnalysisConfig *>(config)));
+    return CreatePaddlePredictor<contrib::AnalysisConfig>(*analysis_config);
   }
-  return CreatePaddlePredictor<NativeConfig>(
-      *(reinterpret_cast<const NativeConfig *>(config)));
+  auto native_config = analysis_config->ToNativeConfig();
+  return CreatePaddlePredictor<NativeConfig>(native_config);
 }

 size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
@@ -328,10 +331,7 @@ void CompareNativeAndAnalysis(
     const std::vector<std::vector<PaddleTensor>> &inputs) {
   PrintConfig(config, true);
   std::vector<PaddleTensor> native_outputs, analysis_outputs;
-  const auto *analysis_config =
-      reinterpret_cast<const contrib::AnalysisConfig *>(config);
-  auto native_config = analysis_config->ToNativeConfig();
-  TestOneThreadPrediction(&native_config, inputs, &native_outputs, false);
+  TestOneThreadPrediction(config, inputs, &native_outputs, false);
   TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
   CompareResult(analysis_outputs, native_outputs);
 }
paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -99,24 +99,12 @@ void compare(std::string model_dir, bool use_tensorrt) {
     SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
   }

-  std::vector<PaddleTensor> native_outputs;
-  NativeConfig native_config;
-  SetConfig<NativeConfig>(&native_config, model_dir, true, false,
-                          FLAGS_batch_size);
-  TestOneThreadPrediction(
-      reinterpret_cast<PaddlePredictor::Config *>(&native_config), inputs_all,
-      &native_outputs, false);
-  std::vector<PaddleTensor> analysis_outputs;
   contrib::AnalysisConfig analysis_config;
+  analysis_config.EnableUseGpu(50, 0);
   SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true,
                                      use_tensorrt, FLAGS_batch_size);
-  TestOneThreadPrediction(
-      reinterpret_cast<PaddlePredictor::Config *>(&analysis_config),
-      inputs_all, &analysis_outputs, true);
-  CompareResult(native_outputs, analysis_outputs);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&analysis_config),
+      inputs_all);
 }

 TEST(TensorRT_mobilenet, compare) {
paddle/fluid/inference/utils/CMakeLists.txt
@@ -2,6 +2,3 @@ cc_library(benchmark SRCS benchmark.cc DEPS enforce)
 cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
 cc_binary(visualizer SRCS visualizer.cc DEPS analysis
           paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
-if(WIN32)
-  target_link_libraries(visualizer shlwapi)
-endif(WIN32)
paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -297,6 +297,21 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()), groups);

+#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
+    // Enable Tensor Core for cudnn backward
+    if (dev_ctx.GetComputeCapability() >= 70 &&
+        std::type_index(typeid(T)) ==
+            std::type_index(typeid(platform::float16))) {
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
+      VLOG(5) << "use cudnn_tensor_op_math for backward";
+    } else {
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+      VLOG(5) << "NOT use cudnn_tensor_op_math for backward";
+    }
+#endif
+
     int input_channels = input->dims()[1];
     int input_height, input_width, input_depth;
     if (input->dims().size() == 5) {
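The added block follows the standard cuDNN recipe for Tensor Cores: CUDNN_TENSOR_OP_MATH is only selected when the GPU is Volta or newer (compute capability >= 7.0) and the data type is fp16, with a fallback to CUDNN_DEFAULT_MATH otherwise. The same check written against the raw cuDNN API, as a standalone sketch (Paddle wraps the call in platform::dynload and CUDNN_ENFORCE):

  #include <cudnn.h>

  // Choose the math mode for a convolution descriptor: Tensor Cores need
  // compute capability >= 7.0 and half-precision data.
  cudnnStatus_t ChooseConvMathType(cudnnConvolutionDescriptor_t conv_desc,
                                   int compute_capability, bool is_fp16) {
    const cudnnMathType_t math = (compute_capability >= 70 && is_fp16)
                                     ? CUDNN_TENSOR_OP_MATH
                                     : CUDNN_DEFAULT_MATH;
    return cudnnSetConvolutionMathType(conv_desc, math);
  }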
paddle/fluid/operators/conv_mkldnn_op.cc
@@ -318,8 +318,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     int groups = ctx.Attr<int>("groups");
+    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
+    bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
     bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
+    if (fuse_residual_conn) {
+      PADDLE_ENFORCE(force_fp32_output != true,
+                     "residual fusion does not support force output with fp32");
+    }
     bool is_conv3d = strides.size() == 3U;
     // TODO(tpatejko): add support for dilation

@@ -329,6 +335,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                dilations[2] == 1
         : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
           "dilation in convolution is not implemented yet");
     PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently");

     const T* input_data = input->data<T>();

@@ -340,17 +347,35 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     GetWeightsTz(weights_tz, g, is_conv3d);
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());

+    mkldnn::memory::data_type src_dt =
+        paddle::framework::ToMKLDNNDataType(input->type());
+    auto dst_dt = fuse_relu ? paddle::framework::ToMKLDNNDataType(
+                                  framework::DataTypeTrait<uint8_t>::DataType)
+                            : paddle::framework::ToMKLDNNDataType(
+                                  framework::DataTypeTrait<int8_t>::DataType);
+    if (force_fp32_output) {
+      dst_dt = paddle::framework::ToMKLDNNDataType(
+          framework::DataTypeTrait<float>::DataType);
+    }
+    if (fuse_residual_conn) {
+      auto residual = ctx.Input<Tensor>("ResidualData");
+      auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type());
+      if (dst_dt != residual_dt) dst_dt = residual_dt;
+    }
+
     // Get unique name for storing MKLDNN primitives
     std::string key;
     key.reserve(MaxKeyLength);
-    mkldnn::memory::data_type src_dt =
-        paddle::framework::ToMKLDNNDataType(input->type());
     platform::ConvMKLDNNHandler::AppendKey(
         &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt,
-        input->format(), ctx.op().Output("Output"));
+        input->format(), fuse_relu, fuse_residual_conn,
+        ctx.op().Output("Output"));
     const std::string key_conv_pd = key + "@conv_pd";
+    bool need_s8_to_u8 = false;

     std::shared_ptr<mkldnn::convolution_forward> conv_p = nullptr;
     std::shared_ptr<mkldnn::memory> src_memory_p = nullptr;
     std::shared_ptr<mkldnn::memory> user_src_memory_p = nullptr;

@@ -365,14 +390,20 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto src_key = key + "@src_mem_p";
     auto user_src_key = key + "@user_src_mem_p";
     auto src_reorder_key = key + "@src_mem_preorder_p";
+    auto residual_reorder_key = key + "@residual_data_mem_preorder_p";
     conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
         dev_ctx.GetBlob(prim_key));
     if (conv_p == nullptr || !is_test) {
       const K* filter_data = filter->data<K>();
       auto scale_in_data = ctx.Attr<float>("Scale_in");
+      auto scale_in_eltwise_data = ctx.Attr<float>("Scale_in_eltwise");
       auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
       auto scale_out_data =
           force_fp32_output ? 1.0f : ctx.Attr<float>("Scale_out");
+      float sum_scale =
+          fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f;

       bool is_multi_channel = scale_weights_data.size() > 1;

@@ -413,15 +444,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format);
       auto weights_md = platform::MKLDNNMemDesc(
           weights_tz, memory::data_type::s8, chosen_memory_format);
-      auto dst_dt = force_fp32_output
-                        ? paddle::framework::ToMKLDNNDataType(
-                              framework::DataTypeTrait<float>::DataType)
-                        : paddle::framework::ToMKLDNNDataType(
-                              framework::DataTypeTrait<int8_t>::DataType);
       auto dst_md =
           platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format);
       // create a conv primitive descriptor and save it for usage in backward
       if (bias) {
         bias_tz = paddle::framework::vectorize2int(bias->dims());

@@ -429,11 +454,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                                        memory::format::x);
         conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
                                        strides, paddings, mkldnn_engine,
-                                       output_shift_scale, is_test);
+                                       fuse_relu, fuse_residual_conn,
+                                       output_shift_scale, sum_scale, is_test);
       } else {
         conv_pd =
             ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
-                                 mkldnn_engine, output_shift_scale, is_test);
+                                 mkldnn_engine, fuse_relu, fuse_residual_conn,
+                                 output_shift_scale, sum_scale, is_test);
       }
       // Save conv_pd/src_memory/weights_memory for backward pass
       dev_ctx.SetBlob(key_conv_pd, conv_pd);

@@ -458,8 +485,46 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           user_weights_memory_p, pipeline, is_test, true, scale_weights_data,
           mask_reorder);

-      if (!force_fp32_output) {
-        dst_memory_p = platform::SetDstMemory<int8_t>(ctx, output, handler);
+      if (fuse_residual_conn) {
+        auto residual_param = ctx.Input<Tensor>("ResidualData");
+        PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
+                          "Output and elementwise parameter need to have the "
+                          "same dimension sizes");
+        auto residual_dt =
+            paddle::framework::ToMKLDNNDataType(residual_param->type());
+        if (residual_param->format() != handler->GetDstFormat()) {
+          auto residual_data_tz =
+              paddle::framework::vectorize2int(residual_param->dims());
+          auto user_residual_md = platform::MKLDNNMemDesc(
+              residual_data_tz, residual_dt, residual_param->format());
+          if (residual_dt == mkldnn::memory::data_type::u8) {
+            dst_memory_p = platform::SetDstMemory<uint8_t>(
+                ctx, output, residual_param, user_residual_md, handler,
+                &pipeline);
+          } else {
+            need_s8_to_u8 = fuse_relu;
+            dst_memory_p = platform::SetDstMemory<int8_t>(
+                ctx, output, residual_param, user_residual_md, handler,
+                &pipeline);
+          }
+        } else {
+          output->ShareDataWith(*residual_param);
+          if (residual_dt == mkldnn::memory::data_type::u8) {
+            dst_memory_p =
+                platform::SetDstMemory<uint8_t>(ctx, output, handler);
+          } else {
+            need_s8_to_u8 = fuse_relu;
+            dst_memory_p = platform::SetDstMemory<int8_t>(ctx, output, handler);
+          }
+        }
+      } else if (!force_fp32_output) {
+        if (fuse_relu) {
+          dst_memory_p = platform::SetDstMemory<uint8_t>(ctx, output, handler);
+        } else {
+          dst_memory_p = platform::SetDstMemory<int8_t>(ctx, output, handler);
+        }
       } else {
         dst_memory_p = platform::SetDstMemory<float>(ctx, output, handler);
       }

@@ -467,11 +532,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       // create convolution op primitive
       auto scale_bias_key = key + "@scale_bias";
       if (bias) {
-        const float* bias_data = bias->data<float>();
+        const K* bias_data = bias->data<K>();
         auto user_bias_md = platform::MKLDNNMemDesc(
-            {bias_tz}, platform::MKLDNNGetDataType<float>(), memory::format::x);
+            {bias_tz}, platform::MKLDNNGetDataType<K>(), memory::format::x);
         auto user_bias_memory_p = handler->AcquireBiasMemory(
-            user_bias_md, to_void_cast<float>(bias_data));
+            user_bias_md, to_void_cast<K>(bias_data));
         std::shared_ptr<mkldnn::memory> bias_memory_p;
         int mask_reorder = is_multi_channel ? 1 << 0 : 1;
         int count =

@@ -517,21 +582,51 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx,
                                                       mkldnn_engine, key));
       }
-      if (!force_fp32_output) {
-        dst_memory_p =
-            platform::SetDstMemoryHandler<int8_t>(ctx, output, handler);
+      if (fuse_residual_conn) {
+        auto residual_param = ctx.Input<Tensor>("ResidualData");
+        auto residual_dt =
+            paddle::framework::ToMKLDNNDataType(residual_param->type());
+        output->ShareDataWith(*residual_param);
+        if (residual_dt == mkldnn::memory::data_type::u8) {
+          platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler,
+                                                 &dst_memory_p);
+        } else {
+          platform::SetDstMemoryHandler<int8_t>(ctx, output, handler,
+                                                &dst_memory_p);
+        }
+      } else if (!force_fp32_output) {
+        if (fuse_relu) {
+          platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler,
+                                                 &dst_memory_p);
+        } else {
+          platform::SetDstMemoryHandler<int8_t>(ctx, output, handler,
+                                                &dst_memory_p);
+        }
       } else {
-        dst_memory_p =
-            platform::SetDstMemoryHandler<float>(ctx, output, handler);
+        platform::SetDstMemoryHandler<float>(ctx, output, handler,
+                                             &dst_memory_p);
       }
       if (src_memory_reorder_p) {
         pipeline.push_back(*src_memory_reorder_p);
       }
+      auto residual_reorder_p = std::static_pointer_cast<mkldnn::memory>(
+          dev_ctx.GetBlob(residual_reorder_key));
+      if (residual_reorder_p) {
+        pipeline.push_back(*residual_reorder_p);
+      }
       pipeline.push_back(*conv_p);
     }
     // push primitive to stream and wait until it's executed
     stream(stream::kind::eager).submit(pipeline).wait();
+    if (need_s8_to_u8) {
+      output->mutable_data<uint8_t>(ctx.GetPlace());
+    }
     output->set_layout(DataLayout::kMKLDNN);
     output->set_format(GetMKLDNNFormat(*dst_memory_p));
   }

@@ -563,11 +658,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   }

   mkldnn::primitive_attr CreatePostOps(
-      const std::vector<float> output_shift_scale) const {
+      bool fuse_relu, bool fuse_residual_conn,
+      const std::vector<float> output_shift_scale, float sum_scale) const {
     mkldnn::primitive_attr conv_attr;
     mkldnn::post_ops post_operations;
     int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0;
     conv_attr.set_output_scales(mask, output_shift_scale);
+    if (fuse_residual_conn) {
+      post_operations.append_sum(sum_scale);
+    }
+    if (fuse_relu) {
+      constexpr float scale = 1.0f;
+      constexpr float negative_slope = 0.0f;
+      constexpr float placeholder = 1.0f;  // beta
+      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
+                                     negative_slope, placeholder);
+    }
     conv_attr.set_post_ops(post_operations);
     return conv_attr;
   }

@@ -600,9 +706,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
                        const memory::desc& dst, const std::vector<int>& strides,
                        const std::vector<int>& paddings,
                        const mkldnn::engine& engine,
+                       const bool fuse_relu, const bool fuse_residual_conn,
                        const std::vector<float> output_shift_scale,
-                       bool is_test) const {
+                       const float sum_scale, bool is_test) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};

@@ -613,7 +720,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims,
         padding_dims, padding_dims, mkldnn::padding_kind::zero);

-    mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale);
+    mkldnn::primitive_attr conv_attr = CreatePostOps(
+        fuse_relu, fuse_residual_conn, output_shift_scale, sum_scale);

    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
        conv_desc, conv_attr, engine);

@@ -652,9 +760,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                        const memory::desc& bias, const memory::desc& dst,
                        const std::vector<int>& strides,
                        const std::vector<int>& paddings,
                        const mkldnn::engine& engine,
+                       const bool fuse_relu, const bool fuse_residual_conn,
                        const std::vector<float> output_shift_scale,
-                       bool is_test) const {
+                       const float sum_scale, bool is_test) const {
     memory::dims stride_dims = {strides[0], strides[1]};
     memory::dims padding_dims = {paddings[0], paddings[1]};

@@ -665,7 +774,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         propagation, mkldnn::convolution_direct, src, weights, bias, dst,
         stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);

-    mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale);
+    mkldnn::primitive_attr conv_attr = CreatePostOps(
+        fuse_relu, fuse_residual_conn, output_shift_scale, sum_scale);

    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
        conv_desc, conv_attr, engine);

@@ -868,7 +978,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
       input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p));
     }
     stream(stream::kind::eager).submit(pipeline).wait();
-  }
+  }  // Compute()
 };

 }  // namespace operators
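The destination data-type rules spread across the hunks above can be read in one place. The following is a condensed restatement for illustration, not code from the kernel (the enum and helper are hypothetical): ReLU fusion makes the output unsigned (ReLU results are non-negative, so u8 keeps the full range), force_fp32_output overrides to float, and residual fusion adopts the residual tensor's type so the sum post-op can accumulate in place.

  // Hypothetical distillation of the dst_dt selection in ComputeINT8.
  enum class DType { s8, u8, f32 };

  DType ChooseDstType(bool fuse_relu, bool fuse_residual_conn,
                      bool force_fp32_output, DType residual_dt) {
    DType dst = fuse_relu ? DType::u8 : DType::s8;   // relu -> unsigned
    if (force_fp32_output) dst = DType::f32;         // explicit override
    if (fuse_residual_conn && dst != residual_dt) {
      dst = residual_dt;  // match residual so sum can run in place
    }
    return dst;
  }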
paddle/fluid/operators/elementwise/elementwise_sub_op.cu
@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
+#include "paddle/fluid/platform/float16.h"

 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     elementwise_sub,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
+                              paddle::platform::float16>,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_sub_grad,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::float16>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc (new file)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h"
#include "paddle/fluid/framework/var_type_inference.h"

namespace paddle {
namespace operators {

class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("W"),
                   "Input W of FusedEmbeddingSeqPoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Ids"),
                   "Input Ids of FusedEmbeddingSeqPoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output of FusedEmbeddingSeqPoolOp should not be null.");

    auto table_dims = ctx->GetInputDim("W");
    auto ids_dims = ctx->GetInputDim("Ids");
    const std::string& combiner = ctx->Attrs().Get<std::string>("combiner");

    PADDLE_ENFORCE_EQ(table_dims.size(), 2);
    PADDLE_ENFORCE_GE(ids_dims.size(), 1,
                      "The dim size of the 'Ids' tensor must greater than 1.");
    PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1,
                      "The last dimension of the 'Ids' tensor must be 1.");
    // we only support sum now
    PADDLE_ENFORCE_EQ(combiner, "sum");

    int64_t last_dim = table_dims[1];
    for (int i = 1; i != ids_dims.size(); ++i) {
      last_dim *= ids_dims[i];
    }

    if (ctx->IsRuntime()) {
      framework::Variable* ids_var =
          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Ids")[0]);
      const auto& ids_lod = ids_var->Get<LoDTensor>().lod();

      // in run time, the LoD of ids must be 1
      PADDLE_ENFORCE(ids_lod.size(), 1u,
                     "The LoD level of Input(Ids) must be 1");
      PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty");

      int64_t batch_size = ids_lod[0].size() - 1;

      // in run time, the shape from Ids -> output
      // should be [seq_length, 1] -> [batch_size, embedding_size]
      ctx->SetOutputDim("Out", framework::make_ddim({batch_size, last_dim}));
    } else {
      // in compile time, the lod level of ids must be 1
      framework::VarDesc* ids_desc =
          boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Ids")[0]);
      PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1);

      // in compile time, the shape from Ids -> output
      // should be [-1, 1] -> [-1, embedding_size]
      ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim}));
    }
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
    return framework::OpKernelType(data_type, ctx.device_context());
  }
};

class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("W",
             "(Tensor) The input represents embedding tensors, "
             "which is a learnable parameter.");
    AddInput("Ids",
             "An input with type int32 or int64 "
             "contains the ids to be looked up in W. "
             "The last dimension size must be 1.");
    AddOutput("Out", "The lookup results, which have the same type as W.");
    AddAttr<std::string>("combiner",
                         "(string, default sum) "
                         "A string specifying the reduction op. Currently sum "
                         "are supported, sum computes the weighted sum of the "
                         "embedding results for each row.")
        .SetDefault("sum");
    // NOTE(minqiyang): grad_inplace is an temporal attribute,
    // please do NOT set this attribute in python layer.
    AddAttr<bool>("grad_inplace",
                  "(boolean, default false) "
                  "If the grad op reuse the input's variable.")
        .SetDefault(false);
    AddAttr<bool>("is_sparse",
                  "(boolean, default false) "
                  "Sparse update.")
        .SetDefault(false);
    AddComment(R"DOC(
FusedEmbeddingSeqPool Operator.

Computes embeddings for the given ids and weights.

This operator is used to perform lookups on the parameter W,
then computes the weighted sum of the lookups results for each row
and concatenated into a dense tensor.

The input Ids should carry the LoD (Level of Details) information.
And the output will change the LoD information with input Ids.

)DOC");
  }
};

class FusedEmbeddingSeqPoolOpGradDescMaker
    : public framework::DefaultGradOpDescMaker<true> {
  using ::paddle::framework::DefaultGradOpDescMaker<
      true>::DefaultGradOpDescMaker;

 protected:
  virtual std::string GradOpType() const {
    return "fused_embedding_seq_pool_grad";
  }
};

class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    auto table_dims = ctx->GetInputDim("W");
    ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
    return framework::OpKernelType(data_type, ctx.device_context());
  }
};

class FusedEmbeddingSeqPoolOpGradVarTypeInference
    : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
                  framework::BlockDesc* block) const override {
    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
    auto attr = op_desc.GetAttr("is_sparse");
    bool is_sparse = boost::get<bool>(attr);
    if (is_sparse) {
      VLOG(3) << "fused_embedding_seq_pool_grad op "
              << framework::GradVarName("W") << " is set to SelectedRows";
      block->Var(out_var_name)
          ->SetType(framework::proto::VarType::SELECTED_ROWS);
    } else {
      VLOG(3) << "fused_embedding_seq_pool_grad op "
              << framework::GradVarName("W") << " is set to LoDTensor";
      block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
    }
    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(fused_embedding_seq_pool, ops::FusedEmbeddingSeqPoolOp,
                  ops::FusedEmbeddingSeqPoolOpGradDescMaker,
                  ops::FusedEmbeddingSeqPoolOpMaker);
REGISTER_OPERATOR(fused_embedding_seq_pool_grad,
                  ops::FusedEmbeddingSeqPoolOpGrad,
                  ops::FusedEmbeddingSeqPoolOpGradVarTypeInference);

REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool,
                       ops::FusedEmbeddingSeqPoolKernel<float>,
                       ops::FusedEmbeddingSeqPoolKernel<double>);
REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool_grad,
                       ops::FusedEmbeddingSeqPoolGradKernel<float>,
                       ops::FusedEmbeddingSeqPoolGradKernel<double>);
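A concrete instance of the shape inference above: with a one-level LoD on Ids, batch_size is the number of sequences, i.e. lod[0].size() - 1, and each sequence collapses to one output row. A small standalone check (the LoD offsets and embedding width below are made-up example values):

  #include <cstdint>
  #include <iostream>
  #include <vector>

  int main() {
    // Ids: shape [9, 1]; LoD offsets {0, 2, 5, 9} mark 3 sequences
    // of lengths 2, 3 and 4.
    std::vector<size_t> lod0 = {0, 2, 5, 9};
    const int64_t embedding_size = 64;  // table_dims[1]
    const int64_t batch_size = static_cast<int64_t>(lod0.size()) - 1;  // 3
    // fused_embedding_seq_pool: [seq_length, 1] -> [batch_size, embedding_size]
    std::cout << "Out dims: [" << batch_size << ", " << embedding_size << "]\n";
    return 0;
  }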
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h (new file)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <string>
#include <vector>

#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim;

template <typename T>
struct EmbeddingVSumFunctor {
  void operator()(const framework::ExecutionContext &context,
                  const LoDTensor *table_t, const LoDTensor *ids_t,
                  LoDTensor *output_t) {
    auto *table = table_t->data<T>();
    int64_t row_number = table_t->dims()[0];
    int64_t row_width = table_t->dims()[1];
    int64_t last_dim = output_t->dims()[1];
    const int64_t *ids = ids_t->data<int64_t>();
    auto ids_lod = ids_t->lod()[0];
    int64_t ids_count = ids_t->numel() / ids_lod.back();

    auto *output = output_t->mutable_data<T>(context.GetPlace());

    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
    for (int64_t i = 0; i != ids_lod.size() - 1; ++i) {
      size_t begin = ids_lod[i] * ids_count;
      for (int64_t j = 0; j != ids_count; ++j) {
        PADDLE_ENFORCE_LT(ids[begin], row_number);
        PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i);
        blas.VCOPY(row_width, table + ids[begin + j] * row_width,
                   output + i * last_dim + j * row_width);
      }

      for (int64_t r = (ids_lod[i] + 1) * ids_count;
           r < ids_lod[i + 1] * ids_count; ++r) {
        PADDLE_ENFORCE_LT(ids[r], row_number);
        PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i);
        blas.AXPY(row_width, 1., table + ids[r] * row_width,
                  output + i * last_dim + (r % ids_count) * row_width);
      }
    }
  }
};

template <typename T>
class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    const LoDTensor *ids_t = context.Input<LoDTensor>("Ids");  // int tensor
    LoDTensor *output_t = context.Output<LoDTensor>("Out");    // float tensor
    const LoDTensor *table_var = context.Input<LoDTensor>("W");
    const std::string &combiner_type = context.Attr<std::string>("combiner");

    if (combiner_type == "sum") {
      EmbeddingVSumFunctor<T> functor;
      functor(context, table_var, ids_t, output_t);
    }
  }
};

template <typename T>
class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto *table_var = context.InputVar("W");
    DDim table_dim;
    if (table_var->IsType<LoDTensor>()) {
      table_dim = context.Input<LoDTensor>("W")->dims();
    } else if (table_var->IsType<SelectedRows>()) {
      auto *table_t = context.Input<SelectedRows>("W");
      table_dim = table_t->value().dims();
    } else {
      PADDLE_THROW(
          "The parameter W of a LookupTable "
          "must be either LoDTensor or SelectedRows");
    }

    bool is_sparse = context.Attr<bool>("is_sparse");
    // Since paddings are not trainable and fixed in forward, the gradient of
    // paddings makes no sense and we don't deal with it in backward.
    if (is_sparse) {
      auto *ids = context.Input<LoDTensor>("Ids");
      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
      auto *d_table =
          context.Output<SelectedRows>(framework::GradVarName("W"));

      auto *ids_data = ids->data<int64_t>();
      int64_t ids_num = ids->numel();
      auto lod = ids->lod()[0];
      int64_t row_width = d_output->dims()[1];

      framework::Vector<int64_t> *new_rows = d_table->mutable_rows();
      new_rows->resize(ids_num);
      std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t));

      auto *d_table_value = d_table->mutable_value();
      d_table_value->Resize({ids_num, table_dim[1]});
      T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
      const T *d_output_data = d_output->data<T>();

      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
      for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
        int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
        int64_t in_offset = lod[i] * row_width;
        const T *out_pos = d_output_data + i * row_width;
        T *in_pos = d_table_data + in_offset;
        for (int r = 0; r != h; ++r) {
          blas.VCOPY(row_width, out_pos, in_pos + r * row_width);
        }
      }
    } else {
      LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now";
    }
  }
};

}  // namespace operators
}  // namespace paddle
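For readers untangling the VCOPY/AXPY scheme in EmbeddingVSumFunctor: per sequence, the first lookup of each column block is a copy and every further id is accumulated. A naive scalar reference of the same result for the common case ids_count == 1 (a single id column, so last_dim == row_width), offered as a sketch rather than a drop-in replacement:

  #include <cstdint>
  #include <vector>

  // Naive reference: embedding lookup + sum pooling, combiner = "sum".
  // table is [row_number, row_width] in row-major order; lod holds 1-level
  // sequence offsets into ids; out becomes [lod.size() - 1, row_width].
  void EmbeddingSumRef(const std::vector<float>& table, int64_t row_width,
                       const std::vector<int64_t>& ids,
                       const std::vector<size_t>& lod,
                       std::vector<float>* out) {
    const size_t num_seq = lod.size() - 1;
    out->assign(num_seq * row_width, 0.f);
    for (size_t i = 0; i < num_seq; ++i) {
      float* dst = out->data() + i * row_width;
      for (size_t r = lod[i]; r < lod[i + 1]; ++r) {
        const float* row = table.data() + ids[r] * row_width;
        for (int64_t k = 0; k < row_width; ++k) dst[k] += row[k];
      }
    }
  }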
paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc (new file)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#include "paddle/fluid/operators/fused/fusion_seqpool_concat_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/operators/jit/kernels.h"

namespace paddle {
namespace operators {

void FusionSeqPoolConcatOp::InferShape(
    framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
                    "Inputs(X) of FusionSeqPoolConcatOp should not be empty.");
  PADDLE_ENFORCE(ctx->HasOutput("Out"),
                 "Output(Out) of FusionSeqPoolConcatOp should not be null.");
  int axis = ctx->Attrs().Get<int>("axis");
  PADDLE_ENFORCE_EQ(axis, 1,
                    "FusionSeqPoolConcatOp only supports concat axis=1 yet.");

  auto ins_dims = ctx->GetInputsDim("X");
  const size_t n = ins_dims.size();
  PADDLE_ENFORCE_GT(n, 0UL, "Input tensors count should > 0.");
  if (n == 1) {
    LOG(WARNING) << "Only have one input, may waste memory";
  }

  // The output height should be confirmed in Compute,
  // since input lod is not accessible here.
  PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2UL,
                    "The dims size of first input should be 2.");
  ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast<int>(n)});
}

framework::OpKernelType FusionSeqPoolConcatOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  return framework::OpKernelType(
      framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]), ctx.GetPlace());
}

void FusionSeqPoolConcatOpMaker::Make() {
  AddInput("X", "(LoDTensor) Input tensors of this operator.").AsDuplicable();
  AddOutput("Out", "(LoDTensor) Output tensor of concat operator.");
  AddAttr<std::string>("pooltype",
                       "(string, default 'SUM') some of the pooling "
                       "pooltype of SequencePoolOp.")
      .SetDefault("SUM")
      .InEnum({"AVERAGE", "SUM", "SQRT"});
  AddAttr<int>("axis",
               "The axis along which the input tensors will be concatenated. "
               "Only supports concat axis=1 yet.")
      .SetDefault(1);
  AddComment(R"DOC(
Fusion Sequence Pool of pooltype(sum, average and sqrt) and Concat Operator.
)DOC");
}

template <typename T>
class FusionSeqPoolConcatKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto ins = ctx.MultiInput<LoDTensor>("X");
    auto* out = ctx.Output<LoDTensor>("Out");
    std::string pooltype = ctx.Attr<std::string>("pooltype");
    auto x0_lod = ins[0]->lod();
    auto x0_dims = ins[0]->dims();
    auto y_dims = out->dims();
    size_t bs = x0_lod[0].size() - 1;
    out->Resize({static_cast<int64_t>(bs), y_dims[1]});
    framework::LoD y_lod(1);
    y_lod[0].resize(bs + 1);
    for (size_t i = 0; i <= bs; ++i) {
      y_lod[0][i] = i;
    }
    out->set_lod(y_lod);
    auto place = ctx.GetPlace();
    T* y_data = out->mutable_data<T>(place);

    int w = ins[0]->numel() / x0_dims[0];
    PADDLE_ENFORCE_EQ(y_dims[1] % w, 0,
                      "The output of dims[1] should be dividable of w");
    jit::seq_pool_attr_t attr(w, jit::SeqPoolType::kSum);
    if (pooltype == "AVERAGE") {
      attr.type = jit::SeqPoolType::kAvg;
    } else if (pooltype == "SQRT") {
      attr.type = jit::SeqPoolType::kSqrt;
    }
    auto seqpool =
        jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
            attr);
    size_t n = ins.size();
    size_t dst_step_size = n * w;
    for (size_t i = 0; i < n; ++i) {
      auto x_dims = ins[i]->dims();
      auto x_lod = ins[i]->lod()[0];
      const T* src = ins[i]->data<T>();
      T* dst = y_data + i * w;
      PADDLE_ENFORCE_EQ(static_cast<int>(ins[i]->numel() / x_dims[0]), w,
                        "Width of all inputs should be equal.");
      PADDLE_ENFORCE_EQ(x_lod.size(), bs + 1,
                        "Batchsize of all inputs should be equal.");
      for (size_t j = 0; j < bs; ++j) {
        attr.h = static_cast<int>(x_lod[j + 1] - x_lod[j]);
        seqpool(src, dst, &attr);
        dst += dst_step_size;
        src += attr.h * attr.w;
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(fusion_seqpool_concat, ops::FusionSeqPoolConcatOp,
                  ops::FusionSeqPoolConcatOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);

REGISTER_OP_CPU_KERNEL(fusion_seqpool_concat,
                       ops::FusionSeqPoolConcatKernel<float>,
                       ops::FusionSeqPoolConcatKernel<double>);
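The fused kernel writes each input's pooled rows directly into an interleaved output layout, which is what makes the axis=1 concat free. A naive reference of the same computation for pooltype "SUM", with plain loops standing in for the JIT seqpool kernel (a sketch, not the operator itself):

  #include <cstddef>
  #include <vector>

  // Reference for fusion_seqpool_concat, pooltype = "SUM".
  // xs[i] is input i flattened to [rows_i, w]; lods[i] holds its 1-level LoD
  // offsets (all inputs share batch size bs = lods[i].size() - 1).
  // y becomes [bs, n * w]; input i fills columns [i*w, (i+1)*w).
  void SeqPoolConcatRef(const std::vector<std::vector<float>>& xs,
                        const std::vector<std::vector<size_t>>& lods,
                        size_t w, std::vector<float>* y) {
    const size_t n = xs.size();
    const size_t bs = lods[0].size() - 1;
    y->assign(bs * n * w, 0.f);
    for (size_t i = 0; i < n; ++i) {
      for (size_t j = 0; j < bs; ++j) {
        float* dst = y->data() + (j * n + i) * w;  // slot i of batch row j
        for (size_t r = lods[i][j]; r < lods[i][j + 1]; ++r) {
          const float* src = xs[i].data() + r * w;
          for (size_t k = 0; k < w; ++k) dst[k] += src[k];
        }
      }
    }
  }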
paddle/fluid/operators/fused/fusion_seqpool_concat_op.h (new file)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#pragma once

#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;

class FusionSeqPoolConcatOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};

class FusionSeqPoolConcatOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override;
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/jit/benchmark.cc
@@ -52,11 +52,11 @@ struct BenchFunc {
     for (int i = 0; i < FLAGS_burning; ++i) {
       tgt(args...);
     }
-    auto start = paddle::platform::PosixInNsec() / 1e-3;
+    auto start = paddle::platform::PosixInNsec() * 1e-3;
     for (int i = 0; i < FLAGS_repeat; ++i) {
       tgt(args...);
     }
-    auto end = paddle::platform::PosixInNsec() / 1e-3;
+    auto end = paddle::platform::PosixInNsec() * 1e-3;
     return static_cast<double>(end - start) / FLAGS_repeat;
   }
 };
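The one-character fix above matters: PosixInNsec() returns nanoseconds, and microseconds are ns * 1e-3. The old code divided by 1e-3, i.e. multiplied by 1000, so every reported time was inflated by a factor of one million. A tiny check of the arithmetic:

  #include <cstdio>

  int main() {
    const double ns = 2500000.0;  // 2.5 ms expressed in nanoseconds
    const double us = ns * 1e-3;  // 2500 us  (correct)
    const double bad = ns / 1e-3; // 2.5e9    (what the old code computed)
    std::printf("%.0f us (correct) vs %.0f (old)\n", us, bad);
    return 0;
  }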
@@ -190,6 +190,26 @@ void BenchGRUKernel() {
   }
 }

+template <paddle::operators::jit::KernelType KT, typename T,
+          typename PlaceType>
+void BenchSeqPoolKernel() {
+  std::vector<jit::SeqPoolType> pool_types = {
+      jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
+  for (auto type : pool_types) {
+    for (int w : TestSizes()) {
+      jit::seq_pool_attr_t attr(w, type);
+      for (int h : TestSizes()) {
+        attr.h = h;
+        std::vector<T> x(h * w), y(w);
+        RandomVec<T>(h * w, x.data(), -2.f, 2.f);
+        const T* x_data = x.data();
+        T* y_data = y.data();
+        BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data,
+                                                            y_data, &attr);
+      }
+    }
+  }
+}
+
 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]
 // Options:

@@ -228,4 +248,7 @@ int main(int argc, char* argv[]) {
   BenchGRUKernel<jit::kGRUH1, T, PlaceType>();
   BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
   BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
+
+  // seq pool function
+  BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>();
 }
paddle/fluid/operators/jit/gen/CMakeLists.txt
@@ -26,3 +26,4 @@ USE_JITKERNEL_GEN(kGRUH1)
 USE_JITKERNEL_GEN(kGRUHtPart1)
 USE_JITKERNEL_GEN(kGRUHtPart2)
 USE_JITKERNEL_GEN(kNCHW16CMulNC)
+USE_JITKERNEL_GEN(kSeqPool)
paddle/fluid/operators/jit/gen/seqpool.cc (new file)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#include "paddle/fluid/operators/jit/gen/seqpool.h"
#include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"

namespace paddle {
namespace operators {
namespace jit {
namespace gen {

void SeqPoolJitCode::genCode() {
  constexpr int block = YMM_FLOAT_BLOCK;
  constexpr int max_num_regs = 8;
  const int num_block = w_ / block;
  const int num_groups = num_block / max_num_regs;
  int rest_num_regs = num_block % max_num_regs;
  mov(reg32_int_h, dword[param_attr]);
  if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
    mov(reg_tmp, reinterpret_cast<size_t>(exp_float_consts));
    vmovups(xmm_t(1), ptr[reg_tmp + OFFSET_EXP_ONE]);
    mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
    fild(dword[param_attr]);
    fstp(dword[reg_tmp]);
    vmovss(xmm_t(0), ptr[reg_tmp]);
    if (type_ == SeqPoolType::kSqrt) {
      vsqrtps(xmm_t(0), xmm_t(0));
    }
    vdivps(xmm_t(1), xmm_t(1), xmm_t(0));
    vmovss(ptr[reg_tmp], xmm_t(1));
  }
  const int group_len = max_num_regs * block * sizeof(float);
  for (int g = 0; g < num_groups; ++g) {
    pool_height<ymm_t>(g * group_len, block, max_num_regs);
  }
  if (rest_num_regs > 0) {
    pool_height<ymm_t>(num_groups * group_len, block, rest_num_regs);
  }
  // part of rest_w * height
  const int rest = w_ % block;
  pool_height_of_rest_width(rest, (w_ - rest) * sizeof(float), max_num_regs);
  ret();
}

class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
 public:
  bool UseMe(const seq_pool_attr_t& attr) const override {
    return platform::MayIUse(platform::avx);
  }
  size_t CodeSize(const seq_pool_attr_t& attr) const override {
    return 96 +
           ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) *
                4 /* load, mul and save */ +
            256) *
               8;
  }
  std::unique_ptr<GenBase> CreateJitCode(
      const seq_pool_attr_t& attr) const override {
    PADDLE_ENFORCE_GT(attr.w, 0);
    PADDLE_ENFORCE_GT(attr.h, 0);
    return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
  }
};

}  // namespace gen
}  // namespace jit
}  // namespace operators
}  // namespace paddle

namespace gen = paddle::operators::jit::gen;

REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator);
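How genCode carves up a row of width w_: YMM_FLOAT_BLOCK is 8 floats per YMM register and up to max_num_regs = 8 registers accumulate in parallel, so a row splits into full groups of 64 floats, a partial group, and a sub-8 tail handled by pool_height_of_rest_width. A worked example of that split (w = 100 is an arbitrary example width):

  #include <iostream>

  int main() {
    const int block = 8;          // YMM_FLOAT_BLOCK: floats per YMM register
    const int max_num_regs = 8;   // accumulator registers per group
    const int w = 100;            // example row width
    const int num_block = w / block;                      // 12 full blocks
    const int num_groups = num_block / max_num_regs;      // 1 group of 8 blocks
    const int rest_num_regs = num_block % max_num_regs;   // 4 leftover blocks
    const int rest = w % block;                           // 4-float tail
    std::cout << num_groups << " full group(s), " << rest_num_regs
              << " extra block(s), tail of " << rest << " float(s)\n";
    return 0;
  }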
paddle/fluid/operators/jit/gen/seqpool.h (new file)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {

class SeqPoolJitCode : public JitCode {
 public:
  explicit SeqPoolJitCode(const seq_pool_attr_t& attr,
                          size_t code_size = 256 * 1024,
                          void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) {
    if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg ||
          type_ == SeqPoolType::kSqrt)) {
      LOG(FATAL) << "Only sum, avg and sqrt pool types are supported";
    }
    fp_h_[0] = 1.f;
    this->genCode();
  }

  virtual const char* name() const {
    std::string base = "SeqPoolJitCode";
    if (type_ == SeqPoolType::kSum) {
      base += "_Sum";
    } else if (type_ == SeqPoolType::kAvg) {
      base += "_Avg";
    } else if (type_ == SeqPoolType::kSqrt) {
      base += "_Sqrt";
    }
    base += ("_W" + std::to_string(w_));
    return base.c_str();
  }
  void genCode() override;

 protected:
  template <typename JMM>
  void pool_height(int w_offset, int block, int max_num_regs) {
    int offset = w_offset;
    for (int i = 0; i < max_num_regs; ++i) {
      vmovups(JMM(i), ptr[param_src + offset]);
      offset += sizeof(float) * block;
    }
    cmp(reg32_int_h, 1);
    Label l_next_h, l_h_done;
    jle(l_h_done, T_NEAR);
    mov(reg_h_i, 1);
    mov(reg_tmp, param_src);
    add(reg_tmp, w_ * sizeof(float) + w_offset);
    L(l_next_h);
    {
      mov(reg_ptr_src_i, reg_tmp);
      for (int i = 0; i < max_num_regs; ++i) {
        vmovups(JMM(i + max_num_regs), ptr[reg_ptr_src_i]);
        // sum anyway
        vaddps(JMM(i), JMM(i), JMM(i + max_num_regs));
        add(reg_ptr_src_i, sizeof(float) * block);
      }
      inc(reg_h_i);
      add(reg_tmp, w_ * sizeof(float));
      cmp(reg_h_i, reg32_int_h);
      jl(l_next_h, T_NEAR);
    }
    L(l_h_done);
    // save right now
    if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
      mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
      vbroadcastss(JMM(max_num_regs), ptr[reg_tmp]);
    }
    offset = w_offset;
    for (int i = 0; i < max_num_regs; ++i) {
      if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
        vmulps(JMM(i), JMM(i), JMM(max_num_regs));
      }
      vmovups(ptr[param_dst + offset], JMM(i));
      offset += sizeof(float) * block;
    }
  }

  void pool_height_of_rest_width(int rest, int w_offset, int max_num_regs) {
    const int rest_used_num_regs = load_rest(rest, w_offset, 0);
    const bool has_block4 = rest / 4 > 0;
    const bool has_block2 = (rest % 4) / 2 > 0;
    const bool has_block1 = (rest % 2) == 1;
    cmp(reg32_int_h, 1);
    Label l_next_h, l_h_done;
    jle(l_h_done, T_NEAR);
    mov(reg_h_i, 1);
    mov(reg_tmp, param_src);
    add(reg_tmp, w_ * sizeof(float) + w_offset);
    L(l_next_h);
    {
      int reg_idx = 0;
      mov(reg_ptr_src_i, reg_tmp);
      if (has_block4) {
        vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
        add(reg_ptr_src_i, sizeof(float) * 4);
        reg_idx++;
      }
      if (has_block2) {
        vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
        add(reg_ptr_src_i, sizeof(float) * 2);
        reg_idx++;
      }
      if (has_block1) {
        vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
        reg_idx++;
      }
      PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs,
                        "All heights should use same regs");
      for (int i = 0; i < reg_idx; ++i) {
        vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
      }
      inc(reg_h_i);
      add(reg_tmp, w_ * sizeof(float));
      cmp(reg_h_i, reg32_int_h);
      jl(l_next_h, T_NEAR);
    }
    L(l_h_done);
    // save right now
    if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
      mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
      vbroadcastss(xmm_t(max_num_regs), ptr[reg_tmp]);
      for (int i = 0; i < rest_used_num_regs; ++i) {
        vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs));
      }
    }
    save_rest(rest, w_offset);
  }

  // return the number of used regs, use start from reg 0
  int load_rest(int rest, int w_offset, const int num_shift_regs,
                const int reg_start = 0) {
    const bool has_block4 = rest / 4 > 0;
    const bool has_block2 = (rest % 4) / 2 > 0;
    const bool has_block1 = (rest % 2) == 1;
    int reg_idx = reg_start;
    if (has_block4) {
      vmovups(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
      w_offset += sizeof(float) * 4;
      reg_idx++;
    }
    if (has_block2) {
      vmovq(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
      w_offset += sizeof(float) * 2;
      reg_idx++;
    }
    if (has_block1) {
      vmovss(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
      reg_idx++;
    }
    return reg_idx;
  }

  // use reg start from 0
  void save_rest(int rest, int w_offset, int reg_start = 0) {
    const bool has_block4 = rest / 4 > 0;
    const bool has_block2 = (rest % 4) / 2 > 0;
    const bool has_block1 = (rest % 2) == 1;
    int reg_idx = reg_start;
    if (has_block4) {
      vmovups(ptr[param_dst + w_offset], xmm_t(reg_idx));
      w_offset += sizeof(float) * 4;
      reg_idx++;
    }
    if (has_block2) {
      vmovq(ptr[param_dst + w_offset], xmm_t(reg_idx));
      w_offset += sizeof(float) * 2;
      reg_idx++;
    }
    if (has_block1) {
      vmovss(ptr[param_dst + w_offset], xmm_t(reg_idx));
    }
  }

 private:
  float ALIGN32_BEG fp_h_[1] ALIGN32_END;
  int w_;
  SeqPoolType type_;
  reg64_t param_src{abi_param1};
  reg64_t param_dst{abi_param2};
  reg64_t param_attr{abi_param3};
  reg64_t reg_tmp{rax};

  reg32_t reg32_int_h{r8d};
  reg32_t reg32_fp_h{r9d};

  reg64_t reg_h_i{r10};
  reg64_t reg_ptr_src_i{r11};
};

}  // namespace gen
}  // namespace jit
}  // namespace operators
}  // namespace paddle
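A quick sanity check of the width decomposition that genCode() performs, restated in plain C++. Treating YMM_FLOAT_BLOCK as 8 (eight floats per 256-bit register) is an assumption here.

#include <cstdio>

int main() {
  const int block = 8, max_num_regs = 8, w = 100;
  const int num_block = w / block;                     // 12 full YMM blocks
  const int num_groups = num_block / max_num_regs;     // 1 group of 8 blocks
  const int rest_num_regs = num_block % max_num_regs;  // 4 leftover blocks
  const int rest = w % block;  // 4 tail floats, handled via xmm/vmovq/vmovss
  std::printf("%d %d %d %d\n", num_block, num_groups, rest_num_regs, rest);
  return 0;
}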
paddle/fluid/operators/jit/helper.cc
@@ -26,6 +26,7 @@ namespace jit {
 const char* to_string(KernelType kt) {
   switch (kt) {
     ONE_CASE(kNone);
     ONE_CASE(kVMul);
     ONE_CASE(kVAdd);
     ONE_CASE(kVAddRelu);
@@ -45,12 +46,26 @@ const char* to_string(KernelType kt) {
     ONE_CASE(kCRFDecoding);
     ONE_CASE(kLayerNorm);
     ONE_CASE(kNCHW16CMulNC);
+    ONE_CASE(kSeqPool);
     default:
       PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
       return "NOT JITKernel";
   }
   return nullptr;
 }

+const char* to_string(SeqPoolType tp) {
+  switch (tp) {
+    ONE_CASE(kNonePoolType);
+    ONE_CASE(kSum);
+    ONE_CASE(kAvg);
+    ONE_CASE(kSqrt);
+    default:
+      PADDLE_THROW("Not support type: %d, or forget to add it.", tp);
+      return "NOT PoolType";
+  }
+  return nullptr;
+}
+
 #undef ONE_CASE

 KernelType to_kerneltype(const std::string& act) {
...
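ONE_CASE is defined earlier in helper.cc, outside this hunk; presumably it is the usual stringizing case macro, roughly as sketched below, so that to_string(kSqrt) returns the literal "kSqrt".

// Assumed shape of the macro (not visible in this hunk):
#define ONE_CASE(key) \
  case key:           \
    return #key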
paddle/fluid/operators/jit/helper.h
@@ -119,6 +119,7 @@ typename KernelTuples::func_type Get(
 }

 const char* to_string(KernelType kt);
+const char* to_string(SeqPoolType kt);
 KernelType to_kerneltype(const std::string& act);
@@ -134,6 +135,11 @@ inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) {
      << "],act_cand[" << to_string(attr.act_cand) << "]";
   return os;
 }

+inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) {
+  os << "height_size[" << attr.h << "],width_size[" << attr.w
+     << "],pool_type[" << to_string(attr.type) << "]";
+  return os;
+}
 }  // namespace jit
 }  // namespace operators
...
paddle/fluid/operators/jit/kernel_base.h
@@ -41,8 +41,16 @@ typedef enum {
   kCRFDecoding,
   kLayerNorm,
   kNCHW16CMulNC,
+  kSeqPool,
 } KernelType;

+typedef enum {
+  kNonePoolType = 0,
+  kSum = 1,
+  kAvg,
+  kSqrt,
+} SeqPoolType;
+
 template <typename T>
 struct XYZNTuples {
   typedef T data_type;
@@ -112,6 +120,21 @@ struct GRUTuples {
   typedef void (*func_type)(gru_t*, const gru_attr_t*);
 };

+typedef struct seq_pool_attr_s {
+  int h, w;  // h should always be the first one
+  SeqPoolType type;
+  seq_pool_attr_s() = default;
+  explicit seq_pool_attr_s(int width, SeqPoolType pool_type, int height = 1)
+      : h(height), w(width), type(pool_type) {}
+} seq_pool_attr_t;
+
+template <typename T>
+struct SeqPoolTuples {
+  typedef T data_type;
+  typedef seq_pool_attr_t attr_type;
+  typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
+};
+
 template <typename T>
 struct CRFDecodingTuples {
   typedef T data_type;
...
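A small sketch of how the new attribute type is meant to be used; the pattern follows TestSeqPoolKernel and SequencePoolFunctor later in this commit.

#include "paddle/fluid/operators/jit/kernel_base.h"

void AttrExample() {
  using paddle::operators::jit::seq_pool_attr_t;
  using paddle::operators::jit::SeqPoolType;
  seq_pool_attr_t attr(/*width=*/64, SeqPoolType::kAvg);  // h defaults to 1
  attr.h = 10;  // height is reset per sequence before each kernel call
}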
paddle/fluid/operators/jit/kernel_key.cc
@@ -42,6 +42,13 @@ size_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
          (static_cast<int>(attr.act_cand) << act_type_shift);
 }

+template <>
+size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
+  size_t key = attr.w;
+  constexpr int pool_type_shift = 3;
+  return (key << pool_type_shift) + static_cast<int>(attr.type);
+}
+
 }  // namespace jit
 }  // namespace operators
 }  // namespace paddle
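The key layout above packs the width into the high bits and the pool type into the low three bits. A standalone restatement for checking collisions (values hand-computed):

#include <cstddef>

size_t seq_pool_key(int w, int type) {
  constexpr int pool_type_shift = 3;
  return (static_cast<size_t>(w) << pool_type_shift) + type;
}
// seq_pool_key(64, 1 /*kSum*/) == 513; distinct (w, type) pairs stay
// distinct as long as the SeqPoolType enum values stay below 8.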
paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
@@ -9,3 +9,4 @@ USE_JITKERNEL_MORE(kVScal, mkl)
 USE_JITKERNEL_MORE(kVExp, mkl)
 USE_JITKERNEL_MORE(kVSigmoid, mkl)
 USE_JITKERNEL_MORE(kVTanh, mkl)
+USE_JITKERNEL_MORE(kSeqPool, mkl)
paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -72,6 +72,26 @@ void VExp<double>(const double* x, double* y, int n) {
   platform::dynload::vdExp(n, x, y);
 }

+template <>
+void VCopy<float>(const float* x, float* y, int n) {
+  platform::dynload::cblas_scopy(n, x, 1, y, 1);
+}
+
+template <>
+void VCopy<double>(const double* x, double* y, int n) {
+  platform::dynload::cblas_dcopy(n, x, 1, y, 1);
+}
+
+template <>
+void VAXPY<float>(float a, const float* x, float* y, int n) {
+  platform::dynload::cblas_saxpy(n, a, x, 1, y, 1);
+}
+
+template <>
+void VAXPY<double>(double a, const double* x, double* y, int n) {
+  platform::dynload::cblas_daxpy(n, a, x, 1, y, 1);
+}
+
 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
 template <>
 bool VMulKernel<float>::UseMe(const int& d) const {
@@ -103,6 +123,16 @@ bool VTanhKernel<float>::UseMe(const int& d) const {
   return d > 7;
 }

+template <>
+bool SeqPoolKernel<float>::UseMe(const seq_pool_attr_t& attr) const {
+  return true;
+}
+
+template <>
+bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
+  return true;
+}
+
 #define AWALYS_USE_ME_WITH_DOUBLE(func)                    \
   template <>                                              \
   bool func##Kernel<double>::UseMe(const int& d) const {   \
@@ -135,5 +165,6 @@ REGISTER_MKL_KERNEL(kVScal, VScal);
 REGISTER_MKL_KERNEL(kVExp, VExp);
 REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
 REGISTER_MKL_KERNEL(kVTanh, VTanh);
+REGISTER_MKL_KERNEL(kSeqPool, SeqPool);

 #undef REGISTER_MKL_KERNEL
paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -14,6 +14,7 @@
 #pragma once

+#include <cmath>
 #include <type_traits>
 #include "paddle/fluid/operators/jit/kernel_base.h"
@@ -35,6 +36,12 @@ void VScal(const T* a, const T* x, T* y, int n);
 template <typename T>
 void VExp(const T* x, T* y, int n);

+template <typename T>
+void VCopy(const T* x, T* y, int n);
+
+template <typename T>
+void VAXPY(T a, const T* x, T* y, int n);
+
 template <typename T>
 void VSigmoid(const T* x, T* y, int n) {
   const T min = SIGMOID_THRESHOLD_MIN;
@@ -60,6 +67,23 @@ void VTanh(const T* x, T* y, int n) {
   }
 }

+template <typename T>
+void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
+  VCopy<T>(x, y, attr->w);
+  for (int h = 1; h != attr->h; ++h) {
+    VAXPY<T>(static_cast<T>(1), x + h * attr->w, y, attr->w);
+  }
+  if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) {
+    T scalar = static_cast<T>(1);
+    if (attr->type == SeqPoolType::kAvg) {
+      scalar = scalar / static_cast<T>(attr->h);
+    } else {
+      scalar = scalar / std::sqrt(static_cast<T>(attr->h));
+    }
+    VScal<T>(&scalar, y, y, attr->w);
+  }
+}
+
 #define DECLARE_MKL_KERNEL(name, tuples)              \
   template <typename T>                               \
   class name##Kernel : public KernelMore<tuples<T>> { \
@@ -81,6 +105,8 @@ DECLARE_MKL_KERNEL(VExp, XYNTuples);
 DECLARE_MKL_KERNEL(VSigmoid, XYNTuples);
 DECLARE_MKL_KERNEL(VTanh, XYNTuples);

+DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);
+
 #undef DECLARE_MKL_KERNEL

 }  // namespace mkl
...
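A plain-C++ restatement of the VCopy + VAXPY + VScal pipeline above, as a mental model of what the MKL calls compute; a sketch, not the shipped code.

#include <cmath>
#include <vector>

void seq_pool_plain(const float* x, float* y, int h, int w, bool avg,
                    bool sqrt_pool) {
  std::vector<float> acc(x, x + w);   // VCopy: start from the first row
  for (int r = 1; r < h; ++r)         // VAXPY with a == 1: add row r
    for (int c = 0; c < w; ++c) acc[c] += x[r * w + c];
  float scalar = 1.0f;
  if (avg) scalar = 1.0f / static_cast<float>(h);                  // kAvg
  if (sqrt_pool) scalar = 1.0f / std::sqrt(static_cast<float>(h)); // kSqrt
  for (int c = 0; c < w; ++c) y[c] = acc[c] * scalar;              // VScal
}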
paddle/fluid/operators/jit/refer/CMakeLists.txt
@@ -26,3 +26,4 @@ USE_JITKERNEL_REFER(kGRUHtPart2)
 USE_JITKERNEL_REFER(kCRFDecoding)
 USE_JITKERNEL_REFER(kLayerNorm)
 USE_JITKERNEL_REFER(kNCHW16CMulNC)
+USE_JITKERNEL_REFER(kSeqPool)
paddle/fluid/operators/jit/refer/refer.cc
@@ -47,4 +47,6 @@ REGISTER_REFER_KERNEL(kLayerNorm, LayerNorm);
 REGISTER_REFER_KERNEL(kNCHW16CMulNC, NCHW16CMulNC);

+REGISTER_REFER_KERNEL(kSeqPool, SeqPool);
+
 #undef REGISTER_REFER_KERNEL
paddle/fluid/operators/jit/refer/refer.h
@@ -332,6 +332,28 @@ void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) {
   }
 }

+template <typename T>
+void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
+  for (int w = 0; w < attr->w; ++w) {
+    const T* src = x + w;
+    T* dst = y + w;
+    *dst = static_cast<T>(0);
+    for (int h = 0; h < attr->h; ++h) {
+      *dst = *dst + *src;
+      src += attr->w;
+    }
+  }
+  if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) {
+    T scalar = static_cast<T>(1);
+    if (attr->type == SeqPoolType::kAvg) {
+      scalar = scalar / static_cast<T>(attr->h);
+    } else {
+      scalar = scalar / std::sqrt(static_cast<T>(attr->h));
+    }
+    VScal<T>(&scalar, y, y, attr->w);
+  }
+}
+
 #define DECLARE_REFER_KERNEL(name, tuples)             \
   template <typename T>                                \
   class name##Kernel : public ReferKernel<tuples<T>> { \
@@ -370,6 +392,8 @@ DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples);
 DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples);

+DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples);
+
 #undef DECLARE_REFER_KERNEL

 }  // namespace refer
...
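A hand-checked instance of the reference loop above for h = 2, w = 3:

#include <cassert>
#include <cmath>

int main() {
  const float x[6] = {1, 2, 3, 4, 5, 6};  // two rows of width 3
  float y[3];
  for (int c = 0; c < 3; ++c) y[c] = x[c] + x[3 + c];  // column sums
  assert(y[0] == 5.f && y[1] == 7.f && y[2] == 9.f);   // kSum
  assert(y[0] / 2 == 2.5f);                            // kAvg scalar = 1/h
  assert(std::fabs(y[0] / std::sqrt(2.f) - 3.5355f) < 1e-3f);  // kSqrt
  return 0;
}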
paddle/fluid/operators/jit/test.cc
@@ -211,6 +211,24 @@ struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
   }
 };

+template <typename T>
+struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>,
+                         std::vector<T>> {
+  void operator()(const typename jit::SeqPoolTuples<T>::func_type tgt,
+                  const std::vector<T>& x, const std::vector<T>& yref,
+                  const typename jit::SeqPoolTuples<T>::attr_type& attr) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(x.size() % yref.size(), 0);
+    int w = yref.size();
+    std::vector<T> y(w);
+    const T* x_data = x.data();
+    const T* yref_data = yref.data();
+    T* y_data = y.data();
+    tgt(x_data, y_data, &attr);
+    ExpectEQ<T>(y_data, yref_data, w);
+  }
+};
+
 template <paddle::operators::jit::KernelType KT, typename KernelTuples,
           typename PlaceType, typename... Args>
 void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
@@ -415,6 +433,31 @@ void TestGRUKernel() {
   }
 }

+template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+void TestSeqPoolKernel() {
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  std::vector<jit::SeqPoolType> pool_types = {
+      jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
+  for (auto type : pool_types) {
+    for (int w : TestSizes()) {
+      jit::seq_pool_attr_t attr(w, type);
+      for (int h : TestSizes()) {
+        attr.h = h;
+        auto ref = jit::GetRefer<KT, jit::SeqPoolTuples<T>>();
+        EXPECT_TRUE(ref != nullptr);
+        std::vector<T> x(h * w), yref(w);
+        RandomVec<T>(h * w, x.data(), -2.f, 2.f);
+        const T* x_data = x.data();
+        T* yref_data = yref.data();
+        ref(x_data, yref_data, &attr);
+        VLOG(10) << attr;
+        TestAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType, std::vector<T>,
+                     std::vector<T>>(attr, x, yref, attr);
+      }
+    }
+  }
+}
+
 template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
 void TestNCHW16CMulNCKernel() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
@@ -569,6 +612,12 @@ TEST(JITKernel, kGRUHtPart2) {
   TestGRUKernel<jit::kGRUHtPart2, double, paddle::platform::CPUPlace>();
 }

+TEST(JITKernel, kSeqPool) {
+  namespace jit = paddle::operators::jit;
+  TestSeqPoolKernel<jit::kSeqPool, float, paddle::platform::CPUPlace>();
+  TestSeqPoolKernel<jit::kSeqPool, double, paddle::platform::CPUPlace>();
+}
+
 TEST(JITKernel, kNCHW16CMulNC) {
   namespace jit = paddle::operators::jit;
   TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, float,
...
paddle/fluid/operators/math/CMakeLists.txt
@@ -51,7 +51,7 @@ math_library(pooling)
 math_library(selected_rows_functor DEPS selected_rows math_function blas)
 math_library(sequence2batch)
 math_library(sequence_padding)
-math_library(sequence_pooling DEPS math_function)
+math_library(sequence_pooling DEPS math_function jit_kernel_helper)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
...
paddle/fluid/operators/math/selected_rows_functor.cc
@@ -195,6 +195,10 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::SelectedRows& input1,
                   framework::Tensor* input2) {
+    if (UNLIKELY(input1.rows().size() == 0)) {
+      LOG(WARNING) << "input selected rows is empty!";
+      return;
+    }
     auto in1_height = input1.height();
     auto in2_dims = input2->dims();
     PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
...
paddle/fluid/operators/math/sequence_pooling.cc
@@ -14,6 +14,7 @@ limitations under the License. */

 #include <string>
+#include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence_pooling.h"
@@ -239,15 +240,33 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
       last_pool(context, input, output);
       return;
     }
     if (pooltype == "FIRST") {
       math::FirstSeqPoolFunctor<T> first_pool;
       first_pool(context, input, output);
       return;
     }
     auto lod = input.lod()[0];
+    if (pooltype == "SUM") {
+      auto place = context.GetPlace();
+      PADDLE_ENFORCE(platform::is_cpu_place(place));
+      const T* src = input.data<T>();
+      T* dst = output->mutable_data<T>(place);
+      jit::seq_pool_attr_t attr(
+          static_cast<int>(input.numel() / input.dims()[0]),
+          jit::SeqPoolType::kSum);
+      auto seqpool =
+          jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
+              attr);
+      for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
+        attr.h = static_cast<int>(lod[i + 1] - lod[i]);
+        seqpool(src, dst, &attr);
+        dst += attr.w;
+        src += attr.h * attr.w;
+      }
+      return;
+    }
     auto& place = *context.eigen_device();
     auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
     for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
       Tensor in_t =
           input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
@@ -258,15 +277,6 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
       auto out_e = EigenVector<T>::Flatten(out_t);
       if (pooltype == "AVERAGE") {
         out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
-      } else if (pooltype == "SUM") {
-        if (h > 0) {
-          const T* in_data = in_t.data<T>();
-          T* out_data = out_t.mutable_data<T>(context.GetPlace());
-          blas.VCOPY(w, in_data, out_data);
-          for (int64_t r = 1; r != h; ++r) {
-            blas.AXPY(w, 1., in_data + r * w, out_data);
-          }
-        }
       } else if (pooltype == "SQRT") {
         out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
                               std::sqrt(static_cast<T>(h));
...
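The LoD walk above, factored out into a standalone sketch: for lod = {0, 2, 5} it issues two calls with h = 2 and h = 3, advancing src by h * w floats and dst by w floats per sequence. The function-pointer signature is illustrative only.

#include <cstddef>
#include <vector>

void pool_all_sequences(const float* src, float* dst,
                        const std::vector<size_t>& lod, int w,
                        void (*pool_one)(const float*, float*, int, int)) {
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    const int h = static_cast<int>(lod[i + 1] - lod[i]);
    pool_one(src, dst, h, w);
    dst += w;      // one pooled row per sequence
    src += h * w;  // skip the rows just consumed
  }
}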
paddle/fluid/operators/math/softmax.h
@@ -49,6 +49,7 @@ class SoftmaxGradCUDNNFunctor {
                   const framework::Tensor* Y, const framework::Tensor* y_grad,
                   framework::Tensor* x_grad);
 };
 #endif

 }  // namespace math
...
paddle/fluid/operators/ngraph/ngraph_ops.h
@@ -23,5 +23,7 @@ limitations under the License. */
 #include "ops/binary_unnary_op.h"
 #include "ops/fill_constant_op.h"
+#include "ops/mean_op.h"
 #include "ops/mul_op.h"
+#include "ops/scale_op.h"
 #include "ops/top_k_op.h"
paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef PADDLE_WITH_NGRAPH
 #pragma once

 #include <string>
@@ -48,4 +47,3 @@ static void BuildUnaryNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
-#endif
paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h
0 → 100644
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {

template <typename T>
std::shared_ptr<ngraph::Node> ElementwiseScalar(
    float scale, std::shared_ptr<ngraph::Node> node) {
  auto node_shape = node->get_shape();
  auto scale_const = ngraph::op::Constant::create(node->get_element_type(),
                                                  node_shape, {scale});
  return std::make_shared<T>(scale_const, node);
}

template <typename T>
std::shared_ptr<ngraph::Node> ElementwiseScalar(
    std::shared_ptr<ngraph::Node> scale_1d,
    std::shared_ptr<ngraph::Node> node) {
  auto scale_shape = scale_1d->get_shape();
  PADDLE_ENFORCE_EQ(scale_shape.size(), 1, "Supporting 1d scale node");
  PADDLE_ENFORCE_EQ(scale_shape.at(0), 1, "scale 1d is in shape {1}");

  auto node_shape = node->get_shape();
  ngraph::AxisSet axis_set;
  for (size_t i = 0; i < node_shape.size(); ++i) {
    axis_set.insert(i);
  }
  node_shape.push_back(1);

  auto scale_bcast =
      std::make_shared<ngraph::op::Broadcast>(scale_1d, node_shape, axis_set);

  auto scale_reshape =
      paddle::platform::NgReshaper(scale_bcast, node->get_shape());

  return std::make_shared<T>(scale_reshape, node);
}
}  // namespace ngraphs
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/ngraph/ops/fill_constant_op.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef PADDLE_WITH_NGRAPH
 #pragma once

 #include <string>
@@ -58,4 +57,3 @@ void BuildFillConstantNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
-#endif
paddle/fluid/operators/ngraph/ops/mean_op.h
0 → 100644
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {

void BuildMeanNode(
    const std::shared_ptr<paddle::framework::OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
  ngraph::AxisSet axes;
  for (size_t i = 0; i < input->get_shape().size(); ++i) {
    axes.insert(i);
  }

  auto mean = ngraph::builder::mean(input, axes);
  auto mean_1d = std::make_shared<ngraph::op::Reshape>(
      mean, ngraph::AxisVector{}, ngraph::Shape{1});
  paddle::platform::SetOutputNode(op, "Out", mean_1d, ngb_node_map);
}

void BuildMeanGradNode(
    const std::shared_ptr<paddle::framework::OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
  auto og = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
  auto x_shape = x->get_shape();
  float x_size = std::accumulate(std::begin(x_shape), std::end(x_shape), 1,
                                 std::multiplies<float>());
  auto node_const = ngraph::op::Constant::create(og->get_element_type(),
                                                 ngraph::Shape{1}, {x_size});
  auto node_div = std::make_shared<ngraph::op::Divide>(og, node_const);

  auto result = ElementwiseScalar<ngraph::op::Add>(
      og / node_const,
      ngraph::op::Constant::create(og->get_element_type(), x_shape, {0}));
  paddle::platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map);
}
}  // namespace ngraphs
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/ngraph/ops/mul_op.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef PADDLE_WITH_NGRAPH
 #pragma once

 #include <string>
@@ -131,4 +130,3 @@ static void BuildMulGradNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
-#endif
paddle/fluid/operators/ngraph/ops/scale_op.h
0 → 100644
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {

void BuildScaleNode(
    const std::shared_ptr<paddle::framework::OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
  float scale = op_attrs.Get<float>("scale");
  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
  auto out = ElementwiseScalar<ngraph::op::Multiply>(scale, x);
  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
}
}  // namespace ngraphs
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/ngraph/ops/top_k_op.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef PADDLE_WITH_NGRAPH
 #pragma once

 #include <string>
@@ -48,4 +47,3 @@ void BuildTopKNode(
 }  // namespace ngraphs
 }  // namespace operators
 }  // namespace paddle
-#endif
paddle/fluid/operators/softmax_with_cross_entropy_op.cu
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
...
@@ -58,12 +55,24 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
 }  // namespace

-static __device__ __forceinline__ float real_exp(float x) { return expf(x); }
-static __device__ __forceinline__ double real_exp(double x) { return exp(x); }
-static __device__ __forceinline__ float real_log(float x) {
+static __device__ __forceinline__ platform::float16 exp_on_device(
+    platform::float16 x) {
+  return ::Eigen::numext::exp(x);
+}
+static __device__ __forceinline__ float exp_on_device(float x) {
+  return expf(x);
+}
+static __device__ __forceinline__ double exp_on_device(double x) {
+  return exp(x);
+}
+static __device__ __forceinline__ platform::float16 log_on_device(
+    platform::float16 x) {
+  return math::TolerableValue<platform::float16>()(::Eigen::numext::log(x));
+}
+static __device__ __forceinline__ float log_on_device(float x) {
   return math::TolerableValue<float>()(logf(x));
 }
-static __device__ __forceinline__ double real_log(double x) {
+static __device__ __forceinline__ double log_on_device(double x) {
   return math::TolerableValue<double>()(log(x));
 }
@@ -72,25 +81,20 @@ static __device__ __forceinline__ double real_log(double x) {
 /*
   Supposing the x is `logits` and y is `labels`, the equations are as
 followings:
   cross\_entropy_i = \sum_{j}[- y_i_j * log({e^{x_i_j}/\sum_{j}e^{x_i_j}})]
         = \sum_{j}[- y_i_j * log({e^{x_i_j - max_i}/\sum_{j}e^{x_i_j-max_i}})]
         = \sum_{j}[-y_i_j * (x_i_j - max_i - log\sum_{j}e^{x_i_j - max_i})]
         = \sum_{j}[-y_i_j * (x_i_j - max_i - logDiffMaxSum_i)]
         = \sum_{j}(-y_i_j * tmp_i_j)
   softmax_i_j = e^{tmp_i_j}
   where:
     max_i = \max_{j}{x_i_j}
     logDiffMaxSum_i = log\sum_{j}e^{x_i_j - max_i}
     tmp_i_j = x_i_j - max_i - logDiffMaxSum_i
   Therefore, the calculation can be separated into 3 steps:
   Step 1: row-wise operation to calculate max_i
   Step 2: row-wise operation to calculate logDiffMaxSum_i
   Step 3: calculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i
   To save memory, we can share memory among max_i, logDiffMaxSum_i and
 cross\_entropy_i.
   In this way, the 3 steps should be changed to:
...
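A host-side restatement of the three-step scheme in the comment above (max, log-sum-exp, then softmax and loss for one row); a sketch for intuition, not the CUDA code itself.

#include <algorithm>
#include <cmath>
#include <vector>

float stable_softmax_xent(const std::vector<float>& x,
                          const std::vector<float>& y,
                          std::vector<float>* softmax) {
  float mx = *std::max_element(x.begin(), x.end());  // Step 1: max_i
  float sum = 0.f;
  for (float v : x) sum += std::exp(v - mx);
  float log_diff_max_sum = std::log(sum);            // Step 2
  float loss = 0.f;
  softmax->resize(x.size());
  for (size_t j = 0; j < x.size(); ++j) {            // Step 3
    float tmp = x[j] - mx - log_diff_max_sum;
    (*softmax)[j] = std::exp(tmp);
    loss -= y[j] * tmp;
  }
  return loss;
}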
@@ -134,7 +138,8 @@ static __global__ void RowReductionForMax(const T* logits_data, T* max_data,
   cur_max = BlockReduce<T, BlockDim>(temp_storage).Reduce(cur_max, cub::Max());

   if (threadIdx.x == 0) {
-    max_data[blockIdx.x] = cur_max < -64 ? -64 : cur_max;
+    max_data[blockIdx.x] =
+        cur_max < static_cast<T>(-64) ? static_cast<T>(-64) : cur_max;
   }
 }
@@ -151,17 +156,17 @@ static __global__ void RowReductionForDiffMaxSum(const T* logits_data,
   auto block_max = max_data[blockIdx.x];

   softmax[beg_idx] = logits_data[beg_idx] - block_max;
-  T diff_max_sum = real_exp(softmax[beg_idx]);
+  T diff_max_sum = exp_on_device(softmax[beg_idx]);
   auto idx = beg_idx + BlockDim;
   while (idx < end_idx) {
     softmax[idx] = logits_data[idx] - block_max;
-    diff_max_sum += real_exp(softmax[idx]);
+    diff_max_sum += exp_on_device(softmax[idx]);
     idx += BlockDim;
   }

   diff_max_sum =
       BlockReduce<T, BlockDim>(temp_storage).Reduce(diff_max_sum, cub::Sum());
-  if (threadIdx.x == 0) max_data[blockIdx.x] = real_log(diff_max_sum);
+  if (threadIdx.x == 0) max_data[blockIdx.x] = log_on_device(diff_max_sum);

   if (!CalculateLogSoftmax) return;
   __syncthreads();
@@ -188,12 +193,12 @@ static __global__ void RowReductionForSoftmaxAndCrossEntropy(
   // log_diff_max_sum shares memory with loss
   auto block_log_diff_max_sum = loss_data[blockIdx.x];
   auto tmp = softmax[beg_idx] - block_log_diff_max_sum;
-  softmax[beg_idx] = real_exp(tmp);
+  softmax[beg_idx] = exp_on_device(tmp);
   auto loss = -labels_data[beg_idx] * tmp;
   beg_idx += BlockDim;
   while (beg_idx < end_idx) {
     tmp = softmax[beg_idx] - block_log_diff_max_sum;
-    softmax[beg_idx] = real_exp(tmp);
+    softmax[beg_idx] = exp_on_device(tmp);
     loss -= (labels_data[beg_idx] * tmp);
     beg_idx += BlockDim;
   }
@@ -218,10 +223,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor {
     auto row_idx = idx / feature_size_;
     auto col_idx = idx % feature_size_;
     if (col_idx != labels_[row_idx]) {
-      log_softmax_[idx] = real_exp(log_softmax_[idx]);
+      log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
     } else {
       auto softmax = log_softmax_[idx];
-      log_softmax_[idx] = real_exp(softmax);
+      log_softmax_[idx] = exp_on_device(softmax);
       loss_[row_idx] = -softmax;
     }
   }
@@ -253,10 +258,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx {
     auto row_idx = idx / feature_size_;
     auto col_idx = idx % feature_size_;
     if (col_idx != labels_[row_idx] || col_idx == ignore_idx_) {
-      log_softmax_[idx] = real_exp(log_softmax_[idx]);
+      log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
     } else {
       auto softmax = log_softmax_[idx];
-      log_softmax_[idx] = real_exp(softmax);
+      log_softmax_[idx] = exp_on_device(softmax);
       loss_[row_idx] = -softmax;
     }
   }
@@ -464,9 +469,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy,
-                        ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
-                        ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
-REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad,
-                        ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
-                        ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
+    ops::SoftmaxWithCrossEntropyCUDAKernel<paddle::platform::float16>,
+    ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    softmax_with_cross_entropy_grad,
+    ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
+    ops::SoftmaxWithCrossEntropyGradCUDAKernel<paddle::platform::float16>,
+    ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
paddle/fluid/operators/sum_op.cc
@@ -41,7 +41,9 @@ class SumOp : public framework::OperatorWithKernel {
       return;  // skip runtime infershape when is tensor array;
     }

+    auto x_var_types = ctx->GetInputsVarType("X");
     auto x_dims = ctx->GetInputsDim("X");
     size_t N = x_dims.size();
     PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
     if (N == 1) {
@@ -49,7 +51,13 @@ class SumOp : public framework::OperatorWithKernel {
     }

     framework::DDim in_dim({0});
-    for (auto& x_dim : x_dims) {
+    for (size_t i = 0; i < x_dims.size(); ++i) {
+      auto& x_dim = x_dims[i];
+      // x_dim.size() == 1 means the real dim of selected rows is [0]
+      if (x_var_types[i] == framework::proto::VarType::SELECTED_ROWS &&
+          x_dim.size() == 1) {
+        continue;
+      }
       if (framework::product(x_dim) == 0) {
         continue;
       }
...
paddle/fluid/platform/cuda_helper_test.cu
@@ -15,6 +15,9 @@
 #include <gtest/gtest.h>
 #include <algorithm>
 #include <iostream>
+#ifdef _WIN32
+#include <numeric>
+#endif
 #include <random>

 #define PADDLE_CUDA_FP16
...
paddle/fluid/platform/device_context.cc
@@ -92,26 +92,24 @@ platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
     const platform::Place& place, const cudaStream_t& stream) {
   PADDLE_ENFORCE(platform::is_gpu_place(place));
   auto place_stream = std::make_pair(place, stream);
-  {
-    std::unique_lock<std::mutex> lock(mtx_);
-    if (!device_allocator_.count(place_stream)) {
-      device_allocator_[place_stream].reset(new TemporaryAllocator(place));
-      device_allocator_[place_stream]->SetCallback([stream]() {
-        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
-        PADDLE_ENFORCE(cudaGetLastError());
-      });
-    }
+  std::unique_lock<std::mutex> lock(mtx_);
+  auto it = device_allocator_.find(place_stream);
+  if (it == device_allocator_.end()) {
+    auto tmp_allocator = new TemporaryAllocator(place);
+    tmp_allocator->SetCallback([stream]() {
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+      PADDLE_ENFORCE(cudaGetLastError());
+    });
+    device_allocator_[place_stream].reset(tmp_allocator);
+    return *tmp_allocator;
+  } else {
+    return *it->second;
   }
-  return *device_allocator_.at(place_stream);
 }

 template <>
 platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
     const platform::CUDADeviceContext& dev_ctx) {
-  auto place_stream = std::make_pair(dev_ctx.GetPlace(), dev_ctx.stream());
-  if (device_allocator_.count(place_stream)) {
-    return *device_allocator_.at(place_stream);
-  }
   return Get(dev_ctx.GetPlace(), dev_ctx.stream());
 }
 #endif
@@ -292,7 +290,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
   if (dynload::HasCUDNN()) {
     auto local_cudnn_version = cudnn_dso_ver / 100;
     auto compile_cudnn_version = CUDNN_VERSION / 100;
-    if (local_cuda_version < compile_cuda_version) {
+    if (local_cudnn_version < compile_cudnn_version) {
       LOG_FIRST_N(WARNING, 1)
           << "WARNING: device: " << place_.device
           << ". The installed Paddle is compiled with CUDNN "
@@ -325,7 +323,7 @@ Place CUDADeviceContext::GetPlace() const { return place_; }
 void CUDADeviceContext::Wait() const {
   auto& allocator =
       DeviceTemporaryAllocator::Instance().Get<CUDADeviceContext>(*this);
-  allocator.Release([=]() {
+  allocator.Release([this]() {
     PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
     PADDLE_ENFORCE(cudaGetLastError());
   });
...
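The refactor above is the common find-or-create-under-lock shape, which also avoids the double map lookup of count() followed by at(). A generic sketch:

#include <map>
#include <memory>
#include <mutex>

template <typename K, typename V>
V& FindOrCreate(std::map<K, std::unique_ptr<V>>* cache, std::mutex* mtx,
                const K& key) {
  std::lock_guard<std::mutex> lock(*mtx);
  auto it = cache->find(key);  // single lookup instead of count() + at()
  if (it == cache->end()) {
    it = cache->emplace(key, std::unique_ptr<V>(new V())).first;
  }
  return *it->second;
}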
paddle/fluid/platform/float16.h
@@ -59,7 +59,7 @@ limitations under the License. */
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
-#define PADDLE_ALIGN(x) /*do nothing*/
+#define PADDLE_ALIGN(x) __declspec(align(x))
 #endif

 namespace paddle {
...
paddle/fluid/platform/float16_test.cu
@@ -271,11 +271,13 @@ TEST(float16, isinf) {
   float16 b = float16(INFINITY);
   // underflow to 0
   float16 native_a(5e-40f);
-  // overflow to inf
-  float16 native_b(5e40f);
   EXPECT_EQ(std::isinf(a), true);
   EXPECT_EQ(std::isinf(b), true);
+#ifndef _WIN32
+  // overflow to inf
+  float16 native_b(5e40f);
   EXPECT_EQ(std::isinf(native_b), true);
+#endif
   EXPECT_EQ(native_a, float16(0));
 }
...
paddle/fluid/platform/mkldnn_reuse.h
@@ -210,20 +210,25 @@ class MKLDNNHandler {
     dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast<T>(output_data)));
   }

-  static void AppendKey(
-      std::string* key, const mkldnn::memory::dims& input_dims,
-      const mkldnn::memory::dims& weights_dims, const std::vector<int>& strides,
-      const std::vector<int>& paddings, const std::vector<int>& dilations,
-      const int& groups, const mkldnn::memory::data_type& type,
-      const mkldnn::memory::format& format, const std::string& suffix) {
+  static void AppendKey(
+      std::string* key, const mkldnn::memory::dims& input_dims,
+      const mkldnn::memory::dims& weights_dims,
+      const std::vector<int>& strides,
+      const std::vector<int>& paddings,
+      const std::vector<int>& dilations, const int& groups,
+      const mkldnn::memory::data_type& srcdt,
+      const mkldnn::memory::format& format, const bool& relu,
+      const bool& residual, const std::string& suffix) {
     AppendKeyDims(key, input_dims);
     AppendKeyDims(key, weights_dims);
     AppendKeyVec(key, strides);
     AppendKeyVec(key, paddings);
     AppendKeyVec(key, dilations);
     AppendKey(key, std::to_string(groups));
-    AppendKey(key, std::to_string(type));
+    AppendKey(key, std::to_string(srcdt));
     AppendKey(key, std::to_string(format));
+    AppendKey(key, std::to_string(relu));
+    AppendKey(key, std::to_string(residual));
     AppendKey(key, suffix);
   }
@@ -662,15 +667,35 @@ static std::shared_ptr<mkldnn::memory> SetDstMemory(
 }

 template <typename T>
-static std::shared_ptr<mkldnn::memory> SetDstMemoryHandler(
-    const framework::ExecutionContext& ctx, framework::Tensor* output,
-    const std::shared_ptr<ConvMKLDNNHandler>& handler) {
+static std::shared_ptr<mkldnn::memory> SetDstMemory(
+    const framework::ExecutionContext& ctx, framework::Tensor* output,
+    const framework::Tensor* residual_param,
+    const mkldnn::memory::desc& user_residual_md,
+    const std::shared_ptr<ConvMKLDNNHandler>& handler,
+    std::vector<mkldnn::primitive>* pipeline) {
+  const T* residual_param_data = residual_param->data<T>();
+  PADDLE_ENFORCE(residual_param_data != nullptr,
+                 "Provide data if you want MKLDNN conv+elementwise_add fusion");
+  std::shared_ptr<mkldnn::memory> user_residual_memory_p =
+      handler->AcquireResidualDataMemory(user_residual_md,
+                                         to_void_cast<T>(residual_param_data));
+  T* output_data = output->mutable_data<T>(ctx.GetPlace());
+  std::shared_ptr<mkldnn::memory> dst_memory_p =
+      handler->AcquireDstMemoryFromResidualDataMemory(
+          user_residual_memory_p, to_void_cast<T>(output_data), *pipeline);
+  return dst_memory_p;
+}
+
+template <typename T>
+static void SetDstMemoryHandler(
+    const framework::ExecutionContext& ctx, framework::Tensor* output,
+    const std::shared_ptr<ConvMKLDNNHandler>& handler,
+    std::shared_ptr<mkldnn::memory>* dst_memory_p) {
   T* output_data = output->mutable_data<T>(
       ctx.GetPlace(), ::paddle::memory::Allocator::kDefault,
       handler->GetDstMemorySize());
-  std::shared_ptr<mkldnn::memory> dst_memory_p;
-  dst_memory_p->set_data_handle(to_void_cast<T>(output_data));
-  return dst_memory_p;
+  (*dst_memory_p)->set_data_handle(to_void_cast<T>(output_data));
 }

 }  // namespace platform
 }  // namespace paddle
paddle/fluid/pybind/CMakeLists.txt
@@ -3,7 +3,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune fe
 if(WITH_PYTHON)
   list(APPEND PYBIND_DEPS py_func_op)
 endif()
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc)
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
@@ -21,9 +21,8 @@ if(WITH_PYTHON)
     endif(NOT APPLE AND NOT ANDROID AND NOT WIN32)
   endif(WITH_AMD_GPU)
-  target_link_libraries(paddle_pybind shlwapi)
+  if(WIN32)
+    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+    target_link_libraries(paddle_pybind ${os_dependency_modules})
+  endif(WIN32)
   cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python)
 endif(WITH_PYTHON)
paddle/fluid/pybind/ir.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/pybind/ir.h"
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "pybind11/stl.h"
namespace py = pybind11;
using paddle::framework::ir::Graph;
using paddle::framework::ir::Node;
using paddle::framework::OpDesc;
using paddle::framework::ProgramDesc;
using paddle::framework::VarDesc;
using pybind11::return_value_policy;

namespace paddle {
namespace pybind {
void BindGraph(py::module *m) {
  py::class_<Graph, std::shared_ptr<Graph>>(
      *m, "Graph",
      "The graph is a Directed Acyclic Single Static Assignment Graph, see "
      "`paddle::ir::Graph` for details.")
      .def(py::init<const ProgramDesc &>())
      .def("has", &Graph::Has)
      .def("get_int", &Graph::Get<int>)
      .def("get_float", &Graph::Get<float>)
      .def("get_double", &Graph::Get<double>)
      .def("get_string", &Graph::Get<std::string>)
      .def("set", [](Graph &self, const std::string &attr_name,
                     int attr) { return self.Set(attr_name, new int(attr)); })
      .def("set",
           [](Graph &self, const std::string &attr_name,
              const std::string &attr) {
             return self.Set(attr_name, new std::string(attr));
           })
      .def("set",
           [](Graph &self, const std::string &attr_name, float attr) {
             return self.Set(attr_name, new float(attr));
           })
      .def("set",
           [](Graph &self, const std::string &attr_name, double attr) {
             return self.Set(attr_name, new double(attr));
           })
      .def("erase", &Graph::Erase)
      .def("nodes", &Graph::Nodes, return_value_policy::reference)
      .def("create_var_node",
           [](Graph &self, VarDesc &var_desc) {
             return self.CreateVarNode(&var_desc);
           },
           return_value_policy::reference)
      .def("create_op_node",
           [](Graph &self, OpDesc &op_desc) {
             return self.CreateOpNode(&op_desc);
           },
           return_value_policy::reference)
      .def("create_control_dep_var", &Graph::CreateControlDepVar,
           return_value_policy::reference)
      .def("create_empty_node", &Graph::CreateEmptyNode,
           return_value_policy::reference)
      .def("release_nodes", &Graph::ReleaseNodes)
      .def("remove_node",
           [](Graph &self, Node &node) { return self.RemoveNode(&node); })
      .def("retrieve_node", &Graph::RetrieveNode,
           return_value_policy::reference)
      .def("resolve_hazard", &Graph::ResolveHazard);
}

void BindNode(py::module *m) {
  py::class_<Node> node(*m, "Node");
  node.def("name", &Node::Name)
      .def("node_type", &Node::NodeType)
      .def("var", &Node::Var)
      .def("op", &Node::Op)
      .def("id", &Node::id)
      .def("is_op", &Node::IsOp)
      .def("is_var", &Node::IsVar)
      .def("is_ctrl_var", &Node::IsCtrlVar)
      .def_readwrite("inputs", &Node::inputs)
      .def_readwrite("outputs", &Node::outputs);

  py::enum_<Node::Type>(node, "Type")
      .value("Operation", Node::Type::kOperation)
      .value("Variable", Node::Type::kVariable)
      .export_values();
}
}  // namespace pybind
}  // namespace paddle
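A hedged sketch of driving these new bindings from Python, assuming the module is built with BindGraph/BindNode registered on paddle.fluid.core (pybind.cc below wires that up); the toy network is illustrative:

import paddle.fluid as fluid
from paddle.fluid import core

prog = fluid.Program()
with fluid.program_guard(prog):
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.fc(input=x, size=1)

graph = core.Graph(prog.desc)  # py::init<const ProgramDesc &> above
for node in graph.nodes():
    kind = 'op' if node.is_op() else 'var'
    print(kind, node.name(), node.id())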
paddle/fluid/pybind/ir.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/pybind11.h>
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace pybind {
void BindGraph(pybind11::module *m);
void BindNode(pybind11::module *m);
}  // namespace pybind
}  // namespace paddle
paddle/fluid/pybind/pybind.cc
...
@@ -49,6 +49,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/pybind.h"  // NOLINT
#include "paddle/fluid/pybind/recordio.h"
...
@@ -775,7 +776,12 @@ All parameter, weight, gradient are variables in Paddle.
          })
      .def("set_int", [](ir::Pass &self, const std::string &name,
                         int val) { self.Set<const int>(name, new int(val)); })
      .def("type", &ir::Pass::Type)
      .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
        std::unique_ptr<ir::Graph> origin_graph(graph.get());
        auto optim_graph = self.Apply(std::move(origin_graph));
        graph.reset(optim_graph.release());
      });

  py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
      m, "PassBuilder");
...
@@ -1019,8 +1025,7 @@ All parameter, weight, gradient are variables in Paddle.
  pe.def(py::init<const std::vector<platform::Place> &,
                  const std::unordered_set<std::string> &, const ProgramDesc &,
                  const std::string &, Scope *, std::vector<Scope *> &,
                  const ExecutionStrategy &, const BuildStrategy &>())
      // NOTE: even we return a vec<Scope*>* to Python use reference policy.
      // We still cannot get local_scope from this vector, since the element
      // of vec<Scope*> will be freed by Python GC. We can only return Scope*
...
@@ -1043,6 +1048,9 @@ All parameter, weight, gradient are variables in Paddle.
  BindRecordIOWriter(&m);
  BindAsyncExecutor(&m);
  BindGraph(&m);
  BindNode(&m);
}
}  // namespace pybind
}  // namespace paddle
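With Pass.apply now bound, a pass can be run over a shared Graph from Python. A sketch under the assumption that a pass factory such as core.get_pass and a set_str binding (defined just above the visible hunk) are available; both are assumptions here, since this diff only shows the set_int/type/apply bindings:

from paddle.fluid import core

graph = core.Graph(prog.desc)          # see BindGraph in ir.cc above
viz = core.get_pass('graph_viz_pass')  # hypothetical factory
viz.set_str('graph_viz_path', '/tmp/graph.dot')  # attribute name illustrative
viz.apply(graph)  # resets 'graph' in place to the optimized graph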
python/paddle/dataset/mnist.py
...
@@ -21,10 +21,9 @@ parse training set and test set into paddle reader creators.
from __future__ import print_function

import paddle.dataset.common
import gzip
import numpy
import struct
from six.moves import range

__all__ = ['train', 'test', 'convert']
...
@@ -41,51 +40,47 @@ TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
def reader_creator(image_filename, label_filename, buffer_size):
    def reader():
        with gzip.GzipFile(image_filename, 'rb') as image_file:
            img_buf = image_file.read()
        with gzip.GzipFile(label_filename, 'rb') as label_file:
            lab_buf = label_file.read()

        step_label = 0

        offset_img = 0
        # read from Big-endian
        # get file info from magic byte
        # image file : 16B
        magic_byte_img = '>IIII'
        magic_img, image_num, rows, cols = struct.unpack_from(
            magic_byte_img, img_buf, offset_img)
        offset_img += struct.calcsize(magic_byte_img)

        offset_lab = 0
        # label file : 8B
        magic_byte_lab = '>II'
        magic_lab, label_num = struct.unpack_from(magic_byte_lab,
                                                  lab_buf, offset_lab)
        offset_lab += struct.calcsize(magic_byte_lab)

        while True:
            if step_label >= label_num:
                break
            fmt_label = '>' + str(buffer_size) + 'B'
            labels = struct.unpack_from(fmt_label, lab_buf, offset_lab)
            offset_lab += struct.calcsize(fmt_label)
            step_label += buffer_size

            fmt_images = '>' + str(buffer_size * rows * cols) + 'B'
            images_temp = struct.unpack_from(fmt_images, img_buf, offset_img)
            images = numpy.reshape(images_temp, (
                buffer_size, rows * cols)).astype('float32')
            offset_img += struct.calcsize(fmt_images)

            images = images / 255.0 * 2.0 - 1.0
            for i in range(buffer_size):
                yield images[i, :], int(labels[i])

    return reader
...
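The rewritten reader depends on the IDX layout: big-endian, a 4-field uint32 header for images (magic 2051, count, rows, cols) and a 2-field header for labels (magic 2049, count). A standalone sanity check of that header parsing, assuming a local copy of the gzipped train images:

import gzip
import struct

with gzip.GzipFile('train-images-idx3-ubyte.gz', 'rb') as f:
    header = f.read(16)  # image file header: 16 bytes
magic, count, rows, cols = struct.unpack_from('>IIII', header, 0)
assert magic == 2051 and rows == 28 and cols == 28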
python/paddle/fluid/__init__.py
...
@@ -156,7 +156,7 @@ def __bootstrap__():
        'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
        'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
        'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
        'sync_nccl_allreduce'
    ]
    core.init_gflags([sys.argv[0]] +
...
python/paddle/fluid/compiler.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import multiprocessing
import os
import six
import sys
from .. import compat as cpt

from . import core

ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
BuildStrategy = core.ParallelExecutor.BuildStrategy


def _place_obj(place):
    p = core.Place()
    p.set_place(place)
    return p


class CompiledProgram(object):
    """
    Compiles a Program for execution.

    1. Users first create the program with layers.
    2. Optionally, users use CompiledProgram to optimize the program before run.
    3. The original program or CompiledProgram is run by executor.

    The CompiledProgram is used to transform a program for various
    optimizations, for example.
      * Pre-compute some logic once so that each run is faster.
      * Transform the program so that it can run in multiple devices.
      * TODO: transform the program for optimized inference or distributed
        training.

    Example:
        .. code-block:: python
          place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
          exe = fluid.Executor(place)
          exe.run(startup)
          compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
              loss_name=loss.name)
          for i in range(5):
              test_loss, = exe.run(compiled_prog,
                                   feed=feed_dict,
                                   fetch_list=[loss.name])

    Args:
        program: Program instance that contains the model logic.
    """

    def __init__(self, program):
        self._program = program
        self._scope = None
        self._place = None
        self._executor = None
        self._compiled = False
        self._is_data_parallel = False

    def with_data_parallel(self,
                           loss_name=None,
                           build_strategy=None,
                           exec_strategy=None,
                           share_vars_from=None):
        """Configs the program to run in data parallel way.

        Args:
            loss_name (str): The loss name must set in training. Default None.
            build_strategy(BuildStrategy): build_strategy is used to
                build the graph so it can run on multiple devices/cores with
                optimized topology.
                For more information, please refer to fluid.BuildStrategy.
                Default None.
            exec_strategy(ExecutionStrategy): exec_strategy is used to
                to select the a way to execute the graph, for example how many
                threads are used, how many iterations to clean up the temp
                variables. For more information, please refer
                to fluid.ExecutionStrategy. Default None.
            share_vars_from(CompiledProgram): If provide, this CompiledProgram
                will share variables from `share_vars_from`. `share_vars_from`
                must be run by the executor before this CompiledProgram so that
                vars are ready.
        Returns:
            self
        """
        assert not self._is_data_parallel, "Already compiled with parallel."
        self._is_data_parallel = True
        self._build_strategy = build_strategy
        self._exec_strategy = exec_strategy
        self._loss_name = loss_name
        self._share_vars_from = share_vars_from
        if self._exec_strategy is None:
            self._exec_strategy = ExecutionStrategy()
        if self._build_strategy is None:
            self._build_strategy = BuildStrategy()
        return self

    def _with_distributed(self):
        raise NotImplementedError()

    def _with_inference_optimize(self):
        raise NotImplementedError()

    def _compile_data_parallel(self):
        if self._share_vars_from:
            if self._scope:
                sys.stderr.write("share_vars_from is set, scope is ignored.\n")
            if not self._share_vars_from._is_data_parallel:
                raise ValueError("share_vars_from is not data parallel. Cannot "
                                 "share vars from it.")
            if self._share_vars_from._executor is None:
                raise ValueError(
                    "share_vars_from is not compiled and run, so there is no "
                    "var to share.")
            self._local_scopes = self._share_vars_from._executor.local_scopes()
        else:
            self._local_scopes = []

        self._exec_strategy.use_cuda = isinstance(self._place, core.CUDAPlace)
        if self._exec_strategy.use_cuda:
            gpus_env = os.getenv("FLAGS_selected_gpus")
            if gpus_env:
                gpus = [int(s) for s in gpus_env.split(",")]
            else:
                gpus = [
                    i for i in six.moves.range(core.get_cuda_device_count())
                ]
            self._places = [core.CUDAPlace(i) for i in gpus]
        else:
            cpu_num = int(
                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
            self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
        assert self._places, "no place for execution"

        if self._exec_strategy.num_threads == 0:
            if self._exec_strategy.use_cuda:
                # Experiments on se-resnext shows that too many threads hurt
                # performance. Worth tunning for other models in the future.
                self._exec_strategy.num_threads = len(self._places) * 4
            else:
                cpu_num = int(
                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
                self._exec_strategy.num_threads = cpu_num * 2

        trainers_endpoints = self._program._trainers_endpoints
        if self._build_strategy.num_trainers > 1 and trainers_endpoints:
            assert self._build_strategy.num_trainers == len(
                trainers_endpoints), "num_trainers == len(end_points)"
            self._build_strategy.trainers_endpoints = trainers_endpoints

        self._persistable_vars = set([
            cpt.to_text(v.name)
            for v in [
                var for var in self._program.list_vars()
                if var.persistable and var.type != core.VarDesc.VarType.RAW
            ]
        ])

        places = list(map(_place_obj, self._places))
        return core.ParallelExecutor(
            places, self._persistable_vars, self._program.desc,
            cpt.to_text(self._loss_name)
            if self._loss_name else six.u(''), self._scope, self._local_scopes,
            self._exec_strategy, self._build_strategy)

    def _compile(self, scope, place):
        """Compile the program based on the configs.

        Args:
            scope: The variables (resources) that are associated with
               this compiled program.
            place: The location that the compiled program will be run on.

        Returns:
            self
        """
        if self._compiled:
            if scope and self._scope != scope:
                raise ValueError("Cannot compile with different scope")
            if place and self._place != place:
                raise ValueError("Cannot compile with different place")
            return self
        self._compiled = True

        self._scope = scope
        self._place = place
        if self._is_data_parallel:
            self._executor = self._compile_data_parallel()
        else:
            p = _place_obj(self._place)
            self._executor = core.Executor(p)
        return self
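A hedged sketch of the share_vars_from contract documented above: compile and run the training program first, then let a test-mode program reuse its variables. train_main, test_main, train_feed/test_feed, loss, and acc are illustrative names, not part of this diff:

train_prog = compiler.CompiledProgram(train_main).with_data_parallel(
    loss_name=loss.name)
exe.run(train_prog, feed=train_feed, fetch_list=[loss.name])  # vars now exist

test_prog = compiler.CompiledProgram(test_main).with_data_parallel(
    share_vars_from=train_prog)  # checked in _compile_data_parallel above
exe.run(test_prog, feed=test_feed, fetch_list=[acc.name])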
python/paddle/fluid/data_feeder.py
...
@@ -71,10 +71,25 @@ class DataToLoDTensorConverter(object):
            for each_data in data:
                self._feed_impl_(each_data, lod[1:], lod_level - 1)

    def _check_shape(self, shape):
        for s1, s2 in zip(self.shape, shape):
            if s1 != s2 and s1 >= 0 and s2 >= 0:
                raise ValueError(
                    "Shape not match. What is defined in data layer is {}, but receive {}"
                    .format(self.shape, shape))

    def done(self):
        arr = numpy.array(self.data, dtype=self.dtype)
        if self.shape:
            if len(arr.shape) != len(self.shape):
                try:
                    arr = arr.reshape(self.shape)
                except ValueError:
                    raise ValueError(
                        "Reshape error. What is defined in data layer is {}, but receive {}"
                        .format(self.shape, arr.shape))
            else:
                self._check_shape(arr.shape)
        t = core.LoDTensor()
        t.set(arr, self.place)
        if self.lod_level > 0:
...
@@ -152,17 +167,8 @@ class DataFeeder(object):
                raise TypeError("Feed list should contain a list of variable")
            self.feed_dtypes.append(each_var.dtype)
            self.feed_names.append(each_var.name)
            self.feed_lod_level.append(each_var.lod_level)
            self.feed_shapes.append(each_var.shape)

        self.place = place
...
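The effect of the new _check_shape path, mirroring the updated test_data_feeder.py later in this diff: a 783-wide row fed into a data layer declared as shape [784] now raises instead of being silently accepted:

import paddle.fluid as fluid

img = fluid.layers.data(name='image', shape=[784])
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
try:
    feeder.feed([([0] * 783, [9])])
except ValueError as e:
    print(e)  # "Shape not match. What is defined in data layer is ..."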
python/paddle/fluid/executor.py
...
@@ -14,11 +14,15 @@
from __future__ import print_function

import os
import multiprocessing
import numpy as np
import contextlib
import six
from .framework import Program, default_main_program, Variable
from . import core
from . import compiler
from .. import compat as cpt

__all__ = ['Executor', 'global_scope', 'scope_guard']
...
@@ -204,20 +208,20 @@ def _fetch_var(name, scope=None, return_numpy=True):
    return tensor


def _to_name_str(var):
    if isinstance(var, Variable):
        return var.desc.name()
    elif isinstance(var, str):
        return var
    elif isinstance(var, six.string_types):
        return str(var)
    else:
        raise TypeError(str(var) + " should be Variable or str")


def _get_program_cache_key(feed, fetch_list):
    feed_var_names = list(feed.keys())
    fetch_var_names = list(map(_to_name_str, fetch_list))

    return str(feed_var_names + fetch_var_names)
...
@@ -266,6 +270,29 @@ class Executor(object):
    But the global scope variables will be persistent through different runs.
    All of ops in program will be running in sequence.

    Example:
        .. code-block:: python

          # First create the Executor.
          place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
          exe = fluid.Executor(place)

          # Run the startup program once and only once.
          # Not need to optimize/compile the startup program.
          exe.run(fluid.default_startup_program())

          # Run the main program directly without compile.
          loss, = exe.run(fluid.default_main_program(),
                          feed=feed_dict,
                          fetch_list=[loss.name])
          # Or, compiled the program and run. See `CompiledProgram` for more detail.
          compiled_prog = compiler.CompiledProgram(
              fluid.default_main_program()).with_data_parallel(
                  loss_name=loss.name)
          loss, = exe.run(compiled_prog,
                          feed=feed_dict,
                          fetch_list=[loss.name])

    Args:
        place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
...
@@ -275,11 +302,8 @@ class Executor(object):
    def __init__(self, place):
        self.place = place
        self.program_caches = dict()
        self.executor = None
        self._closed = False

    def _get_program_cache(self, program_cache_key):
...
@@ -361,6 +385,7 @@ class Executor(object):
        You can no long use this executor after calling this method.
        For the distributed training, this method would free the resource on PServers related to
        the current Trainer.

        TODO(panyx0718): Why ParallelExecutor doesn't have close?

        Example:
            >>> cpu = core.CPUPlace()
...
@@ -368,10 +393,55 @@ class Executor(object):
            >>> ...
            >>> exe.close()
        """
        if not self._closed and self.executor:
            self.executor.close()
            self._closed = True

    def _run_parallel(self, scope, feed, fetch_list, fetch_var_name,
                      return_numpy):
        if isinstance(feed, dict):
            feed_tensor_dict = dict()
            for feed_name in feed:
                feed_tensor = feed[feed_name]
                if not isinstance(feed_tensor, core.LoDTensor):
                    feed_tensor = core.LoDTensor()
                    # always set to CPU place, since the tensor need to be splitted
                    # it is fast in CPU
                    feed_tensor.set(feed[feed_name], core.CPUPlace())
                feed_tensor_dict[feed_name] = feed_tensor

            self.executor.feed_and_split_tensor_into_local_scopes(
                feed_tensor_dict)
        elif isinstance(feed, list) or isinstance(feed, tuple):
            if len(feed) != len(self._places):
                raise ValueError(
                    "Feed a list of tensor, the list should be the same size as places")

            res = list()
            for i, each in enumerate(feed):
                if not isinstance(each, dict):
                    raise TypeError(
                        "Each element of feed list should be a dict")
                res_dict = dict()
                for feed_name in each:
                    tensor = each[feed_name]
                    if not isinstance(tensor, core.LoDTensor):
                        tmp = core.LoDTensor()
                        tmp.set(tensor, self._places[i])
                        tensor = tmp
                    res_dict[feed_name] = tensor
                res.append(res_dict)
            self.executor.feed_tensors_into_local_scopes(res)

        fetch_var_names = list(map(_to_name_str, fetch_list))
        self.executor.run(fetch_var_names, fetch_var_name)
        arr = scope.find_var(fetch_var_name).get_lod_tensor_array()

        if return_numpy:
            return as_numpy(arr)
        return [arr[i] for i in range(len(arr))]

    def run(self,
            program=None,
            feed=None,
...
@@ -391,8 +461,9 @@ class Executor(object):
            operators in the program but not only the operators dependent by the fetch_list

        Args:
            program(Program|CompiledProgram): the program that need to run,
                if not provided, then default_main_program will be used.
            feed(dict): feed variable map, e.g. {"image": ImageData, "label": LabelData}
            fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list.
            feed_var_name(str): the name for the input variable of feed Operator.
            fetch_var_name(str): the name for the output variable of fetch Operator.
...
@@ -428,14 +499,59 @@ class Executor(object):
        if self._closed:
            raise RuntimeError("Attempted to use a closed Executor")

        if scope is None:
            scope = global_scope()
        if fetch_list is None:
            fetch_list = []

        compiled = isinstance(program, compiler.CompiledProgram)
        # For backward compatibility, run directly.
        if not compiled:
            if not self.executor:
                p = core.Place()
                p.set_place(self.place)
                self.executor = core.Executor(p)
            return self._run(
                program,
                feed=feed,
                fetch_list=fetch_list,
                feed_var_name=feed_var_name,
                fetch_var_name=fetch_var_name,
                scope=scope,
                return_numpy=return_numpy,
                use_program_cache=use_program_cache)

        program._compile(scope, self.place)
        self.executor = program._executor
        if program._is_data_parallel:
            return self._run_parallel(
                scope=scope,
                feed=feed,
                fetch_list=fetch_list,
                fetch_var_name=fetch_var_name,
                return_numpy=return_numpy)
        else:
            # TODO(panyx0718): Can compile program to optimize executor
            # performance.
            return self._run(
                program._program,
                feed=feed,
                fetch_list=fetch_list,
                feed_var_name=feed_var_name,
                fetch_var_name=fetch_var_name,
                scope=scope,
                return_numpy=return_numpy,
                use_program_cache=use_program_cache)

    def _run(self, program, feed, fetch_list, feed_var_name, fetch_var_name,
             scope, return_numpy, use_program_cache):

        if feed is None:
            feed = {}
        if not isinstance(feed, dict):
            raise TypeError(
                "feed requires dict as its Parameter. But you passed in %s" %
                (type(feed)))
        if program is None:
            program = default_main_program()
...
@@ -444,9 +560,6 @@ class Executor(object):
                "Executor requires Program as its Parameter. But you passed in %s"
                % (type(program)))

        cache_key = _get_program_cache_key(feed, fetch_list)
        if use_program_cache:
            cached_program = self._get_program_cache(cache_key)
...
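_run_parallel above accepts, besides a single dict that is split across devices, a list with exactly one feed dict per place. A hedged sketch of that second form; compiled_prog, loss, and the batch arrays are illustrative names, and two devices are assumed:

feed = [{'image': batch0_imgs, 'label': batch0_lbls},   # place 0
        {'image': batch1_imgs, 'label': batch1_lbls}]   # place 1
loss_val, = exe.run(compiled_prog, feed=feed, fetch_list=[loss.name])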
python/paddle/fluid/optimizer.py
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
@@ -195,22 +195,18 @@ class Optimizer(object):
                format(name, param.name))
        return self._accumulators[name][param.name]

    def _create_optimization_pass(self, parameters_and_grads):
        """Add optimization operators to update gradients to variables.

        Args:
          parameters_and_grads(list(tuple(Variable, Variable))):
            a list of (variable, gradient) pair to update.

        Returns:
          return_op_list: a list of operators that will complete one step of
          optimization. This will include parameter update ops, global step
          update ops and any other custom ops required by subclasses to manage
          their internal state.
        """
        # This is a default implementation of create_optimization_pass that
        # can be shared by most optimizers. This implementation assumes that
...
@@ -219,37 +215,33 @@ class Optimizer(object):
        # _create_accumulators method if it needs to create accumulators
        # for parameters and extend _finish_update method to add custom ops.

        # Allways called under program_guard use global block as loss block
        global_block = framework.default_main_program().global_block()
        start = len(global_block.ops)
        self.helper = LayerHelper(self.__class__.__name__)
        self._create_accumulators(global_block,
                                  [p[0] for p in parameters_and_grads])
        self._create_global_learning_rate()

        optimize_ops = []
        for param_and_grad in parameters_and_grads:
            if param_and_grad[1] is None:
                continue
            with param_and_grad[0].block.program._optimized_guard(
                    param_and_grad), name_scope("optimizer"):
                if param_and_grad[0].trainable is True:
                    optimize_op = self._append_optimize_op(global_block,
                                                           param_and_grad)
                    optimize_ops.append(optimize_op)

        # Get custom finish ops for subclasses
        # FIXME: Need to fix this once we figure out how to handle dependencies
        self._finish_update(global_block, parameters_and_grads)

        end = len(global_block.ops)
        return global_block._slice_ops(start, end)

    def _process_distribute_lookuptable(self, param_grads):
        """
        Because distribute lookup table only support SGD optimizer for now, not support
        other optimizer and regularization, so we should find the table parameter out,
...
@@ -259,7 +251,8 @@ class Optimizer(object):
        :param loss: the loss variable.
        :param startup_program: the startup program
        """
        program = framework.default_main_program()
        global_block = framework.default_main_program().global_block()
        table_name = find_distributed_lookup_table(program)
        table_param = None
        table_grad = None
...
@@ -275,38 +268,121 @@ class Optimizer(object):
            new_param_grads.append((p, g))
        sgd_op = None
        if table_param is not None:
            param_and_grad = [table_param, table_grad]
            with table_param.block.program._optimized_guard(param_and_grad), \
                    framework.name_scope("optimizer"):
                self._create_global_learning_rate()
                # create the optimize op
                sgd_op = global_block.append_op(
                    type='sgd',
                    inputs={
                        "Param": table_param,
                        "Grad": table_grad,
                        "LearningRate": self._create_param_lr(param_and_grad)
                    },
                    outputs={"ParamOut": param_and_grad[0]})
        return new_param_grads, (table_param, table_grad), sgd_op

    def backward(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None,
                 callbacks=None):
        """
        First part of `minimize`, do auto-diff to append backward ops for
        the current program.

        Args:
            loss (Variable): loss variable to run optimizations.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            parameter_list (list): list of Variables to update.
            no_grad_set (set|None): set of Variables should be ignored.
            callbacks (list|None): list of callables to run when appending backward
                operator for one parameter.

        Return:
            list: list of (param, grad) pair, grad is the output of backward.

        Examples:
            See examples in `apply_gradients`.
        """
        if callbacks is None:
            callbacks = [error_clip_callback]
        else:
            assert (isinstance(callbacks, list))
            callbacks.append(error_clip_callback)
        return append_backward(loss, parameter_list, no_grad_set, callbacks)

    def apply_gradients(self, params_grads):
        """
        Second part of `minimize`, appending optimization operators for
        given `params_grads` pairs.

        Args:
            params_grads (list): list of (param, grad) pair to do optimization.

        Returns:
            list: A list of operators appended to the current program.

        Examples:
            .. code-block:: python

                loss = network()
                optimizer = fluid.optimizer.SGD(learning_rate=0.1)
                params_grads = optimizer.backward(loss)
                # you may append operations for params_grads here
                # ...
                optimizer.apply_gradients(params_grads)
        """
        params_grads = sorted(params_grads, key=lambda x: x[0].name)
        params_grads, table_param_and_grad, table_optimize_op = \
            self._process_distribute_lookuptable(params_grads)

        params_grads = append_gradient_clip_ops(params_grads)

        # Add regularization if any
        params_grads = append_regularization_ops(params_grads,
                                                 self.regularization)

        optimize_ops = self._create_optimization_pass(params_grads)
        if table_optimize_op is not None:
            optimize_ops.append(table_optimize_op)
            params_grads.append(table_param_and_grad)
        return optimize_ops

    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        """
        Add operations to minimize `loss` by updating `parameter_list`.

        This method combines interface `backward()` and
        `apply_gradients()` into one.

        Args:
            loss (Variable): loss variable to run optimizations.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            parameter_list (list): list of Variables to update.
            no_grad_set (set|None): set of Variables should be ignored.

        Returns:
            tuple: (optimize_ops, params_grads) which are, list of operators appended;
            and list of (param, grad) Variables pair for optimization.
        """
        self._dtype = loss.dtype
        program = loss.block.program
        optimize_ops = []
        if imperative_base.enabled():
            if parameter_list is not None:
                params_grads = parameter_list
            else:
                parameters = program.global_block().all_parameters()
                params_grads = []
                for param in parameters:
...
@@ -317,29 +393,13 @@ class Optimizer(object):
                    stop_gradient=True)
                    grad_var._value = param._ivar.grad_value
                    params_grads.append((param, grad_var))
            with program_guard(program, startup_program):
                optimize_ops = self._create_optimization_pass(params_grads)
        else:
            with program_guard(program, startup_program):
                params_grads = self.backward(loss, startup_program,
                                             parameter_list, no_grad_set)
                optimize_ops = self.apply_gradients(params_grads)

        return optimize_ops, params_grads
...
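The new split that minimize() now documents, restated as a usage sketch: call backward() to obtain (param, grad) pairs, optionally post-process them, then apply_gradients(). network() is an illustrative model-builder, as in the apply_gradients docstring above:

loss = network()
optimizer = fluid.optimizer.SGD(learning_rate=0.1)
params_grads = optimizer.backward(loss)
# inspect or rescale gradients here if needed
optimize_ops = optimizer.apply_gradients(params_grads)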
python/paddle/fluid/parallel_executor.py
...
@@ -181,9 +181,8 @@ class ParallelExecutor(object):
        # step7: init ParallelExecutor
        self.executor = core.ParallelExecutor(
            places, persistable_vars, main.desc,
            cpt.to_text(loss_name)
            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
            build_strategy)

        self.scope = scope
...
@@ -294,7 +293,7 @@ class ParallelExecutor(object):
                res.append(res_dict)
            self.executor.feed_tensors_into_local_scopes(res)

        fetch_var_name = 'fetch'
        self.executor.run(fetch_list, fetch_var_name)
        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
...
python/paddle/fluid/tests/test_data_feeder.py
...
@@ -30,6 +30,12 @@ class TestDataFeeder(unittest.TestCase):
        self.assertEqual(result['image'].recursive_sequence_lengths(), [])
        self.assertEqual(result['label'].recursive_sequence_lengths(), [])

        try:
            result = feeder.feed([([0] * 783, [9]), ([1] * 783, [1])])
            self.assertTrue(False)
        except ValueError:
            self.assertTrue(True)

    def test_lod_level_1_converter(self):
        # lod_level = 1
        # each sentence has a different number of words
...
python/paddle/fluid/tests/unittests/dist_ctr.py
...
@@ -31,6 +31,7 @@ fluid.default_main_program().random_seed = 1
class TestDistCTR2x2(TestDistRunnerBase):
    def get_model(self, batch_size=2):
        dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta()
        """ network definition """
        dnn_data = fluid.layers.data(
...
@@ -97,7 +98,14 @@ class TestDistCTR2x2(TestDistRunnerBase):
        inference_program = paddle.fluid.default_main_program().clone()

        regularization = None
        use_l2_decay = bool(os.getenv('USE_L2_DECAY', 0))
        if use_l2_decay:
            regularization = fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=1e-1)

        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001,
                                            regularization=regularization)
        sgd_optimizer.minimize(avg_cost)

        dataset = dist_ctr_reader.Dataset()
...
python/paddle/fluid/tests/unittests/dist_se_resnext.py
...
@@ -235,7 +235,6 @@ class DistSeResneXt2x2(TestDistRunnerBase):
        bd = [step * e for e in epochs]
        base_lr = 0.1
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]

        optimizer = fluid.optimizer.Momentum(
...
python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp, TestFP16MeanOp


class TestNGRAPHMeanOp(TestMeanOp):
    def setUp(self):
        super(TestNGRAPHMeanOp, self).setUp()


class TestNGRAPHFP16MeanOp(TestFP16MeanOp):
    def setUp(self):
        super(TestNGRAPHFP16MeanOp, self).setUp()


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows, TestScaleFp16Op, TestScaleFp16OpSelectedRows


class TestNGRAPHScaleOp(TestScaleOp):
    def init_dtype_type(self):
        pass


class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows):
    def init_dtype_type(self):
        pass


class TestNGRAPHScaleFp16Op(TestScaleFp16Op):
    def init_dtype_type(self):
        pass


class TestNGRAPHScaleFp16OpSelectedRows(TestScaleFp16OpSelectedRows):
    def init_dtype_type(self):
        pass


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
...
@@ -19,6 +19,7 @@ import os
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import compiler
import time
import numpy as np
import math
...
@@ -44,15 +45,8 @@ class TestParallelExecutorBase(unittest.TestCase):
                             optimizer=fluid.optimizer.Adam,
                             use_fast_executor=False,
                             enable_sequential_execution=False):
        def run_executor(exe, binary, feed, fetch_list):
            res = exe.run(binary, feed=feed, fetch_list=fetch_list)
            return res

        main = fluid.Program()
...
@@ -72,8 +66,8 @@ class TestParallelExecutorBase(unittest.TestCase):
            fluid.memory_optimize(main)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup)
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.allow_op_delay = allow_op_delay
        if use_fast_executor:
...
@@ -86,15 +80,13 @@ class TestParallelExecutorBase(unittest.TestCase):
            build_strategy.enable_sequential_execution = enable_sequential_execution
        if use_cuda and core.is_compiled_with_cuda():
            build_strategy.remove_unnecessary_lock = True

        if use_parallel_executor:
            binary = compiler.CompiledProgram(main).with_data_parallel(
                loss_name=loss.name,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)
        else:
            binary = compiler.CompiledProgram(main)

        if batch_size is not None:
            batch_size *= fluid.core.get_cuda_device_count(
...
@@ -102,13 +94,14 @@ class TestParallelExecutorBase(unittest.TestCase):
                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
        begin = time.time()
        first_loss, = run_executor(
            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])

        for i in range(iter):
            run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[])

        last_loss, = run_executor(
            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
        end = time.time()

        if batch_size is not None:
...
python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py
...
@@ -25,6 +25,15 @@ from test_conv2d_op import conv2d_forward_naive, TestConv2dOp
...
@@ -25,6 +25,15 @@ from test_conv2d_op import conv2d_forward_naive, TestConv2dOp
def
conv2d_forward_refer
(
input
,
filter
,
group
,
conv_param
):
def
conv2d_forward_refer
(
input
,
filter
,
group
,
conv_param
):
out
,
in_n
,
out_h
,
out_w
,
out_c
=
conv2d_forward_naive
(
input
,
filter
,
group
,
out
,
in_n
,
out_h
,
out_w
,
out_c
=
conv2d_forward_naive
(
input
,
filter
,
group
,
conv_param
)
conv_param
)
size
=
[
in_n
,
out_c
,
out_h
,
out_w
]
return
format_reorder
(
out
,
size
)
def
format_reorder
(
out
,
size
):
in_n
=
size
[
0
]
out_h
=
size
[
2
]
out_w
=
size
[
3
]
out_c
=
size
[
1
]
out_tmp
=
np
.
zeros
((
in_n
,
out_h
,
out_w
,
out_c
))
out_tmp
=
np
.
zeros
((
in_n
,
out_h
,
out_w
,
out_c
))
for
n
in
range
(
in_n
):
for
n
in
range
(
in_n
):
for
i
in
range
(
out_h
):
for
i
in
range
(
out_h
):
...
@@ -47,7 +56,9 @@ class TestConv2dInt8Op(TestConv2dOp):
         self.init_group()
         self.init_dilation()
         self.init_test_case()
-        self.init_dtype()
+        self.init_fuse_relu()
+        self.init_fuse_residual()
+        self.init_data_type()
         conv2d_param = {
             'stride': self.stride,
...
@@ -78,7 +89,24 @@ class TestConv2dInt8Op(TestConv2dOp):
                 np.round((input_shift) * self.scale_in).astype(np.int32),
                 filter_int, self.groups,
                 conv2d_param).astype(np.float32) * scale_output_shift
-            output = np.round(output1 - output2).astype(self.dsttype)
+            if self.fuse_residual:
+                input_residual = np.random.randint(
+                    -5, 5, self.input_residual_size).astype(self.srctype)
+                output_tmp = np.round(output1 - output2 + format_reorder(
+                    input_residual, self.input_residual_size).astype(
+                        self.srctype) * (self.scale_out / self.scale_in_eltwise))
+                if self.fuse_relu:
+                    output = np.maximum(output_tmp, 0).astype(self.dsttype)
+                else:
+                    output = output_tmp.astype(self.dsttype)
+            else:
+                if self.fuse_relu:
+                    output = np.maximum(np.round(output1 - output2),
+                                        0).astype(self.dsttype)
+                else:
+                    output = np.round(output1 - output2).astype(self.dsttype)
         else:
             filter_int = np.round(filter *
                                   self.scale_weights[0]).astype(np.int32)
...
@@ -87,13 +115,35 @@ class TestConv2dInt8Op(TestConv2dOp):
             output1 = conv2d_forward_refer(
                 input.astype(np.int32), filter_int, self.groups,
                 conv2d_param).astype(np.float32)
-            output = np.round(output1 * scale_output_shift).astype(self.dsttype)
+            if self.fuse_residual:
+                input_residual = np.random.randint(
+                    0, 10, self.input_residual_size).astype(self.srctype)
+                output_tmp = np.round(output1 * (self.scale_out / (
+                    self.scale_in * self.scale_weights[0])) + format_reorder(
+                        input_residual, self.input_residual_size).astype(
+                            np.int32) * (self.scale_out / self.scale_in_eltwise))
+                output_tmp2 = np.round(output1 * (
+                    self.scale_out / (self.scale_in * self.scale_weights[0])))
+                if self.fuse_relu:
+                    output = np.maximum(output_tmp, 0).astype(self.dsttype)
+                else:
+                    output = output_tmp.astype(self.dsttype)
+            else:
+                if self.fuse_relu:
+                    output = np.maximum(output_tmp2, 0).astype(self.dsttype)
+                else:
+                    output = output_tmp2.astype(self.dsttype)
         self.inputs = {
             'Input':
             OpTest.np_dtype_to_fluid_dtype(input.astype(self.srctype)),
             'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
         }
+        if self.fuse_residual:
+            self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype(
+                input_residual)
         self.attrs = {
             'strides': self.stride,
             'paddings': self.pad,
...
@@ -106,6 +156,9 @@ class TestConv2dInt8Op(TestConv2dOp):
             'Scale_in': self.scale_in,
             'Scale_out': self.scale_out,
             'Scale_weights': self.scale_weights,
+            'Scale_in_eltwise': self.scale_in_eltwise,
+            'fuse_relu': self.fuse_relu,
+            'fuse_residual_connection': self.fuse_residual
         }
         self.outputs = {'Output': output}
...
@@ -123,18 +176,27 @@ class TestConv2dInt8Op(TestConv2dOp):
     def init_test_case(self):
         TestConv2dOp.init_test_case(self)
         self.input_size = [1, 1, 5, 5]  # NCHW
         f_c = self.input_size[1] // self.groups
-        self.filter_size = [1, f_c, 3, 3]
+        self.input_residual_size = [1, 2, 3, 3]
+        self.filter_size = [2, f_c, 3, 3]
         self.scale_in = 1.0
         self.scale_out = 0.5
         self.scale_weights = [10.0]
+        self.scale_in_eltwise = 0.6

-    def init_dtype(self):
+    def init_data_type(self):
         self.srctype = np.uint8
         self.dsttype = np.int8

+    def init_fuse_relu(self):
+        self.fuse_relu = True
+
+    def init_fuse_residual(self):
+        self.fuse_residual = True
+

-#--------------------test conv2d u8 in and s8 out--------------------
+#--------------------test conv2d u8 in and u8 out with residual fuse--------------------
 class TestConv2d(TestConv2dInt8Op):
...
@@ -142,18 +204,21 @@ class TestConv2d(TestConv2dInt8Op):
         self.pad = [0, 0]
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
+        self.input_residual_size = [2, 6, 3, 3]
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
         self.scale_in = 1.0
         self.scale_out = 0.5
         self.scale_weights = [10.0]
+        self.scale_in_eltwise = 0.6


 class TestWithPad(TestConv2d):
     def init_test_case(self):
         TestConv2d.init_test_case(self)
         self.pad = [1, 1]
+        self.input_residual_size = [2, 6, 5, 5]


 class TestWithGroup(TestConv2d):
...
@@ -166,12 +231,14 @@ class TestWithStride(TestConv2dInt8Op):
         self.pad = [1, 1]
         self.stride = [2, 2]
         self.input_size = [2, 3, 6, 6]
+        self.input_residual_size = [2, 6, 3, 3]
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
         self.scale_in = 1.0
         self.scale_out = 0.8
         self.scale_weights = [10.0]
+        self.scale_in_eltwise = 0.5


 class TestWith1x1(TestConv2dInt8Op):
...
@@ -179,12 +246,14 @@ class TestWith1x1(TestConv2dInt8Op):
         self.pad = [0, 0]
         self.stride = [1, 1]
         self.input_size = [1, 3, 5, 5]
+        self.input_residual_size = [1, 6, 5, 5]
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 1, 1]
         self.scale_in = 1.0
         self.scale_out = 0.5
         self.scale_weights = [12.0]
+        self.scale_in_eltwise = 0.5


 class TestWithInput1x1Filter1x1(TestConv2dInt8Op):
...
@@ -192,29 +261,98 @@ class TestWithInput1x1Filter1x1(TestConv2dInt8Op):
         self.pad = [0, 0]
         self.stride = [1, 1]
         self.input_size = [2, 3, 1, 1]
+        self.input_residual_size = [2, 6, 1, 1]
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 1, 1]
         self.scale_in = 1.0
         self.scale_out = 0.5
         self.scale_weights = [10.0]
+        self.scale_in_eltwise = 0.8

     def init_group(self):
         self.groups = 3


-#--------------------test conv2d s8 in and s8 out--------------------
+def init_data_type_with_fusion(self, input_dt, fuse_relu, fuse_residual):
+    self.srctype = input_dt
+    self.dsttype = np.uint8 if fuse_relu else np.int8
+
+    def init_fuse_relu(self):
+        self.fuse_relu = fuse_relu
+
+    def init_fuse_residual(self):
+        self.fuse_residual = fuse_residual
+
+
 def create_test_int8_class(parent):
-    class TestInt8Case(parent):
-        def init_dtype(self):
-            self.srctype = np.int8
-            self.dsttype = np.int8
+    #--------------------test conv2d s8 in and u8 out--------------------
+    class TestS8U8Case(parent):
+        def init_data_type(self):
+            init_data_type_with_fusion(self, np.int8, True, False)

-    cls_name = "{0}_{1}".format(parent.__name__, "s8s8")
-    TestInt8Case.__name__ = cls_name
-    globals()[cls_name] = TestInt8Case
+    #--------------------test conv2d s8 in and s8 out--------------------
+    class TestS8S8Case(parent):
+        def init_data_type(self):
+            init_data_type_with_fusion(self, np.int8, False, False)
+
+    #--------------------test conv2d u8 in and s8 out--------------------
+    class TestU8S8Case(parent):
+        def init_data_type(self):
+            init_data_type_with_fusion(self, np.uint8, False, False)
+
+    #--------------------test conv2d u8 in and u8 out without residual fuse--------------------
+    class TestU8U8Case(parent):
+        def init_data_type(self):
+            init_data_type_with_fusion(self, np.uint8, True, False)
+
+    #--------------------test conv2d s8 in and u8 out with residual fuse--------------------
+    class TestS8U8ResCase(parent):
+        def init_data_type(self):
+            init_data_type_with_fusion(self, np.int8, True, True)
+
+    #--------------------test conv2d s8 in and s8 out with residual fuse--------------------
+    class TestS8S8ResCase(parent):
+        def init_data_type(self):
+            init_data_type_with_fusion(self, np.int8, False, True)
+
+    #--------------------test conv2d u8 in and s8 out with residual fuse--------------------
+    class TestU8S8ResCase(parent):
+        def init_data_type(self):
+            init_data_type_with_fusion(self, np.uint8, False, True)
+
+    cls_name_s8u8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "1")
+    cls_name_s8s8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "0")
+    cls_name_u8s8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "0")
+    cls_name_u8u8 = "{0}_relu_{1}_residual_0".format(parent.__name__, "1")
+    cls_name_s8u8_re_1 = "{0}_relu_{1}_residual_{2}".format(
+        parent.__name__, "1", "1")
+    cls_name_s8s8_re_1 = "{0}_relu_{1}_residual_{2}".format(
+        parent.__name__, "0", "1")
+    cls_name_u8s8_re_1 = "{0}_relu_{1}_residual_{2}".format(
+        parent.__name__, "0", "1")
+    TestS8U8Case.__name__ = cls_name_s8u8
+    TestS8S8Case.__name__ = cls_name_s8s8
+    TestU8S8Case.__name__ = cls_name_u8s8
+    TestU8U8Case.__name__ = cls_name_u8u8
+    TestS8U8ResCase.__name__ = cls_name_s8u8_re_1
+    TestS8S8ResCase.__name__ = cls_name_s8s8_re_1
+    TestU8S8ResCase.__name__ = cls_name_u8s8_re_1
+    globals()[cls_name_s8u8] = TestS8U8Case
+    globals()[cls_name_s8s8] = TestS8S8Case
+    globals()[cls_name_u8s8] = TestU8S8Case
+    globals()[cls_name_u8u8] = TestU8U8Case
+    globals()[cls_name_s8u8_re_1] = TestS8U8ResCase
+    globals()[cls_name_s8s8_re_1] = TestS8S8ResCase
+    globals()[cls_name_u8s8_re_1] = TestU8S8ResCase


 create_test_int8_class(TestConv2dInt8Op)
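`create_test_int8_class` stamps out one unittest class per (source dtype, relu, residual) combination by renaming each subclass and publishing it in `globals()`, which is how the test runner discovers the variants by name. The same pattern in miniature (the names below are illustrative, not from the diff):

    import unittest

    def create_variant(parent, flag):
        class Variant(parent):
            def init_flag(self):
                self.flag = flag

        Variant.__name__ = "{0}_flag_{1}".format(parent.__name__, flag)
        globals()[Variant.__name__] = Variant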
...
python/paddle/fluid/tests/unittests/test_dist_base.py
View file @ eac5a0aa
...
@@ -26,6 +26,7 @@ import pickle
 import numpy as np

 import paddle.fluid as fluid
+from paddle.fluid import compiler

 RUN_STEP = 10
 DEFAULT_BATCH_SIZE = 2
...
@@ -104,8 +105,8 @@ class TestDistRunnerBase(object):
         else:
             place = fluid.CPUPlace()

-        startup_exe = fluid.Executor(place)
-        startup_exe.run(fluid.default_startup_program())
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())

         strategy = fluid.ExecutionStrategy()
         strategy.num_threads = 1
...
@@ -125,19 +126,16 @@ class TestDistRunnerBase(object):
             mypass.set_int("num_repeats", args.batch_merge_repeat)

         if args.update_method == "nccl2":
-            num_trainers = len(args.endpoints.split(","))
-            trainer_id = args.trainer_id
+            build_stra.num_trainers = len(args.endpoints.split(","))
+            build_stra.trainer_id = args.trainer_id
         else:
-            num_trainers = 1
-            trainer_id = 0
+            build_stra.num_trainers = 1
+            build_stra.trainer_id = 0

-        exe = fluid.ParallelExecutor(
-            args.use_cuda,
+        binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
             loss_name=avg_cost.name,
-            exec_strategy=strategy,
             build_strategy=build_stra,
-            num_trainers=num_trainers,
-            trainer_id=trainer_id)
+            exec_strategy=strategy)

         feed_var_list = [
             var for var in trainer_prog.global_block().vars.values()
...
@@ -160,7 +158,8 @@ class TestDistRunnerBase(object):
         out_losses = []
         for _ in six.moves.xrange(RUN_STEP):
-            loss, = exe.run(fetch_list=[avg_cost.name],
-                            feed=feeder.feed(get_data()))
+            loss, = exe.run(binary,
+                            fetch_list=[avg_cost.name],
+                            feed=feeder.feed(get_data()))
             out_losses.append(loss[0])
         if six.PY2:
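The recurring change in this commit replaces direct `fluid.ParallelExecutor` construction with a program compiled once and run by a plain `fluid.Executor`. A condensed, hedged sketch of the migrated pattern, reusing the names from the hunks above:

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
        loss_name=avg_cost.name,
        build_strategy=build_stra,
        exec_strategy=strategy)
    loss, = exe.run(binary, fetch_list=[avg_cost.name],
                    feed=feeder.feed(get_data()))

Note that the trainer count and trainer id now travel on the `BuildStrategy` instead of being `ParallelExecutor` arguments.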
...
python/paddle/fluid/tests/unittests/test_dist_ctr.py
View file @ eac5a0aa
...
@@ -18,7 +18,6 @@ import unittest
 from test_dist_base import TestDistBase

-# FIXME(tangwei): sum op can not handle when inputs is empty.
 class TestDistCTR2x2(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
...
@@ -28,5 +27,19 @@ class TestDistCTR2x2(TestDistBase):
         self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)


+class TestDistCTRWithL2Decay2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._enforce_place = "CPU"
+
+    def test_dist_ctr(self):
+        need_envs = {"USE_L2_DECAY": "1"}
+        self.check_with_place(
+            "dist_ctr.py",
+            delta=1e-7,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
 if __name__ == "__main__":
     unittest.main()
python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
0 → 100644
View file @ eac5a0aa
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
from op_test import OpTest
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid.op import Operator
import paddle.compat as cpt


class TestFusedEmbeddingSeqPoolOp(OpTest):
    def setUp(self):
        self.op_type = "fused_embedding_seq_pool"
        self.emb_size = 2
        table = np.random.random((17, self.emb_size)).astype("float32")
        ids = np.array([[[4], [3]], [[4], [3]], [[2], [1]],
                        [[16], [1]]]).astype("int64")
        merged_ids = np.array([4, 2, 16]).astype("int64")
        ids_expand = np.expand_dims(ids, axis=1)
        self.lod = [[3, 1]]
        self.attrs = {'is_sparse': True}
        self.inputs = {'W': table, 'Ids': (ids_expand, self.lod)}
        self.outputs = {
            'Out': np.reshape(
                np.array([
                    table[[4, 3]] + table[[4, 3]] + table[[2, 1]],
                    table[[16, 1]]
                ]), [len(self.lod[0]), 2 * self.emb_size])
        }

    def test_check_output(self):
        self.check_output()


if __name__ == "__main__":
    unittest.main()
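Reading the expected output above: each LoD sequence looks up its ids in `table`, keeps the per-position embeddings side by side (width `2 * emb_size` per row), and sum-pools the rows of the sequence. Restated for the first sequence (lengths `[3, 1]`), assuming the test's own names:

    # rows 0..2 of ids belong to sequence one; each row yields table[[a, b]]
    seq_one = table[[4, 3]] + table[[4, 3]] + table[[2, 1]]   # (2, emb_size)

which the test then reshapes, together with the single-row second sequence, into the `[2, 2 * emb_size]` output.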
python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py
0 → 100644
View file @ eac5a0aa
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
from op_test import OpTest
from test_reorder_lod_tensor import convert_to_offset
from test_seq_pool import compute_seqpool_sum, compute_seqpool_avg, compute_seqpool_sqrt


class TestFusionSeqPoolConcatOp(OpTest):
    def setUp(self):
        self.w = 11
        self.lods = [[[2, 3, 5]], [[1, 5, 2]]]
        self.set_conf()
        self.set_pooltype()
        self.op_type = 'fusion_seqpool_concat'
        self.axis = 1
        bs = len(self.lods[0][0])
        inputs = []
        outs = []
        i = 0
        for lod in self.lods:
            assert bs == len(lod[0]), 'All lod size should be equal'
            x = np.random.uniform(0.1, 1,
                                  [sum(lod[0]), self.w]).astype('float32')
            offset = convert_to_offset(lod)
            out = np.zeros((bs, self.w)).astype('float32')
            if self.pooltype == "SUM":
                compute_seqpool_sum(x, offset, out)
            elif self.pooltype == "AVERAGE":
                compute_seqpool_avg(x, offset, out)
            elif self.pooltype == "SQRT":
                compute_seqpool_sqrt(x, offset, out)
            else:
                raise Exception("Unsupported pool type!")
            inputs.append(('x_{0}'.format(i), (x, lod)))
            outs.append(out)
            i = i + 1

        self.inputs = {'X': inputs}
        self.outputs = {'Out': np.concatenate(outs, axis=self.axis)}
        self.attrs = {
            'pooltype': self.pooltype,
            'axis': self.axis,
        }

    def set_pooltype(self):
        self.pooltype = "SUM"

    def set_conf(self):
        pass

    def test_check_output(self):
        self.check_output()


class TestFusionSeqPoolConcatOpCase1(TestFusionSeqPoolConcatOp):
    def set_conf(self):
        self.lods = [[[1]]]


class TestFusionSeqPoolConcatOpCase2(TestFusionSeqPoolConcatOp):
    def set_conf(self):
        self.lods = [[[1]], [[1]], [[1]]]


class TestFusionSeqPoolConcatOpCase3(TestFusionSeqPoolConcatOp):
    def set_conf(self):
        self.lods = [[[1, 3, 4, 6]]]
        self.w = 10


class TestFusionSeqPoolConcatOpCase4(TestFusionSeqPoolConcatOp):
    def set_conf(self):
        self.lods = [[[2, 13, 4]], [[1, 1, 1]], [[5, 3, 1]], [[9, 10, 3]]]
        self.w = 3


## test avg pool and sqrt
def create_test_avg_sqrt_class(parent):
    class TestSeqPoolAvgCase(parent):
        def set_pooltype(self):
            self.pooltype = "AVERAGE"

    class TestSeqPoolSqrtCase(parent):
        def set_pooltype(self):
            self.pooltype = "SQRT"

    cls_name_avg = "{0}_{1}".format(parent.__name__, "avg")
    cls_name_sqrt = "{0}_{1}".format(parent.__name__, "sqrt")
    TestSeqPoolAvgCase.__name__ = cls_name_avg
    TestSeqPoolSqrtCase.__name__ = cls_name_sqrt
    globals()[cls_name_avg] = TestSeqPoolAvgCase
    globals()[cls_name_sqrt] = TestSeqPoolSqrtCase


create_test_avg_sqrt_class(TestFusionSeqPoolConcatOp)
create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase1)
create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase2)
create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase3)
create_test_avg_sqrt_class(TestFusionSeqPoolConcatOpCase4)


if __name__ == '__main__':
    unittest.main()
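`fusion_seqpool_concat` pools every LoD input down to one row per sequence and concatenates the pooled rows along `axis=1`, so `n` inputs of width `w` over `bs` sequences produce a `[bs, n * w]` output. A small hedged check of that shape contract, assuming the imports of the test above:

    lods = [[[2, 3, 5]], [[1, 5, 2]]]   # two inputs, bs = 3
    w, outs = 11, []
    for lod in lods:
        x = np.random.uniform(0.1, 1, [sum(lod[0]), w]).astype('float32')
        out = np.zeros((len(lod[0]), w)).astype('float32')
        compute_seqpool_sum(x, convert_to_offset(lod), out)
        outs.append(out)
    assert np.concatenate(outs, axis=1).shape == (3, 22)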
python/paddle/fluid/tests/unittests/test_ir_graph.py
0 → 100644
View file @ eac5a0aa
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
import six
from paddle import fluid


class TestIRGraph(unittest.TestCase):
    """
    TODO(fc500110): `resolve_hazard` api will be tested when it can be used.
    """

    def test_nodes(self):
        graph = build_graph()
        self.assertTrue(
            {node.name() for node in graph.nodes()} == {"x1", "x2", "out",
                                                        "sum"})

    def test_has_set_get(self):
        graph = build_graph()
        for attr_name in ["int", "float", "string"]:
            self.assertFalse(graph.has(attr_name))
        graph.set("int", 1)
        graph.set("float", 0.5)
        graph.set("string", "string")
        for attr_name in ["int", "float", "string"]:
            self.assertTrue(graph.has(attr_name))

        self.assertTrue(graph.get_int("int") == 1)
        self.assertTrue(graph.get_float("float") == 0.5)
        self.assertTrue(graph.get_string("string") == "string")

    def test_erase(self):
        graph = build_graph()
        graph.set("test", 0)
        self.assertTrue(graph.has("test"))
        graph.erase("test")
        self.assertFalse(graph.has("test"))

    def test_create_var_node(self):
        prog = fluid.core.ProgramDesc()
        block = prog.block(0)
        shape = [10, 20]
        x1 = block.var(six.b("x1"))
        x1.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
        x1.set_shape(shape)
        graph = fluid.core.Graph(prog)
        node = graph.create_var_node(x1)
        self.assertTrue(node.node_type() == fluid.core.Node.Type.Variable)

    def test_create_op_node(self):
        prog = fluid.core.ProgramDesc()
        block = prog.block(0)
        sum_op_desc = block.append_op()
        graph = fluid.core.Graph(prog)
        node = graph.create_op_node(sum_op_desc)
        self.assertTrue(node.node_type() == fluid.core.Node.Type.Operation)

    def test_create_control_dep_var(self):
        graph = build_graph()
        name = "__control_var@{}".format(len(graph.nodes()))
        node = graph.create_control_dep_var()
        self.assertTrue(node.name() == name)

    def test_create_empty_node(self):
        prog = fluid.core.ProgramDesc()
        graph = fluid.core.Graph(prog)
        n1 = graph.create_empty_node('x', fluid.core.Node.Type.Operation)
        self.assertTrue(n1.name() == 'x')
        n2 = graph.create_empty_node('y', fluid.core.Node.Type.Variable)
        self.assertTrue(n2.name() == 'y')

    def test_release_nodes(self):
        graph = build_graph()
        nodes = graph.release_nodes()
        self.assertTrue(len(graph.nodes()) == 0)
        self.assertTrue({node.name()
                         for node in nodes} == {"x1", "x2", "out", "sum"})

    def test_remove_node(self):
        graph = build_graph()
        nodes = graph.nodes()
        for node in nodes:
            if node.name() == "sum":
                break
        self.assertTrue({node.name()
                         for node in nodes} == {"x1", "x2", "out", "sum"})
        nodes.remove(node)
        self.assertTrue({node.name()
                         for node in nodes} == {"x1", "x2", "out"})

    def test_retrieve_node(self):
        graph = build_graph()
        nodes = []
        for i in range(len(graph.nodes())):
            nodes.append(graph.retrieve_node(i))

        for node in nodes:
            self.assertTrue(node in graph.nodes())

    def resolve_hazard(self):
        pass


def build_graph():
    prog = fluid.core.ProgramDesc()
    block = prog.block(0)
    shape = [10, 20]

    # prepare input/output
    x1 = block.var(six.b("x1"))
    x1.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
    x1.set_shape(shape)
    x2 = block.var(six.b("x2"))
    x2.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)
    x2.set_shape(shape)

    out = block.var(six.b("out"))
    out.set_type(fluid.core.VarDesc.VarType.LOD_TENSOR)

    sum_op_desc = block.append_op()
    sum_op_desc.set_type("sum")
    sum_op_desc.set_input("X", ["x1", "x2"])
    sum_op_desc.set_output("Out", ["out"])

    sum_op_desc.check_attrs()
    sum_op_desc.infer_shape(block)
    graph = fluid.core.Graph(prog)
    return graph


if __name__ == "__main__":
    unittest.main()
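As a usage note, `build_graph` wraps a hand-built `ProgramDesc` (two inputs feeding one `sum` op) in the C++ `fluid.core.Graph` binding, and every test above queries or mutates that object. Reusing only calls shown in the file:

    graph = build_graph()
    assert {node.name() for node in graph.nodes()} == {"x1", "x2", "out", "sum"}
    graph.set("int", 1)
    assert graph.get_int("int") == 1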
python/paddle/fluid/tests/unittests/test_optimizer.py
View file @ eac5a0aa
...
@@ -61,6 +61,48 @@ class TestOptimizer(unittest.TestCase):
         self.assertEqual([op.type for op in opts], ["sgd"])


+class TestOptimizerBackwardApplygrad(unittest.TestCase):
+    def test_sgd_optimizer(self):
+        def check_sgd_optimizer(optimizer_attr):
+            init_program = framework.Program()
+            program = framework.Program()
+            block = program.global_block()
+            mul_x = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="mul.x",
+                optimize_attr=optimizer_attr)
+            mul_y = block.create_var(
+                dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+            mul_out = block.create_var(
+                dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+            mean_out = block.create_var(
+                dtype="float32", shape=[1], lod_level=0, name="mean.out")
+            block.append_op(
+                type="mul",
+                inputs={"X": mul_x,
+                        "Y": mul_y},
+                outputs={"Out": mul_out},
+                attrs={"x_num_col_dims": 1})
+            block.append_op(
+                type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+            sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
+            with framework.program_guard(program, init_program):
+                p_g = sgd_optimizer.backward(mean_out)
+                opts = sgd_optimizer.apply_gradients(p_g)
+            return opts
+
+        opts = check_sgd_optimizer({'learning_rate': 1.1})
+        self.assertEqual(len(opts), 3)
+        self.assertEqual([op.type for op in opts],
+                         ["fill_constant", "elementwise_mul", "sgd"])
+
+        opts = check_sgd_optimizer({'learning_rate': 1.0})
+        self.assertEqual(len(opts), 1)
+        self.assertEqual([op.type for op in opts], ["sgd"])
+
+
 class TestMomentumOptimizer(unittest.TestCase):
     class MockMomentum(optimizer.MomentumOptimizer):
         def get_accumulators(self):
...
@@ -99,8 +141,8 @@ class TestMomentumOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer._create_optimization_pass(
-            params_grads, mul_out, init_program)
+        with framework.program_guard(program, init_program):
+            opts = momentum_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         sgd_op = opts[-1]
         self.assertEqual([op.type for op in opts],
...
@@ -153,8 +195,8 @@ class TestMomentumOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer._create_optimization_pass(
-            params_grads, mul_out, init_program)
+        with framework.program_guard(program, init_program):
+            opts = momentum_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         sgd_op = opts[-1]
         self.assertEqual([op.type for op in opts],
...
@@ -216,8 +258,8 @@ class TestAdagradOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
-        opts = adagrad_optimizer._create_optimization_pass(
-            params_grads, mul_out, init_program)
+        with framework.program_guard(program, init_program):
+            opts = adagrad_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         self.assertEqual(
             [op.type for op in opts],
             ["fill_constant", "elementwise_mul", "adagrad"])
...
@@ -280,8 +322,8 @@ class TestAdamOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
-        opts = adam_optimizer._create_optimization_pass(params_grads, mul_out,
-                                                        init_program)
+        with framework.program_guard(program, init_program):
+            opts = adam_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 5)
         self.assertEqual(
             [op.type for op in opts],
...
@@ -347,8 +389,8 @@ class TestAdamaxOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
-        opts = adamax_optimizer._create_optimization_pass(params_grads, mul_out,
-                                                          init_program)
+        with framework.program_guard(program, init_program):
+            opts = adamax_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 4)
         self.assertEqual(
             [op.type for op in opts],
...
@@ -411,8 +453,8 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
-        opts = decayed_adagrad_optimizer._create_optimization_pass(
-            params_grads, mul_out, init_program)
+        with framework.program_guard(program, init_program):
+            opts = decayed_adagrad_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         self.assertEqual(
             [op.type for op in opts],
...
@@ -477,8 +519,8 @@ class TestFtrlOptimizer(unittest.TestCase):
         params_grads = append_backward(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
-        opts = ftrl_optimizer._create_optimization_pass(params_grads, mul_out,
-                                                        init_program)
+        with framework.program_guard(program, init_program):
+            opts = ftrl_optimizer.apply_gradients(params_grads)
         self.assertEqual(len(opts), 3)
         self.assertEqual([op.type for op in opts],
                          ["fill_constant", "elementwise_mul", "ftrl"])
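These hunks migrate the optimizer tests from the private `_create_optimization_pass` to the public two-step API, which the new `TestOptimizerBackwardApplygrad` exercises end to end. The pattern under test, distilled from the hunks above:

    with framework.program_guard(program, init_program):
        p_g = sgd_optimizer.backward(mean_out)       # parameter/gradient pairs
        opts = sgd_optimizer.apply_gradients(p_g)    # optimize ops

With `optimize_attr['learning_rate']` at 1.1 the op list gains `fill_constant` and `elementwise_mul` to scale the gradient before `sgd`; at 1.0 it is just `["sgd"]`.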
...
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
View file @ eac5a0aa
...
@@ -74,7 +74,11 @@ class TestMNIST(TestParallelExecutorBase):
         label = np.ones(shape=[32, 1], dtype='int64')
         return img, label

-    def _compare_reduce_and_allreduce(self, model, use_cuda):
+    def _compare_reduce_and_allreduce(self,
+                                      model,
+                                      use_cuda,
+                                      delta1=1e-6,
+                                      delta2=1e-4):
         if use_cuda and not core.is_compiled_with_cuda():
             return
...
@@ -95,9 +99,9 @@ class TestMNIST(TestParallelExecutorBase):
             use_reduce=True)

         for loss in zip(all_reduce_first_loss, reduce_first_loss):
-            self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
+            self.assertAlmostEqual(loss[0], loss[1], delta=delta1)
         for loss in zip(all_reduce_last_loss, reduce_last_loss):
-            self.assertAlmostEqual(loss[0], loss[1], delta=1e-4)
+            self.assertAlmostEqual(loss[0], loss[1], delta=delta2)

     # simple_fc
     def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
...
@@ -174,8 +178,9 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor)

     def test_batchnorm_fc_with_new_strategy(self):
-        # FIXME(zcd): close this test temporally.
-        # self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
+        # NOTE: the computation result of nccl_reduce is non-deterministic,
+        # related issue: https://github.com/NVIDIA/nccl/issues/157
+        self._compare_reduce_and_allreduce(fc_with_batchnorm, True, 1e-5, 1e-3)
         self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
...
python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
View file @ eac5a0aa
...
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import paddle.fluid as fluid
+from paddle.fluid import compiler
 import paddle.fluid.core as core
 import numpy as np
 import unittest
...
@@ -61,22 +62,21 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
         exe.run(startup)
         feed_dict = {'image': image, 'label': label}

-        train_exe = fluid.ParallelExecutor(
-            use_cuda=use_cuda, loss_name=loss.name,
-            main_program=main,
-            build_strategy=build_strategy)
-        test_exe = fluid.ParallelExecutor(
-            use_cuda=use_cuda,
-            main_program=test_program,
-            share_vars_from=train_exe,
-            build_strategy=build_strategy)
+        train_cp = compiler.CompiledProgram(main).with_data_parallel(
+            loss_name=loss.name, build_strategy=build_strategy)
+        test_cp = compiler.CompiledProgram(test_program).with_data_parallel(
+            loss_name=loss.name,
+            build_strategy=build_strategy,
+            share_vars_from=train_cp)

         for i in range(5):
-            test_loss, = test_exe.run([loss.name], feed=feed_dict)
-            train_loss, = train_exe.run([loss.name], feed=feed_dict)
+            exe.run(train_cp, feed=feed_dict, fetch_list=[loss.name])
+            test_loss, = exe.run(test_cp,
+                                 feed=feed_dict,
+                                 fetch_list=[loss.name])
+            train_loss, = exe.run(train_cp,
+                                  feed=feed_dict,
+                                  fetch_list=[loss.name])

         avg_test_loss_val = np.array(test_loss).mean()
         if math.isnan(float(avg_test_loss_val)):
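This is the test-time half of the migration: the compiled test program reuses the training program's variables via `share_vars_from=train_cp`, and a single `fluid.Executor` now drives both compiled programs. In short, under the names above:

    test_cp = compiler.CompiledProgram(test_program).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        share_vars_from=train_cp)   # evaluate with the training parameters
    test_loss, = exe.run(test_cp, feed=feed_dict, fetch_list=[loss.name])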
...
python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
View file @ eac5a0aa
...
@@ -22,6 +22,14 @@ import numpy
 import functools


+def convert_to_offset(lod):
+    offset = [[0] for i in lod]
+    for i, level in enumerate(lod):
+        for seq_len in level:
+            offset[i].append(offset[i][-1] + seq_len)
+    return offset
+
+
 class TestReorderLoDTensor(unittest.TestCase):
     num_seq = 5
     # [name, shape, lod_level] pair indicating data info of source and target
...
@@ -91,13 +99,6 @@ class TestReorderLoDTensor(unittest.TestCase):
             self.inputs[desc[0]] = tensor

     def reorder(self):
-        def convert_to_offset(lod):
-            offset_lod = [[0] for i in lod]
-            for i, level in enumerate(lod):
-                for seq_len in level:
-                    offset_lod[i].append(offset_lod[i][-1] + seq_len)
-            return offset_lod
-
         level = 0
         # compute the rank_table according to ref_lod
         ref_lod = self.data[self.data_desc[1][0]][1][level]
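`convert_to_offset` is hoisted from a local helper to module level here so that test_seq_pool.py and the fusion tests above can import it. It turns per-sequence lengths into cumulative offsets:

    >>> convert_to_offset([[2, 3, 5]])
    [[0, 2, 5, 10]]

so `x[offset[0][i]:offset[0][i + 1]]` slices out sequence `i`.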
...
python/paddle/fluid/tests/unittests/test_seq_pool.py
View file @ eac5a0aa
...
@@ -17,33 +17,43 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+from test_reorder_lod_tensor import convert_to_offset


-class TestSeqAvgPool(OpTest):
-    def convert_to_offset(self, lod):
-        offset = [[0] for i in lod]
-        for i, level in enumerate(lod):
-            for seq_len in level:
-                offset[i].append(offset[i][-1] + seq_len)
-        return offset
+def compute_seqpool_sum(x, offset, out):
+    for i in range(len(offset[0]) - 1):
+        sub_x = x[offset[0][i]:offset[0][i + 1], :]
+        out[i] = sub_x.sum(axis=0)
+
+
+def compute_seqpool_avg(x, offset, out):
+    for i in range(len(offset[0]) - 1):
+        sub_x = x[offset[0][i]:offset[0][i + 1], :]
+        out[i] = sub_x.mean(axis=0)
+
+
+def compute_seqpool_sqrt(x, offset, out):
+    for i in range(len(offset[0]) - 1):
+        sub_x = x[offset[0][i]:offset[0][i + 1], :]
+        seq_len = offset[0][i + 1] - offset[0][i]
+        out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len)
+

+class TestSeqAvgPool(OpTest):
     def set_data(self):
         self.op_type = 'sequence_pool'
         # one level, batch size is 4
         x = np.random.uniform(0.1, 1, [11, 23]).astype('float32')
         lod = [[11]]
         self.inputs = {'X': (x, lod)}
-        offset = self.convert_to_offset(lod)
+        offset = convert_to_offset(lod)
         out = np.zeros((len(lod[0]), 23)).astype('float32')
         self.outputs = {'Out': out}
         return x, offset, out

     def compute(self, x, offset, out):
         self.attrs = {'pooltype': "AVERAGE"}
-        for i in range(len(offset[0]) - 1):
-            sub_x = x[offset[0][i]:offset[0][i + 1], :]
-            out[i] = sub_x.mean(axis=0)
+        compute_seqpool_avg(x, offset, out)

     def setUp(self):
         x, offset, out = self.set_data()
...
@@ -62,9 +72,7 @@ class TestSeqAvgPool(OpTest):
 class TestSeqSumPool(TestSeqAvgPool):
     def compute(self, x, offset, out):
         self.attrs = {'pooltype': "SUM"}
-        for i in range(len(offset[0]) - 1):
-            sub_x = x[offset[0][i]:offset[0][i + 1], :]
-            out[i] = sub_x.sum(axis=0)
+        compute_seqpool_sum(x, offset, out)


 class TestSeqMaxPool(TestSeqAvgPool):
...
@@ -72,7 +80,7 @@ class TestSeqMaxPool(TestSeqAvgPool):
         self.op_type = 'sequence_pool'
         x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
         lod = [[13]]
-        offset = self.convert_to_offset(lod)
+        offset = convert_to_offset(lod)
         for i in range(len(offset[0]) - 1):
             l = offset[0][i + 1] - offset[0][i]
             x[offset[0][i] + np.random.randint(l), :] += 2.0
...
@@ -93,10 +101,7 @@ class TestSeqMaxPool(TestSeqAvgPool):
 class TestSeqSqrtPool(TestSeqAvgPool):
     def compute(self, x, offset, out):
         self.attrs = {'pooltype': "SQRT"}
-        for i in range(len(offset[0]) - 1):
-            sub_x = x[offset[0][i]:offset[0][i + 1], :]
-            seq_len = offset[0][i + 1] - offset[0][i]
-            out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len)
+        compute_seqpool_sqrt(x, offset, out)


 class TestSeqLastPool(TestSeqAvgPool):
...
@@ -122,7 +127,7 @@ class TestSeqAvgPool2D(TestSeqAvgPool):
         x = np.random.uniform(0.1, 1, [13, 3, 17]).astype('float32')
         lod = [[4, 1, 3, 5]]
         self.inputs = {'X': (x, lod)}
-        offset = self.convert_to_offset(lod)
+        offset = convert_to_offset(lod)
         out = np.zeros((4, 3, 17)).astype('float32')
         self.outputs = {'Out': out}
...
@@ -167,7 +172,7 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
         x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32')
         lod = [[4, 1, 3, 5]]
         self.inputs = {'X': (x, lod)}
-        offset = self.convert_to_offset(lod)
+        offset = convert_to_offset(lod)
         for i in range(len(offset[0]) - 1):
             l = offset[0][i + 1] - offset[0][i]
             x[offset[0][i] + np.random.randint(l), :] += 1.0
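The three extracted reference kernels differ only in the reduction applied to each sequence slice: SUM adds the rows, AVERAGE divides the sum by the sequence length, SQRT divides it by the square root of the length. A tiny check of the SQRT variant, assuming the module's own imports:

    x = np.ones((3, 2), dtype='float32')    # one sequence of length 3
    out = np.zeros((1, 2), dtype='float32')
    compute_seqpool_sqrt(x, convert_to_offset([[3]]), out)
    # column sums are [3, 3]; divided by sqrt(3) -> approx. [1.7321, 1.7321]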
...
python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
View file @ eac5a0aa
...
@@ -28,6 +28,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
     def initParams(self):
         self.numeric_stable_mode = False
+        self.dtype = np.float64

     def setUp(self):
         self.initParams()
...
@@ -36,19 +37,19 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
         class_num = 37

         logits = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float64")
+                                   [batch_size, class_num]).astype(self.dtype)
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
         labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")

         cross_entropy = np.asmatrix(
             [[-np.log(softmax[i][labels[i][0]])]
              for i in range(softmax.shape[0])],
-            dtype="float64")
+            dtype=self.dtype)

         self.inputs = {"Logits": logits, "Label": labels}
         self.outputs = {
-            "Softmax": softmax.astype("float64"),
-            "Loss": cross_entropy.astype("float64")
+            "Softmax": softmax.astype(self.dtype),
+            "Loss": cross_entropy.astype(self.dtype)
         }
         self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
...
@@ -56,7 +57,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
         self.check_output()

     def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss")
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)


 class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
...
@@ -64,6 +65,55 @@ class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
         self.numeric_stable_mode = True


+class TestSoftmaxWithCrossEntropyOpFp16(TestSoftmaxWithCrossEntropyOp):
+    def initParams(self):
+        self.numeric_stable_mode = False
+        self.dtype = np.float16
+
+    def setUp(self):
+        self.initParams()
+        self.op_type = "softmax_with_cross_entropy"
+        batch_size = 41
+        class_num = 37
+
+        # NOTE: numpy float16 have very low accuracy, use float32 for numpy check.
+        logits = np.random.uniform(0.1, 1.0,
+                                   [batch_size, class_num]).astype(np.float32)
+        softmax = np.apply_along_axis(stable_softmax, 1, logits)
+        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")
+
+        cross_entropy = np.asmatrix(
+            [[-np.log(softmax[i][labels[i][0]])]
+             for i in range(softmax.shape[0])],
+            dtype=np.float32)
+
+        self.inputs = {
+            "Logits": logits.astype(self.dtype).view(np.uint16),
+            "Label": labels
+        }
+        self.outputs = {
+            "Softmax": softmax.astype(self.dtype),
+            "Loss": cross_entropy.astype(self.dtype)
+        }
+        self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
+
+    def test_check_output(self):
+        self.check_output(atol=1e-2)
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
+
+
+class TestSoftmaxWithCrossEntropyOpNoCudnnFp16(
+        TestSoftmaxWithCrossEntropyOpFp16):
+    def initParams(self):
+        self.numeric_stable_mode = True
+        self.dtype = np.float16
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
+
+
 class TestSoftmaxWithCrossEntropyOp2(OpTest):
     """
     Test softmax with cross entropy operator with soft labels.
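The fp16 variant computes its NumPy reference in float32 (float16 accumulation is too lossy for checking), then hands the operator the half-precision logits as a raw bit pattern: `.astype(self.dtype).view(np.uint16)` reinterprets the same two-byte elements as uint16, which appears to be how the OpTest harness transports fp16 tensors here. The reinterpretation is lossless:

    a = np.array([1.5, -2.0], dtype=np.float16)
    bits = a.view(np.uint16)                  # same memory, uint16 view
    assert np.array_equal(bits.view(np.float16), a)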
...
python/paddle/fluid/transpiler/distribute_transpiler.py
View file @ eac5a0aa
...
@@ -752,12 +752,6 @@ class DistributeTranspiler(object):
             elif op not in lr_ops:
                 self._append_pserver_non_opt_ops(block, op)

-        def __op_have_grad_input__(op):
-            for varname in op.input_arg_names:
-                if varname.find("@GRAD") >= 0:
-                    return varname
-            return ""
-
         def __clone_lr_op_sub_block__(op, program, lr_block):
             if not op.has_attr('sub_block'):
                 return
...
@@ -808,7 +802,7 @@ class DistributeTranspiler(object):
         merged_var = None
         for _, op in enumerate(self.optimize_ops):
             # find the origin grad var before clipping/L2Decay,
             # merged_var should be the input var name of L2Decay
             grad_varname_for_block = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
             if op.attr(OP_ROLE_VAR_ATTR_NAME)[
                     0] == optimize_target_param_name:
...
@@ -1684,7 +1678,16 @@ class DistributeTranspiler(object):
                 if self.config.enable_dc_asgd:
                     new_inputs[key] = dc
                 else:
-                    new_inputs[key] = merged_var
+                    # Note!! This is for l2decay on sparse gradient, because it will create a new tensor for
+                    # decayed gradient but not inplace modify the origin one
+                    origin_grad_name = opt_op.input(key)[0]
+                    if core.kNewGradSuffix(
+                    ) in origin_grad_name and pserver_block.has_var(
+                            origin_grad_name):
+                        new_grad = pserver_block.var(origin_grad_name)
+                        new_inputs[key] = new_grad
+                    else:
+                        new_inputs[key] = merged_var
             elif key == "Param":
                 param_block = _get_param_block(opt_op)
                 if not param_block:
...
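The last hunk's rule in isolation: when L2 decay runs on a sparse gradient it writes the decayed result into a freshly created variable (whose name carries `core.kNewGradSuffix()`) instead of modifying the merged gradient in place, so the pserver-side optimizer input must be rewired to that variable when it exists. A hedged restatement of the branch as a helper (the function itself is illustrative, not from the diff; it reuses only calls shown above):

    def pick_grad_input(opt_op, key, pserver_block, merged_var):
        origin_grad_name = opt_op.input(key)[0]
        if core.kNewGradSuffix() in origin_grad_name and \
                pserver_block.has_var(origin_grad_name):
            return pserver_block.var(origin_grad_name)   # decayed gradient
        return merged_var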