Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
4a443ffc
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
4a443ffc
编写于
1月 08, 2019
作者:
S
sneaxiy
浏览文件
操作
浏览文件
下载
差异文件
merge develop
test=develop
上级
7c7342bf
b1ea335f
变更
120
显示空白变更内容
内联
并排
Showing
120 changed file
with
4359 addition
and
1392 deletion
+4359
-1392
CMakeLists.txt
CMakeLists.txt
+8
-1
cmake/FindJeMalloc.cmake
cmake/FindJeMalloc.cmake
+21
-0
cmake/configure.cmake
cmake/configure.cmake
+1
-0
cmake/cuda.cmake
cmake/cuda.cmake
+4
-2
cmake/external/ngraph.cmake
cmake/external/ngraph.cmake
+1
-1
cmake/generic.cmake
cmake/generic.cmake
+5
-1
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+1
-1
paddle/fluid/framework/details/CMakeLists.txt
paddle/fluid/framework/details/CMakeLists.txt
+2
-0
paddle/fluid/framework/details/all_reduce_op_handle.cc
paddle/fluid/framework/details/all_reduce_op_handle.cc
+93
-82
paddle/fluid/framework/details/build_strategy.cc
paddle/fluid/framework/details/build_strategy.cc
+46
-16
paddle/fluid/framework/details/build_strategy.h
paddle/fluid/framework/details/build_strategy.h
+14
-2
paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
...fluid/framework/details/multi_devices_graph_check_pass.cc
+57
-47
paddle/fluid/framework/details/multi_devices_graph_pass.cc
paddle/fluid/framework/details/multi_devices_graph_pass.cc
+475
-388
paddle/fluid/framework/details/multi_devices_graph_pass.h
paddle/fluid/framework/details/multi_devices_graph_pass.h
+106
-38
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
...le/fluid/framework/details/parallel_ssa_graph_executor.cc
+99
-0
paddle/fluid/framework/details/parallel_ssa_graph_executor.h
paddle/fluid/framework/details/parallel_ssa_graph_executor.h
+51
-0
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
...id/framework/details/scope_buffered_ssa_graph_executor.cc
+1
-1
paddle/fluid/framework/naive_executor.cc
paddle/fluid/framework/naive_executor.cc
+8
-8
paddle/fluid/framework/ngraph_bridge.cc
paddle/fluid/framework/ngraph_bridge.cc
+3
-0
paddle/fluid/framework/ngraph_operator.cc
paddle/fluid/framework/ngraph_operator.cc
+1
-1
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+24
-0
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+103
-30
paddle/fluid/framework/parallel_executor.h
paddle/fluid/framework/parallel_executor.h
+10
-0
paddle/fluid/framework/tensor_util.h
paddle/fluid/framework/tensor_util.h
+0
-22
paddle/fluid/framework/threadpool.cc
paddle/fluid/framework/threadpool.cc
+0
-1
paddle/fluid/inference/analysis/argument.h
paddle/fluid/inference/analysis/argument.h
+0
-2
paddle/fluid/inference/analysis/ir_pass_manager.cc
paddle/fluid/inference/analysis/ir_pass_manager.cc
+0
-10
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+11
-7
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
...id/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+5
-3
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
...uid/inference/analysis/passes/ir_analysis_compose_pass.cc
+0
-23
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
...luid/inference/analysis/passes/ir_analysis_compose_pass.h
+0
-2
paddle/fluid/inference/api/analysis_config.cc
paddle/fluid/inference/api/analysis_config.cc
+154
-66
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+46
-37
paddle/fluid/inference/api/analysis_predictor_tester.cc
paddle/fluid/inference/api/analysis_predictor_tester.cc
+15
-15
paddle/fluid/inference/api/api_anakin_engine.h
paddle/fluid/inference/api/api_anakin_engine.h
+0
-2
paddle/fluid/inference/api/api_impl.cc
paddle/fluid/inference/api/api_impl.cc
+1
-1
paddle/fluid/inference/api/api_impl_tester.cc
paddle/fluid/inference/api/api_impl_tester.cc
+2
-1
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+4
-5
paddle/fluid/inference/api/demo_ci/vis_demo.cc
paddle/fluid/inference/api/demo_ci/vis_demo.cc
+6
-7
paddle/fluid/inference/api/paddle_analysis_config.h
paddle/fluid/inference/api/paddle_analysis_config.h
+88
-21
paddle/fluid/inference/api/paddle_inference_api.h
paddle/fluid/inference/api/paddle_inference_api.h
+2
-3
paddle/fluid/inference/api/paddle_pass_builder.h
paddle/fluid/inference/api/paddle_pass_builder.h
+11
-1
paddle/fluid/inference/tensorrt/CMakeLists.txt
paddle/fluid/inference/tensorrt/CMakeLists.txt
+1
-0
paddle/fluid/inference/tensorrt/op_teller.cc
paddle/fluid/inference/tensorrt/op_teller.cc
+49
-0
paddle/fluid/inference/tensorrt/op_teller.h
paddle/fluid/inference/tensorrt/op_teller.h
+68
-0
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+5
-5
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+3
-6
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+4
-5
paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
+4
-5
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+5
-6
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+4
-6
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+14
-14
paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+4
-6
paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
...le/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+4
-5
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
...le/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+3
-6
paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
...nference/tests/api/analyzer_text_classification_tester.cc
+4
-5
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+5
-6
paddle/fluid/inference/tests/api/config_printer.h
paddle/fluid/inference/tests/api/config_printer.h
+10
-6
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+4
-1
paddle/fluid/inference/tests/api/trt_models_tester.cc
paddle/fluid/inference/tests/api/trt_models_tester.cc
+12
-12
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+1
-1
paddle/fluid/operators/conv_mkldnn_op.cc
paddle/fluid/operators/conv_mkldnn_op.cc
+360
-3
paddle/fluid/operators/conv_op.cc
paddle/fluid/operators/conv_op.cc
+29
-4
paddle/fluid/operators/conv_op.h
paddle/fluid/operators/conv_op.h
+3
-9
paddle/fluid/operators/detection/density_prior_box_op.cu
paddle/fluid/operators/detection/density_prior_box_op.cu
+1
-1
paddle/fluid/operators/distributed/parameter_prefetch.cc
paddle/fluid/operators/distributed/parameter_prefetch.cc
+15
-9
paddle/fluid/operators/distributed/parameter_prefetch.h
paddle/fluid/operators/distributed/parameter_prefetch.h
+50
-1
paddle/fluid/operators/fused/CMakeLists.txt
paddle/fluid/operators/fused/CMakeLists.txt
+4
-2
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
+194
-0
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+142
-0
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+2
-2
paddle/fluid/operators/hierarchical_sigmoid_op.cc
paddle/fluid/operators/hierarchical_sigmoid_op.cc
+34
-13
paddle/fluid/operators/hierarchical_sigmoid_op.h
paddle/fluid/operators/hierarchical_sigmoid_op.h
+58
-27
paddle/fluid/operators/huber_loss_op.h
paddle/fluid/operators/huber_loss_op.h
+4
-2
paddle/fluid/operators/linear_chain_crf_op.cc
paddle/fluid/operators/linear_chain_crf_op.cc
+2
-0
paddle/fluid/operators/lookup_table_op.cu
paddle/fluid/operators/lookup_table_op.cu
+2
-1
paddle/fluid/operators/lookup_table_op.h
paddle/fluid/operators/lookup_table_op.h
+2
-1
paddle/fluid/operators/math/blas_impl.cu.h
paddle/fluid/operators/math/blas_impl.cu.h
+70
-64
paddle/fluid/operators/math/matrix_bit_code.cc
paddle/fluid/operators/math/matrix_bit_code.cc
+0
-35
paddle/fluid/operators/math/matrix_bit_code.h
paddle/fluid/operators/math/matrix_bit_code.h
+21
-25
paddle/fluid/operators/nce_op.cc
paddle/fluid/operators/nce_op.cc
+20
-6
paddle/fluid/operators/nce_op.h
paddle/fluid/operators/nce_op.h
+91
-46
paddle/fluid/operators/ngraph/ngraph_ops.h
paddle/fluid/operators/ngraph/ngraph_ops.h
+2
-0
paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h
paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h
+61
-0
paddle/fluid/operators/ngraph/ops/mean_op.h
paddle/fluid/operators/ngraph/ops/mean_op.h
+68
-0
paddle/fluid/operators/ngraph/ops/scale_op.h
paddle/fluid/operators/ngraph/ops/scale_op.h
+41
-0
paddle/fluid/operators/optimizers/adam_op.h
paddle/fluid/operators/optimizers/adam_op.h
+10
-3
paddle/fluid/operators/reader/ctr_reader.h
paddle/fluid/operators/reader/ctr_reader.h
+1
-1
paddle/fluid/platform/CMakeLists.txt
paddle/fluid/platform/CMakeLists.txt
+2
-2
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+5
-13
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+52
-24
paddle/fluid/platform/device_context_test.cu
paddle/fluid/platform/device_context_test.cu
+3
-0
paddle/fluid/platform/mkldnn_reuse.h
paddle/fluid/platform/mkldnn_reuse.h
+96
-16
paddle/fluid/platform/nccl_helper.h
paddle/fluid/platform/nccl_helper.h
+1
-1
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+6
-5
paddle/fluid/platform/temporary_allocator_test.cc
paddle/fluid/platform/temporary_allocator_test.cc
+63
-52
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+4
-7
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+5
-3
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+6
-16
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+7
-0
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+22
-11
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+14
-0
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+3
-2
python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
...addle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
+31
-0
python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
...ddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
+40
-0
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
...ddle/fluid/tests/unittests/parallel_executor_test_base.py
+0
-1
python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
+3
-2
python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py
...addle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py
+270
-0
python/paddle/fluid/tests/unittests/test_conv2d_op.py
python/paddle/fluid/tests/unittests/test_conv2d_op.py
+4
-3
python/paddle/fluid/tests/unittests/test_dist_base.py
python/paddle/fluid/tests/unittests/test_dist_base.py
+2
-2
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+146
-10
python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
...d/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+6
-0
python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
...addle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
+51
-0
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+3
-3
python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py
...le/fluid/tests/unittests/test_hsigmoid_remote_table_op.py
+269
-0
python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py
.../paddle/fluid/tests/unittests/test_nce_remote_table_op.py
+236
-0
python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
...addle/fluid/tests/unittests/test_parallel_executor_crf.py
+36
-16
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
...dle/fluid/tests/unittests/test_parallel_executor_mnist.py
+1
-0
python/paddle/fluid/tests/unittests/test_reader_reset.py
python/paddle/fluid/tests/unittests/test_reader_reset.py
+0
-2
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+2
-3
未找到文件。
CMakeLists.txt
浏览文件 @
4a443ffc
...
...
@@ -55,6 +55,7 @@ option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
option
(
WITH_RDMA
"Compile PaddlePaddle with RDMA support"
OFF
)
option
(
WITH_TIMER
"Compile PaddlePaddle with stats timer"
OFF
)
option
(
WITH_PROFILER
"Compile PaddlePaddle with GPU profiler and gperftools"
OFF
)
option
(
WITH_JEMALLOC
"Compile PaddlePaddle with jemalloc"
OFF
)
option
(
WITH_DOC
"Compile PaddlePaddle with documentation"
OFF
)
option
(
WITH_COVERAGE
"Compile PaddlePaddle with code coverage"
OFF
)
option
(
COVERALLS_UPLOAD
"Package code coverage data to coveralls"
OFF
)
...
...
@@ -261,6 +262,12 @@ if (WITH_PROFILER)
add_definitions
(
-DWITH_GPERFTOOLS
)
endif
()
if
(
WITH_JEMALLOC
)
find_package
(
JeMalloc REQUIRED
)
include_directories
(
${
JEMALLOC_INCLUDE_DIR
}
)
add_definitions
(
-DWITH_JEMALLOC
)
endif
()
include
(
generic
)
# simplify cmake module
include
(
package
)
# set paddle packages
include
(
ccache
)
# set ccache for compilation
...
...
cmake/FindJeMalloc.cmake
0 → 100644
浏览文件 @
4a443ffc
# - Find JeMalloc library
# Find the native JeMalloc includes and library
#
# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc.
# JEMALLOC_LIBRARIES - List of libraries when using jemalloc.
# JEMALLOC_FOUND - True if jemalloc found.
find_path
(
JEMALLOC_INCLUDE_DIR
NAMES jemalloc/jemalloc.h
HINTS
${
JEMALLOC_ROOT_DIR
}
/include
)
find_library
(
JEMALLOC_LIBRARIES
NAMES jemalloc
HINTS
${
JEMALLOC_ROOT_DIR
}
/lib
)
include
(
FindPackageHandleStandardArgs
)
find_package_handle_standard_args
(
jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR
)
mark_as_advanced
(
JEMALLOC_LIBRARIES
JEMALLOC_INCLUDE_DIR
)
cmake/configure.cmake
浏览文件 @
4a443ffc
...
...
@@ -134,6 +134,7 @@ if(WITH_GPU)
message
(
WARNING
"Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF"
)
set
(
WITH_ANAKIN OFF CACHE STRING
"Anakin is valid only when CUDNN >= 7.0."
FORCE
)
endif
()
add_definitions
(
-DWITH_ANAKIN
)
endif
()
if
(
WITH_ANAKIN
)
# NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR
...
...
cmake/cuda.cmake
浏览文件 @
4a443ffc
...
...
@@ -2,7 +2,7 @@ if(NOT WITH_GPU)
return
()
endif
()
set
(
paddle_known_gpu_archs
"30 35 50 52 60 61 70"
)
set
(
paddle_known_gpu_archs
"30 35 50 52 60 61 70
75
"
)
set
(
paddle_known_gpu_archs7
"30 35 50 52"
)
set
(
paddle_known_gpu_archs8
"30 35 50 52 60 61"
)
...
...
@@ -59,7 +59,7 @@ endfunction()
# select_nvcc_arch_flags(out_variable)
function
(
select_nvcc_arch_flags out_variable
)
# List of arch names
set
(
archs_names
"Kepler"
"Maxwell"
"Pascal"
"All"
"Manual"
)
set
(
archs_names
"Kepler"
"Maxwell"
"Pascal"
"
Volta"
"Turing"
"
All"
"Manual"
)
set
(
archs_name_default
"All"
)
if
(
NOT CMAKE_CROSSCOMPILING
)
list
(
APPEND archs_names
"Auto"
)
...
...
@@ -93,6 +93,8 @@ function(select_nvcc_arch_flags out_variable)
set
(
cuda_arch_bin
"60 61"
)
elseif
(
${
CUDA_ARCH_NAME
}
STREQUAL
"Volta"
)
set
(
cuda_arch_bin
"70"
)
elseif
(
${
CUDA_ARCH_NAME
}
STREQUAL
"Turing"
)
set
(
cuda_arch_bin
"75"
)
elseif
(
${
CUDA_ARCH_NAME
}
STREQUAL
"All"
)
set
(
cuda_arch_bin
${
paddle_known_gpu_archs
}
)
elseif
(
${
CUDA_ARCH_NAME
}
STREQUAL
"Auto"
)
...
...
cmake/external/ngraph.cmake
浏览文件 @
4a443ffc
...
...
@@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs)
INCLUDE
(
ExternalProject
)
SET
(
NGRAPH_PROJECT
"extern_ngraph"
)
SET
(
NGRAPH_GIT_TAG
"
v0.10.1
"
)
SET
(
NGRAPH_GIT_TAG
"
08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9
"
)
SET
(
NGRAPH_SOURCES_DIR
${
THIRD_PARTY_PATH
}
/ngraph
)
SET
(
NGRAPH_INSTALL_DIR
${
THIRD_PARTY_PATH
}
/install/ngraph
)
SET
(
NGRAPH_INC_DIR
${
NGRAPH_INSTALL_DIR
}
/include
)
...
...
cmake/generic.cmake
浏览文件 @
4a443ffc
...
...
@@ -115,6 +115,10 @@ function(common_link TARGET_NAME)
if
(
WITH_PROFILER
)
target_link_libraries
(
${
TARGET_NAME
}
gperftools::profiler
)
endif
()
if
(
WITH_JEMALLOC
)
target_link_libraries
(
${
TARGET_NAME
}
${
JEMALLOC_LIBRARIES
}
)
endif
()
endfunction
()
...
...
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
4a443ffc
...
...
@@ -184,7 +184,7 @@ endif()
target_link_libraries
(
executor garbage_collector
)
cc_library
(
parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
parallel_ssa_graph_executor
graph build_strategy
fast_threaded_ssa_graph_executor variable_helper
)
...
...
paddle/fluid/framework/details/CMakeLists.txt
浏览文件 @
4a443ffc
...
...
@@ -77,6 +77,8 @@ cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUT
cc_library
(
threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context
)
cc_library
(
parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor
)
cc_test
(
broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context broadcast_op_handle
)
cc_test
(
gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
...
...
paddle/fluid/framework/details/all_reduce_op_handle.cc
浏览文件 @
4a443ffc
...
...
@@ -19,6 +19,13 @@
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
// asynchronous nccl allreduce or synchronous issue:
// https://github.com/PaddlePaddle/Paddle/issues/15049
DEFINE_bool
(
sync_nccl_allreduce
,
false
,
"If set true, will call `cudaStreamSynchronize(nccl_stream)`"
"after allreduce, this mode can get better performance in some scenarios."
);
namespace
paddle
{
namespace
framework
{
namespace
details
{
...
...
@@ -48,17 +55,6 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
void
AllReduceOpHandle
::
RunImpl
()
{
platform
::
RecordEvent
record_event
(
Name
(),
dev_ctxes_
.
cbegin
()
->
second
);
// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
// this is a distributed or inter-process call, find a better way.
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if
(
NoDummyInputSize
()
==
1
&&
local_scopes_
[
0
]
->
FindLocalVar
(
NCCL_ID_VARNAME
)
==
nullptr
)
{
#else
if
(
NoDummyInputSize
()
==
1
)
{
#endif
return
;
// No need to all reduce when GPU count = 1;
}
else
{
// Wait input done
WaitInputVarGenerated
();
auto
in_var_handles
=
DynamicCast
<
VarHandle
>
(
this
->
Inputs
());
auto
out_var_handles
=
DynamicCast
<
VarHandle
>
(
this
->
Outputs
());
...
...
@@ -105,16 +101,32 @@ void AllReduceOpHandle::RunImpl() {
auto
comm
=
nccl_ctx
.
comm_
;
all_reduce_calls
.
emplace_back
([
=
]
{
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
buffer
,
buffer
,
numel
,
static_cast
<
ncclDataType_t
>
(
dtype
)
,
ncclSum
,
comm
,
stream
));
buffer
,
buffer
,
numel
,
static_cast
<
ncclDataType_t
>
(
dtype
),
ncclSum
,
comm
,
stream
));
});
}
this
->
RunAndRecordEvent
([
&
]
{
if
(
all_reduce_calls
.
size
()
==
1UL
)
{
// Do not use NCCLGroup when manage NCCL by per thread per device
all_reduce_calls
[
0
]();
}
else
{
platform
::
NCCLGroupGuard
guard
;
for
(
auto
&
call
:
all_reduce_calls
)
{
call
();
}
}
});
if
(
FLAGS_sync_nccl_allreduce
)
{
for
(
auto
&
p
:
places_
)
{
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
p
).
device
;
auto
&
nccl_ctx
=
nccl_ctxs_
->
at
(
dev_id
);
auto
stream
=
nccl_ctx
.
stream
();
cudaStreamSynchronize
(
stream
);
}
}
#else
PADDLE_THROW
(
"Not compiled with CUDA"
);
#endif
...
...
@@ -143,7 +155,6 @@ void AllReduceOpHandle::RunImpl() {
});
}
}
}
}
std
::
string
AllReduceOpHandle
::
Name
()
const
{
return
"all_reduce"
;
}
...
...
paddle/fluid/framework/details/build_strategy.cc
浏览文件 @
4a443ffc
...
...
@@ -18,7 +18,7 @@ limitations under the License. */
#include <memory>
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/multi_devices_graph_
check_
pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
...
...
@@ -31,7 +31,11 @@ namespace framework {
namespace
details
{
static
inline
bool
SeqOnlyAllReduceOps
(
const
BuildStrategy
&
strategy
)
{
return
(
!
strategy
.
enable_sequential_execution_
&&
strategy
.
num_trainers_
>
1
);
// Should fix the allreduce op order if scheduling
// them in multiple threads or processes to avoid hang.
return
(
!
strategy
.
enable_sequential_execution_
&&
strategy
.
num_trainers_
>
1
)
||
strategy
.
enable_parallel_graph_
;
}
class
ParallelExecutorPassBuilder
:
public
ir
::
PassBuilder
{
...
...
@@ -82,12 +86,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
if
(
strategy
.
memory_optimize_
)
{
auto
analysis_var_pass
=
AppendPass
(
"analysis_var_pass"
);
}
// Convert graph to run on multi-devices.
auto
multi_devices_pass
=
AppendPass
(
"multi_devices_pass"
);
multi_devices_pass
->
SetNotOwned
<
const
BuildStrategy
>
(
"strategy"
,
&
strategy_
);
multi_devices_pass
->
Set
<
int
>
(
"num_trainers"
,
new
int
(
strategy_
.
num_trainers_
));
AppendMultiDevPass
(
strategy
);
// Add a graph print pass to record a graph with device info.
if
(
!
strategy_
.
debug_graphviz_path_
.
empty
())
{
...
...
@@ -113,6 +113,25 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
}
}
// Convert graph to run on multi-devices.
void
AppendMultiDevPass
(
const
BuildStrategy
&
strategy
)
{
ir
::
Pass
*
multi_devices_pass
;
if
(
strategy_
.
is_distribution_
)
{
multi_devices_pass
=
AppendPass
(
"dist_multi_devices_pass"
).
get
();
}
else
{
if
(
strategy
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kAllReduce
)
{
multi_devices_pass
=
AppendPass
(
"allreduce_mode_multi_devices_pass"
).
get
();
}
else
if
(
strategy
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kReduce
)
{
multi_devices_pass
=
AppendPass
(
"reduce_mode_multi_devices_pass"
).
get
();
}
else
{
PADDLE_THROW
(
"Unknown reduce strategy."
);
}
}
multi_devices_pass
->
SetNotOwned
<
const
BuildStrategy
>
(
"strategy"
,
&
strategy_
);
}
private:
BuildStrategy
strategy_
;
};
...
...
@@ -129,9 +148,14 @@ std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
return
pass_builder_
;
}
bool
BuildStrategy
::
IsMultiDevPass
(
const
std
::
string
&
pass_name
)
const
{
return
framework
::
details
::
MultiDevSSAGraphBuilder
().
count
(
pass_name
)
>
0
;
}
std
::
unique_ptr
<
ir
::
Graph
>
BuildStrategy
::
Apply
(
const
ProgramDesc
&
main_program
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
string
&
loss_var_name
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
size_t
&
nranks
,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const
bool
use_cuda
,
platform
::
NCCLContextMap
*
nccl_ctxs
)
const
{
#else
...
...
@@ -142,19 +166,23 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
std
::
unique_ptr
<
ir
::
Graph
>
graph
(
new
ir
::
Graph
(
main_program
));
for
(
std
::
shared_ptr
<
ir
::
Pass
>
&
pass
:
pass_builder_
->
AllPasses
())
{
if
(
pass
->
Type
()
==
"multi_devices_pass"
)
{
pass
->
Erase
(
"places"
);
pass
->
SetNotOwned
<
const
std
::
vector
<
platform
::
Place
>>
(
"places"
,
&
places
);
pass
->
Erase
(
"loss_var_name"
);
pass
->
SetNotOwned
<
const
std
::
string
>
(
"loss_var_name"
,
&
loss_var_name
);
pass
->
Erase
(
"local_scopes"
);
pass
->
SetNotOwned
<
const
std
::
vector
<
Scope
*>>
(
"local_scopes"
,
if
(
IsMultiDevPass
(
pass
->
Type
())
)
{
pass
->
Erase
(
kPlaces
);
pass
->
SetNotOwned
<
const
std
::
vector
<
platform
::
Place
>>
(
kPlaces
,
&
places
);
pass
->
Erase
(
kLossVarName
);
pass
->
SetNotOwned
<
const
std
::
string
>
(
kLossVarName
,
&
loss_var_name
);
pass
->
Erase
(
kLocalScopes
);
pass
->
SetNotOwned
<
const
std
::
vector
<
Scope
*>>
(
kLocalScopes
,
&
local_scopes
);
pass
->
Erase
(
kNRanks
);
pass
->
Set
<
size_t
>
(
kNRanks
,
new
size_t
(
nranks
));
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform
::
NCCLContextMap
*
nctx
=
use_cuda
?
nccl_ctxs
:
nullptr
;
pass
->
Erase
(
"nccl_ctxs"
);
pass
->
SetNotOwned
<
platform
::
NCCLContextMap
>
(
"nccl_ctxs"
,
nctx
);
#endif
}
else
if
(
pass
->
Type
()
==
"analysis_var_pass"
)
{
const
std
::
vector
<
OpDesc
*>
*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
main_program
.
Block
(
0
).
AllOps
());
...
...
@@ -195,7 +223,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
USE_PASS
(
fuse_elewise_add_act_pass
);
USE_PASS
(
graph_viz_pass
);
USE_PASS
(
multi_batch_merge_pass
);
USE_PASS
(
multi_devices_pass
);
USE_PASS
(
reduce_mode_multi_devices_pass
);
USE_PASS
(
allreduce_mode_multi_devices_pass
);
USE_PASS
(
dist_multi_devices_pass
);
USE_PASS
(
multi_devices_check_pass
);
USE_PASS
(
multi_devices_print_pass
);
USE_PASS
(
analysis_var_pass
);
...
...
paddle/fluid/framework/details/build_strategy.h
浏览文件 @
4a443ffc
...
...
@@ -74,8 +74,6 @@ struct BuildStrategy {
bool
fuse_elewise_add_act_ops_
{
false
};
bool
enable_data_balance_
{
false
};
bool
memory_optimize_
{
false
};
bool
memory_early_delete_
{
false
};
...
...
@@ -84,6 +82,10 @@ struct BuildStrategy {
bool
fuse_broadcast_op_
{
false
};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy doesn't tell if
// it's distributed model.
bool
is_distribution_
{
false
};
int
num_trainers_
{
1
};
int
trainer_id_
{
0
};
std
::
vector
<
std
::
string
>
trainers_endpoints_
;
...
...
@@ -104,12 +106,15 @@ struct BuildStrategy {
bool
IsFinalized
()
const
{
return
is_finalized_
;
}
bool
IsMultiDevPass
(
const
std
::
string
&
pass_name
)
const
;
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
std
::
unique_ptr
<
ir
::
Graph
>
Apply
(
const
ProgramDesc
&
main_program
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
string
&
loss_var_name
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
size_t
&
nranks
,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const
bool
use_cuda
,
platform
::
NCCLContextMap
*
nccl_ctxs
)
const
;
...
...
@@ -117,6 +122,13 @@ struct BuildStrategy {
const
bool
use_cuda
)
const
;
#endif
// If set true, ParallelExecutor would build the main_program into multiple
// graphs,
// each of the graphs would run with one device. This approach can achieve
// better performance
// on some scenarios.
mutable
bool
enable_parallel_graph_
=
false
;
private:
mutable
bool
is_finalized_
=
false
;
mutable
std
::
shared_ptr
<
ir
::
PassBuilder
>
pass_builder_
;
...
...
paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
浏览文件 @
4a443ffc
...
...
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include <string>
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
...
...
@@ -21,7 +21,15 @@ namespace paddle {
namespace
framework
{
namespace
details
{
bool
SSAGraghBuilderWithChecker
::
IsValidGraph
(
const
ir
::
Graph
*
graph
)
const
{
class
SSAGraghBuilderWithChecker
:
public
ir
::
Pass
{
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
{
PADDLE_ENFORCE
(
IsValidGraph
(
graph
.
get
()));
return
graph
;
}
bool
IsValidGraph
(
const
ir
::
Graph
*
graph
)
const
{
std
::
unordered_map
<
OpHandleBase
*
,
size_t
>
pending_ops
;
std
::
unordered_set
<
VarHandleBase
*>
pending_vars
;
std
::
unordered_set
<
VarHandleBase
*>
ready_vars
;
...
...
@@ -82,7 +90,9 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
ready_vars
.
clear
();
}
return
true
;
}
}
};
}
// namespace details
}
// namespace framework
}
// namespace paddle
...
...
paddle/fluid/framework/details/multi_devices_graph_pass.cc
浏览文件 @
4a443ffc
...
...
@@ -134,15 +134,8 @@ void AddOutputToLeafOps(ir::Graph *graph) {
}
}
// namespace
static
const
char
kLossVarName
[]
=
"loss_var_name"
;
static
const
char
kPlaces
[]
=
"places"
;
static
const
char
kLocalScopes
[]
=
"local_scopes"
;
static
const
char
kStrategy
[]
=
"strategy"
;
static
const
char
kNumTrainers
[]
=
"num_trainers"
;
void
MultiDevSSAGraphBuilder
::
Init
()
const
{
void
MultiDevSSAGraphBuilderBase
::
Init
()
const
{
all_vars_
.
clear
();
balance_vars_
.
clear
();
loss_var_name_
=
Get
<
const
std
::
string
>
(
kLossVarName
);
places_
=
Get
<
const
std
::
vector
<
platform
::
Place
>>
(
kPlaces
);
...
...
@@ -151,31 +144,16 @@ void MultiDevSSAGraphBuilder::Init() const {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
nccl_ctxs_
=
&
Get
<
platform
::
NCCLContextMap
>
(
"nccl_ctxs"
);
#endif
balance_vars_
.
resize
(
places_
.
size
(),
0
);
if
(
strategy_
.
enable_data_balance_
&&
places_
.
size
()
==
1
)
{
LOG
(
WARNING
)
<<
"It is no need to enable data balance when there is only "
"one place. enable_data_balance is set to False."
;
strategy_
.
enable_data_balance_
=
false
;
}
}
std
::
unique_ptr
<
ir
::
Graph
>
MultiDevSSAGraphBuilder
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
MultiDevSSAGraphBuilder
Base
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
Init
();
// Give the topology sort order and rebuild the graph structure.
std
::
vector
<
ir
::
Node
*>
sorted_ops
=
ir
::
TopologySortOperations
(
*
graph
);
if
(
strategy_
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kReduce
)
{
sorted_ops
=
SortForReduceMode
(
sorted_ops
);
}
std
::
vector
<
ir
::
Node
*>
sorted_ops
=
SortOperations
(
*
graph
);
auto
nodes
=
graph
->
ReleaseNodes
();
ir
::
Graph
&
result
=
*
graph
;
int
num_trainers
=
Get
<
int
>
(
kNumTrainers
);
for
(
auto
&
node
:
nodes
)
{
if
(
node
->
IsVar
()
&&
node
->
Var
())
{
all_vars_
.
emplace
(
node
->
Name
(),
node
->
Var
());
...
...
@@ -187,142 +165,57 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
result
.
Set
(
kGraphDepVars
,
new
GraphDepVars
);
result
.
Set
(
kGraphOps
,
new
GraphOps
);
std
::
vector
<
std
::
unordered_set
<
std
::
string
>>
bcast_var_name_set
;
bcast_var_name_set
.
resize
(
places_
.
size
());
bool
is_forwarding
=
true
;
bool
is_dist_train
=
false
;
std
::
unordered_map
<
std
::
string
,
int
>
sharded_var_device
;
bool
insert_collection_ops
=
NeedCollectiveOps
();
for
(
ir
::
Node
*
node
:
sorted_ops
)
{
if
(
OpHaveRole
(
*
node
,
OpRole
::
kRPC
))
{
int
op_dev_id
=
CreateRPCOp
(
&
result
,
node
,
&
sharded_var_device
);
PADDLE_ENFORCE
(
op_dev_id
!=
-
1
,
"Can not schedule the RPC operator to the right place."
);
if
(
node
->
Op
()
->
Type
()
==
"recv"
)
{
auto
recv_vars_attr
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetNullableAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
PADDLE_ENFORCE
(
recv_vars_attr
.
size
()
==
2UL
);
// [parameter, gradient]
if
(
recv_vars_attr
[
0
].
find
(
".block"
)
==
std
::
string
::
npos
)
{
bcast_var_name_set
[
op_dev_id
].
emplace
(
recv_vars_attr
[
0
]);
}
}
is_dist_train
=
true
;
}
else
if
(
OpHaveRole
(
*
node
,
OpRole
::
kDist
))
{
int
op_dev_id
=
CreateDistTrainOp
(
&
result
,
node
,
&
sharded_var_device
);
if
(
node
->
Op
()
->
Type
()
==
"concat"
)
{
auto
origin_param_name
=
node
->
Op
()
->
OutputArgumentNames
()[
0
];
bcast_var_name_set
[
op_dev_id
].
emplace
(
origin_param_name
);
}
}
else
if
(
IsScaleLossOp
(
node
))
{
if
(
DealWithSpecialOp
(
&
result
,
node
))
{
continue
;
}
else
{
// This op runs on all devices
if
(
IsScaleLossOp
(
node
))
{
// user can customize loss@grad if not use_default_grad_scale_
if
(
strategy_
.
gradient_scale_
!=
BuildStrategy
::
GradientScaleStrategy
::
kCustomized
)
{
// TODO(paddle-dev): Why is there no input for this op_handle?
auto
loss_grad_name
=
node
->
Op
()
->
OutputArgumentNames
()[
0
];
auto
out_dtype
=
all_vars_
.
at
(
loss_grad_name
)
->
GetDataType
();
CreateScaleLossGradOp
(
&
result
,
loss_grad_name
,
node
->
outputs
[
0
],
out_dtype
);
}
InsertScaleLossGradOp
(
&
result
,
node
);
// This assumes the backward generating code will ensure IsScaleLossOp
// is true only for the op that scale the final scalar loss.
// It also assumes backward op will always follow the forward op in
// the block.
is_forwarding
=
false
;
}
else
{
int
op_dev_id
=
GetOpDeviceID
(
node
,
sharded_var_device
);
if
(
op_dev_id
!=
-
1
)
{
// This op only runs on one specific device.
CreateComputationalOp
(
&
result
,
node
,
op_dev_id
);
for
(
ir
::
Node
*
n
:
node
->
outputs
)
{
sharded_var_device
.
emplace
(
n
->
Name
(),
op_dev_id
);
}
}
else
{
// This op runs on all devices, and its output may have parameter's
// gradients.
// TODO(paddle-dev): Why is so special about "read" op?
if
(
node
->
Op
()
->
Type
()
==
"read"
&&
strategy_
.
enable_data_balance_
)
{
node
->
Op
()
->
SetAttr
(
"throw_eof_exp"
,
false
);
CreateComputationalOps
(
&
result
,
node
,
places_
.
size
());
const
auto
&
data_var_names
=
node
->
Op
()
->
Output
(
"Out"
);
InsertDataBalanceOp
(
&
result
,
data_var_names
);
}
else
{
CreateComputationalOps
(
&
result
,
node
,
places_
.
size
());
}
if
(
!
is_forwarding
&&
(
places_
.
size
()
>
1
||
num_trainers
>
1
))
{
// Insert collection ops
if
(
!
is_forwarding
&&
insert_collection_ops
)
{
try
{
bool
is_bk_op
=
static_cast
<
bool
>
(
boost
::
get
<
int
>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
()))
&
static_cast
<
int
>
(
OpRole
::
kBackward
));
if
(
!
is_bk_op
)
continue
;
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once.
try
{
auto
backward_vars
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetNullableAttr
(
auto
backward_vars
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetNullableAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
PADDLE_ENFORCE_EQ
(
backward_vars
.
size
()
%
2
,
0
);
for
(
size_t
i
=
0
;
i
<
backward_vars
.
size
();
i
+=
2
)
{
auto
&
p_name
=
backward_vars
[
i
];
auto
&
g_name
=
backward_vars
[
i
+
1
];
VLOG
(
10
)
<<
"Bcast "
<<
g_name
<<
" for parameter "
<<
p_name
;
size_t
cur_device_id
=
-
1
;
switch
(
strategy_
.
reduce_
)
{
case
BuildStrategy
::
ReduceStrategy
::
kReduce
:
cur_device_id
=
GetAppropriateDeviceID
({
g_name
});
CreateReduceOp
(
&
result
,
g_name
,
cur_device_id
);
sharded_var_device
.
emplace
(
g_name
,
cur_device_id
);
if
(
!
is_dist_train
)
{
bcast_var_name_set
[
cur_device_id
].
emplace
(
p_name
);
}
break
;
case
BuildStrategy
::
ReduceStrategy
::
kAllReduce
:
if
(
IsSparseGradient
(
g_name
))
{
CreateReduceOp
(
&
result
,
g_name
,
0
);
CreateBroadcastOp
(
&
result
,
g_name
,
0
);
}
else
{
InsertAllReduceOp
(
&
result
,
g_name
);
}
break
;
default:
LOG
(
FATAL
)
<<
"Unknown reduce strategy "
;
break
;
}
InsertCollectiveOp
(
&
result
,
p_name
,
g_name
);
}
}
catch
(
boost
::
bad_get
e
)
{
}
}
}
}
}
bool
use_gpu
=
false
;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
use_gpu
=
nccl_ctxs_
!=
nullptr
;
#endif
// Insert broadcast operators principle:
// 1. Broadcast optimized parameters in Reduce strategy;
// 2. No need broadcast optimized parameters in AllReduce strategy because of
// the optimization sub-graph would be run on every GPU;
// 3. Allways broadcast received parameters in Distribute Training.
if
((
use_gpu
&&
strategy_
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kReduce
)
||
is_dist_train
)
{
if
(
strategy_
.
fuse_broadcast_op_
)
{
CreateFusedBroadcastOp
(
&
result
,
bcast_var_name_set
);
}
else
{
for
(
size_t
dev_id
=
0
;
dev_id
<
bcast_var_name_set
.
size
();
++
dev_id
)
{
auto
&
to_bcast_set
=
bcast_var_name_set
[
dev_id
];
for
(
auto
&
bcast_name
:
to_bcast_set
)
{
CreateBroadcastOp
(
&
result
,
bcast_name
,
dev_id
);
}
}
}
}
InsertPostprocessOps
(
&
result
);
/*
Dependency graph has been constructed. However, there are still data
hazards need to be handled.
...
...
@@ -337,65 +230,52 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
return
graph
;
}
std
::
vector
<
ir
::
Node
*>
MultiDevSSAGraphBuilder
::
SortForReduceMode
(
const
std
::
vector
<
ir
::
Node
*>
&
topo_ops
)
const
{
std
::
unordered_map
<
std
::
string
,
int
>
sharded_var_device
;
std
::
vector
<
ir
::
Node
*>
sorted_ops
;
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
delayed_op
;
sorted_ops
.
reserve
(
topo_ops
.
size
());
auto
insert_delayed_op
=
[
&
](
const
std
::
string
&
var_name
,
int
dev_id
)
{
sharded_var_device
.
emplace
(
var_name
,
dev_id
);
if
(
delayed_op
.
count
(
var_name
))
{
auto
&
ops
=
delayed_op
.
at
(
var_name
);
sorted_ops
.
insert
(
sorted_ops
.
end
(),
ops
.
begin
(),
ops
.
end
());
delayed_op
.
at
(
var_name
).
clear
();
void
MultiDevSSAGraphBuilderBase
::
InsertScaleLossGradOp
(
ir
::
Graph
*
result
,
const
ir
::
Node
*
node
)
const
{
// user can customize loss@grad if not use_default_grad_scale_
size_t
loss_scale
=
0
;
switch
(
this
->
strategy_
.
gradient_scale_
)
{
case
BuildStrategy
::
GradientScaleStrategy
::
kOne
:
loss_scale
=
1
;
break
;
case
BuildStrategy
::
GradientScaleStrategy
::
kCoeffNumDevice
:
loss_scale
=
Get
<
size_t
>
(
kNRanks
);
break
;
case
BuildStrategy
::
GradientScaleStrategy
::
kCustomized
:
loss_scale
=
0
;
break
;
default:
LOG
(
FATAL
)
<<
"Unknown gradient scale strategy."
;
break
;
}
};
for
(
ir
::
Node
*
node
:
topo_ops
)
{
int
op_dev_id
=
GetOpDeviceID
(
node
,
sharded_var_device
,
&
delayed_op
);
if
(
op_dev_id
>
-
1
)
{
// This op only runs on one specific device.
sorted_ops
.
emplace_back
(
node
);
for
(
ir
::
Node
*
n
:
node
->
outputs
)
{
insert_delayed_op
(
n
->
Name
(),
op_dev_id
);
}
}
else
if
(
op_dev_id
==
-
1
)
{
// This op runs on all devices, and its output may have parameter's
// gradients.
sorted_ops
.
emplace_back
(
node
);
bool
is_bk_op
=
static_cast
<
bool
>
(
boost
::
get
<
int
>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
()))
&
static_cast
<
int
>
(
OpRole
::
kBackward
));
if
(
!
is_bk_op
)
continue
;
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once.
std
::
vector
<
std
::
string
>
backward_vars
;
try
{
backward_vars
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetNullableAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
}
catch
(
boost
::
bad_get
e
)
{
if
(
loss_scale
)
{
// TODO(paddle-dev): Why is there no input for this op_handle?
auto
loss_grad_name
=
node
->
Op
()
->
OutputArgumentNames
()[
0
];
auto
out_dtype
=
this
->
all_vars_
.
at
(
loss_grad_name
)
->
GetDataType
();
this
->
CreateScaleLossGradOp
(
result
,
loss_grad_name
,
node
->
outputs
[
0
],
loss_scale
,
out_dtype
);
}
PADDLE_ENFORCE_EQ
(
backward_vars
.
size
()
%
2
,
0
);
}
for
(
size_t
i
=
0
;
i
<
backward_vars
.
size
();
i
+=
2
)
{
auto
&
g_name
=
backward_vars
[
i
+
1
];
size_t
cur_device_id
=
GetAppropriateDeviceID
({
g_name
});
insert_delayed_op
(
g_name
,
static_cast
<
int
>
(
cur_device_id
));
}
}
else
if
(
op_dev_id
==
-
2
)
{
// The Op on which the Op depends has not yet been generated.
}
}
std
::
vector
<
ir
::
Node
*>
MultiDevSSAGraphBuilderBase
::
SortOperations
(
const
ir
::
Graph
&
graph
)
const
{
return
ir
::
TopologySortOperations
(
graph
);
}
PADDLE_ENFORCE_EQ
(
sorted_ops
.
size
(),
topo_ops
.
size
());
return
sorted_ops
;
bool
MultiDevSSAGraphBuilderBase
::
UseGPU
()
const
{
bool
use_gpu
=
false
;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
use_gpu
=
nccl_ctxs_
!=
nullptr
;
#endif
return
use_gpu
;
}
void
MultiDevSSAGraphBuilder
::
CreateOpHandleIOs
(
ir
::
Graph
*
result
,
bool
MultiDevSSAGraphBuilderBase
::
NeedCollectiveOps
()
const
{
return
Get
<
size_t
>
(
kNRanks
)
>
1
;
}
void
MultiDevSSAGraphBuilderBase
::
CreateOpHandleIOs
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
size_t
place_id
)
const
{
auto
p
=
places_
[
place_id
];
...
...
@@ -420,28 +300,7 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result,
}
}
size_t
MultiDevSSAGraphBuilder
::
GetAppropriateDeviceID
(
const
std
::
vector
<
std
::
string
>
&
var_names
)
const
{
int64_t
numel_sum
=
0
;
for
(
auto
var_name
:
var_names
)
{
if
(
all_vars_
.
find
(
var_name
)
==
all_vars_
.
end
())
continue
;
auto
var_desc
=
all_vars_
.
at
(
var_name
);
PADDLE_ENFORCE_NOT_NULL
(
var_desc
);
auto
dim
=
framework
::
make_ddim
(
var_desc
->
GetShape
());
int64_t
numel
=
framework
::
product
(
dim
);
PADDLE_ENFORCE_GT
(
numel
,
0
);
numel_sum
+=
numel
;
}
auto
smallest
=
std
::
min_element
(
std
::
begin
(
balance_vars_
),
std
::
end
(
balance_vars_
));
size_t
dev_id
=
static_cast
<
size_t
>
(
std
::
distance
(
std
::
begin
(
balance_vars_
),
smallest
));
balance_vars_
[
dev_id
]
+=
numel_sum
;
return
dev_id
;
}
void
MultiDevSSAGraphBuilder
::
SetCommunicationContext
(
void
MultiDevSSAGraphBuilderBase
::
SetCommunicationContext
(
OpHandleBase
*
op_handle
,
const
platform
::
Place
&
p
)
const
{
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if
(
nccl_ctxs_
==
nullptr
)
{
...
...
@@ -454,7 +313,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext(
#endif
}
void
MultiDevSSAGraphBuilder
::
CreateBroadcastOp
(
ir
::
Graph
*
result
,
void
MultiDevSSAGraphBuilder
Base
::
CreateBroadcastOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
p_name
,
size_t
src_dev_id
)
const
{
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
...
...
@@ -484,7 +343,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
}
}
void
MultiDevSSAGraphBuilder
::
CreateFusedBroadcastOp
(
void
MultiDevSSAGraphBuilder
Base
::
CreateFusedBroadcastOp
(
ir
::
Graph
*
result
,
const
std
::
vector
<
std
::
unordered_set
<
std
::
string
>>
&
bcast_varnames
)
const
{
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
...
...
@@ -522,7 +381,7 @@ void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp(
}
}
void
MultiDevSSAGraphBuilder
::
CreateComputationalOp
(
ir
::
Graph
*
result
,
void
MultiDevSSAGraphBuilder
Base
::
CreateComputationalOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
int
dev_id
)
const
{
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
...
...
@@ -531,8 +390,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
CreateOpHandleIOs
(
result
,
node
,
dev_id
);
}
void
MultiDevSSAGraphBuilder
::
InsertAllReduceOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
og
)
const
{
void
MultiDevSSAGraphBuilder
Base
::
CreateAllReduceOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
og
)
const
{
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
AllReduceOpHandle
(
result
->
CreateEmptyNode
(
"allreduce"
,
ir
::
Node
::
Type
::
kOperation
),
...
...
@@ -560,101 +419,15 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
}
}
void
MultiDevSSAGraphBuilder
::
InsertDataBalanceOp
(
ir
::
Graph
*
result
,
const
std
::
vector
<
std
::
string
>
&
datas
)
const
{
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
DataBalanceOpHandle
(
result
->
CreateEmptyNode
(
"data_balance"
,
ir
::
Node
::
Type
::
kOperation
),
local_scopes_
,
places_
,
nccl_ctxs_
));
#else
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
new
DataBalanceOpHandle
(
result
->
CreateEmptyNode
(
"data_balance"
,
ir
::
Node
::
Type
::
kOperation
),
local_scopes_
,
places_
));
#endif
auto
*
op_handle
=
result
->
Get
<
GraphOps
>
(
kGraphOps
).
back
();
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
auto
&
p
=
places_
[
i
];
SetCommunicationContext
(
op_handle
,
p
);
for
(
const
std
::
string
&
d_name
:
datas
)
{
auto
&
vars
=
result
->
Get
<
GraphVars
>
(
kGraphVars
)[
i
][
d_name
];
PADDLE_ENFORCE
(
!
vars
.
empty
());
op_handle
->
AddInput
(
vars
.
back
());
auto
var
=
new
VarHandle
(
result
->
CreateEmptyNode
(
d_name
,
ir
::
Node
::
Type
::
kVariable
),
vars
.
size
(),
i
,
d_name
,
p
);
vars
.
emplace_back
(
var
);
op_handle
->
AddOutput
(
var
);
}
}
}
int
MultiDevSSAGraphBuilder
::
GetOpDeviceID
(
ir
::
Node
*
node
,
const
std
::
unordered_map
<
std
::
string
,
int
>
&
sharded_var_device
,
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
*
delay_ops
)
const
{
if
(
strategy_
.
reduce_
!=
BuildStrategy
::
ReduceStrategy
::
kReduce
)
{
return
-
1
;
}
if
(
!
OpHaveRole
(
*
node
,
framework
::
OpRole
::
kOptimize
))
{
return
-
1
;
}
auto
param_grad
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
PADDLE_ENFORCE_EQ
(
param_grad
.
size
(),
2U
);
int
dev_id
=
GetVarDeviceID
(
param_grad
[
1
],
sharded_var_device
);
if
(
dev_id
==
-
1
)
{
(
*
delay_ops
)[
param_grad
[
1
]].
push_back
(
node
);
return
-
2
;
}
return
dev_id
;
}
int
MultiDevSSAGraphBuilder
::
GetOpDeviceID
(
ir
::
Node
*
node
,
const
std
::
unordered_map
<
std
::
string
,
int
>
&
sharded_var_device
)
const
{
if
(
strategy_
.
reduce_
!=
BuildStrategy
::
ReduceStrategy
::
kReduce
)
{
return
-
1
;
}
if
(
!
OpHaveRole
(
*
node
,
framework
::
OpRole
::
kOptimize
))
{
return
-
1
;
}
auto
param_grad
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
PADDLE_ENFORCE_EQ
(
param_grad
.
size
(),
2U
);
int
dev_id
=
GetVarDeviceID
(
param_grad
[
1
],
sharded_var_device
);
PADDLE_ENFORCE_NE
(
dev_id
,
-
1
,
"dev_id should not be -1.[%s, %s, %s]"
,
node
->
Op
()
->
Type
(),
param_grad
[
0
],
param_grad
[
1
]);
return
dev_id
;
}
int
MultiDevSSAGraphBuilder
::
GetVarDeviceID
(
const
std
::
string
&
varname
,
const
std
::
unordered_map
<
std
::
string
,
int
>
&
sharded_var_device
)
const
{
auto
got
=
sharded_var_device
.
find
(
varname
);
if
(
got
==
sharded_var_device
.
end
())
{
auto
pos
=
varname
.
find
(
framework
::
kNewGradSuffix
);
if
(
pos
!=
std
::
string
::
npos
)
{
got
=
sharded_var_device
.
find
(
varname
.
substr
(
0
,
pos
));
}
}
return
got
==
sharded_var_device
.
end
()
?
-
1
:
got
->
second
;
}
void
MultiDevSSAGraphBuilder
::
CreateScaleLossGradOp
(
void
MultiDevSSAGraphBuilderBase
::
CreateScaleLossGradOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
loss_grad_name
,
ir
::
Node
*
out_var_node
,
proto
::
VarType
::
Type
dtype
)
const
{
ir
::
Node
*
out_var_node
,
size_t
loss_scale
,
proto
::
VarType
::
Type
dtype
)
const
{
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
// Insert ScaleCost OpHandle
auto
*
dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
places_
[
i
]);
auto
*
op_handle
=
new
ScaleLossGradOpHandle
(
result
->
CreateEmptyNode
(
"scale_loss_grad"
,
ir
::
Node
::
Type
::
kOperation
),
lo
cal_scopes_
.
size
()
,
local_scopes_
[
i
],
places_
[
i
],
dev_ctx
,
dtype
);
lo
ss_scale
,
local_scopes_
[
i
],
places_
[
i
],
dev_ctx
,
dtype
);
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
op_handle
);
// FIXME: Currently ScaleLossGradOp only use device_count as scale
...
...
@@ -668,9 +441,8 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
}
}
void
MultiDevSSAGraphBuilder
::
CreateComputationalOps
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
size_t
num_places
)
const
{
void
MultiDevSSAGraphBuilderBase
::
CreateComputationalOps
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
size_t
num_places
)
const
{
for
(
size_t
scope_idx
=
0
;
scope_idx
<
num_places
;
++
scope_idx
)
{
auto
p
=
places_
[
scope_idx
];
auto
s
=
local_scopes_
[
scope_idx
];
...
...
@@ -680,7 +452,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
}
}
VarHandle
*
MultiDevSSAGraphBuilder
::
CreateReduceOp
(
ir
::
Graph
*
result
,
VarHandle
*
MultiDevSSAGraphBuilder
Base
::
CreateReduceOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
og
,
int
dst_dev_id
)
const
{
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
...
...
@@ -711,51 +483,273 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
return
var
;
}
int
MultiDevSSAGraphBuilder
::
CreateDistTrainOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
std
::
unordered_map
<
std
::
string
,
int
>
*
sharded_var_device
)
const
{
int
op_dev_id
=
-
1
;
std
::
vector
<
std
::
string
>
input_var_names
;
std
::
vector
<
std
::
string
>
output_var_names
;
for
(
ir
::
Node
*
input
:
node
->
inputs
)
{
input_var_names
.
push_back
(
input
->
Name
());
bool
MultiDevSSAGraphBuilderBase
::
IsScaleLossOp
(
ir
::
Node
*
node
)
const
{
return
boost
::
get
<
int
>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
()))
==
(
static_cast
<
int
>
(
OpRole
::
kBackward
)
|
static_cast
<
int
>
(
OpRole
::
kLoss
))
&&
!
loss_var_name_
.
empty
();
// If loss_var is empty. This is test mode
}
bool
MultiDevSSAGraphBuilderBase
::
IsSparseGradient
(
const
std
::
string
&
og
)
const
{
PADDLE_ENFORCE
(
all_vars_
.
count
(
og
)
!=
0
);
if
(
all_vars_
.
at
(
og
)
->
GetType
()
==
proto
::
VarType
::
SELECTED_ROWS
)
{
return
true
;
}
for
(
ir
::
Node
*
output
:
node
->
outputs
)
{
output_var_names
.
push_back
(
output
->
Name
());
return
false
;
}
void
AllReduceSSAGraphBuilder
::
InsertCollectiveOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
p_name
,
const
std
::
string
&
g_name
)
const
{
if
(
IsSparseGradient
(
g_name
))
{
CreateReduceOp
(
result
,
g_name
,
0
);
CreateBroadcastOp
(
result
,
g_name
,
0
);
}
else
{
CreateAllReduceOp
(
result
,
g_name
);
}
}
if
(
node
->
Op
()
->
Type
()
==
"split_byref"
||
node
->
Op
()
->
Type
()
==
"split_selected_rows"
||
node
->
Op
()
->
Type
()
==
"split_ids"
)
{
// TODO(paddle-dev): getting the first var is not safe.
op_dev_id
=
GetVarDeviceID
(
input_var_names
[
0
],
*
sharded_var_device
);
if
(
strategy_
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kAllReduce
)
{
op_dev_id
=
GetAppropriateDeviceID
(
input_var_names
);
for
(
auto
&
varname
:
input_var_names
)
{
sharded_var_device
->
emplace
(
varname
,
op_dev_id
);
int
BalanceVarSSAGraphBuilder
::
GetVarDeviceID
(
const
std
::
string
&
varname
)
const
{
auto
got
=
sharded_var_device_
.
find
(
varname
);
if
(
got
==
sharded_var_device_
.
end
())
{
auto
pos
=
varname
.
find
(
framework
::
kNewGradSuffix
);
if
(
pos
!=
std
::
string
::
npos
)
{
got
=
sharded_var_device_
.
find
(
varname
.
substr
(
0
,
pos
));
}
}
for
(
auto
&
varname
:
output_var_names
)
{
sharded_var_device
->
emplace
(
varname
,
op_dev_id
);
return
got
==
sharded_var_device_
.
end
()
?
-
1
:
got
->
second
;
}
int
BalanceVarSSAGraphBuilder
::
GetOpDeviceID
(
ir
::
Node
*
node
)
const
{
if
(
strategy_
.
reduce_
!=
BuildStrategy
::
ReduceStrategy
::
kReduce
)
{
return
-
1
;
}
}
else
if
(
node
->
Op
()
->
Type
()
==
"concat"
)
{
op_dev_id
=
GetVarDeviceID
(
input_var_names
[
0
],
*
sharded_var_device
);
for
(
auto
&
varname
:
output_var_names
)
{
sharded_var_device
->
emplace
(
varname
,
op_dev_id
);
if
(
!
OpHaveRole
(
*
node
,
framework
::
OpRole
::
kOptimize
))
{
return
-
1
;
}
auto
param_grad
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
PADDLE_ENFORCE_EQ
(
param_grad
.
size
(),
2U
);
int
dev_id
=
GetVarDeviceID
(
param_grad
[
1
]);
PADDLE_ENFORCE_NE
(
dev_id
,
-
1
,
"dev_id should not be -1.[%s, %s, %s]"
,
node
->
Op
()
->
Type
(),
param_grad
[
0
],
param_grad
[
1
]);
return
dev_id
;
}
size_t
BalanceVarSSAGraphBuilder
::
GetAppropriateDeviceID
(
const
std
::
vector
<
std
::
string
>
&
var_names
)
const
{
int64_t
numel_sum
=
0
;
for
(
auto
var_name
:
var_names
)
{
if
(
all_vars_
.
find
(
var_name
)
==
all_vars_
.
end
())
continue
;
auto
var_desc
=
all_vars_
.
at
(
var_name
);
PADDLE_ENFORCE_NOT_NULL
(
var_desc
);
auto
dim
=
framework
::
make_ddim
(
var_desc
->
GetShape
());
int64_t
numel
=
framework
::
product
(
dim
);
PADDLE_ENFORCE_GT
(
numel
,
0
);
numel_sum
+=
numel
;
}
auto
smallest
=
std
::
min_element
(
std
::
begin
(
balance_vars_
),
std
::
end
(
balance_vars_
));
size_t
dev_id
=
static_cast
<
size_t
>
(
std
::
distance
(
std
::
begin
(
balance_vars_
),
smallest
));
balance_vars_
[
dev_id
]
+=
numel_sum
;
return
dev_id
;
}
void
BalanceVarSSAGraphBuilder
::
ResetState
()
const
{
balance_vars_
.
clear
();
sharded_var_device_
.
clear
();
balance_vars_
.
resize
(
places_
.
size
(),
0
);
}
void
ReduceSSAGraphBuilder
::
Init
()
const
{
MultiDevSSAGraphBuilderBase
::
Init
();
ResetState
();
}
void
ReduceSSAGraphBuilder
::
ResetState
()
const
{
BalanceVarSSAGraphBuilder
::
ResetState
();
bcast_var_name_set_
.
clear
();
bcast_var_name_set_
.
resize
(
places_
.
size
());
}
void
ReduceSSAGraphBuilder
::
InsertCollectiveOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
p_name
,
const
std
::
string
&
g_name
)
const
{
size_t
cur_device_id
=
GetAppropriateDeviceID
({
g_name
});
CreateReduceOp
(
result
,
g_name
,
cur_device_id
);
sharded_var_device_
.
emplace
(
g_name
,
cur_device_id
);
bcast_var_name_set_
[
cur_device_id
].
emplace
(
p_name
);
}
bool
ReduceSSAGraphBuilder
::
DealWithSpecialOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
)
const
{
int
op_dev_id
=
BalanceVarSSAGraphBuilder
::
GetOpDeviceID
(
node
);
if
(
op_dev_id
!=
-
1
)
{
// This op only runs on one specific device.
CreateComputationalOp
(
result
,
node
,
op_dev_id
);
for
(
ir
::
Node
*
n
:
node
->
outputs
)
{
sharded_var_device_
.
emplace
(
n
->
Name
(),
op_dev_id
);
}
return
true
;
}
return
false
;
}
void
ReduceSSAGraphBuilder
::
InsertPostprocessOps
(
ir
::
Graph
*
result
)
const
{
if
(
UseGPU
())
{
if
(
strategy_
.
fuse_broadcast_op_
)
{
CreateFusedBroadcastOp
(
result
,
bcast_var_name_set_
);
}
else
{
LOG
(
ERROR
)
<<
"got unexpected dist op: "
<<
node
->
Op
()
->
Type
();
PADDLE_THROW
(
"the distribute training related op should be in [split_byref, "
"concat]."
);
for
(
size_t
dev_id
=
0
;
dev_id
<
bcast_var_name_set_
.
size
();
++
dev_id
)
{
auto
&
to_bcast_set
=
bcast_var_name_set_
[
dev_id
];
for
(
auto
&
bcast_name
:
to_bcast_set
)
{
CreateBroadcastOp
(
result
,
bcast_name
,
dev_id
);
}
}
}
}
}
PADDLE_ENFORCE
(
op_dev_id
!=
-
1
,
"can not find right place for distributed op: %s"
,
node
->
Op
()
->
Type
());
int
ReduceSSAGraphBuilder
::
GetOpDeviceID
(
ir
::
Node
*
node
,
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
*
delay_ops
)
const
{
if
(
!
OpHaveRole
(
*
node
,
framework
::
OpRole
::
kOptimize
))
{
return
-
1
;
}
auto
param_grad
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
PADDLE_ENFORCE_EQ
(
param_grad
.
size
(),
2U
);
int
dev_id
=
GetVarDeviceID
(
param_grad
[
1
]);
if
(
dev_id
==
-
1
)
{
(
*
delay_ops
)[
param_grad
[
1
]].
push_back
(
node
);
return
-
2
;
}
return
dev_id
;
}
std
::
vector
<
ir
::
Node
*>
ReduceSSAGraphBuilder
::
SortOperations
(
const
ir
::
Graph
&
graph
)
const
{
std
::
vector
<
ir
::
Node
*>
sorted_ops
=
ir
::
TopologySortOperations
(
graph
);
return
SortForReduceMode
(
sorted_ops
);
}
std
::
vector
<
ir
::
Node
*>
ReduceSSAGraphBuilder
::
SortForReduceMode
(
const
std
::
vector
<
ir
::
Node
*>
&
topo_ops
)
const
{
std
::
vector
<
ir
::
Node
*>
sorted_ops
;
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
delayed_op
;
sorted_ops
.
reserve
(
topo_ops
.
size
());
ResetState
();
auto
insert_delayed_op
=
[
&
](
const
std
::
string
&
var_name
,
int
dev_id
)
{
sharded_var_device_
.
emplace
(
var_name
,
dev_id
);
if
(
delayed_op
.
count
(
var_name
))
{
auto
&
ops
=
delayed_op
.
at
(
var_name
);
sorted_ops
.
insert
(
sorted_ops
.
end
(),
ops
.
begin
(),
ops
.
end
());
delayed_op
.
at
(
var_name
).
clear
();
}
};
for
(
ir
::
Node
*
node
:
topo_ops
)
{
int
op_dev_id
=
GetOpDeviceID
(
node
,
&
delayed_op
);
if
(
op_dev_id
>
-
1
)
{
// This op only runs on one specific device.
sorted_ops
.
emplace_back
(
node
);
for
(
ir
::
Node
*
n
:
node
->
outputs
)
{
insert_delayed_op
(
n
->
Name
(),
op_dev_id
);
}
}
else
if
(
op_dev_id
==
-
1
)
{
// This op runs on all devices, and its output may have parameter's
// gradients.
sorted_ops
.
emplace_back
(
node
);
bool
is_bk_op
=
static_cast
<
bool
>
(
boost
::
get
<
int
>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
()))
&
static_cast
<
int
>
(
OpRole
::
kBackward
));
if
(
!
is_bk_op
)
continue
;
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once.
std
::
vector
<
std
::
string
>
backward_vars
;
try
{
backward_vars
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetNullableAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
}
catch
(
boost
::
bad_get
e
)
{
}
PADDLE_ENFORCE_EQ
(
backward_vars
.
size
()
%
2
,
0
);
for
(
size_t
i
=
0
;
i
<
backward_vars
.
size
();
i
+=
2
)
{
auto
&
g_name
=
backward_vars
[
i
+
1
];
size_t
cur_device_id
=
GetAppropriateDeviceID
({
g_name
});
insert_delayed_op
(
g_name
,
static_cast
<
int
>
(
cur_device_id
));
}
}
else
if
(
op_dev_id
==
-
2
)
{
// The Op on which the Op depends has not yet been generated.
}
}
PADDLE_ENFORCE_EQ
(
sorted_ops
.
size
(),
topo_ops
.
size
());
ResetState
();
return
sorted_ops
;
}
void
DistSSAGraphBuilder
::
Init
()
const
{
MultiDevSSAGraphBuilderBase
::
Init
();
ResetState
();
}
void
DistSSAGraphBuilder
::
ResetState
()
const
{
BalanceVarSSAGraphBuilder
::
ResetState
();
bcast_var_name_set_
.
clear
();
bcast_var_name_set_
.
resize
(
places_
.
size
());
}
bool
DistSSAGraphBuilder
::
DealWithSpecialOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
)
const
{
bool
insert_op
=
false
;
if
(
OpHaveRole
(
*
node
,
OpRole
::
kRPC
))
{
int
op_dev_id
=
CreateRPCOp
(
result
,
node
);
PADDLE_ENFORCE
(
op_dev_id
!=
-
1
,
"Can not schedule the RPC operator to the right place."
);
if
(
node
->
Op
()
->
Type
()
==
"recv"
)
{
auto
recv_vars_attr
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetNullableAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
PADDLE_ENFORCE
(
recv_vars_attr
.
size
()
==
2UL
);
// [parameter, gradient]
if
(
recv_vars_attr
[
0
].
find
(
".block"
)
==
std
::
string
::
npos
)
{
bcast_var_name_set_
[
op_dev_id
].
emplace
(
recv_vars_attr
[
0
]);
}
}
insert_op
=
true
;
need_broadcast_var_
=
true
;
}
else
if
(
OpHaveRole
(
*
node
,
OpRole
::
kDist
))
{
int
op_dev_id
=
CreateDistTrainOp
(
result
,
node
);
if
(
node
->
Op
()
->
Type
()
==
"concat"
)
{
auto
origin_param_name
=
node
->
Op
()
->
OutputArgumentNames
()[
0
];
bcast_var_name_set_
[
op_dev_id
].
emplace
(
origin_param_name
);
}
insert_op
=
true
;
}
else
{
int
op_dev_id
=
GetOpDeviceID
(
node
);
if
(
op_dev_id
!=
-
1
)
{
// This op only runs on one specific device.
CreateComputationalOp
(
result
,
node
,
op_dev_id
);
return
op_dev_id
;
for
(
ir
::
Node
*
n
:
node
->
outputs
)
{
sharded_var_device_
.
emplace
(
n
->
Name
(),
op_dev_id
);
}
insert_op
=
true
;
}
}
return
insert_op
;
}
void
SetOpInputsAllPlaces
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
int
num_places
)
{
...
...
@@ -774,13 +768,11 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
}
// Create RPC related op handles that connects its in ops and out ops.
int
MultiDevSSAGraphBuilder
::
CreateRPCOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
std
::
unordered_map
<
std
::
string
,
int
>
*
sharded_var_device
)
const
{
int
DistSSAGraphBuilder
::
CreateRPCOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
)
const
{
int
op_dev_id
=
-
1
;
if
(
node
->
Op
()
->
Type
()
==
"send"
)
{
// TODO(paddle-dev): getting the first var is not safe.
op_dev_id
=
GetVarDeviceID
(
node
->
inputs
[
0
]
->
Name
()
,
*
sharded_var_device
);
op_dev_id
=
GetVarDeviceID
(
node
->
inputs
[
0
]
->
Name
());
PADDLE_ENFORCE
(
!
ir
::
IsControlDepVar
(
*
node
->
inputs
[
0
]),
"This hack no longer holds, please fix."
);
// the variable name which contains .block means it was splited by
...
...
@@ -798,9 +790,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
VLOG
(
10
)
<<
"send grad "
<<
input_var_names
[
0
]
<<
" origin "
<<
send_param_grad
[
1
]
<<
" place: "
<<
op_dev_id
;
for
(
auto
&
varname
:
input_var_names
)
{
sharded_var_device
->
emplace
(
varname
,
op_dev_id
);
sharded_var_device
_
.
emplace
(
varname
,
op_dev_id
);
}
sharded_var_device
->
emplace
(
send_param_grad
[
1
],
op_dev_id
);
sharded_var_device
_
.
emplace
(
send_param_grad
[
1
],
op_dev_id
);
}
}
else
if
(
node
->
Op
()
->
Type
()
==
"recv"
)
{
std
::
vector
<
std
::
string
>
output_var_names
;
...
...
@@ -810,7 +802,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
auto
recv_param_grad
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
if
(
recv_param_grad
.
size
()
==
2U
)
{
op_dev_id
=
GetVarDeviceID
(
recv_param_grad
[
1
]
,
*
sharded_var_device
);
op_dev_id
=
GetVarDeviceID
(
recv_param_grad
[
1
]);
VLOG
(
10
)
<<
"recv param "
<<
recv_param_grad
[
0
]
<<
" get grad place: "
<<
recv_param_grad
[
1
]
<<
" place: "
<<
op_dev_id
;
...
...
@@ -818,7 +810,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
op_dev_id
=
GetAppropriateDeviceID
(
output_var_names
);
}
for
(
auto
&
varname
:
output_var_names
)
{
sharded_var_device
->
emplace
(
varname
,
op_dev_id
);
sharded_var_device
_
.
emplace
(
varname
,
op_dev_id
);
}
}
else
{
// send_barrier, fetch_barrier will run on place 0;
...
...
@@ -845,7 +837,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
for
(
ir
::
Node
*
output
:
node
->
outputs
)
{
int
outvar_dev_id
=
op_dev_id
;
if
(
node
->
Op
()
->
Type
()
==
"fetch_barrier"
)
{
outvar_dev_id
=
GetVarDeviceID
(
output
->
Name
()
,
*
sharded_var_device
);
outvar_dev_id
=
GetVarDeviceID
(
output
->
Name
());
PADDLE_ENFORCE_NE
(
outvar_dev_id
,
-
1
,
"output name %s"
,
output
->
Name
());
}
p
=
places_
[
outvar_dev_id
];
...
...
@@ -862,29 +854,124 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
return
op_dev_id
;
}
bool
MultiDevSSAGraphBuilder
::
IsSparseGradient
(
const
std
::
string
&
og
)
const
{
PADDLE_ENFORCE
(
all_vars_
.
count
(
og
)
!=
0
);
if
(
all_vars_
.
at
(
og
)
->
GetType
()
==
proto
::
VarType
::
SELECTED_ROWS
)
{
return
true
;
int
DistSSAGraphBuilder
::
CreateDistTrainOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
)
const
{
int
op_dev_id
=
-
1
;
std
::
vector
<
std
::
string
>
input_var_names
;
std
::
vector
<
std
::
string
>
output_var_names
;
for
(
ir
::
Node
*
input
:
node
->
inputs
)
{
input_var_names
.
push_back
(
input
->
Name
());
}
return
false
;
for
(
ir
::
Node
*
output
:
node
->
outputs
)
{
output_var_names
.
push_back
(
output
->
Name
());
}
if
(
node
->
Op
()
->
Type
()
==
"split_byref"
||
node
->
Op
()
->
Type
()
==
"split_selected_rows"
||
node
->
Op
()
->
Type
()
==
"split_ids"
)
{
// TODO(paddle-dev): getting the first var is not safe.
op_dev_id
=
GetVarDeviceID
(
input_var_names
[
0
]);
if
(
strategy_
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kAllReduce
)
{
op_dev_id
=
GetAppropriateDeviceID
(
input_var_names
);
for
(
auto
&
varname
:
input_var_names
)
{
sharded_var_device_
.
emplace
(
varname
,
op_dev_id
);
}
}
for
(
auto
&
varname
:
output_var_names
)
{
sharded_var_device_
.
emplace
(
varname
,
op_dev_id
);
}
}
else
if
(
node
->
Op
()
->
Type
()
==
"concat"
)
{
op_dev_id
=
GetVarDeviceID
(
input_var_names
[
0
]);
for
(
auto
&
varname
:
output_var_names
)
{
sharded_var_device_
.
emplace
(
varname
,
op_dev_id
);
}
}
else
{
LOG
(
ERROR
)
<<
"got unexpected dist op: "
<<
node
->
Op
()
->
Type
();
PADDLE_THROW
(
"the distribute training related op should be in [split_byref, "
"concat]."
);
}
PADDLE_ENFORCE
(
op_dev_id
!=
-
1
,
"can not find right place for distributed op: %s"
,
node
->
Op
()
->
Type
());
CreateComputationalOp
(
result
,
node
,
op_dev_id
);
return
op_dev_id
;
}
bool
MultiDevSSAGraphBuilder
::
IsScaleLossOp
(
ir
::
Node
*
node
)
const
{
return
boost
::
get
<
int
>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
()))
==
(
static_cast
<
int
>
(
OpRole
::
kBackward
)
|
static_cast
<
int
>
(
OpRole
::
kLoss
))
&&
!
loss_var_name_
.
empty
();
// If loss_var is empty. This is test mode
// Inserts the collective op(s) that aggregate gradient `g_name`, according to
// the configured reduce strategy:
//   - kReduce:    reduce onto the device picked by GetAppropriateDeviceID and
//                 remember that placement in sharded_var_device_.
//   - kAllReduce: sparse gradients are reduced to device 0 and broadcast back;
//                 every other gradient gets an all-reduce op.
// Any other strategy value is a fatal error. `p_name` is not used here.
void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
                                             const std::string &p_name,
                                             const std::string &g_name) const {
  const auto reduce_strategy = strategy_.reduce_;

  if (reduce_strategy == BuildStrategy::ReduceStrategy::kReduce) {
    size_t dst_dev_id = GetAppropriateDeviceID({g_name});
    CreateReduceOp(result, g_name, dst_dev_id);
    sharded_var_device_.emplace(g_name, dst_dev_id);
  } else if (reduce_strategy == BuildStrategy::ReduceStrategy::kAllReduce) {
    if (!IsSparseGradient(g_name)) {
      CreateAllReduceOp(result, g_name);
    } else {
      // Sparse gradients: gather on device 0, then broadcast from device 0.
      CreateReduceOp(result, g_name, 0);
      CreateBroadcastOp(result, g_name, 0);
    }
  } else {
    LOG(FATAL) << "Unknown reduce strategy.";
  }
}
// Appends broadcast ops for the variables recorded in bcast_var_name_set_.
// Nothing is inserted unless need_broadcast_var_ is set, or the build runs on
// GPU with the kReduce strategy. With fuse_broadcast_op_ enabled a single
// fused broadcast op is created; otherwise one broadcast op is created per
// variable, sourced from the device whose index the variable is stored under.
void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
  if (!need_broadcast_var_ &&
      !(UseGPU() &&
        strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) {
    return;
  }

  if (strategy_.fuse_broadcast_op_) {
    CreateFusedBroadcastOp(result, bcast_var_name_set_);
    return;
  }

  // bcast_var_name_set_[i] holds the names to broadcast from device i.
  for (size_t src_dev = 0; src_dev < bcast_var_name_set_.size(); ++src_dev) {
    auto &names_on_dev = bcast_var_name_set_[src_dev];
    for (auto &var_name : names_on_dev) {
      CreateBroadcastOp(result, var_name, src_dev);
    }
  }
}
// Returns the process-wide set of registered multi-device builder mode names.
// The set is a function-local static, so it is created on first use and the
// same instance is returned by every call.
std::unordered_set<std::string> &MultiDevSSAGraphBuilder() {
  static std::unordered_set<std::string> registered_modes;
  return registered_modes;
}
// Records `builder_mode` in the registry returned by MultiDevSSAGraphBuilder().
// Always returns 0; the int return lets the call be used as the initializer of
// a global variable (see REGISTER_MULTI_DEVICES_PASS).
static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) {
  auto &registered_modes = MultiDevSSAGraphBuilder();
  registered_modes.insert(builder_mode);
  return 0;
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
REGISTER_PASS
(
multi_devices_pass
,
paddle
::
framework
::
details
::
MultiDevSSAGraphBuilder
)
.
RequirePassAttr
(
paddle
::
framework
::
details
::
kLossVarName
)
.
RequirePassAttr
(
paddle
::
framework
::
details
::
kPlaces
)
.
RequirePassAttr
(
paddle
::
framework
::
details
::
kLocalScopes
)
.
RequirePassAttr
(
paddle
::
framework
::
details
::
kStrategy
)
.
RequirePassAttr
(
paddle
::
framework
::
details
::
kNumTrainers
);
// Registers `pass_class` as an IR pass named `pass_name` and records the name
// in the multi-device builder registry.
//
// Expansion, in order:
//   1. STATIC_ASSERT_GLOBAL_NAMESPACE: the macro must be used at global scope.
//   2. Defines a global int whose initializer calls
//      MultiDevSSAGraphBuilderRegister(#pass_name), which inserts the
//      stringified pass name into the registry set.
//   3. REGISTER_PASS(pass_name, pass_class), requiring the pass attributes
//      kLossVarName, kPlaces, kLocalScopes, kStrategy and kNRanks.
//
// The expansion ends without a semicolon; the caller supplies it.
#define REGISTER_MULTI_DEVICES_PASS(pass_name, pass_class) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
_reg_ssa_graph_builder_##pass_name, \
"REGISTER_MULTI_DEVICES_PASS must be called in global namespace."); \
int _reg_ssa_graph_builder_entry_##pass_name = \
paddle::framework::details::MultiDevSSAGraphBuilderRegister(#pass_name); \
REGISTER_PASS(pass_name, pass_class) \
.RequirePassAttr(paddle::framework::details::kLossVarName) \
.RequirePassAttr(paddle::framework::details::kPlaces) \
.RequirePassAttr(paddle::framework::details::kLocalScopes) \
.RequirePassAttr(paddle::framework::details::kStrategy) \
.RequirePassAttr(paddle::framework::details::kNRanks)
// Register the three concrete multi-device graph builders as passes:
// reduce mode, all-reduce mode and distributed mode.
REGISTER_MULTI_DEVICES_PASS
(
reduce_mode_multi_devices_pass
,
paddle
::
framework
::
details
::
ReduceSSAGraphBuilder
);
REGISTER_MULTI_DEVICES_PASS
(
allreduce_mode_multi_devices_pass
,
paddle
::
framework
::
details
::
AllReduceSSAGraphBuilder
);
REGISTER_MULTI_DEVICES_PASS
(
dist_multi_devices_pass
,
paddle
::
framework
::
details
::
DistSSAGraphBuilder
);
paddle/fluid/framework/details/multi_devices_graph_pass.h
浏览文件 @
4a443ffc
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <string>
#include <utility>
#include <vector>
...
...
@@ -30,78 +31,70 @@ namespace framework {
class
Scope
;
namespace
details
{
class
MultiDevSSAGraphBuilder
:
public
ir
::
Pass
{
constexpr
char
kLossVarName
[]
=
"loss_var_name"
;
constexpr
char
kPlaces
[]
=
"places"
;
constexpr
char
kLocalScopes
[]
=
"local_scopes"
;
constexpr
char
kStrategy
[]
=
"strategy"
;
constexpr
char
kNRanks
[]
=
"nranks"
;
class
MultiDevSSAGraphBuilderBase
:
public
ir
::
Pass
{
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
;
private:
void
CreateOpHandleIOs
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
size_t
device_id
)
const
;
void
Init
()
const
;
virtual
void
Init
()
const
;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
mutable
platform
::
NCCLContextMap
*
nccl_ctxs_
;
#endif
virtual
std
::
vector
<
ir
::
Node
*>
SortOperations
(
const
ir
::
Graph
&
graph
)
const
;
int
GetVarDeviceID
(
const
std
::
string
&
varname
,
const
std
::
unordered_map
<
std
::
string
,
int
>
&
sharded_var_device
)
const
;
virtual
void
InsertCollectiveOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
p_name
,
const
std
::
string
&
g_name
)
const
=
0
;
bool
IsScaleLossOp
(
ir
::
Node
*
node
)
const
;
virtual
bool
DealWithSpecialOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
)
const
=
0
;
virtual
void
InsertPostprocessOps
(
ir
::
Graph
*
result
)
const
=
0
;
int
CreateRPCOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
std
::
unordered_map
<
std
::
string
,
int
>
*
sharded_var_device
)
const
;
int
CreateDistTrainOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
std
::
unordered_map
<
std
::
string
,
int
>
*
sharded_var_device
)
const
;
bool
UseGPU
()
const
;
bool
NeedCollectiveOps
()
const
;
bool
IsScaleLossOp
(
ir
::
Node
*
node
)
const
;
void
CreateComputationalOps
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
size_t
num_places
)
const
;
void
CreateScaleLossGradOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
loss_grad_name
,
ir
::
Node
*
out_var_node
,
ir
::
Node
*
out_var_node
,
size_t
loss_scale
,
proto
::
VarType
::
Type
dtype
)
const
;
VarHandle
*
CreateReduceOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
og
,
int
dst_dev_id
)
const
;
void
CreateComputationalOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
int
dev_id
)
const
;
int
GetOpDeviceID
(
ir
::
Node
*
node
,
const
std
::
unordered_map
<
std
::
string
,
int
>
&
sharded_var_device
)
const
;
void
InsertAllReduceOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
og
)
const
;
bool
IsSparseGradient
(
const
std
::
string
&
og
)
const
;
void
InsertDataBalanceOp
(
ir
::
Graph
*
result
,
const
std
::
vector
<
std
::
string
>
&
datas
)
const
;
void
CreateAllReduceOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
og
)
const
;
void
CreateBroadcastOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
p_name
,
size_t
src_dev_id
)
const
;
void
InsertScaleLossGradOp
(
ir
::
Graph
*
result
,
const
ir
::
Node
*
node
)
const
;
void
CreateFusedBroadcastOp
(
ir
::
Graph
*
result
,
const
std
::
vector
<
std
::
unordered_set
<
std
::
string
>>
&
bcast_varnames
)
const
;
bool
IsSparseGradient
(
const
std
::
string
&
og
)
const
;
size_t
GetAppropriateDeviceID
(
const
std
::
vector
<
std
::
string
>
&
var_names
)
const
;
void
SetCommunicationContext
(
OpHandleBase
*
op_handle
,
const
platform
::
Place
&
p
)
const
;
std
::
vector
<
ir
::
Node
*>
SortForReduceMode
(
const
std
::
vector
<
ir
::
Node
*>
&
)
const
;
void
CreateOpHandleIOs
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
size_t
device_id
)
const
;
int
GetOpDeviceID
(
ir
::
Node
*
node
,
const
std
::
unordered_map
<
std
::
string
,
int
>
&
shared_var_device
,
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
*
delay_ops
)
const
;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
mutable
platform
::
NCCLContextMap
*
nccl_ctxs_
;
#endif
mutable
std
::
string
loss_var_name_
;
mutable
std
::
vector
<
platform
::
Place
>
places_
;
...
...
@@ -109,8 +102,83 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
mutable
BuildStrategy
strategy_
;
mutable
std
::
unordered_map
<
std
::
string
,
VarDesc
*>
all_vars_
;
};
// Multi-device graph builder for all-reduce mode. Only the collective-op hook
// has an out-of-line implementation; the other two hooks are trivial.
class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
 protected:
  // Inserts the collective op(s) for gradient `g_name` of parameter `p_name`.
  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
                                  const std::string &g_name) const;

  // Always returns false.
  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const {
    return false;
  }

  // No post-processing ops are inserted.
  virtual void InsertPostprocessOps(ir::Graph *result) const {}
};
// Intermediate base for builders that track which device each variable is
// placed on. Adds device-lookup helpers and the state they operate on.
class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
 protected:
  // Device id associated with `varname`.
  int GetVarDeviceID(const std::string &varname) const;

  // Device id associated with op `node`.
  int GetOpDeviceID(ir::Node *node) const;

  // Picks a device id for the given variables.
  size_t GetAppropriateDeviceID(
      const std::vector<std::string> &var_names) const;

  // Hook for resetting the builder's mutable state; overridable by subclasses.
  virtual void ResetState() const;

  // Variable name -> device id the variable has been assigned to.
  mutable std::unordered_map<std::string, int> sharded_var_device_;

  // NOTE(review): presumably a per-device load counter used when choosing a
  // device -- confirm against the implementation.
  mutable std::vector<int64_t> balance_vars_;
};
// Multi-device graph builder for reduce mode. Overrides the builder hooks and
// adds reduce-mode specific op sorting.
class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
 protected:
  virtual void Init() const;

  // Inserts the collective op(s) for gradient `g_name` of parameter `p_name`.
  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
                                  const std::string &g_name) const;

  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;

  virtual void InsertPostprocessOps(ir::Graph *result) const;

  virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;

  virtual void ResetState() const;

  // Device id for `node`; `delay_ops` is an in/out map from a name to the ops
  // associated with it.
  int GetOpDeviceID(ir::Node *node,
                    std::unordered_map<std::string, std::vector<ir::Node *>>
                        *delay_ops) const;

  // Reorders the topologically sorted ops for reduce mode.
  std::vector<ir::Node *> SortForReduceMode(
      const std::vector<ir::Node *> &topo_ops) const;

  // One set of variable names per device.
  // NOTE(review): presumably the variables to broadcast from that device --
  // confirm against the implementation.
  mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
};
// Multi-device graph builder for distributed training. Besides the common
// builder hooks it creates RPC ops and the distributed-training helper ops.
class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
 protected:
  virtual void Init() const;

  virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;

  // Inserts broadcast ops for the names held in bcast_var_name_set_.
  virtual void InsertPostprocessOps(ir::Graph *result) const;

  // Inserts reduce / all-reduce / broadcast ops for gradient `g_name`
  // depending on the configured reduce strategy.
  virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
                                  const std::string &g_name) const;

  virtual void ResetState() const;

  // Creates the op handle for an RPC op; returns the device id it was put on.
  int CreateRPCOp(ir::Graph *result, ir::Node *node) const;

  // Creates the op handle for a distributed-training op (split_byref,
  // split_selected_rows, split_ids or concat); returns its device id.
  int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;

  // bcast_var_name_set_[i]: names of variables to broadcast from device i.
  mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;

  // When true, InsertPostprocessOps inserts broadcast ops regardless of the
  // reduce strategy.
  mutable bool need_broadcast_var_{false};
};
std
::
unordered_set
<
std
::
string
>
&
MultiDevSSAGraphBuilder
();
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
0 → 100644
浏览文件 @
4a443ffc
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
// Builds one ThreadedSSAGraphExecutor per place, each running its own graph in
// its own local scope.
//
// Args:
//   strategy:     execution strategy; its num_threads_ is divided evenly among
//                 the places (minimum 1 thread per place).
//   local_scopes: one scope per place.
//   places:       the devices to run on; must not be empty.
//   graphs:       one graph per place; ownership is transferred to the
//                 per-place executors.
//
// A thread pool with one thread per place is created only when there are at
// least two places; with a single place Run() executes inline.
ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places,
    std::vector<std::unique_ptr<ir::Graph>> &&graphs)
    // The first three arguments are const references, so they are copied;
    // std::move on them would silently copy as well.
    : strategy_(strategy),
      local_scopes_(local_scopes),
      pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
      places_(places),
      graphs_(std::move(graphs)) {
  // An empty place list would divide by zero below.
  PADDLE_ENFORCE(!places_.empty(),
                 "ParallelSSAGraphExecutor requires at least one place.");
  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
  // graphs_ is indexed per place below; reject a size mismatch up front.
  PADDLE_ENFORCE_EQ(places_.size(), graphs_.size());

  // set the correct size of thread pool to each device.
  strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
                               ? 1UL
                               : strategy_.num_threads_ / places_.size();
  VLOG(1) << "set num_threads: " << strategy_.num_threads_
          << " to run the operators of the graph on each device.";

  executors_.reserve(places_.size());
  for (size_t i = 0; i < places_.size(); ++i) {
    // NOTE(review): graphs_[i] is moved-from (null) after this call, while the
    // header's Graph() accessor returns *graphs_[0] -- confirm that accessor
    // is never used on this executor.
    executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
        strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i])));
  }
}
// Runs every per-place executor on `fetch_tensors` and merges the results.
//
// With a thread pool the executors run concurrently; without one they run
// inline in place order. Any exception thrown by an executor is stored in
// exception_holder_ and re-thrown here after all submitted tasks have
// finished. On success, the i-th returned tensor is the merge (onto CPU) of
// the i-th fetched tensor from every place.
FeedFetchList ParallelSSAGraphExecutor::Run(
    const std::vector<std::string> &fetch_tensors) {
  std::vector<std::future<FeedFetchList>> pending;
  std::vector<FeedFetchList> per_place_results;
  FeedFetchList merged;

  per_place_results.reserve(places_.size());
  merged.reserve(fetch_tensors.size());
  exception_holder_.Clear();

  for (size_t place_idx = 0; place_idx < places_.size(); ++place_idx) {
    // Never lets an exception escape: failures are recorded in the holder and
    // an empty result is returned instead.
    auto run_on_place = [this, place_idx, &fetch_tensors]() -> FeedFetchList {
      try {
        return executors_[place_idx]->Run(fetch_tensors);
      } catch (...) {
        exception_holder_.Catch(std::current_exception());
      }
      return FeedFetchList();
    };

    if (!pool_) {
      per_place_results.emplace_back(run_on_place());
      continue;
    }
    pending.emplace_back(pool_->enqueue(std::move(run_on_place)));
  }

  // `pending` is empty when no pool exists, so this loop is then a no-op.
  for (auto &future : pending) {
    if (exception_holder_.IsCaught()) {
      // A failure is already recorded: just wait for the task to finish.
      future.wait();
    } else {
      per_place_results.emplace_back(future.get());
    }
  }

  if (exception_holder_.IsCaught()) {
    exception_holder_.ReThrow();
  }

  for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
    std::vector<const LoDTensor *> parts;
    parts.reserve(local_scopes_.size());

    for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
      parts.push_back(&per_place_results.at(scope_idx).at(fetch_idx));
    }

    merged.emplace_back();
    merged.back().MergeLoDTensor(parts, platform::CPUPlace());
  }
  return merged;
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/
multi_devices_graph_check_pass
.h
→
paddle/fluid/framework/details/
parallel_ssa_graph_executor
.h
浏览文件 @
4a443ffc
...
...
@@ -14,23 +14,36 @@
#pragma once
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include <string>
#include <vector>
#include "ThreadPool.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
class
SSAGraghBuilderWithChecker
:
public
ir
::
Pass
{
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
{
PADDLE_ENFORCE
(
IsValidGraph
(
graph
.
get
()));
return
graph
;
}
class
ParallelSSAGraphExecutor
:
public
SSAGraphExecutor
{
public:
ParallelSSAGraphExecutor
(
const
ExecutionStrategy
&
strategy
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
vector
<
std
::
unique_ptr
<
ir
::
Graph
>>
&&
graphs
);
~
ParallelSSAGraphExecutor
()
final
=
default
;
const
ir
::
Graph
&
Graph
()
const
override
{
return
*
graphs_
[
0
];
}
FeedFetchList
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
)
override
;
private:
ExecutionStrategy
strategy_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
unique_ptr
<::
ThreadPool
>
pool_
{
nullptr
};
std
::
vector
<
platform
::
Place
>
places_
;
std
::
vector
<
std
::
unique_ptr
<
ir
::
Graph
>>
graphs_
;
bool
IsValidGraph
(
const
ir
::
Graph
*
graph
)
const
;
std
::
vector
<
std
::
unique_ptr
<
details
::
ThreadedSSAGraphExecutor
>>
executors_
;
ExceptionHolder
exception_holder_
;
};
}
// namespace details
...
...
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
浏览文件 @
4a443ffc
...
...
@@ -56,7 +56,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
}
}
std
::
vector
<
framework
::
LoDTensor
>
fetch_data
;
std
::
exception_ptr
eptr
;
std
::
exception_ptr
eptr
=
nullptr
;
try
{
fetch_data
=
underlying_executor_
->
Run
(
fetch_tensors
);
}
catch
(...)
{
...
...
paddle/fluid/framework/naive_executor.cc
浏览文件 @
4a443ffc
...
...
@@ -40,12 +40,12 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
void
NaiveExecutor
::
Run
()
{
#ifndef PADDLE_ON_INFERENCE
LOG_FIRST_N
(
WARNING
,
1
5
)
<<
"The NaiveExecutor can not work properly if the "
LOG_FIRST_N
(
WARNING
,
5
)
<<
"The NaiveExecutor can not work properly if the "
"cmake flag ON_INFER is not set."
;
LOG_FIRST_N
(
WARNING
,
1
5
)
<<
"Unlike the training phase, all the scopes and "
LOG_FIRST_N
(
WARNING
,
5
)
<<
"Unlike the training phase, all the scopes and "
"variables will be reused to save the allocation "
"overhead."
;
LOG_FIRST_N
(
WARNING
,
1
5
)
<<
"Please re-compile the inference library by "
LOG_FIRST_N
(
WARNING
,
5
)
<<
"Please re-compile the inference library by "
"setting the cmake flag ON_INFER=ON if you are "
"running Paddle Inference"
;
#endif // PADDLE_ON_INFERENCE
...
...
paddle/fluid/framework/ngraph_bridge.cc
浏览文件 @
4a443ffc
...
...
@@ -32,8 +32,11 @@ std::map<std::string,
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
)
>>
NgraphBridge
::
NG_NODE_MAP
=
{
{
"fill_constant"
,
paddle
::
operators
::
ngraphs
::
BuildFillConstantNode
},
{
"mean"
,
paddle
::
operators
::
ngraphs
::
BuildMeanNode
},
{
"mean_grad"
,
paddle
::
operators
::
ngraphs
::
BuildMeanGradNode
},
{
"mul"
,
paddle
::
operators
::
ngraphs
::
BuildMulNode
},
{
"mul_grad"
,
paddle
::
operators
::
ngraphs
::
BuildMulGradNode
},
{
"scale"
,
paddle
::
operators
::
ngraphs
::
BuildScaleNode
},
{
"relu"
,
paddle
::
operators
::
ngraphs
::
BuildUnaryNode
<
ngraph
::
op
::
Relu
>
},
{
"tanh"
,
paddle
::
operators
::
ngraphs
::
BuildUnaryNode
<
ngraph
::
op
::
Tanh
>
},
{
"top_k"
,
paddle
::
operators
::
ngraphs
::
BuildTopKNode
}};
...
...
paddle/fluid/framework/ngraph_operator.cc
浏览文件 @
4a443ffc
...
...
@@ -539,7 +539,7 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
}
}
backend_
->
call
(
ngraph_function_
,
t_out
,
t_in
);
backend_
->
call
(
backend_
->
compile
(
ngraph_function_
)
,
t_out
,
t_in
);
}
// NgraphEngine::RunImpl
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/operator.h
浏览文件 @
4a443ffc
...
...
@@ -377,6 +377,30 @@ class ExecutionContext {
return
op_
.
Outputs
(
name
);
}
// Allocates a temporary tensor of element type T and shape `dim` from the
// temporary allocator associated with `dev_ctx`.
//
// The allocation returned by the allocator is converted from unique to shared
// ownership (keeping its original deleter) and installed as the tensor's
// holder. Enforces that the allocation is a platform::TemporaryAllocation and
// that its size equals product(dim) * sizeof(T).
template <typename T, typename DevContext>
Tensor AllocateTmpTensor(const framework::DDim &dim,
                         const DevContext &dev_ctx) const {
  auto unique_holder = platform::DeviceTemporaryAllocator::Instance()
                           .Get<DevContext>(dev_ctx)
                           .Allocate(product(dim) * sizeof(T));

  // Hand ownership over to a shared_ptr before any check can throw, so the
  // allocation is still released through its own deleter on failure.
  auto &holder_deleter = unique_holder.get_deleter();
  auto *raw_allocation = unique_holder.release();
  std::shared_ptr<memory::allocation::Allocation> shared_holder(
      raw_allocation, holder_deleter);

  PADDLE_ENFORCE(
      dynamic_cast<platform::TemporaryAllocation *>(raw_allocation) != nullptr,
      "The AllocationPtr must be TemporaryAllocation.");
  PADDLE_ENFORCE_EQ(raw_allocation->size(),
                    framework::product(dim) * sizeof(T));

  paddle::framework::Tensor temp_tensor(
      framework::ToDataType(std::type_index(typeid(T))));
  temp_tensor.Resize(dim);
  temp_tensor.ResetHolder(std::move(shared_holder));
  return temp_tensor;
}
private:
const
OperatorBase
&
op_
;
const
Scope
&
scope_
;
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
4a443ffc
...
...
@@ -21,12 +21,9 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
...
...
@@ -38,6 +35,8 @@ limitations under the License. */
DEFINE_string
(
pe_profile_fname
,
""
,
"Profiler filename for PE, which generated by gperftools."
"Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."
);
DEFINE_bool
(
enable_parallel_graph
,
false
,
"Force disable parallel graph execution mode if set false."
);
namespace
paddle
{
namespace
framework
{
...
...
@@ -106,6 +105,7 @@ class ParallelExecutorPrivate {
bool
own_local_scope_
;
bool
use_cuda_
;
bool
use_all_reduce_
;
size_t
nranks_
;
// global_ref_cnts_ is only initialized when ParallelExecutor constructs, and
// then keeps unchanged
...
...
@@ -201,6 +201,7 @@ ParallelExecutor::ParallelExecutor(
member_
->
build_strategy_
=
build_strategy
;
member_
->
use_all_reduce_
=
build_strategy
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kAllReduce
;
member_
->
nranks_
=
num_trainers
*
places
.
size
();
if
(
!
member_
->
use_all_reduce_
)
{
PADDLE_ENFORCE
(
places
.
size
()
>
1
,
...
...
@@ -224,46 +225,80 @@ ParallelExecutor::ParallelExecutor(
}
}
// FIXME(Yancey1989): parallel graph mode gets better performance
// in GPU allreduce distributed training. Need an elegant way to
// choose the execution strategy.
build_strategy
.
enable_parallel_graph_
=
EnableParallelGraphExecution
(
main_program
,
exec_strategy
,
build_strategy
);
VLOG
(
1
)
<<
"Enable ParallelGraph Execution: "
<<
build_strategy
.
enable_parallel_graph_
;
if
(
member_
->
use_cuda_
)
{
// Bcast Parameters to all GPUs
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto
*
nccl_id_var
=
scope
->
FindVar
(
NCCL_ID_VARNAME
);
ncclUniqueId
*
nccl_id
=
nullptr
;
// gen_nccl_id operator can broadcast the ncclUniqueId for nccl2 collective
// distributed training
auto
*
nccl_id_var
=
scope
->
FindVar
(
NCCL_ID_VARNAME
);
if
(
nccl_id_var
!=
nullptr
)
{
nccl_id
=
nccl_id_var
->
GetMutable
<
ncclUniqueId
>
();
}
if
(
build_strategy
.
enable_parallel_graph_
&&
member_
->
nranks_
>
1UL
)
{
if
(
nccl_id
==
nullptr
)
{
local_nccl_id_
.
reset
(
new
ncclUniqueId
());
platform
::
dynload
::
ncclGetUniqueId
(
local_nccl_id_
.
get
());
nccl_id
=
local_nccl_id_
.
get
();
}
}
member_
->
nccl_ctxs_
.
reset
(
new
platform
::
NCCLContextMap
(
member_
->
places_
,
nccl_id
,
num_trainers
,
trainer_id
));
#else
PADDLE_THROW
(
"Not compiled with CUDA"
);
#endif
}
if
(
member_
->
local_scopes_
.
size
()
!=
1
&&
local_scopes
.
empty
())
{
BCastParamsToDevices
(
bcast_vars
);
}
// Startup Program has been run. All local scopes has correct parameters.
// Startup Program has been run. All local scopes has correct parameters.
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
std
::
vector
<
std
::
unique_ptr
<
ir
::
Graph
>>
graphs
;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if
(
build_strategy
.
enable_parallel_graph_
)
{
for
(
size_t
i
=
0
;
i
<
member_
->
places_
.
size
();
++
i
)
{
std
::
unique_ptr
<
ir
::
Graph
>
graph
=
build_strategy
.
Apply
(
main_program
,
{
member_
->
places_
[
i
]},
loss_var_name
,
{
member_
->
local_scopes_
[
i
]},
member_
->
nranks_
,
member_
->
use_cuda_
,
member_
->
nccl_ctxs_
.
get
());
graphs
.
push_back
(
std
::
move
(
graph
));
}
}
else
{
std
::
unique_ptr
<
ir
::
Graph
>
graph
=
build_strategy
.
Apply
(
main_program
,
member_
->
places_
,
loss_var_name
,
member_
->
local_scopes_
,
member_
->
use_cuda_
,
member_
->
nccl_ctxs_
.
get
());
member_
->
nranks_
,
member_
->
use_cuda_
,
member_
->
nccl_ctxs_
.
get
());
graphs
.
push_back
(
std
::
move
(
graph
));
}
#else
std
::
unique_ptr
<
ir
::
Graph
>
graph
=
build_strategy
.
Apply
(
main_program
,
member_
->
places_
,
loss_var_name
,
member_
->
local_scopes_
,
member_
->
use_cuda_
);
std
::
unique_ptr
<
ir
::
Graph
>
graph
=
build_strategy
.
Apply
(
main_program
,
member_
->
places_
,
loss_var_name
,
member_
->
local_scopes_
,
member_
->
nranks_
,
member_
->
use_cuda_
);
graphs
.
push_back
(
std
::
move
(
graph
));
#endif
auto
max_memory_size
=
GetEagerDeletionThreshold
();
if
(
max_memory_size
>=
0
)
{
graph
=
member_
->
PrepareGCAndRefCnts
(
std
::
move
(
graph
),
static_cast
<
size_t
>
(
max_memory_size
));
for
(
size_t
i
=
0
;
i
<
graphs
.
size
();
++
i
)
{
graphs
[
i
]
=
member_
->
PrepareGCAndRefCnts
(
std
::
move
(
graphs
[
i
]),
static_cast
<
size_t
>
(
max_memory_size
));
}
}
// Step 3. Create vars in each scope. Passes may also create new vars.
// skip control vars and empty vars
std
::
vector
<
details
::
VariableInfo
>
var_infos
;
for
(
auto
&
graph
:
graphs
)
{
for
(
auto
&
node
:
graph
->
Nodes
())
{
if
(
node
->
IsVar
()
&&
!
node
->
IsCtrlVar
()
&&
node
->
Var
())
{
var_infos
.
emplace_back
();
...
...
@@ -272,14 +307,16 @@ ParallelExecutor::ParallelExecutor(
var_infos
.
back
().
persistable_
=
node
->
Var
()
->
Persistable
();
}
}
}
// If the loss_var_name is given, the number of graph should be only one.
if
(
loss_var_name
.
size
())
{
size_t
graph_num
=
ir
::
GraphNum
(
*
graph
);
size_t
graph_num
=
ir
::
GraphNum
(
*
graph
s
[
0
]
);
if
(
graph_num
>
1
)
{
LOG
(
WARNING
)
<<
"The number of graph should be only one, "
"but the current graph has "
<<
ir
::
GraphNum
(
*
graph
)
<<
ir
::
GraphNum
(
*
graph
s
[
0
]
)
<<
" sub_graphs. If you want to see the nodes of the "
"sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
"to specify the output dir. NOTES: if you not do training, "
...
...
@@ -287,14 +324,20 @@ ParallelExecutor::ParallelExecutor(
}
}
if
(
build_strategy
.
enable_parallel_graph_
)
{
member_
->
executor_
.
reset
(
new
details
::
ParallelSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
std
::
move
(
graphs
)));
}
else
{
if
(
exec_strategy
.
type_
==
ExecutionStrategy
::
kDefault
)
{
member_
->
executor_
.
reset
(
new
details
::
ThreadedSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
std
::
move
(
graph
)));
std
::
move
(
graphs
[
0
]
)));
}
else
{
member_
->
executor_
.
reset
(
new
details
::
FastThreadedSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
std
::
move
(
graph
)));
std
::
move
(
graphs
[
0
])));
}
}
member_
->
executor_
.
reset
(
new
details
::
ScopeBufferedSSAGraphExecutor
(
...
...
@@ -423,6 +466,36 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
}
// Decides whether the parallel-graph execution mode can be used.
//
// Returns true only when all of the following hold:
//   - FLAGS_enable_parallel_graph is set;
//   - block 0 of `main_program` contains no SELECTED_ROWS variable;
//   - block 0 of `main_program` contains no "send" or "recv" op;
//   - the executor uses all-reduce and CUDA;
//   - sequential execution is disabled and the executor type is not
//     kExperimental.
// The checks run in that order and the function returns as soon as one fails.
bool ParallelExecutor::EnableParallelGraphExecution(
    const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy,
    const BuildStrategy &build_strategy) const {
  if (!FLAGS_enable_parallel_graph) return false;

  // TODO(Yancey1989): support sparse update in ParallelGraph mode.
  for (auto &var_desc : main_program.Block(0).AllVars()) {
    if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) {
      return false;
    }
  }

  // TODO(Yancey1989): support pserver mode
  for (auto &op_desc : main_program.Block(0).AllOps()) {
    const std::string op_type = op_desc->Type();
    if (op_type == "send" || op_type == "recv") {
      return false;
    }
  }

  if (!member_->use_all_reduce_ || !member_->use_cuda_) return false;

  if (build_strategy.enable_sequential_execution_ ||
      exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) {
    return false;
  }

  return true;
}
ParallelExecutor
::~
ParallelExecutor
()
{
for
(
auto
&
p
:
member_
->
places_
)
{
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
)
->
Wait
();
...
...
paddle/fluid/framework/parallel_executor.h
浏览文件 @
4a443ffc
...
...
@@ -28,6 +28,10 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace
paddle
{
namespace
framework
{
...
...
@@ -68,8 +72,14 @@ class ParallelExecutor {
private:
void
BCastParamsToDevices
(
const
std
::
unordered_set
<
std
::
string
>
&
vars
)
const
;
bool
EnableParallelGraphExecution
(
const
ProgramDesc
&
main_program
,
const
ExecutionStrategy
&
exec_strategy
,
const
BuildStrategy
&
build_strategy
)
const
;
ParallelExecutorPrivate
*
member_
;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std
::
unique_ptr
<
ncclUniqueId
>
local_nccl_id_
;
#endif
};
}
// namespace framework
...
...
paddle/fluid/framework/tensor_util.h
浏览文件 @
4a443ffc
...
...
@@ -151,27 +151,5 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
memory
::
Copy
(
dst_place
,
dst_ptr
,
boost
::
get
<
platform
::
CPUPlace
>
(
src
.
place
()),
src_ptr
,
size
);
}
// Wraps a temporary allocation in a Tensor of element type T and shape `dim`.
//
// Ownership of `temp_allocation_ptr` is converted from unique to shared
// (keeping its original deleter) and the result becomes the tensor's holder.
// Enforces that the allocation is a platform::TemporaryAllocation and that its
// size equals product(dim) * sizeof(T).
template <typename T>
paddle::framework::Tensor GetTensor(
    memory::allocation::AllocationPtr temp_allocation_ptr,
    const framework::DDim &dim) {
  // Take shared ownership before any check can throw, so the allocation is
  // still released through its own deleter on failure.
  auto &holder_deleter = temp_allocation_ptr.get_deleter();
  auto *raw_allocation = temp_allocation_ptr.release();
  std::shared_ptr<memory::allocation::Allocation> shared_holder(
      raw_allocation, holder_deleter);

  PADDLE_ENFORCE(
      dynamic_cast<platform::TemporaryAllocation *>(raw_allocation) != nullptr,
      "The AllocationPtr must be TemporaryAllocation.");
  PADDLE_ENFORCE_EQ(raw_allocation->size(),
                    framework::product(dim) * sizeof(T));

  paddle::framework::Tensor temp_tensor(
      framework::ToDataType(std::type_index(typeid(T))));
  temp_tensor.Resize(dim);
  temp_tensor.ResetHolder(std::move(shared_holder));
  return temp_tensor;
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/threadpool.cc
浏览文件 @
4a443ffc
...
...
@@ -89,7 +89,6 @@ void ThreadPool::TaskLoop() {
task
=
std
::
move
(
tasks_
.
front
());
tasks_
.
pop
();
}
// run the task
task
();
}
...
...
paddle/fluid/inference/analysis/argument.h
浏览文件 @
4a443ffc
...
...
@@ -123,8 +123,6 @@ struct Argument {
DECL_ARGUMENT_FIELD
(
use_gpu
,
UseGPU
,
bool
);
DECL_ARGUMENT_FIELD
(
gpu_device_id
,
GPUDeviceId
,
int
);
DECL_ARGUMENT_FIELD
(
use_tensorrt
,
UseTensorRT
,
bool
);
DECL_ARGUMENT_FIELD
(
tensorrt_node_teller
,
TensorRtNodeTeller
,
std
::
function
<
bool
(
const
framework
::
ir
::
Node
*
)
>
);
DECL_ARGUMENT_FIELD
(
tensorrt_max_batch_size
,
TensorRtMaxBatchSize
,
int
);
DECL_ARGUMENT_FIELD
(
tensorrt_workspace_size
,
TensorRtWorkspaceSize
,
int
);
DECL_ARGUMENT_FIELD
(
tensorrt_min_subgraph_size
,
TensorRtMinSubgraphSize
,
int
);
...
...
paddle/fluid/inference/analysis/ir_pass_manager.cc
浏览文件 @
4a443ffc
...
...
@@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument,
for
(
const
std
::
string
&
pass_name
:
passes
)
{
auto
pass
=
framework
::
ir
::
PassRegistry
::
Instance
().
Get
(
pass_name
);
// Set some pass attributes.
if
(
pass_name
==
"ir_analysis_pass"
)
{
pass
->
Set
(
"tensorrt_node_teller"
,
new
SubgraphDetector
::
NodeInsideSubgraphTeller
(
argument
->
tensorrt_node_teller
()));
}
if
(
pass_name
==
"graph_viz_pass"
)
{
std
::
string
dot_file_path
=
std
::
to_string
(
pass_num
)
+
"_ir_"
+
(
pre_pass
.
empty
()
?
"origin"
:
pre_pass
)
+
...
...
@@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument,
}
if
(
pass_name
==
"tensorrt_subgraph_pass"
)
{
PADDLE_ENFORCE
(
argument
->
tensorrt_node_teller_valid
());
pass
->
SetNotOwned
(
"tensorrt_node_teller"
,
argument
->
tensorrt_node_teller_ptr
());
pass
->
Set
(
"workspace_size"
,
new
int
(
argument
->
tensorrt_workspace_size
()));
pass
->
Set
(
"max_batch_size"
,
new
int
(
argument
->
tensorrt_max_batch_size
()));
pass
->
Set
(
"min_subgraph_size"
,
...
...
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
浏览文件 @
4a443ffc
cc_library
(
subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc
)
cc_library
(
tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector
)
set
(
analysis_deps
${
analysis_deps
}
if
(
TENSORRT_FOUND
)
cc_library
(
tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller
)
set
(
analysis_deps
${
analysis_deps
}
subgraph_detector tensorrt_subgraph_pass
CACHE INTERNAL
""
)
set
(
pass_file
${
PADDLE_BINARY_DIR
}
/paddle/fluid/inference/api/paddle_inference_pass.h
)
file
(
APPEND
${
pass_file
}
"USE_PASS(tensorrt_subgraph_pass);
\n
"
)
set
(
INFER_IR_PASSES
${
INFER_IR_PASSES
}
tensorrt_subgraph_pass CACHE INTERNAL
""
)
set
(
pass_file
${
PADDLE_BINARY_DIR
}
/paddle/fluid/inference/api/paddle_inference_pass.h
)
file
(
APPEND
${
pass_file
}
"USE_PASS(tensorrt_subgraph_pass);
\n
"
)
set
(
INFER_IR_PASSES
${
INFER_IR_PASSES
}
tensorrt_subgraph_pass CACHE INTERNAL
""
)
endif
()
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
浏览文件 @
4a443ffc
...
...
@@ -20,6 +20,7 @@
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
namespace
paddle
{
namespace
inference
{
...
...
@@ -35,8 +36,10 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
std
::
unique_ptr
<
framework
::
ir
::
Graph
>
graph
)
const
{
framework
::
ir
::
FusePassBase
::
Init
(
"tensorrt_subgraph_pass"
,
graph
.
get
());
auto
teller
=
Get
<
SubgraphDetector
::
NodeInsideSubgraphTeller
>
(
"tensorrt_node_teller"
);
auto
teller
=
[](
const
framework
::
ir
::
Node
*
node
)
{
if
(
!
node
->
IsOp
()
||
!
node
->
Op
())
return
false
;
return
tensorrt
::
OpTeller
::
Global
().
Tell
(
node
->
Op
()
->
Type
(),
*
node
->
Op
());
};
SubGraphFuser
fuser
(
graph
.
get
(),
teller
,
Get
<
int
>
(
"min_subgraph_size"
)
/*min subgraph size*/
);
...
...
@@ -232,7 +235,6 @@ std::vector<std::string> ExtractParameters(
REGISTER_PASS
(
tensorrt_subgraph_pass
,
paddle
::
inference
::
analysis
::
TensorRtSubgraphPass
)
.
RequirePassAttr
(
"tensorrt_node_teller"
)
.
RequirePassAttr
(
"max_batch_size"
)
.
RequirePassAttr
(
"workspace_size"
)
.
RequirePassAttr
(
"min_subgraph_size"
);
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
浏览文件 @
4a443ffc
...
...
@@ -27,9 +27,6 @@ namespace analysis {
// Drives the IR analysis stage for `argument`: checks that the list of IR
// analysis passes is present, prepares the TensorRT attributes when TensorRT
// is requested, applies the IR passes and finally collects fusion statistics.
void IrAnalysisComposePass::RunImpl(Argument *argument) {
  ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);

  // The flag must be both set (valid) and true before TensorRT is prepared.
  const bool tensorrt_requested =
      argument->use_tensorrt_valid() && argument->use_tensorrt();
  if (tensorrt_requested) {
    InitTensorRTAttrs(argument);
  }

  ApplyIrPasses(argument);
  CollectFusionStatis(argument);
}
...
...
@@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const {
return
"ir-analysis-compose-pass"
;
}
// Installs the TensorRT node teller on `argument` when TensorRT is requested.
//
// The teller is a predicate over graph nodes: it returns true only for
// operator nodes whose op type is in the fixed list below, and false for
// everything else. Nothing is done when the use_tensorrt field is unset or
// false.
void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
  if (!(argument->use_tensorrt_valid() && argument->use_tensorrt())) {
    return;
  }

  LOG(INFO) << "Initing TensorRT pass";
  argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) {
    // Built once and shared by every invocation; the teller is called per
    // node, so rebuilding the set on each call was wasted work.
    static const std::unordered_set<std::string> teller_set(
        {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
         "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
         "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
         "conv2d_transpose", "leaky_relu"});

    // Reject non-operator nodes, and operator nodes that carry no op
    // description, before dereferencing Op().
    if (!node->IsOp() || !node->Op()) return false;
    return teller_set.count(node->Op()->Type()) > 0;
  });
}
void
IrAnalysisComposePass
::
ApplyIrPasses
(
Argument
*
argument
)
{
std
::
vector
<
std
::
string
>
passes
({
"ir_graph_build_pass"
,
"ir_analysis_pass"
,
...
...
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
浏览文件 @
4a443ffc
...
...
@@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass {
std
::
string
repr
()
const
override
;
private:
void
InitTensorRTAttrs
(
Argument
*
argument
);
void
ApplyIrPasses
(
Argument
*
argument
);
void
CollectFusionStatis
(
Argument
*
argument
);
...
...
paddle/fluid/inference/api/analysis_config.cc
浏览文件 @
4a443ffc
...
...
@@ -14,86 +14,101 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle
_pass_builder.h" // NOLINT
#include "paddle
/fluid/platform/gpu_info.h"
namespace
paddle
{
PassStrategy
*
contrib
::
AnalysisConfig
::
pass_builder
()
const
{
PADDLE_ENFORCE
(
pass_builder_
.
get
(),
"Should call constructor first, that will init the pass_builder_."
);
return
pass_builder_
.
get
();
}
contrib
::
AnalysisConfig
::
AnalysisConfig
(
bool
use_gpu
)
{
this
->
use_gpu
=
use_gpu
;
if
(
use_gpu
)
{
if
(
!
pass_builder_
.
get
())
{
if
(
use_gpu_
)
{
LOG
(
INFO
)
<<
"Create GPU IR passes"
;
pass_builder_
.
reset
(
new
GpuPassStrategy
);
}
else
{
LOG
(
INFO
)
<<
"Create CPU IR passes"
;
pass_builder_
.
reset
(
new
CpuPassStrategy
);
}
}
else
if
(
pass_builder_
->
use_gpu
()
^
use_gpu
())
{
LOG
(
WARNING
)
<<
"The use_gpu flag is not compatible between Config and "
"PassBuilder, the flags are "
<<
use_gpu
()
<<
" "
<<
pass_builder_
->
use_gpu
();
LOG
(
WARNING
)
<<
"Please make them compatible, still use the existing "
"PassBuilder."
;
}
return
pass_builder_
.
get
();
}
// Creates a config for a model stored as a directory. Only the directory path
// is recorded here; every other setting keeps its in-class default.
contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir)
    : model_dir_(model_dir) {}
// Creates a config for a model given as a separate program file and
// parameters file. Only the two paths are recorded here; every other setting
// keeps its in-class default.
contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file,
                                        const std::string &params_file)
    : prog_file_(prog_file), params_file_(params_file) {}
void
contrib
::
AnalysisConfig
::
SetModel
(
const
std
::
string
&
prog_file_path
,
const
std
::
string
&
params_file_path
)
{
prog_file_
=
prog_file_path
;
params_file_
=
params_file_path
;
}
void
contrib
::
AnalysisConfig
::
EnableUseGpu
(
uint64_t
memory_pool_init_size_mb
,
int
device_id
)
{
#ifdef PADDLE_WITH_CUDA
use_gpu_
=
true
;
memory_pool_init_size_mb_
=
memory_pool_init_size_mb
;
device_id_
=
device_id
;
#else
LOG
(
ERROR
)
<<
"Please compile with gpu to EnableGpu"
;
use_gpu_
=
false
;
#endif
}
void
contrib
::
AnalysisConfig
::
DisableGpu
()
{
use_gpu_
=
false
;
}
contrib
::
AnalysisConfig
::
AnalysisConfig
(
const
contrib
::
AnalysisConfig
&
other
)
{
// fields from Config
model_dir
=
other
.
model_dir
;
// fields from NativeConfig
use_gpu
=
other
.
use_gpu
;
device
=
other
.
device
;
fraction_of_gpu_memory
=
other
.
fraction_of_gpu_memory
;
prog_file
=
other
.
prog_file
;
param_file
=
other
.
param_file
;
specify_input_name
=
other
.
specify_input_name
;
cpu_math_library_num_threads_
=
other
.
cpu_math_library_num_threads_
;
// fields from this.
enable_ir_optim
=
other
.
enable_ir_optim
;
// For mkldnn
use_mkldnn_
=
other
.
use_mkldnn_
;
mkldnn_enabled_op_types_
=
other
.
mkldnn_enabled_op_types_
;
use_feed_fetch_ops
=
other
.
use_feed_fetch_ops
;
use_tensorrt_
=
other
.
use_tensorrt_
;
tensorrt_max_batchsize_
=
other
.
tensorrt_max_batchsize_
;
tensorrt_workspace_size_
=
other
.
tensorrt_workspace_size_
;
tensorrt_min_subgraph_size_
=
other
.
tensorrt_min_subgraph_size_
;
model_from_memory_
=
other
.
model_from_memory_
;
if
(
use_gpu
)
{
#define CP_MEMBER(member__) member__ = other.member__;
// Model related.
CP_MEMBER
(
model_dir_
);
CP_MEMBER
(
prog_file_
);
CP_MEMBER
(
params_file_
);
CP_MEMBER
(
model_from_memory_
);
// the memory model reuses prog_file_ and
// params_file_ fields.
// Gpu releated.
CP_MEMBER
(
use_gpu_
);
CP_MEMBER
(
device_id_
);
CP_MEMBER
(
memory_pool_init_size_mb_
);
// TensorRT releated.
CP_MEMBER
(
use_tensorrt_
);
CP_MEMBER
(
tensorrt_workspace_size_
);
CP_MEMBER
(
tensorrt_max_batchsize_
);
CP_MEMBER
(
tensorrt_min_subgraph_size_
);
// MKLDNN releated.
CP_MEMBER
(
use_mkldnn_
);
CP_MEMBER
(
mkldnn_enabled_op_types_
);
// Ir related.
CP_MEMBER
(
enable_ir_optim_
);
CP_MEMBER
(
use_feed_fetch_ops_
);
CP_MEMBER
(
ir_debug_
);
CP_MEMBER
(
specify_input_name_
);
CP_MEMBER
(
cpu_math_library_num_threads_
);
CP_MEMBER
(
serialized_info_cache_
);
if
(
use_gpu_
)
{
pass_builder_
.
reset
(
new
GpuPassStrategy
(
*
static_cast
<
GpuPassStrategy
*>
(
other
.
pass_builder
())));
}
else
{
pass_builder_
.
reset
(
new
CpuPassStrategy
(
*
static_cast
<
CpuPassStrategy
*>
(
other
.
pass_builder
())));
}
}
contrib
::
AnalysisConfig
::
AnalysisConfig
(
contrib
::
AnalysisConfig
&&
other
)
{
// fields from Config
model_dir
=
other
.
model_dir
;
// fields from NativeConfig
use_gpu
=
other
.
use_gpu
;
device
=
other
.
device
;
fraction_of_gpu_memory
=
other
.
fraction_of_gpu_memory
;
prog_file
=
other
.
prog_file
;
param_file
=
other
.
param_file
;
specify_input_name
=
other
.
specify_input_name
;
cpu_math_library_num_threads_
=
other
.
cpu_math_library_num_threads_
;
// fields from this.
enable_ir_optim
=
other
.
enable_ir_optim
;
// For mkldnn
use_mkldnn_
=
other
.
use_mkldnn_
;
mkldnn_enabled_op_types_
=
other
.
mkldnn_enabled_op_types_
;
use_feed_fetch_ops
=
other
.
use_feed_fetch_ops
;
use_tensorrt_
=
other
.
use_tensorrt_
;
tensorrt_max_batchsize_
=
other
.
tensorrt_max_batchsize_
;
tensorrt_workspace_size_
=
other
.
tensorrt_workspace_size_
;
tensorrt_min_subgraph_size_
=
other
.
tensorrt_min_subgraph_size_
;
model_from_memory_
=
other
.
model_from_memory_
;
pass_builder_
=
std
::
move
(
other
.
pass_builder_
);
#undef CP_MEMBER
}
void
contrib
::
AnalysisConfig
::
EnableMKLDNN
()
{
...
...
@@ -112,17 +127,90 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
use_tensorrt_
=
true
;
tensorrt_workspace_size_
=
workspace_size
;
tensorrt_max_batchsize_
=
max_batch_size
;
tensorrt_min_subgraph_size_
=
min_subgraph_size
;
// Append after the conv+affine_channel fuse pass.
pass_builder
()
->
InsertPass
(
3
,
"tensorrt_subgraph_pass"
);
}
void
contrib
::
AnalysisConfig
::
Update
()
{
auto
info
=
SerializeInfoCache
();
if
(
info
==
serialized_info_cache_
)
return
;
if
(
use_gpu_
)
{
pass_builder_
.
reset
(
new
GpuPassStrategy
);
}
else
{
pass_builder_
.
reset
(
new
CpuPassStrategy
);
}
if
(
use_tensorrt_
)
{
if
(
!
use_gpu_
)
{
LOG
(
ERROR
)
<<
"TensorRT engine is not available when EnableGpu() not actived."
;
}
else
{
// Append after the infer_clean pass.
pass_builder
()
->
InsertPass
(
1
,
"tensorrt_subgraph_pass"
);
}
}
if
(
use_mkldnn_
)
{
if
(
!
enable_ir_optim_
)
{
LOG
(
ERROR
)
<<
"EnableMKLDNN() only works when IR optimization is enabled."
;
}
#ifdef PADDLE_WITH_MKLDNN
pass_builder
()
->
EnableMKLDNN
();
use_mkldnn_
=
true
;
#else
LOG
(
ERROR
)
<<
"Please compile with MKLDNN first to use MKLDNN"
;
use_mkldnn_
=
false
;
#endif
}
if
(
ir_debug_
)
{
pass_builder
()
->
TurnOnDebug
();
}
}
// Builds a textual signature of the switches that are compared in Update()
// to decide whether the pass builder has to be rebuilt.
//
// Each field is followed by a separator. Without one, adjacent numeric fields
// run together and different configurations can collide: for example
// workspace size 12 with max batch size 3 and workspace size 1 with max batch
// size 23 would both contribute "123".
//
// Returns the signature string; it is only meaningful when compared for
// equality with another signature produced by this function.
std::string contrib::AnalysisConfig::SerializeInfoCache() {
  const char kFieldSep = ';';
  std::stringstream ss;
  ss << use_gpu_ << kFieldSep;
  ss << memory_pool_init_size_mb_ << kFieldSep;

  ss << use_tensorrt_ << kFieldSep;
  ss << tensorrt_workspace_size_ << kFieldSep;
  ss << tensorrt_max_batchsize_ << kFieldSep;

  ss << use_mkldnn_ << kFieldSep;
  ss << enable_ir_optim_ << kFieldSep;
  ss << use_feed_fetch_ops_ << kFieldSep;
  ss << ir_debug_ << kFieldSep;

  return ss.str();
}
void
contrib
::
AnalysisConfig
::
SetCpuMathLibraryNumThreads
(
int
cpu_math_library_num_threads
)
{
cpu_math_library_num_threads_
=
cpu_math_library_num_threads
;
}
// Converts the configured initial memory-pool size (MB) into a fraction of
// the GPU's total memory, where total = used + available as reported by
// platform::GpuMemoryUsage. Builds without PADDLE_WITH_CUDA always return 0.
float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#ifndef PADDLE_WITH_CUDA
  return 0.;
#else
  // Query the GPU memory usage in bytes.
  size_t used_bytes, available_bytes;
  platform::GpuMemoryUsage(&used_bytes, &available_bytes);

  // Bytes -> MB, so the unit matches memory_pool_init_size_mb().
  const double total_mb = (used_bytes + available_bytes) / 1024. / 1024.;
  return static_cast<float>(
      static_cast<double>(memory_pool_init_size_mb()) / total_mb);
#endif
}
void
contrib
::
AnalysisConfig
::
SetModelBuffer
(
const
char
*
prog_buffer
,
size_t
prog_buffer_size
,
const
char
*
param_buffer
,
size_t
param_buffer_size
)
{
prog_file
=
std
::
string
(
prog_buffer
,
prog_buffer
+
prog_buffer_size
);
param
_file
=
std
::
string
(
param_buffer
,
param_buffer
+
param_buffer_size
);
prog_file
_
=
std
::
string
(
prog_buffer
,
prog_buffer
+
prog_buffer_size
);
param
s_file_
=
std
::
string
(
param_buffer
,
param_buffer
+
param_buffer_size
);
model_from_memory_
=
true
;
}
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
4a443ffc
...
...
@@ -33,6 +33,7 @@
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool
(
profile
);
...
...
@@ -59,7 +60,7 @@ bool AnalysisPredictor::Init(
if
(
FLAGS_profile
)
{
LOG
(
WARNING
)
<<
"Profiler is actived, might affect the performance"
;
LOG
(
INFO
)
<<
"You can turn off by set gflags '-profile false'"
;
auto
tracking_device
=
config_
.
use_gpu
?
platform
::
ProfilerState
::
kAll
auto
tracking_device
=
config_
.
use_gpu
()
?
platform
::
ProfilerState
::
kAll
:
platform
::
ProfilerState
::
kCPU
;
platform
::
EnableProfiler
(
tracking_device
);
}
...
...
@@ -112,7 +113,7 @@ bool AnalysisPredictor::PrepareProgram(
// Optimize the program, and load parameters and modify them in the
// scope_.
// This will change the scope_ address.
if
(
config_
.
enable_ir_optim
)
{
if
(
config_
.
ir_optim
()
)
{
status_ir_optim_enabled_
=
true
;
OptimizeInferenceProgram
();
}
else
{
...
...
@@ -140,9 +141,9 @@ bool AnalysisPredictor::PrepareProgram(
return
true
;
}
bool
AnalysisPredictor
::
CreateExecutor
()
{
if
(
config_
.
use_gpu
)
{
if
(
config_
.
use_gpu
_
)
{
status_use_gpu_
=
true
;
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device
);
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device
_id_
);
}
else
{
place_
=
paddle
::
platform
::
CPUPlace
();
}
...
...
@@ -151,7 +152,7 @@ bool AnalysisPredictor::CreateExecutor() {
}
bool
AnalysisPredictor
::
PrepareExecutor
()
{
executor_
->
Prepare
(
sub_scope_
,
*
inference_program_
,
0
,
config_
.
use_feed_fetch_ops
);
config_
.
use_feed_fetch_ops
_
);
PADDLE_ENFORCE_NOT_NULL
(
sub_scope_
);
...
...
@@ -250,7 +251,7 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
}
input
.
set_lod
(
lod
);
int
idx
=
-
1
;
if
(
config_
.
specify_input_name
)
{
if
(
config_
.
specify_input_name
_
)
{
auto
name
=
inputs
[
i
].
name
;
if
(
feed_names_
.
find
(
name
)
==
feed_names_
.
end
())
{
LOG
(
ERROR
)
<<
"feed names from program do not have name: ["
<<
name
...
...
@@ -314,22 +315,22 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
void
AnalysisPredictor
::
OptimizeInferenceProgram
()
{
status_program_optimized_
=
true
;
argument_
.
SetUseGPU
(
config_
.
use_gpu
);
argument_
.
SetGPUDeviceId
(
config_
.
device
);
argument_
.
SetUseGPU
(
config_
.
use_gpu
()
);
argument_
.
SetGPUDeviceId
(
config_
.
gpu_device_id
()
);
argument_
.
SetModelFromMemory
(
config_
.
model_from_memory_
);
// Analyze inference_program
if
(
!
config_
.
model_dir
.
empty
())
{
argument_
.
SetModelDir
(
config_
.
model_dir
);
if
(
!
config_
.
model_dir
()
.
empty
())
{
argument_
.
SetModelDir
(
config_
.
model_dir
()
);
}
else
{
PADDLE_ENFORCE
(
!
config_
.
param
_file
.
empty
(),
!
config_
.
param
s_file
()
.
empty
(),
"Either model_dir or (param_file, prog_file) should be set."
);
PADDLE_ENFORCE
(
!
config_
.
prog_file
.
empty
());
argument_
.
SetModelProgramPath
(
config_
.
prog_file
);
argument_
.
SetModelParamsPath
(
config_
.
param
_file
);
PADDLE_ENFORCE
(
!
config_
.
prog_file
()
.
empty
());
argument_
.
SetModelProgramPath
(
config_
.
prog_file
()
);
argument_
.
SetModelParamsPath
(
config_
.
param
s_file
()
);
}
if
(
config_
.
use_gpu
&&
config_
.
use_tensorrt_
)
{
if
(
config_
.
use_gpu
()
&&
config_
.
tensorrt_engine_enabled
()
)
{
argument_
.
SetUseTensorRT
(
true
);
argument_
.
SetTensorRtWorkspaceSize
(
config_
.
tensorrt_workspace_size_
);
argument_
.
SetTensorRtMaxBatchSize
(
config_
.
tensorrt_max_batchsize_
);
...
...
@@ -341,7 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
auto
passes
=
config_
.
pass_builder
()
->
AllPasses
();
if
(
!
config_
.
enable_ir_optim
)
passes
.
clear
();
if
(
!
config_
.
ir_optim
()
)
passes
.
clear
();
argument_
.
SetIrAnalysisPasses
(
passes
);
argument_
.
SetScopeNotOwned
(
const_cast
<
framework
::
Scope
*>
(
scope_
.
get
()));
Analyzer
().
Run
(
&
argument_
);
...
...
@@ -358,18 +359,26 @@ template <>
std
::
unique_ptr
<
PaddlePredictor
>
CreatePaddlePredictor
<
AnalysisConfig
,
PaddleEngineKind
::
kAnalysis
>
(
const
AnalysisConfig
&
config
)
{
VLOG
(
3
)
<<
"create AnalysisConfig"
;
if
(
config
.
use_gpu
)
{
if
(
config
.
use_gpu
()
)
{
// 1. GPU memeroy
PADDLE_ENFORCE_GT
(
config
.
fraction_of_gpu_memory
,
0.
f
,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]"
);
PADDLE_ENFORCE_GE
(
config
.
device
,
0
,
"Invalid device id %d"
,
config
.
device
);
PADDLE_ENFORCE_GT
(
config
.
memory_pool_init_size_mb
(),
0.
f
);
PADDLE_ENFORCE_GE
(
config
.
gpu_device_id
(),
0
,
"Invalid device id %d"
,
config
.
gpu_device_id
());
std
::
vector
<
std
::
string
>
flags
;
if
(
config
.
fraction_of_gpu_memory
>=
0.0
f
||
config
.
fraction_of_gpu_memory
<=
0.95
f
)
{
float
fraction_of_gpu_memory
=
config
.
fraction_of_gpu_memory_for_pool
();
if
(
fraction_of_gpu_memory
>
0.95
f
)
{
LOG
(
ERROR
)
<<
"Allocate too much memory for the GPU memory pool, assigned "
<<
config
.
memory_pool_init_size_mb
()
<<
" MB"
;
LOG
(
ERROR
)
<<
"Try to shink the value by setting AnalysisConfig::EnableGpu(...)"
;
}
if
(
fraction_of_gpu_memory
>=
0.0
f
||
fraction_of_gpu_memory
<=
0.95
f
)
{
flags
.
push_back
(
"dummpy"
);
std
::
string
flag
=
"--fraction_of_gpu_memory_to_use="
+
std
::
to_string
(
config
.
fraction_of_gpu_memory
);
std
::
to_string
(
fraction_of_gpu_memory
);
flags
.
push_back
(
flag
);
VLOG
(
3
)
<<
"set flag: "
<<
flag
;
framework
::
InitGflags
(
flags
);
...
...
@@ -443,22 +452,22 @@ bool AnalysisPredictor::ZeroCopyRun() {
bool
AnalysisPredictor
::
LoadProgramDesc
()
{
// Initialize the inference program
std
::
string
filename
;
if
(
!
config_
.
model_dir
.
empty
())
{
filename
=
config_
.
model_dir
+
"/__model__"
;
}
else
if
(
!
config_
.
prog_file
.
empty
()
&&
!
config_
.
param_file
.
empty
())
{
if
(
!
config_
.
model_dir
()
.
empty
())
{
filename
=
config_
.
model_dir
()
+
"/__model__"
;
}
else
if
(
!
config_
.
prog_file
().
empty
()
&&
!
config_
.
params_file
()
.
empty
())
{
// All parameters are saved in a single file.
// The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`.
filename
=
config_
.
prog_file
;
filename
=
config_
.
prog_file
()
;
}
else
{
if
(
config_
.
model_dir
.
empty
()
&&
config_
.
prog_file
.
empty
())
{
if
(
config_
.
model_dir
().
empty
()
&&
config_
.
prog_file
()
.
empty
())
{
LOG
(
ERROR
)
<<
"Either model_dir or (prog_file, param_file) should be set."
;
return
false
;
}
LOG
(
ERROR
)
<<
string
::
Sprintf
(
"not valid model path '%s' or program path '%s'."
,
config_
.
model_dir
,
config_
.
param
_file
);
"not valid model path '%s' or program path '%s'."
,
config_
.
model_dir
()
,
config_
.
param
s_file
()
);
return
false
;
}
...
...
@@ -478,7 +487,7 @@ bool AnalysisPredictor::LoadProgramDesc() {
proto
.
ParseFromString
(
pb_content
);
}
else
{
proto
.
ParseFromString
(
config_
.
prog_file
);
proto
.
ParseFromString
(
config_
.
prog_file
()
);
}
inference_program_
.
reset
(
new
framework
::
ProgramDesc
(
proto
));
return
true
;
...
...
@@ -508,27 +517,27 @@ bool AnalysisPredictor::LoadParameters() {
new_var
->
SetLoDLevel
(
var
->
GetLoDLevel
());
new_var
->
SetPersistable
(
true
);
if
(
!
config_
.
param
_file
.
empty
())
{
if
(
!
config_
.
param
s_file
()
.
empty
())
{
params
.
push_back
(
new_var
->
Name
());
}
else
{
// append_op
framework
::
OpDesc
*
op
=
load_block
->
AppendOp
();
op
->
SetType
(
"load"
);
op
->
SetOutput
(
"Out"
,
{
new_var
->
Name
()});
op
->
SetAttr
(
"file_path"
,
{
config_
.
model_dir
+
"/"
+
new_var
->
Name
()});
op
->
SetAttr
(
"file_path"
,
{
config_
.
model_dir
()
+
"/"
+
new_var
->
Name
()});
op
->
CheckAttrs
();
}
}
}
if
(
!
config_
.
param
_file
.
empty
())
{
if
(
!
config_
.
param
s_file
()
.
empty
())
{
// sort paramlist to have consistent ordering
std
::
sort
(
params
.
begin
(),
params
.
end
());
// append just the load_combine op
framework
::
OpDesc
*
op
=
load_block
->
AppendOp
();
op
->
SetType
(
"load_combine"
);
op
->
SetOutput
(
"Out"
,
params
);
op
->
SetAttr
(
"file_path"
,
{
config_
.
param
_file
});
op
->
SetAttr
(
"file_path"
,
{
config_
.
param
s_file
()
});
op
->
CheckAttrs
();
}
...
...
paddle/fluid/inference/api/analysis_predictor_tester.cc
浏览文件 @
4a443ffc
...
...
@@ -25,9 +25,9 @@ namespace paddle {
using
contrib
::
AnalysisConfig
;
TEST
(
AnalysisPredictor
,
analysis_off
)
{
AnalysisConfig
config
(
false
)
;
config
.
model_dir
=
FLAGS_dirname
;
config
.
enable_ir_optim
=
false
;
AnalysisConfig
config
;
config
.
SetModel
(
FLAGS_dirname
)
;
config
.
SwitchIrOptim
(
false
)
;
auto
_predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
config
);
auto
*
predictor
=
static_cast
<
AnalysisPredictor
*>
(
_predictor
.
get
());
...
...
@@ -55,14 +55,14 @@ TEST(AnalysisPredictor, analysis_off) {
}
TEST
(
AnalysisPredictor
,
analysis_on
)
{
AnalysisConfig
config
;
config
.
SetModel
(
FLAGS_dirname
);
config
.
SwitchIrOptim
(
true
);
#ifdef PADDLE_WITH_CUDA
AnalysisConfig
config
(
true
);
config
.
fraction_of_gpu_memory
=
0.15
;
config
.
EnableUseGpu
(
100
,
0
);
#else
AnalysisConfig
config
;
config
.
DisableGpu
()
;
#endif
config
.
model_dir
=
FLAGS_dirname
;
config
.
enable_ir_optim
=
true
;
auto
_predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
config
);
auto
*
predictor
=
static_cast
<
AnalysisPredictor
*>
(
_predictor
.
get
());
...
...
@@ -89,7 +89,8 @@ TEST(AnalysisPredictor, analysis_on) {
}
// compare with NativePredictor
auto
naive_predictor
=
CreatePaddlePredictor
<
NativeConfig
>
(
config
);
auto
naive_predictor
=
CreatePaddlePredictor
<
NativeConfig
>
(
config
.
ToNativeConfig
());
std
::
vector
<
PaddleTensor
>
naive_outputs
;
ASSERT_TRUE
(
naive_predictor
->
Run
(
inputs
,
&
naive_outputs
));
ASSERT_EQ
(
naive_outputs
.
size
(),
1UL
);
...
...
@@ -98,9 +99,8 @@ TEST(AnalysisPredictor, analysis_on) {
TEST
(
AnalysisPredictor
,
ZeroCopy
)
{
AnalysisConfig
config
;
config
.
model_dir
=
FLAGS_dirname
;
config
.
use_feed_fetch_ops
=
false
;
config
.
SetModel
(
FLAGS_dirname
);
config
.
SwitchUseFeedFetchOps
(
false
);
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
config
);
auto
w0
=
predictor
->
GetInputTensor
(
"firstw"
);
...
...
@@ -137,9 +137,9 @@ TEST(AnalysisPredictor, ZeroCopy) {
TEST
(
AnalysisPredictor
,
Clone
)
{
AnalysisConfig
config
;
config
.
model_dir
=
FLAGS_dirname
;
config
.
use_feed_fetch_ops
=
true
;
config
.
enable_ir_optim
=
true
;
config
.
SetModel
(
FLAGS_dirname
)
;
config
.
SwitchUseFeedFetchOps
(
true
)
;
config
.
SwitchIrOptim
(
true
)
;
std
::
vector
<
std
::
unique_ptr
<
PaddlePredictor
>>
predictors
;
predictors
.
emplace_back
(
CreatePaddlePredictor
(
config
));
...
...
paddle/fluid/inference/api/api_anakin_engine.h
浏览文件 @
4a443ffc
...
...
@@ -19,8 +19,6 @@ limitations under the License. */
#pragma once
#define WITH_ANAKIN
#include <vector>
#include "framework/core/net/net.h"
...
...
paddle/fluid/inference/api/api_impl.cc
浏览文件 @
4a443ffc
...
...
@@ -288,7 +288,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
VLOG
(
3
)
<<
"create NativePaddlePredictor"
;
if
(
config
.
use_gpu
)
{
// 1. GPU memeroy
PADDLE_ENFORCE_G
T
(
PADDLE_ENFORCE_G
E
(
config
.
fraction_of_gpu_memory
,
0.
f
,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]"
);
PADDLE_ENFORCE_GE
(
config
.
device
,
0
,
"Invalid device id %d"
,
config
.
device
);
...
...
paddle/fluid/inference/api/api_impl_tester.cc
浏览文件 @
4a443ffc
...
...
@@ -295,7 +295,8 @@ TEST(inference_api_native, image_classification_gpu) {
#endif
TEST
(
PassBuilder
,
Delete
)
{
contrib
::
AnalysisConfig
config
(
false
);
contrib
::
AnalysisConfig
config
;
config
.
DisableGpu
();
config
.
pass_builder
()
->
DeletePass
(
"attention_lstm_fuse_pass"
);
const
auto
&
passes
=
config
.
pass_builder
()
->
AllPasses
();
auto
it
=
std
::
find
(
passes
.
begin
(),
passes
.
end
(),
"attention_lstm_fuse_pass"
);
...
...
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
浏览文件 @
4a443ffc
...
...
@@ -36,12 +36,11 @@ namespace demo {
*/
void
Main
()
{
std
::
unique_ptr
<
PaddlePredictor
>
predictor
;
paddle
::
contrib
::
AnalysisConfig
config
(
true
)
;
config
.
param_file
=
FLAGS_modeldir
+
"/__params__"
;
config
.
prog_file
=
FLAGS_modeldir
+
"/__model__"
;
config
.
device
=
0
;
paddle
::
contrib
::
AnalysisConfig
config
;
config
.
EnableUseGpu
(
100
,
0
)
;
config
.
SetModel
(
FLAGS_modeldir
+
"/__params__"
,
FLAGS_modeldir
+
"/__model__"
)
;
config
.
EnableTensorRtEngine
();
config
.
fraction_of_gpu_memory
=
0.1
;
// set by yourself
predictor
=
CreatePaddlePredictor
(
config
);
VLOG
(
3
)
<<
"begin to process data"
;
...
...
paddle/fluid/inference/api/demo_ci/vis_demo.cc
浏览文件 @
4a443ffc
...
...
@@ -40,15 +40,14 @@ using contrib::AnalysisConfig;
*/
void
Main
(
bool
use_gpu
)
{
std
::
unique_ptr
<
PaddlePredictor
>
predictor
,
analysis_predictor
;
AnalysisConfig
config
(
use_gpu
);
config
.
param_file
=
FLAGS_modeldir
+
"/__params__"
;
config
.
prog_file
=
FLAGS_modeldir
+
"/__model__"
;
config
.
device
=
0
;
if
(
FLAGS_use_gpu
)
{
config
.
fraction_of_gpu_memory
=
0.1
;
// set by yourself
AnalysisConfig
config
;
if
(
use_gpu
)
{
config
.
EnableUseGpu
(
100
,
0
);
}
config
.
SetModel
(
FLAGS_modeldir
+
"/__model__"
,
FLAGS_modeldir
+
"/__params__"
);
predictor
=
CreatePaddlePredictor
<
NativeConfig
>
(
config
);
predictor
=
CreatePaddlePredictor
<
NativeConfig
>
(
config
.
ToNativeConfig
()
);
analysis_predictor
=
CreatePaddlePredictor
(
config
);
// Just a single batch of data.
...
...
paddle/fluid/inference/api/paddle_analysis_config.h
浏览文件 @
4a443ffc
...
...
@@ -34,26 +34,67 @@ class AnalysisPredictor;
namespace
contrib
{
// NOTE WIP, not stable yet.
struct
AnalysisConfig
:
public
NativeConfig
{
explicit
AnalysisConfig
(
bool
use_gpu
=
false
)
;
struct
AnalysisConfig
{
AnalysisConfig
()
=
default
;
explicit
AnalysisConfig
(
const
AnalysisConfig
&
other
);
explicit
AnalysisConfig
(
AnalysisConfig
&&
other
);
explicit
AnalysisConfig
(
const
std
::
string
&
model_dir
);
explicit
AnalysisConfig
(
const
std
::
string
&
prog_file
,
const
std
::
string
&
params_file
);
// Model path related.
void
SetModel
(
const
std
::
string
&
model_dir
)
{
model_dir_
=
model_dir
;
}
void
SetModel
(
const
std
::
string
&
prog_file_path
,
const
std
::
string
&
params_file_path
);
void
SetProgFile
(
const
std
::
string
&
x
)
{
prog_file_
=
x
;
}
void
SetParamsFile
(
const
std
::
string
&
x
)
{
params_file_
=
x
;
}
const
std
::
string
&
model_dir
()
const
{
return
model_dir_
;
}
const
std
::
string
&
prog_file
()
const
{
return
prog_file_
;
}
const
std
::
string
&
params_file
()
const
{
return
params_file_
;
}
// GPU related.
void
EnableUseGpu
(
uint64_t
memory_pool_init_size_mb
,
int
device_id
=
0
);
void
DisableGpu
();
bool
use_gpu
()
const
{
return
use_gpu_
;
}
int
gpu_device_id
()
const
{
return
device_id_
;
}
int
memory_pool_init_size_mb
()
const
{
return
memory_pool_init_size_mb_
;
}
float
fraction_of_gpu_memory_for_pool
()
const
;
// Determine whether to perform graph optimization.
bool
enable_ir_optim
=
true
;
void
SwitchIrOptim
(
int
x
=
true
)
{
enable_ir_optim_
=
x
;
}
bool
ir_optim
()
const
{
return
enable_ir_optim_
;
}
// Get a pass builder for customize the passes in IR analysis phase.
PassStrategy
*
pass_builder
()
const
;
void
SwitchUseFeedFetchOps
(
int
x
=
true
)
{
use_feed_fetch_ops_
=
x
;
}
bool
use_feed_fetch_ops_enabled
()
const
{
return
use_feed_fetch_ops_
;
}
// NOT stable yet.
bool
use_feed_fetch_ops
{
true
};
void
SwitchSpecifyInputNames
(
bool
x
=
true
)
{
specify_input_name_
=
x
;
}
bool
specify_input_name
()
const
{
return
specify_input_name_
;
}
void
EnableTensorRtEngine
(
int
workspace_size
=
1
<<
20
,
int
max_batch_size
=
1
,
int
min_subgraph_size
=
3
);
bool
use_tensorrt
()
const
{
return
use_tensorrt_
;
}
bool
tensorrt_engine_enabled
()
const
{
return
use_tensorrt_
;
}
void
SwitchIrDebug
(
int
x
=
true
)
{
ir_debug_
=
x
;
}
void
EnableMKLDNN
();
bool
use_mkldnn
()
const
{
return
use_mkldnn_
;
}
bool
mkldnn_enabled
()
const
{
return
use_mkldnn_
;
}
// Set and get the number of cpu math library threads.
void
SetCpuMathLibraryNumThreads
(
int
cpu_math_library_num_threads
);
int
cpu_math_library_num_threads
()
const
{
return
cpu_math_library_num_threads_
;
}
NativeConfig
ToNativeConfig
()
const
{
NativeConfig
config
;
config
.
model_dir
=
model_dir_
;
config
.
prog_file
=
prog_file_
;
config
.
param_file
=
params_file_
;
config
.
use_gpu
=
use_gpu_
;
config
.
device
=
device_id_
;
config
.
fraction_of_gpu_memory
=
fraction_of_gpu_memory_for_pool
();
config
.
specify_input_name
=
specify_input_name_
;
return
config
;
}
void
SetMKLDNNOp
(
std
::
unordered_set
<
std
::
string
>
op_list
)
{
mkldnn_enabled_op_types_
=
op_list
;
}
...
...
@@ -65,10 +106,29 @@ struct AnalysisConfig : public NativeConfig {
friend
class
::
paddle
::
AnalysisPredictor
;
// NOTE just for developer, not an official API, easily to be broken.
// Get a pass builder for customize the passes in IR analysis phase.
PassStrategy
*
pass_builder
()
const
;
protected:
// Update the config.
void
Update
();
std
::
string
SerializeInfoCache
();
protected:
// Model pathes.
std
::
string
model_dir_
;
std
::
string
prog_file_
;
std
::
string
params_file_
;
// GPU releated.
bool
use_gpu_
{
false
};
int
device_id_
{
0
};
uint64_t
memory_pool_init_size_mb_
{
100
};
// initial size is 100MB.
// TensorRT releated.
bool
use_tensorrt_
{
false
};
bool
use_mkldnn_
{
false
};
std
::
unordered_set
<
std
::
string
>
mkldnn_enabled_op_types_
;
// For workspace_size, refer it from here:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
int
tensorrt_workspace_size_
;
...
...
@@ -82,17 +142,24 @@ struct AnalysisConfig : public NativeConfig {
// We set this variable to control the minimum number of nodes in the
// subgraph, 3 as default value.
int
tensorrt_min_subgraph_size_
{
3
};
std
::
unique_ptr
<
PassStrategy
>
pass_builder_
;
bool
use_mkldnn_
{
false
};
std
::
unordered_set
<
std
::
string
>
mkldnn_enabled_op_types_
;
bool
model_from_memory_
{
false
};
};
// Configurations for Anakin engine.
struct
AnakinConfig
:
public
PaddlePredictor
::
Config
{
enum
TargetType
{
NVGPU
=
0
,
X86
};
int
device
;
std
::
string
model_file
;
int
max_batch_size
{
-
1
};
TargetType
target_type
;
bool
enable_ir_optim_
{
true
};
bool
use_feed_fetch_ops_
{
true
};
bool
ir_debug_
{
false
};
bool
specify_input_name_
{
false
};
int
cpu_math_library_num_threads_
{
1
};
// A runtime cache, shouldn't be transferred to others.
std
::
string
serialized_info_cache_
;
mutable
std
::
unique_ptr
<
PassStrategy
>
pass_builder_
;
};
}
// namespace contrib
...
...
paddle/fluid/inference/api/paddle_inference_api.h
浏览文件 @
4a443ffc
...
...
@@ -26,9 +26,8 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle_api.h" // NOLINT
#ifndef WITH_ANAKIN
#include "paddle_analysis_config.h" // NOLINT
#else
#include "paddle_api.h" // NOLINT
#ifdef WITH_ANAKIN
#include "paddle_anakin_config.h" // NOLINT
#endif
paddle/fluid/inference/api/paddle_pass_builder.h
浏览文件 @
4a443ffc
...
...
@@ -62,7 +62,12 @@ class PassStrategy : public PaddlePassBuilder {
// still some CPU kernels running in CPU mode.
virtual
void
EnableMKLDNN
()
=
0
;
// Accessor for the use_gpu_ flag of this strategy (the flag is
// default-initialized to false in this class).
bool use_gpu() const { return use_gpu_; }
virtual
~
PassStrategy
()
=
default
;
protected:
bool
use_gpu_
{
false
};
};
/*
...
...
@@ -88,6 +93,7 @@ class CpuPassStrategy : public PassStrategy {
"conv_eltwiseadd_bn_fuse_pass"
,
//
"is_test_pass"
,
//
});
use_gpu_
=
false
;
}
virtual
~
CpuPassStrategy
()
=
default
;
...
...
@@ -126,10 +132,14 @@ class GpuPassStrategy : public PassStrategy {
"conv_elementwise_add2_act_fuse_pass"
,
//
"conv_elementwise_add_fuse_pass"
,
//
});
use_gpu_
=
true
;
}
GpuPassStrategy
(
const
GpuPassStrategy
&
other
)
:
PassStrategy
(
other
.
AllPasses
())
{}
:
PassStrategy
(
other
.
AllPasses
())
{
use_gpu_
=
true
;
}
void
EnableMKLDNN
()
override
;
...
...
paddle/fluid/inference/tensorrt/CMakeLists.txt
浏览文件 @
4a443ffc
nv_library
(
tensorrt_engine SRCS engine.cc DEPS
${
GLOB_OPERATOR_DEPS
}
framework_proto device_context
)
nv_library
(
tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto
)
nv_test
(
test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader
)
nv_test
(
test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine
)
add_subdirectory
(
plugin
)
...
...
paddle/fluid/
platform/cuda_helper.h
→
paddle/fluid/
inference/tensorrt/op_teller.cc
浏览文件 @
4a443ffc
...
...
@@ -12,47 +12,38 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/macros.h"
#if CUDA_VERSION < 9000
enum
cublasMath_t
{
CUBLAS_DEFAULT_MATH
=
0
};
#endif
#include "paddle/fluid/inference/tensorrt/op_teller.h"
namespace
paddle
{
namespace
platform
{
class
CublasHandleHolder
{
public:
CublasHandleHolder
(
cudaStream_t
stream
,
cublasMath_t
math_type
)
{
PADDLE_ENFORCE
(
dynload
::
cublasCreate
(
&
handle_
));
PADDLE_ENFORCE
(
dynload
::
cublasSetStream
(
handle_
,
stream
));
#if CUDA_VERSION >= 9000
if
(
math_type
==
CUBLAS_TENSOR_OP_MATH
)
{
PADDLE_ENFORCE
(
dynload
::
cublasSetMathMode
(
handle_
,
CUBLAS_TENSOR_OP_MATH
));
}
#endif
}
namespace
inference
{
namespace
tensorrt
{
~
CublasHandleHolder
()
{
PADDLE_ENFORCE
(
dynload
::
cublasDestroy
(
handle_
));
}
// Just tell by the op_types.
struct
SimpleOpTypeSetTeller
:
public
Teller
{
SimpleOpTypeSetTeller
()
{}
template
<
typename
Callback
>
inline
void
Call
(
Callback
&&
callback
)
const
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
mtx_
);
callback
(
handle_
);
bool
operator
()(
const
std
::
string
&
op_type
,
const
framework
::
OpDesc
&
desc
)
override
{
return
teller_set
.
count
(
op_type
);
}
private:
DISABLE_COPY_AND_ASSIGN
(
CublasHandleHolder
);
cublasHandle_t
handle_
;
mutable
std
::
mutex
mtx_
;
std
::
unordered_set
<
std
::
string
>
teller_set
{
{
"mul"
,
"conv2d"
,
"pool2d"
,
"relu"
,
"softmax"
,
"sigmoid"
,
"depthwise_conv2d"
,
"batch_norm"
,
"concat"
,
"tanh"
,
"pad"
,
"elementwise_add"
,
"elementwise_mul"
,
"dropout"
,
"split"
,
"prelu"
,
"conv2d_transpose"
,
"leaky_relu"
}};
};
}
// namespace platform
// Consults the registered tellers in order and reports whether any of them
// accepts the op. Evaluation stops at the first teller that returns true.
bool OpTeller::Tell(const std::string& op_type,
                    const framework::OpDesc& desc) {
  for (const auto& rule : tellers_) {
    const bool accepted = (*rule)(op_type, desc);
    if (accepted) {
      return true;
    }
  }
  // No teller accepted the op (this is also the result when tellers_ is
  // empty).
  return false;
}
// Builds the teller list. The only teller registered here is
// SimpleOpTypeSetTeller.
OpTeller::OpTeller() {
  // Take ownership in a unique_ptr *before* handing the object to the
  // vector. emplace_back(new T) passes a raw pointer, so if the vector throws
  // while growing its storage the freshly allocated teller is never owned by
  // anything and leaks. std::unique_ptr is already required by the
  // declaration of tellers_, so no additional include is needed.
  tellers_.push_back(std::unique_ptr<Teller>(new SimpleOpTypeSetTeller));
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/op_teller.h
0 → 100644
浏览文件 @
4a443ffc
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
/*
* Single Op teller definition.
* One can override this and define more complex tell logic that considers
* additional information such as the op_desc.
*/
// Abstract interface for one op-acceptance rule. Concrete tellers derive from
// this struct and implement operator().
struct Teller {
  // Decides about a single op, given its type name and its OpDesc.
  // Returns true when this teller accepts the op; what "accept" means is up
  // to the concrete teller.
  virtual bool operator()(const std::string& op_type,
                          const framework::OpDesc& desc) = 0;

  // Virtual so that derived tellers are destroyed correctly through a
  // Teller pointer.
  virtual ~Teller() = default;
};
/*
* A real example:
*
* struct SomeTeller : public Teller {
* bool operator()(const std::string& op_type,
* const framework::OpDesc& desc) override {
* return op_type == "fc" && desc.Inputs().size() == 2;
* }
*};
*/
/*
* class OpTeller helps to tell whether a fluid
* operator can be transformed to a TensorRT layer.
*/
// Holds a list of Teller rules and answers, via Tell(), whether an op is
// accepted by any of them. Instances are obtained through Global().
class OpTeller {
 public:
  // Returns the shared OpTeller instance. It is constructed on the first call
  // and the same object is returned by every later call.
  static OpTeller& Global() {
    static std::unique_ptr<OpTeller> instance(new OpTeller);
    return *instance;
  }

  // Queries the registered tellers about the op identified by `op_type` and
  // `desc`. Defined out of line.
  bool Tell(const std::string& op_type, const framework::OpDesc& desc);

 private:
  // Private: the only construction site in this class is Global().
  OpTeller();

  // Tellers consulted by Tell(); each one is owned by this object.
  std::vector<std::unique_ptr<Teller>> tellers_;
};
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
4a443ffc
...
...
@@ -41,7 +41,7 @@ endfunction()
if
(
NOT APPLE AND WITH_MKLML
)
set
(
RNN1_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/rnn1"
)
download_model_and_data
(
${
RNN1_INSTALL_DIR
}
"rnn1%2Fmodel.tar.gz"
"rnn1%2Fdata.txt.tar.gz"
)
inference_analysis_api_test
(
test_analyzer_rnn1
${
RNN1_INSTALL_DIR
}
analyzer_rnn1_tester.cc
)
inference_analysis_api_test
(
test_analyzer_rnn1
${
RNN1_INSTALL_DIR
}
analyzer_rnn1_tester.cc
SERIAL
)
else
()
# TODO: fix this test on MACOS and OPENBLAS, the reason is that
# fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
...
...
@@ -56,14 +56,14 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
# normal DAM
set
(
DAM_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/dam"
)
download_model_and_data
(
${
DAM_INSTALL_DIR
}
"DAM_model.tar.gz"
"DAM_data.txt.tar.gz"
)
inference_analysis_api_test
(
test_analyzer_dam
${
DAM_INSTALL_DIR
}
analyzer_dam_tester.cc
)
inference_analysis_api_test
(
test_analyzer_dam
${
DAM_INSTALL_DIR
}
analyzer_dam_tester.cc
SERIAL
)
# small DAM
set
(
DAM_SMALL_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/small_dam"
)
download_model_and_data
(
${
DAM_SMALL_INSTALL_DIR
}
"dam_small_model.tar.gz"
"dam_small_data.txt.tar.gz"
)
inference_analysis_test
(
test_analyzer_small_dam SRCS analyzer_dam_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
DAM_SMALL_INSTALL_DIR
}
/model --infer_data=
${
DAM_SMALL_INSTALL_DIR
}
/data.txt --max_turn_num=1
)
ARGS --infer_model=
${
DAM_SMALL_INSTALL_DIR
}
/model --infer_data=
${
DAM_SMALL_INSTALL_DIR
}
/data.txt --max_turn_num=1
SERIAL
)
# chinese_ner
set
(
CHINESE_NER_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/chinese_ner"
)
...
...
@@ -111,11 +111,11 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose
# resnet50
inference_analysis_api_test_with_fake_data
(
test_analyzer_resnet50
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/resnet50"
analyzer_resnet50_tester.cc
"resnet50_model.tar.gz"
)
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/resnet50"
analyzer_resnet50_tester.cc
"resnet50_model.tar.gz"
SERIAL
)
# mobilenet with depthwise_conv op
inference_analysis_api_test_with_fake_data
(
test_analyzer_mobilenet_depthwise_conv
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/mobilenet_depthwise_conv"
analyzer_resnet50_tester.cc
"mobilenet_model.tar.gz"
)
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/mobilenet_depthwise_conv"
analyzer_resnet50_tester.cc
"mobilenet_model.tar.gz"
SERIAL
)
# anakin
if
(
WITH_ANAKIN AND WITH_MKL
)
# only needed in CI
...
...
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
浏览文件 @
4a443ffc
...
...
@@ -165,12 +165,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
void
SetConfig
(
contrib
::
AnalysisConfig
*
cfg
)
{
cfg
->
prog_file
=
FLAGS_infer_model
+
"/__model__"
;
cfg
->
param_file
=
FLAGS_infer_model
+
"/param"
;
cfg
->
use_gpu
=
false
;
cfg
->
device
=
0
;
cfg
->
specify_input_name
=
true
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
SetModel
(
FLAGS_infer_model
+
"/__model__"
,
FLAGS_infer_model
+
"/param"
);
cfg
->
SwitchSpecifyInputNames
();
cfg
->
SwitchIrOptim
(
true
);
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
...
...
paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
浏览文件 @
4a443ffc
...
...
@@ -105,11 +105,10 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
void
SetConfig
(
AnalysisConfig
*
cfg
)
{
cfg
->
model_dir
=
FLAGS_infer_model
;
cfg
->
use_gpu
=
false
;
cfg
->
device
=
0
;
cfg
->
specify_input_name
=
true
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
SetModel
(
FLAGS_infer_model
);
cfg
->
DisableGpu
();
cfg
->
SwitchSpecifyInputNames
();
cfg
->
SwitchIrOptim
();
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
...
...
paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc
浏览文件 @
4a443ffc
...
...
@@ -76,11 +76,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
void
SetConfig
(
contrib
::
AnalysisConfig
*
cfg
)
{
cfg
->
model_dir
=
FLAGS_infer_model
;
cfg
->
use_gpu
=
false
;
cfg
->
device
=
0
;
cfg
->
specify_input_name
=
true
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
SetModel
(
FLAGS_infer_model
);
cfg
->
DisableGpu
();
cfg
->
SwitchSpecifyInputNames
();
cfg
->
SwitchIrOptim
();
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
...
...
paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
浏览文件 @
4a443ffc
...
...
@@ -84,13 +84,12 @@ void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) {
cfg
->
SetModelBuffer
(
&
buffer_prog
[
0
],
buffer_prog
.
size
(),
&
buffer_param
[
0
],
buffer_param
.
size
());
}
else
{
cfg
->
prog_file
=
FLAGS_infer_model
+
"/__model__"
;
cfg
->
param_file
=
FLAGS_infer_model
+
"/param"
;
cfg
->
SetModel
(
FLAGS_infer_model
+
"/__model__"
,
FLAGS_infer_model
+
"/param"
)
;
}
cfg
->
use_gpu
=
false
;
cfg
->
device
=
0
;
cfg
->
specify_input_name
=
true
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
DisableGpu
();
cfg
->
SwitchSpecifyInputNames
();
cfg
->
SwitchIrOptim
();
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
...
...
paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
浏览文件 @
4a443ffc
...
...
@@ -21,12 +21,10 @@ namespace inference {
namespace
analysis
{
void
SetConfig
(
AnalysisConfig
*
cfg
)
{
cfg
->
param_file
=
FLAGS_infer_model
+
"/params"
;
cfg
->
prog_file
=
FLAGS_infer_model
+
"/model"
;
cfg
->
use_gpu
=
false
;
cfg
->
device
=
0
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
specify_input_name
=
true
;
cfg
->
SetModel
(
FLAGS_infer_model
+
"/model"
,
FLAGS_infer_model
+
"/params"
);
cfg
->
DisableGpu
();
cfg
->
SwitchIrOptim
();
cfg
->
SwitchSpecifyInputNames
();
cfg
->
SetCpuMathLibraryNumThreads
(
FLAGS_paddle_num_threads
);
}
...
...
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
浏览文件 @
4a443ffc
...
...
@@ -204,12 +204,10 @@ void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor,
}
void
SetConfig
(
AnalysisConfig
*
cfg
)
{
cfg
->
prog_file
=
FLAGS_infer_model
+
"/__model__"
;
cfg
->
param_file
=
FLAGS_infer_model
+
"/param"
;
cfg
->
use_gpu
=
false
;
cfg
->
device
=
0
;
cfg
->
specify_input_name
=
true
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
SetModel
(
FLAGS_infer_model
+
"/__model__"
,
FLAGS_infer_model
+
"/param"
);
cfg
->
DisableGpu
();
cfg
->
SwitchSpecifyInputNames
();
cfg
->
SwitchIrOptim
();
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
...
...
@@ -225,10 +223,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
// Easy for profiling independently.
TEST
(
Analyzer_rnn1
,
profile
)
{
contrib
::
AnalysisConfig
cfg
(
false
)
;
contrib
::
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
cfg
.
fraction_of_gpu_memory
=
0.1
;
cfg
.
pass_builder
()
->
TurnOn
Debug
();
cfg
.
DisableGpu
()
;
cfg
.
SwitchIr
Debug
();
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
...
...
@@ -293,16 +291,18 @@ TEST(Analyzer_rnn1, multi_thread) {
TEST
(
Analyzer_rnn1
,
ZeroCopy
)
{
AnalysisConfig
config
;
SetConfig
(
&
config
);
config
.
use_feed_fetch_ops
=
false
;
config
.
SwitchUseFeedFetchOps
(
false
)
;
PaddlePlace
place
;
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
config
);
config
.
use_feed_fetch_ops
=
true
;
auto
native_predictor
=
CreatePaddlePredictor
<
NativeConfig
>
(
config
);
config
.
SwitchUseFeedFetchOps
(
true
);
auto
native_predictor
=
CreatePaddlePredictor
<
NativeConfig
>
(
config
.
ToNativeConfig
());
config
.
use_feed_fetch_ops
=
true
;
// the analysis predictor needs feed/fetch.
config
.
SwitchUseFeedFetchOps
(
true
);
// the analysis predictor needs feed/fetch.
auto
analysis_predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
config
);
#define NEW_TENSOR(name__) \
...
...
@@ -362,7 +362,7 @@ TEST(Analyzer_rnn1, ZeroCopy) {
TEST
(
Analyzer_rnn1
,
ZeroCopyMultiThread
)
{
AnalysisConfig
config
;
SetConfig
(
&
config
);
config
.
use_feed_fetch_ops
=
false
;
config
.
SwitchUseFeedFetchOps
(
false
)
;
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
...
...
paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
浏览文件 @
4a443ffc
...
...
@@ -105,12 +105,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
void
SetConfig
(
AnalysisConfig
*
cfg
)
{
cfg
->
prog_file
=
FLAGS_infer_model
+
"/__model__"
;
cfg
->
param_file
=
FLAGS_infer_model
+
"/param"
;
cfg
->
use_gpu
=
false
;
cfg
->
device
=
0
;
cfg
->
specify_input_name
=
true
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
SetModel
(
FLAGS_infer_model
+
"/__model__"
,
FLAGS_infer_model
+
"/param"
);
cfg
->
DisableGpu
();
cfg
->
SwitchSpecifyInputNames
();
cfg
->
SwitchIrOptim
();
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
...
...
paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
浏览文件 @
4a443ffc
...
...
@@ -89,11 +89,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
void
SetConfig
(
AnalysisConfig
*
cfg
)
{
cfg
->
model_dir
=
FLAGS_infer_model
;
cfg
->
use_gpu
=
false
;
cfg
->
device
=
0
;
cfg
->
specify_input_name
=
true
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
SetModel
(
FLAGS_infer_model
);
cfg
->
DisableGpu
();
cfg
->
SwitchSpecifyInputNames
();
cfg
->
SwitchIrOptim
();
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
...
...
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
浏览文件 @
4a443ffc
...
...
@@ -122,12 +122,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
}
void
SetConfig
(
AnalysisConfig
*
cfg
)
{
cfg
->
param_file
=
FLAGS_infer_model
+
"/params"
;
cfg
->
prog_file
=
FLAGS_infer_model
+
"/model"
;
cfg
->
use_gpu
=
false
;
cfg
->
device
=
0
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
specify_input_name
=
true
;
cfg
->
SetModel
(
FLAGS_infer_model
+
"/model"
,
FLAGS_infer_model
+
"/params"
);
cfg
->
DisableGpu
();
cfg
->
SwitchSpecifyInputNames
();
cfg
->
pass_builder
()
->
TurnOnDebug
();
cfg
->
SetCpuMathLibraryNumThreads
(
FLAGS_paddle_num_threads
);
}
...
...
paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
浏览文件 @
4a443ffc
...
...
@@ -47,11 +47,10 @@ struct DataReader {
};
void
SetConfig
(
AnalysisConfig
*
cfg
)
{
cfg
->
model_dir
=
FLAGS_infer_model
;
cfg
->
use_gpu
=
false
;
cfg
->
device
=
0
;
cfg
->
specify_input_name
=
true
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
SetModel
(
FLAGS_infer_model
);
cfg
->
DisableGpu
();
cfg
->
SwitchSpecifyInputNames
();
cfg
->
SwitchIrOptim
();
}
void
SetInput
(
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
*
inputs
)
{
...
...
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
浏览文件 @
4a443ffc
...
...
@@ -51,12 +51,11 @@ Record ProcessALine(const std::string &line) {
}
void
SetConfig
(
AnalysisConfig
*
cfg
)
{
cfg
->
param_file
=
FLAGS_infer_model
+
"/__params__"
;
cfg
->
prog_file
=
FLAGS_infer_model
+
"/__model__"
;
cfg
->
use_gpu
=
false
;
cfg
->
device
=
0
;
cfg
->
enable_ir_optim
=
true
;
cfg
->
specify_input_name
=
true
;
cfg
->
SetModel
(
FLAGS_infer_model
+
"/__model__"
,
FLAGS_infer_model
+
"/__params__"
);
cfg
->
DisableGpu
();
cfg
->
SwitchIrDebug
();
cfg
->
SwitchSpecifyInputNames
();
// TODO(TJ): fix fusion gru
cfg
->
pass_builder
()
->
DeletePass
(
"fc_gru_fuse_pass"
);
}
...
...
paddle/fluid/inference/tests/api/config_printer.h
浏览文件 @
4a443ffc
...
...
@@ -64,19 +64,23 @@ std::ostream &operator<<(std::ostream &os,
num_spaces
++
;
os
<<
*
reinterpret_cast
<
const
NativeConfig
*>
(
&
config
);
if
(
!
config
.
model_from_memory
())
{
os
<<
GenSpaces
(
num_spaces
)
<<
"prog_file: "
<<
config
.
prog_file
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"param_file: "
<<
config
.
param_file
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"prog_file: "
<<
config
.
prog_file
()
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"param_file: "
<<
config
.
params_file
()
<<
"
\n
"
;
}
else
{
os
<<
GenSpaces
(
num_spaces
)
<<
"prog_file and param_file: load from memory
\n
"
;
}
os
<<
GenSpaces
(
num_spaces
)
<<
"enable_ir_optim: "
<<
config
.
enable_ir_optim
os
<<
GenSpaces
(
num_spaces
)
<<
"enable_ir_optim: "
<<
config
.
ir_optim
()
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"enable_ir_optim: "
<<
config
.
ir_optim
()
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"use_feed_fetch_ops: "
<<
config
.
use_feed_fetch_ops
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"use_tensorrt: "
<<
config
.
use_tensorrt
()
<<
"use_feed_fetch_ops: "
<<
config
.
use_feed_fetch_ops_enabled
()
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"use_tensorrt: "
<<
config
.
tensorrt_engine_enabled
()
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"use_mkldnn: "
<<
config
.
mkldnn_enabled
()
<<
"
\n
"
;
os
<<
GenSpaces
(
num_spaces
)
<<
"use_mkldnn: "
<<
config
.
use_mkldnn
()
<<
"
\n
"
;
num_spaces
--
;
os
<<
GenSpaces
(
num_spaces
)
<<
"}
\n
"
;
return
os
;
...
...
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
4a443ffc
...
...
@@ -328,7 +328,10 @@ void CompareNativeAndAnalysis(
const
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
&
inputs
)
{
PrintConfig
(
config
,
true
);
std
::
vector
<
PaddleTensor
>
native_outputs
,
analysis_outputs
;
TestOneThreadPrediction
(
config
,
inputs
,
&
native_outputs
,
false
);
const
auto
*
analysis_config
=
reinterpret_cast
<
const
contrib
::
AnalysisConfig
*>
(
config
);
auto
native_config
=
analysis_config
->
ToNativeConfig
();
TestOneThreadPrediction
(
&
native_config
,
inputs
,
&
native_outputs
,
false
);
TestOneThreadPrediction
(
config
,
inputs
,
&
analysis_outputs
,
true
);
CompareResult
(
analysis_outputs
,
native_outputs
);
}
...
...
paddle/fluid/inference/tests/api/trt_models_tester.cc
浏览文件 @
4a443ffc
...
...
@@ -46,22 +46,20 @@ void SetConfig<contrib::AnalysisConfig>(contrib::AnalysisConfig* config,
std
::
string
model_dir
,
bool
use_gpu
,
bool
use_tensorrt
,
int
batch_size
)
{
if
(
!
FLAGS_prog_filename
.
empty
()
&&
!
FLAGS_param_filename
.
empty
())
{
config
->
prog_file
=
model_dir
+
"/"
+
FLAGS_prog_filename
;
config
->
param_file
=
model_dir
+
"/"
+
FLAGS_param_filename
;
config
->
SetModel
(
model_dir
+
"/"
+
FLAGS_prog_filename
,
model_dir
+
"/"
+
FLAGS_param_filename
)
;
}
else
{
config
->
model_dir
=
model_dir
;
config
->
SetModel
(
model_dir
)
;
}
if
(
use_gpu
)
{
config
->
use_gpu
=
true
;
config
->
device
=
0
;
config
->
fraction_of_gpu_memory
=
0.15
;
config
->
EnableUseGpu
(
100
,
0
);
if
(
use_tensorrt
)
{
config
->
EnableTensorRtEngine
(
1
<<
10
,
batch_size
);
config
->
pass_builder
()
->
DeletePass
(
"conv_bn_fuse_pass"
);
config
->
pass_builder
()
->
DeletePass
(
"fc_fuse_pass"
);
config
->
pass_builder
()
->
TurnOnDebug
();
}
else
{
config
->
enable_ir_optim
=
true
;
config
->
SwitchIrOptim
()
;
}
}
}
...
...
@@ -77,7 +75,8 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
std
::
vector
<
PaddleTensor
>
outputs
;
if
(
use_analysis
||
use_tensorrt
)
{
contrib
::
AnalysisConfig
config
(
true
);
contrib
::
AnalysisConfig
config
;
config
.
EnableUseGpu
(
100
,
0
);
config
.
pass_builder
()
->
TurnOnDebug
();
SetConfig
<
contrib
::
AnalysisConfig
>
(
&
config
,
model_dir
,
true
,
use_tensorrt
,
FLAGS_batch_size
);
...
...
@@ -109,7 +108,8 @@ void compare(std::string model_dir, bool use_tensorrt) {
&
native_outputs
,
false
);
std
::
vector
<
PaddleTensor
>
analysis_outputs
;
contrib
::
AnalysisConfig
analysis_config
(
true
);
contrib
::
AnalysisConfig
analysis_config
;
analysis_config
.
EnableUseGpu
(
50
,
0
);
SetConfig
<
contrib
::
AnalysisConfig
>
(
&
analysis_config
,
model_dir
,
true
,
use_tensorrt
,
FLAGS_batch_size
);
TestOneThreadPrediction
(
...
...
@@ -154,9 +154,9 @@ TEST(TensorRT_mobilenet, analysis) {
TEST
(
AnalysisPredictor
,
use_gpu
)
{
std
::
string
model_dir
=
FLAGS_infer_model
+
"/"
+
"mobilenet"
;
AnalysisConfig
config
(
true
)
;
config
.
model_dir
=
model_dir
;
config
.
fraction_of_gpu_memory
=
0.15
;
AnalysisConfig
config
;
config
.
EnableUseGpu
(
100
,
0
)
;
config
.
SetModel
(
model_dir
)
;
config
.
pass_builder
()
->
TurnOnDebug
();
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
inputs_all
;
...
...
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
4a443ffc
...
...
@@ -53,7 +53,7 @@ if (WITH_GPU)
op_library
(
warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale
)
endif
()
# conv_fusion_op needs cudnn 7 above
if
(
NOT
${
CUDNN_
MAJOR_VERSION
}
VERSION_LESS 7
)
if
(
NOT
${
CUDNN_
VERSION
}
VERSION_LESS 7100
)
op_library
(
conv_fusion_op
)
file
(
APPEND
${
pybind_file
}
"USE_CUDA_ONLY_OP(conv2d_fusion);
\n
"
)
endif
()
...
...
paddle/fluid/operators/conv_mkldnn_op.cc
浏览文件 @
4a443ffc
...
...
@@ -12,6 +12,7 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include <unordered_map>
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/conv_op.h"
...
...
@@ -68,13 +69,22 @@ inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format,
}
}
template
<
typename
T
>
template
<
typename
T
,
typename
K
>
class
ConvMKLDNNOpKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
public:
// Kernel entry point. Enforces that the op runs on a CPU place, then
// dispatches on the element type T: int8_t and uint8_t are handled by
// ComputeINT8, every other type by ComputeFP32.
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
  PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                 "It must use CPUPlace.");

  // Decided purely from the template argument, so it is fixed per
  // instantiation of the kernel.
  const bool quantized_input =
      std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;

  if (quantized_input) {
    ComputeINT8(ctx);
  } else {
    ComputeFP32(ctx);
  }
}
void
ComputeFP32
(
const
paddle
::
framework
::
ExecutionContext
&
ctx
)
const
{
const
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
auto
&
dev_ctx
=
...
...
@@ -274,6 +284,271 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
output
->
set_layout
(
DataLayout
::
kMKLDNN
);
output
->
set_format
(
GetMKLDNNFormat
(
*
dst_memory_p
));
}
void
ComputeINT8
(
const
paddle
::
framework
::
ExecutionContext
&
ctx
)
const
{
const
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
paddle
::
platform
::
MKLDNNDeviceContext
>();
const
auto
&
mkldnn_engine
=
dev_ctx
.
GetEngine
();
auto
*
input
=
ctx
.
Input
<
Tensor
>
(
"Input"
);
auto
*
filter
=
ctx
.
Input
<
Tensor
>
(
"Filter"
);
auto
*
bias
=
ctx
.
HasInput
(
"Bias"
)
?
ctx
.
Input
<
Tensor
>
(
"Bias"
)
:
nullptr
;
auto
*
output
=
ctx
.
Output
<
Tensor
>
(
"Output"
);
PADDLE_ENFORCE
(
input
->
layout
()
==
DataLayout
::
kMKLDNN
&&
input
->
format
()
!=
memory
::
format
::
format_undef
,
"Wrong layout/format set for Input tensor"
);
PADDLE_ENFORCE
(
filter
->
layout
()
==
DataLayout
::
kMKLDNN
&&
filter
->
format
()
!=
memory
::
format
::
format_undef
,
"Wrong layout/format set for Filter tensor"
);
PADDLE_ENFORCE
(
input
->
dims
().
size
()
==
4
||
input
->
dims
().
size
()
==
5
,
"Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"
);
PADDLE_ENFORCE
(
filter
->
dims
().
size
()
==
4
||
filter
->
dims
().
size
()
==
5
,
"Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW"
);
if
(
bias
)
{
PADDLE_ENFORCE
(
bias
->
layout
()
==
DataLayout
::
kMKLDNN
&&
bias
->
format
()
!=
memory
::
format
::
format_undef
,
"Wrong layout/format set for Bias tensor"
);
PADDLE_ENFORCE
(
bias
->
dims
().
size
()
==
1
,
"Bias must only have 1 dimension, i.e. X"
);
}
std
::
vector
<
int
>
strides
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
std
::
vector
<
int
>
dilations
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"dilations"
);
int
groups
=
ctx
.
Attr
<
int
>
(
"groups"
);
bool
fuse_relu
=
ctx
.
Attr
<
bool
>
(
"fuse_relu"
);
bool
force_fp32_output
=
ctx
.
Attr
<
bool
>
(
"force_fp32_output"
);
bool
is_conv3d
=
strides
.
size
()
==
3U
;
// TODO(tpatejko): add support for dilation
PADDLE_ENFORCE
(
is_conv3d
?
dilations
.
size
()
==
3
&&
dilations
[
0
]
==
1
&&
dilations
[
1
]
==
1
&&
dilations
[
2
]
==
1
:
dilations
.
size
()
==
2
&&
dilations
[
0
]
==
1
&&
dilations
[
1
]
==
1
,
"dilation in convolution is not implemented yet"
);
PADDLE_ENFORCE
(
is_conv3d
!=
true
,
"int8 does not support conv3d currently"
);
const
T
*
input_data
=
input
->
data
<
T
>
();
std
::
vector
<
int
>
src_tz
=
paddle
::
framework
::
vectorize2int
(
input
->
dims
());
std
::
vector
<
int
>
weights_tz
=
paddle
::
framework
::
vectorize2int
(
filter
->
dims
());
int
g
=
std
::
max
(
groups
,
1
);
GetWeightsTz
(
weights_tz
,
g
,
is_conv3d
);
std
::
vector
<
int
>
dst_tz
=
paddle
::
framework
::
vectorize2int
(
output
->
dims
());
mkldnn
::
memory
::
data_type
src_dt
=
paddle
::
framework
::
ToMKLDNNDataType
(
input
->
type
());
auto
dst_dt
=
fuse_relu
?
paddle
::
framework
::
ToMKLDNNDataType
(
framework
::
DataTypeTrait
<
uint8_t
>::
DataType
)
:
paddle
::
framework
::
ToMKLDNNDataType
(
framework
::
DataTypeTrait
<
int8_t
>::
DataType
);
if
(
force_fp32_output
)
{
dst_dt
=
paddle
::
framework
::
ToMKLDNNDataType
(
framework
::
DataTypeTrait
<
float
>::
DataType
);
}
// Get unique name for storing MKLDNN primitives
std
::
string
key
;
key
.
reserve
(
MaxKeyLength
);
platform
::
ConvMKLDNNHandler
::
AppendKey
(
&
key
,
src_tz
,
weights_tz
,
strides
,
paddings
,
dilations
,
groups
,
src_dt
,
input
->
format
(),
dst_dt
,
ctx
.
op
().
Output
(
"Output"
));
const
std
::
string
key_conv_pd
=
key
+
"@conv_pd"
;
std
::
shared_ptr
<
mkldnn
::
convolution_forward
>
conv_p
=
nullptr
;
std
::
shared_ptr
<
mkldnn
::
memory
>
src_memory_p
=
nullptr
;
std
::
shared_ptr
<
mkldnn
::
memory
>
user_src_memory_p
=
nullptr
;
std
::
shared_ptr
<
mkldnn
::
memory
>
dst_memory_p
=
nullptr
;
std
::
vector
<
primitive
>
pipeline
;
std
::
shared_ptr
<
mkldnn
::
convolution_forward
::
primitive_desc
>
conv_pd
=
nullptr
;
std
::
shared_ptr
<
platform
::
ConvMKLDNNHandler
>
handler
=
nullptr
;
auto
prim_key
=
key
+
"@conv_p"
;
auto
dst_key
=
key
+
"@dst_mem_p"
;
auto
src_key
=
key
+
"@src_mem_p"
;
auto
user_src_key
=
key
+
"@user_src_mem_p"
;
auto
src_reorder_key
=
key
+
"@src_mem_preorder_p"
;
conv_p
=
std
::
static_pointer_cast
<
mkldnn
::
convolution_forward
>
(
dev_ctx
.
GetBlob
(
prim_key
));
if
(
conv_p
==
nullptr
||
!
is_test
)
{
const
K
*
filter_data
=
filter
->
data
<
K
>
();
auto
scale_in_data
=
ctx
.
Attr
<
float
>
(
"Scale_in"
);
auto
scale_weights_data
=
ctx
.
Attr
<
std
::
vector
<
float
>>
(
"Scale_weights"
);
auto
scale_out_data
=
force_fp32_output
?
1.0
f
:
ctx
.
Attr
<
float
>
(
"Scale_out"
);
bool
is_multi_channel
=
scale_weights_data
.
size
()
>
1
;
int
count
=
is_multi_channel
?
(
g
>
1
?
(
weights_tz
)[
1
]
*
(
weights_tz
)[
0
]
:
(
weights_tz
)[
0
])
:
1
;
std
::
vector
<
float
>
output_shift_scale
(
count
);
#pragma omp parallel for if (count > 1)
for
(
int
i
=
0
;
i
<
count
;
i
++
)
{
if
(
scale_weights_data
[
i
]
==
0.0
)
output_shift_scale
[
i
]
=
scale_out_data
;
// weights data will contain 0
// in some models, then weights
// scale couldn't be calculated
else
output_shift_scale
[
i
]
=
scale_out_data
/
(
scale_in_data
*
scale_weights_data
[
i
]);
}
auto
user_src_md
=
platform
::
MKLDNNMemDesc
({
src_tz
},
src_dt
,
input
->
format
());
auto
user_weights_md
=
platform
::
MKLDNNMemDesc
(
{
weights_tz
},
platform
::
MKLDNNGetDataType
<
K
>
(),
((
g
)
==
1
)
?
mkldnn
::
memory
::
format
::
oihw
:
mkldnn
::
memory
::
format
::
goihw
);
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
* the memory format preferred for best performance
*/
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
auto
chosen_memory_format
=
platform
::
data_format_to_memory_format
(
data_format
);
std
::
vector
<
int
>
bias_tz
;
auto
src_md
=
platform
::
MKLDNNMemDesc
(
src_tz
,
src_dt
,
chosen_memory_format
);
auto
weights_md
=
platform
::
MKLDNNMemDesc
(
weights_tz
,
memory
::
data_type
::
s8
,
chosen_memory_format
);
auto
dst_md
=
platform
::
MKLDNNMemDesc
(
dst_tz
,
dst_dt
,
chosen_memory_format
);
// create a conv primitive descriptor and save it for usage in backward
if
(
bias
)
{
bias_tz
=
paddle
::
framework
::
vectorize2int
(
bias
->
dims
());
auto
bias_md
=
platform
::
MKLDNNMemDesc
(
bias_tz
,
memory
::
data_type
::
s32
,
memory
::
format
::
x
);
conv_pd
=
ConvFwdPrimitiveDesc
(
src_md
,
weights_md
,
bias_md
,
dst_md
,
strides
,
paddings
,
mkldnn_engine
,
fuse_relu
,
output_shift_scale
,
is_test
);
}
else
{
conv_pd
=
ConvFwdPrimitiveDesc
(
src_md
,
weights_md
,
dst_md
,
strides
,
paddings
,
mkldnn_engine
,
fuse_relu
,
output_shift_scale
,
is_test
);
}
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx
.
SetBlob
(
key_conv_pd
,
conv_pd
);
handler
.
reset
(
new
platform
::
ConvMKLDNNHandler
(
conv_pd
,
dev_ctx
,
mkldnn_engine
,
key
));
// create mkldnn memory from input tensors (data/weights)
user_src_memory_p
=
handler
->
AcquireSrcMemory
(
user_src_md
,
to_void_cast
<
T
>
(
input_data
));
auto
user_weights_memory_p
=
handler
->
AcquireWeightsMemory
(
user_weights_md
,
to_void_cast
<
K
>
(
filter_data
));
// create reorder primitive if the input format is not the preferred one
src_memory_p
=
handler
->
AcquireSrcMemoryFromPrimitive
(
user_src_memory_p
,
pipeline
);
std
::
shared_ptr
<
mkldnn
::
memory
>
weights_memory_p
;
int
mask_reorder
=
is_multi_channel
?
((
g
!=
1
)
?
(
1
<<
1
)
+
(
1
<<
0
)
:
1
<<
0
)
:
0
;
weights_memory_p
=
handler
->
AcquireWeightsMemoryFromPrimitive
(
user_weights_memory_p
,
pipeline
,
is_test
,
true
,
scale_weights_data
,
mask_reorder
);
if
(
!
force_fp32_output
)
{
if
(
fuse_relu
)
{
dst_memory_p
=
platform
::
SetDstMemory
<
uint8_t
>
(
ctx
,
output
,
handler
);
}
else
{
dst_memory_p
=
platform
::
SetDstMemory
<
int8_t
>
(
ctx
,
output
,
handler
);
}
}
else
{
dst_memory_p
=
platform
::
SetDstMemory
<
float
>
(
ctx
,
output
,
handler
);
}
// create convolution op primitive
auto
scale_bias_key
=
key
+
"@scale_bias"
;
if
(
bias
)
{
const
float
*
bias_data
=
bias
->
data
<
float
>
();
auto
user_bias_md
=
platform
::
MKLDNNMemDesc
(
{
bias_tz
},
platform
::
MKLDNNGetDataType
<
float
>
(),
memory
::
format
::
x
);
auto
user_bias_memory_p
=
handler
->
AcquireBiasMemory
(
user_bias_md
,
to_void_cast
<
float
>
(
bias_data
));
std
::
shared_ptr
<
mkldnn
::
memory
>
bias_memory_p
;
int
mask_reorder
=
is_multi_channel
?
1
<<
0
:
1
;
int
count
=
is_multi_channel
?
(
g
>
1
?
(
weights_tz
)[
1
]
*
(
weights_tz
)[
0
]
:
(
weights_tz
)[
0
])
:
1
;
std
::
vector
<
float
>
scale_bias_data
(
count
);
#pragma omp parallel for if (count > 1)
for
(
int
i
=
0
;
i
<
count
;
i
++
)
{
scale_bias_data
[
i
]
=
scale_in_data
*
scale_weights_data
[
i
];
}
bias_memory_p
=
handler
->
AcquireBiasMemoryFromPrimitive
(
user_bias_memory_p
,
pipeline
,
is_test
,
true
,
scale_bias_data
,
mask_reorder
);
conv_p
=
handler
->
AcquireConvolution
(
src_memory_p
,
weights_memory_p
,
bias_memory_p
,
dst_memory_p
);
}
else
{
conv_p
=
handler
->
AcquireConvolution
(
src_memory_p
,
weights_memory_p
,
dst_memory_p
);
}
// push primitive to stream and wait until it's executed
pipeline
.
push_back
(
*
conv_p
);
}
else
{
auto
src_memory_reorder_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx
.
GetBlob
(
src_reorder_key
));
src_memory_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx
.
GetBlob
(
src_key
));
if
(
src_memory_reorder_p
)
{
user_src_memory_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx
.
GetBlob
(
user_src_key
));
user_src_memory_p
->
set_data_handle
(
to_void_cast
<
T
>
(
input_data
));
}
else
if
(
src_memory_p
)
{
src_memory_p
->
set_data_handle
(
to_void_cast
<
T
>
(
input_data
));
}
dst_memory_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx
.
GetBlob
(
dst_key
));
conv_pd
=
std
::
static_pointer_cast
<
mkldnn
::
convolution_forward
::
primitive_desc
>
(
dev_ctx
.
GetBlob
(
key_conv_pd
));
if
(
conv_pd
)
{
handler
.
reset
(
new
platform
::
ConvMKLDNNHandler
(
conv_pd
,
dev_ctx
,
mkldnn_engine
,
key
));
}
if
(
!
force_fp32_output
)
{
if
(
fuse_relu
)
{
dst_memory_p
=
platform
::
SetDstMemoryHandler
<
uint8_t
>
(
ctx
,
output
,
handler
);
}
else
{
dst_memory_p
=
platform
::
SetDstMemoryHandler
<
int8_t
>
(
ctx
,
output
,
handler
);
}
}
else
{
dst_memory_p
=
platform
::
SetDstMemoryHandler
<
float
>
(
ctx
,
output
,
handler
);
}
if
(
src_memory_reorder_p
)
{
pipeline
.
push_back
(
*
src_memory_reorder_p
);
}
pipeline
.
push_back
(
*
conv_p
);
}
// push primitive to stream and wait until it's executed
stream
(
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
output
->
set_layout
(
DataLayout
::
kMKLDNN
);
output
->
set_format
(
GetMKLDNNFormat
(
*
dst_memory_p
));
}
private:
mkldnn
::
primitive_attr
CreatePostOps
(
bool
fuse_relu
,
...
...
@@ -301,6 +576,23 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
return
conv_attr
;
}
// Builds the primitive attributes used by the scaled convolution: the output
// scales are always set, and a ReLU eltwise post-op is appended when
// `fuse_relu` is true.
//
// fuse_relu          - whether to append the eltwise_relu post-op.
// output_shift_scale - scales forwarded to set_output_scales().
// Returns the populated mkldnn::primitive_attr.
mkldnn::primitive_attr CreatePostOps(
    bool fuse_relu, const std::vector<float> output_shift_scale) const {
  mkldnn::primitive_attr attributes;

  // Mask is 0 for a single scale and (1 << 1) when several scales are given.
  int scale_mask = 0;
  if (output_shift_scale.size() > 1) {
    scale_mask = 1 << 1;
  }
  attributes.set_output_scales(scale_mask, output_shift_scale);

  mkldnn::post_ops fused_ops;
  if (fuse_relu) {
    constexpr float relu_scale = 1.0f;
    constexpr float relu_negative_slope = 0.0f;
    constexpr float relu_placeholder = 1.0f;  // beta
    fused_ops.append_eltwise(relu_scale, mkldnn::algorithm::eltwise_relu,
                             relu_negative_slope, relu_placeholder);
  }
  // The post-op list is attached even when it is empty.
  attributes.set_post_ops(fused_ops);

  return attributes;
}
std
::
unique_ptr
<
mkldnn
::
convolution_forward
::
primitive_desc
>
ConvFwdPrimitiveDesc
(
const
memory
::
desc
&
src
,
const
memory
::
desc
&
weights
,
const
memory
::
desc
&
dst
,
const
std
::
vector
<
int
>&
strides
,
...
...
@@ -325,6 +617,33 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
p_conv_pd
);
}
// Creates the forward convolution primitive descriptor (no bias) with the
// attributes produced by CreatePostOps(fuse_relu, output_shift_scale).
//
// Only the first two entries of `strides` and `paddings` are read; the same
// padding dims are passed for both padding arguments of the descriptor.
// `is_test` selects forward_scoring, otherwise forward_training is used.
// Returns an owning pointer to the newly allocated primitive_desc.
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
                     const memory::desc& dst, const std::vector<int>& strides,
                     const std::vector<int>& paddings,
                     const mkldnn::engine& engine, const bool fuse_relu,
                     const std::vector<float> output_shift_scale,
                     bool is_test) const {
  using ConvFwdPd = mkldnn::convolution_forward::primitive_desc;

  memory::dims conv_strides = {strides[0], strides[1]};
  memory::dims conv_paddings = {paddings[0], paddings[1]};

  auto prop = is_test ? mkldnn::prop_kind::forward_scoring
                      : mkldnn::prop_kind::forward_training;

  mkldnn::convolution_forward::desc fwd_desc(
      prop, mkldnn::convolution_direct, src, weights, dst, conv_strides,
      conv_paddings, conv_paddings, mkldnn::padding_kind::zero);

  mkldnn::primitive_attr attributes =
      CreatePostOps(fuse_relu, output_shift_scale);

  // Hand the freshly allocated descriptor straight to the owning pointer.
  return std::unique_ptr<ConvFwdPd>(
      new ConvFwdPd(fwd_desc, attributes, engine));
}
std
::
unique_ptr
<
mkldnn
::
convolution_forward
::
primitive_desc
>
ConvFwdPrimitiveDesc
(
const
memory
::
desc
&
src
,
const
memory
::
desc
&
weights
,
const
memory
::
desc
&
bias
,
const
memory
::
desc
&
dst
,
...
...
@@ -349,6 +668,34 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
return
std
::
unique_ptr
<
mkldnn
::
convolution_forward
::
primitive_desc
>
(
p_conv_pd
);
}
// Creates the forward convolution primitive descriptor including a bias
// memory descriptor, with the attributes produced by
// CreatePostOps(fuse_relu, output_shift_scale).
//
// Only the first two entries of `strides` and `paddings` are read; the same
// padding dims are passed for both padding arguments of the descriptor.
// `is_test` selects forward_scoring, otherwise forward_training is used.
// Returns an owning pointer to the newly allocated primitive_desc.
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
                     const memory::desc& bias, const memory::desc& dst,
                     const std::vector<int>& strides,
                     const std::vector<int>& paddings,
                     const mkldnn::engine& engine, const bool fuse_relu,
                     const std::vector<float> output_shift_scale,
                     bool is_test) const {
  using ConvFwdPd = mkldnn::convolution_forward::primitive_desc;

  memory::dims conv_strides = {strides[0], strides[1]};
  memory::dims conv_paddings = {paddings[0], paddings[1]};

  auto prop = is_test ? mkldnn::prop_kind::forward_scoring
                      : mkldnn::prop_kind::forward_training;

  mkldnn::convolution_forward::desc fwd_desc(
      prop, mkldnn::convolution_direct, src, weights, bias, dst, conv_strides,
      conv_paddings, conv_paddings, mkldnn::padding_kind::zero);

  mkldnn::primitive_attr attributes =
      CreatePostOps(fuse_relu, output_shift_scale);

  // Hand the freshly allocated descriptor straight to the owning pointer.
  return std::unique_ptr<ConvFwdPd>(
      new ConvFwdPd(fwd_desc, attributes, engine));
}
};
template
<
typename
T
>
...
...
@@ -555,7 +902,17 @@ namespace ops = paddle::operators;
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE
(
conv2d
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
FP32
,
ops
::
kConvMKLDNNFP32
,
ops
::
ConvMKLDNNOpKernel
<
float
>
);
ops
::
ConvMKLDNNOpKernel
<
float
,
float
>
);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE
(
conv2d
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
U8
,
ops
::
kConvMKLDNNFP32
,
ops
::
ConvMKLDNNOpKernel
<
uint8_t
,
float
>
);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE
(
conv2d
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
S8
,
ops
::
kConvMKLDNNFP32
,
ops
::
ConvMKLDNNOpKernel
<
int8_t
,
float
>
);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE
(
conv2d_grad
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
FP32
,
...
...
@@ -565,7 +922,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE
(
conv3d
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
FP32
,
ops
::
kConvMKLDNNFP32
,
ops
::
ConvMKLDNNOpKernel
<
float
>
);
ops
::
ConvMKLDNNOpKernel
<
float
,
float
>
);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE
(
conv3d_grad
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
FP32
,
...
...
paddle/fluid/operators/conv_op.cc
浏览文件 @
4a443ffc
...
...
@@ -98,10 +98,12 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
#endif
auto
input_data_type
=
ctx
.
Input
<
Tensor
>
(
"Input"
)
->
type
();
if
(
input_data_type
!=
framework
::
proto
::
VarType
::
INT8
&&
input_data_type
!=
framework
::
proto
::
VarType
::
UINT8
)
{
auto
filter_data_type
=
ctx
.
Input
<
Tensor
>
(
"Filter"
)
->
type
();
PADDLE_ENFORCE_EQ
(
input_data_type
,
filter_data_type
,
"input and filter data type should be consistent"
);
}
if
(
input_data_type
==
framework
::
proto
::
VarType
::
FP16
)
{
PADDLE_ENFORCE_EQ
(
library
,
framework
::
LibraryType
::
kCUDNN
,
"float16 can only be used when CUDNN is used"
);
...
...
@@ -179,6 +181,26 @@ void Conv2DOpMaker::Make() {
"whenever convolution output is as an input to residual "
"connection."
)
.
SetDefault
(
false
);
AddAttr
<
float
>
(
"Scale_in"
,
"Scale_in to be used for int8 input data."
"Only used with MKL-DNN INT8."
)
.
SetDefault
(
1.0
f
);
AddAttr
<
float
>
(
"Scale_out"
,
"Scale_out to be used for int8 output data."
"Only used with MKL-DNN INT8."
)
.
SetDefault
(
1.0
f
);
AddAttr
<
float
>
(
"Scale_in_eltwise"
,
"Scale_in_eltwise to be used for int8 eltwise input data."
"Only used with MKL-DNN INT8."
)
.
SetDefault
(
1.0
f
);
AddAttr
<
std
::
vector
<
float
>>
(
"Scale_weights"
,
"Scale_weights to be used for int8 weights data."
"Only used with MKL-DNN INT8."
)
.
SetDefault
({
1.0
f
});
AddAttr
<
bool
>
(
"force_fp32_output"
,
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8"
)
.
SetDefault
(
false
);
AddAttr
<
std
::
string
>
(
"data_format"
,
"(string, default NCHW) Only used in "
...
...
@@ -303,6 +325,9 @@ void Conv3DOpMaker::Make() {
"Defaults to
\"
NHWC
\"
. Specify the data format of the output data, "
"the input will be transformed automatically. "
)
.
SetDefault
(
"AnyLayout"
);
AddAttr
<
bool
>
(
"force_fp32_output"
,
"(bool, default false) Only used in mkldnn INT8 kernel"
)
.
SetDefault
(
false
);
// TODO(dzhwinter): need to registered layout transform function
AddAttr
<
int
>
(
"workspace_size_MB"
,
"Only used in cudnn kernel. workspace size for cudnn, in MB, "
...
...
paddle/fluid/operators/conv_op.h
浏览文件 @
4a443ffc
...
...
@@ -18,7 +18,6 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
...
...
@@ -30,6 +29,7 @@ namespace operators {
using
Tensor
=
framework
::
Tensor
;
constexpr
int
kConvMKLDNNFP32
=
1
;
constexpr
int
kConvMKLDNNINT8
=
2
;
constexpr
int
MaxKeyLength
=
256
;
// Base convolution operator definitions for other conv
// like operators to reuse the implementation.
...
...
@@ -158,10 +158,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
// to call the matrix multiplication interface.
Tensor
col_matrix
;
if
(
is_expand
)
{
auto
tmp_allocation_ptr
=
platform
::
DeviceTemporaryAllocator
::
Instance
().
Get
(
dev_ctx
).
Allocate
(
framework
::
product
(
col_shape
)
*
sizeof
(
T
));
col
=
framework
::
GetTensor
<
T
>
(
std
::
move
(
tmp_allocation_ptr
),
col_shape
);
col
=
context
.
AllocateTmpTensor
<
T
,
DeviceContext
>
(
col_shape
,
dev_ctx
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
...
...
@@ -293,10 +290,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
// to call the matrix multiplication interface.
Tensor
col_matrix
;
if
(
is_expand
)
{
auto
tmp_allocation_ptr
=
platform
::
DeviceTemporaryAllocator
::
Instance
().
Get
(
dev_ctx
).
Allocate
(
framework
::
product
(
col_shape
)
*
sizeof
(
T
));
col
=
framework
::
GetTensor
<
T
>
(
std
::
move
(
tmp_allocation_ptr
),
col_shape
);
col
=
context
.
AllocateTmpTensor
<
T
,
DeviceContext
>
(
col_shape
,
dev_ctx
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
...
...
paddle/fluid/operators/detection/density_prior_box_op.cu
浏览文件 @
4a443ffc
...
...
@@ -148,7 +148,7 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel<T> {
// blockx is multiple of 32.
int
blockx
=
std
::
min
(
static_cast
<
int64_t
>
(((
feature_width
*
num_priors
+
31
)
>>
5
)
<<
5
),
512L
);
static_cast
<
int64_t
>
(
512L
)
);
int
gridx
=
(
feature_width
*
num_priors
+
blockx
-
1
)
/
blockx
;
dim3
threads
(
blockx
,
1
);
dim3
grids
(
gridx
,
feature_height
);
...
...
paddle/fluid/operators/distributed/parameter_prefetch.cc
浏览文件 @
4a443ffc
...
...
@@ -32,7 +32,7 @@ namespace paddle {
namespace
operators
{
namespace
distributed
{
using
Tensor
=
framework
::
Tensor
;
using
LoDTensor
=
framework
::
LoD
Tensor
;
using
LoDTensor
=
framework
::
LoDTensor
;
using
SelectedRows
=
framework
::
SelectedRows
;
using
DDim
=
framework
::
DDim
;
...
...
@@ -117,6 +117,12 @@ static void MergeMultipleVarsIntoOneBySection(
auto
&
id_tensor
=
scope
->
FindVar
(
id_name
)
->
Get
<
framework
::
LoDTensor
>
();
auto
*
out_tensor
=
scope
->
FindVar
(
out_name
)
->
GetMutable
<
framework
::
LoDTensor
>
();
PADDLE_ENFORCE_GT
(
out_tensor
->
numel
(),
0
,
"When calling this method, the LoDTensor's numel must larger than zero. "
"Please check LoDTensor::Resize has been called first."
);
auto
*
out_tensor_data
=
out_tensor
->
mutable_data
<
float
>
(
id_tensor
.
place
());
bool
is_on_cpu_place
=
true
;
...
...
@@ -138,7 +144,7 @@ static void MergeMultipleVarsIntoOneBySection(
auto
row_numel
=
dims
[
1
];
for
(
size
_t
i
=
0
;
i
<
dims
[
0
];
++
i
)
{
for
(
int64
_t
i
=
0
;
i
<
dims
[
0
];
++
i
)
{
auto
id
=
ids_in_this_section
[
i
];
auto
origin_id
=
id
+
abs_sections
[
section_idx
];
auto
&
offsets
=
id_to_offset
[
origin_id
];
...
...
@@ -172,8 +178,9 @@ void prefetch(const std::string& id_name, const std::string& out_name,
const
std
::
vector
<
std
::
string
>&
table_names
,
const
std
::
vector
<
std
::
string
>&
epmap
,
const
std
::
vector
<
int
>&
height_sections
,
const
framework
::
ExecutionContext
&
context
)
{
auto
&
local_scope
=
context
.
scope
().
NewScope
();
const
framework
::
ExecutionContext
&
context
,
const
framework
::
Scope
&
scope
)
{
auto
&
local_scope
=
scope
.
NewScope
();
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
cpu_ctx
=
*
pool
.
Get
(
platform
::
CPUPlace
());
...
...
@@ -190,11 +197,11 @@ void prefetch(const std::string& id_name, const std::string& out_name,
out_var_names
.
push_back
(
out_name
+
"@"
+
epmap
[
i
]);
}
auto
&
id_tensor
=
local_
scope
.
FindVar
(
id_name
)
->
Get
<
framework
::
LoDTensor
>
();
auto
&
id_tensor
=
scope
.
FindVar
(
id_name
)
->
Get
<
framework
::
LoDTensor
>
();
std
::
vector
<
int64_t
>
ids_vector
;
if
(
platform
::
is_cpu_place
(
id_tensor
.
place
()))
{
auto
*
id_data
=
id_tensor
.
data
<
int64_t
>
();
for
(
size
_t
i
=
0
;
i
<
id_tensor
.
numel
();
++
i
)
{
for
(
int64
_t
i
=
0
;
i
<
id_tensor
.
numel
();
++
i
)
{
ids_vector
.
push_back
(
id_data
[
i
]);
}
}
else
{
...
...
@@ -202,7 +209,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
PADDLE_THROW
(
"paddle is not compiled with CUDA!"
);
#else
auto
cpu_place
=
platform
::
CPUPlace
();
framework
::
Tensor
cpu_tensor
;
framework
::
LoD
Tensor
cpu_tensor
;
auto
*
cpu_tensor_data
=
cpu_tensor
.
mutable_data
<
int64_t
>
(
id_tensor
.
dims
(),
cpu_place
);
auto
stream
=
...
...
@@ -246,8 +253,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
MergeMultipleVarsIntoOneBySection
(
id_name
,
ids_vector
,
out_name
,
out_var_names
,
height_sections
,
splited_ids
,
context
,
&
local_scope
,
&
actual_ctx
);
context
.
scope
().
DeleteScope
(
&
local_scope
);
scope
.
DeleteScope
(
&
local_scope
);
}
};
// namespace distributed
...
...
paddle/fluid/operators/distributed/parameter_prefetch.h
浏览文件 @
4a443ffc
...
...
@@ -27,7 +27,56 @@ void prefetch(const std::string& id_name, const std::string& out_name,
const
std
::
vector
<
std
::
string
>&
table_names
,
const
std
::
vector
<
std
::
string
>&
epmap
,
const
std
::
vector
<
int
>&
height_sections
,
const
framework
::
ExecutionContext
&
context
);
const
framework
::
ExecutionContext
&
context
,
const
framework
::
Scope
&
scope
);
// Runs prefetch() and then scatters the prefetched rows back into `original`.
//
// After prefetch() returns, row i of the tensor stored under `out_name` is
// copied into row ids[i] of `original`, where `ids` is the tensor stored under
// `id_name`. The row width is original->numel() / original->dims()[0].
//
// When `ids` lives on a CPU place the rows are copied with std::memcpy;
// otherwise memory::Copy is used on the CUDA stream of the device context
// for context.GetPlace(), and a build without CUDA throws.
template <typename T>
void prefetch_with_reconstruct(const std::string& id_name,
                               const std::string& out_name,
                               const std::vector<std::string>& table_names,
                               const std::vector<std::string>& epmap,
                               const std::vector<int>& height_sections,
                               const framework::ExecutionContext& context,
                               const framework::Scope& scope,
                               framework::LoDTensor* original) {
  prefetch(id_name, out_name, table_names, epmap, height_sections, context,
           scope);

  auto& fetched = scope.FindVar(out_name)->Get<framework::LoDTensor>();
  auto& ids = scope.FindVar(id_name)->Get<framework::LoDTensor>();
  auto* dst_base = original->data<T>();
  auto* src_base = fetched.data<T>();
  size_t original_width = original->numel() / original->dims()[0];

  if (platform::is_cpu_place(ids.place())) {
    for (int64_t i = 0; i < ids.numel(); i++) {
      const T* src_row = src_base + original_width * i;
      T* dst_row = dst_base + original_width * ids.data<int64_t>()[i];
      std::memcpy(dst_row, src_row, original_width * sizeof(T));
    }
    return;
  }

#ifndef PADDLE_WITH_CUDA
  PADDLE_THROW("paddle is not compiled with CUDA!");
#else
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto& actual_ctx = *pool.Get(context.GetPlace());
  // NOTE(review): ids.data<int64_t>()[i] is dereferenced on the host here even
  // though `ids` is not on a CPU place in this branch -- confirm this is valid.
  for (int64_t i = 0; i < ids.numel(); i++) {
    const T* src_row = src_base + original_width * i;
    T* dst_row = dst_base + original_width * ids.data<int64_t>()[i];
    auto stream =
        static_cast<platform::CUDADeviceContext*>(&actual_ctx)->stream();
    memory::Copy(boost::get<platform::CUDAPlace>(ids.place()), dst_row,
                 platform::CPUPlace(), src_row, original_width * sizeof(T),
                 stream);
  }
#endif
}
};
// namespace distributed
};
// namespace operators
...
...
paddle/fluid/operators/fused/CMakeLists.txt
浏览文件 @
4a443ffc
...
...
@@ -2,7 +2,9 @@ include(operators)
register_operators
(
EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op
)
if
(
WITH_GPU
)
op_library
(
fusion_transpose_flatten_concat_op
)
op_library
(
fusion_conv_inception_op
)
file
(
APPEND
${
pybind_file
}
"USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);
\n
"
)
if
(
NOT
${
CUDNN_VERSION
}
VERSION_LESS 7100
)
op_library
(
fusion_conv_inception_op
)
file
(
APPEND
${
pybind_file
}
"USE_CUDA_ONLY_OP(conv2d_inception_fusion);
\n
"
)
endif
()
endif
()
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
0 → 100644
浏览文件 @
4a443ffc
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h"
#include "paddle/fluid/framework/var_type_inference.h"
namespace
paddle
{
namespace
operators
{
// Operator definition for fused_embedding_seq_pool: validates the inputs
// ("W", "Ids"), the "combiner" attribute, and infers the shape of "Out".
class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks that W is 2-D, that the last dimension of Ids is 1 and that the
  // combiner is "sum", then sets the shape of Out to
  //   [number of sequences in Ids' LoD, W.dims[1] * prod(Ids.dims[1:])]
  // at run time, or [-1, same second dim] at compile time.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("W"),
                   "Input W of FusedEmbeddingSeqPoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Ids"),
                   "Input Ids of FusedEmbeddingSeqPoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output of FusedEmbeddingSeqPoolOp should not be null.");

    auto table_dims = ctx->GetInputDim("W");
    auto ids_dims = ctx->GetInputDim("Ids");
    const std::string& combiner = ctx->Attrs().Get<std::string>("combiner");

    PADDLE_ENFORCE_EQ(table_dims.size(), 2);
    PADDLE_ENFORCE_GE(ids_dims.size(), 1,
                      "The dim size of the 'Ids' tensor must greater than 1.");
    PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1,
                      "The last dimension of the 'Ids' tensor must be 1.");
    // we only support sum now
    PADDLE_ENFORCE_EQ(combiner, "sum");

    // Second output dim: table width times every Ids dim except the first.
    int64_t last_dim = table_dims[1];
    for (int i = 1; i != ids_dims.size(); ++i) {
      last_dim *= ids_dims[i];
    }

    if (ctx->IsRuntime()) {
      framework::Variable* ids_var =
          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Ids")[0]);
      const auto& ids_lod = ids_var->Get<LoDTensor>().lod();

      // At run time the LoD of Ids must have exactly one level. The equality
      // form is required: plain PADDLE_ENFORCE would treat ids_lod.size() as
      // the condition and accept any non-empty LoD.
      PADDLE_ENFORCE_EQ(ids_lod.size(), 1UL,
                        "The LoD level of Input(Ids) must be 1");
      PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty");

      int64_t batch_size = ids_lod[0].size() - 1;

      // in run time, the shape from Ids -> output
      // should be [seq_length, 1] -> [batch_size, embedding_size]
      ctx->SetOutputDim("Out", framework::make_ddim({batch_size, last_dim}));
    } else {
      // in compile time, the lod level of ids must be 1
      framework::VarDesc* ids_desc =
          boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Ids")[0]);
      PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1);

      // in compile time, the shape from Ids -> output
      // should be [-1, 1] -> [-1, embedding_size]
      ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim}));
    }
  }

 protected:
  // The kernel data type follows the data type of the input variable "W".
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
    return framework::OpKernelType(data_type, ctx.device_context());
  }
};
// Declares the proto of the fused_embedding_seq_pool operator: inputs "W" and
// "Ids", output "Out", and the attributes "combiner", "grad_inplace" and
// "is_sparse", followed by the operator's documentation comment.
class
FusedEmbeddingSeqPoolOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
// Registers every input, output and attribute with its description.
void
Make
()
override
{
AddInput
(
"W"
,
"(Tensor) The input represents embedding tensors, "
"which is a learnable parameter."
);
AddInput
(
"Ids"
,
"An input with type int32 or int64 "
"contains the ids to be looked up in W. "
"The last dimension size must be 1."
);
AddOutput
(
"Out"
,
"The lookup results, which have the same type as W."
);
// "combiner" defaults to "sum".
AddAttr
<
std
::
string
>
(
"combiner"
,
"(string, default sum) "
"A string specifying the reduction op. Currently sum "
"are supported, sum computes the weighted sum of the "
"embedding results for each row."
)
.
SetDefault
(
"sum"
);
// NOTE(minqiyang): grad_inplace is a temporary attribute,
// please do NOT set this attribute in the python layer.
AddAttr
<
bool
>
(
"grad_inplace"
,
"(boolean, default false) "
"If the grad op reuse the input's variable."
)
.
SetDefault
(
false
);
// "is_sparse" defaults to false.
AddAttr
<
bool
>
(
"is_sparse"
,
"(boolean, default false) "
"Sparse update."
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
FusedEmbeddingSeqPool Operator.
Computes embeddings for the given ids and weights.
This operator is used to perform lookups on the parameter W,
then computes the weighted sum of the lookups results for each row
and concatenated into a dense tensor.
The input Ids should carry the LoD (Level of Details) information.
And the output will change the LoD information with input Ids.
)DOC"
);
}
};
// Gradient op description maker for fused_embedding_seq_pool. It reuses the
// constructors of DefaultGradOpDescMaker<true> and only overrides the name of
// the gradient operator.
class
FusedEmbeddingSeqPoolOpGradDescMaker
:
public
framework
::
DefaultGradOpDescMaker
<
true
>
{
using
::
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>::
DefaultGradOpDescMaker
;
protected:
// Returns the type name of the gradient operator.
virtual
std
::
string
GradOpType
()
const
{
return
"fused_embedding_seq_pool_grad"
;
}
};
// Gradient operator of fused_embedding_seq_pool.
class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // The gradient of W is given the same dims as the input W.
  void InferShape(framework::InferShapeContext* ctx) const override {
    const auto w_dims = ctx->GetInputDim("W");
    const auto w_grad_name = framework::GradVarName("W");
    ctx->SetOutputDim(w_grad_name, w_dims);
  }

 protected:
  // The kernel data type follows the data type of the input variable "W".
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::GetDataTypeOfVar(ctx.InputVar("W")), ctx.device_context());
  }
};
// Chooses the variable type of the W gradient of
// fused_embedding_seq_pool_grad: SELECTED_ROWS when the "is_sparse" attribute
// is true, LOD_TENSOR otherwise. The gradient's data type is copied from the
// block variable named "W".
class FusedEmbeddingSeqPoolOpGradVarTypeInference
    : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
                  framework::BlockDesc* block) const override {
    auto w_grad_var_name =
        op_desc.Output(framework::GradVarName("W")).front();
    auto sparse_attr = op_desc.GetAttr("is_sparse");
    const bool dense = !boost::get<bool>(sparse_attr);

    if (dense) {
      VLOG(3) << "fused_embedding_seq_pool_grad op "
              << framework::GradVarName("W") << " is set to LoDTensor";
      block->Var(w_grad_var_name)
          ->SetType(framework::proto::VarType::LOD_TENSOR);
    } else {
      VLOG(3) << "fused_embedding_seq_pool_grad op "
              << framework::GradVarName("W") << " is set to SelectedRows";
      block->Var(w_grad_var_name)
          ->SetType(framework::proto::VarType::SELECTED_ROWS);
    }

    block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType());
  }
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
fused_embedding_seq_pool
,
ops
::
FusedEmbeddingSeqPoolOp
,
ops
::
FusedEmbeddingSeqPoolOpGradDescMaker
,
ops
::
FusedEmbeddingSeqPoolOpMaker
);
REGISTER_OPERATOR
(
fused_embedding_seq_pool_grad
,
ops
::
FusedEmbeddingSeqPoolOpGrad
,
ops
::
FusedEmbeddingSeqPoolOpGradVarTypeInference
);
REGISTER_OP_CPU_KERNEL
(
fused_embedding_seq_pool
,
ops
::
FusedEmbeddingSeqPoolKernel
<
float
>
,
ops
::
FusedEmbeddingSeqPoolKernel
<
double
>
);
REGISTER_OP_CPU_KERNEL
(
fused_embedding_seq_pool_grad
,
ops
::
FusedEmbeddingSeqPoolGradKernel
<
float
>
,
ops
::
FusedEmbeddingSeqPoolGradKernel
<
double
>
);
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
0 → 100644
浏览文件 @
4a443ffc
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
LoDTensor
=
framework
::
LoDTensor
;
using
SelectedRows
=
framework
::
SelectedRows
;
using
DDim
=
framework
::
DDim
;
// Looks up rows of the embedding table and sums them per sequence.
//
// For every sequence i described by the first LoD level of `ids_t`, the table
// rows selected by the first element of the sequence are copied into output
// row i (VCOPY), and the rows selected by the remaining elements are
// accumulated onto it (AXPY with alpha = 1). `ids_count` is
// ids_t->numel() / ids_lod.back(), i.e. the number of ids per LoD step; each
// of them owns its own `row_width` slice inside the output row.
//
// context  - execution context; provides the place for the output allocation
//            and the BLAS handle.
// table_t  - embedding table; dims()[0] rows of dims()[1] values.
// ids_t    - int64 ids with LoD; every id must be in [0, table rows).
// output_t - destination; dims()[1] is used as the stride between sequences.
template <typename T>
struct EmbeddingVSumFunctor {
  void operator()(const framework::ExecutionContext& context,
                  const LoDTensor* table_t, const LoDTensor* ids_t,
                  LoDTensor* output_t) {
    auto* table = table_t->data<T>();
    int64_t row_number = table_t->dims()[0];
    int64_t row_width = table_t->dims()[1];
    int64_t last_dim = output_t->dims()[1];
    const int64_t* ids = ids_t->data<int64_t>();
    auto ids_lod = ids_t->lod()[0];
    int64_t ids_count = ids_t->numel() / ids_lod.back();

    auto* output = output_t->mutable_data<T>(context.GetPlace());

    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
    for (int64_t i = 0; i != ids_lod.size() - 1; ++i) {
      size_t begin = ids_lod[i] * ids_count;
      // Initialise the output row from the first element of the sequence.
      for (int64_t j = 0; j != ids_count; ++j) {
        // Validate the id that is actually dereferenced below
        // (ids[begin + j]), not only the first one of the group.
        PADDLE_ENFORCE_LT(ids[begin + j], row_number);
        PADDLE_ENFORCE_GE(ids[begin + j], 0, "ids %d", i);
        blas.VCOPY(row_width, table + ids[begin + j] * row_width,
                   output + i * last_dim + j * row_width);
      }

      // Accumulate the remaining elements of the sequence.
      for (int64_t r = (ids_lod[i] + 1) * ids_count;
           r < ids_lod[i + 1] * ids_count; ++r) {
        PADDLE_ENFORCE_LT(ids[r], row_number);
        PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i);
        blas.AXPY(row_width, 1., table + ids[r] * row_width,
                  output + i * last_dim + (r % ids_count) * row_width);
      }
    }
  }
};
// CPU forward kernel of fused_embedding_seq_pool. Fetches "Ids", "Out" and
// "W" and, when the "combiner" attribute is "sum", delegates to
// EmbeddingVSumFunctor<T>. Any other combiner value results in no work.
template <typename T>
class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const LoDTensor* ids = context.Input<LoDTensor>("Ids");   // int tensor
    LoDTensor* out = context.Output<LoDTensor>("Out");        // float tensor
    const LoDTensor* table = context.Input<LoDTensor>("W");

    const std::string& combiner = context.Attr<std::string>("combiner");
    if (combiner != "sum") {
      return;
    }

    EmbeddingVSumFunctor<T> sum_rows;
    sum_rows(context, table, ids, out);
  }
};
// CPU backward kernel of fused_embedding_seq_pool.
//
// Only the sparse path ("is_sparse" == true) is implemented: the W gradient is
// a SelectedRows whose rows are the ids themselves, and every id of sequence i
// receives a copy of row i of the Out gradient. For the dense case an error is
// logged and nothing is computed.
template <typename T>
class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    // W may be stored either as a LoDTensor or as SelectedRows; only its dims
    // are needed here.
    auto* w_var = context.InputVar("W");
    DDim w_dims;
    if (w_var->IsType<LoDTensor>()) {
      w_dims = context.Input<LoDTensor>("W")->dims();
    } else if (w_var->IsType<SelectedRows>()) {
      auto* w_rows = context.Input<SelectedRows>("W");
      w_dims = w_rows->value().dims();
    } else {
      PADDLE_THROW(
          "The parameter W of a LookupTable "
          "must be either LoDTensor or SelectedRows");
    }

    // Since paddings are not trainable and fixed in forward, the gradient of
    // paddings makes no sense and we don't deal with it in backward.
    const bool sparse = context.Attr<bool>("is_sparse");
    if (!sparse) {
      LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now";
      return;
    }

    auto* ids = context.Input<LoDTensor>("Ids");
    auto* out_grad = context.Input<LoDTensor>(framework::GradVarName("Out"));
    auto* w_grad = context.Output<SelectedRows>(framework::GradVarName("W"));

    auto* ids_ptr = ids->data<int64_t>();
    int64_t ids_total = ids->numel();
    auto seq_offsets = ids->lod()[0];
    int64_t row_width = out_grad->dims()[1];

    // The selected rows of the gradient are exactly the looked-up ids.
    framework::Vector<int64_t>* grad_rows = w_grad->mutable_rows();
    grad_rows->resize(ids_total);
    std::memcpy(&(*grad_rows)[0], ids_ptr, ids_total * sizeof(int64_t));

    // NOTE(review): the value tensor is sized with w_dims[1] while rows are
    // written with stride out_grad->dims()[1] -- confirm the two always match.
    auto* w_grad_value = w_grad->mutable_value();
    w_grad_value->Resize({ids_total, w_dims[1]});
    T* w_grad_ptr = w_grad_value->mutable_data<T>(context.GetPlace());
    const T* out_grad_ptr = out_grad->data<T>();

    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
    const int seq_num = static_cast<int>(seq_offsets.size()) - 1;
    for (int i = 0; i < seq_num; ++i) {
      int64_t seq_len =
          static_cast<int64_t>(seq_offsets[i + 1] - seq_offsets[i]);
      int64_t dst_offset = seq_offsets[i] * row_width;
      const T* src_row = out_grad_ptr + i * row_width;
      T* dst_rows = w_grad_ptr + dst_offset;
      // Broadcast the gradient row of sequence i to each of its ids.
      for (int r = 0; r != seq_len; ++r) {
        blas.VCOPY(row_width, src_row, dst_rows + r * row_width);
      }
    }
  }
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
浏览文件 @
4a443ffc
...
...
@@ -21,7 +21,7 @@ DECLARE_uint64(conv_workspace_size_limit);
namespace
paddle
{
namespace
operators
{
#if CUDNN_VERSION >= 7
001
#if CUDNN_VERSION >= 7
100
using
Tensor
=
framework
::
Tensor
;
using
ScopedTensorDescriptor
=
platform
::
ScopedTensorDescriptor
;
using
ScopedFilterDescriptor
=
platform
::
ScopedFilterDescriptor
;
...
...
@@ -264,7 +264,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
}
// namespace operators
}
// namespace paddle
#if CUDNN_VERSION >= 7
001
#if CUDNN_VERSION >= 7
100
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
conv2d_inception_fusion
,
ops
::
CUDNNConvInceptionFusionOpKernel
<
float
>
,
...
...
paddle/fluid/operators/hierarchical_sigmoid_op.cc
浏览文件 @
4a443ffc
...
...
@@ -67,6 +67,11 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"PreOut"
),
"Output(PreOut) should not be null."
);
auto
with_prefetch
=
ctx
->
Attrs
().
Get
<
bool
>
(
"remote_prefetch"
);
if
(
with_prefetch
)
{
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"W_Out"
),
"Output(W_Out) should not be null."
);
}
const
int64_t
batch_size
=
ctx
->
GetInputDim
(
"X"
)[
0
];
std
::
vector
<
int64_t
>
output_shape
({
batch_size
,
1
});
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
output_shape
));
...
...
@@ -95,7 +100,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput
(
"Label"
,
"(LoDTensor, required), The labels of training data. It's a"
"tensor with shape [N, 1]."
);
AddInput
(
"PTable"
,
AddInput
(
"P
ath
Table"
,
"(LoDTensor, optional), The Path Table from root to current word"
"it should have shape like [N, L], L is the length of the Path"
)
.
AsDispensable
();
...
...
@@ -119,8 +124,30 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
"[batch_size, code_length], where code_length represents the "
"maximum path length from root to leaf nodes."
)
.
AsIntermediate
();
AddOutput
(
"W_Out"
,
"(LoDTensor, optinal) using input 'W' as Output to make it mutable"
"When we are using prefetch"
)
.
AsIntermediate
();
AddAttr
<
AttrType
>
(
"num_classes"
,
"(int, optional), The number of classes"
)
.
SetDefault
(
2
);
// for parameter prefetch
AddAttr
<
bool
>
(
"remote_prefetch"
,
""
).
SetDefault
(
false
);
AddAttr
<
int
>
(
"trainer_id"
,
"trainer id from 0 ~ worker_num."
).
SetDefault
(
0
);
AddAttr
<
std
::
vector
<
int
>>
(
"height_sections"
,
"Height for each output SelectedRows."
)
.
SetDefault
(
std
::
vector
<
int
>
({}));
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"epmap"
,
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input variables for mapping"
)
.
SetDefault
({});
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"table_names"
,
"(string vector, the splited table names that will be fetched from "
"parameter server)"
"in the order of input variables for mapping"
)
.
SetDefault
({});
AddComment
(
R"DOC(
The hierarchical sigmoid operator organize the classes into a binary tree.
At each node, a sigmoid function is used to calculate the probability of
...
...
@@ -189,24 +216,18 @@ class HierarchicalSigmoidGradOpGradVarTypeInference
<<
" is set to SelectedRows"
;
block
->
Var
(
w_grad_var_name
)
->
SetType
(
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
if
(
hasBias
)
{
VLOG
(
30
)
<<
"hierarchical_sigmoid_grad op "
<<
framework
::
GradVarName
(
"Bias"
)
<<
" is set to SelectedRows"
;
block
->
Var
(
bias_grad_var_name
)
->
SetType
(
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
}
}
else
{
VLOG
(
30
)
<<
"hierarchical_sigmoid_grad op "
<<
framework
::
GradVarName
(
"W"
)
<<
" is set to LoDTensor"
;
block
->
Var
(
w_grad_var_name
)
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
}
if
(
hasBias
)
{
VLOG
(
30
)
<<
"hierarchical_sigmoid_grad op "
<<
framework
::
GradVarName
(
"Bias"
)
<<
" is set to LoDTensor"
;
block
->
Var
(
bias_grad_var_name
)
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
}
}
block
->
Var
(
w_grad_var_name
)
->
SetDataType
(
block
->
Var
(
"W"
)
->
GetDataType
());
}
};
...
...
paddle/fluid/operators/hierarchical_sigmoid_op.h
浏览文件 @
4a443ffc
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <iostream>
#include <iterator>
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h"
...
...
@@ -24,6 +26,10 @@ limitations under the License. */
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/transform.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace
paddle
{
namespace
operators
{
...
...
@@ -34,8 +40,9 @@ using platform::Transform;
static
std
::
vector
<
int64_t
>
PathToRows
(
const
framework
::
LoDTensor
&
path
)
{
std
::
set
<
int64_t
>
rows
;
const
int64_t
*
paths
=
path
.
data
<
int64_t
>
();
for
(
int64_t
i
=
0
;
i
<
path
.
numel
();
++
i
)
{
int64_t
row
=
path
.
data
<
int64_t
>
()
[
i
];
int64_t
row
=
path
s
[
i
];
if
(
row
<
0
)
{
continue
;
}
...
...
@@ -49,13 +56,54 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
&
in
=
detail
::
Ref
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"X"
));
auto
&
w
=
detail
::
Ref
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"W"
));
auto
*
path
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"PTable"
);
auto
*
path
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"P
ath
Table"
);
auto
*
code
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"PathCode"
);
auto
&
label
=
detail
::
Ref
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Label"
));
auto
*
bias
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Bias"
);
auto
*
out
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
"Out"
);
auto
*
pre_out
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
"PreOut"
);
size_t
num_classes
=
static_cast
<
size_t
>
(
ctx
.
Attr
<
int
>
(
"num_classes"
));
// for remote prefetch
auto
epmap
=
ctx
.
Attr
<
std
::
vector
<
std
::
string
>>
(
"epmap"
);
if
(
!
epmap
.
empty
())
{
// if epmap is not empty, then the parameter will be fetched from remote
// parameter
// server
auto
height_sections
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"height_sections"
);
auto
table_names
=
ctx
.
Attr
<
std
::
vector
<
std
::
string
>>
(
"table_names"
);
std
::
vector
<
int64_t
>
real_rows
=
PathToRows
(
*
path
);
framework
::
Scope
&
local_scope
=
ctx
.
scope
().
NewScope
();
auto
*
ids
=
local_scope
.
Var
(
"Ids@Prefetch"
);
auto
*
x_tensor
=
ids
->
GetMutable
<
framework
::
LoDTensor
>
();
x_tensor
->
mutable_data
<
int64_t
>
(
framework
::
make_ddim
({
static_cast
<
int64_t
>
(
real_rows
.
size
()),
1
}),
ctx
.
GetPlace
());
// copy.
std
::
memcpy
(
x_tensor
->
data
<
int64_t
>
(),
real_rows
.
data
(),
real_rows
.
size
()
*
sizeof
(
int64_t
));
framework
::
DDim
w_dims
=
ctx
.
Input
<
Tensor
>
(
"W"
)
->
dims
();
w_dims
[
0
]
=
x_tensor
->
dims
()[
0
];
auto
*
w_tensor
=
local_scope
.
Var
(
"W@Prefetch"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
w_tensor
->
Resize
(
w_dims
);
#ifdef PADDLE_WITH_DISTRIBUTE
// w_Out is set to used by prefetch, never change it in other cases
auto
*
w_out
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
"W_Out"
);
operators
::
distributed
::
prefetch_with_reconstruct
<
T
>
(
"Ids@Prefetch"
,
"W@Prefetch"
,
table_names
,
epmap
,
height_sections
,
ctx
,
local_scope
,
w_out
);
#else
PADDLE_THROW
(
"paddle is not compiled with distribute support, can not do "
"parameter prefetch!"
);
#endif
}
bool
is_custom
=
false
;
if
(
path
)
{
is_custom
=
true
;
...
...
@@ -116,9 +164,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
&
in
=
detail
::
Ref
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"X"
));
auto
&
w
=
detail
::
Ref
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"W"
));
auto
*
path
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"PTable"
);
auto
*
path
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"P
ath
Table"
);
auto
*
code
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"PathCode"
);
auto
*
bias
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Bias"
);
auto
*
in_grad
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
framework
::
GradVarName
(
"X"
));
bool
is_sparse
=
ctx
.
Attr
<
bool
>
(
"is_sparse"
);
...
...
@@ -173,8 +220,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
}
// TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
// be consistent with the clipping in forward.
if
(
!
is_sparse
)
{
auto
*
bias_grad
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
framework
::
GradVarName
(
"Bias"
));
if
(
bias_grad
)
{
...
...
@@ -182,6 +227,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
zero
(
dev_ctx
,
bias_grad
,
static_cast
<
T
>
(
0.0
));
bit_code
->
AddGrad
(
pre_out_grad
,
bias_grad
);
}
if
(
!
is_sparse
)
{
auto
*
w_grad
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
framework
::
GradVarName
(
"W"
));
w_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
...
...
@@ -200,21 +246,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
w_grad_value
->
mutable_data
<
T
>
(
temp_dim
,
ctx
.
GetPlace
());
zero
(
dev_ctx
,
w_grad_value
,
static_cast
<
T
>
(
0.0
));
auto
*
bias_grad
=
ctx
.
Output
<
framework
::
SelectedRows
>
(
framework
::
GradVarName
(
"Bias"
));
if
(
bias_grad
)
{
bias_grad
->
set_rows
(
real_rows
);
// build ids -> rows index map
bias_grad
->
SyncIndex
();
bias_grad
->
set_height
(
bias
->
dims
()[
0
]);
auto
*
bias_grad_value
=
bias_grad
->
mutable_value
();
std
::
vector
<
int64_t
>
dims
=
{
static_cast
<
int64_t
>
(
real_rows
.
size
()),
bias
->
dims
()[
1
]};
bias_grad_value
->
mutable_data
<
T
>
(
framework
::
make_ddim
(
dims
),
ctx
.
GetPlace
());
zero
(
dev_ctx
,
bias_grad_value
,
static_cast
<
T
>
(
0.0
));
bit_code
->
AddGrad
(
pre_out_grad
,
bias_grad
);
}
bit_code
->
MulGradWeight
(
pre_out_grad
,
w_grad
,
in
);
}
bit_code
->
MulGradError
(
pre_out_grad
,
w
,
in_grad
);
...
...
paddle/fluid/operators/huber_loss_op.h
浏览文件 @
4a443ffc
...
...
@@ -105,14 +105,16 @@ class HuberLossGradKernel : public framework::OpKernel<T> {
out0
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
x_grad
=
EigenVector
<
T
>::
Flatten
(
*
out0
);
x_grad
.
device
(
place
)
=
out_grad
*
residual
.
unaryExpr
(
HuberLossBackward
<
T
>
(
delta
,
-
1.0
));
residual
.
unaryExpr
(
HuberLossBackward
<
T
>
(
delta
,
-
1.0
));
x_grad
.
device
(
place
)
=
out_grad
*
x_grad
;
}
if
(
out1
)
{
out1
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
y_grad
=
EigenVector
<
T
>::
Flatten
(
*
out1
);
y_grad
.
device
(
place
)
=
out_grad
*
residual
.
unaryExpr
(
HuberLossBackward
<
T
>
(
delta
,
1.0
));
residual
.
unaryExpr
(
HuberLossBackward
<
T
>
(
delta
,
1.0
));
y_grad
.
device
(
place
)
=
out_grad
*
y_grad
;
}
}
};
...
...
paddle/fluid/operators/linear_chain_crf_op.cc
浏览文件 @
4a443ffc
...
...
@@ -230,10 +230,12 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
if
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Emission"
)))
{
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Emission"
),
emission_exps_dims
);
ctx
->
ShareLoD
(
"Emission"
,
framework
::
GradVarName
(
"Emission"
));
}
if
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Transition"
)))
{
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Transition"
),
transition_exps_dims
);
ctx
->
ShareLoD
(
"Transition"
,
framework
::
GradVarName
(
"Transition"
));
}
}
...
...
paddle/fluid/operators/lookup_table_op.cu
浏览文件 @
4a443ffc
...
...
@@ -92,7 +92,8 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
// server
#ifdef PADDLE_WITH_DISTRIBUTE
operators
::
distributed
::
prefetch
(
id_name
,
out_name
,
table_names
,
epmap
,
height_sections
,
context
);
height_sections
,
context
,
context
.
scope
());
#else
PADDLE_THROW
(
"paddle is not compiled with distribute support, can not do "
...
...
paddle/fluid/operators/lookup_table_op.h
浏览文件 @
4a443ffc
...
...
@@ -59,7 +59,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
// server
#ifdef PADDLE_WITH_DISTRIBUTE
operators
::
distributed
::
prefetch
(
id_name
,
out_name
,
table_names
,
epmap
,
height_sections
,
context
);
height_sections
,
context
,
context
.
scope
());
#else
PADDLE_THROW
(
"paddle is not compiled with distribute support, can not do "
...
...
paddle/fluid/operators/math/blas_impl.cu.h
浏览文件 @
4a443ffc
...
...
@@ -62,19 +62,27 @@ struct CUBlas<float> {
cudaDataType_t
Atype
,
int
lda
,
const
void
*
B
,
cudaDataType_t
Btype
,
int
ldb
,
const
float
*
beta
,
void
*
C
,
cudaDataType_t
Ctype
,
int
ldc
)
{
// Because the gcc 4.8 doesn't expand template parameter pack that
// appears in a lambda-expression, I can not use template parameter pack
// here.
// Because the gcc 4.8 doesn't expand template parameter pack that
// appears in a lambda-expression, I can not use template parameter pack
// here.
auto
cublas_call
=
[
&
]()
{
#if CUDA_VERSION >= 8000
VLOG
(
5
)
<<
"use_tensor_op_math: "
<<
(
dev_ctx
->
tensor_core_available
()
?
"True"
:
"False"
);
dev_ctx
->
TensorCoreCublasCallIfAvailable
([
&
](
cublasHandle_t
handle
)
{
<<
(
platform
::
TensorCoreAvailable
()
?
"True"
:
"False"
);
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasSgemmEx
(
handle
,
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
Atype
,
lda
,
B
,
Btype
,
ldb
,
beta
,
C
,
Ctype
,
ldc
));
});
dev_ctx
->
cublas_handle
(),
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
Atype
,
lda
,
B
,
Btype
,
ldb
,
beta
,
C
,
Ctype
,
ldc
));
#else
PADDLE_THROW
(
"cublasSgemmEx is supported on cuda >= 8.0"
);
#endif
};
#if CUDA_VERSION >= 9000
// NOTES: To use Tensor Core, we should change the cublas config,
// but the cublas may be hold by multi-thread.
dev_ctx
->
CublasCall
(
cublas_call
,
CUBLAS_TENSOR_OP_MATH
);
#else
cublas_call
();
#endif
}
};
...
...
@@ -162,10 +170,11 @@ struct CUBlas<platform::float16> {
cudaDataType_t
Btype
,
int
ldb
,
const
void
*
beta
,
void
*
C
,
cudaDataType_t
Ctype
,
int
ldc
,
cudaDataType_t
computeType
)
{
auto
cublas_call
=
[
&
]()
{
#if CUDA_VERSION >= 8000
cublasGemmAlgo_t
algo
=
CUBLAS_GEMM_DFALT
;
#if CUDA_VERSION >= 9000
bool
use_tensor_op_math
=
dev_ctx
->
tensor_core_a
vailable
();
bool
use_tensor_op_math
=
platform
::
TensorCoreA
vailable
();
if
(
use_tensor_op_math
)
{
algo
=
CUBLAS_GEMM_DFALT_TENSOR_OP
;
}
...
...
@@ -173,13 +182,20 @@ struct CUBlas<platform::float16> {
<<
(
use_tensor_op_math
?
"True"
:
"False"
);
#endif // CUDA_VERSION >= 9000
dev_ctx
->
TensorCoreCublasCallIfAvailable
([
&
](
cublasHandle_t
handle
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasGemmEx
(
handle
,
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
Atype
,
lda
,
B
,
Btype
,
ldb
,
beta
,
C
,
Ctype
,
ldc
,
computeType
,
algo
));
});
dev_ctx
->
cublas_handle
(),
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
Atype
,
lda
,
B
,
Btype
,
ldb
,
beta
,
C
,
Ctype
,
ldc
,
computeType
,
algo
));
#else
PADDLE_THROW
(
"cublasGemmEx is supported on cuda >= 8.0"
);
#endif
};
#if CUDA_VERSION >= 9000
// NOTES: To use Tensor Core, we should change the cublas config,
// but the cublas may be hold by multi-thread.
dev_ctx
->
CublasCall
(
cublas_call
,
CUBLAS_TENSOR_OP_MATH
);
#else
cublas_call
();
#endif
}
};
...
...
@@ -207,10 +223,9 @@ void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
CUDA_R_32F
,
N
);
}
else
{
#endif // CUDA_VERSION >= 8000
context_
.
CublasCall
([
&
](
cublasHandle_t
handle
)
{
CUBlas
<
T
>::
GEMM
(
handle
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
N
);
});
CUBlas
<
T
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
N
);
#if CUDA_VERSION >= 8000
}
...
...
@@ -251,12 +266,9 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
CUDA_R_16F
,
lda
,
&
h_beta
,
C
,
CUDA_R_16F
,
N
,
CUDA_R_32F
);
#else
// CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
context_
.
CublasCall
([
&
](
cublasHandle_t
handle
)
{
CUBlas
<
platform
::
float16
>::
GEMM
(
handle
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
h_B
,
ldb
,
h_A
,
lda
,
&
h_beta
,
h_C
,
N
);
});
CUBlas
<
platform
::
float16
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
h_alpha
,
h_B
,
ldb
,
h_A
,
lda
,
&
h_beta
,
h_C
,
N
);
#endif // CUDA_VERSION >= 8000
}
...
...
@@ -280,10 +292,8 @@ void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
}
else
{
#endif // CUDA_VERSION >= 8000
context_
.
CublasCall
([
&
](
cublasHandle_t
handle
)
{
CUBlas
<
T
>::
GEMM
(
handle
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
});
CUBlas
<
T
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
#if CUDA_VERSION >= 8000
}
...
...
@@ -301,19 +311,16 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
cublasOperation_t
cuTransA
=
transA
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
cublasOperation_t
cuTransB
=
transB
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
context_
.
CublasCall
([
&
](
cublasHandle_t
handle
)
{
CUBlas
<
platform
::
float16
>::
GEMM
(
handle
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
});
CUBlas
<
platform
::
float16
>::
GEMM
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
A
,
lda
,
&
beta
,
C
,
ldc
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CUDADeviceContext
>::
AXPY
(
int
n
,
T
alpha
,
const
T
*
x
,
T
*
y
)
const
{
context_
.
CublasCall
([
&
](
cublasHandle_t
handle
)
{
CUBlas
<
T
>::
AXPY
(
handle
,
n
,
&
alpha
,
x
,
1
,
y
,
1
);
});
CUBlas
<
T
>::
AXPY
(
context_
.
cublas_handle
(),
n
,
&
alpha
,
x
,
1
,
y
,
1
);
}
template
<
>
...
...
@@ -323,9 +330,8 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
T
beta
,
T
*
C
)
const
{
cublasOperation_t
cuTransA
=
!
trans_a
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
context_
.
CublasCall
([
&
](
cublasHandle_t
handle
)
{
CUBlas
<
T
>::
GEMV
(
handle
,
cuTransA
,
N
,
M
,
&
alpha
,
A
,
N
,
B
,
1
,
&
beta
,
C
,
1
);
});
CUBlas
<
T
>::
GEMV
(
context_
.
cublas_handle
(),
cuTransA
,
N
,
M
,
&
alpha
,
A
,
N
,
B
,
1
,
&
beta
,
C
,
1
);
}
template
<
>
...
...
@@ -347,28 +353,28 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
#if CUDA_VERSION >= 9010
if
(
FLAGS_enable_cublas_tensor_op_math
&&
std
::
is_same
<
T
,
float
>::
value
)
{
auto
cublas_call
=
[
&
]()
{
cublasGemmAlgo_t
algo
=
CUBLAS_GEMM_DFALT
;
bool
use_tensor_op_math
=
context_
.
tensor_core_a
vailable
();
bool
use_tensor_op_math
=
platform
::
TensorCoreA
vailable
();
if
(
use_tensor_op_math
)
{
algo
=
CUBLAS_GEMM_DFALT_TENSOR_OP
;
}
VLOG
(
5
)
<<
"use_tensor_op_math: "
<<
(
use_tensor_op_math
?
"True"
:
"False"
);
context_
.
TensorCoreCublasCallIfAvailable
([
&
](
cublasHandle_t
handle
)
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cublasGemmStridedBatchedEx
(
handle
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
CUDA_R_32F
,
ldb
,
strideB
,
A
,
CUDA_R_32F
,
lda
,
strideA
,
&
beta
,
C
,
CUDA_R_32F
,
ldc
,
strideC
,
batchCount
,
CUDA_R_32F
,
algo
));
});
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
CUDA_R_32F
,
ldb
,
strideB
,
A
,
CUDA_R_32F
,
lda
,
strideA
,
&
beta
,
C
,
CUDA_R_32F
,
ldc
,
strideC
,
batchCount
,
CUDA_R_32F
,
algo
));
};
auto
&
dev_ctx
=
const_cast
<
platform
::
CUDADeviceContext
&>
(
context_
);
dev_ctx
.
CublasCall
(
cublas_call
,
CUBLAS_TENSOR_OP_MATH
);
}
else
{
#endif // CUDA_VERSION >= 9010
context_
.
CublasCall
([
&
](
cublasHandle_t
handle
)
{
CUBlas
<
T
>::
GEMM_STRIDED_BATCH
(
handle
,
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
strideB
,
A
,
lda
,
strideA
,
&
beta
,
C
,
ldc
,
strideC
,
batchCount
);
});
CUBlas
<
T
>::
GEMM_STRIDED_BATCH
(
context_
.
cublas_handle
(),
cuTransB
,
cuTransA
,
N
,
M
,
K
,
&
alpha
,
B
,
ldb
,
strideB
,
A
,
lda
,
strideA
,
&
beta
,
C
,
ldc
,
strideC
,
batchCount
);
#if CUDA_VERSION >= 9010
}
...
...
paddle/fluid/operators/math/matrix_bit_code.cc
浏览文件 @
4a443ffc
...
...
@@ -84,41 +84,6 @@ void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat,
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
struct
MatrixBitCodeFunctorSelectedRowsAddGrad
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
tmat_
;
framework
::
SelectedRows
*
vec_
;
MatrixBitCodeFunctorSelectedRowsAddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
vec
)
:
tmat_
(
tmat
),
vec_
(
vec
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
size_t
batch_size
=
tmat_
.
dims
()[
0
];
size_t
width
=
tmat_
.
dims
()[
1
];
auto
*
vec_data
=
vec_
->
mutable_value
()
->
template
data
<
T
>();
auto
*
tmat_data
=
tmat_
.
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
.
calc_index
(
j
);
int64_t
row_index
=
vec_
->
GetIndexFromId
(
static_cast
<
int64_t
>
(
index
));
vec_data
[
row_index
]
+=
tmat_data
[
i
*
width
+
j
];
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
vec
)
{
MatrixBitCodeFunctorSelectedRowsAddGrad
<
T
>
func
(
tmat
,
vec
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
struct
MatrixBitCodeFunctorSum
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
tmat_
;
...
...
paddle/fluid/operators/math/matrix_bit_code.h
浏览文件 @
4a443ffc
...
...
@@ -124,11 +124,12 @@ class SimpleCode {
template
<
typename
T
>
class
CustomCode
{
public:
CustomCode
(
const
framework
::
Tensor
&
ptable
,
const
framework
::
Tensor
&
pcode
,
const
int64_t
*
ids
,
int
index
)
{
seq_len_
=
ptable
.
dims
()[
1
];
ptable_data_
=
ptable
.
data
<
T
>
()
+
seq_len_
*
index
;
pcode_data_
=
pcode
.
data
<
T
>
()
+
seq_len_
*
index
;
CustomCode
(
const
framework
::
Tensor
&
path_table
,
const
framework
::
Tensor
&
path_code
,
const
int64_t
*
ids
,
int
index
)
{
seq_len_
=
path_table
.
dims
()[
1
];
path_table_data_
=
path_table
.
data
<
T
>
()
+
seq_len_
*
index
;
path_code_data_
=
path_code
.
data
<
T
>
()
+
seq_len_
*
index
;
}
/**
* Here the id of root should be 1 rather than 0, thus the encoding of class c
...
...
@@ -139,25 +140,25 @@ class CustomCode {
* Binary classification path is the suffixes of encoding, thus leave out the
* left most bit in calc_bit.
*/
size_t
calc_index
(
int
bit
)
const
{
return
ptable_data_
[
bit
];
}
bool
calc_bit
(
int
bit
)
const
{
return
pcode_data_
[
bit
];
}
size_t
calc_index
(
int
bit
)
const
{
return
p
ath_
table_data_
[
bit
];
}
bool
calc_bit
(
int
bit
)
const
{
return
p
ath_
code_data_
[
bit
];
}
// NOTE: this function is not thread-safe.
int
get_length
()
const
{
if
(
length_
<
0
)
{
auto
len
=
seq_len_
;
length_
=
st
atic_cast
<
int
>
(
std
::
find_if
(
ptable_data_
,
p
table_data_
+
len
,
length_
=
static_cast
<
int
>
(
st
d
::
find_if
(
path_table_data_
,
path_
table_data_
+
len
,
[](
const
T
&
val
)
{
return
val
<
0
;
})
-
p
table_data_
);
path_
table_data_
);
}
return
length_
;
}
private:
int64_t
seq_len_
;
const
T
*
ptable_data_
;
const
T
*
pcode_data_
;
const
T
*
p
ath_
table_data_
;
const
T
*
p
ath_
code_data_
;
mutable
int
length_
{
-
1
};
};
...
...
@@ -181,9 +182,9 @@ class SimpleCodeTable {
template
<
typename
T
>
class
CustomCodeTable
{
public:
CustomCodeTable
(
const
framework
::
Tensor
&
ptable
,
const
framework
::
Tensor
&
pcode
,
const
int64_t
*
ids
)
:
ptable_
(
p
table
),
pcode_
(
p
code
),
ids_
(
ids
)
{}
CustomCodeTable
(
const
framework
::
Tensor
&
p
ath_
table
,
const
framework
::
Tensor
&
p
ath_
code
,
const
int64_t
*
ids
)
:
ptable_
(
p
ath_table
),
pcode_
(
path_
code
),
ids_
(
ids
)
{}
CustomCode
<
T
>
get_code
(
int64_t
code
)
const
{
return
CustomCode
<
T
>
(
ptable_
,
pcode_
,
ids_
,
code
);
...
...
@@ -210,11 +211,11 @@ class MatrixBitCodeFunctor {
ids_
(
ids
),
code_table_
(
SimpleCodeTable
(
num_classes
,
ids
))
{}
MatrixBitCodeFunctor
(
const
framework
::
Tensor
&
ptable
,
const
framework
::
Tensor
&
pcode
,
const
int64_t
*
ids
)
:
num_classes_
(
static_cast
<
size_t
>
(
ptable
.
dims
()[
1
])),
MatrixBitCodeFunctor
(
const
framework
::
Tensor
&
p
ath_
table
,
const
framework
::
Tensor
&
p
ath_
code
,
const
int64_t
*
ids
)
:
num_classes_
(
static_cast
<
size_t
>
(
p
ath_
table
.
dims
()[
1
])),
ids_
(
ids
),
code_table_
(
CustomCodeTable
<
int64_t
>
(
p
table
,
p
code
,
ids
))
{}
code_table_
(
CustomCodeTable
<
int64_t
>
(
p
ath_table
,
path_
code
,
ids
))
{}
/* For j < code_length
tmat(i, j) += vec(0, index(i, j))
*/
...
...
@@ -225,11 +226,6 @@ class MatrixBitCodeFunctor {
*/
void
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
vec
);
/* For selected rows For j < code_length
vec(0, index(i, j)) += tmat(i, j)
*/
void
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
vec
);
/* For j < code_length
sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
*/
...
...
paddle/fluid/operators/nce_op.cc
浏览文件 @
4a443ffc
...
...
@@ -153,6 +153,24 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr
<
bool
>
(
"is_sparse"
,
"(boolean, default false) Sparse update."
)
.
SetDefault
(
false
);
// for parameter prefetch
AddAttr
<
bool
>
(
"remote_prefetch"
,
""
).
SetDefault
(
false
);
AddAttr
<
int
>
(
"trainer_id"
,
"trainer id from 0 ~ worker_num."
).
SetDefault
(
0
);
AddAttr
<
std
::
vector
<
int
>>
(
"height_sections"
,
"Height for each output SelectedRows."
)
.
SetDefault
(
std
::
vector
<
int
>
({}));
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"epmap"
,
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input variables for mapping"
)
.
SetDefault
({});
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"table_names"
,
"(string vector, the splited table names that will be fetched from "
"parameter server)"
"in the order of input variables for mapping"
)
.
SetDefault
({});
AddAttr
<
std
::
vector
<
int
>>
(
"custom_neg_classes"
,
"This attribute only be used in unitest. Classes "
"in this list wiil be used as negative classes "
...
...
@@ -222,24 +240,20 @@ class NCEOpGradVarTypeInference : public framework::VarTypeInference {
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
auto
weight_grad
=
op_desc
.
Output
(
framework
::
GradVarName
(
"Weight"
)).
front
();
auto
bias_grad
=
op_desc
.
Output
(
framework
::
GradVarName
(
"Bias"
)).
front
();
auto
attr
=
op_desc
.
GetAttr
(
"is_sparse"
);
bool
is_sparse
=
boost
::
get
<
bool
>
(
attr
);
if
(
is_sparse
)
{
VLOG
(
3
)
<<
"nce_op_grad op "
<<
weight_grad
<<
" and "
<<
bias_grad
VLOG
(
3
)
<<
"nce_op_grad op "
<<
weight_grad
<<
" and "
<<
" is set to SelectedRows"
;
block
->
Var
(
weight_grad
)
->
SetType
(
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
block
->
Var
(
bias_grad
)
->
SetType
(
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
}
else
{
VLOG
(
3
)
<<
"nce_op_grad op "
<<
weight_grad
<<
" and "
<<
bias_grad
VLOG
(
3
)
<<
"nce_op_grad op "
<<
weight_grad
<<
" and "
<<
" is set to LoDTensor"
;
block
->
Var
(
weight_grad
)
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
block
->
Var
(
bias_grad
)
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
}
block
->
Var
(
weight_grad
)
->
SetDataType
(
block
->
Var
(
"Input"
)
->
GetDataType
());
block
->
Var
(
bias_grad
)
->
SetDataType
(
block
->
Var
(
"Input"
)
->
GetDataType
());
}
};
...
...
paddle/fluid/operators/nce_op.h
浏览文件 @
4a443ffc
...
...
@@ -15,8 +15,10 @@ limitations under the License. */
#pragma once
#include <math.h>
#include <iterator>
#include <random>
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
...
...
@@ -24,6 +26,10 @@ limitations under the License. */
#include "paddle/fluid/operators/math/sampler.h"
#include "unsupported/Eigen/CXX11/Tensor"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace
paddle
{
namespace
operators
{
...
...
@@ -43,7 +49,6 @@ void PrepareSamples(const framework::ExecutionContext &context,
auto
label
=
context
.
Input
<
Tensor
>
(
"Label"
);
const
int64_t
*
label_data
=
label
->
data
<
int64_t
>
();
auto
label_dims
=
label
->
dims
();
// int num_total_classes = context.Attr<int>("num_total_classes");
// for unitest
std
::
vector
<
int
>
custom_neg_classes
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"custom_neg_classes"
);
...
...
@@ -144,7 +149,72 @@ class NCEKernel : public framework::OpKernel<T> {
}
// forward mul
auto
input_mat
=
EigenMatrix
<
T
>::
From
(
*
(
context
.
Input
<
Tensor
>
(
"Input"
)));
auto
weight_mat
=
EigenMatrix
<
T
>::
From
(
*
(
context
.
Input
<
Tensor
>
(
"Weight"
)));
// for remote prefetch
auto
epmap
=
context
.
Attr
<
std
::
vector
<
std
::
string
>>
(
"epmap"
);
if
(
!
epmap
.
empty
())
{
// if epmap is not empty, then the parameter will be fetched from remote
// parameter
// server
std
::
vector
<
int64_t
>
labels
;
for
(
int64_t
i
=
0
;
i
<
sample_labels
->
numel
();
++
i
)
{
labels
.
push_back
(
sample_labels_data
[
i
]);
}
std
::
set
<
T
>
st
(
labels
.
begin
(),
labels
.
end
());
labels
.
assign
(
st
.
begin
(),
st
.
end
());
framework
::
Scope
&
local_scope
=
context
.
scope
().
NewScope
();
auto
height_sections
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"height_sections"
);
auto
table_names
=
context
.
Attr
<
std
::
vector
<
std
::
string
>>
(
"table_names"
);
auto
*
ids
=
local_scope
.
Var
(
"Ids@Prefetch"
);
auto
*
x_tensor
=
ids
->
GetMutable
<
framework
::
LoDTensor
>
();
x_tensor
->
mutable_data
<
int64_t
>
(
framework
::
make_ddim
({
static_cast
<
int64_t
>
(
labels
.
size
()),
1
}),
context
.
GetPlace
());
// copy.
std
::
memcpy
(
x_tensor
->
data
<
int64_t
>
(),
labels
.
data
(),
labels
.
size
()
*
sizeof
(
int64_t
));
std
::
vector
<
int
>
w_dims
=
paddle
::
framework
::
vectorize2int
(
context
.
Input
<
Tensor
>
(
"Weight"
)
->
dims
());
w_dims
[
0
]
=
static_cast
<
int
>
(
labels
.
size
());
auto
*
w_tensor
=
local_scope
.
Var
(
"Weight@Prefetch"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
w_tensor
->
Resize
(
framework
::
make_ddim
(
w_dims
));
#ifdef PADDLE_WITH_DISTRIBUTE
operators
::
distributed
::
prefetch
(
"Ids@Prefetch"
,
"Weight@Prefetch"
,
table_names
,
epmap
,
height_sections
,
context
,
local_scope
);
#else
PADDLE_THROW
(
"paddle is not compiled with distribute support, can not do "
"parameter prefetch!"
);
#endif
auto
weight_mat
=
EigenMatrix
<
T
>::
From
(
(
local_scope
.
Var
(
"Weight@Prefetch"
)
->
Get
<
framework
::
LoDTensor
>
()));
for
(
int64_t
i
=
0
;
i
<
sample_labels
->
numel
();
++
i
)
{
std
::
vector
<
int64_t
>::
iterator
it
=
std
::
find
(
labels
.
begin
(),
labels
.
end
(),
sample_labels_data
[
i
]);
int
idx
=
std
::
distance
(
labels
.
begin
(),
it
);
Eigen
::
Tensor
<
T
,
0
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>
result
=
(
input_mat
.
chip
(
static_cast
<
int
>
(
i
/
sample_labels
->
dims
()[
1
]),
0
)
*
weight_mat
.
chip
(
idx
,
0
))
.
sum
();
sample_out_data
[
i
]
+=
result
(
0
);
sample_out_data
[
i
]
=
(
1.
/
(
1.
+
exp
(
-
sample_out_data
[
i
])));
}
context
.
scope
().
DeleteScope
(
&
local_scope
);
}
else
{
auto
weight_mat
=
EigenMatrix
<
T
>::
From
(
*
(
context
.
Input
<
Tensor
>
(
"Weight"
)));
for
(
int64_t
i
=
0
;
i
<
sample_labels
->
numel
();
++
i
)
{
Eigen
::
Tensor
<
T
,
0
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>
result
=
(
input_mat
.
chip
(
static_cast
<
int
>
(
i
/
sample_labels
->
dims
()[
1
]),
0
)
*
...
...
@@ -153,6 +223,8 @@ class NCEKernel : public framework::OpKernel<T> {
sample_out_data
[
i
]
+=
result
(
0
);
sample_out_data
[
i
]
=
(
1.
/
(
1.
+
exp
(
-
sample_out_data
[
i
])));
}
}
// forward cost
for
(
int64_t
i
=
0
;
i
<
sample_labels
->
dims
()[
0
];
++
i
)
{
out_data
[
i
]
=
0
;
...
...
@@ -240,9 +312,6 @@ class NCEGradKernel : public framework::OpKernel<T> {
sample_grad_data
[
i
]
*=
d_out_data
[
sample_idx
];
}
bool
is_sparse
=
context
.
Attr
<
bool
>
(
"is_sparse"
);
if
(
!
is_sparse
)
{
// get d_bias
auto
d_bias
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Bias"
));
if
(
d_bias
!=
nullptr
)
{
...
...
@@ -252,6 +321,10 @@ class NCEGradKernel : public framework::OpKernel<T> {
d_bias_data
[
sample_labels_data
[
i
]]
+=
sample_grad_data
[
i
];
}
}
bool
is_sparse
=
context
.
Attr
<
bool
>
(
"is_sparse"
);
if
(
!
is_sparse
)
{
// get d_w
auto
d_w
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Weight"
));
if
(
d_w
!=
nullptr
)
{
...
...
@@ -273,34 +346,6 @@ class NCEGradKernel : public framework::OpKernel<T> {
std
::
set
<
T
>
st
(
labels
.
begin
(),
labels
.
end
());
labels
.
assign
(
st
.
begin
(),
st
.
end
());
auto
*
bias_var
=
context
.
InputVar
(
"Bias"
);
DDim
bias_dim
;
if
(
bias_var
->
IsType
<
LoDTensor
>
())
{
bias_dim
=
context
.
Input
<
LoDTensor
>
(
"Bias"
)
->
dims
();
}
else
if
(
bias_var
->
IsType
<
SelectedRows
>
())
{
auto
*
table_t
=
context
.
Input
<
SelectedRows
>
(
"Bias"
);
bias_dim
=
table_t
->
value
().
dims
();
}
else
{
PADDLE_THROW
(
"The parameter Bias of a NCE_OP "
"must be either LoDTensor or SelectedRows"
);
}
auto
d_bias
=
context
.
Output
<
SelectedRows
>
(
framework
::
GradVarName
(
"Bias"
));
d_bias
->
set_rows
(
labels
);
d_bias
->
set_height
(
bias_dim
[
0
]);
d_bias
->
mutable_value
()
->
Resize
(
{
static_cast
<
int64_t
>
(
labels
.
size
()),
bias_dim
[
1
]});
T
*
d_bias_data
=
d_bias
->
mutable_value
()
->
mutable_data
<
T
>
(
context
.
GetPlace
());
std
::
fill
(
d_bias_data
,
d_bias_data
+
labels
.
size
(),
0.0
);
for
(
int64_t
i
=
0
;
i
<
sample_labels
->
numel
();
++
i
)
{
d_bias_data
[
d_bias
->
Index
(
sample_labels_data
[
i
])]
+=
sample_grad_data
[
i
];
}
auto
*
table_var
=
context
.
InputVar
(
"Weight"
);
DDim
table_dim
;
if
(
table_var
->
IsType
<
LoDTensor
>
())
{
...
...
paddle/fluid/operators/ngraph/ngraph_ops.h
浏览文件 @
4a443ffc
...
...
@@ -23,5 +23,7 @@ limitations under the License. */
#include "ops/binary_unnary_op.h"
#include "ops/fill_constant_op.h"
#include "ops/mean_op.h"
#include "ops/mul_op.h"
#include "ops/scale_op.h"
#include "ops/top_k_op.h"
paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h
0 → 100644
浏览文件 @
4a443ffc
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace
paddle
{
namespace
operators
{
namespace
ngraphs
{
// Applies the binary nGraph op `T` between a constant and `node`.
//
// The constant operand is created with `node`'s element type and shape from
// the value list {scale}; it is passed as the FIRST argument of `T`, `node`
// as the second.
template <typename T>
std::shared_ptr<ngraph::Node> ElementwiseScalar(
    float scale, std::shared_ptr<ngraph::Node> node) {
  auto constant_operand = ngraph::op::Constant::create(
      node->get_element_type(), node->get_shape(), {scale});
  return std::make_shared<T>(constant_operand, node);
}
// Applies the binary nGraph op `T` between a one-element node and `node`.
//
// `scale_1d` must have shape {1} (enforced below). It is broadcast along
// every axis of `node` (producing `node`'s shape with a trailing 1), reshaped
// back to `node`'s shape via NgReshaper, and then passed as the FIRST
// argument of `T`, with `node` as the second.
template <typename T>
std::shared_ptr<ngraph::Node> ElementwiseScalar(
    std::shared_ptr<ngraph::Node> scale_1d,
    std::shared_ptr<ngraph::Node> node) {
  auto scale_dims = scale_1d->get_shape();
  PADDLE_ENFORCE_EQ(scale_dims.size(), 1, "Supporting 1d scale node");
  PADDLE_ENFORCE_EQ(scale_dims.at(0), 1, "scale 1d in in shape {1}");

  // Work on a copy of node's shape: the broadcast target is that shape with
  // an extra trailing axis of extent 1 (the axis scale_1d already has).
  auto bcast_shape = node->get_shape();
  const size_t node_rank = bcast_shape.size();

  // Every original axis of `node` is a broadcast axis.
  ngraph::AxisSet bcast_axes;
  for (size_t axis = 0; axis < node_rank; ++axis) {
    bcast_axes.insert(axis);
  }
  bcast_shape.push_back(1);

  auto scale_expanded = std::make_shared<ngraph::op::Broadcast>(
      scale_1d, bcast_shape, bcast_axes);
  // Drop the trailing axis again so both operands share node's shape.
  auto scale_like_node =
      paddle::platform::NgReshaper(scale_expanded, node->get_shape());
  return std::make_shared<T>(scale_like_node, node);
}
}
// namespace ngraphs
}
// namespace operators
}
// namespace paddle
#endif
paddle/fluid/operators/ngraph/ops/mean_op.h
0 → 100644
浏览文件 @
4a443ffc
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <functional>
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace
paddle
{
namespace
operators
{
namespace
ngraphs
{
// Builds the nGraph sub-graph for the forward `mean` op.
//
// Reads input "X" from `ngb_node_map`, reduces it with
// ngraph::builder::mean over all of its axes, reshapes the result to
// Shape{1}, and registers that node as output "Out".
void BuildMeanNode(
    const std::shared_ptr<paddle::framework::OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto x_node = paddle::platform::GetInputNode(op, "X", ngb_node_map);

  // Reduce over every axis of the input.
  const size_t rank = x_node->get_shape().size();
  ngraph::AxisSet all_axes;
  for (size_t axis = 0; axis < rank; ++axis) {
    all_axes.insert(axis);
  }

  auto reduced = ngraph::builder::mean(x_node, all_axes);
  // The op's output is exposed as a 1-D tensor with a single element.
  auto reduced_1d = std::make_shared<ngraph::op::Reshape>(
      reduced, ngraph::AxisVector{}, ngraph::Shape{1});
  paddle::platform::SetOutputNode(op, "Out", reduced_1d, ngb_node_map);
}
// Builds the nGraph sub-graph for the gradient of the `mean` op.
//
// Reads inputs "X" and "Out@GRAD" from `ngb_node_map`, divides the incoming
// gradient by the number of elements of X, expands that value to X's shape
// (by adding it to a zero constant of X's shape through ElementwiseScalar),
// and registers the result as output "X@GRAD".
void BuildMeanGradNode(
    const std::shared_ptr<paddle::framework::OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
  auto og = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
  auto x_shape = x->get_shape();

  // Element count of X. The initial value must be a float: std::accumulate
  // keeps its running value in the type of the initial value, so an int seed
  // would truncate every partial product back to int and could overflow for
  // large shapes.
  float x_size = std::accumulate(std::begin(x_shape), std::end(x_shape), 1.0f,
                                 std::multiplies<float>());

  auto node_const = ngraph::op::Constant::create(og->get_element_type(),
                                                 ngraph::Shape{1}, {x_size});
  // d(mean)/dX contributes og / numel(X) to every element of X.
  std::shared_ptr<ngraph::Node> node_div =
      std::make_shared<ngraph::op::Divide>(og, node_const);

  // Reuse the Divide node built above instead of creating a second,
  // identical division. Adding it to a zero constant of X's shape expands
  // the value to X's shape.
  auto result = ElementwiseScalar<ngraph::op::Add>(
      node_div,
      ngraph::op::Constant::create(og->get_element_type(), x_shape, {0}));
  paddle::platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map);
}
}
// namespace ngraphs
}
// namespace operators
}
// namespace paddle
#endif
paddle/fluid/operators/ngraph/ops/scale_op.h
0 → 100644
浏览文件 @
4a443ffc
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace
paddle
{
namespace
operators
{
namespace
ngraphs
{
// Builds the nGraph sub-graph for the `scale` op.
//
// Reads the float attribute "scale" and input "X", multiplies X
// element-wise by a constant built from that attribute, and registers the
// product as output "Out".
// NOTE(review): "scale" is the only attribute read here; if the op defines
// further attributes they are not applied — confirm this is intended.
void BuildScaleNode(
    const std::shared_ptr<paddle::framework::OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto attr_reader = paddle::framework::AttrReader(op->Attrs());
  float scale_factor = attr_reader.Get<float>("scale");

  auto input_node = paddle::platform::GetInputNode(op, "X", ngb_node_map);
  auto scaled_node =
      ElementwiseScalar<ngraph::op::Multiply>(scale_factor, input_node);
  paddle::platform::SetOutputNode(op, "Out", scaled_node, ngb_node_map);
}
}
// namespace ngraphs
}
// namespace operators
}
// namespace paddle
#endif
paddle/fluid/operators/optimizers/adam_op.h
浏览文件 @
4a443ffc
...
...
@@ -424,16 +424,23 @@ class AdamOpKernel : public framework::OpKernel<T> {
}
}
framework
::
SelectedRows
cpu_grad_merge
;
const
framework
::
SelectedRows
*
grad_merge_ptr
;
if
(
is_strict_sorted
)
{
grad_merge_ptr
=
&
grad
;
}
else
{
// merge duplicated rows if any.
// The rows of grad_merge have been sorted inside MergeAdd functor
framework
::
SelectedRows
*
grad_merge_var
;
scatter
::
MergeAdd
<
DeviceContext
,
T
>
merge_func
;
auto
*
grad_merge_var
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()))
{
grad_merge_var
=
&
cpu_grad_merge
;
}
else
{
// FIXME(qiao): GPU also need to fix this
grad_merge_var
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
()
->
GetMutable
<
framework
::
SelectedRows
>
();
}
merge_func
(
ctx
.
template
device_context
<
DeviceContext
>(),
grad
,
grad_merge_var
,
true
);
grad_merge_ptr
=
grad_merge_var
;
...
...
paddle/fluid/operators/reader/ctr_reader.h
浏览文件 @
4a443ffc
...
...
@@ -49,7 +49,7 @@ void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
class
CTRReader
:
public
framework
::
FileReader
{
public:
explicit
CTRReader
(
const
std
::
shared_ptr
<
LoDTensorBlockingQueue
>&
queue
,
int
batch_size
,
in
t
thread_num
,
int
batch_size
,
size_
t
thread_num
,
const
std
::
vector
<
std
::
string
>&
slots
,
const
std
::
vector
<
std
::
string
>&
file_list
)
:
batch_size_
(
batch_size
),
slots_
(
slots
),
file_list_
(
file_list
)
{
...
...
paddle/fluid/platform/CMakeLists.txt
浏览文件 @
4a443ffc
...
...
@@ -100,7 +100,7 @@ ENDIF()
nv_library
(
cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info
)
if
(
WITH_GPU
)
nv_test
(
temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor
)
nv_test
(
temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor
operator
)
else
()
cc_test
(
temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor
)
cc_test
(
temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor
operator
)
endif
()
paddle/fluid/platform/device_context.cc
浏览文件 @
4a443ffc
...
...
@@ -245,15 +245,8 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
eigen_stream_
.
reset
(
new
EigenCudaStreamDevice
());
eigen_stream_
->
Reinitialize
(
&
stream_
,
place
);
eigen_device_
.
reset
(
new
Eigen
::
GpuDevice
(
eigen_stream_
.
get
()));
cublas_handle_
.
reset
(
new
CublasHandleHolder
(
stream_
,
CUBLAS_DEFAULT_MATH
));
if
(
TensorCoreAvailable
())
{
#if CUDA_VERSION >= 9000
cublas_tensor_core_handle_
.
reset
(
new
CublasHandleHolder
(
stream_
,
CUBLAS_TENSOR_OP_MATH
));
#endif
}
PADDLE_ENFORCE
(
dynload
::
cublasCreate
(
&
cublas_handle_
));
PADDLE_ENFORCE
(
dynload
::
cublasSetStream
(
cublas_handle_
,
stream_
));
if
(
dynload
::
HasCUDNN
())
{
cudnn_holder_
.
reset
(
new
CudnnHolder
(
&
stream_
,
place
));
}
...
...
@@ -313,8 +306,7 @@ CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId
(
place_
.
device
);
Wait
();
WaitStreamCallback
();
cublas_handle_
.
reset
();
cublas_tensor_core_handle_
.
reset
();
PADDLE_ENFORCE
(
dynload
::
cublasDestroy
(
cublas_handle_
));
eigen_stream_
.
reset
();
eigen_device_
.
reset
();
PADDLE_ENFORCE
(
cudaStreamDestroy
(
stream_
));
...
...
@@ -343,8 +335,8 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return
eigen_device_
.
get
();
}
bool
CUDADeviceContext
::
tensor_core_availab
le
()
const
{
return
cublas_
tensor_core_handle_
!=
nullptr
;
cublasHandle_t
CUDADeviceContext
::
cublas_hand
le
()
const
{
return
cublas_
handle_
;
}
cudnnHandle_t
CUDADeviceContext
::
cudnn_handle
()
const
{
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
4a443ffc
...
...
@@ -20,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/temporary_allocator.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_helper.h"
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/gpu_info.h"
...
...
@@ -210,6 +209,39 @@ class CudnnWorkspaceHandle {
std
::
unique_ptr
<
std
::
lock_guard
<
std
::
mutex
>>
guard_
;
};
#if CUDA_VERSION >= 9000
/*! \brief Scope guard that switches a cuBLAS handle to a given math mode.
 *
 * The constructor queries the handle's current math mode and, only if it
 * differs from `new_math_mode`, sets the new mode. The destructor restores
 * the previous mode when (and only when) the constructor changed it.
 */
class ScopedCublasMathMode {
 public:
  ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode)
      : handle_(handle), need_reset(false) {
    PADDLE_ENFORCE(
        platform::dynload::cublasGetMathMode(handle_, &old_math_mode_),
        "Failed to get old cublas math mode");
    if (old_math_mode_ != new_math_mode) {
      // This call installs the NEW mode, so the message must say so;
      // the previous text ("old") pointed at the wrong operation.
      PADDLE_ENFORCE(
          platform::dynload::cublasSetMathMode(handle_, new_math_mode),
          "Failed to set new cublas math mode");
      need_reset = true;
    }
  }

  ~ScopedCublasMathMode() {
    if (need_reset) {
      // NOTE(review): if PADDLE_ENFORCE throws on failure, throwing from a
      // destructor would terminate the program — confirm this is acceptable.
      PADDLE_ENFORCE(
          platform::dynload::cublasSetMathMode(handle_, old_math_mode_),
          "Failed to set old cublas math mode");
    }
  }

 private:
  cublasHandle_t handle_;       // handle whose math mode is being scoped
  cublasMath_t old_math_mode_;  // mode observed at construction time
  bool need_reset;              // true iff the constructor changed the mode
};
#endif
class
CUDADeviceContext
:
public
DeviceContext
{
public:
explicit
CUDADeviceContext
(
CUDAPlace
place
);
...
...
@@ -230,25 +262,8 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return eigen device in the device context. */
Eigen
::
GpuDevice
*
eigen_device
()
const
;
/*! \brief Call cublas function safely. */
template
<
typename
Callback
>
inline
void
CublasCall
(
Callback
&&
callback
)
const
{
cublas_handle_
->
Call
(
std
::
forward
<
Callback
>
(
callback
));
}
/*! \brief Check whether tensor core is supported */
bool
tensor_core_available
()
const
;
/*! \brief Call cublas function with Tensor Core safely. If
Tensor Core is not available, use DEFAULT_MATH instead. */
template
<
typename
Callback
>
inline
void
TensorCoreCublasCallIfAvailable
(
Callback
&&
callback
)
const
{
if
(
cublas_tensor_core_handle_
)
{
cublas_tensor_core_handle_
->
Call
(
std
::
forward
<
Callback
>
(
callback
));
}
else
{
cublas_handle_
->
Call
(
std
::
forward
<
Callback
>
(
callback
));
}
}
/*! \brief Return cublas handle in the device context. */
cublasHandle_t
cublas_handle
()
const
;
/*! \brief Return cudnn handle in the device context. */
cudnnHandle_t
cudnn_handle
()
const
;
...
...
@@ -267,6 +282,7 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Invoke `callback`, then record `ev` on this context's stream.
 *  Both steps run while holding mtx_. */
template <typename Callback>
void RecordEvent(cudaEvent_t ev, Callback callback) {
  std::lock_guard<std::mutex> stream_lock(mtx_);
  callback();
  PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
}
...
...
@@ -278,6 +294,18 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Forward to callback_manager_->Wait(). */
void WaitStreamCallback() const {
  callback_manager_->Wait();
}
#if CUDA_VERSION >= 9000
/*! \brief Run `callback` with the cuBLAS handle set to `new_math`.
 *
 * The call may have to change the handle's configuration, and the handle
 * may be shared by several threads, so the whole call is serialized with
 * cublas_mtx_. The previous math mode is restored by ScopedCublasMathMode
 * when this function returns. */
template <typename Callback>
void CublasCall(Callback callback, cublasMath_t new_math) {
  std::lock_guard<std::mutex> cublas_lock(cublas_mtx_);
  ScopedCublasMathMode math_mode_scope(cublas_handle_, new_math);
  callback();
}
#endif
private:
CUDAPlace
place_
;
...
...
@@ -285,9 +313,7 @@ class CUDADeviceContext : public DeviceContext {
std
::
unique_ptr
<
EigenCudaStreamDevice
>
eigen_stream_
;
std
::
unique_ptr
<
CudnnHolder
>
cudnn_holder_
;
cudaStream_t
stream_
;
std
::
unique_ptr
<
CublasHandleHolder
>
cublas_handle_
;
std
::
unique_ptr
<
CublasHandleHolder
>
cublas_tensor_core_handle_
;
cublasHandle_t
cublas_handle_
;
int
compute_capability_
;
int
runtime_version_
;
...
...
@@ -295,10 +321,12 @@ class CUDADeviceContext : public DeviceContext {
int
multi_process_
;
int
max_threads_per_mp_
;
mutable
std
::
mutex
mtx_
;
// StreamCallbackManager is thread-safe
std
::
unique_ptr
<
StreamCallbackManager
>
callback_manager_
;
DISABLE_COPY_AND_ASSIGN
(
CUDADeviceContext
)
;
mutable
std
::
mutex
cublas_mtx_
;
};
template
<
>
...
...
paddle/fluid/platform/device_context_test.cu
浏览文件 @
4a443ffc
...
...
@@ -43,6 +43,9 @@ TEST(Device, CUDADeviceContext) {
ASSERT_NE
(
nullptr
,
gpu_device
);
cudnnHandle_t
cudnn_handle
=
device_context
->
cudnn_handle
();
ASSERT_NE
(
nullptr
,
cudnn_handle
);
cublasHandle_t
cublas_handle
=
device_context
->
cublas_handle
();
ASSERT_NE
(
nullptr
,
cublas_handle
);
ASSERT_NE
(
nullptr
,
device_context
->
stream
());
delete
device_context
;
}
}
...
...
paddle/fluid/platform/mkldnn_reuse.h
浏览文件 @
4a443ffc
...
...
@@ -145,7 +145,8 @@ class MKLDNNHandler {
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
const
std
::
string
&
suffix
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
,
// NOLINT
bool
is_persistent
=
false
)
{
bool
is_persistent
=
false
,
bool
is_INT8
=
false
,
std
::
vector
<
float
>
scale_data
=
{
1.0
f
},
int
mask
=
0
)
{
// create reorder primitive if the input format is not the preferred one
auto
local_key
=
key_
+
suffix
;
auto
key_reorder_p
=
key_
+
suffix
+
"reorder_p"
;
...
...
@@ -159,8 +160,20 @@ class MKLDNNHandler {
std
::
shared_ptr
<
mkldnn
::
primitive
>
reorder_p
;
if
(
mpd
!=
user_mpd
)
{
target_memory_p
=
std
::
make_shared
<
mkldnn
::
memory
>
(
mpd
);
auto
reorder_p
=
std
::
make_shared
<
mkldnn
::
reorder
>
(
*
user_memory_p
,
*
target_memory_p
);
std
::
shared_ptr
<
mkldnn
::
reorder
>
reorder_p
;
if
(
is_INT8
)
{
mkldnn
::
primitive_attr
attri
;
// attribute for int8 weights and bias data reorder.
attri
.
set_output_scales
(
mask
,
scale_data
);
auto
reorder_pd
=
std
::
shared_ptr
<
mkldnn
::
reorder
::
primitive_desc
>
(
new
mkldnn
::
reorder
::
primitive_desc
(
user_mpd
,
mpd
,
attri
));
reorder_p
=
std
::
shared_ptr
<
mkldnn
::
reorder
>
(
new
mkldnn
::
reorder
(
*
reorder_pd
,
*
user_memory_p
,
*
target_memory_p
));
}
else
{
reorder_p
=
std
::
make_shared
<
mkldnn
::
reorder
>
(
*
user_memory_p
,
*
target_memory_p
);
}
dev_ctx_
.
SetBlob
(
key_reorder_p
,
reorder_p
);
pipeline
.
push_back
(
*
reorder_p
);
}
...
...
@@ -182,22 +195,58 @@ class MKLDNNHandler {
return
dims2str
(
operand_dims
)
+
suffix
;
}
template
<
typename
M
>
template
<
typename
T
>
static
void
SetDstMemory
(
const
framework
::
ExecutionContext
&
ctx
,
framework
::
Tensor
*
output
,
std
::
vector
<
int
>
dst_tz
,
const
mkldnn
::
engine
&
engine
,
std
::
shared_ptr
<
mkldnn
::
memory
::
primitive_desc
>&
dst_pd
,
// NOLINT
std
::
shared_ptr
<
mkldnn
::
memory
>&
dst_memory
)
{
// NOLINT
M
*
output_data
=
output
->
mutable_data
<
M
>
(
ctx
.
GetPlace
());
T
*
output_data
=
output
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
dst_md
=
platform
::
MKLDNNMemDesc
(
{
dst_tz
},
paddle
::
framework
::
ToMKLDNNDataType
(
framework
::
DataTypeTrait
<
M
>::
DataType
),
framework
::
DataTypeTrait
<
T
>::
DataType
),
mkldnn
::
memory
::
format
::
nhwc
);
dst_pd
.
reset
(
new
mkldnn
::
memory
::
primitive_desc
(
dst_md
,
engine
));
dst_memory
.
reset
(
new
mkldnn
::
memory
(
*
dst_pd
,
to_void_cast
<
M
>
(
output_data
)));
dst_memory
.
reset
(
new
mkldnn
::
memory
(
*
dst_pd
,
to_void_cast
<
T
>
(
output_data
)));
}
// Appends a textual description of a convolution configuration to `*key`.
//
// Fields are appended in this fixed order: input dims, weights dims,
// strides, paddings, dilations, groups, source data type, memory format,
// destination data type, and finally `suffix`. Numeric fields are rendered
// with std::to_string and concatenated without separators.
static void AppendKey(std::string* key,
                      const mkldnn::memory::dims& input_dims,
                      const mkldnn::memory::dims& weights_dims,
                      const std::vector<int>& strides,
                      const std::vector<int>& paddings,
                      const std::vector<int>& dilations, const int& groups,
                      const mkldnn::memory::data_type& srcdt,
                      const mkldnn::memory::format& format,
                      const mkldnn::memory::data_type& dstdt,
                      const std::string& suffix) {
  // Tensor shapes first ...
  for (const mkldnn::memory::dims* shape : {&input_dims, &weights_dims}) {
    AppendKeyDims(key, *shape);
  }
  // ... then the per-dimension convolution parameters ...
  for (const std::vector<int>* param : {&strides, &paddings, &dilations}) {
    AppendKeyVec(key, *param);
  }
  // ... then the scalar settings and the caller-supplied suffix.
  AppendKey(key, std::to_string(groups));
  AppendKey(key, std::to_string(srcdt));
  AppendKey(key, std::to_string(format));
  AppendKey(key, std::to_string(dstdt));
  AppendKey(key, suffix);
}
protected:
// Appends each entry of `dims`, rendered with std::to_string, to `*key`
// in order and without separators.
static void AppendKeyDims(std::string* key,
                          const mkldnn::memory::dims& dims) {
  for (const auto& extent : dims) {
    AppendKey(key, std::to_string(extent));
  }
}
// Appends each integer of `dims`, rendered with std::to_string, to `*key`
// in order and without separators.
static void AppendKeyVec(std::string* key, const std::vector<int>& dims) {
  for (const int value : dims) {
    AppendKey(key, std::to_string(value));
  }
}
// Appends `s` to the end of `*key`.
static void AppendKey(std::string* key, const std::string& s) {
  *key += s;
}
static
std
::
string
dims2str
(
const
mkldnn
::
memory
::
dims
&
operand_dims
)
{
std
::
string
dstr
=
""
;
for
(
size_t
i
=
0
;
i
<
operand_dims
.
size
();
++
i
)
{
...
...
@@ -215,7 +264,8 @@ class MKLDNNHandler {
class
TransposeMKLDNNHandler
:
public
MKLDNNHandler
{
public:
TransposeMKLDNNHandler
(
std
::
vector
<
int
>&
dims
,
std
::
vector
<
int
>&
axis
,
TransposeMKLDNNHandler
(
std
::
vector
<
int
>&
dims
,
// NOLINT
std
::
vector
<
int
>&
axis
,
// NOLINT
const
platform
::
MKLDNNDeviceContext
&
dev_ctx
,
mkldnn
::
engine
engine
,
const
std
::
string
&
base_key
)
:
platform
::
MKLDNNHandler
(
dev_ctx
,
engine
,
base_key
),
...
...
@@ -303,8 +353,9 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
}
protected:
mkldnn_memory_desc_t
Axis2MemoryDesc
(
std
::
vector
<
int
>&
nchw_tz
,
std
::
vector
<
int
>&
axis
)
{
mkldnn_memory_desc_t
Axis2MemoryDesc
(
std
::
vector
<
int
>&
nchw_tz
,
// NOLINT
std
::
vector
<
int
>&
axis
// NOLINT
)
{
mkldnn_memory_desc_t
mem_fmt
;
mem_fmt
.
primitive_kind
=
mkldnn_memory
;
...
...
@@ -462,21 +513,26 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemoryFromPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_weights_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
,
// NOLINT
bool
is_persistent
=
false
)
{
bool
is_persistent
=
false
,
bool
is_INT8
=
false
,
std
::
vector
<
float
>
scale_data
=
{
1.0
f
},
int
mask
=
0
)
{
auto
user_weights_pd
=
user_weights_memory_p
->
get_primitive_desc
();
auto
weights_pd
=
conv_pd_
->
weights_primitive_desc
();
return
this
->
AcquireMemory
(
weights_pd
,
user_weights_pd
,
user_weights_memory_p
,
"@weights_mem_p"
,
pipeline
,
is_persistent
);
return
this
->
AcquireMemory
(
weights_pd
,
user_weights_pd
,
user_weights_memory_p
,
"@weights_mem_p"
,
pipeline
,
is_persistent
,
is_INT8
,
scale_data
,
mask
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireBiasMemoryFromPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_bias_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
,
// NOLINT
bool
is_persistent
=
false
,
bool
is_INT8
=
false
,
std
::
vector
<
float
>
scale_data
=
{
1.0
f
},
int
mask
=
0
)
{
// NOLINT
auto
user_bias_pd
=
user_bias_memory_p
->
get_primitive_desc
();
auto
bias_pd
=
conv_pd_
->
bias_primitive_desc
();
return
this
->
AcquireMemory
(
bias_pd
,
user_bias_pd
,
user_bias_memory_p
,
"@bias_mem_p"
,
pipeline
);
"@bias_mem_p"
,
pipeline
,
is_persistent
,
is_INT8
,
scale_data
,
mask
);
}
std
::
shared_ptr
<
forward_t
>
AcquireConvolution
(
...
...
@@ -594,5 +650,29 @@ using ConvTransposeMKLDNNHandler =
ConvMKLDNNTemplateHandler
<
mkldnn
::
deconvolution_forward
,
mkldnn
::
deconvolution_backward_data
,
mkldnn
::
deconvolution_backward_weights
>
;
// Allocates the output tensor's buffer and obtains the convolution's
// destination memory for it.
//
// The buffer is requested from `output` for the context's place, with the
// default allocator attribute and the size reported by
// handler->GetDstMemorySize(). The returned memory object comes from
// handler->AcquireDstMemoryFromPrimitive, called with that buffer.
template <typename T>
static std::shared_ptr<mkldnn::memory> SetDstMemory(
    const framework::ExecutionContext& ctx, framework::Tensor* output,
    const std::shared_ptr<ConvMKLDNNHandler>& handler) {
  T* dst_buffer = output->mutable_data<T>(
      ctx.GetPlace(), ::paddle::memory::Allocator::kDefault,
      handler->GetDstMemorySize());
  return handler->AcquireDstMemoryFromPrimitive(to_void_cast<T>(dst_buffer));
}
// Allocates the output tensor's buffer and points the convolution's
// destination memory at it.
//
// The buffer is requested from `output` for the context's place, with the
// default allocator attribute and the size reported by
// handler->GetDstMemorySize().
//
// Returns the destination memory object whose data handle is the newly
// allocated buffer.
template <typename T>
static std::shared_ptr<mkldnn::memory> SetDstMemoryHandler(
    const framework::ExecutionContext& ctx, framework::Tensor* output,
    const std::shared_ptr<ConvMKLDNNHandler>& handler) {
  T* output_data = output->mutable_data<T>(
      ctx.GetPlace(), ::paddle::memory::Allocator::kDefault,
      handler->GetDstMemorySize());
  // The memory object must come from the handler. Previously this pointer
  // was default-constructed (null) and then dereferenced, which is undefined
  // behavior.
  // NOTE(review): assumes AcquireDstMemoryFromPrimitive hands back the
  // handler's destination memory for this primitive — confirm against
  // ConvMKLDNNHandler.
  std::shared_ptr<mkldnn::memory> dst_memory_p =
      handler->AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
  // Explicitly (re)bind the data handle, which is the purpose of this helper.
  dst_memory_p->set_data_handle(to_void_cast<T>(output_data));
  return dst_memory_p;
}
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/nccl_helper.h
浏览文件 @
4a443ffc
...
...
@@ -106,7 +106,7 @@ struct NCCLContextMap {
}
std
::
unique_ptr
<
ncclComm_t
[]
>
comms
(
new
ncclComm_t
[
order_
.
size
()]);
// if num_trainers == 1, should create a new nccl id for local comms.
if
(
num_trainers
==
1
)
{
if
(
num_trainers
==
1
&&
nccl_id
==
nullptr
)
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
NCCLGroupGuard
::
NCCLMutex
());
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclCommInitAll
(
comms
.
get
(),
static_cast
<
int
>
(
order_
.
size
()),
order_
.
data
()));
...
...
paddle/fluid/platform/profiler.cc
浏览文件 @
4a443ffc
...
...
@@ -12,9 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/port.h"
#include <algorithm>
#include <iomanip>
#include <limits>
...
...
@@ -25,9 +22,12 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif // PADDLE_WITH_CUDA
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
DEFINE_bool
(
enable_rpc_profiler
,
false
,
"Enable rpc profiler or not."
);
...
...
@@ -173,8 +173,9 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
RecordEvent
::
RecordEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
)
:
is_enabled_
(
false
),
start_ns_
(
PosixInNsec
())
{
std
::
lock_guard
<
std
::
mutex
>
l
(
profiler_mu
);
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
std
::
lock_guard
<
std
::
mutex
>
l
(
profiler_mu
);
is_enabled_
=
true
;
dev_ctx_
=
dev_ctx
;
name_
=
name
;
...
...
@@ -184,8 +185,8 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
}
RecordEvent
::~
RecordEvent
()
{
std
::
lock_guard
<
std
::
mutex
>
l
(
profiler_mu
);
if
(
g_state
==
ProfilerState
::
kDisabled
||
!
is_enabled_
)
return
;
std
::
lock_guard
<
std
::
mutex
>
l
(
profiler_mu
);
DeviceTracer
*
tracer
=
GetDeviceTracer
();
if
(
tracer
)
{
tracer
->
AddCPURecords
(
CurAnnotation
(),
start_ns_
,
PosixInNsec
(),
...
...
paddle/fluid/platform/temporary_allocator_test.cc
浏览文件 @
4a443ffc
...
...
@@ -14,12 +14,27 @@
#include "paddle/fluid/platform/temporary_allocator.h"
#include <gtest/gtest.h>
#include <string>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
DECLARE_double
(
limit_of_temporary_allocation
);
namespace
paddle
{
namespace
platform
{
// Minimal OperatorBase subclass used by the tests in this file: it forwards
// its constructor arguments to OperatorBase and its RunImpl does nothing.
class DummyOp : public framework::OperatorBase {
 public:
  DummyOp(const std::string& type, const framework::VariableNameMap& inputs,
          const framework::VariableNameMap& outputs,
          const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

 protected:
  // Intentionally empty: the op is never expected to compute anything.
  void RunImpl(const framework::Scope& scope,
               const platform::Place& place) const override {}
};
TEST
(
temporary_allocator
,
temporary_allocator
)
{
platform
::
CPUPlace
cpu_place
;
TemporaryAllocator
alloc
(
cpu_place
);
...
...
@@ -68,96 +83,92 @@ TEST(temporary_allocator, add_callback) {
}
TEST
(
temporary_allocator
,
create_tensor_with_allocationptr
)
{
platform
::
CPUPlace
cpu_place
;
TemporaryAllocator
cpu_alloc
(
cpu_place
);
framework
::
VariableNameMap
dummy_vars
;
framework
::
AttributeMap
dummy_attrs
;
DummyOp
op
(
"dummy"
,
dummy_vars
,
dummy_vars
,
dummy_attrs
);
framework
::
Scope
scope
;
framework
::
VariableValueMap
vars
;
framework
::
RuntimeContext
run_ctx
(
vars
,
vars
);
size_t
memory_size
=
300
;
{
size_t
memory_size
=
200
;
auto
allocation
=
cpu_alloc
.
Allocate
(
memory_size
);
void
*
address
=
allocation
->
ptr
();
platform
::
CPUPlace
cpu_place
;
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CPUDeviceContext
*>
(
pool
.
Get
(
cpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
int
numel
=
memory_size
/
sizeof
(
float
);
framework
::
Tensor
tensor
=
framework
::
GetTensor
<
float
>
(
std
::
move
(
allocation
),
framework
::
make_ddim
({
numel
}));
PADDLE_ENFORCE_EQ
(
address
,
tensor
.
data
<
float
>
()
);
framework
::
Tensor
tensor
=
ctx
.
AllocateTmpTensor
<
float
,
platform
::
CPUDeviceContext
>
(
framework
::
make_ddim
({
numel
}),
*
dev_ctx
);
PADDLE_ENFORCE_EQ
(
tensor
.
numel
(),
numel
);
}
#ifdef PADDLE_WITH_CUDA
platform
::
CUDAPlace
gpu_place
(
0
);
TemporaryAllocator
gpu_alloc
(
gpu_place
);
{
size_t
memory_size
=
300
;
auto
allocation
=
gpu_alloc
.
Allocate
(
memory_size
);
void
*
address
=
allocation
->
ptr
();
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
gpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
int
numel
=
memory_size
/
sizeof
(
float
);
framework
::
Tensor
tensor
=
framework
::
GetTensor
<
float
>
(
std
::
move
(
allocation
),
framework
::
make_ddim
({
numel
}));
PADDLE_ENFORCE_EQ
(
address
,
tensor
.
data
<
float
>
()
);
framework
::
Tensor
tensor
=
ctx
.
AllocateTmpTensor
<
float
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
({
numel
}),
*
dev_ctx
);
PADDLE_ENFORCE_EQ
(
tensor
.
numel
(),
numel
);
}
// The allocation is no longer held, so it should have been moved to the
// TemporaryAllocationQueue.
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
1
);
gpu_alloc
.
Release
([]()
{});
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
0
);
#endif
}
TEST
(
temporary_allocator
,
create_tensor_with_allocationptr2
)
{
platform
::
CPUPlace
cpu_place
;
TemporaryAllocator
cpu_alloc
(
cpu_place
);
{
framework
::
VariableNameMap
dummy_vars
;
framework
::
AttributeMap
dummy_attrs
;
DummyOp
op
(
"dummy"
,
dummy_vars
,
dummy_vars
,
dummy_attrs
);
framework
::
Scope
scope
;
framework
::
VariableValueMap
vars
;
framework
::
RuntimeContext
run_ctx
(
vars
,
vars
);
size_t
memory_size
=
400
;
{
platform
::
CPUPlace
cpu_place
;
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CPUDeviceContext
*>
(
pool
.
Get
(
cpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
int
numel
=
memory_size
/
sizeof
(
float
);
framework
::
Tensor
out_side_tensor
;
void
*
address
;
{
auto
allocation
=
cpu_alloc
.
Allocate
(
memory_size
);
address
=
allocation
->
ptr
();
framework
::
Tensor
tensor
=
framework
::
GetTensor
<
float
>
(
std
::
move
(
allocation
),
framework
::
make_ddim
({
numel
}));
PADDLE_ENFORCE_EQ
(
address
,
tensor
.
data
<
float
>
());
framework
::
Tensor
tensor
=
ctx
.
AllocateTmpTensor
<
float
,
platform
::
CPUDeviceContext
>
(
framework
::
make_ddim
({
numel
}),
*
dev_ctx
);
PADDLE_ENFORCE_EQ
(
tensor
.
numel
(),
numel
);
out_side_tensor
.
ShareDataWith
(
tensor
);
}
PADDLE_ENFORCE_EQ
(
address
,
out_side_tensor
.
data
<
float
>
());
PADDLE_ENFORCE_EQ
(
out_side_tensor
.
numel
(),
numel
);
}
#ifdef PADDLE_WITH_CUDA
platform
::
CUDAPlace
gpu_place
(
0
);
TemporaryAllocator
gpu_alloc
(
gpu_place
);
{
void
*
address
;
platform
::
CUDAPlace
gpu_place
(
0
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
gpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
size_t
memory_size
=
500
;
int
numel
=
memory_size
/
sizeof
(
float
);
framework
::
Tensor
out_side_tensor
;
{
auto
allocation
=
gpu_alloc
.
Allocate
(
memory_size
);
address
=
allocation
->
ptr
();
framework
::
Tensor
tensor
=
framework
::
GetTensor
<
float
>
(
std
::
move
(
allocation
),
framework
::
make_ddim
({
numel
}));
PADDLE_ENFORCE_EQ
(
address
,
tensor
.
data
<
float
>
());
framework
::
Tensor
tensor
=
ctx
.
AllocateTmpTensor
<
float
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
({
numel
}),
*
dev_ctx
);
PADDLE_ENFORCE_EQ
(
tensor
.
numel
(),
numel
);
out_side_tensor
.
ShareDataWith
(
tensor
);
}
PADDLE_ENFORCE_EQ
(
address
,
out_side_tensor
.
data
<
float
>
());
PADDLE_ENFORCE_EQ
(
out_side_tensor
.
numel
(),
numel
);
// The allocation is held by out_side_tensor.
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
0
);
gpu_alloc
.
Release
([]()
{});
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
0
);
}
// The allocation is no longer held, so it should have been placed in the
// TemporaryAllocationQueue.
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
1
);
gpu_alloc
.
Release
([]()
{});
PADDLE_ENFORCE_EQ
(
gpu_alloc
.
TemporaryAllocationQueueSize
(),
0
);
#endif
}
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
4a443ffc
...
...
@@ -946,13 +946,6 @@ All parameter, weight, gradient are variables in Paddle.
R"DOC(The type is STR, debug_graphviz_path indicate the path that
writing the SSA Graph to file in the form of graphviz, you.
It is useful for debugging. Default "")DOC"
)
.
def_property
(
"enable_data_balance"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
enable_data_balance_
;
},
[](
BuildStrategy
&
self
,
bool
b
)
{
PADDLE_ENFORCE
(
!
self
.
IsFinalized
(),
"BuildStrategy is finlaized."
);
self
.
enable_data_balance_
=
b
;
})
// FIXME(chengudo): enable_data_balance seems not important
.
def_property
(
"enable_sequential_execution"
,
[](
const
BuildStrategy
&
self
)
{
...
...
@@ -1007,6 +1000,10 @@ All parameter, weight, gradient are variables in Paddle.
"memory_optimize"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
memory_optimize_
;
},
[](
BuildStrategy
&
self
,
bool
b
)
{
self
.
memory_optimize_
=
b
;
})
.
def_property
(
"is_distribution"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
is_distribution_
;
},
[](
BuildStrategy
&
self
,
bool
b
)
{
self
.
is_distribution_
=
b
;
})
.
def_property
(
"memory_early_delete"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
memory_early_delete_
;
},
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
4a443ffc
...
...
@@ -199,6 +199,7 @@ function cmake_gen() {
-DANAKIN_BUILD_CROSS_PLANTFORM=
${
ANAKIN_BUILD_CROSS_PLANTFORM
:ON
}
-DPY_VERSION=
${
PY_VERSION
:-
2
.7
}
-DCMAKE_INSTALL_PREFIX=
${
INSTALL_PREFIX
:-
/paddle/build
}
-DWITH_JEMALLOC=
${
WITH_JEMALLOC
:-
OFF
}
========================================
EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because
...
...
@@ -232,7 +233,8 @@ EOF
-DANAKIN_BUILD_FAT_BIN
=
${
ANAKIN_BUILD_FAT_BIN
:OFF
}
\
-DANAKIN_BUILD_CROSS_PLANTFORM
=
${
ANAKIN_BUILD_CROSS_PLANTFORM
:ON
}
\
-DPY_VERSION
=
${
PY_VERSION
:-
2
.7
}
\
-DCMAKE_INSTALL_PREFIX
=
${
INSTALL_PREFIX
:-
/paddle/build
}
-DCMAKE_INSTALL_PREFIX
=
${
INSTALL_PREFIX
:-
/paddle/build
}
\
-DWITH_JEMALLOC
=
${
WITH_JEMALLOC
:-
OFF
}
}
...
...
@@ -918,11 +920,11 @@ function main() {
cmake_gen
${
PYTHON_ABI
:-
""
}
build
assert_api_not_changed
${
PYTHON_ABI
:-
""
}
assert_api_spec_approvals
run_test
gen_capi_package
gen_fluid_lib
test_fluid_lib
assert_api_spec_approvals
;;
assert_api
)
assert_api_not_changed
${
PYTHON_ABI
:-
""
}
...
...
python/paddle/fluid/__init__.py
浏览文件 @
4a443ffc
...
...
@@ -102,13 +102,6 @@ def __bootstrap__():
import
sys
import
os
import
platform
if
os
.
name
==
'nt'
:
third_lib_path
=
os
.
path
.
abspath
(
os
.
path
.
dirname
(
__file__
))
+
os
.
sep
+
'..'
+
os
.
sep
+
'libs'
os
.
environ
[
'path'
]
+=
';'
+
third_lib_path
sys
.
path
.
append
(
third_lib_path
)
from
.
import
core
in_test
=
'unittest'
in
sys
.
modules
...
...
@@ -135,7 +128,8 @@ def __bootstrap__():
'free_idle_memory'
,
'paddle_num_threads'
,
"dist_threadpool_size"
,
'eager_delete_tensor_gb'
,
'fast_eager_deletion_mode'
,
'allocator_strategy'
,
'reader_queue_speed_test_mode'
,
'print_sub_graph_dir'
,
'pe_profile_fname'
,
'warpctc_dir'
'print_sub_graph_dir'
,
'pe_profile_fname'
,
'warpctc_dir'
,
'enable_parallel_graph'
]
if
'Darwin'
not
in
sysstr
:
read_env_flags
.
append
(
'use_pinned_memory'
)
...
...
@@ -158,14 +152,10 @@ def __bootstrap__():
if
core
.
is_compiled_with_cuda
():
read_env_flags
+=
[
'fraction_of_gpu_memory_to_use'
,
'cudnn_deterministic'
,
'enable_cublas_tensor_op_math'
,
'conv_workspace_size_limit'
,
'cudnn_exhaustive_search'
,
'memory_optimize_debug'
,
'selected_gpus'
,
'cudnn_exhaustive_search_times'
,
'fraction_of_gpu_memory_to_use'
,
'cudnn_deterministic'
,
'enable_cublas_tensor_op_math'
,
'conv_workspace_size_limit'
,
'cudnn_exhaustive_search'
,
'memory_optimize_debug'
,
'selected_gpus'
,
'cudnn_exhaustive_search_times'
,
'sync_nccl_allreduce'
]
core
.
init_gflags
([
sys
.
argv
[
0
]]
+
...
...
python/paddle/fluid/framework.py
浏览文件 @
4a443ffc
...
...
@@ -26,6 +26,13 @@ import numpy as np
from
..
import
compat
as
cpt
from
.proto
import
framework_pb2
try
:
if
os
.
name
==
'nt'
:
import
sys
third_lib_path
=
os
.
path
.
abspath
(
os
.
path
.
dirname
(
__file__
))
+
os
.
sep
+
'..'
+
os
.
sep
+
'libs'
os
.
environ
[
'path'
]
+=
';'
+
third_lib_path
sys
.
path
.
append
(
third_lib_path
)
from
.
import
core
except
ImportError
as
e
:
if
os
.
name
==
'nt'
:
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
4a443ffc
...
...
@@ -26,7 +26,7 @@ from ..initializer import Normal, Constant
from
..framework
import
Variable
,
OpProtoHolder
from
..param_attr
import
ParamAttr
from
.layer_function_generator
import
autodoc
,
templatedoc
,
_generate_doc_string_
from
.tensor
import
concat
from
.tensor
import
concat
,
assign
from
.
import
utils
from
..
import
unique_name
from
functools
import
reduce
...
...
@@ -340,9 +340,7 @@ def embedding(input,
"""
helper
=
LayerHelper
(
'embedding'
,
**
locals
())
remote_prefetch
=
False
if
os
.
environ
.
get
(
'PADDLE_ENABLE_REMOTE_PREFETCH'
):
remote_prefetch
=
True
remote_prefetch
=
is_sparse
and
(
not
is_distributed
)
if
remote_prefetch
:
assert
is_sparse
is
True
and
is_distributed
is
False
w
=
helper
.
create_parameter
(
...
...
@@ -5032,12 +5030,18 @@ def nce(input,
else
:
num_neg_samples
=
int
(
num_neg_samples
)
remote_prefetch
=
is_sparse
print
(
"With sparse mode, if your models has only small parameter prefetch may cause speed down"
)
attrs
=
{
'num_total_classes'
:
int
(
num_total_classes
),
'num_neg_samples'
:
num_neg_samples
,
'seed'
:
seed
,
'sampler'
:
sampler
,
'is_sparse'
:
is_sparse
'is_sparse'
:
is_sparse
,
'remote_prefetch'
:
remote_prefetch
}
helper
.
append_op
(
...
...
@@ -5147,7 +5151,10 @@ def hsigmoid(input,
pass
weights
=
None
remote_prefetch
=
is_sparse
print
(
"With sparse mode, if your models has only small parameter prefetch may cause speed down"
)
if
not
is_custom
:
weights
=
helper
.
create_parameter
(
attr
=
helper
.
param_attr
,
...
...
@@ -5163,7 +5170,7 @@ def hsigmoid(input,
inputs
=
{
"X"
:
input
,
"W"
:
weights
,
"PTable"
:
path_table
,
"P
ath
Table"
:
path_table
,
"PathCode"
:
path_code
,
"Label"
:
label
}
...
...
@@ -5186,9 +5193,13 @@ def hsigmoid(input,
type
=
"hierarchical_sigmoid"
,
inputs
=
inputs
,
outputs
=
{
"Out"
:
out
,
"PreOut"
:
pre_out
},
attrs
=
{
"num_classes"
:
num_classes
,
"is_sparse"
:
is_sparse
})
"PreOut"
:
pre_out
,
"W_Out"
:
weights
},
attrs
=
{
"num_classes"
:
num_classes
,
"is_sparse"
:
is_sparse
,
"remote_prefetch"
:
remote_prefetch
})
return
out
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
4a443ffc
...
...
@@ -29,6 +29,15 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
BuildStrategy
=
core
.
ParallelExecutor
.
BuildStrategy
def
_is_pserver_mode
(
main_program
):
main
=
main_program
if
main_program
\
else
framework
.
default_main_program
()
for
op
in
main
.
global_block
().
ops
:
if
op
.
type
in
[
"send"
,
"recv"
]:
return
True
return
False
class
ParallelExecutor
(
object
):
"""
ParallelExecutor is designed for data parallelism, which focuses on distributing
...
...
@@ -128,6 +137,11 @@ class ParallelExecutor(object):
build_strategy
=
BuildStrategy
()
build_strategy
.
num_trainers
=
num_trainers
build_strategy
.
trainer_id
=
trainer_id
# FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
# num_trainers is 1, so the current fields of build_strategy doesn't tell if
# it's distributed model.
build_strategy
.
is_distribution
=
_is_pserver_mode
(
main_program
)
or
num_trainers
>
1
# step4: get main_program, scope, local_scopes
main
=
main_program
if
main_program
\
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
4a443ffc
...
...
@@ -21,18 +21,19 @@ if(NOT WITH_DISTRIBUTE)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_simnet_bow
)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge
)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_text_classification
)
LIST
(
REMOVE_ITEM TEST_OPS test_nce_remote_table_op
)
LIST
(
REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op
)
endif
(
NOT WITH_DISTRIBUTE
)
if
(
NOT
${
WITH_GPU
}
)
LIST
(
REMOVE_ITEM TEST_OPS test_conv2d_fusion_op
)
elseif
(
${
CUDNN_
MAJOR_VERSION
}
VERSION_LESS 7
)
elseif
(
${
CUDNN_
VERSION
}
VERSION_LESS 7100
)
LIST
(
REMOVE_ITEM TEST_OPS test_conv2d_fusion_op
)
endif
()
list
(
REMOVE_ITEM TEST_OPS test_seq_concat_op
)
# FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
list
(
REMOVE_ITEM TEST_OPS test_modified_huber_loss_op
)
# FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
list
(
REMOVE_ITEM TEST_OPS test_lstm_unit_op
)
# # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
list
(
REMOVE_ITEM TEST_OPS test_nce
)
# FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
list
(
REMOVE_ITEM TEST_OPS test_recurrent_op
)
# FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
list
(
REMOVE_ITEM TEST_OPS test_cond_op
)
# FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
...
...
python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
0 → 100644
浏览文件 @
4a443ffc
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
from
paddle.fluid.tests.unittests.test_mean_op
import
TestMeanOp
,
TestFP16MeanOp
class TestNGRAPHMeanOp(TestMeanOp):
    """Re-declare the ``TestMeanOp`` cases inside this nGraph test module."""

    def setUp(self):
        # Nothing is customised here: the parent fixture is reused as is.
        TestMeanOp.setUp(self)
class TestNGRAPHFP16MeanOp(TestFP16MeanOp):
    """Re-declare the ``TestFP16MeanOp`` cases inside this nGraph test module."""

    def setUp(self):
        # Nothing is customised here: the parent fixture is reused as is.
        TestFP16MeanOp.setUp(self)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
0 → 100644
浏览文件 @
4a443ffc
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
from
paddle.fluid.tests.unittests.test_scale_op
import
TestScaleOp
,
TestScaleOpSelectedRows
,
TestScaleFp16Op
,
TestScaleFp16OpSelectedRows
class TestNGRAPHScaleOp(TestScaleOp):
    """``TestScaleOp`` variant whose ``init_dtype_type`` does nothing."""

    def init_dtype_type(self):
        """Intentionally a no-op (presumably overrides a parent hook)."""
        return None
class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows):
    """``TestScaleOpSelectedRows`` variant whose ``init_dtype_type`` does nothing."""

    def init_dtype_type(self):
        """Intentionally a no-op (presumably overrides a parent hook)."""
        return None
class TestNGRAPHScaleFp16Op(TestScaleFp16Op):
    """``TestScaleFp16Op`` variant whose ``init_dtype_type`` does nothing."""

    def init_dtype_type(self):
        """Intentionally a no-op (presumably overrides a parent hook)."""
        return None
class TestNGRAPHScaleFp16OpSelectedRows(TestScaleFp16OpSelectedRows):
    """``TestScaleFp16OpSelectedRows`` variant whose ``init_dtype_type`` does nothing."""

    def init_dtype_type(self):
        """Intentionally a no-op (presumably overrides a parent hook)."""
        return None
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
浏览文件 @
4a443ffc
...
...
@@ -78,7 +78,6 @@ class TestParallelExecutorBase(unittest.TestCase):
exec_strategy
.
allow_op_delay
=
allow_op_delay
if
use_fast_executor
:
exec_strategy
.
use_experimental_executor
=
True
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
\
if
use_reduce
else
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
...
...
python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
浏览文件 @
4a443ffc
...
...
@@ -51,8 +51,9 @@ class TestConv2dFusionOp(OpTest):
input
=
np
.
random
.
random
(
self
.
input_size
).
astype
(
self
.
dtype
)
filter
=
np
.
random
.
random
(
self
.
filter_size
).
astype
(
self
.
dtype
)
self
.
output
=
conv2d_forward_naive
(
input
,
filter
,
self
.
groups
,
conv2d_param
).
astype
(
self
.
dtype
)
self
.
output
,
_
,
_
,
_
,
_
=
conv2d_forward_naive
(
input
,
filter
,
self
.
groups
,
conv2d_param
)
self
.
output
=
self
.
output
.
astype
(
self
.
dtype
)
self
.
inputs
=
{
'Input'
:
OpTest
.
np_dtype_to_fluid_dtype
(
input
),
...
...
python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py
0 → 100644
浏览文件 @
4a443ffc
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
import
paddle.fluid.core
as
core
from
op_test
import
OpTest
from
test_conv2d_op
import
conv2d_forward_naive
,
TestConv2dOp
def conv2d_forward_refer(input, filter, group, conv_param):
    """Run ``conv2d_forward_naive`` and re-pack its output buffer.

    The naive result, indexed as ``out[n, c, h, w]``, is copied into a zero
    initialised buffer indexed as ``[n, h, w, c]``. That buffer is then
    reshaped (not transposed back) to ``(in_n, out_c, out_h, out_w)``.

    Args:
        input, filter, group, conv_param: forwarded unchanged to
            ``conv2d_forward_naive``.

    Returns:
        numpy.ndarray of shape ``(in_n, out_c, out_h, out_w)``.
    """
    out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group,
                                                          conv_param)
    repacked = np.zeros((in_n, out_h, out_w, out_c))
    # Same element mapping as the explicit loops:
    # repacked[n, i, j, m] = out[n, m, i, j].
    # NOTE(review): assumes `out` has at least (in_n, out_c, out_h, out_w)
    # elements per axis, as the index loops did.
    repacked[...] = np.transpose(out[:in_n, :out_c, :out_h, :out_w],
                                 (0, 2, 3, 1))
    return repacked.reshape(in_n, out_c, out_h, out_w)
class TestConv2dInt8Op(TestConv2dOp):
    """conv2d test case with ``use_mkldnn=True`` and integer source data.

    ``setUp`` draws a random float32 filter and random integer input, builds
    the expected output from an integer reference convolution
    (``conv2d_forward_refer``) and stores inputs/attrs/outputs on the
    instance. Subclasses customise the case through the ``init_*`` hooks.
    """

    def setUp(self):
        self.op_type = "conv2d"
        self.use_cudnn = False
        self.exhaustive_search = False
        self.use_cuda = False
        self.data_format = "AnyLayout"
        self.weighttype = np.float32
        self.use_mkldnn = True

        # Hook order matters: init_test_case reads self.groups, and the
        # data-type hook runs last.
        self.init_group()
        self.init_dilation()
        self.init_test_case()
        self.init_fuse_relu()
        self.init_data_type()

        conv_cfg = {
            'stride': self.stride,
            'pad': self.pad,
            'dilation': self.dilations
        }

        # The filter is drawn before the input so the RNG sequence is kept.
        weights = np.random.random(self.filter_size).astype(self.weighttype)
        if self.srctype == np.uint8:
            src = np.random.randint(0, 10,
                                    self.input_size).astype(self.srctype)
        else:
            src = np.random.randint(-5, 5,
                                    self.input_size).astype(self.srctype)
            # Constant offset of 128 added to the signed input below.
            input_shift = (np.ones(self.input_size) * 128).astype(np.uint8)

        if self.srctype == np.int8:
            # Signed source: weights are quantised with half the weight
            # scale, and the contribution of the constant shift is
            # subtracted from the shifted convolution.
            quant_weights = np.round(weights * self.scale_weights[0] *
                                     0.5).astype(np.int32)
            rescale = self.scale_out / (self.scale_in * self.scale_weights[0]
                                        * 0.5)
            shifted_src = np.round((src.astype(np.int32) + input_shift) *
                                   self.scale_in).astype(np.int32)
            with_shift = conv2d_forward_refer(
                shifted_src, quant_weights, self.groups,
                conv_cfg).astype(np.float32) * rescale
            shift_src = np.round(
                (input_shift) * self.scale_in).astype(np.int32)
            shift_only = conv2d_forward_refer(
                shift_src, quant_weights, self.groups,
                conv_cfg).astype(np.float32) * rescale
            rounded = np.round(with_shift - shift_only)
        else:
            quant_weights = np.round(weights *
                                     self.scale_weights[0]).astype(np.int32)
            rescale = self.scale_out / (self.scale_in *
                                        self.scale_weights[0])
            conv_out = conv2d_forward_refer(
                src.astype(np.int32), quant_weights, self.groups,
                conv_cfg).astype(np.float32)
            rounded = np.round(conv_out * rescale)

        # Optional ReLU, then cast to the destination integer type.
        if self.fuse_relu:
            output = np.maximum(rounded, 0).astype(self.dsttype)
        else:
            output = rounded.astype(self.dsttype)

        self.inputs = {
            'Input':
            OpTest.np_dtype_to_fluid_dtype(src.astype(self.srctype)),
            'Filter': OpTest.np_dtype_to_fluid_dtype(weights)
        }
        self.attrs = {
            'strides': self.stride,
            'paddings': self.pad,
            'groups': self.groups,
            'dilations': self.dilations,
            'use_cudnn': self.use_cudnn,
            'use_mkldnn': self.use_mkldnn,
            'data_format': self.data_format,
            'exhaustive_search': self.exhaustive_search,
            'Scale_in': self.scale_in,
            'Scale_out': self.scale_out,
            'Scale_weights': self.scale_weights,
            'fuse_relu': self.fuse_relu
        }
        self.outputs = {'Output': output}

    def test_check_output(self):
        """Check the op output on CPU with zero absolute tolerance."""
        cpu_place = core.CPUPlace()
        self.check_output_with_place(cpu_place, atol=0)

    def test_check_grad(self):
        """Gradient check is intentionally empty for this case."""
        return None

    def test_check_grad_no_filter(self):
        """Gradient check is intentionally empty for this case."""
        return None

    def test_check_grad_no_input(self):
        """Gradient check is intentionally empty for this case."""
        return None

    def init_test_case(self):
        """Start from the parent's case, then set filter size and scales."""
        super(TestConv2dInt8Op, self).init_test_case()
        channels_per_group = self.input_size[1] // self.groups
        self.filter_size = [1, channels_per_group, 3, 3]
        self.scale_in = 1.0
        self.scale_out = 0.5
        self.scale_weights = [10.0]

    def init_data_type(self):
        """Default data types: uint8 source, int8 destination."""
        self.srctype, self.dsttype = np.uint8, np.int8

    def init_fuse_relu(self):
        """ReLU fusion is enabled by default."""
        self.fuse_relu = True
#--------------------test conv2d u8 in and u8 out--------------------
class TestConv2d(TestConv2dInt8Op):
    """Int8 conv2d case: 2x3x5x5 input, six 3x3 filters, no padding."""

    def init_test_case(self):
        self.pad = [0, 0]
        self.stride = [1, 1]
        self.input_size = [2, 3, 5, 5]  # NCHW
        # Input channels must split evenly across the groups.
        channels_per_group, leftover = divmod(self.input_size[1], self.groups)
        assert leftover == 0
        self.filter_size = [6, channels_per_group, 3, 3]
        self.scale_in = 1.0
        self.scale_out = 0.5
        self.scale_weights = [10.0]
class TestWithPad(TestConv2d):
    """Same as ``TestConv2d`` but with a padding of 1 on both axes."""

    def init_test_case(self):
        super(TestWithPad, self).init_test_case()
        # Only the padding differs from the parent case.
        self.pad = [1, 1]
class TestWithGroup(TestConv2d):
    """Same as ``TestConv2d`` but convolving with three groups."""

    def init_group(self):
        """Set the number of convolution groups to 3."""
        self.groups = 3
class TestWithStride(TestConv2dInt8Op):
    """Int8 conv2d case: 2x3x6x6 input, stride 2, padding 1, 3x3 filters."""

    def init_test_case(self):
        self.pad = [1, 1]
        self.stride = [2, 2]
        self.input_size = [2, 3, 6, 6]
        # Input channels must split evenly across the groups.
        channels_per_group, leftover = divmod(self.input_size[1], self.groups)
        assert leftover == 0
        self.filter_size = [6, channels_per_group, 3, 3]
        self.scale_in = 1.0
        self.scale_out = 0.8
        self.scale_weights = [10.0]
class TestWith1x1(TestConv2dInt8Op):
    """Int8 conv2d case: 1x3x5x5 input convolved with six 1x1 filters."""

    def init_test_case(self):
        self.pad = [0, 0]
        self.stride = [1, 1]
        self.input_size = [1, 3, 5, 5]
        # Input channels must split evenly across the groups.
        channels_per_group, leftover = divmod(self.input_size[1], self.groups)
        assert leftover == 0
        self.filter_size = [6, channels_per_group, 1, 1]
        self.scale_in = 1.0
        self.scale_out = 0.5
        self.scale_weights = [12.0]
class TestWithInput1x1Filter1x1(TestConv2dInt8Op):
    """Int8 conv2d case: 2x3x1x1 input, six 1x1 filters, three groups."""

    def init_test_case(self):
        self.pad = [0, 0]
        self.stride = [1, 1]
        self.input_size = [2, 3, 1, 1]
        # Input channels must split evenly across the groups.
        channels_per_group, leftover = divmod(self.input_size[1], self.groups)
        assert leftover == 0
        self.filter_size = [6, channels_per_group, 1, 1]
        self.scale_in = 1.0
        self.scale_out = 0.5
        self.scale_weights = [10.0]

    def init_group(self):
        """Set the number of convolution groups to 3."""
        self.groups = 3
def init_data_type_with_fusion(self, input_dt, fuse_relu):
    """Configure source/destination dtypes and the ReLU-fusion flag.

    The previous version defined a nested ``init_fuse_relu`` closure that was
    never called or attached to the instance, so the requested ``fuse_relu``
    value was silently dropped. The flag is now stored directly.

    Args:
        self: the test-case instance to configure.
        input_dt: numpy dtype used for the source tensor.
        fuse_relu: whether ReLU is fused; selects uint8 (True) or int8
            (False) as the destination dtype.
    """
    self.srctype = input_dt
    self.dsttype = np.uint8 if fuse_relu else np.int8
    # NOTE(review): TestConv2dInt8Op.setUp calls init_fuse_relu() before
    # init_data_type(), so this assignment is the one that takes effect for
    # classes whose init_data_type delegates here.
    self.fuse_relu = fuse_relu
def create_test_int8_class(parent):
    """Register three data-type variants of ``parent`` as module globals.

    For the given test class, three subclasses are created that only override
    ``init_data_type``:

    * s8 input, fused ReLU      -> ``<parent>_s8_relu_1``
    * s8 input, no fused ReLU   -> ``<parent>_s8_relu_0``
    * u8 input, no fused ReLU   -> ``<parent>_u8_relu_0``

    The source type is part of the generated name. Previously the last two
    variants were both named ``<parent>_relu_0``, so the second ``globals()``
    assignment overwrote the first and the s8/s8 case was never registered.

    Args:
        parent: the test-case class to derive the variants from.
    """

    #--------------------test conv2d s8 in and u8 out--------------------

    class TestS8U8Case(parent):
        def init_data_type(self):
            init_data_type_with_fusion(self, np.int8, True)

    #--------------------test conv2d s8 in and s8 out--------------------

    class TestS8S8Case(parent):
        def init_data_type(self):
            init_data_type_with_fusion(self, np.int8, False)

    #--------------------test conv2d u8 in and s8 out--------------------

    class TestU8S8Case(parent):
        def init_data_type(self):
            init_data_type_with_fusion(self, np.uint8, False)

    variants = (
        (TestS8U8Case, "s8", "1"),
        (TestS8S8Case, "s8", "0"),
        (TestU8S8Case, "u8", "0"), )
    for case_cls, src_tag, relu_tag in variants:
        cls_name = "{0}_{1}_relu_{2}".format(parent.__name__, src_tag,
                                             relu_tag)
        case_cls.__name__ = cls_name
        # Publishing the class under a unique module-level name is what makes
        # it visible to unittest's module-based discovery.
        globals()[cls_name] = case_cls
# Generate the data-type variants for every int8 conv2d base case.
for _int8_parent in (TestConv2dInt8Op, TestWithPad, TestWithStride,
                     TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1):
    create_test_int8_class(_int8_parent)
# Drop the loop variable so no extra module-level alias of a test class is
# left behind.
del _int8_parent
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_conv2d_op.py
浏览文件 @
4a443ffc
...
...
@@ -60,7 +60,7 @@ def conv2d_forward_naive(input, filter, group, conv_param):
np
.
sum
(
input_pad_masked
*
f_sub
[
k
,
:,
:,
:],
axis
=
(
1
,
2
,
3
))
return
out
return
out
,
in_n
,
out_h
,
out_w
,
out_c
class
TestConv2dOp
(
OpTest
):
...
...
@@ -85,8 +85,9 @@ class TestConv2dOp(OpTest):
input
=
np
.
random
.
random
(
self
.
input_size
).
astype
(
self
.
dtype
)
filter
=
np
.
random
.
random
(
self
.
filter_size
).
astype
(
self
.
dtype
)
output
=
conv2d_forward_naive
(
input
,
filter
,
self
.
groups
,
conv2d_param
).
astype
(
self
.
dtype
)
output
,
_
,
_
,
_
,
_
=
conv2d_forward_naive
(
input
,
filter
,
self
.
groups
,
conv2d_param
)
output
=
output
.
astype
(
self
.
dtype
)
self
.
inputs
=
{
'Input'
:
OpTest
.
np_dtype_to_fluid_dtype
(
input
),
...
...
python/paddle/fluid/tests/unittests/test_dist_base.py
浏览文件 @
4a443ffc
...
...
@@ -442,10 +442,10 @@ class TestDistBase(unittest.TestCase):
tr_cmd
=
"%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f"
tr0_cmd
=
tr_cmd
%
\
(
self
.
_python_interp
,
model
,
self
.
_ps_endpoints
,
0
,
w0_ep
,
self
.
_lr
/
2
)
0
,
w0_ep
,
self
.
_lr
)
tr1_cmd
=
tr_cmd
%
\
(
self
.
_python_interp
,
model
,
self
.
_ps_endpoints
,
1
,
w1_ep
,
self
.
_lr
/
2
)
1
,
w1_ep
,
self
.
_lr
)
if
self
.
_mem_opt
:
tr0_cmd
+=
" --mem_opt"
...
...
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
浏览文件 @
4a443ffc
...
...
@@ -14,14 +14,15 @@
from
__future__
import
print_function
import
traceback
import
math
import
collections
import
six
import
unittest
import
numpy
as
np
import
paddle.fluid
as
fluid
from
paddle.fluid.transpiler.distribute_transpiler
import
delete_ops
import
traceback
import
collections
import
six
class
TranspilerTest
(
unittest
.
TestCase
):
...
...
@@ -520,7 +521,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
'split_selected_rows'
,
'send'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_selected_rows'
,
'send'
,
'send_barrier'
,
'recv'
,
'recv'
,
'
recv'
,
'recv'
,
'fetch_barrier'
,
'concat'
,
'concat
'
'recv'
,
'
fetch_barrier
'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
...
...
@@ -560,7 +561,7 @@ class TestDistLookupTable(TestDistLookupTableBase):
'lookup_table_grad'
,
'split_selected_rows'
,
'send'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_ids'
,
'send'
,
'send_barrier'
,
'recv'
,
'recv'
,
'
recv'
,
'fetch_barrier'
,
'concat
'
'recv'
,
'recv'
,
'
fetch_barrier
'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
startup_ops
=
[
...
...
@@ -607,8 +608,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'split_selected_rows'
,
'send'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_selected_rows'
,
'send'
,
'recv'
,
'recv'
,
'recv'
,
'recv'
,
'concat'
,
'concat'
'sum'
,
'split_selected_rows'
,
'send'
,
'recv'
,
'recv'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
...
...
@@ -648,8 +648,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'split_selected_rows'
,
'send'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_ids'
,
'send'
,
'recv'
,
'recv'
,
'recv'
,
'concat'
'lookup_table_grad'
,
'sum'
,
'split_ids'
,
'send'
,
'recv'
,
'recv'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
startup_ops
=
[
...
...
@@ -824,5 +823,142 @@ class TestRemoteLookupTable(TestDistLookupTableBase):
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
# test for remote prefetch
class TestRemoteNce(TestDistLookupTableBase):
    """Transpiler test for an ``nce`` layer built with ``is_sparse=True``.

    Checks that, in the trainer program, ``nce_b`` is an output of a ``recv``
    op while ``nce_w`` is not.
    """

    def network_with_table(self, is_sparse, is_distributed):
        class_count = 20
        sampler_kind = "uniform"
        custom_dist = np.random.dirichlet(np.ones(20) * 1000).astype(
            'float32')

        feature = fluid.layers.data(
            name="input", shape=[10], dtype="float32")
        target = fluid.layers.data(name="label", shape=[1], dtype="int64")

        # Create the weight and bias parameters by name, in that order; the
        # nce layer below refers to them through param_attr / bias_attr.
        for param_name, width in (('nce_w', 10), ('nce_b', 1)):
            fluid.default_main_program().global_block().create_parameter(
                shape=[class_count, width],
                dtype='float32',
                name=param_name,
                initializer=fluid.initializer.ConstantInitializer())

        cost = fluid.layers.nce(input=feature,
                                label=target,
                                num_total_classes=class_count,
                                sampler=sampler_kind,
                                custom_dist=custom_dist.tolist(),
                                sample_weight=None,
                                param_attr='nce_w',
                                bias_attr='nce_b',
                                seed=1,
                                num_neg_samples=5,
                                is_sparse=is_sparse)
        mean_cost = fluid.layers.mean(cost)
        # optimizer
        adam = fluid.optimizer.Adam(learning_rate=0.003)
        adam.minimize(mean_cost)

    def net_conf(self):
        import os
        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
        self.network_with_table(is_sparse=True, is_distributed=False)

    def transpiler_test_impl(self):
        trainer, _ = self.get_trainer()

        # Names of every variable produced by a recv op in the trainer block.
        received = []
        for op in trainer.blocks[0].ops:
            if op.type != "recv":
                continue
            received.extend(op.output("Out"))

        for absent in ["nce_w"]:
            self.assertFalse(absent in received)
        for present in ["nce_b"]:
            self.assertTrue(present in received)
# test for remote prefetch
class TestRemoteHsigmoid(TestDistLookupTableBase):
    """Transpiler test for a custom-tree ``hsigmoid`` with ``is_sparse=True``.

    Checks the attributes the transpiler puts on the
    ``hierarchical_sigmoid`` op and that exactly one ``recv`` op exists,
    receiving only the hsigmoid bias.
    """

    def network_with_table(self, is_sparse, is_distributed):
        num_total_classes = 3

        input = fluid.layers.data(name="input", shape=[1], dtype="float32")
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
        path_table = fluid.layers.data(
            name='path_table', shape=[3], dtype='int64')
        path_code = fluid.layers.data(
            name='path_code', shape=[3], dtype='int64')

        # The returned parameter handles are not needed; the calls are kept
        # because they create 'hs_w' and 'hs_b' in the main program.
        fluid.default_main_program().global_block().create_parameter(
            shape=[num_total_classes, 10],
            dtype='float32',
            name='hs_w',
            initializer=fluid.initializer.ConstantInitializer())
        fluid.default_main_program().global_block().create_parameter(
            shape=[3, 1],
            dtype='float32',
            name='hs_b',
            initializer=fluid.initializer.ConstantInitializer())

        emb = fluid.layers.embedding(
            input=input,
            is_sparse=is_sparse,
            size=[3, 3],
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
                scale=1 / math.sqrt(num_total_classes))))

        cost = fluid.layers.hsigmoid(
            input=emb,
            label=label,
            num_classes=num_total_classes,
            path_table=path_table,
            path_code=path_code,
            is_custom=True,
            is_sparse=is_sparse)
        avg_cost = fluid.layers.mean(cost)
        # optimizer
        optimizer = fluid.optimizer.SGD(learning_rate=0.003)
        optimizer.minimize(avg_cost)

    def net_conf(self):
        import os
        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
        self.network_with_table(is_sparse=True, is_distributed=False)

    def transpiler_test_impl(self):
        trainer, _ = self.get_trainer()
        trainer_ops = trainer.blocks[0].ops

        # Expected first element of each attribute on the hsigmoid op.
        # unittest assertions are used instead of bare `assert` so the
        # checks still run under `python -O`.
        expected_attrs = (
            ("epmap", u'127.0.0.1:6174'),
            ("table_names", u'hierarchical_sigmoid_0.w_0'), )

        # Collected for parity with the original test; not asserted on here.
        # The lookups still fail loudly if "W"/"Bias" inputs are missing.
        params_to_check = list()
        for op in trainer_ops:
            if op.type == "hierarchical_sigmoid":
                params_to_check = [op.input("W")[0], op.input("Bias")[0]]
                for name, first_value in expected_attrs:
                    self.assertTrue(op.has_attr(name))
                    self.assertEqual(op.attr(name)[0], first_value)
            elif op.type == "lookup_table":
                params_to_check.append(op.input("W")[0])

        # Exactly one recv op, and it receives only the hsigmoid bias.
        recv_ops = [op for op in trainer_ops if op.type == "recv"]
        for op in recv_ops:
            self.assertEqual(len(op.output("Out")), 1)
            self.assertEqual(
                op.output("Out")[0], u'hierarchical_sigmoid_0.b_0')
        self.assertEqual(len(recv_ops), 1)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
浏览文件 @
4a443ffc
...
...
@@ -29,6 +29,12 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
print
(
'Skip use_cuda=True because Paddle is not compiled with cuda'
)
return
if
use_parallel_executor
and
os
.
name
==
'nt'
:
print
(
'Skip use_parallel_executor=True because Paddle comes without parallel support on windows'
)
return
word_dict
=
paddle
.
dataset
.
imdb
.
word_dict
()
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
imdb
.
train
(
word_dict
),
batch_size
=
batch_size
)
...
...
python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py
0 → 100644
浏览文件 @
4a443ffc
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
import
paddle.fluid.core
as
core
import
paddle.fluid
as
fluid
from
paddle.fluid.op
import
Operator
import
paddle.compat
as
cpt
class TestFusedEmbeddingSeqPoolOp(OpTest):
    """Output check for the ``fused_embedding_seq_pool`` operator.

    Uses a random 17 x ``emb_size`` table and four id pairs split by the LoD
    ``[[3, 1]]`` into a sequence of three pairs and a sequence of one pair.
    The expected output sums the looked-up rows within each sequence.
    """

    def setUp(self):
        self.op_type = "fused_embedding_seq_pool"
        self.emb_size = 2
        table = np.random.random((17, self.emb_size)).astype("float32")
        ids = np.array([[[4], [3]], [[4], [3]], [[2], [1]],
                        [[16], [1]]]).astype("int64")
        # (An unused `merged_ids` local was removed; it had no effect.)
        ids_expand = np.expand_dims(ids, axis=1)
        self.lod = [[3, 1]]
        self.attrs = {'is_sparse': True}
        self.inputs = {'W': table, 'Ids': (ids_expand, self.lod)}

        # First sequence: the three id pairs (4,3), (4,3), (2,1) summed.
        first_seq = table[[4, 3]] + table[[4, 3]] + table[[2, 1]]
        # Second sequence: the single id pair (16,1).
        second_seq = table[[16, 1]]
        self.outputs = {
            'Out': np.reshape(
                np.array([first_seq, second_seq]),
                [len(self.lod[0]), 2 * self.emb_size])
        }

    def test_check_output(self):
        """Compare the operator output with the expected value from setUp."""
        self.check_output()
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
浏览文件 @
4a443ffc
...
...
@@ -185,7 +185,7 @@ class TestHSigmoidOpSparse(OpTest):
self
.
inputs
=
{
'X'
:
x
,
'W'
:
w
,
'PTable'
:
path_table
,
'P
ath
Table'
:
path_table
,
'PathCode'
:
path_code
,
'Label'
:
label
,
'Bias'
:
bias
...
...
@@ -287,7 +287,7 @@ class TestHSigmoidOpWithCostumTree(OpTest):
self
.
inputs
=
{
'X'
:
x
,
'W'
:
w
,
'PTable'
:
path_table
,
'P
ath
Table'
:
path_table
,
'PathCode'
:
path_code
,
'Label'
:
label
,
'Bias'
:
bias
...
...
@@ -324,7 +324,7 @@ class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest):
self
.
inputs
=
{
'X'
:
x
,
'W'
:
w
,
'PTable'
:
path_table
,
'P
ath
Table'
:
path_table
,
'PathCode'
:
path_code
,
'Label'
:
label
,
}
...
...
python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py
0 → 100644
浏览文件 @
4a443ffc
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
os
import
signal
import
time
import
unittest
from
multiprocessing
import
Process
import
numpy
as
np
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.op
import
Operator
from
paddle.fluid.framework
import
Program
,
program_guard
def run_pserver(pserver_id, use_cuda, sync_mode):
    """Create a 5x8 ``table`` parameter and run a ``listen_and_serv`` op on it.

    Row ``i`` of the table is filled with ``i + pserver_id * 10 + 1`` so that
    values served by different pservers can be told apart by the caller.

    Args:
        pserver_id: Integer id of this pserver; offsets the table values.
        use_cuda: If true the table lives on ``CUDAPlace(0)``, otherwise on
            ``CPUPlace()``.
        sync_mode: Forwarded to the ``sync_mode`` attribute of the
            ``listen_and_serv`` op.
    """
    scope = fluid.core.Scope()
    program = Program()
    with fluid.scope_guard(scope):
        with program_guard(program, startup_program=Program()):
            # create table parameter in scope
            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            # create and initialize Param Variable
            param = scope.var('table').get_tensor()

            param_array = np.ones((5, 8)).astype("float32")
            # ``row`` is a view into ``param_array``, so the in-place multiply
            # updates the table itself.
            for i, row in enumerate(param_array):
                row *= row * i + pserver_id * 10 + 1
            param.set(param_array, place)

            optimize_block = program._create_block(program.global_block().idx)
            program.global_block().append_op(
                type="listen_and_serv",
                inputs={'X': []},
                outputs={},
                attrs={
                    "optimize_blocks": [optimize_block],
                    # Port 0 in the endpoint; the tests read the real port from
                    # a /tmp/paddle.<pid>.port file.
                    "endpoint": '127.0.0.1:0',
                    "Fanin": 1,
                    # Previously hard-coded to True, ignoring the argument.
                    "sync_mode": sync_mode,
                    "grad_to_block_id": []
                })

            exe = fluid.Executor(place)
            exe.run(program)
class TestListenAndServOp(unittest.TestCase):
    """Runs ``hierarchical_sigmoid`` with ``remote_prefetch`` against local pservers.

    Each pserver is a child process running ``run_pserver``; the test checks
    the ``W_Out`` tensor after the operator ran.
    """

    def setUp(self):
        # Seconds to wait for a pserver to publish its port file.
        self.ps_timeout = 5

    def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func):
        """Start ``pserver_func`` in a daemon child process and return the process."""
        p = Process(
            target=pserver_func, args=(pserver_id, use_cuda, sync_mode))
        p.daemon = True
        p.start()
        return p

    def _wait_ps_ready(self, pid):
        """Poll until ``/tmp/paddle.<pid>.port`` exists; fail after ``ps_timeout`` seconds."""
        start_left_time = self.ps_timeout
        sleep_time = 0.5
        while True:
            # An explicit failure instead of ``assert`` so that the loop still
            # terminates when Python runs with assertions disabled (-O).
            if start_left_time < 0:
                self.fail("wait ps ready failed")
            time.sleep(sleep_time)
            try:
                # the listen_and_serv_op would touch a file which contains the listen port
                # on the /tmp directory until it was ready to process all the RPC call.
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except OSError:
                start_left_time -= sleep_time

    def _get_pserver_port(self, pid):
        """Return the integer port stored in ``/tmp/paddle.<pid>.port``."""
        with open("/tmp/paddle.%d.port" % pid, 'r') as f:
            port = int(f.read().strip())
        return port

    def _run_hsigmoid_op(self, place, ports, path_table_rows, height_sections,
                         expected_rows):
        """Run one remote-prefetch ``hierarchical_sigmoid`` and check ``W_Out``.

        Args:
            place: Place on which tensors are set and the operator is run.
            ports: Ports of the local pservers, one endpoint per port.
            path_table_rows: Rows of the ``PathTable`` input.
            height_sections: ``height_sections`` attribute passed to the op.
            expected_rows: For each of the 5 rows of ``W_Out``, the value every
                element of that row must equal.
        """
        scope = fluid.core.Scope()
        program = Program()
        with fluid.scope_guard(scope):
            with program_guard(program, startup_program=Program()):
                x = scope.var('X').get_tensor()
                x_array = np.random.random((4, 8)).astype("float32") * 2
                x.set(x_array, place)
                # create and initialize Param Variable
                param = scope.var('W').get_tensor()
                param_array = np.zeros((5, 8)).astype("float32")
                param.set(param_array, place)

                path_table = scope.var('PathTable').get_tensor()
                path_table_array = np.array(path_table_rows).astype("int64")
                path_table.set(path_table_array, place)

                path_code = scope.var('PathCode').get_tensor()
                path_code_array = np.array(
                    [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1),
                     (1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]).astype("int64")
                path_code.set(path_code_array, place)

                label = scope.var('Label').get_tensor()
                label_array = np.array([0, 1, 4, 5])
                label.set(label_array, place)

                bias = scope.var('Bias').get_tensor()
                bias_array = np.random.random((5, 1)).astype("float32")
                bias.set(bias_array, place)

                # Output variables only have to exist in the scope. The
                # original left out the call parentheses on PreOut's
                # get_tensor.
                scope.var('Out').get_tensor()
                scope.var('PreOut').get_tensor()

                w_out = scope.var('W_Out').get_tensor()
                w_out.set(param_array, place)

                emaps = ['127.0.0.1:' + str(port) for port in ports]
                table_names = ['table'] * len(ports)

                # create and run hierarchical_sigmoid operator
                hsigmoid_op = Operator(
                    "hierarchical_sigmoid",
                    X='X',
                    W='W',
                    PathTable='PathTable',
                    PathCode='PathCode',
                    Label='Label',
                    Bias='Bias',
                    Out='Out',
                    PreOut='PreOut',
                    W_Out='W_Out',
                    remote_prefetch=True,
                    epmap=emaps,
                    table_names=table_names,
                    height_sections=height_sections)

                hsigmoid_op.run(scope, place)

                # get and compare result
                result_array = np.array(w_out)
                self.assertEqual(list(result_array.shape), [5, 8])
                for i, value in enumerate(expected_rows):
                    correct = np.full((1, 8), value).astype("float32")
                    self.assertTrue((result_array[i] == correct).all())

    def _run_hsigmoid_op_one_pserver(self, place, port):
        """Single pserver: rows 0, 1, 2 and 4 must equal ``i + 1``, row 3 stays 0."""
        self._run_hsigmoid_op(
            place,
            ports=[port],
            path_table_rows=[(0, 2, -1, -1, -1), (0, 1, 2, -1, -1),
                             (0, 1, 4, -1, -1), (0, 2, -1, -1, -1)],
            height_sections=[2],
            expected_rows=[1, 2, 3, 0, 5])

    def _run_hsigmoid_op_two_pserver(self, place, port0, port1):
        """Two pservers: rows 0-1 must equal ``i + 1`` and rows 2-4 ``i + 9``."""
        self._run_hsigmoid_op(
            place,
            ports=[port0, port1],
            path_table_rows=[(0, 2, -1, -1, -1), (0, 1, 3, -1, -1),
                             (0, 1, 4, -1, -1), (0, 2, -1, -1, -1)],
            height_sections=[2, 3],
            expected_rows=[1, 2, 11, 12, 13])

    def test_hsigmoid_op_remote(self):
        """Start two CPU pservers, run both scenarios, then stop the pservers."""
        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
        pservers = []
        try:
            # run pservers on CPU in sync mode, one after the other
            ports = []
            for pserver_id in range(2):
                p = self._start_pserver(pserver_id, False, True, run_pserver)
                pservers.append(p)
                self._wait_ps_ready(p.pid)
                ports.append(self._get_pserver_port(p.pid))

            places = [core.CPUPlace()]

            for place in places:
                self._run_hsigmoid_op_one_pserver(place, ports[0])
                self._run_hsigmoid_op_two_pserver(place, ports[0], ports[1])
        finally:
            # send SIGINT to every started pserver, even when a check failed
            for p in pservers:
                os.kill(p.pid, signal.SIGINT)
                p.join()
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py
0 → 100644
浏览文件 @
4a443ffc
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
os
import
signal
import
time
import
unittest
from
multiprocessing
import
Process
import
numpy
as
np
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.op
import
Operator
from
paddle.fluid.framework
import
Program
,
program_guard
def nce(input, weight, bias, sample_weight, labels, num_classes,
        num_sample_class):
    """NumPy reference computation of the NCE forward pass.

    For every input row the samples are its true labels followed by the
    classes ``0 .. num_sample_class - 1`` as negatives.

    Args:
        input: 2-D array, one row per batch item.
        weight: Array indexed by class id; each row is dotted with an input row.
        bias: Array indexed by class id, or None for no bias.
        sample_weight: Per-row weight indexed by batch position, or None for
            a weight of 1.
        labels: 2-D array of true class ids, one row per batch item.
        num_classes: Total number of classes.
        num_sample_class: Number of negative classes per row.

    Returns:
        Tuple ``(cost, sample_logits, sample_labels)``: cost has shape
        ``(batch, 1)``; the other two have shape
        ``(batch, num_sample_class + num_true_class)``.
    """
    batch_size = input.shape[0]
    num_true_class = labels.shape[1]
    samples_per_row = num_sample_class + num_true_class

    # Every sample is (row index, class id, is_true_label, row weight).
    samples = []
    for row in range(batch_size):
        row_weight = sample_weight[row] if sample_weight is not None else 1
        samples.extend(
            (row, true_cls, True, row_weight) for true_cls in labels[row])
        samples.extend((row, neg_cls, False, row_weight)
                       for neg_cls in range(num_sample_class))
    sample_labels = [entry[1] for entry in samples]

    # forward bias and weight: logit = bias[class] + input[row] . weight[class]
    sample_out = np.zeros(len(samples)).astype(np.float32)
    for pos, (row, cls, _, _) in enumerate(samples):
        if bias is not None:
            sample_out[pos] = bias[cls]
        sample_out[pos] += np.dot(input[row], weight[cls])

    # forward activation
    sample_out = 1.0 / (1.0 + np.exp(-sample_out))

    # forward cost, accumulated per input row
    out = np.zeros(batch_size).astype(np.float32)
    b = 1.0 / num_classes * num_sample_class
    for (row, _, is_true, row_weight), o in zip(samples, sample_out):
        if is_true:
            cost = -np.log(o / (o + b))
        else:
            cost = -np.log(b / (o + b))
        out[row] += cost * row_weight

    return (out[:, np.newaxis],
            np.array(sample_out).reshape(batch_size, samples_per_row),
            np.array(sample_labels).reshape(batch_size, samples_per_row))
def run_pserver(pserver_id, use_cuda, sync_mode):
    """Create a 5x8 ``table`` parameter and run a ``listen_and_serv`` op on it.

    Row ``i`` of the table is filled with ``i + pserver_id * 10 + 1`` so that
    values served by different pservers can be told apart by the caller.

    Args:
        pserver_id: Integer id of this pserver; offsets the table values.
        use_cuda: If true the table lives on ``CUDAPlace(0)``, otherwise on
            ``CPUPlace()``.
        sync_mode: Forwarded to the ``sync_mode`` attribute of the
            ``listen_and_serv`` op.
    """
    scope = fluid.core.Scope()
    program = Program()
    with fluid.scope_guard(scope):
        with program_guard(program, startup_program=Program()):
            # create table parameter in scope
            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            # create and initialize Param Variable
            param = scope.var('table').get_tensor()

            param_array = np.ones((5, 8)).astype("float32")
            # ``row`` is a view into ``param_array``, so the in-place multiply
            # updates the table itself.
            for i, row in enumerate(param_array):
                row *= row * i + pserver_id * 10 + 1
            param.set(param_array, place)

            optimize_block = program._create_block(program.global_block().idx)
            program.global_block().append_op(
                type="listen_and_serv",
                inputs={'X': []},
                outputs={},
                attrs={
                    "optimize_blocks": [optimize_block],
                    # Port 0 in the endpoint; the tests read the real port from
                    # a /tmp/paddle.<pid>.port file.
                    "endpoint": '127.0.0.1:0',
                    "Fanin": 1,
                    # Previously hard-coded to True, ignoring the argument.
                    "sync_mode": sync_mode,
                    "grad_to_block_id": []
                })

            exe = fluid.Executor(place)
            exe.run(program)
class TestListenAndServOp(unittest.TestCase):
    """Runs the ``nce`` operator with ``remote_prefetch`` against two local pservers."""

    def setUp(self):
        # Seconds to wait for a pserver to publish its port file.
        self.ps_timeout = 5

    def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func):
        """Launch ``pserver_func`` in a daemon child process and return the process."""
        pserver_args = (pserver_id, use_cuda, sync_mode)
        proc = Process(target=pserver_func, args=pserver_args)
        proc.daemon = True
        proc.start()
        return proc

    def _wait_ps_ready(self, pid):
        """Poll every 0.5s until ``/tmp/paddle.<pid>.port`` exists or the timeout is used up."""
        remaining = self.ps_timeout
        poll_interval = 0.5
        while True:
            assert remaining >= 0, "wait ps ready failed"
            time.sleep(poll_interval)
            try:
                # listen_and_serv writes its listen port to this file once it
                # is ready to handle RPC calls.
                os.stat("/tmp/paddle.%d.port" % pid)
            except os.error:
                remaining -= poll_interval
            else:
                return

    def _get_pserver_port(self, pid):
        """Read the integer port from ``/tmp/paddle.<pid>.port``."""
        with open("/tmp/paddle.%d.port" % pid, 'r') as f:
            return int(f.read().strip())

    def _run_nce_op_two_pserver(self, place, port0, port1):
        """Run ``nce`` against both pservers and compare with the NumPy reference."""
        scope = fluid.core.Scope()
        program = Program()
        with fluid.scope_guard(scope):
            with program_guard(program, startup_program=Program()):

                def feed(var_name, value):
                    # Create the scope variable and load ``value`` into its tensor.
                    scope.var(var_name).get_tensor().set(value, place)

                x_array = np.random.random((4, 8)).astype("float32")
                feed('Input', x_array)

                # The local weight starts as zeros.
                feed('Weight', np.zeros((5, 8)).astype("float32"))

                bias_array = np.random.random((5, 1)).astype("float32")
                feed('Bias', bias_array)

                sample_weight = np.random.random((4, 1)).astype("float32")
                feed('SampleWeight', sample_weight)

                label_array = np.array([[0], [1], [4], [3]])
                feed('Label', label_array)

                # Outputs are pre-filled with zeros.
                feed('Cost', np.zeros((4, 1)).astype("float32"))
                feed('SampleLogits', np.zeros((4, 3)).astype("float32"))
                feed('SampleLabels', np.zeros((4, 3)).astype("int"))

                endpoints = ['127.0.0.1:' + str(port0),
                             '127.0.0.1:' + str(port1)]

                # create and run nce operator
                nce_op = Operator(
                    "nce",
                    Input='Input',
                    Weight='Weight',
                    Label='Label',
                    Bias='Bias',
                    Cost='Cost',
                    SampleLogits='SampleLogits',
                    SampleLabels='SampleLabels',
                    SampleWeight='SampleWeight',
                    num_total_classes=5,
                    num_neg_samples=2,
                    custom_neg_classes=list(range(2)),
                    sampler=0,
                    seed=0,
                    is_sparse=True,
                    remote_prefetch=True,
                    epmap=endpoints,
                    table_names=['table', 'table'],
                    height_sections=[2, 3])

                nce_op.run(scope, place)

                # get and compare result
                o_cost = np.array(scope.var('Cost').get_tensor())
                o_logits = np.array(scope.var('SampleLogits').get_tensor())
                o_labels = np.array(scope.var('SampleLabels').get_tensor())

                # Weights used for the reference: rows 0-1 use pserver id 0,
                # rows 2-4 use pserver id 1.
                param_array = np.ones((5, 8)).astype("float32")
                for pserver_id, rows in ((0, range(2)), (1, range(2, 5))):
                    for i in rows:
                        param_array[i] *= (
                            param_array[i] * i + pserver_id * 10 + 1)

                out = nce(x_array, param_array, bias_array, sample_weight,
                          label_array, 5, 2)

                # NOTE(review): ``.all()`` collapses each array to one boolean,
                # so these assertions compare only those booleans, not the
                # element values.
                for actual, expected in zip((o_cost, o_logits, o_labels), out):
                    self.assertAlmostEqual(
                        actual.all(), expected.all(), delta=1e-6)

    def test_nce_op_remote(self):
        """Start two CPU pservers, run the nce check, then stop the pservers."""
        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
        # run pserver on CPU in sync mode
        p0 = self._start_pserver(0, False, True, run_pserver)
        self._wait_ps_ready(p0.pid)
        port0 = self._get_pserver_port(p0.pid)

        p1 = self._start_pserver(1, False, True, run_pserver)
        self._wait_ps_ready(p1.pid)
        port1 = self._get_pserver_port(p1.pid)

        for place in [core.CPUPlace()]:
            self._run_nce_op_two_pserver(place, port0, port1)

        # send SIGINT to each pserver and wait for it to exit
        for proc in (p0, p1):
            os.kill(proc.pid, signal.SIGINT)
            proc.join()
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
浏览文件 @
4a443ffc
...
...
@@ -175,41 +175,61 @@ class TestCRFModel(unittest.TestCase):
print
(
pe
.
run
(
feed
=
feeder
.
feed
(
cur_batch
),
fetch_list
=
[
avg_cost
.
name
])[
0
])
def
test_update_sparse_parameter_all_reduce
(
self
):
def
_new_build_strategy
(
self
,
use_reduce
=
False
):
build_strategy
=
fluid
.
BuildStrategy
()
if
use_reduce
:
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
else
:
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
return
build_strategy
def
test_update_sparse_parameter_all_reduce
(
self
):
if
core
.
is_compiled_with_cuda
():
self
.
check_network_convergence
(
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
is_sparse
=
True
,
build_strategy
=
self
.
_new_build_strategy
(),
use_cuda
=
True
)
self
.
check_network_convergence
(
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
is_sparse
=
True
,
build_strategy
=
self
.
_new_build_strategy
(),
use_cuda
=
False
)
def
test_update_dense_parameter_all_reduce
(
self
):
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
if
core
.
is_compiled_with_cuda
():
self
.
check_network_convergence
(
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
is_sparse
=
False
,
build_strategy
=
self
.
_new_build_strategy
(),
use_cuda
=
True
)
self
.
check_network_convergence
(
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
is_sparse
=
False
,
build_strategy
=
self
.
_new_build_strategy
(),
use_cuda
=
False
)
def
test_update_sparse_parameter_reduce
(
self
):
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
if
core
.
is_compiled_with_cuda
():
self
.
check_network_convergence
(
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
is_sparse
=
True
,
build_strategy
=
self
.
_new_build_strategy
(
use_reduce
=
True
),
use_cuda
=
True
)
self
.
check_network_convergence
(
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
is_sparse
=
True
,
build_strategy
=
self
.
_new_build_strategy
(
use_reduce
=
True
),
use_cuda
=
False
)
def
test_update_dense_parameter_reduce
(
self
):
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
if
core
.
is_compiled_with_cuda
():
self
.
check_network_convergence
(
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
is_sparse
=
False
,
build_strategy
=
self
.
_new_build_strategy
(
use_reduce
=
True
),
use_cuda
=
True
)
self
.
check_network_convergence
(
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
is_sparse
=
False
,
build_strategy
=
self
.
_new_build_strategy
(
use_reduce
=
True
),
use_cuda
=
False
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
浏览文件 @
4a443ffc
...
...
@@ -86,6 +86,7 @@ class TestMNIST(TestParallelExecutorBase):
"label"
:
label
},
use_cuda
=
use_cuda
,
use_reduce
=
False
)
reduce_first_loss
,
reduce_last_loss
=
self
.
check_network_convergence
(
model
,
feed_dict
=
{
"image"
:
img
,
...
...
python/paddle/fluid/tests/unittests/test_reader_reset.py
浏览文件 @
4a443ffc
...
...
@@ -75,8 +75,6 @@ class TestReaderReset(unittest.TestCase):
exe
.
run
(
startup_prog
)
build_strategy
=
fluid
.
BuildStrategy
()
if
with_double_buffer
:
build_strategy
.
enable_data_balance
=
True
exec_strategy
=
fluid
.
ExecutionStrategy
()
parallel_exe
=
fluid
.
ParallelExecutor
(
use_cuda
=
self
.
use_cuda
,
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
4a443ffc
...
...
@@ -251,11 +251,10 @@ class DistributeTranspiler(object):
def
_get_all_remote_sparse_update_op
(
self
,
main_program
):
sparse_update_ops
=
[]
sparse_update_op_types
=
[
"lookup_table"
]
sparse_update_op_types
=
[
"lookup_table"
,
"nce"
,
"hierarchical_sigmoid"
]
for
op
in
main_program
.
global_block
().
ops
:
if
op
.
type
in
sparse_update_op_types
and
op
.
attr
(
'remote_prefetch'
)
is
True
and
not
op
.
attr
(
'is_distributed'
):
'remote_prefetch'
)
is
True
:
sparse_update_ops
.
append
(
op
)
return
sparse_update_ops
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录