PaddlePaddle / Paddle

Commit c7e38680
Authored Jan 28, 2019 by Qiao Longfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-communicator

Parents: 1edc0423, a6910f90
Showing 138 changed files with 5,255 additions and 1,652 deletions (+5255, -1652).
CMakeLists.txt  +0 -6
Dockerfile  +0 -2
cmake/FindSphinx.cmake  +0 -147
cmake/generic.cmake  +4 -1
paddle/fluid/API.spec  +3 -1
paddle/fluid/framework/CMakeLists.txt  +25 -12
paddle/fluid/framework/commit.h.in  +21 -0
paddle/fluid/framework/details/build_strategy.h  +1 -1
paddle/fluid/framework/details/execution_strategy.h  +3 -0
paddle/fluid/framework/executor.cc  +4 -20
paddle/fluid/framework/ir/graph_traits.cc  +2 -1
paddle/fluid/framework/lod_tensor.cc  +4 -3
paddle/fluid/framework/mixed_vector.h  +9 -9
paddle/fluid/framework/ngraph_operator.h  +0 -64
paddle/fluid/framework/operator.cc  +9 -8
paddle/fluid/framework/tensor_impl.h  +2 -1
paddle/fluid/imperative/CMakeLists.txt  +2 -2
paddle/fluid/imperative/layer.cc  +136 -48
paddle/fluid/imperative/layer.h  +25 -7
paddle/fluid/imperative/tracer.cc  +94 -50
paddle/fluid/imperative/tracer.h  +9 -4
paddle/fluid/inference/analysis/argument.h  +6 -1
paddle/fluid/inference/analysis/helper.cc  +8 -0
paddle/fluid/inference/analysis/helper.h  +54 -0
paddle/fluid/inference/analysis/ir_pass_manager.cc  +19 -2
paddle/fluid/inference/analysis/ir_pass_manager.h  +3 -2
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc  +46 -7
paddle/fluid/inference/analysis/passes/CMakeLists.txt  +1 -1
paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc  +5 -1
paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc  +117 -71
paddle/fluid/inference/analysis/passes/memory_optimize_pass.h  +1 -1
paddle/fluid/inference/api/analysis_config.cc  +25 -7
paddle/fluid/inference/api/analysis_predictor.cc  +100 -21
paddle/fluid/inference/api/analysis_predictor.h  +18 -1
paddle/fluid/inference/api/analysis_predictor_tester.cc  +2 -0
paddle/fluid/inference/api/api.cc  +10 -0
paddle/fluid/inference/api/api_tester.cc  +6 -0
paddle/fluid/inference/api/paddle_analysis_config.h  +12 -14
paddle/fluid/inference/api/paddle_api.h  +10 -0
paddle/fluid/inference/api/paddle_pass_builder.h  +10 -7
paddle/fluid/inference/tensorrt/CMakeLists.txt  +1 -1
paddle/fluid/inference/tensorrt/engine.cc  +7 -0
paddle/fluid/inference/tensorrt/engine.h  +16 -8
paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc  +147 -0
paddle/fluid/inference/tensorrt/trt_int8_calibrator.h  +128 -0
paddle/fluid/inference/tests/api/CMakeLists.txt  +5 -0
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc  +21 -3
paddle/fluid/memory/allocation/legacy_allocator.cc  +30 -11
paddle/fluid/operators/CMakeLists.txt  +2 -2
paddle/fluid/operators/beam_search_op.cc  +31 -204
paddle/fluid/operators/beam_search_op.cu.cc  +24 -0
paddle/fluid/operators/beam_search_op.h  +11 -180
paddle/fluid/operators/beam_search_op_test.cc  +0 -92
paddle/fluid/operators/bpr_loss_op.h  +2 -2
paddle/fluid/operators/conv_fusion_op.cu.cc  +26 -39
paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc  +28 -29
paddle/fluid/operators/distributed/CMakeLists.txt  +5 -3
paddle/fluid/operators/distributed/brpc/brpc_client.cc  +33 -13
paddle/fluid/operators/distributed/brpc/brpc_client.h  +9 -0
paddle/fluid/operators/distributed/brpc/brpc_server.cc  +59 -6
paddle/fluid/operators/distributed/request_handler_impl.cc  +14 -3
paddle/fluid/operators/distributed/rpc_server.cc  +22 -15
paddle/fluid/operators/distributed/variable_response.cc  +3 -2
paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc  +7 -1
paddle/fluid/operators/distributed_ops/merge_ids_op.h  +9 -9
paddle/fluid/operators/elementwise/elementwise_op_function.h  +0 -62
paddle/fluid/operators/fused/fusion_conv_inception_op.cu  +11 -12
paddle/fluid/operators/grid_sampler_op.cc  +8 -6
paddle/fluid/operators/jit/CMakeLists.txt  +1 -1
paddle/fluid/operators/jit/benchmark.cc  +70 -37
paddle/fluid/operators/lrn_mkldnn_op.cc  +19 -8
paddle/fluid/operators/math/CMakeLists.txt  +2 -0
paddle/fluid/operators/math/beam_search.cc  +283 -0
paddle/fluid/operators/math/beam_search.cu  +393 -0
paddle/fluid/operators/math/beam_search.h  +119 -0
paddle/fluid/operators/math/beam_search_test.cc  +141 -0
paddle/fluid/operators/math/sampler.cc  +9 -1
paddle/fluid/operators/math/sampler.h  +1 -0
paddle/fluid/operators/math/selected_rows_functor_test.cc  +1 -1
paddle/fluid/operators/math/selected_rows_functor_test.cu.cc  +1 -1
paddle/fluid/operators/math/sequence_pooling_test.cc  +1 -1
paddle/fluid/operators/nce_op.h  +5 -0
paddle/fluid/operators/ngraph/CMakeLists.txt  +5 -0
paddle/fluid/operators/ngraph/ngraph_bridge.cc  +18 -18
paddle/fluid/operators/ngraph/ngraph_bridge.h  +6 -6
paddle/fluid/operators/ngraph/ngraph_engine.cc  +491 -0
paddle/fluid/operators/ngraph/ngraph_engine.h  +93 -0
paddle/fluid/operators/ngraph/ngraph_engine_op.cc  +52 -0
paddle/fluid/operators/ngraph/ngraph_engine_op.h  +58 -0
paddle/fluid/operators/reader/create_ctr_reader_op.cc  +27 -9
paddle/fluid/operators/reader/ctr_reader.cc  +199 -39
paddle/fluid/operators/reader/ctr_reader.h  +73 -18
paddle/fluid/operators/reader/ctr_reader_test.cc  +81 -7
paddle/fluid/operators/reader/lod_tensor_blocking_queue.h  +4 -8
paddle/fluid/operators/reader/read_op.cc  +24 -16
paddle/fluid/operators/reader/reader_op_registry.cc  +21 -13
paddle/fluid/operators/shuffle_channel_op.cc  +113 -0
paddle/fluid/operators/shuffle_channel_op.cu  +125 -0
paddle/fluid/operators/shuffle_channel_op.h  +95 -0
paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc  +7 -1
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h  +93 -4
paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc  +25 -25
paddle/fluid/operators/warpctc_cudnn_op.cu.cc  +11 -13
paddle/fluid/platform/cuda_device_function.h  +29 -0
paddle/fluid/platform/device_context.cc  +3 -2
paddle/fluid/platform/gpu_info.cc  +25 -3
paddle/fluid/pybind/imperative.cc  +23 -3
paddle/fluid/pybind/inference_api.cc  +10 -3
paddle/fluid/pybind/pybind.cc  +42 -16
paddle/scripts/paddle_build.sh  +2 -32
python/paddle/fluid/contrib/__init__.py  +3 -0
python/paddle/fluid/contrib/int8_inference/utility.py  +29 -5
python/paddle/fluid/contrib/reader/README.md  +15 -0
python/paddle/fluid/contrib/reader/__init__.py  +19 -0
python/paddle/fluid/contrib/reader/ctr_reader.py  +57 -16
python/paddle/fluid/contrib/tests/test_calibration.py  +67 -40
python/paddle/fluid/framework.py  +21 -2
python/paddle/fluid/imperative/base.py  +13 -5
python/paddle/fluid/imperative/nn.py  +159 -18
python/paddle/fluid/layer_helper.py  +4 -1
python/paddle/fluid/layers/io.py  +1 -1
python/paddle/fluid/layers/learning_rate_scheduler.py  +1 -1
python/paddle/fluid/layers/nn.py  +104 -23
python/paddle/fluid/layers/tensor.py  +2 -1
python/paddle/fluid/optimizer.py  +18 -15
python/paddle/fluid/parallel_executor.py  +1 -1
python/paddle/fluid/tests/unittests/CMakeLists.txt  +3 -0
python/paddle/fluid/tests/unittests/test_dist_base.py  +1 -1
python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py  +6 -1
python/paddle/fluid/tests/unittests/test_imperative.py  +18 -3
python/paddle/fluid/tests/unittests/test_imperative_gan.py  +4 -2
python/paddle/fluid/tests/unittests/test_imperative_optimizer.py  +2 -1
python/paddle/fluid/tests/unittests/test_imperative_resnet.py  +370 -0
python/paddle/fluid/tests/unittests/test_layers.py  +8 -0
python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py  +52 -0
python/paddle/fluid/transpiler/details/checkport.py  +3 -1
python/paddle/fluid/transpiler/distribute_transpiler.py  +7 -2
python/setup.py.in  +1 -0
CMakeLists.txt

@@ -276,9 +276,3 @@ add_subdirectory(paddle)
 if(WITH_PYTHON)
   add_subdirectory(python)
 endif()
-
-if(WITH_DOC)
-  find_package(Sphinx REQUIRED)
-  find_python_module(recommonmark REQUIRED)
-  add_subdirectory(doc)
-endif()
Dockerfile

@@ -11,12 +11,10 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
 # ENV variables
 ARG WITH_GPU
 ARG WITH_AVX
-ARG WITH_DOC
 ENV WOBOQ OFF
 ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
-ENV WITH_DOC=${WITH_DOC:-OFF}
 ENV HOME /root
 # Add bash enhancements
...
cmake/FindSphinx.cmake (deleted, mode 100644 → 0; parent 1edc0423)

# - This module looks for Sphinx
# Find the Sphinx documentation generator
#
# This modules defines
#   SPHINX_EXECUTABLE
#   SPHINX_FOUND

find_program(SPHINX_EXECUTABLE
  NAMES sphinx-build
  PATHS
    /usr/bin
    /usr/local/bin
    /opt/local/bin
  DOC "Sphinx documentation generator")

if(NOT SPHINX_EXECUTABLE)
  set(_Python_VERSIONS 2.7 2.6 2.5 2.4 2.3 2.2 2.1 2.0 1.6 1.5)

  foreach(_version ${_Python_VERSIONS})
    set(_sphinx_NAMES sphinx-build-${_version})

    find_program(SPHINX_EXECUTABLE
      NAMES ${_sphinx_NAMES}
      PATHS
        /usr/bin
        /usr/local/bin
        /opt/loca/bin
      DOC "Sphinx documentation generator")
  endforeach()
endif()

include(FindPackageHandleStandardArgs)

find_package_handle_standard_args(Sphinx DEFAULT_MSG SPHINX_EXECUTABLE)

option(SPHINX_HTML_OUTPUT "Build a single HTML with the whole content." ON)
option(SPHINX_DIRHTML_OUTPUT "Build HTML pages, but with a single directory per document." OFF)
option(SPHINX_HTMLHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in htmlhelp." OFF)
option(SPHINX_QTHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in qthelp." OFF)
option(SPHINX_DEVHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in devhelp." OFF)
option(SPHINX_EPUB_OUTPUT "Build HTML pages with additional information for building a documentation collection in epub." OFF)
option(SPHINX_LATEX_OUTPUT "Build LaTeX sources that can be compiled to a PDF document using pdflatex." OFF)
option(SPHINX_MAN_OUTPUT "Build manual pages in groff format for UNIX systems." OFF)
option(SPHINX_TEXT_OUTPUT "Build plain text files." OFF)

mark_as_advanced(
  SPHINX_EXECUTABLE
  SPHINX_HTML_OUTPUT
  SPHINX_DIRHTML_OUTPUT
  SPHINX_HTMLHELP_OUTPUT
  SPHINX_QTHELP_OUTPUT
  SPHINX_DEVHELP_OUTPUT
  SPHINX_EPUB_OUTPUT
  SPHINX_LATEX_OUTPUT
  SPHINX_MAN_OUTPUT
  SPHINX_TEXT_OUTPUT)

function(Sphinx_add_target target_name builder conf cache source destination)
  add_custom_target(${target_name} ALL
    COMMAND ${SPHINX_EXECUTABLE} -b ${builder} -d ${cache} -c ${conf} ${source} ${destination}
    COMMENT "Generating sphinx documentation: ${builder}"
    COMMAND cd ${destination} && ln -sf ./index_*.html index.html)

  set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${destination})
endfunction()

# Target dependencies can be optionally listed at the end.
function(Sphinx_add_targets target_base_name conf source base_destination)
  set(_dependencies)

  foreach(arg IN LISTS ARGN)
    set(_dependencies ${_dependencies} ${arg})
  endforeach()

  if(${SPHINX_HTML_OUTPUT})
    Sphinx_add_target(${target_base_name}_html html ${conf} ${source} ${base_destination}/html)
    add_dependencies(${target_base_name}_html ${_dependencies})
  endif()

  if(${SPHINX_DIRHTML_OUTPUT})
    Sphinx_add_target(${target_base_name}_dirhtml dirhtml ${conf} ${source} ${base_destination}/dirhtml)
    add_dependencies(${target_base_name}_dirhtml ${_dependencies})
  endif()

  if(${SPHINX_QTHELP_OUTPUT})
    Sphinx_add_target(${target_base_name}_qthelp qthelp ${conf} ${source} ${base_destination}/qthelp)
    add_dependencies(${target_base_name}_qthelp ${_dependencies})
  endif()

  if(${SPHINX_DEVHELP_OUTPUT})
    Sphinx_add_target(${target_base_name}_devhelp devhelp ${conf} ${source} ${base_destination}/devhelp)
    add_dependencies(${target_base_name}_devhelp ${_dependencies})
  endif()

  if(${SPHINX_EPUB_OUTPUT})
    Sphinx_add_target(${target_base_name}_epub epub ${conf} ${source} ${base_destination}/epub)
    add_dependencies(${target_base_name}_epub ${_dependencies})
  endif()

  if(${SPHINX_LATEX_OUTPUT})
    Sphinx_add_target(${target_base_name}_latex latex ${conf} ${source} ${base_destination}/latex)
    add_dependencies(${target_base_name}_latex ${_dependencies})
  endif()

  if(${SPHINX_MAN_OUTPUT})
    Sphinx_add_target(${target_base_name}_man man ${conf} ${source} ${base_destination}/man)
    add_dependencies(${target_base_name}_man ${_dependencies})
  endif()

  if(${SPHINX_TEXT_OUTPUT})
    Sphinx_add_target(${target_base_name}_text text ${conf} ${source} ${base_destination}/text)
    add_dependencies(${target_base_name}_text ${_dependencies})
  endif()

  if(${BUILD_TESTING})
    sphinx_add_target(${target_base_name}_linkcheck linkcheck ${conf} ${source} ${base_destination}/linkcheck)
    add_dependencies(${target_base_name}_linkcheck ${_dependencies})
  endif()
endfunction()
cmake/generic.cmake

@@ -388,6 +388,7 @@ function(cc_test TARGET_NAME)
     endif()
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     # No unit test should exceed 10 minutes.
     set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
@@ -460,6 +461,7 @@ function(nv_test TARGET_NAME)
     endif()
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
   endif()
 endfunction(nv_test)
@@ -708,9 +710,10 @@ function(py_test TARGET_NAME)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS ARGS ENVS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
              COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
              FLAGS_cpu_deterministic=true
+             FLAGS_limit_of_tmp_allocation=4294967296 # 4G
              PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
...
paddle/fluid/API.spec

@@ -122,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None,
 paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
 paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
 paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
-paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
+paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None))
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
@@ -213,6 +213,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act
 paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
+paddle.fluid.layers.shuffle_channel ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0))
@@ -359,6 +360,7 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
 paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
...
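The API.spec change above records a new is_accumulated flag on the public beam_search layer and a newly registered shuffle_channel layer. A minimal sketch of how the updated signatures would be called, assuming the surrounding program already defines the pre_ids, pre_scores, ids, scores, and conv_out variables (those names are illustrative, not from this commit):

    import paddle.fluid as fluid

    # is_accumulated=True (the new default) indicates that `scores` already
    # holds accumulated log-probabilities rather than per-step scores.
    selected_ids, selected_scores = fluid.layers.beam_search(
        pre_ids=pre_ids,
        pre_scores=pre_scores,
        ids=ids,
        scores=scores,
        beam_size=4,
        end_id=1,
        level=0,
        is_accumulated=True)

    # Per its ArgSpec, shuffle_channel takes a tensor and a group count.
    shuffled = fluid.layers.shuffle_channel(x=conv_out, group=2)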
paddle/fluid/framework/CMakeLists.txt

 #windows treat symbolic file as a real file, which is different with unix
 #We create a hidden file and compile it instead of origin source file.
 function(windows_symbolic TARGET)
@@ -129,12 +128,6 @@ cc_test(version_test SRCS version_test.cc DEPS version)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)

-if(WITH_NGRAPH)
-  cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
-  cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor profiler)
-endif(WITH_NGRAPH)
-
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
@@ -171,13 +164,12 @@ if(WITH_DISTRIBUTE)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
   if(WITH_NGRAPH)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
-  else(WITH_NGRAPH)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine)
+  else()
     cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
-  endif(WITH_NGRAPH)
+  endif()
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
@@ -214,3 +206,24 @@ endif (NOT WIN32)
 cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
 cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
+
+# Get the current working branch
+execute_process(
+  COMMAND git rev-parse --abbrev-ref HEAD
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_BRANCH
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+# Get the latest abbreviated commit hash of the working branch
+execute_process(
+  COMMAND git log -1 --format=%h
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_COMMIT
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+message(STATUS "commit: ${PADDLE_COMMIT}")
+message(STATUS "branch: ${PADDLE_BRANCH}")
+
+configure_file(commit.h.in commit.h)
paddle/fluid/framework/commit.h.in (new file, mode 0 → 100644)

#pragma once

#include <string>

namespace paddle {
namespace framework {

static std::string paddle_commit() { return "@PADDLE_COMMIT@"; }

static std::string paddle_compile_branch() { return "@PADDLE_BRANCH@"; }

static std::string paddle_version() { return "@PADDLE_VERSION@"; }

}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/details/build_strategy.h

@@ -91,7 +91,7 @@ struct BuildStrategy {
   int num_trainers_{1};
   int trainer_id_{0};
   std::vector<std::string> trainers_endpoints_;
-  bool remove_unnecessary_lock_{false};
+  bool remove_unnecessary_lock_{true};
   // NOTE:
   // Before you add new options, think if it's a general strategy that works
...
paddle/fluid/framework/details/execution_strategy.h

@@ -25,6 +25,9 @@ struct ExecutionStrategy {
   size_t num_threads_{0};
   bool use_cuda_{true};
   bool allow_op_delay_{false};
+  // If we set this to 1, we will delete all variables when finish a batch. and
+  // this will loss 15%+ performance.
+  // Please be aware about this parameters.
   size_t num_iteration_per_drop_scope_{1};
   ExecutorType type_{kDefault};
   bool dry_run_{false};
...
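For context, num_iteration_per_drop_scope is one of the fields surfaced to Python through fluid.ExecutionStrategy. A minimal sketch of how a user would raise it to avoid the 15%+ performance cost the new comment warns about; this usage is an assumption for illustration, not part of this commit:

    import paddle.fluid as fluid

    exec_strategy = fluid.ExecutionStrategy()
    # Keep local scopes alive for 100 iterations before cleanup instead of 1,
    # trading memory for throughput.
    exec_strategy.num_iteration_per_drop_scope = 100

    # The strategy is then handed to a ParallelExecutor (loss is assumed to
    # be defined by the surrounding program).
    pe = fluid.ParallelExecutor(use_cuda=True,
                                loss_name=loss.name,
                                exec_strategy=exec_strategy)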
paddle/fluid/framework/executor.cc

@@ -27,7 +27,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
 #ifdef PADDLE_WITH_NGRAPH
-#include "paddle/fluid/framework/ngraph_operator.h"
+#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
 #endif

 DECLARE_bool(benchmark);
@@ -133,24 +133,6 @@ static void DeleteUnusedTensors(
   }
 }

-static void EnableFusedOp(ExecutorPrepareContext* ctx) {
-#ifdef PADDLE_WITH_NGRAPH
-  VLOG(3) << "use_ngraph=True";
-  auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_);
-  for (auto& interval : intervals) {
-    auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_,
-                                     interval.at(0), interval.at(1));
-    *interval[0] = std::unique_ptr<OperatorBase>(ng_op);
-  }
-  for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
-    ctx->ops_.erase(it->at(0) + 1, it->at(1));
-  }
-#else
-  LOG(WARNING)
-      << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
-#endif
-}
-
 Executor::Executor(const platform::Place& place) : place_(place) {}

 void Executor::Close() {
@@ -204,6 +186,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                    bool create_local_scope, bool create_vars) {
   platform::RecordBlock b(block_id);
   if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
+#ifdef PADDLE_WITH_NGRAPH
+  if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc);
+#endif
   auto ctx = Prepare(pdesc, block_id);
   RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
 }
@@ -379,7 +364,6 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
   for (auto& op_desc : block.AllOps()) {
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
   }
-  if (FLAGS_use_ngraph) EnableFusedOp(ctx.get());
   return ctx;
 }
...
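With this change the executor consults FLAGS_use_ngraph at Run() time and delegates subgraph fusion to operators::NgraphEngine. Paddle's gflags can typically be fed from the environment at interpreter startup, so enabling the nGraph path from Python would look roughly like the sketch below; this assumes a build with WITH_NGRAPH and that the flag is read during Paddle's bootstrap, which is an assumption rather than something this commit shows:

    import os

    # Must be set before paddle.fluid is imported so the flag is picked up
    # during bootstrap; setting it afterwards has no effect.
    os.environ['FLAGS_use_ngraph'] = 'true'

    import paddle.fluid as fluid

    exe = fluid.Executor(fluid.CPUPlace())
    # exe.run(...) would now route supported subgraphs through the nGraph engine.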
paddle/fluid/framework/ir/graph_traits.cc

@@ -14,6 +14,7 @@
 #include "paddle/fluid/framework/ir/graph_traits.h"

+#include <set>
 #include <vector>

 namespace paddle {
@@ -79,7 +80,7 @@ NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
   }

   std::unordered_set<Node *> visited;
-  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
+  std::set<Node *> to_visit{source.begin(), source.end()};

   std::vector<Node *> inlink_visited;
   while (!to_visit.empty()) {
...
paddle/fluid/framework/lod_tensor.cc

@@ -54,13 +54,14 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
 std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
   if (!platform::is_cpu_place(t.place())) {
-    LoDTensor tt;
-    framework::TensorCopy(t, platform::CPUPlace(), &tt);
+    LoDTensor cpu_tensor;
+    cpu_tensor.set_lod(t.lod());
+    framework::TensorCopy(t, platform::CPUPlace(), &cpu_tensor);
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(t.place());
     dev_ctx.Wait();

-    os << tt;
+    os << cpu_tensor;
     return os;
   }
...
paddle/fluid/framework/mixed_vector.h

 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
...
paddle/fluid/framework/ngraph_operator.h (deleted, mode 100644 → 0; parent 1edc0423)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/variant.h"

#include "ngraph/type/element_type.hpp"

namespace paddle {
namespace framework {

class NgraphOperator : public OperatorBase {
 public:
  static std::vector<
      std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
  NgraphOpIntervals(
      std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);

  explicit NgraphOperator(
      const ProgramDesc& prog, size_t block_id,
      std::vector<std::unique_ptr<OperatorBase>>::iterator start,
      std::vector<std::unique_ptr<OperatorBase>>::iterator end,
      const std::string& type = "fused_op", const VariableNameMap& inputs = {},
      const VariableNameMap& outputs = {}, const AttributeMap& attrs = {});

  void RunImpl(const Scope& scope, const platform::Place& place) const final;

 private:
  const ProgramDesc pdesc_;
  size_t block_;
  std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
  std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
  std::unordered_set<std::string> persistables_;
  std::unordered_set<std::string> fetches_;
  std::unordered_set<std::string> post_op_inputs_;
  bool is_full_ = false;

  void Process();
};

}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/operator.cc

@@ -19,8 +19,6 @@ limitations under the License. */
 #include <sstream>
 #include <string>
 #include <vector>
-#include "gflags/gflags.h"
-#include "glog/logging.h"
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -1075,7 +1073,9 @@ Scope* OperatorWithKernel::PrepareData(
 proto::VarType::Type OperatorWithKernel::IndicateDataType(
     const ExecutionContext& ctx) const {
-  int data_type = -1;
+  proto::VarType::Type dafault_data_type =
+      static_cast<proto::VarType::Type>(-1);
+  proto::VarType::Type data_type = dafault_data_type;
   for (auto& input : this->inputs_) {
     const std::vector<const Variable*> vars = ctx.MultiInputVar(input.first);
     for (size_t i = 0; i < vars.size(); ++i) {
@@ -1092,18 +1092,19 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
       if (t != nullptr) {
         PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized",
                        input.first, i);
-        int tmp = static_cast<int>(t->type());
+        proto::VarType::Type tmp = t->type();
         PADDLE_ENFORCE(
-            tmp == data_type || data_type == -1,
+            tmp == data_type || data_type == dafault_data_type,
             "DataType of Paddle Op %s must be the same. Get (%d) != (%d)",
-            Type(), data_type, tmp);
+            Type(), DataTypeToString(data_type), DataTypeToString(tmp));
         data_type = tmp;
       }
     }
   }
-  PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
-  return static_cast<proto::VarType::Type>(data_type);
+  PADDLE_ENFORCE(data_type != dafault_data_type,
+                 "DataType should be indicated by input");
+  return data_type;
 }

 OpKernelType OperatorWithKernel::GetExpectedKernelType(
...
paddle/fluid/framework/tensor_impl.h

@@ -25,7 +25,8 @@ inline const T* Tensor::data() const {
   check_memory_size();
   bool valid =
       std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType;
-  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_);
+  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d",
+                 DataTypeToString(type_));

   return reinterpret_cast<const T*>(
       reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
...
paddle/fluid/imperative/CMakeLists.txt

 if(WITH_PYTHON)
-  cc_library(layer SRCS layer.cc DEPS proto_desc operator)
-  cc_library(tracer SRCS tracer.cc DEPS proto_desc)
+  cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas)
+  cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context)
   cc_library(engine SRCS engine.cc)
 endif()
paddle/fluid/imperative/layer.cc
浏览文件 @
c7e38680
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/layer.h"
#include <deque>
#include <deque>
#include <limits>
#include <limits>
#include <map>
#include <map>
...
@@ -22,6 +23,9 @@
...
@@ -22,6 +23,9 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/printf.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -34,22 +38,66 @@ std::map<int, py::object> py_funcs_;
...
@@ -34,22 +38,66 @@ std::map<int, py::object> py_funcs_;
using
framework
::
Variable
;
using
framework
::
Variable
;
void
AddTo
(
Variable
*
src
,
Variable
*
dst
)
{
namespace
detail
{
framework
::
LoDTensor
*
dst_tensor
=
dst
->
GetMutable
<
framework
::
LoDTensor
>
();
framework
::
LoDTensor
*
src_tensor
=
src
->
GetMutable
<
framework
::
LoDTensor
>
();
template
<
typename
T
>
class
TensorAddToFunctor
:
public
boost
::
static_visitor
<>
{
public:
TensorAddToFunctor
(
int64_t
numel
,
const
T
*
x
,
T
*
y
)
:
numel_
(
numel
),
x_
(
x
),
y_
(
y
)
{}
void
operator
()(
const
platform
::
CPUPlace
&
place
)
{
platform
::
CPUDeviceContext
*
ctx
=
dynamic_cast
<
platform
::
CPUDeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
auto
blas
=
operators
::
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
*
ctx
);
blas
.
AXPY
(
numel_
,
1.
,
x_
,
y_
);
}
#ifdef PADDLE_WITH_CUDA
void
operator
()(
const
platform
::
CUDAPlace
&
place
)
{
platform
::
CUDADeviceContext
*
ctx
=
dynamic_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
auto
blas
=
operators
::
math
::
GetBlas
<
platform
::
CUDADeviceContext
,
T
>
(
*
ctx
);
blas
.
AXPY
(
numel_
,
1.
,
x_
,
y_
);
}
#else
void
operator
()(
const
platform
::
CUDAPlace
&
place
)
{
PADDLE_THROW
(
"Do NOT support gradient merge in place %s"
,
place
);
}
#endif
// there is NO blas in CUDAPinnedPlace
void
operator
()(
const
platform
::
CUDAPinnedPlace
&
place
)
{
PADDLE_THROW
(
"Do NOT support gradient merge in place %s"
,
place
);
}
private:
int64_t
numel_
;
const
T
*
x_
;
T
*
y_
;
};
}
// namespace detail
void
AddTo
(
Variable
*
src
,
Variable
*
dst
,
platform
::
Place
place
)
{
framework
::
Tensor
*
dst_tensor
=
dst
->
GetMutable
<
framework
::
LoDTensor
>
();
framework
::
Tensor
*
src_tensor
=
src
->
GetMutable
<
framework
::
LoDTensor
>
();
// FIXME(minqiyang): loss_grad op will pass a zero grad of label
// FIXME(minqiyang): loss_grad op will pass a zero grad of label
// ugly fix for it
// ugly fix for it
if
(
src_tensor
->
numel
()
==
0
)
{
if
(
src_tensor
->
numel
()
==
0
)
{
return
;
return
;
}
}
PADDLE_ENFORCE
(
dst_tensor
->
numel
()
==
src_tensor
->
numel
(),
PADDLE_ENFORCE
(
dst_tensor
->
numel
()
==
src_tensor
->
numel
(),
"dst_numel %lld vs. src_numel %lld"
,
dst_tensor
->
numel
(),
"dst_numel %lld vs. src_numel %lld"
,
dst_tensor
->
numel
(),
src_tensor
->
numel
());
src_tensor
->
numel
());
float
*
dst_data
=
dst_tensor
->
mutable_data
<
float
>
(
platform
::
CPUPlace
());
const
float
*
src_data
=
src_tensor
->
data
<
float
>
();
detail
::
TensorAddToFunctor
<
float
>
func
(
for
(
int64_t
i
=
0
;
i
<
src_tensor
->
numel
();
++
i
)
{
src_tensor
->
numel
(),
src_tensor
->
data
<
float
>
(),
dst_data
[
i
]
+=
src_data
[
i
]
;
dst_tensor
->
mutable_data
<
float
>
(
place
))
;
}
boost
::
apply_visitor
(
func
,
place
);
}
}
class
Autograd
{
class
Autograd
{
...
@@ -120,66 +168,104 @@ class Autograd {
...
@@ -120,66 +168,104 @@ class Autograd {
}
}
};
};
std
::
unique_ptr
<
VarBase
>
VarBase
::
NewVarBase
(
const
platform
::
Place
&
dst_place
,
const
bool
blocking
)
const
{
PADDLE_ENFORCE
(
var_
->
IsInitialized
(),
"Variable must be initialized when getting numpy tensor"
);
std
::
unique_ptr
<
VarBase
>
new_var
(
new
VarBase
());
framework
::
LoDTensor
*
tensor
=
new_var
->
var_
->
GetMutable
<
framework
::
LoDTensor
>
();
tensor
->
Resize
(
var_
->
Get
<
framework
::
LoDTensor
>
().
dims
());
tensor
->
set_lod
(
var_
->
Get
<
framework
::
LoDTensor
>
().
lod
());
if
(
blocking
)
{
platform
::
DeviceContext
*
dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
dst_place
);
framework
::
TensorCopySync
(
var_
->
Get
<
framework
::
LoDTensor
>
(),
dst_place
,
tensor
);
dev_ctx
->
Wait
();
}
else
{
framework
::
TensorCopy
(
var_
->
Get
<
framework
::
LoDTensor
>
(),
dst_place
,
tensor
);
}
if
(
platform
::
is_gpu_place
(
dst_place
))
{
VLOG
(
3
)
<<
"copy tensor "
<<
var_desc_
->
Name
()
<<
" from gpu"
;
}
return
new_var
;
}
framework
::
LoDTensor
&
VarBase
::
GradValue
()
{
framework
::
LoDTensor
&
VarBase
::
GradValue
()
{
VLOG
(
3
)
<<
"get var grad "
<<
var_desc_
->
Name
();
VLOG
(
3
)
<<
"get var grad "
<<
var_desc_
->
Name
();
return
*
(
grads_
->
var_
->
GetMutable
<
framework
::
LoDTensor
>
());
return
*
(
grads_
->
var_
->
GetMutable
<
framework
::
LoDTensor
>
());
}
}
 std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
-  if (!grad_op_desc_ && backward_id_ <= 0) {
+  if (grad_op_descs_.empty() && backward_id_ <= 0) {
     LOG(WARNING) << "op with no grad: " << op_desc_->Type();
     return {};
   }

-  std::map<std::string, std::vector<framework::Variable*>> grad_outputs;
+  std::vector<framework::VariableValueMap> grad_outputs;
   if (backward_id_ > 0) {
     VLOG(3) << "py_layer_grad";
-    grad_outputs[framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad(
-        backward_id_,
-        grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]);
+    grad_outputs.resize(1);
+    grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] =
+        PyLayer::ApplyGrad(
+            backward_id_,
+            grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]);
   } else {
-    VLOG(3) << "op grad " << grad_op_desc_->Type();
-    for (auto it : grad_output_vars_) {
-      auto& outputs = grad_outputs[it.first];
-      for (size_t i = 0; i < it.second.size(); ++i) {
-        // Allocate a new variable
-        Variable* tmp_var = new framework::Variable();
-        tmp_var->GetMutable<framework::LoDTensor>();
-        outputs.push_back(tmp_var);
+    grad_outputs.resize(grad_op_descs_.size());
+    for (size_t k = 0; k < grad_op_descs_.size(); ++k) {
+      framework::OpDesc* grad_op_desc = grad_op_descs_[k];
+      VLOG(3) << "op grad " << grad_op_desc->Type();
+      for (auto it : grad_output_vars_[k]) {
+        auto& outputs = grad_outputs[k][it.first];
+        for (size_t i = 0; i < it.second.size(); ++i) {
+          // Allocate a new variable
+          Variable* tmp_var = new framework::Variable();
+          tmp_var->GetMutable<framework::LoDTensor>();
+          outputs.push_back(tmp_var);
+        }
       }
-    }

-    framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
+      framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]);

-    // No need to do compile time infer shape here.
-    // grad_op_desc_->InferShape(*block_);
-    grad_op_desc_->InferVarType(block_);
+      // No need to do compile time infer shape here.
+      // grad_op_desc_->InferShape(*block_);
+      grad_op_desc->InferVarType(block_);

-    std::unique_ptr<framework::OperatorBase> opbase =
-        framework::OpRegistry::CreateOp(*grad_op_desc_);
-    framework::OperatorWithKernel* op_kernel =
-        dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
-    PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
+      std::unique_ptr<framework::OperatorBase> opbase =
+          framework::OpRegistry::CreateOp(*grad_op_desc);
+      framework::OperatorWithKernel* op_kernel =
+          dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
+      PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");

-    framework::Scope scope;
-    platform::CPUPlace place;
-    PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
-    p.op.RuntimeInferShape(scope, place, ctx);
-    p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+      framework::Scope scope;
+      PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
+      p.op.RuntimeInferShape(scope, place_, ctx);
+      p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+    }
   }

-  for (auto it : grad_output_vars_) {
-    auto& outputs = grad_outputs[it.first];
-    auto& origin_outputs = it.second;
-    PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
+  for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
+    for (auto it : grad_output_vars_[k]) {
+      auto& outputs = grad_outputs[k][it.first];
+      auto& origin_outputs = it.second;
+      PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());

-    for (size_t i = 0; i < outputs.size(); ++i) {
-      framework::Variable* grad = outputs[i];
-      framework::Variable* orig_grad = origin_outputs[i];
-      AddTo(grad, orig_grad);
-      delete grad;
+      for (size_t i = 0; i < outputs.size(); ++i) {
+        framework::Variable* grad = outputs[i];
+        framework::Variable* orig_grad = origin_outputs[i];
+        AddTo(grad, orig_grad, place_);
+        delete grad;
+      }
     }
   }

   return input_vars_;
 }
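Note on the change above: the tail of ApplyGrad accumulates every temporary per-op gradient into the variable's long-lived gradient buffer and then frees the temporary. A standalone C++ sketch of that accumulate-then-free contract (AddTo here is a stand-in, not Paddle's implementation, and plain float vectors stand in for LoDTensors):

    #include <cassert>
    #include <vector>

    // Stand-in for Paddle's AddTo: element-wise accumulate grad into orig_grad.
    void AddTo(const std::vector<float>& grad, std::vector<float>* orig_grad) {
      assert(grad.size() == orig_grad->size());
      for (size_t i = 0; i < grad.size(); ++i) (*orig_grad)[i] += grad[i];
    }

    int main() {
      // Temporary per-op gradient, allocated like tmp_var above.
      auto* grad = new std::vector<float>{1.f, 2.f, 3.f};
      std::vector<float> orig_grad(3, 0.5f);  // the variable's persistent grad
      AddTo(*grad, &orig_grad);
      delete grad;  // mirrors `delete grad;` once accumulation is done
      return 0;
    }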
 @@ -188,8 +274,10 @@ void VarBase::RunBackward() {
   VLOG(3) << "start backward";
   auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
-  float* data = grads_t->mutable_data<float>(platform::CPUPlace());
-  std::fill(data, data + grads_t->numel(), 1.0);
+  operators::math::set_constant(
+      *(platform::DeviceContextPool::Instance().Get(
+          var_->GetMutable<framework::LoDTensor>()->place())),
+      grads_t, 1.0);

   PADDLE_ENFORCE(grads_ ==
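The RunBackward change replaces a CPU-only fill with set_constant, so the seed gradient is written on the tensor's own device. The seed itself is just dLoss/dLoss = 1 for every element; a standalone sketch of the CPU analogue being replaced:

    #include <algorithm>
    #include <vector>

    int main() {
      // The gradient of the root variable w.r.t. itself is all ones.
      // std::fill only works for host memory; set_constant writes the same
      // ones on whatever place the tensor lives on.
      std::vector<float> root_grad(12);
      std::fill(root_grad.begin(), root_grad.end(), 1.0f);
      return 0;
    }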
paddle/fluid/imperative/layer.h
 @@ -21,17 +21,21 @@
 #include <map>     // NOLINT
 #include <string>  // NOLINT
 #include <vector>  // NOLINT
+#include <memory>  // NOLINT

 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/device_context.h"

 #include "paddle/fluid/imperative/type_defs.h"

 namespace paddle {
 namespace imperative {

+class VarBase;
+
 namespace py = ::pybind11;

 class PreparedOp {

 @@ -81,6 +85,8 @@ class PreparedOp {
     return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
   }

+  inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; }
+
   const framework::OperatorBase& op;
   const framework::RuntimeContext& ctx;
   framework::OperatorWithKernel::OpKernelFunc func;

 @@ -148,6 +154,9 @@ class VarBase {
   framework::LoDTensor& GradValue();

+  std::unique_ptr<VarBase> NewVarBase(const platform::Place& dst_place,
+                                      const bool blocking) const;
+
   inline std::string GradName() const {
     PADDLE_ENFORCE(var_desc_,

 @@ -175,11 +184,13 @@ class OpBase {
   OpBase()
       : op_desc_(nullptr),
         forward_id_(-1),
-        grad_op_desc_(nullptr),
-        backward_id_(-1) {}
+        backward_id_(-1),
+        place_(platform::CPUPlace()) {}

   virtual ~OpBase() {
-    if (grad_op_desc_) delete grad_op_desc_;
+    for (framework::OpDesc* desc : grad_op_descs_) {
+      delete desc;
+    }
   }

   std::map<std::string, std::vector<VarBase*>> ApplyGrad();

 @@ -188,18 +199,25 @@ class OpBase {
   // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_.
   framework::OpDesc* op_desc_;
   int forward_id_;

-  // When there is a backward, one of `grad_op_desc_` or `backward_id_` is set,
+  // When there is a backward, one of `grad_op_descs_` or `backward_id_` is set,
   // not both.
-  framework::OpDesc* grad_op_desc_;
+  // Note: each fwd op corresponds to a vector of bwd ops.
+  std::vector<framework::OpDesc*> grad_op_descs_;
   int backward_id_;

+  platform::Place place_;
+
   VarBasePtrMap input_vars_;
   VarBasePtrMap output_vars_;
   OpBasePtrMap pre_ops_;
   std::map<std::string, std::vector<int>> pre_ops_out_idx_;

-  framework::VariableValueMap grad_input_vars_;
-  framework::VariableValueMap grad_output_vars_;
+  // Inputs to a vector of bwd ops.
+  std::vector<framework::VariableValueMap> grad_input_vars_;
+  // Outputs to a vector of bwd ops.
+  std::vector<framework::VariableValueMap> grad_output_vars_;

   framework::BlockDesc* block_;
 };
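Since one forward op can now expand into several backward ops, the members above become parallel vectors indexed by backward-op position. A minimal sketch of that shape (std::string and int are illustrative stand-ins for OpDesc* and Variable*):

    #include <map>
    #include <string>
    #include <vector>

    struct OpBaseSketch {
      // The k-th entry of each vector describes the k-th backward op.
      std::vector<std::string> grad_op_descs_;
      std::vector<std::map<std::string, std::vector<int>>> grad_input_vars_;
      std::vector<std::map<std::string, std::vector<int>>> grad_output_vars_;
    };

    int main() {
      OpBaseSketch op;
      op.grad_op_descs_ = {"mul_grad", "elementwise_add_grad"};
      op.grad_input_vars_.resize(op.grad_op_descs_.size());
      op.grad_output_vars_.resize(op.grad_op_descs_.size());
      return 0;
    }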
paddle/fluid/imperative/tracer.cc
 @@ -14,33 +14,60 @@
 #include "paddle/fluid/imperative/tracer.h"

+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
 namespace paddle {
 namespace imperative {

 void CreateGradOp(const framework::OpDesc& op_desc,
                   const std::unordered_set<std::string>& no_grad_set,
                   const std::vector<framework::BlockDesc*>& grad_sub_block,
-                  framework::OpDesc** grad_op_desc,
+                  std::vector<framework::OpDesc*>* grad_op_descs,
                   std::unordered_map<std::string, std::string>* grad_to_var) {
-  std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
+  PADDLE_ENFORCE(grad_op_descs->empty());
+  std::vector<std::unique_ptr<framework::OpDesc>> descs =
       framework::OpInfoMap::Instance()
           .Get(op_desc.Type())
           .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
-  PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now.");
-  // TODO(panyx0718): Leak?
-  *grad_op_desc = grad_op_descs[0].release();
+  for (auto& desc : descs) {
+    grad_op_descs->emplace_back(desc.release());
+  }
 }

-void InitVar(framework::Variable* var, framework::Variable* grad_var) {
+void InitVar(framework::Variable* var, framework::Variable* grad_var,
+             platform::DeviceContext* dev_ctx) {
+  PADDLE_ENFORCE_NOT_NULL(dev_ctx,
+                          "Could not get valid device from forward op");
   auto& var_t = var->Get<framework::LoDTensor>();
-  float* data =
-      grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
-          var_t.dims(), platform::CPUPlace());
-  std::fill(data, data + var_t.numel(), 0.0);
+  grad_var->GetMutable<framework::LoDTensor>()->mutable_data<float>(
+      var_t.dims(), dev_ctx->GetPlace());
+  operators::math::set_constant(
+      *dev_ctx, grad_var->GetMutable<framework::LoDTensor>(), 0.0);
 }

+platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) {
+  platform::Place result = place;
+  for (auto it : inputs) {
+    for (VarBase* var : it.second) {
+      platform::Place tmp_place =
+          var->var_->Get<framework::LoDTensor>().place();
+      if (!platform::is_same_place(tmp_place, result)) {
+        PADDLE_THROW(
+            "Input variable should keep in the same place: %s, but get place: "
+            "%s of input %s instead",
+            result, tmp_place, it.first);
+      }
+    }
+  }
+
+  return result;
+}
+
 void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
                    const VarBasePtrMap& outputs, framework::BlockDesc* block,
+                   const platform::Place expected_place,
                    const bool stop_gradient) {
   std::map<std::string, VarBase*> vars;
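GetExpectedPlace above enforces that every traced input already lives on the expected device before a kernel is prepared. A standalone sketch of the same contract, with string tags standing in for platform::Place:

    #include <map>
    #include <stdexcept>
    #include <string>
    #include <vector>

    std::string GetExpectedPlace(
        const std::string& place,
        const std::map<std::string, std::vector<std::string>>& input_places) {
      for (const auto& it : input_places) {
        for (const auto& p : it.second) {
          if (p != place) {  // mirrors !platform::is_same_place(tmp, result)
            throw std::runtime_error("input " + it.first + " is on " + p +
                                     ", expected " + place);
          }
        }
      }
      return place;
    }

    int main() {
      GetExpectedPlace("CPUPlace", {{"X", {"CPUPlace"}}, {"Y", {"CPUPlace"}}});
      return 0;
    }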
 @@ -105,51 +132,59 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
   PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");

   framework::Scope scope;
-  platform::CPUPlace place;
-  PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
-  p.op.RuntimeInferShape(scope, place, ctx);
-  p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+  op->place_ = GetExpectedPlace(expected_place, inputs);
+  PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
+  prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
+  prepared_op.func(framework::ExecutionContext(
+      prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx));

   if (!stop_gradient) {
-    framework::OpDesc* grad_op_desc;
-    // TODO(panyx): Is this leaked?
     std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
         new std::unordered_map<std::string, std::string>());
-    CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get());
-    op->grad_op_desc_ = grad_op_desc;
+    CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get());

-    for (auto it : grad_op_desc->Inputs()) {
-      auto& grad_in_vars = op->grad_input_vars_[it.first];
-      for (const std::string& grad_invar : it.second) {
-        block->FindRecursiveOrCreateVar(grad_invar);
-        auto var_it = grad_to_var->find(grad_invar);
-        if (var_it == grad_to_var->end()) {
-          auto fwd_var_it = vars.find(grad_invar);
-          PADDLE_ENFORCE(fwd_var_it != vars.end());
-          // Forward inputs or outputs.
-          grad_in_vars.push_back(fwd_var_it->second->var_);
-        } else {
-          VarBase* var = vars[var_it->second];
-          if (!var->grads_->var_->IsInitialized()) {
-            InitVar(var->var_, var->grads_->var_);
-          }
-          // Douts.
-          grad_in_vars.push_back(var->grads_->var_);
-        }
-      }
-    }
+    op->grad_input_vars_.resize(op->grad_op_descs_.size());
+    op->grad_output_vars_.resize(op->grad_op_descs_.size());
+    for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) {
+      framework::OpDesc* grad_op_desc = op->grad_op_descs_[i];
+      for (auto it : grad_op_desc->Inputs()) {
+        auto& grad_in_vars = op->grad_input_vars_[i][it.first];
+        for (const std::string& grad_invar : it.second) {
+          block->FindRecursiveOrCreateVar(grad_invar);
+          auto var_it = grad_to_var->find(grad_invar);
+          if (var_it == grad_to_var->end()) {
+            auto fwd_var_it = vars.find(grad_invar);
+            PADDLE_ENFORCE(fwd_var_it != vars.end());
+            // Forward inputs or outputs.
+            grad_in_vars.push_back(fwd_var_it->second->var_);
+          } else {
+            VarBase* var = vars[var_it->second];
+            if (!var->grads_->var_->IsInitialized()) {
+              InitVar(var->var_, var->grads_->var_,
+                      prepared_op.GetDeviceContext());
+            }
+            // Douts.
+            grad_in_vars.push_back(var->grads_->var_);
+          }
+        }
+      }

-    for (auto it : grad_op_desc->Outputs()) {
-      auto& grad_out_vars = op->grad_output_vars_[it.first];
-      for (const std::string& grad_outvar : it.second) {
-        block->FindRecursiveOrCreateVar(grad_outvar);
-        auto var_it = grad_to_var->find(grad_outvar);
-        PADDLE_ENFORCE(var_it != grad_to_var->end());
-        VarBase* var = vars[var_it->second];
-        if (!var->grads_->var_->IsInitialized()) {
-          InitVar(var->var_, var->grads_->var_);
-        }
-        grad_out_vars.push_back(var->grads_->var_);
+      for (auto it : grad_op_desc->Outputs()) {
+        auto& grad_out_vars = op->grad_output_vars_[i][it.first];
+        for (const std::string& grad_outvar : it.second) {
+          block->FindRecursiveOrCreateVar(grad_outvar);
+          auto var_it = grad_to_var->find(grad_outvar);
+          PADDLE_ENFORCE(var_it != grad_to_var->end(),
+                         "Could not found the grad op output var, should this "
+                         "operator %s's stop gradient be True",
+                         op_desc->Type());
+          VarBase* var = vars[var_it->second];
+          if (!var->grads_->var_->IsInitialized()) {
+            InitVar(var->var_, var->grads_->var_,
+                    prepared_op.GetDeviceContext());
+          }
+          grad_out_vars.push_back(var->grads_->var_);
+        }
       }
     }
   }
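The new wiring resizes grad_input_vars_ and grad_output_vars_ to grad_op_descs_.size() up front, so index i always addresses the i-th backward op's variable maps. A compact standalone sketch of that resize-then-index pattern:

    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> grad_op_descs = {"mul_grad", "add_grad"};
      std::vector<std::vector<std::string>> grad_input_vars;
      grad_input_vars.resize(grad_op_descs.size());  // one slot per bwd op
      for (size_t i = 0; i < grad_op_descs.size(); ++i) {
        grad_input_vars[i].push_back(grad_op_descs[i] + "@GRAD");
      }
      return 0;
    }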
 @@ -178,10 +213,12 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
     out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient);
   }
   if (!stop_gradient) {
+    op->grad_input_vars_.resize(1);
+    op->grad_output_vars_.resize(1);
     auto& grad_input_vars =
-        op->grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)];
+        op->grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)];
     auto& grad_output_vars =
-        op->grad_output_vars_[framework::GradVarName(PyLayer::kFwdOut)];
+        op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)];

     for (const VarBase* inp : inputs) {
       grad_input_vars.push_back(inp->var_);

 @@ -189,16 +226,23 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
     for (VarBase* out : outputs) {
       grad_input_vars.push_back(out->var_);
     }

+    platform::CPUPlace place;
     for (VarBase* out : outputs) {
       grad_input_vars.push_back(out->grads_->var_);
       if (!grad_input_vars.back()->IsInitialized()) {
-        InitVar(out->var_, grad_input_vars.back());
+        // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
+        InitVar(out->var_, grad_input_vars.back(),
+                platform::DeviceContextPool::Instance().Get(place));
       }
     }

     for (const VarBase* inp : inputs) {
       grad_output_vars.push_back(inp->grads_->var_);
       if (!grad_output_vars.back()->IsInitialized()) {
-        InitVar(inp->var_, grad_output_vars.back());
+        // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
+        InitVar(inp->var_, grad_output_vars.back(),
+                platform::DeviceContextPool::Instance().Get(place));
       }
     }
paddle/fluid/imperative/tracer.h
 @@ -22,6 +22,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/imperative/engine.h"
 #include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/platform/place.h"

 namespace paddle {
 namespace imperative {

 @@ -34,21 +35,25 @@ void CreateGradOp(const framework::OpDesc& op_desc,
 void InitVar(framework::Variable* var, framework::Variable* grad_var);

+platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs);
+
 class Tracer {
  public:
   explicit Tracer(framework::BlockDesc* root_block)
       : root_block_(root_block) {}

   virtual ~Tracer() {}

-  void Trace(OpBase* op,
-             const std::map<std::string, std::vector<VarBase*>>& inputs,
-             const std::map<std::string, std::vector<VarBase*>>& outputs,
-             framework::BlockDesc* block,
-             const bool stop_gradient = false);
+  void Trace(OpBase* op, const VarBasePtrMap& inputs,
+             const VarBasePtrMap& outputs, framework::BlockDesc* block,
+             const platform::Place expected_place,
+             const bool stop_gradient = false);

   std::vector<VarBase*> PyTrace(OpBase* op, const std::vector<VarBase*>& inputs,
                                 bool stop_gradient = false);

  private:
+  platform::Place GetPlace(const VarBasePtrMap& inputs);
+
   framework::BlockDesc* root_block_;
 };
paddle/fluid/inference/analysis/argument.h
 @@ -28,6 +28,7 @@
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/platform/variant.h"

 namespace paddle {

 @@ -130,10 +131,14 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
+  DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
+                      contrib::AnalysisConfig::Precision);

   // Memory optimization related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
-  DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool);
+  DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool);
+  DECL_ARGUMENT_FIELD(static_memory_optim_force_update,
+                      StaticMemoryOptimForceUpdate, bool);
   // Indicate which kind of sort algorithm is used for operators; the memory
   // optimization relies on the sort algorithm.
   DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
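For readers unfamiliar with the macro: DECL_ARGUMENT_FIELD(field, Field, type) declares a typed slot on Argument together with accessors. The exact expansion lives elsewhere in argument.h; the following hand-written sketch is only an assumption about its general shape, not the real macro:

    // Hypothetical expansion of DECL_ARGUMENT_FIELD(enable_memory_optim,
    // EnableMemoryOptim, bool) -- illustrative only.
    struct ArgumentSketch {
      bool enable_memory_optim_;
      bool enable_memory_optim_valid_ = false;

      bool& enable_memory_optim() { return enable_memory_optim_; }
      void SetEnableMemoryOptim(bool v) {
        enable_memory_optim_ = v;
        enable_memory_optim_valid_ = true;
      }
      bool enable_memory_optim_valid() const {
        return enable_memory_optim_valid_;
      }
    };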
paddle/fluid/inference/analysis/helper.cc
 @@ -36,6 +36,14 @@ void SetAttr<int>(framework::proto::OpDesc *op, const std::string &name,
   attr->set_i(data);
 }

 template <>
+void SetAttr<bool>(framework::proto::OpDesc *op, const std::string &name,
+                   const bool &data) {
+  auto *attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::BOOLEAN);
+  attr->set_b(data);
+}
+
+template <>
 void SetAttr<int64_t>(framework::proto::OpDesc *op, const std::string &name,
                       const int64_t &data) {
   auto *attr = op->add_attrs();
paddle/fluid/inference/analysis/helper.h
 @@ -17,6 +17,7 @@ limitations under the License. */
 #include <sys/stat.h>
 #include <cstdio>
 #include <fstream>
+#include <set>
 #include <string>
 #include <typeindex>
 #include <unordered_map>

 @@ -29,9 +30,14 @@ limitations under the License. */
 #include "paddle/fluid/platform/port.h"

 #ifdef _WIN32
+#include <direct.h>
+#include <io.h>
 #define GCC_ATTRIBUTE(attr__) ;
+#define MKDIR(path) _mkdir(path)
 #else
+#include <unistd.h>
 #define GCC_ATTRIBUTE(attr__) __attribute__((attr__));
+#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)
 #endif
 #define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result)

 @@ -163,6 +169,54 @@ static bool PathExists(const std::string &path) {
   return false;
 }

+static std::string GetDirRoot(const std::string &path) {
+  char sep = '/';
+
+#ifdef _WIN32
+  sep = '\\';
+#endif
+
+  size_t i = path.rfind(sep, path.length());
+  if (i != std::string::npos) {
+    return (path.substr(0, i));
+  }
+  return path;
+}
+
+static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) {
+  std::string opt_cache_dir = model_root + "/_opt_cache/";
+  if (!PathExists(opt_cache_dir)) {
+    PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1,
+                   "Can not create optimize cache directory: %s, Make sure you "
+                   "have permission to write",
+                   opt_cache_dir);
+  }
+  return opt_cache_dir;
+}
+
+static std::string GetTrtCalibPath(const std::string &model_root,
+                                   const std::string &engine_key) {
+  return model_root + "/trt_calib_" + engine_key;
+}
+
+// If there is no calib table data file in model_opt_cache_dir, return "".
+static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
+                                        const std::string &engine_key,
+                                        bool enable_int8) {
+  std::string trt_calib_table_path =
+      GetTrtCalibPath(model_opt_cache_dir, engine_key);
+  if (enable_int8 && FileExists(trt_calib_table_path)) {
+    VLOG(3) << "Calibration table file: " << trt_calib_table_path
+            << " is found here";
+    std::ifstream infile(trt_calib_table_path, std::ios::in);
+    std::stringstream buffer;
+    buffer << infile.rdbuf();
+    std::string calibration_data(buffer.str());
+    return calibration_data;
+  }
+  return "";
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
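The two new path helpers are small enough to exercise in isolation. A self-contained version of the GetDirRoot logic plus the slurp-the-file read used by GetTrtCalibTableData (the file name below is a placeholder, not a real cache entry):

    #include <fstream>
    #include <iostream>
    #include <sstream>
    #include <string>

    // Same rfind logic as GetDirRoot: strip the last path component.
    std::string DirRoot(const std::string& path, char sep = '/') {
      size_t i = path.rfind(sep, path.length());
      return i == std::string::npos ? path : path.substr(0, i);
    }

    // Same read pattern as GetTrtCalibTableData: whole file, or "" when absent.
    std::string ReadFileOrEmpty(const std::string& path) {
      std::ifstream infile(path, std::ios::in);
      if (!infile) return "";
      std::stringstream buffer;
      buffer << infile.rdbuf();
      return buffer.str();
    }

    int main() {
      std::cout << DirRoot("/models/mobilenet/__model__") << "\n";
      std::cout << ReadFileOrEmpty("/tmp/trt_calib_demo").size() << "\n";
      return 0;
    }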
paddle/fluid/inference/analysis/ir_pass_manager.cc
 @@ -67,6 +67,20 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
       pass->Set("min_subgraph_size",
                 new int(argument->tensorrt_min_subgraph_size()));
+      pass->Set("program",
+                new framework::ProgramDesc *(&argument->main_program()));
+
+      bool enable_int8 = argument->tensorrt_precision_mode() ==
+                         contrib::AnalysisConfig::Precision::kInt8;
+
+      pass->Set("enable_int8", new bool(enable_int8));
+      std::string model_opt_cache_dir =
+          argument->Has("model_dir")
+              ? argument->model_dir()
+              : GetDirRoot(argument->model_program_path());
+      pass->Set(
+          "model_opt_cache_dir",
+          new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
     }

     // graph_ = pass->Apply(std::move(graph_));

 @@ -91,11 +105,14 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
 }

 framework::proto::ProgramDesc IRPassManager::AcquireProgram(
-    std::unique_ptr<Graph> *graph, const ProgramDesc &program) const {
+    std::unique_ptr<Graph> *graph, ProgramDesc *program) const {
   auto pass =
       framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");

-  ProgramDesc desc(program);
+  // Directly using ProgramDesc desc(argument->main_program()) may cause
+  // incomplete copies of information.
+  ProgramDesc desc;
+  desc.CopyFrom(*program->Proto());
   pass->SetNotOwned("program", &desc);
   auto *the_graph = graph->release();
   *graph = pass->Apply(std::unique_ptr<Graph>(the_graph));
paddle/fluid/inference/analysis/ir_pass_manager.h
 @@ -29,6 +29,7 @@
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/argument.h"
+#include "paddle/fluid/inference/analysis/helper.h"

 namespace paddle {
 namespace inference {

 @@ -42,8 +43,8 @@ class IRPassManager final {
   std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph);

   framework::proto::ProgramDesc AcquireProgram(std::unique_ptr<Graph> *graph,
-                                               const ProgramDesc &program) const;
+                                               ProgramDesc *program) const;

   framework::ir::Graph &graph() const { return *graph_; }
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
 @@ -13,6 +13,7 @@
 // limitations under the License.

 #include <algorithm>
+#include <set>
 #include <string>
 #include <vector>

 @@ -67,12 +68,33 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
   return graph;
 }

+std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
+                              const std::set<std::string> &engine_outputs) {
+  std::string engine_hash_key = "";
+  for (auto name : engine_inputs) {
+    engine_hash_key += name;
+  }
+  for (auto name : engine_outputs) {
+    engine_hash_key += name;
+  }
+  auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
+  return engine_key;
+}
+
 void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
                                             Graph *graph) const {
   auto *op_desc = node->Op();
   auto &subgraph = *Agent(node).subgraph();
   PADDLE_ENFORCE(!subgraph.empty());

+  framework::ProgramDesc *program_desc =
+      Get<framework::ProgramDesc *>("program");
+  // Add new block for TensorRTEngineOP
+  const framework::BlockDesc &main_block =
+      program_desc->Block(framework::kRootBlockIndex);
+  // const framework::BlockDesc& main_block = program_desc->Block(0);
+  framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
+
   // A fake block desc.
   framework::proto::BlockDesc block_proto;
   framework::BlockDesc block_desc(nullptr, &block_proto);

 @@ -82,13 +104,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
       subgraph.size());

   for (auto *node : subgraph) {
+    auto *new_block_op = new_block->AppendOp();
     auto *op = block_desc.AppendOp();
+    *new_block_op->Proto() = *node->Op()->Proto();
     *op->Proto() = *node->Op()->Proto();
   }

-  // collect inputs
-  std::unordered_set<std::string> input_names;
-  std::unordered_set<std::string> input_names_with_id;
+  // Later, input_names_with_id and output_names_with_id are used to generate
+  // the engine key.
+  // We use std::set instead of std::unordered_set here to ensure that the
+  // engine key is unique.
+  std::set<std::string> input_names;
+  std::set<std::string> input_names_with_id;
   for (auto *x : node->inputs) {
     input_names.insert(x->Name());
     input_names_with_id.insert(x->Name() + std::to_string(x->id()));

 @@ -96,8 +123,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   op_desc->SetInput(
       "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));

-  std::unordered_set<std::string> output_names;
-  std::unordered_set<std::string> output_names_with_id;
+  std::set<std::string> output_names;
+  std::set<std::string> output_names_with_id;
   for (auto *x : node->outputs) {
     output_names.insert(x->Name());
     output_names_with_id.insert(x->Name() + std::to_string(x->id()));

 @@ -182,7 +209,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   // to Tensor.
   std::vector<std::string> output_mapping;
   for (auto name : output_names) {
-    // LOG(INFO) << name << " " << output_name_map.size();
     PADDLE_ENFORCE(output_name_map.count(name) != 0);
     output_mapping.push_back(output_name_map[name]);
   }

 @@ -193,16 +219,29 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
       *vars->Add() = *node->Var()->Proto();
     }
   }

   PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                  "the block has no var-desc");
   PADDLE_ENFORCE(!output_mapping.empty());
+
+  // Set attrs
+  op_desc->SetBlockAttr("sub_block", new_block);
   SetAttr(op_desc->Proto(), "subgraph",
           block_desc.Proto()->SerializeAsString());
-  // Set attrs
   SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
   SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
   SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
+
+  auto enable_int8 = Get<bool>("enable_int8");
+  auto engine_key =
+      GenerateEngineKey(input_names_with_id, output_names_with_id);
+
+  std::string calibration_data = GetTrtCalibTableData(
+      Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
+  SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
+
+  SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
+  SetAttr(op_desc->Proto(), "engine_key", engine_key);
 }

 std::vector<std::string> ExtractParameters(
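GenerateEngineKey concatenates the ordered input and output names and hashes the string; because std::set iterates in sorted order, the key is deterministic for a given subgraph. A standalone sketch (note that std::hash is implementation-defined, so keys can differ across standard libraries, which is fine as long as the same binary writes and reads the cache):

    #include <functional>
    #include <iostream>
    #include <set>
    #include <string>

    std::string EngineKey(const std::set<std::string>& engine_inputs,
                          const std::set<std::string>& engine_outputs) {
      std::string engine_hash_key;
      for (const auto& name : engine_inputs) engine_hash_key += name;
      for (const auto& name : engine_outputs) engine_hash_key += name;
      return std::to_string(std::hash<std::string>()(engine_hash_key));
    }

    int main() {
      std::cout << EngineKey({"x0", "x1"}, {"y0"}) << "\n";
      return 0;
    }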
paddle/fluid/inference/analysis/passes/CMakeLists.txt
 cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
 cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass)
+cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor)
 cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
 cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
 @@ -31,7 +31,11 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) {
   }

   std::unique_ptr<Graph> graph(argument->main_graph_ptr());

-  framework::ProgramDesc desc(argument->main_program());
+  // Directly using ProgramDesc desc(argument->main_program()) may cause
+  // incomplete copies of information.
+  framework::ProgramDesc desc;
+  desc.CopyFrom(*argument->main_program().Proto());
   pass->SetNotOwned("program", &desc);
   auto thegraph = pass->Apply(std::move(graph));
   thegraph.release();  // the argument still owns the graph.
paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
 @@ -444,6 +444,26 @@ std::vector<std::map<std::string, std::vector<int>>> DeseralizeBatchVarShapes(
   return batch_shapes;
 }

+// Replace the -1 in shape to a real number to fake the shape.
+std::vector<std::map<std::string, std::vector<int>>> FakeBatchVarShapes(
+    const framework::ProgramDesc& program) {
+  std::vector<std::map<std::string, std::vector<int>>> res;
+  res.emplace_back();
+  auto& record = res.front();
+  const int fake_batch_size = 3;
+  for (auto* var : program.Block(0).AllVars()) {
+    if (var->GetType() ==
+        framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) {
+      auto shape = var->GetShape();
+      for (auto& v : shape) {
+        if (v < 0) v = fake_batch_size;
+      }
+      record[var->Name()].assign(shape.begin(), shape.end());
+    }
+  }
+  return res;
+}
+
 // Calculate the average dim of each tensor from the batch shape cache.
 std::unordered_map<std::string, size_t> GetBatchAverageSize(
     const std::vector<std::map<std::string, std::vector<int>>>& batches) {
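FakeBatchVarShapes is what makes the new dynamic memory optimization possible without a recorded shape cache: every -1 (unknown batch) dimension is replaced by a small placeholder before sizes are estimated. The core substitution, standalone:

    #include <iostream>
    #include <vector>

    std::vector<int> FakeShape(std::vector<int> shape, int fake_batch_size = 3) {
      for (auto& v : shape) {
        if (v < 0) v = fake_batch_size;  // -1 marks the unknown batch dimension
      }
      return shape;
    }

    int main() {
      for (int d : FakeShape({-1, 128, 768})) std::cout << d << " ";  // 3 128 768
      return 0;
    }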
 @@ -478,6 +498,7 @@ std::vector<std::unordered_set<std::string>> AnalysisBatchShapesByBatchSize(
   std::unordered_map<std::string, std::stringstream> var_batchsize_hashes;
   for (auto& batch : batches) {
     for (auto& ele : batch) {
+      PADDLE_ENFORCE(!ele.second.empty());
       int batch_size = ele.second.front();
       // TODO(Superjomn) might consume large memory here, use combine hash.
       var_batchsize_hashes[ele.first] << batch_size;
 @@ -538,9 +559,21 @@ std::vector<std::unordered_set<std::string>> AnalysisBatchShapesBySimilarSize(
 std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; }

+std::pair<size_t, size_t> GetRange(
+    const std::unordered_map<std::string, size_t>& ave_size) {
+  auto res = std::make_pair(std::numeric_limits<size_t>::max(),
+                            std::numeric_limits<size_t>::min());
+  for (auto& item : ave_size) {
+    res.first = std::min(item.second, res.first);
+    res.second = std::max(item.second, res.second);
+  }
+  return res;
+}
+
 void MemoryOptimizePass::RunImpl(Argument* argument) {
   // When force update, should not optimize memory.
-  if (!argument->enable_memory_optim() ||
-      argument->memory_optim_force_update())
+  if (!argument->enable_memory_optim() ||
+      argument->static_memory_optim_force_update())
     return;
   graph_ = argument->main_graph_ptr();

 @@ -549,21 +582,38 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
       argument->model_program_path_valid() ? argument->model_program_path()
                                            : "");
   VLOG(3) << "Load memory cache from " << path;
-  if (inference::IsFileExists(path)) {
-    VLOG(4) << "Performing memory optimize";
-    auto batches = DeseralizeBatchVarShapes(path);
-    auto var_batch_ave_size = GetBatchAverageSize(batches);
+  std::vector<std::map<std::string, std::vector<int>>> batches;
+
+  if (argument->static_memory_optim() && inference::IsFileExists(path)) {
+    string::PrettyLogInfo("--- Performing static memory optimize");
+    batches = DeseralizeBatchVarShapes(path);
+  } else {
+    string::PrettyLogInfo("--- Performing dynamic memory optimize");
+    batches = FakeBatchVarShapes(argument->main_program());
+  }
+  auto var_batch_ave_size = GetBatchAverageSize(batches);
+
+  // Get min and max memory size.
+  const auto range = GetRange(var_batch_ave_size);
+  const int cluster_size = std::max(
+      static_cast<int>((range.second - range.first) / 100 /*cluster num*/),
+      1024);
+  const int cluster_size1 = std::max(
+      static_cast<int>((range.second - range.first) / 1000 /*cluster num*/),
+      1024);

-    std::unordered_map<std::string, Node*> tensor_nodes;
-    space_table_t space_table;
-    CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);
+  std::unordered_map<std::string, Node*> tensor_nodes;
+  space_table_t space_table;
+  CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);

-    std::unordered_map<std::string, std::string> reuse_table;
-    double max_saving_ratio = 0.;
-    std::vector<std::function<MemoryAllocation()>> strategies;
+  std::unordered_map<std::string, std::string> reuse_table;
+  double max_saving_ratio = 0.;
+  std::vector<std::function<MemoryAllocation()>> strategies;

 @@ -572,71 +622,67 @@ void MemoryOptimizePass::RunImpl(Argument* argument) {
-    for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
-      strategies.emplace_back([&, sort_kind] {
-        auto clustered_vars_by_batch_size =
-            AnalysisBatchShapesByBatchSize(batches);
-        MemoryAllocation allocation;
-        MakeReusePlan(clustered_vars_by_batch_size, var_batch_ave_size,
-                      space_table, &reuse_table, sort_kind, &allocation);
-        return allocation;
-      });
-
-      strategies.emplace_back([&, sort_kind] {
-        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-            space_table, batches, 1024);  // interval 1kb
-        MemoryAllocation allocation;
-        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                      space_table, &reuse_table, sort_kind, &allocation);
-        return allocation;
-      });
-
-      strategies.emplace_back([&, sort_kind] {
-        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-            space_table, batches, 1024 * 1024);  // interval 1MB
-        MemoryAllocation allocation;
-        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                      space_table, &reuse_table, sort_kind, &allocation);
-        return allocation;
-      });
-
-      strategies.emplace_back([&, sort_kind] {
-        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
-            space_table, batches,
-            std::numeric_limits<int>::max());  // no intervals
-        MemoryAllocation allocation;
-        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
-                      space_table, &reuse_table, sort_kind, &allocation);
-        return allocation;
-      });
-    }
-
-    std::function<MemoryAllocation()>* best_strategy{nullptr};
-
-    // Try all strategies to get the best result.
-    for (auto& strategy : strategies) {
-      auto allocation = strategy();
-      string::PrettyLogDetail(
-          "--- get strategy saving %f memory for workspace",
-          allocation.GetSavingRatio());
-      if (allocation.GetSavingRatio() > max_saving_ratio) {
-        max_saving_ratio = allocation.GetSavingRatio();
-        best_strategy = &strategy;
-      }
-    }
-    if (!best_strategy) {
-      LOG(ERROR) << "This model makes poor memory optimize, skip memory "
-                    "optimize";
-      return;
-    }
-    auto memory_allocation = (*best_strategy)();
-
-    string::PrettyLogH2(
-        "--- Saved %.2f%s memory for workspace(temporary variables)",
-        memory_allocation.GetSavingRatio() * 100, "%");
-    string::PrettyLogDetail("--- Allocated %d MB",
-                            memory_allocation.allocated / 1024. / 1024.);
-    string::PrettyLogDetail("--- Saved %d MB",
-                            memory_allocation.saved / 1024. / 1024.);
-
-    argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove,
-                               new std::unordered_set<std::string>);
-    auto& vars2remove =
-        argument->main_graph().Get<std::unordered_set<std::string>>(
-            framework::ir::kGraphToProgramVarsToRemove);
-
-    PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove);
-    argument->SetMemoryOptimSortKind(memory_allocation.sort_kind);
-  }
+  for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
+    if (argument->static_memory_optim()) {
+      // This strategy only makes sense in static memory optimization.
+      strategies.emplace_back([&, sort_kind] {
+        auto clustered_vars_by_batch_size =
+            AnalysisBatchShapesByBatchSize(batches);
+        MemoryAllocation allocation;
+        MakeReusePlan(clustered_vars_by_batch_size, var_batch_ave_size,
+                      space_table, &reuse_table, sort_kind, &allocation);
+        return allocation;
+      });
+    }
+
+    strategies.emplace_back([&, sort_kind] {
+      auto clustered_vars_by_ave_size =
+          AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size);
+      MemoryAllocation allocation;
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                    space_table, &reuse_table, sort_kind, &allocation);
+      return allocation;
+    });
+
+    strategies.emplace_back([&, sort_kind] {
+      auto clustered_vars_by_ave_size =
+          AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size1);
+      MemoryAllocation allocation;
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                    space_table, &reuse_table, sort_kind, &allocation);
+      return allocation;
+    });
+
+    strategies.emplace_back([&, sort_kind] {
+      auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+          space_table, batches,
+          std::numeric_limits<int>::max());  // no intervals
+      MemoryAllocation allocation;
+      MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                    space_table, &reuse_table, sort_kind, &allocation);
+      return allocation;
+    });
+  }
+
+  std::function<MemoryAllocation()>* best_strategy{nullptr};
+
+  // Try all strategies to get the best result.
+  for (auto& strategy : strategies) {
+    auto allocation = strategy();
+    string::PrettyLogDetail("--- get strategy saving %f memory for workspace",
+                            allocation.GetSavingRatio());
+    if (allocation.GetSavingRatio() > max_saving_ratio) {
+      max_saving_ratio = allocation.GetSavingRatio();
+      best_strategy = &strategy;
+    }
+  }
+  if (!best_strategy) {
+    LOG(ERROR) << "This model makes poor memory optimize, skip memory optimize";
+    return;
+  }
+  auto memory_allocation = (*best_strategy)();
+
+  string::PrettyLogInfo(
+      "--- Saved %.2f%s memory for workspace(temporary variables)",
+      memory_allocation.GetSavingRatio() * 100, "%");
+
+  argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove,
+                             new std::unordered_set<std::string>);
+  auto& vars2remove =
+      argument->main_graph().Get<std::unordered_set<std::string>>(
+          framework::ir::kGraphToProgramVarsToRemove);
+
+  PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove);
+  argument->SetMemoryOptimSortKind(memory_allocation.sort_kind);
 }

 float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const {
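GetRange and the two cluster widths derived from it decide how coarsely variables are grouped by average size before reuse planning: the min-max span is divided into roughly 100 or 1000 buckets, with a 1KB floor. A standalone check of that arithmetic:

    #include <algorithm>
    #include <iostream>
    #include <limits>
    #include <unordered_map>
    #include <utility>

    std::pair<size_t, size_t> GetRange(
        const std::unordered_map<std::string, size_t>& ave_size) {
      auto res = std::make_pair(std::numeric_limits<size_t>::max(),
                                std::numeric_limits<size_t>::min());
      for (const auto& item : ave_size) {
        res.first = std::min(item.second, res.first);
        res.second = std::max(item.second, res.second);
      }
      return res;
    }

    int main() {
      std::unordered_map<std::string, size_t> sizes{{"a", 4096}, {"b", 1 << 20}};
      const auto range = GetRange(sizes);
      const int cluster_size =
          std::max(static_cast<int>((range.second - range.first) / 100), 1024);
      std::cout << cluster_size << "\n";  // 10444
      return 0;
    }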
paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
 @@ -15,7 +15,7 @@
 #pragma once

 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+#include "paddle/fluid/platform/port.h"

 namespace paddle {
 namespace inference {
paddle/fluid/inference/api/analysis_config.cc
 @@ -95,12 +95,14 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
   CP_MEMBER(memory_pool_init_size_mb_);

   CP_MEMBER(enable_memory_optim_);
-  CP_MEMBER(memory_optim_force_update_);
+  CP_MEMBER(static_memory_optim_);
+  CP_MEMBER(static_memory_optim_force_update_);
   // TensorRT related.
   CP_MEMBER(use_tensorrt_);
   CP_MEMBER(tensorrt_workspace_size_);
   CP_MEMBER(tensorrt_max_batchsize_);
   CP_MEMBER(tensorrt_min_subgraph_size_);
+  CP_MEMBER(tensorrt_precision_mode_);
   // MKLDNN related.
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);

 @@ -140,9 +142,9 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
   Update();
 }

-void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
-                                                   int max_batch_size,
-                                                   int min_subgraph_size) {
+void contrib::AnalysisConfig::EnableTensorRtEngine(
+    int workspace_size, int max_batch_size, int min_subgraph_size,
+    contrib::AnalysisConfig::Precision precision_mode) {
 #ifdef PADDLE_WITH_CUDA
   if (!use_gpu()) {
     LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";

 @@ -153,6 +155,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
   tensorrt_min_subgraph_size_ = min_subgraph_size;
+  tensorrt_precision_mode_ = precision_mode;

   Update();
 #else

 @@ -238,7 +241,8 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() {
   ss << tensorrt_min_subgraph_size_;

   ss << enable_memory_optim_;
-  ss << memory_optim_force_update_;
+  ss << static_memory_optim_;
+  ss << static_memory_optim_force_update_;

   ss << use_mkldnn_;
   for (auto &item : mkldnn_enabled_op_types_) ss << item;

 @@ -278,9 +282,11 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
 #endif
 }

-void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) {
+void contrib::AnalysisConfig::EnableMemoryOptim(
+    bool static_optim, bool force_update_static_cache) {
   enable_memory_optim_ = true;
-  memory_optim_force_update_ = force_update_cache;
+  static_memory_optim_ = static_optim;
+  static_memory_optim_force_update_ = force_update_static_cache;

   Update();
 }

 @@ -300,4 +306,16 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
   Update();
 }

+NativeConfig contrib::AnalysisConfig::ToNativeConfig() const {
+  NativeConfig config;
+  config.model_dir = model_dir_;
+  config.prog_file = prog_file_;
+  config.param_file = params_file_;
+  config.use_gpu = use_gpu_;
+  config.device = device_id_;
+  config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
+  config.specify_input_name = specify_input_name_;
+  return config;
+}
+
 }  // namespace paddle
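Putting the configuration changes together, a hedged usage fragment based only on the signatures added in this file; it assumes Paddle's public headers and is a sketch rather than a complete program, with placeholder sizes:

    // Fragment, not a full program: exercises the signatures added above.
    contrib::AnalysisConfig config;
    config.EnableMemoryOptim(/*static_optim=*/false,
                             /*force_update_static_cache=*/false);
    config.EnableTensorRtEngine(/*workspace_size=*/1 << 20,
                                /*max_batch_size=*/1,
                                /*min_subgraph_size=*/3,
                                contrib::AnalysisConfig::Precision::kInt8);
    NativeConfig native = config.ToNativeConfig();  // native-predictor fallback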
paddle/fluid/inference/api/analysis_predictor.cc
...
@@ -15,6 +15,7 @@
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include <glog/logging.h>
#include <glog/logging.h>
#include <algorithm>
#include <algorithm>
#include <fstream>
#include <memory>
#include <memory>
#include <string>
#include <string>
#include <vector>
#include <vector>
...
@@ -25,6 +26,7 @@
...
@@ -25,6 +26,7 @@
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
...
@@ -37,6 +39,8 @@
...
@@ -37,6 +39,8 @@
#if PADDLE_WITH_TENSORRT
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#endif
#endif
DECLARE_bool
(
profile
);
DECLARE_bool
(
profile
);
...
@@ -44,6 +48,12 @@ DECLARE_bool(profile);
...
@@ -44,6 +48,12 @@ DECLARE_bool(profile);
namespace
paddle
{
namespace
paddle
{
using
contrib
::
AnalysisConfig
;
using
contrib
::
AnalysisConfig
;
using
inference
::
Singleton
;
#if PADDLE_WITH_TENSORRT
using
inference
::
tensorrt
::
TRTInt8Calibrator
;
using
inference
::
tensorrt
::
TRTCalibratorEngine
;
using
inference
::
tensorrt
::
TRTCalibratorEngineManager
;
#endif
namespace
{
namespace
{
bool
IsPersistable
(
const
framework
::
VarDesc
*
var
)
{
bool
IsPersistable
(
const
framework
::
VarDesc
*
var
)
{
...
@@ -113,6 +123,15 @@ bool AnalysisPredictor::PrepareProgram(
...
@@ -113,6 +123,15 @@ bool AnalysisPredictor::PrepareProgram(
if
(
!
program
)
{
if
(
!
program
)
{
if
(
!
LoadProgramDesc
())
return
false
;
if
(
!
LoadProgramDesc
())
return
false
;
// If not cloned, the parameters should be loaded.
// If config_.ir_optim() is True, parameters is loaded in
// OptimizeInferenceProgram(), but other persistable variables
// (like RAW type var) are not created in scope.
// If config_.ir_optim() is False, parameters is loaded in LoadParameters(),
// still need to create other persistable variables.
// So in both case, create persistable variables at first.
executor_
->
CreateVariables
(
*
inference_program_
,
0
,
true
,
sub_scope_
);
// Optimize the program, and load parameters and modify them in the
// Optimize the program, and load parameters and modify them in the
// scope_.
// scope_.
// This will change the scope_ address.
// This will change the scope_ address.
@@ -120,15 +139,6 @@ bool AnalysisPredictor::PrepareProgram(
       status_ir_optim_enabled_ = true;
       OptimizeInferenceProgram();
     } else {
-      // If the parent_scope is passed, we assert that the persistable variables
-      // are already created, so just create the no persistable variables.
-
-      // If not cloned, the parameters should be loaded
-      // OptimizeInferenceProgram.
-      // So in both cases, just the local variables are needed to load, not the
-      // parematers.
-      executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
-
       // Load parameters
       LOG(INFO) << "load parameters ";
       LoadParameters();
@@ -298,15 +308,15 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
 bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                  framework::Scope *scope) {
   VLOG(3) << "Predictor::get_fetch";
-  outputs->resize(fetchs_.size());
-  for (size_t i = 0; i < fetchs_.size(); ++i) {
-    int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
+  outputs->resize(fetches_.size());
+  for (size_t i = 0; i < fetches_.size(); ++i) {
+    int idx = boost::get<int>(fetches_[i]->GetAttr("col"));
     PADDLE_ENFORCE((size_t)idx == i);
     framework::LoDTensor &fetch =
         framework::GetFetchVariable(*scope, "fetch", idx);
     auto type = fetch.type();
     auto output = &(outputs->at(i));
-    output->name = fetchs_[idx]->Input("X")[0];
+    output->name = fetches_[idx]->Input("X")[0];
     if (type == framework::proto::VarType::FP32) {
       GetFetchOne<float>(fetch, output);
       output->dtype = PaddleDType::FLOAT32;
@@ -327,7 +337,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   argument_.SetUseGPU(config_.use_gpu());
   argument_.SetGPUDeviceId(config_.gpu_device_id());
   argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
-  argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_);
+  argument_.SetStaticMemoryOptim(config_.static_memory_optim_);
+  argument_.SetStaticMemoryOptimForceUpdate(
+      config_.static_memory_optim_force_update_);
   argument_.SetModelFromMemory(config_.model_from_memory_);
   // Analyze inference_program
   if (!config_.model_dir().empty()) {
@@ -337,6 +349,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
         !config_.params_file().empty(),
         "Either model_dir or (param_file, prog_file) should be set.");
     PADDLE_ENFORCE(!config_.prog_file().empty());
+    std::string dir = inference::analysis::GetDirRoot(config_.prog_file());
+
     argument_.SetModelProgramPath(config_.prog_file());
     argument_.SetModelParamsPath(config_.params_file());
   }
@@ -347,6 +361,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
     argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
     argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
     argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
+    argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
   }

   if (config_.use_mkldnn_) {
@@ -361,7 +376,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   }
   argument_.SetIrAnalysisPasses(passes);
   argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
-  argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
+  argument_.SetScopeNotOwned(scope_.get());
   Analyzer().Run(&argument_);

   PADDLE_ENFORCE(argument_.scope_valid());
@@ -422,10 +437,10 @@ void AnalysisPredictor::PrepareFeedFetch() {
       feed_names_[op->Output("Out")[0]] = idx;
     } else if (op->Type() == "fetch") {
       int idx = boost::get<int>(op->GetAttr("col"));
-      if (fetchs_.size() <= static_cast<size_t>(idx)) {
-        fetchs_.resize(idx + 1);
+      if (fetches_.size() <= static_cast<size_t>(idx)) {
+        fetches_.resize(idx + 1);
       }
-      fetchs_[idx] = op;
+      fetches_[idx] = op;
     }
   }
 }
@@ -567,7 +582,67 @@ bool AnalysisPredictor::LoadParameters() {
   return true;
 }

+#if PADDLE_WITH_TENSORRT
+bool AnalysisPredictor::SaveTrtCalibToDisk() {
+  PADDLE_ENFORCE(config_.tensorrt_engine_enabled(),
+                 "This func can be invoked only in trt mode");
+  auto &block = inference_program_->Block(0);
+  for (auto &op_desc : block.AllOps()) {
+    if (op_desc->Type() == "tensorrt_engine") {
+      std::string engine_name =
+          boost::get<std::string>(op_desc->GetAttr("engine_key"));
+      if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_name)) {
+        LOG(ERROR) << "You should run the predictor(with trt) on the real data "
+                      "to generate calibration info";
+        return false;
+      }
+      TRTCalibratorEngine *calib_engine =
+          Singleton<TRTCalibratorEngineManager>::Global().Get(engine_name);
+      LOG(INFO) << "Wait for calib threads done.";
+      calib_engine->calib_->waitAndSetDone();
+      LOG(INFO) << "Generating TRT Calibration table data, this may cost a lot "
+                   "of time...";
+      calib_engine->thr_->join();
+      std::string calibration_table_data =
+          calib_engine->calib_->getCalibrationTableAsString();
+      if (calibration_table_data.empty()) {
+        LOG(ERROR) << "the calibration table is empty.";
+        return false;
+      }
+      std::string model_opt_cache_dir =
+          argument_.Has("model_dir")
+              ? argument_.model_dir()
+              : inference::analysis::GetDirRoot(argument_.model_program_path());
+      std::string calibration_table_data_path =
+          inference::analysis::GetTrtCalibPath(
+              inference::analysis::GetOrCreateModelOptCacheDir(
+                  model_opt_cache_dir),
+              engine_name);
+      std::ofstream ofile(calibration_table_data_path, std::ios::out);
+      LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file "
+                << calibration_table_data_path;
+      ofile << calibration_table_data;
+      ofile.close();
+    }
+  }
+  // Free all calibrator resources.
+  Singleton<TRTCalibratorEngineManager>::Global().DeleteALL();
+  return true;
+}
+#endif
+
 AnalysisPredictor::~AnalysisPredictor() {
+#if PADDLE_WITH_TENSORRT
+  if (config_.tensorrt_engine_enabled() &&
+      config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 &&
+      Singleton<TRTCalibratorEngineManager>::Global().Has()) {
+    SaveTrtCalibToDisk();
+  }
+#endif
   if (FLAGS_profile) {
     platform::DisableProfiler(platform::EventSortingKey::kTotal,
                               "./profile.log");
@@ -638,12 +713,12 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
   // check if the cache exists
   if (!config_.enable_memory_optim()) {
     need = false;
-  } else if (config_.enable_memory_optim() &&
+  } else if (config_.static_memory_optim_ &&
              !inference::IsFileExists(inference::analysis::GetMemoryCachePath(
                  config_.model_dir(), config_.prog_file()))) {
     need = true;
-  } else if (config_.enable_memory_optim() &&
-             config_.memory_optim_force_update_) {
+  } else if (config_.static_memory_optim_ &&
+             config_.static_memory_optim_force_update_) {
     need = true;
   }
@@ -651,6 +726,10 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
   return need;
 }

+std::string AnalysisPredictor::GetSeriazlizedProgram() const {
+  return inference_program_->Proto()->SerializeAsString();
+}
+
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
     const contrib::AnalysisConfig &config) {
paddle/fluid/inference/api/analysis_predictor.h
@@ -75,6 +75,8 @@ class AnalysisPredictor : public PaddlePredictor {
   void SetMkldnnThreadID(int tid);

+  std::string GetSeriazlizedProgram() const override;
+
  protected:
   // For memory optimization.
   bool need_collect_var_shapes_for_memory_optim();
@@ -97,6 +99,21 @@ class AnalysisPredictor : public PaddlePredictor {
   void GetFetchOne(const framework::LoDTensor &fetchs,
                    PaddleTensor *output_data);

+#if PADDLE_WITH_TENSORRT
+  // When we use Paddle-TRT INT8 engine, we need to generate calibration table
+  // data first, the calibration table contains the range for each op's input
+  // and output, this whole process can be divided into several steps:
+  //
+  // 1. Builds a 32-bit engine, runs it on the calibration set, and records a
+  //    histogram for each tensor of the distribution of activation values.
+  // 2. Builds a calibration table from the histograms.
+  //
+  // After step 2, we need to store the calibration table on disk
+  bool SaveTrtCalibToDisk();
+#endif
+
 // Some more detailed tests, they are made the friends of the predictor, so that
 // the all the details can be tested.
 #if PADDLE_WITH_TESTING
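Read together with SaveTrtCalibToDisk() in the .cc above, the intended INT8 flow is roughly the sketch below; the model path and the calibration-batch container are assumptions for illustration, and enabling the GPU on the config is required but elided:

// Step 1: run an INT8-configured predictor over representative data so the
// calibrator can record activation histograms.
paddle::contrib::AnalysisConfig config("/path/to/model_dir");  // assumed path
config.EnableTensorRtEngine(
    1 << 20, 1, 3, paddle::contrib::AnalysisConfig::Precision::kInt8);
{
  auto predictor =
      paddle::CreatePaddlePredictor<paddle::contrib::AnalysisConfig>(config);
  for (const auto &batch : calibration_batches) {  // assumed input batches
    std::vector<paddle::PaddleTensor> outputs;
    predictor->Run(batch, &outputs);
  }
}  // Step 2: destroying the predictor triggers SaveTrtCalibToDisk(), which
   // joins the calibration thread and writes the table into the model's
   // opt-cache directory (via GetOrCreateModelOptCacheDir) for later INT8 runs.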
@@ -115,7 +132,7 @@ class AnalysisPredictor : public PaddlePredictor {
   std::shared_ptr<framework::ProgramDesc> inference_program_;
   std::vector<framework::OpDesc *> feeds_;
   std::map<std::string, size_t> feed_names_;
-  std::vector<framework::OpDesc *> fetchs_;
+  std::vector<framework::OpDesc *> fetches_;
   // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
   // concurrency problems, wrong results and memory leak, so cache them.
   std::vector<framework::LoDTensor> feed_tensors_;
paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -215,6 +215,8 @@ TEST(AnalysisPredictor, memory_optim) {
   {
     // The first predictor help to cache the memory optimize strategy.
     auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+    LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram();
+    ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty());

     // Run several times to check the parameters are not reused by mistake.
     for (int i = 0; i < 5; i++) {
paddle/fluid/inference/api/api.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <sstream>
+#include "paddle/fluid/framework/commit.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
@@ -97,4 +99,12 @@ void PaddleBuf::Free() {
   }
 }

+std::string get_version() {
+  std::stringstream ss;
+  ss << "version: " << framework::paddle_version() << "\n";
+  ss << "commit: " << framework::paddle_commit() << "\n";
+  ss << "branch: " << framework::paddle_compile_branch() << "\n";
+  return ss.str();
+}
+
 }  // namespace paddle
paddle/fluid/inference/api/api_tester.cc
@@ -61,4 +61,10 @@ TEST(paddle_inference_api, demo) {
   predictor->Run({}, &outputs);
 }

+TEST(paddle_inference_api, get_version) {
+  LOG(INFO) << "paddle version:\n" << get_version();
+  auto version = get_version();
+  ASSERT_FALSE(version.empty());
+}
+
 }  // namespace paddle
paddle/fluid/inference/api/paddle_analysis_config.h
@@ -42,6 +42,10 @@ struct AnalysisConfig {
   explicit AnalysisConfig(const std::string& model_dir);
   explicit AnalysisConfig(const std::string& prog_file,
                           const std::string& params_file);
+
+  enum class Precision {
+    kFloat32 = 0,
+    kInt8,
+  };

   /** Set model with a directory.
   */
@@ -135,7 +139,8 @@ struct AnalysisConfig {
   * subgraph is less than this, it will not transfer to TensorRT engine.
   */
   void EnableTensorRtEngine(int workspace_size = 1 << 20,
-                            int max_batch_size = 1, int min_subgraph_size = 3);
+                            int max_batch_size = 1, int min_subgraph_size = 3,
+                            Precision precision = Precision::kFloat32);
   /** A boolean state telling whether the TensorRT engine is used.
   */
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
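A hedged sketch of the new precision parameter in use; the model path is an assumption, and enabling the GPU on the config beforehand is required but elided:

paddle::contrib::AnalysisConfig config("/path/to/model_dir");  // assumed path
// ... enable GPU on the config first, then request an INT8 TensorRT engine:
config.EnableTensorRtEngine(1 << 20 /*workspace_size*/,
                            1 /*max_batch_size*/,
                            3 /*min_subgraph_size*/,
                            paddle::contrib::AnalysisConfig::Precision::kInt8);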
@@ -162,17 +167,7 @@ struct AnalysisConfig {
   /** Transform the AnalysisConfig to NativeConfig.
   */
-  NativeConfig ToNativeConfig() const {
-    NativeConfig config;
-    config.model_dir = model_dir_;
-    config.prog_file = prog_file_;
-    config.param_file = params_file_;
-    config.use_gpu = use_gpu_;
-    config.device = device_id_;
-    config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
-    config.specify_input_name = specify_input_name_;
-    return config;
-  }
+  NativeConfig ToNativeConfig() const;
+
   /** Specify the operator type list to use MKLDNN acceleration.
   * @param op_list the operator type list.
   */
@@ -195,7 +190,8 @@ struct AnalysisConfig {
   /** Turn on memory optimize
   * NOTE still in development, will release latter.
   */
-  void EnableMemoryOptim(bool force_update_cache = false);
+  void EnableMemoryOptim(bool static_optim = false,
+                         bool force_update_static_cache = false);
   /** Tell whether the memory optimization is activated. */
   bool enable_memory_optim() const;
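The two flags map onto the two strategies exercised by the DAM tests later in this commit; a short sketch of both call forms (model path assumed):

paddle::contrib::AnalysisConfig cfg("/path/to/model_dir");  // assumed path

// Dynamic (runtime) memory optimization: no shape cache on disk.
cfg.EnableMemoryOptim();

// Static memory optimization: plan reuse from cached variable shapes,
// rebuilding the cache on this run.
cfg.EnableMemoryOptim(true /*static_optim*/,
                      true /*force_update_static_cache*/);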
@@ -238,10 +234,12 @@ struct AnalysisConfig {
   // We set this variable to control the minimum number of nodes in the
   // subgraph, 3 as default value.
   int tensorrt_min_subgraph_size_{3};
+  Precision tensorrt_precision_mode_;
   // memory reuse related.
   bool enable_memory_optim_{false};
-  bool memory_optim_force_update_{false};
+  bool static_memory_optim_{false};
+  bool static_memory_optim_force_update_{false};
   bool use_mkldnn_{false};
   std::unordered_set<std::string> mkldnn_enabled_op_types_;
paddle/fluid/inference/api/paddle_api.h
@@ -215,6 +215,14 @@ class PaddlePredictor {
   */
   virtual ~PaddlePredictor() = default;

+  /** \brief Get the serialized model program that executes in inference phase.
+   * Its data type is ProgramDesc, which is a protobuf message.
+   */
+  virtual std::string GetSeriazlizedProgram() const {
+    assert(false);  // Force raise error.
+    return "NotImplemented";
+  }
+
   /** The common configs for all the predictors.
   */
   struct Config {

@@ -288,4 +296,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
 int PaddleDtypeSize(PaddleDType dtype);

+std::string get_version();
+
 }  // namespace paddle
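Since the returned string is serialized ProgramDesc bytes, a caller can parse it back with the generated protobuf class; a hedged sketch (the generated-header path is assumed from the usual Paddle build layout):

#include <glog/logging.h>
#include "paddle/fluid/framework/framework.pb.h"  // assumed generated header

void InspectProgram(const paddle::PaddlePredictor &predictor) {
  paddle::framework::proto::ProgramDesc desc;
  // GetSeriazlizedProgram() returns ProgramDesc bytes (see comment above).
  if (desc.ParseFromString(predictor.GetSeriazlizedProgram())) {
    // e.g. count the blocks in the inference program.
    LOG(INFO) << "blocks: " << desc.blocks_size();
  }
}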
paddle/fluid/inference/api/paddle_pass_builder.h
@@ -154,13 +154,16 @@ class GpuPassStrategy : public PassStrategy {
  public:
   GpuPassStrategy() : PassStrategy({}) {
     passes_.assign({
         "infer_clean_graph_pass",                    //
         "conv_affine_channel_fuse_pass",             //
         "conv_eltwiseadd_affine_channel_fuse_pass",  //
         "conv_bn_fuse_pass",                         //
+#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
+                           // guaranteed at least v7
         "conv_elementwise_add_act_fuse_pass",   //
         "conv_elementwise_add2_act_fuse_pass",  //
         "conv_elementwise_add_fuse_pass",       //
+#endif
     });

     for (int i = 6; i >= 3; i--) {
paddle/fluid/inference/tensorrt/CMakeLists.txt
-nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
+nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
 nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
 nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
paddle/fluid/inference/tensorrt/engine.cc
@@ -69,6 +69,13 @@ void TensorRTEngine::FreezeNetwork() {
   // build engine.
   infer_builder_->setMaxBatchSize(max_batch_);
   infer_builder_->setMaxWorkspaceSize(max_workspace_);
+  if (enable_int8_) {
+    infer_builder_->setInt8Mode(true);
+    PADDLE_ENFORCE(
+        calibrator_ != nullptr,
+        "The precision mode is 'INT8', the calibrator should not be nullptr");
+    infer_builder_->setInt8Calibrator(calibrator_);
+  }
+
   infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_));
   PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
paddle/fluid/inference/tensorrt/engine.h
@@ -23,12 +23,14 @@ limitations under the License. */
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
 #include "paddle/fluid/inference/utils/singleton.h"

 namespace paddle {
 namespace inference {
 namespace tensorrt {

+class TRTInt8Calibrator;
 /*
  * TensorRT Engine.
  *
@@ -55,13 +57,16 @@ class TensorRTEngine : public EngineBase {
   };

   TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream,
-                 int device = 0,
+                 int device = 0, bool enable_int8 = false,
+                 TRTInt8Calibrator* calibrator = nullptr,
                  nvinfer1::ILogger& logger = NaiveLogger::Global())
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
         stream_(stream),
-        logger_(logger),
-        device_(device) {}
+        device_(device),
+        enable_int8_(enable_int8),
+        calibrator_(calibrator),
+        logger_(logger) {}

   virtual ~TensorRTEngine();
@@ -139,8 +144,8 @@ class TensorRTEngine : public EngineBase {
   // In the normal case, the paddle-trt exists bug when runing the googlenet.
   // When there are more than two convolutions of 1 * 1 with the same input, the
   // paddle-tensorrt will do the merging optimization, which fuse those conv
-  // into one conv, and then trigger bug. So, We should use strategy to avoid this
+  // into one conv, and then trigger bug. So, We should use strategy to avoid
+  // this
   // optimization for the time being. This bug will be fixed in the future.
   std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
       itensor_quote_num;
@@ -153,9 +158,14 @@ class TensorRTEngine : public EngineBase {
   // the max memory size the engine uses
   int max_workspace_;

-  cudaStream_t stream_;
+  // The specific GPU id that the TensorRTEngine bounded to.
+  int device_;
+
+  bool enable_int8_;
+  TRTInt8Calibrator* calibrator_;
   // batch size of the current data, will be updated each Executation.
   int batch_size_{-1};
+
+  cudaStream_t stream_;
   nvinfer1::ILogger& logger_;
@@ -165,8 +175,6 @@ class TensorRTEngine : public EngineBase {
   std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
       itensor_map_;

-  // The specific GPU id that the TensorRTEngine bounded to.
-  int device_;
   std::vector<std::unique_ptr<plugin::PluginTensorRT>> owned_plugin_;

   // TensorRT related internal members
paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "glog/logging.h"

namespace paddle {
namespace inference {
namespace tensorrt {

// set the batch size before constructing the thread to execute engine
int TRTInt8Calibrator::getBatchSize() const { return batch_size_; }

TRTInt8Calibrator::TRTInt8Calibrator(
    const std::unordered_map<std::string, size_t>& buffers, int batch_size,
    std::string engine_name, const platform::Place place)
    : batch_size_(batch_size), engine_name_(engine_name) {
  int i = 0;
  VLOG(4) << "Init a new calibrator: " << engine_name_;
  for (const auto it : buffers) {
    framework::Tensor temp_tensor;
    std::string input_name = it.first;
    int data_size = it.second;
    int num_ele = data_size / sizeof(int16_t);
    framework::DDim data_shape = framework::make_ddim({num_ele});
    temp_tensor.Resize(data_shape);
    data_tensors_.push_back(temp_tensor);
    data_buffers_[input_name] = std::pair<void*, size_t>(
        static_cast<void*>(temp_tensor.mutable_data<int16_t>(place)), num_ele);
    i += 1;
  }
}

TRTInt8Calibrator::TRTInt8Calibrator(const std::string& calib_data)
    : batch_size_(0),
      calib_running_(false),
      data_is_set_(false),
      done_(true),
      calibration_table_(calib_data) {}

void TRTInt8Calibrator::waitAndSetDone() {
  std::unique_lock<std::mutex> lk(mut_);
  while ((calib_running_ || data_is_set_) && !done_) cond_.wait(lk);
  if (!done_) {
    done_ = true;
    cond_.notify_all();
  }
}

// There might be more than one input for trt subgraph,
// So, we use a map to store input information.
bool TRTInt8Calibrator::setBatch(
    const std::unordered_map<std::string, void*>& data) {
  VLOG(3) << "set batch: " << engine_name_;
  std::unique_lock<std::mutex> lk(mut_);
  // There is a producer and a consumer. The producer set the batch data and
  // the consumer get the batch data. The size of the data pool is one.
  // So, the producer has to wait for the consumer to finish processing before
  // they can set the data.
  while ((calib_running_ || data_is_set_) && (!done_)) cond_.wait(lk);
  // The done_ is set to true using waitAndSetDone, When all calibration data
  // are processed.
  if (done_) return false;

  // Sets the batch.
  for (const auto& it : data) {
    auto dataptr = data_buffers_.find(it.first);
    if (dataptr == data_buffers_.end()) {
      LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first
                 << "' does not match with the buffer names";
    }
    const auto& d = dataptr->second;
    PADDLE_ENFORCE(
        cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice),
        "Fail to cudaMemcpy %s for %s", engine_name_, it.first);
  }

  data_is_set_ = true;
  cond_.notify_all();
  return true;
}

bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
                                 int num_bindings) {
  VLOG(4) << "get batch: " << engine_name_;
  std::unique_lock<std::mutex> lk(mut_);
  // The consumer has just finished processing a data.
  // The producer can set the data again.
  calib_running_ = false;
  cond_.notify_all();

  // As long as there is data in the pool, the consumer can get it.
  while (!data_is_set_ && !done_) cond_.wait(lk);
  if (done_) return false;

  // Gets the batch
  for (int i = 0; i < num_bindings; i++) {
    auto it = data_buffers_.find(names[i]);
    if (it == data_buffers_.end()) {
      LOG(FATAL) << "Calibration engine asked for unknown tensor name '"
                 << names[i] << "' at position " << i;
    }
    bindings[i] = it->second.first;
  }

  data_is_set_ = false;
  calib_running_ = true;
  VLOG(4) << "get batch done: " << engine_name_;
  return true;
}

void TRTInt8Calibrator::setDone() {
  std::unique_lock<std::mutex> lk(mut_);
  done_ = true;
  cond_.notify_all();
}

const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) {
  if (calibration_table_.empty()) return nullptr;
  length = calibration_table_.size();
  return calibration_table_.data();
}

void TRTInt8Calibrator::writeCalibrationCache(const void* ptr,
                                              std::size_t length) {
  calibration_table_ = std::string((const char*)ptr, length);
  VLOG(4) << "Got calibration data for " << engine_name_ << " " << ptr
          << " length=" << length;
}

TRTInt8Calibrator::~TRTInt8Calibrator() {
  VLOG(4) << "Destroying calibrator for " << engine_name_;
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
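setBatch() and getBatch() above implement a one-slot producer/consumer handshake over a single mutex and condition variable. A self-contained sketch of the same pattern, with invented names and no TensorRT dependency:

#include <condition_variable>
#include <mutex>

// One-slot mailbox: the producer publishes a value, the consumer takes it,
// and each side waits for the other on a single condition variable.
class OneSlotMailbox {
 public:
  bool Put(int value) {                     // producer (cf. setBatch)
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return !full_ || done_; });
    if (done_) return false;                // shut down, reject new data
    slot_ = value;
    full_ = true;
    cv_.notify_all();
    return true;
  }
  bool Take(int* value) {                   // consumer (cf. getBatch)
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return full_ || done_; });
    if (!full_) return false;               // woken by shutdown, slot empty
    *value = slot_;
    full_ = false;
    cv_.notify_all();
    return true;
  }
  void Shutdown() {                         // cf. setDone/waitAndSetDone
    std::lock_guard<std::mutex> lk(mu_);
    done_ = true;
    cv_.notify_all();
  }
 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int slot_ = 0;
  bool full_ = false;
  bool done_ = false;
};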
paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <atomic>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace inference {
namespace tensorrt {

class TensorRTEngine;

struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
 public:
  TRTInt8Calibrator(const std::unordered_map<std::string, size_t>& buffers,
                    int batch_size, std::string engine_name,
                    const platform::Place place);

  explicit TRTInt8Calibrator(const std::string& calibration_data);
  ~TRTInt8Calibrator();

  int getBatchSize() const override;

  bool getBatch(void* bindings[], const char* names[],
                int num_bindings) override;

  bool setBatch(const std::unordered_map<std::string, void*>& data);
  void setDone();
  void waitAndSetDone();

  const void* readCalibrationCache(std::size_t& length) override;
  void writeCalibrationCache(const void* ptr, std::size_t length) override;
  const std::string& getCalibrationTableAsString() {
    return calibration_table_;
  }

 private:
  const int batch_size_;

  bool calib_running_{true};
  bool data_is_set_{false};
  bool done_{false};

  std::mutex mut_;
  std::condition_variable cond_;

  std::unordered_map<std::string, std::pair<void*, size_t>> data_buffers_;
  std::vector<framework::Tensor> data_tensors_;

  std::string engine_name_;
  std::string calibration_table_;
};

class TRTCalibratorEngine {
 public:
  TRTCalibratorEngine() {}
  std::unique_ptr<TRTInt8Calibrator> calib_;
  std::unique_ptr<std::thread> thr_;
  std::unique_ptr<TensorRTEngine> engine_;
};

/*
 * Manager to control the TensorRT Int8 calibration creation and deltetion.
 */
class TRTCalibratorEngineManager {
 public:
  bool Has() const { return res_.size() > 0; }
  bool Has(const std::string& name) const {
    if (res_.count(name) == 0) return false;
    return res_.at(name).get() != nullptr;
  }

  // Get Int8Calibrator via name
  TRTCalibratorEngine* Get(const std::string& name) const {
    return res_.at(name).get();
  }

  // Look up or create a calibrator.
  TRTCalibratorEngine* LookupOrCreate(const std::string& engine_name) {
    if (res_.count(engine_name) == 0) {
      auto* p = new TRTCalibratorEngine;
      res_[engine_name].reset(p);
    }
    return res_.at(engine_name).get();
  }

  // Create an Int8Calibrator
  TRTCalibratorEngine* Create(const std::string& engine_name) {
    auto* p = new TRTCalibratorEngine;
    res_[engine_name].reset(p);
    return p;
  }

  void DeleteALL() {
    for (auto& item : res_) {
      item.second.reset(nullptr);
    }
  }

 private:
  std::unordered_map<std::string, std::unique_ptr<TRTCalibratorEngine>> res_;
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
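A hedged sketch of how the manager is typically driven (the engine key and the code that populates the calibrator/thread/engine members are assumptions; the actual wiring lives in the tensorrt_engine op, not in this header):

using paddle::inference::Singleton;
using paddle::inference::tensorrt::TRTCalibratorEngineManager;

// One calibrator bundle per engine key; the first call creates it,
// later calls reuse it.
auto& manager = Singleton<TRTCalibratorEngineManager>::Global();
auto* calib_res = manager.LookupOrCreate("engine_key_0");  // assumed key
// calib_res->calib_ / calib_res->thr_ / calib_res->engine_ are then filled in
// by the op that owns the subgraph; once calibration finishes,
// AnalysisPredictor::SaveTrtCalibToDisk() reads the table and calls:
manager.DeleteALL();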
paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -54,6 +54,7 @@ else()
   message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \ntest_analyzer_seq_pool1")
 endif()

 # RNN2
 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
 download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")

@@ -115,6 +116,10 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
 endif()
 inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)

+# googlenet
+inference_analysis_api_test_with_fake_data(test_analyzer_googlenet
+  "${INFERENCE_DEMO_INSTALL_DIR}/googlenet" analyzer_resnet50_tester.cc "googlenet.tar.gz" SERIAL)
+
 # resnet50
 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
   "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL)
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -253,7 +253,7 @@ void compare(bool use_mkldnn = false) {
 }

 // Compare result of NativeConfig and AnalysisConfig with memory optimization.
-TEST(Analyzer_dam, compare_with_memory_optim) {
+TEST(Analyzer_dam, compare_with_static_memory_optim) {
   // The small dam will core in CI, but works in local.
   if (FLAGS_max_turn_num == 9) {
     contrib::AnalysisConfig cfg, cfg1;

@@ -263,7 +263,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) {
     SetInput(&input_slots_all);

     // Run the first time to force to update memory cache
     SetConfig(&cfg);
-    cfg.EnableMemoryOptim(true);
+    cfg.EnableMemoryOptim(true, true /*force update*/);

     CompareNativeAndAnalysis(
         reinterpret_cast<const PaddlePredictor::Config *>(&cfg),

@@ -271,7 +271,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) {
     // Run second time to use the memory cache and perform memory optimization.
     SetConfig(&cfg1);
-    cfg1.EnableMemoryOptim();
+    cfg1.EnableMemoryOptim(true, false /*do not force update*/);

     CompareNativeAndAnalysis(
         reinterpret_cast<const PaddlePredictor::Config *>(&cfg1),

@@ -279,6 +279,24 @@ TEST(Analyzer_dam, compare_with_memory_optim) {
   }
 }

+TEST(Analyzer_dam, compare_with_dynamic_memory_optim) {
+  // The small dam will core in CI, but works in local.
+  if (FLAGS_max_turn_num == 9) {
+    contrib::AnalysisConfig cfg, cfg1;
+    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
+    SetInput(&input_slots_all);
+
+    // Run the first time to force to update memory cache
+    SetConfig(&cfg);
+    cfg.EnableMemoryOptim();
+
+    CompareNativeAndAnalysis(
+        reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+        input_slots_all);
+  }
+}
+
 TEST(Analyzer_dam, compare) { compare(); }

 #ifdef PADDLE_WITH_MKLDNN
paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include <string>
+#include <utility>
 #include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/memory/detail/buddy_allocator.h"

@@ -37,7 +38,7 @@ template <typename Place>
 void *Alloc(const Place &place, size_t size);

 template <typename Place>
-void Free(const Place &place, void *p);
+void Free(const Place &place, void *p, size_t size);

 template <typename Place>
 size_t Used(const Place &place);

@@ -52,6 +53,11 @@ size_t memory_usage(const platform::Place &p);
 using BuddyAllocator = detail::BuddyAllocator;

+std::unordered_map</*device id*/ int,
+                   std::pair</*current memory usage*/ uint64_t,
+                             /*peak memory usage*/ uint64_t>>
+    gpu_mem_info;
+
 BuddyAllocator *GetCPUBuddyAllocator() {
   // We tried thread_local for inference::RNN1 model, but that not works much
   // for multi-thread test.

@@ -98,7 +104,8 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
 }

 template <>
-void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p) {
+void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
+                              size_t size) {
   VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
   GetCPUBuddyAllocator()->Free(p);
 }

@@ -177,9 +184,16 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
     LOG(WARNING) << "GPU memory used: "
                  << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
     platform::SetDeviceId(cur_dev);
   } else {
+    gpu_mem_info[place.device].first += size;
+    if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) {
+      gpu_mem_info[place.device].second = gpu_mem_info[place.device].first;
+      VLOG(3) << "device: " << place.device << " peak memory usage : "
+              << (gpu_mem_info[place.device].second >> 20) << " MiB";
+    }
     if (FLAGS_init_allocated_mem) {
       cudaMemset(ptr, 0xEF, size);
     }
   }
   return ptr;
 #else

@@ -188,9 +202,11 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
 }

 template <>
-void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p) {
+void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
+                               size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
+  gpu_mem_info[place.device].first -= size;
 #else
   PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif

@@ -243,7 +259,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
 template <>
 void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
-                                     void *p) {
+                                     void *p, size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetCUDAPinnedBuddyAllocator()->Free(p);
 #else

@@ -264,15 +280,17 @@ struct AllocVisitor : public boost::static_visitor<void *> {
 };

 struct FreeVisitor : public boost::static_visitor<void> {
-  inline explicit FreeVisitor(void *ptr) : ptr_(ptr) {}
+  inline explicit FreeVisitor(void *ptr, size_t size)
+      : ptr_(ptr), size_(size) {}

   template <typename Place>
   inline void operator()(const Place &place) const {
-    Free<Place>(place, ptr_);
+    Free<Place>(place, ptr_, size_);
   }

  private:
   void *ptr_;
+  size_t size_;
 };

 size_t Usage::operator()(const platform::CPUPlace &cpu) const {

@@ -304,8 +322,9 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
 }

 void LegacyAllocator::Free(Allocation *allocation) {
-  boost::apply_visitor(legacy::FreeVisitor(allocation->ptr()),
-                       allocation->place());
+  boost::apply_visitor(
+      legacy::FreeVisitor(allocation->ptr(), allocation->size()),
+      allocation->place());
   delete allocation;
 }
 }  // namespace allocation
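The gpu_mem_info bookkeeping above boils down to a current/peak counter pair per device; a minimal sketch of the pattern (note the real code relies on the allocator's own synchronization, which this sketch omits):

#include <cstdint>
#include <utility>

// {current, peak} usage for one device, cf. gpu_mem_info's mapped type.
std::pair<uint64_t, uint64_t> usage{0, 0};

void OnAlloc(uint64_t size) {
  usage.first += size;
  if (usage.first > usage.second) usage.second = usage.first;  // new peak
}

void OnFree(uint64_t size) { usage.first -= size; }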
paddle/fluid/operators/CMakeLists.txt
@@ -13,6 +13,7 @@ add_subdirectory(detection)
 add_subdirectory(elementwise)
 add_subdirectory(fused)
 add_subdirectory(metrics)
+add_subdirectory(ngraph)
 add_subdirectory(optimizers)
 add_subdirectory(reduce_ops)
 add_subdirectory(sequence_ops)

@@ -66,7 +67,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
 endif()

@@ -86,7 +87,6 @@ set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies")
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
-cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
paddle/fluid/operators/beam_search_op.cc
...
@@ -12,205 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,205 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include
<algorithm>
#include
"paddle/fluid/operators/beam_search_op.h"
#include <map>
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/beam_search_op.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
void
BeamSearch
::
operator
()(
const
framework
::
LoDTensor
&
pre_ids
,
const
framework
::
LoDTensor
&
pre_scores
,
framework
::
LoDTensor
*
selected_ids
,
framework
::
LoDTensor
*
selected_scores
)
{
auto
abs_lod
=
framework
::
ToAbsOffset
(
ids_
->
lod
());
auto
&
high_level
=
abs_lod
[
lod_level_
];
auto
items
=
SelectTopBeamSizeItems
(
pre_ids
,
pre_scores
);
auto
selected_items
=
ToMap
(
items
,
high_level
.
back
());
VLOG
(
3
)
<<
"selected_items:"
;
for
(
size_t
i
=
0
;
i
<
selected_items
.
size
();
++
i
)
{
VLOG
(
3
)
<<
"offset:"
<<
i
;
for
(
auto
&
item
:
selected_items
[
i
])
{
VLOG
(
3
)
<<
ItemToString
(
item
);
}
}
PruneEndBeams
(
pre_ids
,
&
selected_items
);
// calculate the output tensor's height
size_t
num_instances
=
std
::
accumulate
(
std
::
begin
(
selected_items
),
std
::
end
(
selected_items
),
0
,
[](
size_t
a
,
std
::
vector
<
Item
>
&
b
)
{
return
a
+
b
.
size
();
});
// the output tensor shape should be [num_instances, 1]
auto
dims
=
framework
::
make_ddim
(
std
::
vector
<
int64_t
>
({
static_cast
<
int
>
(
num_instances
),
1
}));
selected_ids
->
Resize
(
dims
);
selected_scores
->
Resize
(
dims
);
std
::
map
<
size_t
/*offset*/
,
std
::
vector
<
Item
>>
hash
;
framework
::
LoD
new_lod
;
auto
*
ids_data
=
selected_ids
->
mutable_data
<
int64_t
>
(
platform
::
CPUPlace
());
auto
*
scores_data
=
selected_scores
->
mutable_data
<
float
>
(
platform
::
CPUPlace
());
// fill in data
std
::
vector
<
size_t
>
low_level
;
size_t
low_offset
=
0
;
for
(
auto
&
items
:
selected_items
)
{
low_level
.
push_back
(
low_offset
);
for
(
auto
&
item
:
items
)
{
ids_data
[
low_offset
]
=
item
.
id
;
scores_data
[
low_offset
]
=
item
.
score
;
low_offset
++
;
}
}
low_level
.
push_back
(
low_offset
);
// fill lod
framework
::
LoD
lod
(
2
);
lod
[
0
].
assign
(
high_level
.
begin
(),
high_level
.
end
());
lod
[
1
].
assign
(
low_level
.
begin
(),
low_level
.
end
());
if
(
!
framework
::
CheckLoD
(
lod
))
{
PADDLE_THROW
(
"lod %s is not right"
,
framework
::
LoDToString
(
lod
));
}
selected_ids
->
set_lod
(
lod
);
selected_scores
->
set_lod
(
lod
);
}
void
BeamSearch
::
PruneEndBeams
(
const
framework
::
LoDTensor
&
pre_ids
,
std
::
vector
<
std
::
vector
<
Item
>>
*
items
)
{
auto
*
pre_ids_data
=
pre_ids
.
data
<
int64_t
>
();
auto
abs_lod
=
framework
::
ToAbsOffset
(
ids_
->
lod
());
auto
&
high_level
=
abs_lod
[
lod_level_
];
for
(
size_t
src_idx
=
0
;
src_idx
<
high_level
.
size
()
-
1
;
++
src_idx
)
{
size_t
src_prefix_start
=
high_level
[
src_idx
];
size_t
src_prefix_end
=
high_level
[
src_idx
+
1
];
bool
finish_flag
=
true
;
for
(
size_t
offset
=
src_prefix_start
;
offset
<
src_prefix_end
;
offset
++
)
{
for
(
auto
&
item
:
items
->
at
(
offset
))
{
if
(
item
.
id
!=
static_cast
<
size_t
>
(
end_id_
)
||
pre_ids_data
[
offset
]
!=
end_id_
)
{
finish_flag
=
false
;
break
;
}
}
if
(
!
finish_flag
)
break
;
}
if
(
finish_flag
)
{
// all branchs of the beam (source sentence) end and
// prune this beam
for
(
size_t
offset
=
src_prefix_start
;
offset
<
src_prefix_end
;
offset
++
)
items
->
at
(
offset
).
clear
();
}
}
}
std
::
vector
<
std
::
vector
<
BeamSearch
::
Item
>>
BeamSearch
::
ToMap
(
const
std
::
vector
<
std
::
vector
<
Item
>>
&
items
,
size_t
element_num
)
{
std
::
vector
<
std
::
vector
<
Item
>>
result
;
result
.
resize
(
element_num
);
for
(
auto
&
entries
:
items
)
{
for
(
const
auto
&
item
:
entries
)
{
result
[
item
.
offset
].
push_back
(
item
);
}
}
return
result
;
}
std
::
vector
<
std
::
vector
<
BeamSearch
::
Item
>>
BeamSearch
::
SelectTopBeamSizeItems
(
const
framework
::
LoDTensor
&
pre_ids
,
const
framework
::
LoDTensor
&
pre_scores
)
{
std
::
vector
<
std
::
vector
<
Item
>>
result
;
std
::
vector
<
Item
>
items
;
// for each source sentence, select the top beam_size items across all
// candidate sets.
while
(
NextItemSet
(
pre_ids
,
pre_scores
,
&
items
))
{
std
::
nth_element
(
std
::
begin
(
items
),
std
::
begin
(
items
)
+
beam_size_
,
std
::
end
(
items
),
[](
const
Item
&
a
,
const
Item
&
b
)
{
return
a
.
score
>
b
.
score
;
});
// prune the top beam_size items.
if
(
items
.
size
()
>
beam_size_
)
{
items
.
resize
(
beam_size_
);
}
result
.
emplace_back
(
items
);
}
VLOG
(
3
)
<<
"SelectTopBeamSizeItems result size "
<<
result
.
size
();
for
(
auto
&
items
:
result
)
{
VLOG
(
3
)
<<
"item set:"
;
for
(
auto
&
item
:
items
)
{
VLOG
(
3
)
<<
ItemToString
(
item
);
}
}
return
result
;
}
// the candidates of a source
bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids,
                             const framework::LoDTensor &pre_scores,
                             std::vector<BeamSearch::Item> *items) {
  if (sent_offset_ >= ids_->NumElements(lod_level_)) {
    return false;
  }
  // find the current candidates
  auto ids = *ids_;
  auto scores = *scores_;

  auto abs_lod = framework::ToAbsOffset(ids.lod());

  auto *ids_data = ids.data<int64_t>();
  auto *scores_data = scores.data<float>();

  size_t instance_dim = 1;
  for (int i = 1; i < ids.dims().size(); i++) {
    instance_dim *= ids.dims()[i];
  }

  auto *pre_ids_data = pre_ids.data<int64_t>();
  auto *pre_scores_data = pre_scores.data<float>();
  items->clear();
  items->reserve(framework::product(ids.dims()));
  for (size_t offset = abs_lod[lod_level_][sent_offset_];
       offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) {
    auto pre_id = pre_ids_data[offset];
    auto pre_score = pre_scores_data[offset];
    if (pre_id == end_id_) {
      // Allocate all probability mass to eos_id for finished branches; the
      // other candidate ids can be ignored.
      items->emplace_back(offset, end_id_, pre_score);
    } else {
      for (size_t d = 0; d < instance_dim; d++) {
        const size_t dim_offset = offset * instance_dim + d;
        items->emplace_back(offset, ids_data[dim_offset],
                            scores_data[dim_offset]);
      }
    }
  }

  sent_offset_++;
  return true;
}
std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) {
  os << "{";
  os << "offset: " << item.offset << ", ";
  os << "id: " << item.id << ", ";
  os << "score: " << item.score << "";
  os << "}";
  return os;
}

std::string ItemToString(const BeamSearch::Item &item) {
  std::ostringstream stream;
  stream << item;
  return stream.str();
}
class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
...
@@ -219,18 +29,23 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
              "(LoDTensor) The LoDTensor containing the selected ids at the "
              "previous step. It should be a tensor with shape (batch_size, 1) "
              "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at "
-             "thefirst step.");
+             "the first step.");
     AddInput("pre_scores",
              "(LoDTensor) The LoDTensor containing the accumulated "
              "scores corresponding to the selected ids at the previous step.");
     AddInput("ids",
              "(LoDTensor) The LoDTensor containing the candidates ids. Its "
-             "shape should be (batch_size * beam_size, K), where K supposed to "
-             "be beam_size.");
+             "shape should be (batch_size * beam_size, W). If not set, it will "
+             "be calculated out according to Input(scores) in this operator.")
+         .AsDispensable();
     AddInput("scores",
-             "(LoDTensor) The LodTensor containing the accumulated scores "
-             "corresponding to Input(ids) and its shape is the same as the "
-             "shape of Input(ids).");
+             "(LoDTensor) The LoDTensor containing the current scores "
+             "corresponding to Input(ids). If Input(ids) is not nullptr, its "
+             "shape is the same as that of Input(ids)."
+             "If is_accumulated is true, Input(scores) is accumulated scores "
+             "and will be used directly. Else, each score will be "
+             "transformed to the log field and accumulated with "
+             "Input(pre_scores) first.");
     AddOutput("selected_ids",
               "A LodTensor that stores the IDs selected by beam search.");
     AddOutput("selected_scores",
...
@@ -242,6 +57,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("beam_size", "beam size for beam search");
     AddAttr<int>("end_id",
                  "the token id which indicates the end of a sequence");
+    AddAttr<bool>("is_accumulated",
+                  "Whether the Input(scores) is accumulated scores.")
+        .SetDefault(true);
     AddComment(R"DOC(
This operator does the search in beams for one time step.
...
@@ -265,10 +83,9 @@ class BeamSearchOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

- protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
     for (const std::string& arg :
-         std::vector<std::string>({"pre_ids", "ids", "scores"})) {
+         std::vector<std::string>({"pre_ids", "scores"})) {
       PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'",
                      arg);
     }
...
@@ -279,12 +96,22 @@ class BeamSearchOp : public framework::OperatorWithKernel {
     }
   }

+ protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    framework::OpKernelType kt = framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("pre_ids")->type(),
-        platform::CPUPlace());
-    return kt;
+    auto* scores = ctx.Input<framework::LoDTensor>("scores");
+    size_t level = ctx.Attr<int>("level");
+    size_t batch_size = scores->lod()[level].size() - 1;
+    // The current CUDA kernel only supports cases with batch_size <= 4;
+    // compute on CPU for larger batches.
+    if (batch_size <= 4) {
+      return framework::OpKernelType(
+          ctx.Input<framework::LoDTensor>("pre_ids")->type(), ctx.GetPlace());
+    } else {
+      return framework::OpKernelType(
+          ctx.Input<framework::LoDTensor>("pre_ids")->type(),
+          platform::CPUPlace());
+    }
   }
 };
...
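The new is_accumulated attribute changes how a candidate's effective score is formed: accumulated scores pass through unchanged; otherwise the raw step probability is moved to log space and added to the prefix's accumulated score. A minimal sketch of that rule (EffectiveScore is a hypothetical helper for illustration, not the operator's actual kernel code):

#include <cmath>
#include <iostream>

// Hypothetical helper: the scoring rule described by the maker text above.
float EffectiveScore(float score, float pre_score, bool is_accumulated) {
  // Accumulated scores are used directly; raw step probabilities are moved
  // to log space and added to the prefix's accumulated score.
  return is_accumulated ? score : pre_score + std::log(score);
}

int main() {
  // A prefix with accumulated log-probability -0.5 and a candidate whose raw
  // step probability is 0.6: -0.5 + log(0.6) ~= -1.01.
  std::cout << EffectiveScore(0.6f, -0.5f, /*is_accumulated=*/false) << "\n";
  // An already-accumulated score passes through unchanged.
  std::cout << EffectiveScore(-1.01f, -0.5f, /*is_accumulated=*/true) << "\n";
  return 0;
}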
paddle/fluid/operators/beam_search_op.cu.cc
0 → 100644
View file @ c7e38680
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/beam_search_op.h"
#include "paddle/fluid/framework/op_registry.h"

namespace ops = paddle::operators;

REGISTER_OP_CUDA_KERNEL(
    beam_search,
    ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, float>,
    ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, double>,
    ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int>,
    ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
paddle/fluid/operators/beam_search_op.h
View file @ c7e38680
...
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
...
@@ -14,187 +14,12 @@ limitations under the License. */

 #pragma once

-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/math/beam_search.h"

 namespace paddle {
 namespace operators {

-/*
- * This is an implementation of beam search.
- *
- * To explain the details, let's take the machine translation task for
- * example; in this task, one source sentence is translated to multiple target
- * sentences. During this period, one sentence will be translated to multiple
- * translation prefixes (target sentences that have not ended). In each time
- * step a prefix will have some candidates; given the candidate ids and their
- * corresponding scores (probabilities), beam search sorts and selects the top
- * beam_size candidates for each source sentence, and stores the selected
- * candidates' scores and their corresponding ids in LoDTensors.
- *
- * A detailed example:
- *
- * Input
- *
- * ids:
- *   LoD (should have 2 levels)
- *     first level:  [0, 1, 4]
- *     second level: [0, 1, 2, 3, 4]
- *
- *   tensor's data
- *     [
- *       [4, 2, 5]
- *       [2, 1, 3]
- *       [3, 5, 2]
- *       [8, 2, 1]
- *     ]
- *
- * scores:
- *   LoD same as `ids`
- *   tensor's data
- *     [
- *       [0.5, 0.3, 0.2]
- *       [0.6, 0.3, 0.1]
- *       [0.9, 0.5, 0.1]
- *       [0.7, 0.5, 0.1]
- *     ]
- *
- * the inputs mean that there are 2 source sentences to translate; the first
- * source has 1 prefix, the second source has 2 prefixes.
- *
- * let's assume the beam size is 2, and the beam search's output should be
- *   LoD
- *     first level:  [0, 1, 2]
- *     second level: [0, 2, 4]
- *
- *   id tensor's data
- *     [[4, 1, 3, 8]]
- *
- *   score tensor's data
- *     [[0.5, 0.3, 0.9, 0.7]]
- *
- * TODO: all the prune operations should be in the beam search, so it is
- * better to split the beam search algorithm into a sequence of smaller
- * operators, and the prune operators can be inserted in this sequence.
- */
-class BeamSearch {
- public:
-  // TODO(superjom) make type customizable
-  using id_t = size_t;
-  using score_t = float;
-  /*
-   * Input the arguments that are needed by this class.
-   */
-  BeamSearch(const framework::LoDTensor& ids,
-             const framework::LoDTensor& scores, size_t level,
-             size_t beam_size, int end_id)
-      : beam_size_(beam_size),
-        ids_(&ids),
-        scores_(&scores),
-        lod_level_(level),
-        end_id_(end_id) {}
-
-  /*
-   * The main function of beam search.
-   *
-   * @selected_ids: a [None, 1]-shaped tensor with LoD.
-   * In a machine translation model, it might be the candidate term id sets,
-   * each set stored as a variable-length sequence.
-   * The format might be described with a two-level LoD
-   * - [[0 1]
-   * -  [0 1 2]]
-   * - [[]
-   * -  [0 1]]
-   * the first level of LoD tells that there are two source sentences. The
-   * second level describes the details of the candidate id sets' offsets in
-   * the source sentences.
-   *
-   * @selected_scores: a LoD tensor with the same shape and LoD as
-   * selected_ids.
-   * It stores the corresponding scores of candidate ids in selected_ids.
-   *
-   * Return false if all the input tensors are empty; in a machine translation
-   * task that means no candidate is provided, and the task will stop running.
-   */
-  void operator()(const framework::LoDTensor& pre_ids,
-                  const framework::LoDTensor& pre_scores,
-                  framework::LoDTensor* selected_ids,
-                  framework::LoDTensor* selected_scores);
-  /*
-   * The basic items that help to sort.
-   */
-  struct Item {
-    Item() {}
-    Item(size_t offset, size_t id, float score)
-        : offset(offset), id(id), score(score) {}
-    // offset in the higher lod level.
-    size_t offset;
-    // // prefix id in the lower lod level.
-    // size_t prefix;
-    // the candidate id
-    id_t id;
-    // the corresponding score
-    score_t score;
-  };
-
- protected:
-  /*
-   * Prune the source sentences whose branches have all finished; this is
-   * optional. Pruning must happen one step later than finishing (thus pre_ids
-   * is needed here), since the end tokens must be written out.
-   */
-  void PruneEndBeams(const framework::LoDTensor& pre_ids,
-                     std::vector<std::vector<Item>>* items);
-
-  /*
-   * Transform the items into a map whose key is offset, value is the items.
-   * NOTE low performance.
-   */
-  std::vector<std::vector<Item>> ToMap(
-      const std::vector<std::vector<Item>>& inputs, size_t element_num);
-
-  /*
-   * For each source, select top beam_size records.
-   */
-  std::vector<std::vector<Item>> SelectTopBeamSizeItems(
-      const framework::LoDTensor& pre_ids,
-      const framework::LoDTensor& pre_scores);
-
-  /*
-   * Get the items of the next source sequence; return false if no remaining
-   * items.
-   */
-  bool NextItemSet(const framework::LoDTensor& pre_ids,
-                   const framework::LoDTensor& pre_scores,
-                   std::vector<Item>* items);
-
- private:
-  size_t beam_size_;
-  const framework::LoDTensor* ids_;
-  const framework::LoDTensor* scores_;
-  size_t lod_level_{0};
-  size_t sent_offset_{0};
-  int end_id_{0};
-};
-
-std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
-
-std::string ItemToString(const BeamSearch::Item& item);
-
 template <typename DeviceContext, typename T>
 class BeamSearchOpKernel : public framework::OpKernel<T> {
  public:
...
@@ -203,7 +28,7 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
     auto* scores = context.Input<framework::LoDTensor>("scores");
     auto* pre_ids = context.Input<framework::LoDTensor>("pre_ids");
     auto* pre_scores = context.Input<framework::LoDTensor>("pre_scores");
-    PADDLE_ENFORCE_NOT_NULL(ids);
     PADDLE_ENFORCE_NOT_NULL(scores);
     PADDLE_ENFORCE_NOT_NULL(pre_ids);
     PADDLE_ENFORCE_NOT_NULL(pre_scores);
...
@@ -211,14 +36,20 @@ class BeamSearchOpKernel : public framework::OpKernel<T> {
     size_t level = context.Attr<int>("level");
     size_t beam_size = context.Attr<int>("beam_size");
     int end_id = context.Attr<int>("end_id");
-    BeamSearch alg(*ids, *scores, level, beam_size, end_id);
+    bool is_accumulated = context.Attr<bool>("is_accumulated");
+
     auto selected_ids = context.Output<framework::LoDTensor>("selected_ids");
     auto selected_scores =
         context.Output<framework::LoDTensor>("selected_scores");
     PADDLE_ENFORCE_NOT_NULL(selected_ids);
     PADDLE_ENFORCE_NOT_NULL(selected_scores);
-    alg(*pre_ids, *pre_scores, selected_ids, selected_scores);
+
+    math::BeamSearchFunctor<DeviceContext, T> alg;
+    alg(context.template device_context<DeviceContext>(), pre_ids, pre_scores,
+        ids, scores, selected_ids, selected_scores, level, beam_size, end_id,
+        is_accumulated);
   }
 };

 }  // namespace operators
 }  // namespace paddle
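The removed header comment walks through one search step on concrete data. Below is a standalone sketch of the per-source top-k selection it describes, reusing the same ids/scores and LoD offsets but ignoring the LoDTensor plumbing, end-token handling, and pre-score accumulation:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

int main() {
  // The header's inputs: 4 prefixes, grouped into sources by the first-level
  // LoD offsets [0, 1, 4]; 3 candidate (id, score) pairs per prefix.
  const std::vector<size_t> source_offsets = {0, 1, 4};
  const std::vector<std::vector<int>> ids = {
      {4, 2, 5}, {2, 1, 3}, {3, 5, 2}, {8, 2, 1}};
  const std::vector<std::vector<float>> scores = {
      {0.5f, 0.3f, 0.2f}, {0.6f, 0.3f, 0.1f}, {0.9f, 0.5f, 0.1f},
      {0.7f, 0.5f, 0.1f}};
  const size_t beam_size = 2;

  for (size_t src = 0; src + 1 < source_offsets.size(); ++src) {
    // Pool the candidates of every prefix belonging to this source sentence.
    std::vector<std::pair<int, float>> pool;
    for (size_t p = source_offsets[src]; p < source_offsets[src + 1]; ++p) {
      for (size_t k = 0; k < ids[p].size(); ++k) {
        pool.emplace_back(ids[p][k], scores[p][k]);
      }
    }
    // Keep the beam_size best-scoring candidates across the whole pool.
    // Without pre-score accumulation this prints (4, 0.5) (2, 0.3) for the
    // first source and (3, 0.9) (8, 0.7) for the second.
    std::partial_sort(pool.begin(), pool.begin() + beam_size, pool.end(),
                      [](const std::pair<int, float> &a,
                         const std::pair<int, float> &b) {
                        return a.second > b.second;
                      });
    pool.resize(beam_size);
    std::cout << "source " << src << " selects:";
    for (const auto &c : pool) {
      std::cout << " (" << c.first << ", " << c.second << ")";
    }
    std::cout << "\n";
  }
  return 0;
}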
paddle/fluid/operators/beam_search_op_test.cc
Deleted 100644 → 0
View file @ 1edc0423
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/beam_search_op.h"

#include <gtest/gtest.h>
#include <vector>

namespace paddle {
namespace test {

using std::vector;
using framework::LoDTensor;
using framework::LoD;
using operators::BeamSearch;
using paddle::platform::CPUPlace;
using std::cout;
using std::endl;

void CreateInput(LoDTensor* ids, LoDTensor* scores) {
  LoD lod;
  vector<size_t> level0({0, 2, 4});
  vector<size_t> level1({0, 1, 2, 3, 4});
  lod.push_back(level0);
  lod.push_back(level1);
  ids->set_lod(lod);
  scores->set_lod(lod);

  auto dims = framework::make_ddim(vector<int64_t>({4, 3}));
  ids->Resize(dims);
  scores->Resize(dims);
  CPUPlace place;

  auto* ids_data = ids->mutable_data<int64_t>(place);
  auto* scores_data = scores->mutable_data<float>(place);
  vector<int64_t> _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
  vector<float> _scores({0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f,
                         0.7f, 0.5f, 0.1f});

  for (int i = 0; i < 12; i++) {
    ids_data[i] = _ids[i];
    scores_data[i] = _scores[i];
  }
}

// It seems that beam_search_op has bugs.
TEST(DISABLED_beam_search_op, run) {
  CPUPlace place;
  LoDTensor ids, scores;
  CreateInput(&ids, &scores);

  LoDTensor pre_ids;
  pre_ids.Resize(framework::make_ddim(vector<int64_t>(4, 1)));
  for (int i = 0; i < 4; i++) {
    pre_ids.mutable_data<int64_t>(place)[i] = i + 1;
  }
  LoDTensor pre_scores;
  pre_scores.Resize(framework::make_ddim(vector<int64_t>(4, 1)));
  for (int i = 0; i < 4; i++) {
    pre_scores.mutable_data<float>(place)[i] = 0.1 * (i + 1);
  }

  BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0);
  LoDTensor sids, sscores;
  beamsearch(pre_ids, pre_scores, &sids, &sscores);

  LOG(INFO) << "score: " << sscores << endl;

  ASSERT_EQ(sids.lod(), sscores.lod());

  vector<int> tids({4, 2, 3, 8});
  vector<float> tscores({0.5f, 0.6f, 0.9f, 0.7f});

  for (int i = 0; i < 4; i++) {
    ASSERT_EQ(tids[i], sids.data<int64_t>()[i]);
    ASSERT_EQ(tscores[i], sscores.data<float>()[i]);
  }
}

}  // namespace test
}  // namespace paddle
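One detail worth noting in the deleted test: the DISABLED_ prefix is googletest's standard way to compile a test but skip it at runtime, which matches the "has bugs" comment above it. For instance:

#include <gtest/gtest.h>

// Compiled but skipped by default; opt in with --gtest_also_run_disabled_tests.
TEST(DISABLED_beam_search_op, run) { /* body kept out of the default run */ }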
paddle/fluid/operators/bpr_loss_op.h
View file @ c7e38680
...
@@ -87,8 +87,8 @@ class BprLossGradientOpKernel : public framework::OpKernel<T> {
     auto* label = ctx.Input<Tensor>("Label");
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    const int step_size = x->dims()[0];
-    const int num_classes = x->dims()[1];
+    const size_t step_size = static_cast<size_t>(x->dims()[0]);
+    const size_t num_classes = static_cast<size_t>(x->dims()[1]);
     T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
     const T* dy_data = dy->data<T>();
     const T* x_data = x->data<T>();
...
paddle/fluid/operators/conv_fusion_op.cu.cc
View file @ c7e38680
...
@@ -104,9 +104,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv algorithm ---------------------
     cudnnConvolutionFwdAlgo_t algo;
     auto handle = dev_ctx.cudnn_handle();
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
+    Tensor cudnn_workspace;
+    void* cudnn_workspace_ptr = nullptr;
     CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
         cudnn_conv_desc, CUDNN_DEFAULT_MATH));
...
@@ -120,24 +118,19 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
           workspace_size_limit, &algo));
       VLOG(3) << "cuDNN forward algo " << algo;
     } else {
+      cudnn_workspace =
+          ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
+              framework::make_ddim(
+                  {static_cast<int64_t>(workspace_size_limit)}),
+              dev_ctx);
+      cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
+
       auto search_func = [&]() {
         int returned_algo_count;
         std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
             fwd_perf_stat;
-        auto cudnn_find_func = [&](void* cudnn_workspace) {
-          CUDNN_ENFORCE(
-              platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
-                  handle, cudnn_input_desc, input_data, cudnn_filter_desc,
-                  filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
-                  kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                  fwd_perf_stat.data(), cudnn_workspace,
-                  workspace_size_limit));
-        };
-        workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
+        CUDNN_ENFORCE(
+            platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
+                handle, cudnn_input_desc, input_data, cudnn_filter_desc,
+                filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
+                kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
+                fwd_perf_stat.data(), cudnn_workspace_ptr,
+                workspace_size_limit));
         VLOG(3) << "Perf result: (algo: stat, time, memory)";
         for (int i = 0; i < returned_algo_count; ++i) {
           const auto& stat = fwd_perf_stat[i];
...
@@ -188,15 +181,6 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                       "workspace_size to be allocated exceeds the limit");
+    if (!cudnn_workspace_ptr) {
+      cudnn_workspace =
+          ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
+              framework::make_ddim(
+                  {static_cast<int64_t>(workspace_size_in_bytes)}),
+              dev_ctx);
+      cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
+    }
+
     if ((activation == "identity") && (!residual)) {
       // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
       // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
...
@@ -204,12 +188,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       // cudnnConvolutionForward and cudnnAddTensor
       // ------------- cudnn conv forward and bias add ---------------------
       ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-      auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-            handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
-            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
-            workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
-      };
-      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+          handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
+          filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr,
+          workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
       CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
           handle, &alpha, cudnn_bias_desc, bias_data, &alpha,
           cudnn_output_desc, output_data));
...
@@ -220,13 +205,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       // ------------------- cudnn conv+bias+act forward --------------------
       ScalingParamType<T> alpha1 = 1.0f;
       ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
-      auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
-            handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
-            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
-            workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
-            cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
-            output_data));
-      };
-      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
+          handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
+          filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr,
+          workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
+          cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
+          output_data));
     }
     std::vector<int> channels = ctx.Attr<std::vector<int>>("split_channels");
     if (channels.size()) {
...
paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
View file @ c7e38680
...
@@ -104,18 +104,16 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     int output_offset = output->numel() / output->dims()[0] / groups;
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
+    auto temp_allocation =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
+            workspace_size_in_bytes);
+    void* cudnn_workspace = temp_allocation->ptr();
+
     for (int g = 0; g < groups; g++) {
-      auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-            handle, &alpha, cudnn_filter_desc,
-            filter_data + filter_offset * g, cudnn_input_desc,
-            input_data + input_offset * g, cudnn_conv_desc, algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta,
-            cudnn_output_desc, output_data + output_offset * g));
-      };
-      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+          handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
+          cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
+          algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+          cudnn_output_desc, output_data + output_offset * g));
     }
   }
 };
...
@@ -211,22 +209,20 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
         output_grad->numel() / output_grad->dims()[0] / groups;
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
+    auto temp_allocation =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
+            workspace_size_in_bytes);
+    void* cudnn_workspace = temp_allocation->ptr();
+
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
       for (int g = 0; g < groups; g++) {
-        auto cudnn_func = [&](void* cudnn_workspace) {
-          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-              handle, &alpha, cudnn_output_desc,
-              output_grad_data + output_grad_offset * g, cudnn_filter_desc,
-              filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
-              cudnn_workspace, workspace_size_in_bytes, &beta,
-              cudnn_input_desc, input_grad_data + input_offset * g));
-        };
-        workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_filter_desc,
+            filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+            input_grad_data + input_offset * g));
       }
     }
...
@@ -236,12 +232,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
       for (int g = 0; g < groups; g++) {
-        auto cudnn_func = [&](void* cudnn_workspace) {
-          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-              handle, &alpha, cudnn_output_desc,
-              output_grad_data + output_grad_offset * g, cudnn_input_desc,
-              input_data + input_offset * g, cudnn_conv_desc, filter_algo,
-              cudnn_workspace, workspace_size_in_bytes, &beta,
-              cudnn_filter_desc, filter_grad_data + filter_offset * g));
-        };
-        workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_input_desc,
+            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta,
+            cudnn_filter_desc, filter_grad_data + filter_offset * g));
       }
     }
   }
...
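Both cuDNN files above drop the dev_ctx.cudnn_workspace_handle() callback style: conv_fusion allocates a temporary tensor via ctx.AllocateTmpTensor, and conv_transpose asks platform::DeviceTemporaryAllocator for a buffer, so one workspace now outlives all the cuDNN calls (including the per-group loop) instead of being re-acquired inside each call. A toy, framework-free sketch of that lifetime difference (run_with_workspace is a stand-in for workspace_handle.RunFunc, not Paddle's API):

#include <cstddef>
#include <functional>
#include <iostream>
#include <memory>
#include <vector>

// Old style: each call receives a workspace that only lives for the duration
// of that one callback.
void run_with_workspace(const std::function<void(void*)>& fn, size_t bytes) {
  std::vector<char> scratch(bytes);  // acquired and released per call
  fn(scratch.data());
}

int main() {
  const size_t workspace_bytes = 1 << 20;

  // Old pattern: one workspace round-trip per group iteration.
  for (int g = 0; g < 4; g++) {
    run_with_workspace(
        [&](void* ws) { std::cout << "group " << g << " got " << ws << "\n"; },
        workspace_bytes);
  }

  // New pattern: allocate once up front, reuse the same buffer for every
  // call in the loop, release when it goes out of scope.
  std::unique_ptr<char[]> temp(new char[workspace_bytes]);
  void* ws = temp.get();
  for (int g = 0; g < 4; g++) {
    std::cout << "group " << g << " reuses " << ws << "\n";
  }
  return 0;
}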
paddle/fluid/operators/distributed/CMakeLists.txt
View file @ c7e38680
...
@@ -20,7 +20,7 @@ if(WITH_GRPC)
         collective_client.cc collective_server.cc
         ${GRPC_SRCS}
       PROTO send_recv.proto
-      DEPS lod_tensor selected_rows_functor memory)
+      DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS})
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})

   set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS})
...
@@ -32,15 +32,17 @@ else()
   set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc)
   set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})

+  set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib)
+
   brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc
         request_handler_impl.cc rpc_client.cc rpc_server.cc
         variable_response.cc
         collective_client.cc collective_server.cc
         ${BRPC_SRCS}
       PROTO send_recv.proto
-      DEPS lod_tensor selected_rows memory)
+      DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS})

-  set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib)
+  set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS})

   cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc
       DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL)
 endif()
...
paddle/fluid/operators/distributed/brpc/brpc_client.cc
View file @ c7e38680
...
@@ -62,7 +62,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep,
   const std::string var_name_val = var_name;
   const framework::Scope* p_scope = &scope;
   const auto ch_ptr = GetChannel(ep_val);
-  const std::string method = "SendRPC";
+  const std::string method = kSendRPC;
   VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));

   framework::AsyncIO([=] {
...
@@ -156,15 +156,18 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
                                       const platform::DeviceContext& ctx,
                                       const framework::Scope& scope,
                                       const std::string& var_name,
+                                      const std::string& out_var_name,
                                       const std::string& method_name,
                                       int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
+  const std::string out_varname_val = out_var_name;
   const framework::Scope* p_scope = &scope;
   const auto ch_ptr = GetChannel(ep_val);
-  const std::string method = "GetRPC";
-  VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
+  const std::string method = kGetRPC;
+  VarHandlePtr var_h(
+      new VarHandle(ep, method, out_varname_val, p_ctx, p_scope));

   framework::AsyncIO([=] {
     auto ch_ctx = ch_ptr->Pop();
...
@@ -175,6 +178,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
     sendrecv::VariableMessage req;
     req.set_varname(var_name_val);
+    req.set_out_varname(out_varname_val);
     req.set_trainer_id(trainer_id_);

     google::protobuf::Closure* done = brpc::NewCallback(
...
@@ -182,8 +186,10 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
     platform::RecordRPCEvent record_event(method, p_ctx);

-    if (method_name == "GetMonomerVariable") {
+    if (method_name == kGetMonomerRPC) {
       ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done);
+    } else if (method_name == kGetNoBarrierRPC) {
+      ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done);
     } else {
       ch_ctx->stub->GetVariable(cntl, &req, response, done);
     }
...
@@ -198,25 +204,39 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
   return var_h;
 }

+VarHandlePtr BRPCClient::AsyncGetVarNoBarrier(
+    const std::string& ep, const platform::DeviceContext& ctx,
+    const framework::Scope& scope, const std::string& var_name,
+    const std::string& out_var_name, int64_t time_out) {
+  std::string var_name_no_barrier =
+      string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE);
+
+  return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name,
+                      kGetNoBarrierRPC, time_out);
+}
+
 VarHandlePtr BRPCClient::AsyncGetMonomerVariable(
     const std::string& ep, const platform::DeviceContext& ctx,
     const framework::Scope& scope, const std::string& var_name,
     int64_t time_out) {
-  return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable",
-                      time_out);
+  return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC,
+                      time_out);
 }

 VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
                                                 const std::string& var_name,
                                                 int64_t time_out) {
-  return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out);
+  return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out);
 }

 VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep,
                                      const platform::DeviceContext& ctx,
                                      const framework::Scope& scope,
                                      const std::string& var_name,
+                                     const std::string& out_var_name,
                                      int64_t time_out) {
-  return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out);
+  return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC,
+                      time_out);
 }

 VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
...
@@ -234,7 +254,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch_ptr = GetChannel(ep_val);
-  const std::string method = "PrefetchRPC";
+  const std::string method = kPrefetchRPC;
   VarHandlePtr var_h(
       new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope));
...
@@ -270,7 +290,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
 VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep,
                                                int64_t time_out) {
-  return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE,
+  return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE,
                           time_out);
 }
...
@@ -286,7 +306,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   sendrecv::VariableMessage req;
   req.set_varname(FETCH_BARRIER_MESSAGE);
-  const std::string method = "FetchBarrierRPC";
+  const std::string method = kFetchBarrierRPC;
   // var handle
   VarHandlePtr var_h(
       new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
...
@@ -367,7 +387,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
 VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep,
                                            int64_t time_out) {
-  return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out);
+  return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out);
 }

 void BRPCClient::SendComplete() {
...
@@ -394,9 +414,9 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage(
   google::protobuf::Closure* done = brpc::NewCallback(
       &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);

-  if (method_name == "CheckPointNotifyRPC") {
+  if (method_name == kCheckPointNotifyRPC) {
     ch_ctx->stub->CheckpointNotify(cntl, &req, response, done);
-  } else if (method_name == "GetMonomerBarrier") {
+  } else if (method_name == kSendMonomerFetchBarrierRPC) {
     ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done);
   } else {
     ch_ctx->stub->SendVariable(cntl, &req, response, done);
...
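The client-side change here is mostly mechanical: ad-hoc string literals like "SendRPC" and "GetRPC" become shared constants (kSendRPC, kGetRPC, ...), so the dispatch comparisons in AsyncSendVarMessage and the call sites can no longer drift apart on a typo. A minimal sketch of the pattern (the constant values below are assumptions for illustration):

#include <iostream>
#include <string>

// Shared constants, declared once and used by both the call sites and the
// dispatch switch; the literal values here are illustrative assumptions.
constexpr char kSendRPC[] = "SendRPC";
constexpr char kGetRPC[] = "GetRPC";

void Dispatch(const std::string& method) {
  // Comparing against the same named constant everywhere means a misspelled
  // name fails to compile instead of silently hitting the default branch.
  if (method == kSendRPC) {
    std::cout << "send path\n";
  } else if (method == kGetRPC) {
    std::cout << "get path\n";
  } else {
    std::cout << "unknown method\n";
  }
}

int main() {
  Dispatch(kSendRPC);
  Dispatch(kGetRPC);
  return 0;
}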
paddle/fluid/operators/distributed/brpc/brpc_client.h
View file @ c7e38680
...
@@ -65,6 +65,7 @@ class BRPCClient : public RPCClient {
                            const platform::DeviceContext& ctx,
                            const framework::Scope& scope,
                            const std::string& var_name,
+                           const std::string& out_var_name,
                            int64_t time_out = FLAGS_rpc_deadline) override;

   VarHandlePtr AsyncGetMonomerBarrier(
...
@@ -76,6 +77,13 @@ class BRPCClient : public RPCClient {
       const framework::Scope& scope, const std::string& var_name,
       int64_t time_out = FLAGS_rpc_deadline) override;

+  VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep,
+                                    const platform::DeviceContext& ctx,
+                                    const framework::Scope& scope,
+                                    const std::string& var_name,
+                                    const std::string& out_varname,
+                                    int64_t time_out = FLAGS_rpc_deadline);
+
   VarHandlePtr AsyncPrefetchVar(const std::string& ep,
                                 const platform::DeviceContext& ctx,
                                 const framework::Scope& scope,
...
@@ -103,6 +111,7 @@ class BRPCClient : public RPCClient {
                            const platform::DeviceContext& ctx,
                            const framework::Scope& scope,
                            const std::string& var_name,
+                           const std::string& out_var_name,
                            const std::string& method_name,
                            int64_t time_out = FLAGS_rpc_deadline);
...
paddle/fluid/operators/distributed/brpc/brpc_server.cc
View file @ c7e38680
...
@@ -45,6 +45,13 @@ class BRPCServiceImpl : public SendRecvService {
           rpc_server_->GetThreadNum(distributed::kRequestGet)));
     }

+    it = rpc_call_map.find(distributed::kRequestGetNoBarrier);
+    if (it != rpc_call_map.end()) {
+      request_getnobarrier_h_ = it->second;
+      getnobarrier_threads_.reset(new paddle::framework::ThreadPool(
+          rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier)));
+    }
+
     it = rpc_call_map.find(distributed::kRequestPrefetch);
     if (it != rpc_call_map.end()) {
       request_prefetch_h_ = it->second;
...
@@ -112,6 +119,14 @@ class BRPCServiceImpl : public SendRecvService {
         [=] { _GetVariable(cntl_butil, request, response, done); });
   }

+  void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil,
+                            const VariableMessage* request,
+                            VariableMessage* response,
+                            google::protobuf::Closure* done) override {
+    getnobarrier_threads_->Run(
+        [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); });
+  }
+
   void _GetVariable(google::protobuf::RpcController* cntl_butil,
                     const VariableMessage* request, VariableMessage* response,
                     google::protobuf::Closure* done) {
...
@@ -122,23 +137,59 @@ class BRPCServiceImpl : public SendRecvService {
     brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
     std::string varname = request->varname();
+    std::string out_varname = request->out_varname();
     VLOG(3) << "RequestGet varname:" << varname
+            << ", out_varname:" << out_varname
             << ", trainer_id:" << request->trainer_id()
             << ", from:" << cntl->remote_side();

     auto scope = request_get_h_->scope();
-    auto invar = scope->FindVar(varname);
+    paddle::framework::Variable* invar = nullptr;
     int trainer_id = request->trainer_id();
     paddle::framework::Variable* outvar = nullptr;

-    request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id);
+    request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id,
+                           out_varname);

     if (outvar) {
-      distributed::SerializeToIOBuf(varname, outvar,
-                                    *request_get_h_->dev_ctx(), response,
-                                    &cntl->response_attachment(), "", false);
+      distributed::SerializeToIOBuf(out_varname, outvar,
+                                    *request_get_h_->dev_ctx(), response,
+                                    &cntl->response_attachment(), "", false);
     }
   }

+  void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil,
+                             const VariableMessage* request,
+                             VariableMessage* response,
+                             google::protobuf::Closure* done) {
+    PADDLE_ENFORCE(request_getnobarrier_h_ != nullptr,
+                   "RequestGetNoBarrier handler should be registered first!");
+
+    brpc::ClosureGuard done_guard(done);
+    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
+
+    std::string varname = request->varname();
+    std::string out_varname = request->out_varname();
+    int trainer_id = request->trainer_id();
+
+    VLOG(3) << "RequestGetNoBarrier varname:" << varname
+            << ", out_varname:" << out_varname
+            << ", trainer_id:" << trainer_id
+            << ", from:" << cntl->remote_side();
+
+    auto scope = request_getnobarrier_h_->scope();
+    paddle::framework::Variable* invar = nullptr;
+    paddle::framework::Variable* outvar = nullptr;
+
+    request_getnobarrier_h_->Handle(varname, scope, invar, &outvar,
+                                    trainer_id, out_varname);
+
+    if (outvar) {
+      distributed::SerializeToIOBuf(
+          out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response,
+          &cntl->response_attachment(), "", false);
+    }
+  }
+
   void PrefetchVariable(google::protobuf::RpcController* cntl_butil,
                         const VariableMessage* request,
                         VariableMessage* response,
...
@@ -282,6 +333,7 @@ class BRPCServiceImpl : public SendRecvService {
  private:
   distributed::RequestHandler* request_send_h_{nullptr};
   distributed::RequestHandler* request_get_h_{nullptr};
+  distributed::RequestHandler* request_getnobarrier_h_{nullptr};
   distributed::RequestHandler* request_prefetch_h_{nullptr};
   distributed::RequestHandler* request_checkpoint_h_{nullptr};
   distributed::RequestHandler* request_get_monomer_handler_h_{nullptr};
...
@@ -289,9 +341,10 @@ class BRPCServiceImpl : public SendRecvService {
   distributed::RPCServer* rpc_server_{nullptr};

-  // FIXME(gongwb): brpc should support process one rpc e use one threadpool.
+  // FIXME(gongwb): brpc should support process one rpc use one threadpool.
   std::unique_ptr<paddle::framework::ThreadPool> send_threads_;
   std::unique_ptr<paddle::framework::ThreadPool> get_threads_;
+  std::unique_ptr<paddle::framework::ThreadPool> getnobarrier_threads_;
   std::unique_ptr<paddle::framework::ThreadPool> prefetch_threads_;
   std::unique_ptr<paddle::framework::ThreadPool> checkpoint_notify_threads_;
 };
...
paddle/fluid/operators/distributed/request_handler_impl.cc
View file @ c7e38680
...
@@ -54,9 +54,20 @@ bool RequestSendHandler::Handle(const std::string& varname,
   // Async
   if (!sync_mode_) {
     VLOG(3) << "async process var: " << varname;
-    executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
-                                  scope);
-    delete scope;
+    if (varname == BATCH_BARRIER_MESSAGE) {
+      PADDLE_THROW(
+          "async mode should not recv BATCH_BARRIER_MESSAGE or "
+          "COMPLETE_MESSAGE");
+    }
+    try {
+      executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
+                                    scope);
+      delete scope;
+    } catch (std::exception& e) {
+      LOG(ERROR) << "async: run sub program error " << e.what();
+      return false;
+    }
     return true;
   } else {  // sync
     rpc_server_->WaitCond(kRequestSend);
...
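The async send path above gains two guards: barrier control messages are rejected outright (PADDLE_THROW), and the executor call is wrapped in try/catch so a failing sub-program surfaces as a false return instead of killing the server. A self-contained sketch of that control flow (RunSubProgram is a stand-in for executor_->RunPreparedContext):

#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>

// Toy stand-in for executor_->RunPreparedContext, which may throw on a
// malformed sub-program.
void RunSubProgram(const std::string& varname) {
  if (varname == "bad@GRAD") throw std::runtime_error("missing op input");
}

// Mirrors the hardened control flow above: barrier control messages are
// rejected loudly, and executor failures become a false return instead of
// taking the whole RPC server down.
bool HandleAsyncSend(const std::string& varname, bool is_barrier_msg) {
  if (is_barrier_msg) {
    throw std::runtime_error(
        "async mode should not recv BATCH_BARRIER_MESSAGE or "
        "COMPLETE_MESSAGE");
  }
  try {
    RunSubProgram(varname);
  } catch (std::exception& e) {
    std::cerr << "async: run sub program error " << e.what() << "\n";
    return false;
  }
  return true;
}

int main() {
  std::cout << HandleAsyncSend("w@GRAD", false) << "\n";    // prints 1
  std::cout << HandleAsyncSend("bad@GRAD", false) << "\n";  // prints 0
  return 0;
}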
paddle/fluid/operators/distributed/rpc_server.cc
View file @ c7e38680
...
@@ -39,27 +39,33 @@ void RPCServer::SavePort() const {
   port_file.open(file_path);
   port_file << selected_port_;
   port_file.close();
-  VLOG(4) << "selected port written to " << file_path;
+  VLOG(3) << "selected port written to " << file_path;
 }

 void RPCServer::WaitBarrier(const std::string& rpc_name) {
+  VLOG(3) << "WaitBarrier in: " << rpc_name;
   std::unique_lock<std::mutex> lock(this->mutex_);
   barrier_cond_.wait(lock, [this, &rpc_name] {
     return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) ||
             exit_flag_.load());
   });

-  VLOG(3) << "batch_barrier_: " << rpc_name << " "
-          << barrier_counter_[rpc_name];
+  VLOG(3) << "WaitBarrier out: " << rpc_name
+          << " counter: " << barrier_counter_[rpc_name];
 }

 void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
-  VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
+  VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
+  // barrier msg should make sure that it's in the right cond(send|recv)
+  WaitCond(rpc_name);
   int b = 0;
   std::unique_lock<std::mutex> lock(mutex_);
   b = ++barrier_counter_[rpc_name];
+  VLOG(3) << rpc_name << " barrier_counter: " << b;
   if (b >= client_num_) {
     lock.unlock();
+    VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for "
+            << rpc_name;
     barrier_cond_.notify_all();
     lock.lock();
   }
 }
...
@@ -71,7 +77,7 @@ void RPCServer::Complete() {
   client_num_--;
   need_reset_all_vars_ = true;

-  VLOG(4) << "decrease client_num to: " << client_num_;
+  VLOG(3) << "decrease client_num to: " << client_num_;
   if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
     barrier_counter_[kRequestGet]--;
   }
...
@@ -105,8 +111,8 @@ void RPCServer::RegisterRPC(const std::string& rpc_name,
   static int cond = -1;
   rpc_cond_map_[rpc_name] = ++cond;
-  VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
-          << ", cond:" << rpc_cond_map_[rpc_name];
+  VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler
+          << ", cond: " << rpc_cond_map_[rpc_name];
 }

 void RPCServer::SetCond(const std::string& rpc_name) {
...
@@ -120,7 +126,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
 }

 void RPCServer::WaitCond(const std::string& rpc_name) {
-  VLOG(4) << "RPCServer WaitCond " << rpc_name;
+  VLOG(3) << "RPCServer WaitCond in " << rpc_name;
   int cond = 0;
   {
     std::unique_lock<std::mutex> lock(mutex_);
...
@@ -130,6 +136,7 @@ void RPCServer::WaitCond(const std::string& rpc_name) {
   std::unique_lock<std::mutex> lock(mutex_);
   rpc_cond_.wait(
       lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
+  VLOG(3) << "RPCServer WaitCond out " << rpc_name;
 }

 void RPCServer::RegisterVar(const std::string& var_name,
...
@@ -151,7 +158,7 @@ void RPCServer::RegisterVar(const std::string& var_name,
   }

   rpc_cond_.notify_all();
-  VLOG(4) << "RegisterVar context:" << h.String();
+  VLOG(3) << "RegisterVar context:" << h.String();
 }

 void RPCServer::IncreaseVarBarrier(const std::string& var_name) {
...
@@ -167,11 +174,11 @@ void RPCServer::IncreaseVarBarrier(const std::string& var_name) {
     barrier_cond_.notify_all();
   }

-  VLOG(4) << "IncreaseVarBarrier context:" << h.String();
+  VLOG(3) << "IncreaseVarBarrier context:" << h.String();
 }

 void RPCServer::WaitVarBarrier(const std::string& var_name) {
-  VLOG(4) << "WaitBarrier var_name:" << var_name;
+  VLOG(3) << "WaitVarBarrier var_name:" << var_name;

   std::unique_lock<std::mutex> lock(mutex_);
   barrier_cond_.wait(lock, [&]() {
...
@@ -179,11 +186,11 @@ void RPCServer::WaitVarBarrier(const std::string& var_name) {
            exit_flag_.load());
   });

-  VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String();
+  VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String();
 }

 void RPCServer::SetVarCond(const std::string& var_name) {
-  VLOG(4) << "SetVarCond var_name:" << var_name;
+  VLOG(3) << "SetVarCond var_name:" << var_name;
   {
     std::unique_lock<std::mutex> lock(mutex_);
     if (var_map_.find(var_name) != var_map_.end()) {
...
@@ -193,14 +200,14 @@ void RPCServer::SetVarCond(const std::string& var_name) {
 }

 void RPCServer::WaitVarCond(const std::string& var_name) {
-  VLOG(4) << "WaitVarCond var_name:" << var_name;
+  VLOG(3) << "WaitVarCond var_name:" << var_name;

   std::unique_lock<std::mutex> lock(mutex_);
   rpc_cond_.wait(lock, [=] {
     return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load());
   });

-  VLOG(4) << "WaitVarCond var_name:" << var_name << " end";
+  VLOG(3) << "WaitVarCond var_name:" << var_name << " end";
 }

 MonomerHandle RPCServer::GetMonomer(const std::string& var_name) {
...
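The WaitBarrier/IncreaseBatchBarrier pair above is a plain counting barrier over a condition variable: each client bumps a shared counter and the waiter is released once the counter reaches the number of clients. A minimal standalone sketch of the same pattern (not part of this commit; class and method names are illustrative):

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

class BatchBarrier {
 public:
  explicit BatchBarrier(int client_num) : client_num_(client_num) {}

  // Called once per client; mirrors RPCServer::IncreaseBatchBarrier's
  // unlock/notify_all/lock dance when the counter reaches client_num_.
  void Increase() {
    std::unique_lock<std::mutex> lock(mutex_);
    if (++counter_ >= client_num_) {
      lock.unlock();
      cond_.notify_all();
      lock.lock();
    }
  }

  // Mirrors RPCServer::WaitBarrier (minus the exit flag).
  void Wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    cond_.wait(lock, [this] { return counter_ >= client_num_; });
  }

 private:
  std::mutex mutex_;
  std::condition_variable cond_;
  int counter_ = 0;
  const int client_num_;
};

int main() {
  BatchBarrier barrier(4);
  std::vector<std::thread> clients;
  for (int i = 0; i < 4; ++i) {
    clients.emplace_back([&barrier] { barrier.Increase(); });
  }
  barrier.Wait();  // returns only after all 4 clients have checked in
  for (auto& t : clients) t.join();
  std::cout << "all clients reached the barrier\n";
  return 0;
}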
paddle/fluid/operators/distributed/variable_response.cc
...
@@ -117,8 +117,9 @@ bool VariableResponse::CopyLodTensorData(
   tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type()));

   VLOG(6) << "Tensor.memory_size = " << tensor->memory_size()
-          << ", Buffer Size = " << length;
-  PADDLE_ENFORCE_EQ(tensor->memory_size(), static_cast<unsigned int>(length));
+          << ", Buffer Size = " << length << ", dims:" << dims
+          << ", numel:" << tensor->numel();
+  PADDLE_ENFORCE_GE(tensor->memory_size(), static_cast<unsigned int>(length));
   return ReadRaw(input, ctx, tensor->place(), tensor_data, length);
 }
...
paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
...
@@ -137,7 +137,9 @@ void ListenAndServOp::RunSyncLoop(
   while (true) {
     // Get from multiple trainers, we don't care about the order in which
     // the gradients arrives, just add suffix 0~n and merge the gradient.
+    VLOG(3) << "wait all clients to send gradient";
     rpc_service_->SetCond(distributed::kRequestSend);
+    VLOG(3) << "wait all clients to send send_barrier";
     rpc_service_->WaitBarrier(distributed::kRequestSend);

     if (rpc_service_->IsExit()) {
...
@@ -168,12 +170,16 @@ void ListenAndServOp::RunSyncLoop(
     }
     ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program,
                           recv_scope);
-    VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
+    VLOG(3) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";

+    VLOG(3) << "ResetReceivedVars";
     ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars());

+    VLOG(3) << "wait all clients to get parameters back";
     rpc_service_->SetCond(distributed::kRequestGet);
+    VLOG(3) << "wait all clients to send fetch_barrier";
     rpc_service_->WaitBarrier(distributed::kRequestGet);
+    VLOG(3) << "ResetBarrierCounter";
     rpc_service_->ResetBarrierCounter();
   }  // while(true)
 }
...
paddle/fluid/operators/distributed_ops/merge_ids_op.h
...
@@ -43,9 +43,9 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(ids.size(), outs.size(),
                       "the number of Ids and Out should be the same");

-    size_t row_ids_size = 0;
-    int row_size = 0;
-    int embedding_size = 0;
+    int64_t row_ids_size = 0;
+    int64_t row_size = 0;
+    int64_t embedding_size = 0;

     for (size_t i = 0; i < x_tensors.size(); ++i) {
       const auto* x_tensor = x_tensors[i];
...
@@ -69,7 +69,7 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
     for (size_t i = 0; i < x_tensors.size(); ++i) {
       const auto* row_id = row_ids[i];

-      for (int j = 0; j < row_id->numel(); ++j) {
+      for (auto j = 0; j < row_id->numel(); ++j) {
         int64_t key = row_id->data<int64_t>()[j];
         std::tuple<int64_t, int64_t> val = std::make_tuple(i, j);
         selected_rows_idx_map.insert(std::make_pair(key, val));
...
@@ -84,13 +84,13 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
       out->set_lod(out_ids->lod());

-      int nums = static_cast<int>(out_ids->dims()[0]);
+      auto nums = out_ids->dims()[0];
       auto* out_data = out->mutable_data<T>(
           framework::make_ddim({nums, embedding_size}), place);
-      for (int j = 0; j < nums; ++j) {
-        int id = out_ids->data<int64_t>()[j];
-        auto row_tuple = selected_rows_idx_map[id];
-        int64_t row_idx = std::get<1>(row_tuple);
+      for (auto j = 0; j < nums; ++j) {
+        auto id = out_ids->data<int64_t>()[j];
+        auto row_tuple = selected_rows_idx_map.at(id);
+        auto row_idx = std::get<1>(row_tuple);
         const auto* x_tensor = x_tensors[std::get<0>(row_tuple)];

         memcpy(out_data + embedding_size * j,
...
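The kernel above looks each output id up in a map from id to (shard index, row index) and copies the matching row back in the original id order. A standalone sketch of that merge step under assumed shapes (not from this commit; the shard layout and map type here are illustrative):

#include <cstdio>
#include <map>
#include <tuple>
#include <vector>

int main() {
  // Two hypothetical shards of 2-wide embedding rows.
  std::vector<std::vector<std::vector<float>>> shards = {
      {{0.1f, 0.2f}, {0.3f, 0.4f}},  // shard 0
      {{0.5f, 0.6f}}};               // shard 1

  // ids routed to shards: id 7 -> shard 0 row 0, id 9 -> shard 1 row 0,
  // id 8 -> shard 0 row 1.
  std::map<int64_t, std::tuple<int64_t, int64_t>> idx_map;
  idx_map.insert({7, std::make_tuple(0, 0)});
  idx_map.insert({9, std::make_tuple(1, 0)});
  idx_map.insert({8, std::make_tuple(0, 1)});

  std::vector<int64_t> out_ids = {9, 7, 8};  // original request order
  for (int64_t id : out_ids) {
    auto row_tuple = idx_map.at(id);  // same .at() lookup as the new code
    const auto& row = shards[std::get<0>(row_tuple)][std::get<1>(row_tuple)];
    std::printf("id=%lld -> [%.1f, %.1f]\n", static_cast<long long>(id),
                row[0], row[1]);
  }
  return 0;
}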
paddle/fluid/operators/elementwise/elementwise_op_function.h
...
@@ -277,68 +277,6 @@ class TransformFunctor {
   Functor func_;
 };

-#define EIGEN_FUNCTOR(name, eigen_op)                                          \
-  struct Eigen##name##Functor {                                                \
-    template <typename DeviceContext, typename T>                              \
-    inline void Run(const framework::Tensor *x, const framework::Tensor *y,    \
-                    framework::Tensor *z,                                      \
-                    const framework::ExecutionContext &ctx) {                  \
-      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
-      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
-      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
-      z_e.device(                                                              \
-          *ctx.template device_context<DeviceContext>().eigen_device()) =      \
-          eigen_op(x_e, y_e);                                                  \
-    }                                                                          \
-    template <typename DeviceContext, typename T>                              \
-    inline void RunBroadCast(const framework::Tensor *x,                       \
-                             const framework::Tensor *y, framework::Tensor *z, \
-                             const framework::ExecutionContext &ctx, int pre,  \
-                             int n) {                                          \
-      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
-      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
-      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
-      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))                  \
-                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))             \
-                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));          \
-      z_e.device(                                                              \
-          *ctx.template device_context<DeviceContext>().eigen_device()) =      \
-          eigen_op(x_e, y_bcast);                                              \
-    }                                                                          \
-    template <typename DeviceContext, typename T>                              \
-    inline void RunBroadCast2(const framework::Tensor *x,                      \
-                              const framework::Tensor *y,                      \
-                              framework::Tensor *z,                            \
-                              const framework::ExecutionContext &ctx, int pre, \
-                              int n, int post) {                               \
-      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
-      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
-      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
-      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))               \
-                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))       \
-                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));          \
-      z_e.device(                                                              \
-          *ctx.template device_context<DeviceContext>().eigen_device()) =      \
-          eigen_op(x_e, y_bcast);                                              \
-    }                                                                          \
-  }
-
-#define EIGEN_ADD(x, y) ((x) + (y))
-EIGEN_FUNCTOR(Add, EIGEN_ADD);
-
-#define EIGEN_SUB(x, y) ((x) - (y))
-EIGEN_FUNCTOR(Sub, EIGEN_SUB);
-
-#define EIGEN_MUL(x, y) ((x) * (y))
-EIGEN_FUNCTOR(Mul, EIGEN_MUL);
-
-#define EIGEN_DIV(x, y) ((x) / (y))
-EIGEN_FUNCTOR(Div, EIGEN_DIV);
-
 template <typename T, typename DX_OP, typename DY_OP>
 struct ElemwiseGradNoBroadcast {
   const T* x_;
...
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
...
@@ -216,19 +216,18 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
     out_datas.push_back(
         static_cast<void*>(output_data + (oc0 + oc1 + oc2) * h * w));

-    auto temp_allocation =
-        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
-            workspace_size_in_bytes);
-    void* cudnn_workspace = temp_allocation->ptr();
-
     for (int i = 0; i < 4; ++i) {
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
-          handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
-          static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
-          algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, out_desc[i],
-          out_datas[i], bias_desc[i],
-          static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
-          out_desc[i], out_datas[i]));
+      auto func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
+            handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
+            static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
+            algo[i], cudnn_workspace, workspace_size_in_bytes, &beta,
+            out_desc[i], out_datas[i], bias_desc[i],
+            static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
+            out_desc[i], out_datas[i]));
+      };
+      auto workspace_handle = dev_ctx.cudnn_workspace_handle();
+      workspace_handle.RunFunc(func, workspace_size_in_bytes);
     }

     cudnnTensorDescriptor_t x_desc;
...
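The change above swaps a manually held temporary allocation for the workspace-handle pattern: the caller hands the handle a lambda, and the handle owns allocating the scratch buffer, running the lambda, and releasing the buffer. A standalone sketch of that pattern (not from this commit; the malloc-backed handle is illustrative, the real one draws from a device allocator):

#include <cstddef>
#include <cstdlib>
#include <functional>
#include <iostream>

class WorkspaceHandle {
 public:
  // Allocate `bytes` of scratch, run `func` on it, then free it.
  void RunFunc(const std::function<void(void*)>& func, size_t bytes) {
    void* workspace = std::malloc(bytes);
    func(workspace);
    std::free(workspace);
  }
};

int main() {
  WorkspaceHandle handle;
  size_t workspace_size_in_bytes = 1 << 20;
  auto func = [&](void* workspace) {
    // A real kernel (e.g. the fused cuDNN conv above) would use `workspace`
    // as scratch memory for the duration of the call.
    std::cout << "running with " << workspace_size_in_bytes
              << " bytes of scratch at " << workspace << "\n";
  };
  handle.RunFunc(func, workspace_size_in_bytes);
  return 0;
}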
paddle/fluid/operators/grid_sampler_op.cc
...
@@ -43,12 +43,14 @@ class GridSampleOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2.");
     PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0],
                       "Input(X) and Input(Grid) dims[0] should be equal.");
-    PADDLE_ENFORCE_EQ(
-        grid_dims[1], x_dims[2],
-        "Input(X) dims[2] and Input(Grid) dims[1] should be equal.");
-    PADDLE_ENFORCE_EQ(
-        grid_dims[2], x_dims[3],
-        "Input(X) dims[3] and Input(Grid) dims[2] should be equal.");
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(
+          grid_dims[1], x_dims[2],
+          "Input(X) dims[2] and Input(Grid) dims[1] should be equal.");
+      PADDLE_ENFORCE_EQ(
+          grid_dims[2], x_dims[3],
+          "Input(X) dims[3] and Input(Grid) dims[2] should be equal.");
+    }

     ctx->SetOutputDim("Output", x_dims);
     ctx->ShareLoD("X", "Output");
...
paddle/fluid/operators/jit/CMakeLists.txt
...
@@ -21,5 +21,5 @@ endif()
 cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS})
 cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper)
 if(NOT WIN32)
-    cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer)
+    cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer tensor)
 endif()
paddle/fluid/operators/jit/benchmark.cc
...
@@ -18,6 +18,7 @@
 #include <vector>
 #include "gflags/gflags.h"
 #include "glog/logging.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/fluid/platform/device_tracer.h"
 #include "paddle/fluid/platform/place.h"
...
@@ -155,14 +156,22 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
   LOG(INFO) << loginfos.str();
 }

+using Tensor = paddle::framework::Tensor;
+
 template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
 void BenchXYZNKernel() {
   for (int d : TestSizes()) {
-    std::vector<T> x(d), y(d), z(d);
-    RandomVec<T>(d, x.data());
-    RandomVec<T>(d, y.data());
-    BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(d, x.data(), y.data(),
-                                                     z.data(), d);
+    Tensor x, y, z;
+    x.Resize({d});
+    y.Resize({d});
+    z.Resize({d});
+    T* x_data = x.mutable_data<T>(PlaceType());
+    T* y_data = y.mutable_data<T>(PlaceType());
+    T* z_data = z.mutable_data<T>(PlaceType());
+    RandomVec<T>(d, x_data);
+    RandomVec<T>(d, y_data);
+    BenchAllImpls<KT, jit::XYZNTuples<T>, PlaceType>(
+        d, x.data<T>(), y.data<T>(), z_data, d);
   }
 }
...
@@ -170,9 +179,13 @@ template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
 void BenchAXYNKernel() {
   for (int d : TestSizes()) {
     const T a = static_cast<T>(3);
-    std::vector<T> x(d), y(d);
-    RandomVec<T>(d, x.data());
-    BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data(), y.data(),
-                                                     d);
+    Tensor x, y;
+    x.Resize({d});
+    y.Resize({d});
+    T* x_data = x.mutable_data<T>(PlaceType());
+    T* y_data = y.mutable_data<T>(PlaceType());
+    RandomVec<T>(d, x_data);
+    BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), y_data,
+                                                     d);
   }
 }
...
@@ -180,9 +193,13 @@ void BenchAXYNKernel() {
 template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
 void BenchXYNKernel() {
   for (int d : TestSizes()) {
-    std::vector<T> x(d), y(d);
-    RandomVec<T>(d, x.data());
-    BenchAllImpls<KT, jit::XYNTuples<T>, PlaceType>(d, x.data(), y.data(), d);
+    Tensor x, y;
+    x.Resize({d});
+    y.Resize({d});
+    T* x_data = x.mutable_data<T>(PlaceType());
+    T* y_data = y.mutable_data<T>(PlaceType());
+    RandomVec<T>(d, x_data);
+    BenchAllImpls<KT, jit::XYNTuples<T>, PlaceType>(d, x.data<T>(), y_data, d);
   }
 }
...
@@ -192,16 +209,23 @@ void BenchLSTMKernel() {
   for (int d : TestSizes()) {
     const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh,
                                 use_peephole);
-    std::vector<T> x(4 * d), ct_1(d), ct(d), ht(d), wp(3 * d), checked(2 * d);
-    RandomVec<T>(4 * d, x.data(), -2.f, 2.f);
-    RandomVec<T>(3 * d, wp.data(), -2.f, 2.f);
-    RandomVec<T>(d, ct_1.data(), -2.f, 2.f);
-    const T* ct_1_data = ct_1.data();
-    const T* wp_data = wp.data();
-    T* x_data = x.data();
-    T* checked_data = checked.data();
-    T* ct_data = ct.data();
-    T* ht_data = ht.data();
+    Tensor x, ct_1, ct, ht, wp, checked;
+    x.Resize({4 * d});
+    ct_1.Resize({d});
+    ct.Resize({d});
+    ht.Resize({d});
+    wp.Resize({3 * d});
+    checked.Resize({2 * d});
+    auto place = PlaceType();
+    RandomVec<T>(x.numel(), x.mutable_data<T>(place), -2.f, 2.f);
+    RandomVec<T>(wp.numel(), wp.mutable_data<T>(place), -2.f, 2.f);
+    RandomVec<T>(ct_1.numel(), ct_1.mutable_data<T>(place), -2.f, 2.f);
+    const T* ct_1_data = ct_1.data<T>();
+    const T* wp_data = wp.data<T>();
+    T* x_data = x.mutable_data<T>(place);
+    T* checked_data = checked.mutable_data<T>(place);
+    T* ct_data = ct.mutable_data<T>(place);
+    T* ht_data = ht.mutable_data<T>(place);
     jit::lstm_t step;
     step.gates = x_data;
     step.ct_1 = ct_1_data;
...
@@ -220,12 +244,16 @@ template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
 void BenchGRUKernel() {
   for (int d : TestSizes()) {
     const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
-    std::vector<T> x(3 * d), ht_1(d), ht(d);
-    RandomVec<T>(3 * d, x.data(), -2.f, 2.f);
-    RandomVec<T>(d, ht_1.data(), -2.f, 2.f);
-    const T* ht_1_data = ht_1.data();
-    T* x_data = x.data();
-    T* ht_data = ht.data();
+    auto place = PlaceType();
+    Tensor x, ht_1, ht;
+    x.Resize({3 * d});
+    ht_1.Resize({d});
+    ht.Resize({d});
+    RandomVec<T>(3 * d, x.mutable_data<T>(place), -2.f, 2.f);
+    RandomVec<T>(d, ht_1.mutable_data<T>(place), -2.f, 2.f);
+    const T* ht_1_data = ht_1.data<T>();
+    T* x_data = x.mutable_data<T>(place);
+    T* ht_data = ht.mutable_data<T>(place);
     jit::gru_t step;
     step.gates = x_data;
     step.ht_1 = ht_1_data;
...
@@ -243,10 +271,12 @@ void BenchSeqPoolKernel() {
     jit::seq_pool_attr_t attr(w, type);
     for (int h : TestSizes()) {
       attr.h = h;
-      std::vector<T> x(h * w), y(w);
-      RandomVec<T>(h * w, x.data(), -2.f, 2.f);
-      const T* x_data = x.data();
-      T* y_data = y.data();
+      Tensor x, y;
+      x.Resize({h * w});
+      y.Resize({w});
+      RandomVec<T>(h * w, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
+      const T* x_data = x.data<T>();
+      T* y_data = y.mutable_data<T>(PlaceType());
       BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data,
                                                           y_data, &attr);
     }
...
@@ -259,12 +289,15 @@ void BenchMatMulKernel() {
   for (int m : {1, 2, 3, 4}) {
     for (int n : TestSizes()) {
       for (int k : TestSizes()) {
-        std::vector<T> a(m * k), b(k * n), c(m * n);
-        RandomVec<T>(m * k, a.data(), -2.f, 2.f);
-        RandomVec<T>(k * n, b.data(), -2.f, 2.f);
-        const T* a_data = a.data();
-        const T* b_data = b.data();
-        T* c_data = c.data();
+        Tensor a, b, c;
+        a.Resize({m * k});
+        b.Resize({k * n});
+        c.Resize({m * n});
+        RandomVec<T>(m * k, a.mutable_data<T>(PlaceType()), -2.f, 2.f);
+        RandomVec<T>(k * n, b.mutable_data<T>(PlaceType()), -2.f, 2.f);
+        const T* a_data = a.data<T>();
+        const T* b_data = b.data<T>();
+        T* c_data = c.mutable_data<T>(PlaceType());
        BenchAllImpls<KT, jit::MatMulTuples<T>, PlaceType>(k, a_data, b_data,
                                                            c_data, m, n, k);
       }
...
paddle/fluid/operators/lrn_mkldnn_op.cc
...
@@ -67,7 +67,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     mid->mutable_data<T>(ctx.GetPlace());

     const int n = ctx.Attr<int>("n");
-    const float alpha = ctx.Attr<float>("alpha");
+    // MKL-DNN implements LRN in a caffe way:
+    // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html
+    // Where sum of squares is divided by size of normalization window
+    // this is not the case for PaddlePaddle LRN.
+    // Hence we need to compensate for this diffrence by
+    // multipliing alpha by size of window(n)
+    const float alpha = ctx.Attr<float>("alpha") * static_cast<float>(n);
     const float beta = ctx.Attr<float>("beta");
     const float k = ctx.Attr<float>("k");
     const bool is_test = ctx.Attr<bool>("is_test");
...
@@ -78,10 +84,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto dims = paddle::framework::vectorize2int(x->dims());

     auto src_md = paddle::platform::MKLDNNMemDesc(
-        dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-
-    auto dst_md = paddle::platform::MKLDNNMemDesc(
-        dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+        dims, mkldnn::memory::data_type::f32, x->format());

     auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward,
                                                   mkldnn::lrn_across_channels,
...
@@ -92,8 +95,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                                                   k};

     auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
-    auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine},
-                                     static_cast<void*>(output_data)};

     if (!is_test) {
       const std::string key = ctx.op().Output("Out");
...
@@ -110,11 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       src_memory->set_data_handle(
           static_cast<void*>(const_cast<T*>(input_data)));

+      auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(),
+                                       static_cast<void*>(output_data));
       auto workspace_memory = insert_to_context<mkldnn::memory>(
           key_workspace_memory, dev_ctx,
           forward_pd->workspace_primitive_desc());

       run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory);
+
+      out->set_layout(framework::DataLayout::kMKLDNN);
+      out->set_format(platform::GetMKLDNNFormat(dst_memory));
     } else {
       auto forward_pd =
           mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine};
...
@@ -122,8 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))};
       auto workspace_memory =
           mkldnn::memory{forward_pd.workspace_primitive_desc()};
+      auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(),
+                                       static_cast<void*>(output_data));

       run_primitive(forward_pd, src_memory, workspace_memory, dst_memory);
+
+      out->set_layout(framework::DataLayout::kMKLDNN);
+      out->set_format(platform::GetMKLDNNFormat(dst_memory));
     }
   }
 };
...
@@ -151,7 +162,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     const std::string key_workspace_memory = key + "@lrn_workspace_memory";

     const int n = ctx.Attr<int>("n");
-    const float alpha = ctx.Attr<float>("alpha");
+    const float alpha = ctx.Attr<float>("alpha") * static_cast<float>(n);
     const float beta = ctx.Attr<float>("beta");
     const float k = ctx.Attr<float>("k");
...
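The comment in the hunk above explains the alpha compensation: MKL-DNN (following Caffe) divides the sum of squares by the window size n inside the LRN denominator, while PaddlePaddle does not, so passing alpha * n to MKL-DNN recovers Paddle's semantics. A minimal standalone check of that algebra (not from this commit; the constants are hypothetical):

// Caffe/MKL-DNN: denom = (k + (alpha / n) * sum_sq) ^ beta
// Paddle:        denom = (k + alpha * sum_sq) ^ beta
// so feeding alpha' = alpha * n to MKL-DNN gives alpha' / n = alpha.
#include <cassert>
#include <cmath>

int main() {
  const int n = 5;            // normalization window size
  const float alpha = 1e-4f, beta = 0.75f, k = 2.0f;
  const float sum_sq = 3.7f;  // hypothetical sum of squares in the window

  float paddle_denom = std::pow(k + alpha * sum_sq, beta);
  float mkldnn_denom = std::pow(k + ((alpha * n) / n) * sum_sq, beta);
  assert(std::abs(paddle_denom - mkldnn_denom) < 1e-6f);
  return 0;
}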
paddle/fluid/operators/math/CMakeLists.txt
...
@@ -54,6 +54,7 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
+math_library(beam_search DEPS math_function)
 math_library(matrix_bit_code)
...
@@ -68,6 +69,7 @@ cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
 cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
 cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
 cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling)
+cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search)
 if(WITH_GPU)
     nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function)
     nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function)
...
paddle/fluid/operators/math/beam_search.cc
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/beam_search.h"
#include <algorithm>
#include <map>

namespace paddle {
namespace operators {
namespace math {

template <typename T>
class BeamSearchFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::LoDTensor* pre_ids,
                  const framework::LoDTensor* pre_scores,
                  const framework::LoDTensor* ids,
                  const framework::LoDTensor* scores,
                  framework::LoDTensor* selected_ids,
                  framework::LoDTensor* selected_scores, size_t level,
                  size_t beam_size, int end_id, bool is_accumulated) {
    auto abs_lod = framework::ToAbsOffset(scores->lod());
    auto& high_level = abs_lod[level];

    auto items = SelectTopBeamSizeItems(pre_ids, pre_scores, ids, scores,
                                        level, beam_size, end_id,
                                        is_accumulated);
    auto selected_items = ToMap(items, high_level.back());
    if (FLAGS_v == 3) {
      VLOG(3) << "selected_items:";
      for (size_t i = 0; i < selected_items.size(); ++i) {
        VLOG(3) << "offset: " << i;
        for (auto& item : selected_items[i]) {
          VLOG(3) << item.ToString();
        }
      }
    }

    PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id);
    // calculate the output tensor's height
    size_t num_instances = std::accumulate(
        std::begin(selected_items), std::end(selected_items), 0,
        [](size_t a, std::vector<Item>& b) { return a + b.size(); });

    // the output tensor shape should be [num_instances, 1]
    auto dims = framework::make_ddim(
        std::vector<int64_t>({static_cast<int>(num_instances), 1}));
    selected_ids->Resize(dims);
    selected_scores->Resize(dims);

    auto* selected_ids_data =
        selected_ids->mutable_data<int64_t>(platform::CPUPlace());
    auto* selected_scores_data =
        selected_scores->mutable_data<float>(platform::CPUPlace());

    // fill in data
    std::vector<size_t> low_level;
    size_t low_offset = 0;
    for (auto& items : selected_items) {
      low_level.push_back(low_offset);
      for (auto& item : items) {
        selected_ids_data[low_offset] = item.id;
        selected_scores_data[low_offset] = item.score;
        low_offset++;
      }
    }
    low_level.push_back(low_offset);

    // fill lod
    framework::LoD lod(2);
    lod[0].assign(high_level.begin(), high_level.end());
    lod[1].assign(low_level.begin(), low_level.end());
    if (!framework::CheckLoD(lod)) {
      PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
    }
    selected_ids->set_lod(lod);
    selected_scores->set_lod(lod);
  }

  /*
   * The basic items help to sort.
   */
  struct Item {
    Item() {}
    Item(size_t offset, size_t id, float score)
        : offset(offset), id(id), score(score) {}
    // offset in the higher lod level.
    size_t offset;
    // prefix id in the lower lod level.
    // size_t prefix;
    // the candidate id
    size_t id;
    // the corresponding score
    float score;

    inline bool operator<(const Item& in) const {
      return (score < in.score) ||
             ((score == in.score) && (offset < in.offset));
    }

    inline void operator=(const Item& in) {
      offset = in.offset;
      id = in.id;
      score = in.score;
    }

    std::string ToString() {
      std::ostringstream os;
      os << "{";
      os << "offset: " << offset << ", ";
      os << "id: " << id << ", ";
      os << "score: " << score << "";
      os << "}";
      return os.str();
    }
  };

 protected:
  /*
   * Prune the source sentences all branchs finished, and it is optional.
   * Pruning must one step later than finishing (thus pre_ids is needed here),
   * since the end tokens must be writed out.
   */
  void PruneEndBeams(const framework::LoDTensor* pre_ids,
                     const framework::LoD& abs_lod,
                     std::vector<std::vector<Item>>* items, size_t lod_level,
                     int end_id) {
    auto* pre_ids_data = pre_ids->data<int64_t>();
    auto& high_level = abs_lod[lod_level];
    for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
      size_t src_prefix_start = high_level[src_idx];
      size_t src_prefix_end = high_level[src_idx + 1];
      bool finish_flag = true;
      for (size_t offset = src_prefix_start; offset < src_prefix_end;
           offset++) {
        for (auto& item : items->at(offset)) {
          if (item.id != static_cast<size_t>(end_id) ||
              pre_ids_data[offset] != end_id) {
            finish_flag = false;
            break;
          }
        }
        if (!finish_flag) break;
      }
      if (finish_flag) {  // all branchs of the beam (source sentence) end and
                          // prune this beam
        for (size_t offset = src_prefix_start; offset < src_prefix_end;
             offset++)
          items->at(offset).clear();
      }
    }
  }

  /*
   * Transform the items into a map whose key is offset, value is the items.
   * NOTE low performance.
   */
  std::vector<std::vector<Item>> ToMap(
      const std::vector<std::vector<Item>>& items, size_t element_num) {
    std::vector<std::vector<Item>> result;
    result.resize(element_num);
    for (auto& entries : items) {
      for (const auto& item : entries) {
        result[item.offset].push_back(item);
      }
    }
    return result;
  }

  void Insert(std::vector<Item>* top_beam_ptr, const Item& item,
              size_t beam_size) {
    std::vector<Item>& top_beam = *top_beam_ptr;

    size_t num_beams = top_beam.size();
    if (num_beams < beam_size) {
      top_beam.resize(num_beams + 1);
      num_beams++;
    } else {
      if (item < top_beam[beam_size - 1]) {
        return;
      }
    }

    for (int k = static_cast<int>(num_beams) - 2; k >= 0; --k) {
      if (top_beam[k] < item) {
        top_beam[k + 1] = top_beam[k];
      } else {
        top_beam[k + 1] = item;
        return;
      }
    }
    top_beam[0] = item;
  }

  /*
   * For each source, select top beam_size records.
   */
  std::vector<std::vector<Item>> SelectTopBeamSizeItems(
      const framework::LoDTensor* pre_ids,
      const framework::LoDTensor* pre_scores, const framework::LoDTensor* ids,
      const framework::LoDTensor* scores, size_t lod_level, size_t beam_size,
      int end_id, bool is_accumulated) {
    std::vector<std::vector<Item>> result;

    // find the current candidates
    auto abs_lod = framework::ToAbsOffset(scores->lod());

    auto* pre_ids_data = pre_ids->data<int64_t>();
    auto* pre_scores_data = pre_scores->data<float>();

    auto* ids_data = ids ? ids->data<int64_t>() : nullptr;
    auto* scores_data = scores->data<float>();

    size_t num_seqs = scores->NumElements(lod_level);
    size_t seq_width = 1;
    for (int i = 1; i < scores->dims().size(); i++) {
      seq_width *= scores->dims()[i];
    }

    for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) {
      size_t seq_offset_start = abs_lod[lod_level][seq_id];
      size_t seq_offset_end = abs_lod[lod_level][seq_id + 1];

      std::vector<Item> top_beam;
      top_beam.reserve(beam_size);

      for (size_t offset = seq_offset_start; offset < seq_offset_end;
           ++offset) {
        auto pre_id = pre_ids_data[offset];
        auto pre_score = pre_scores_data[offset];
        if (pre_id == end_id) {
          // Allocate all probability mass to end_id for finished branchs and
          // the other candidate ids can be ignored.
          Item item(offset, end_id, pre_score);
          Insert(&top_beam, item, beam_size);
        } else {
          size_t index = offset * seq_width;
          for (size_t d = 0; d < seq_width; d++, index++) {
            int64_t id = ids_data ? ids_data[index] : static_cast<int64_t>(d);
            float score = is_accumulated
                              ? scores_data[index]
                              : pre_score + std::log(scores_data[index]);
            Item item(offset, id, score);
            Insert(&top_beam, item, beam_size);
          }
        }
      }

      result.emplace_back(top_beam);
    }

    if (FLAGS_v == 3) {
      VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
      for (auto& items : result) {
        VLOG(3) << "item set:";
        for (auto& item : items) {
          VLOG(3) << item.ToString();
        }
      }
    }

    return result;
  }
};

template class BeamSearchFunctor<platform::CPUDeviceContext, int>;
template class BeamSearchFunctor<platform::CPUDeviceContext, int64_t>;
template class BeamSearchFunctor<platform::CPUDeviceContext, float>;
template class BeamSearchFunctor<platform::CPUDeviceContext, double>;

}  // namespace math
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/math/beam_search.cu
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/beam_search.h"
#include "paddle/fluid/platform/cuda_device_function.h"

namespace paddle {
namespace operators {
namespace math {

struct Triple {
  __device__ __forceinline__ Triple() {}
  __device__ __forceinline__ Triple(int o, int i, float s)
      : offset(o), id(i), score(s) {}

  __device__ __forceinline__ void set(int o, int i, float s) {
    offset = o;
    id = i;
    score = s;
  }

  __device__ __forceinline__ void operator=(const Triple& in) {
    offset = in.offset;
    id = in.id;
    score = in.score;
  }

  __device__ __forceinline__ bool operator<(const float s) const {
    return score < s;
  }

  __device__ __forceinline__ bool operator<(const Triple& in) const {
    return (score < in.score) || ((score == in.score) && (offset < in.offset));
  }

  int offset;
  int id;
  float score;
};

__device__ __forceinline__ void Insert(Triple* top_beam, const Triple& p,
                                       int beam_size) {
  if (p < top_beam[beam_size - 1]) {
    return;
  }
  for (int k = beam_size - 2; k >= 0; --k) {
    if (top_beam[k] < p) {
      top_beam[k + 1] = top_beam[k];
    } else {
      top_beam[k + 1] = p;
      return;
    }
  }
  top_beam[0] = p;
}

template <int MaxThreadsPerSeq, bool IsAccumulated = true>
__device__ __forceinline__ int SelectTopBeam(
    Triple* top_beam, const int64_t* pre_ids, const float* pre_scores,
    const int64_t* ids, const float* scores, const int seq_offset_start,
    const int seq_offset_end, const int seq_width, int beam_size, int end_id,
    int used_threads) {
  // top_beam is shared memory
  const int tid = threadIdx.x;
  const int tid_of_seq = threadIdx.x % MaxThreadsPerSeq;

  int num_used_threads = used_threads;

  Triple* top_beam_local = top_beam + tid * beam_size;
  if (tid_of_seq < num_used_threads) {
    for (int i = 0; i < beam_size; ++i) {
      top_beam_local[i].set(-1, -1, -INFINITY);
    }

    for (int offset = seq_offset_start; offset < seq_offset_end; ++offset) {
      int pre_id = static_cast<int>(pre_ids[offset]);
      if (pre_id == end_id) {
        if (tid_of_seq == 0) {
          Triple tmp(offset, end_id, pre_scores[offset]);
          Insert(top_beam_local, tmp, beam_size);
        }
      } else {
        int index = offset * seq_width + tid_of_seq;
        if (!IsAccumulated) {
          float pre_score = pre_scores[offset];
          for (int i = tid_of_seq; i < seq_width; i += num_used_threads) {
            float score = pre_score + __logf(scores[index]);
            int id = ids ? static_cast<int>(ids[index]) : i;
            Triple tmp(offset, id, score);
            Insert(top_beam_local, tmp, beam_size);
            index += num_used_threads;
          }
        } else {
          for (int i = tid_of_seq; i < seq_width; i += num_used_threads) {
            int id = ids ? static_cast<int>(ids[index]) : i;
            float score = scores[index];
            Triple tmp(offset, id, score);
            Insert(top_beam_local, tmp, beam_size);
            index += num_used_threads;
          }
        }
      }
    }
  }

  while (num_used_threads > 1) {
    if (num_used_threads > 16) {
      __syncthreads();
    }
    num_used_threads = num_used_threads >> 1;
    if (tid_of_seq < num_used_threads) {
      int index_in_sh = (num_used_threads + tid) * beam_size;
      for (int i = 0; i < beam_size; i++) {
        Insert(top_beam_local, top_beam[index_in_sh], beam_size);
        index_in_sh++;
      }
    }
  }

  if (tid_of_seq == 0) {
    int num_items = 0;
    for (int i = 0; i < beam_size; ++i) {
      num_items =
          (top_beam_local[i].score > -INFINITY) ? num_items + 1 : num_items;
    }
    return num_items;
  }

  return 0;
}

__device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local,
                                              const int64_t* pre_ids,
                                              const int end_id,
                                              int num_items) {
  bool finish_flag = true;
  for (int i = 0; i < num_items; ++i) {
    int offset = top_beam_local[i].offset;
    if (top_beam_local[i].id != end_id ||
        static_cast<int>(pre_ids[offset]) != end_id) {
      finish_flag = false;
      break;
    }
  }
  return finish_flag;
}

__device__ __forceinline__ void WriteBack(
    int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
    Triple* top_beam_local, const int seq_offset_start,
    const int seq_offset_end, const int selected_seq_start,
    const int selected_seq_length) {
  const int tid = threadIdx.x;  // use 1 thread only for each sequence
  int global_index = selected_seq_start;
  for (int global_offset = seq_offset_start; global_offset < seq_offset_end;
       ++global_offset) {
    for (int local_index = 0; local_index < selected_seq_length;
         ++local_index) {
      if (top_beam_local[local_index].offset == global_offset) {
        selected_ids[global_index] =
            static_cast<int64_t>(top_beam_local[local_index].id);
        selected_scores[global_index] = top_beam_local[local_index].score;
        global_index++;
      }
    }
    selected_offsets[global_offset + 1] = static_cast<size_t>(global_index);
  }
}

template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
__device__ void BeamSearchDetails(
    int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
    const int64_t* pre_ids, const float* pre_scores, const int64_t* ids,
    const float* scores, const int seq_offset_start, const int seq_offset_end,
    const int seq_width, int beam_size, int end_id, bool is_accumulated,
    int num_used_threads) {
  __shared__ Triple top_beam[MaxLength];

  int num_items = 0;
  if (is_accumulated) {
    num_items = SelectTopBeam<MaxThreadsPerSeq, true>(
        top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start,
        seq_offset_end, seq_width, beam_size, end_id, num_used_threads);
  } else {
    num_items = SelectTopBeam<MaxThreadsPerSeq, false>(
        top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start,
        seq_offset_end, seq_width, beam_size, end_id, num_used_threads);
  }

  const int tid = threadIdx.x;  // use 1 thread only for each sequence
  const int tid_of_seq = tid % MaxThreadsPerSeq;
  if (tid_of_seq == 0) {
    // Use 1 thread for each sequence.
    Triple* top_beam_local = top_beam + tid * beam_size;
    bool finish_flag =
        PruneEndBeams(top_beam_local, pre_ids, end_id, num_items);

    int selected_seq_start = 0;
    int selected_seq_length = finish_flag ? 0 : num_items;

    if (MaxSeqs > 1) {
      const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid;
      __shared__ int shared_mem[MaxSeqs];

      // [0, MaxSeqs - 1], length of each sequences
      shared_mem[seq_id] = selected_seq_length;
      __syncthreads();

      for (int s = 0; s < seq_id; ++s) {
        selected_seq_start += shared_mem[s];
      }

      if (seq_id == 0) {
        selected_offsets[0] = 0;
      }
    } else {
      selected_offsets[0] = 0;
    }

    WriteBack(selected_ids, selected_scores, selected_offsets, top_beam_local,
              seq_offset_start, seq_offset_end, selected_seq_start,
              selected_seq_length);
  }
}

template <int MaxLength, int MaxThreadsPerSeq, int MaxSeqs>
__global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores,
                                 size_t* selected_offsets,
                                 const int64_t* pre_ids,
                                 const float* pre_scores, const int64_t* ids,
                                 const float* scores, const size_t* seq_offsets,
                                 const int num_seqs, const int seq_width,
                                 int beam_size, int end_id, bool is_accumulated,
                                 int num_used_threads) {
  const int tid = threadIdx.x;
  const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid;

  int seq_offset_start = static_cast<int>(seq_offsets[seq_id]);
  int seq_offset_end = static_cast<int>(seq_offsets[seq_id + 1]);

  BeamSearchDetails<MaxLength, MaxThreadsPerSeq, MaxSeqs>(
      selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores,
      ids, scores, seq_offset_start, seq_offset_end, seq_width, beam_size,
      end_id, is_accumulated, num_used_threads);
}

template <int MaxLength, int MaxThreadsPerSeq>
__global__ void BeamSearchKernelSingle(
    int64_t* selected_ids, float* selected_scores, size_t* selected_offsets,
    const int64_t* pre_ids, const float* pre_scores, const int64_t* ids,
    const float* scores, const int seq_length, const int seq_width,
    int beam_size, int end_id, bool is_accumulated, int num_used_threads) {
  const int seq_offset_start = 0;
  const int seq_offset_end = seq_length;

  BeamSearchDetails<MaxLength, MaxThreadsPerSeq, 1>(
      selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores,
      ids, scores, seq_offset_start, seq_offset_end, seq_width, beam_size,
      end_id, is_accumulated, num_used_threads);
}

static inline int GetNumUsedThreads(const int max_threads_per_seq,
                                    const int seq_width, int beam_size) {
  int num_used_threads = (seq_width + beam_size - 1) / beam_size;
  num_used_threads = max_threads_per_seq < num_used_threads
                         ? max_threads_per_seq
                         : num_used_threads;

  num_used_threads =
      num_used_threads > 32
          ? (num_used_threads >> 5) << 5
          : (num_used_threads > 16
                 ? 32
                 : (num_used_threads > 8
                        ? 16
                        : (num_used_threads > 4
                               ? 8
                               : (num_used_threads > 2 ? 4
                                                       : num_used_threads))));
  return num_used_threads;
}

template <typename T>
class BeamSearchFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::LoDTensor* pre_ids,
                  const framework::LoDTensor* pre_scores,
                  const framework::LoDTensor* ids,
                  const framework::LoDTensor* scores,
                  framework::LoDTensor* selected_ids,
                  framework::LoDTensor* selected_scores, size_t level,
                  size_t beam_size, int end_id, bool is_accumulated) {
    auto abs_lod = framework::ToAbsOffset(scores->lod());

    const int64_t* pre_ids_data = pre_ids->data<int64_t>();
    const float* pre_scores_data = pre_scores->data<float>();
    const int64_t* ids_data = ids ? ids->data<int64_t>() : nullptr;
    const float* scores_data = scores->data<float>();

    const size_t num_seqs = abs_lod[level].size() - 1;
    size_t seq_width = 1;
    for (int i = 1; i < scores->dims().size(); i++) {
      seq_width *= scores->dims()[i];
    }

    // Reserve a big enough memory.
    auto selected_dims =
        framework::make_ddim({static_cast<int64_t>(num_seqs * beam_size), 1});
    int64_t* selected_ids_data =
        selected_ids->mutable_data<int64_t>(selected_dims, context.GetPlace());
    float* selected_scores_data = selected_scores->mutable_data<float>(
        selected_dims, context.GetPlace());

    framework::LoD selected_lod(2);
    selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end());
    selected_lod[1].resize(scores->dims()[0] + 1);
    size_t* selected_offsets =
        selected_lod[1].CUDAMutableData(context.GetPlace());

    if (num_seqs == 1) {
      const int seq_length = static_cast<int>(abs_lod[level][1]);
      const int kMaxThreadsPerSeq = 1024;
      int num_used_threads =
          GetNumUsedThreads(kMaxThreadsPerSeq, static_cast<int>(seq_width),
                            static_cast<int>(beam_size));
      switch (platform::RoundToPowerOfTwo(beam_size * seq_width)) {
        CUDA_LAUNCH_KERNEL_HELPER(
            BeamSearchKernelSingle<kPowerOfTwoDim, kMaxThreadsPerSeq><<<
                1, kMaxThreadsPerSeq, 0, context.stream()>>>(
                selected_ids_data, selected_scores_data, selected_offsets,
                pre_ids_data, pre_scores_data, ids_data, scores_data,
                seq_length, static_cast<int>(seq_width),
                static_cast<int>(beam_size), static_cast<int>(end_id),
                is_accumulated, num_used_threads));
      }
    } else if (num_seqs <= 4) {
      const size_t* seq_offsets = abs_lod[level].CUDAData(context.GetPlace());
      // Use only 1 block
      const int kMaxThreadsPerSeq = 32;
      const int kMaxSeqs = 4;
      int num_used_threads =
          GetNumUsedThreads(kMaxThreadsPerSeq, static_cast<int>(seq_width),
                            static_cast<int>(beam_size));
      switch (platform::RoundToPowerOfTwo(beam_size * num_seqs * 32)) {
        CUDA_LAUNCH_KERNEL_HELPER(
            BeamSearchKernel<kPowerOfTwoDim, kMaxThreadsPerSeq, kMaxSeqs><<<
                1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>(
                selected_ids_data, selected_scores_data, selected_offsets,
                pre_ids_data, pre_scores_data, ids_data, scores_data,
                seq_offsets, static_cast<int>(num_seqs),
                static_cast<int>(seq_width), static_cast<int>(beam_size),
                end_id, is_accumulated, num_used_threads));
      }
    } else {
      LOG(FATAL) << "Not implemented.";
    }

    context.Wait();
    if (!framework::CheckLoD(selected_lod)) {
      PADDLE_THROW("lod %s is not right",
                   framework::LoDToString(selected_lod));
    }

    selected_ids->set_lod(selected_lod);
    selected_scores->set_lod(selected_lod);
    if (selected_lod[1].back() < num_seqs * beam_size) {
      auto final_selected_dims = framework::make_ddim(
          {static_cast<int64_t>(selected_lod[1].back()), 1});
      selected_ids->Resize(final_selected_dims);
      selected_scores->Resize(final_selected_dims);
    }
  }
};

template class BeamSearchFunctor<platform::CUDADeviceContext, int>;
template class BeamSearchFunctor<platform::CUDADeviceContext, int64_t>;
template class BeamSearchFunctor<platform::CUDADeviceContext, float>;
template class BeamSearchFunctor<platform::CUDADeviceContext, double>;

}  // namespace math
}  // namespace operators
}  // namespace paddle
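GetNumUsedThreads above packs its rounding policy into nested ternaries: take ceil(seq_width / beam_size), cap it at max_threads_per_seq, then round up to the next power of two when at most 32, or round down to a multiple of 32 above that. A standalone host-side sketch of the same computation (not from the commit; written out long-hand for readability):

#include <algorithm>
#include <cstdio>

static int NumUsedThreads(int max_threads_per_seq, int seq_width,
                          int beam_size) {
  int t = (seq_width + beam_size - 1) / beam_size;  // ceil division
  t = std::min(t, max_threads_per_seq);
  if (t > 32) return (t >> 5) << 5;  // round down to a multiple of 32
  int p = 1;
  while (p < t) p *= 2;  // round up to the next power of two
  return p;
}

int main() {
  std::printf("%d\n", NumUsedThreads(1024, 300, 4));  // 75 -> 64
  std::printf("%d\n", NumUsedThreads(32, 300, 4));    // capped at 32
  std::printf("%d\n", NumUsedThreads(1024, 10, 4));   // 3 -> 4
  return 0;
}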
paddle/fluid/operators/math/beam_search.h
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device_context.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
/*
 * This is an implementation of beam search.
 *
 * To explain the details, let's take the machine translation task as an
 * example. In this task, one source sentence is translated into multiple
 * target sentences; along the way, a sentence is expanded into multiple
 * translation prefixes (target sentences that have not ended yet). At each
 * time step a prefix has several candidates. Given the candidate ids and
 * their corresponding scores (probabilities), beam search sorts them and
 * selects the top beam_size candidates for each source sentence, then stores
 * the selected candidates' scores and their corresponding ids in LoDTensors.
 *
 * A detailed example:
 *
 * Input
 *
 * ids:
 *   - LoD (should have 2 levels)
 *     - first level: [0, 1, 4]
 *     - second level: [0, 1, 2, 3, 4]
 *   - tensor's data:
 *       [[4, 2, 5]
 *        [2, 1, 3]
 *        [3, 5, 2]
 *        [8, 2, 1]]
 *
 * scores:
 *   - LoD same as `ids`
 *   - tensor's data
 *       [[0.5, 0.3, 0.2]
 *        [0.6, 0.3, 0.1]
 *        [0.9, 0.5, 0.1]
 *        [0.7, 0.5, 0.1]]
 *
 * The inputs mean that there are 2 source sentences to translate; the first
 * source has 1 prefix and the second source has 3 prefixes.
 *
 * Let's assume the beam size is 2. The beam search output should then be
 *   - LoD
 *     - first level: [0, 1, 2]
 *     - second level: [0, 2, 4]
 *   - id tensor's data
 *       [[4,
 *         1,
 *         3,
 *         8]]
 *   - score tensor's data
 *       [[0.5,
 *         0.3,
 *         0.9,
 *         0.7]]
 *
 * TODO: all the prune operations should be in the beam search, so it would be
 * better to split the beam search algorithm into a sequence of smaller
 * operators, and the prune operators could be inserted into this sequence.
 */
template <typename DeviceContext, typename T>
class BeamSearchFunctor {
 public:
  /*
   * The main function of beam search.
   *
   * @selected_ids: a [None, 1]-shaped tensor with LoD.
   *   In a machine translation model, it might be the candidate term id sets,
   *   each set stored as a variable-length sequence.
   *   The format might be described with a two-level LoD
   *     - [[0 1],
   *        [0 1 2]]
   *     - [[]
   *        [0 1]]
   *   The first level of the LoD tells that there are two source sentences.
   *   The second level describes the details of the candidate id sets'
   *   offsets in the source sentences.
   *
   * @selected_scores: a LoD tensor with the same shape and LoD as
   *   selected_ids. It stores the corresponding scores of the candidate ids
   *   in selected_ids.
   *
   * Return false if all the input tensors are empty; in a machine translation
   * task that means no candidate is provided, and the task stops running.
   */
  void operator()(const DeviceContext& context,
                  const framework::LoDTensor* pre_ids,
                  const framework::LoDTensor* pre_scores,
                  const framework::LoDTensor* ids,
                  const framework::LoDTensor* scores,
                  framework::LoDTensor* selected_ids,
                  framework::LoDTensor* selected_scores, size_t level,
                  size_t beam_size, int end_id, bool is_accumulated);
};
}  // namespace math
}  // namespace operators
}  // namespace paddle
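A minimal, self-contained sketch of the per-sentence selection step that BeamSearchFunctor performs (hypothetical data; it ignores LoD bookkeeping, pre_ids, end_id and score accumulation, which the real functor also handles):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  const size_t beam_size = 2;
  // Candidate (id, score) pairs gathered from one source sentence's prefixes.
  std::vector<std::pair<int64_t, float>> cands = {
      {4, 0.5f}, {2, 0.3f}, {5, 0.2f}, {3, 0.9f}, {8, 0.7f}};
  // Keep the beam_size highest-scoring candidates.
  std::partial_sort(cands.begin(), cands.begin() + beam_size, cands.end(),
                    [](const std::pair<int64_t, float>& a,
                       const std::pair<int64_t, float>& b) {
                      return a.second > b.second;
                    });
  for (size_t i = 0; i < beam_size; ++i) {
    std::printf("id=%lld score=%.1f\n",
                static_cast<long long>(cands[i].first), cands[i].second);
  }
  return 0;  // prints id=3 score=0.9 and id=8 score=0.7
}

The test file below exercises the real functor end to end on CPU and GPU.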
paddle/fluid/operators/math/beam_search_test.cc
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include <gtest/gtest.h>
#include <vector>
void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
                       paddle::framework::LoDTensor* scores,
                       paddle::framework::LoDTensor* pre_ids,
                       paddle::framework::LoDTensor* pre_scores) {
  // lod
  paddle::framework::LoD lod;
  std::vector<size_t> level0({0, 2, 4});
  std::vector<size_t> level1({0, 1, 2, 3, 4});
  lod.push_back(level0);
  lod.push_back(level1);
  ids->set_lod(lod);
  scores->set_lod(lod);

  auto dims = paddle::framework::make_ddim({4, 3});
  ids->Resize(dims);
  scores->Resize(dims);

  paddle::platform::CPUPlace place;
  auto* ids_data = ids->mutable_data<int64_t>(place);
  auto* scores_data = scores->mutable_data<float>(place);
  std::vector<int64_t> ids_vec_data({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
  std::vector<float> scores_vec_data(
      {0.6f, 0.3f, 0.5f, 0.2f, 0.3f, 0.1f,
       0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f});

  CHECK_EQ(static_cast<size_t>(ids->numel()), ids_vec_data.size());
  CHECK_EQ(static_cast<size_t>(ids->numel()), scores_vec_data.size());

  for (int i = 0; i < ids->numel(); i++) {
    ids_data[i] = ids_vec_data[i];
    scores_data[i] = scores_vec_data[i];
  }

  // pre_ids
  pre_ids->Resize(paddle::framework::make_ddim({4, 1}));
  for (int i = 0; i < 4; i++) {
    pre_ids->mutable_data<int64_t>(place)[i] = i + 1;
  }

  // pre_scores
  pre_scores->Resize(paddle::framework::make_ddim({4, 1}));
  for (int i = 0; i < 4; i++) {
    pre_scores->mutable_data<float>(place)[i] = 0.1 * (i + 1);
  }
}
template <typename DeviceContext, typename Place>
void TestBeamSearch() {
  paddle::framework::LoDTensor ids;
  paddle::framework::LoDTensor scores;
  paddle::framework::LoDTensor pre_ids;
  paddle::framework::LoDTensor pre_scores;

  auto* place = new Place();
  DeviceContext* context = new DeviceContext(*place);
  if (paddle::platform::is_cpu_place(*place)) {
    PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores);
  } else {
    paddle::framework::LoDTensor cpu_ids;
    paddle::framework::LoDTensor cpu_scores;
    paddle::framework::LoDTensor cpu_pre_ids;
    paddle::framework::LoDTensor cpu_pre_scores;
    PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores);

    TensorCopySync(cpu_ids, *place, &ids);
    TensorCopySync(cpu_scores, *place, &scores);
    TensorCopySync(cpu_pre_ids, *place, &pre_ids);
    TensorCopySync(cpu_pre_scores, *place, &pre_scores);

    ids.set_lod(cpu_ids.lod());
    scores.set_lod(cpu_scores.lod());
    pre_ids.set_lod(cpu_pre_ids.lod());
    pre_scores.set_lod(cpu_pre_scores.lod());
  }

  paddle::framework::LoDTensor selected_ids;
  paddle::framework::LoDTensor selected_scores;

  size_t level = 0;
  size_t beam_size = 2;
  int end_id = 0;
  paddle::operators::math::BeamSearchFunctor<DeviceContext, float> beamsearch;
  beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids,
             &selected_scores, level, beam_size, end_id, true);

  ASSERT_EQ(selected_ids.lod(), selected_scores.lod());

  paddle::framework::LoDTensor cpu_selected_ids;
  paddle::framework::LoDTensor cpu_selected_scores;
  if (paddle::platform::is_cpu_place(*place)) {
    cpu_selected_ids = selected_ids;
    cpu_selected_scores = selected_scores;
  } else {
    TensorCopySync(selected_ids, paddle::platform::CPUPlace(),
                   &cpu_selected_ids);
    TensorCopySync(selected_scores, paddle::platform::CPUPlace(),
                   &cpu_selected_scores);
    cpu_selected_ids.set_lod(selected_ids.lod());
    cpu_selected_scores.set_lod(selected_scores.lod());
  }

  std::vector<int64_t> expected_ids({4, 5, 3, 8});
  std::vector<float> expected_scores({0.6f, 0.5f, 0.9f, 0.7f});
  for (int i = 0; i < 4; i++) {
    ASSERT_EQ(expected_ids[i], cpu_selected_ids.data<int64_t>()[i]);
    ASSERT_EQ(expected_scores[i], cpu_selected_scores.data<float>()[i]);
  }

  delete place;
  delete context;
}

TEST(BeamSearch, CPU) {
  TestBeamSearch<paddle::platform::CPUDeviceContext,
                 paddle::platform::CPUPlace>();
}

#ifdef PADDLE_WITH_CUDA
TEST(BeamSearch, GPU) {
  TestBeamSearch<paddle::platform::CUDADeviceContext,
                 paddle::platform::CUDAPlace>();
}
#endif
paddle/fluid/operators/math/sampler.cc
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/math/sampler.h"
+#include <glog/logging.h>
 #include <iostream>
 #include <queue>
 #include <utility>
...
@@ -77,7 +78,14 @@ int64_t CustomSampler::Sample() const {
   auto index = (*int_dist_)(*random_engine_);
   auto p = (*real_dist_)(*random_engine_);
   if (p > alias_probs_[index]) {
-    return alias_[index];
+    int alias = alias_[index];
+
+    if (alias == exceptional_val) {
+      LOG(WARNING) << "WARNING: CustomSampler get alias " << exceptional_val;
+      return index;
+    }
+
+    return alias;
   } else {
     return index;
   }
...
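The changed branch guards Walker's alias method against an alias slot that was never filled. A standalone sketch of the sampling step under that assumption (-1 marking the unfilled "exceptional" alias, as in the diff; the table-building step is omitted):

#include <random>
#include <vector>

// Draw one index from the discrete distribution encoded by the alias table.
int SampleAlias(const std::vector<float>& alias_probs,
                const std::vector<int>& alias, std::mt19937* rng) {
  std::uniform_int_distribution<int> int_dist(
      0, static_cast<int>(alias_probs.size()) - 1);
  std::uniform_real_distribution<float> real_dist(0.f, 1.f);
  int index = int_dist(*rng);
  float p = real_dist(*rng);
  if (p > alias_probs[index]) {
    int a = alias[index];
    return a == -1 ? index : a;  // -1: no alias stored, fall back to index
  }
  return index;
}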
paddle/fluid/operators/math/sampler.h
...
@@ -116,6 +116,7 @@ class CustomSampler : public Sampler {
   const float* alias_probs_;
   const int* alias_;
   const float* probs_;
+  const int exceptional_val = -1;
   std::shared_ptr<std::mt19937> random_engine_;
   std::shared_ptr<std::uniform_real_distribution<>> real_dist_;
   std::shared_ptr<std::uniform_int_distribution<>> int_dist_;
...
paddle/fluid/operators/math/selected_rows_functor_test.cc
...
@@ -354,7 +354,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) {
   auto* out_data = output->value().data<float>();
   for (size_t i = 0; i < ret_rows.size(); ++i) {
-    for (size_t j = 0; j < row_numel; ++j) {
+    for (size_t j = 0; j < static_cast<size_t>(row_numel); ++j) {
       EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]);
     }
   }
...
paddle/fluid/operators/math/selected_rows_functor_test.cu.cc
...
@@ -301,7 +301,7 @@ TEST(selected_rows_functor, gpu_merge_add) {
   auto* out_data = output_cpu.data<float>();
   for (size_t i = 0; i < ret_rows.size(); ++i) {
-    for (size_t j = 0; j < row_numel; ++j) {
+    for (size_t j = 0; j < static_cast<size_t>(row_numel); ++j) {
       EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]);
     }
   }
...
paddle/fluid/operators/math/sequence_pooling_test.cc
...
@@ -66,7 +66,7 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
     cpu_in_grad.set_lod(in_grad.lod());
   }

-  EXPECT_EQ(in_grad.numel(), lod[0].back() * second_dim);
+  EXPECT_EQ(in_grad.numel(),
+            static_cast<int64_t>(lod[0].back() * second_dim));
   EXPECT_EQ(in_grad.lod(), lod);

   if (paddle::platform::is_cpu_place(*place)) {
...
paddle/fluid/operators/nce_op.h
...
@@ -119,6 +119,11 @@ class NCEKernel : public framework::OpKernel<T> {
     PrepareSamples<DeviceContext, T>(context, sampler);
     auto sample_labels = context.Output<Tensor>("SampleLabels");
     const int64_t *sample_labels_data = sample_labels->data<int64_t>();
+
+    for (int x = 0; x < sample_labels->numel(); x++) {
+      PADDLE_ENFORCE_GE(sample_labels_data[x], 0, "nce sample label %d", x);
+    }
+
     auto sample_out = context.Output<Tensor>("SampleLogits");
     T *sample_out_data = sample_out->mutable_data<T>(context.GetPlace());
     auto label = context.Input<Tensor>("Label");
...
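The added loop fails fast: a negative sampled label would later be used as an index and read out of bounds. A minimal sketch of the same validation outside Paddle's enforce macros:

#include <cassert>
#include <cstdint>
#include <vector>

void CheckSampleLabels(const std::vector<int64_t>& labels) {
  for (size_t x = 0; x < labels.size(); ++x) {
    // Mirrors the PADDLE_ENFORCE_GE above: every label must be non-negative.
    assert(labels[x] >= 0 && "nce sample label must be non-negative");
  }
}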
paddle/fluid/operators/ngraph/CMakeLists.txt
0 → 100644
if(WITH_NGRAPH)
  cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
  cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto)
  op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context)
endif()
paddle/fluid/framework/ngraph_bridge.cc → paddle/fluid/operators/ngraph/ngraph_bridge.cc
...
@@ -17,39 +17,39 @@ limitations under the License. */
 #include <vector>

 #include "ngraph/ngraph.hpp"
-#include "paddle/fluid/framework/ngraph_bridge.h"
-#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
 #include "paddle/fluid/operators/ngraph/ngraph_ops.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/ngraph_helper.h"

 namespace paddle {
-namespace framework {
+namespace operators {

 namespace NG_OPS = paddle::operators::ngraphs;
 std::map<std::string,
-         std::function<void(const std::shared_ptr<OperatorBase>&,
+         std::function<void(const std::shared_ptr<framework::OperatorBase>&,
                             std::shared_ptr<std::unordered_map<
                                 std::string, std::shared_ptr<ngraph::Node>>>)>>
     NgraphBridge::NG_NODE_MAP = {
         {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
         {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
-        {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode},
-        {"mean", paddle::operators::ngraphs::BuildMeanNode},
-        {"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode},
-        {"mul", paddle::operators::ngraphs::BuildMulNode},
-        {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
-        {"softmax", paddle::operators::ngraphs::BuildSoftmaxNode},
-        {"softmax_grad", paddle::operators::ngraphs::BuildSoftmaxGradNode},
-        {"scale", paddle::operators::ngraphs::BuildScaleNode},
-        {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
-        {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>},
-        {"top_k", paddle::operators::ngraphs::BuildTopKNode}};
+        {"fill_constant", NG_OPS::BuildFillConstantNode},
+        {"mean", NG_OPS::BuildMeanNode},
+        {"mean_grad", NG_OPS::BuildMeanGradNode},
+        {"mul", NG_OPS::BuildMulNode},
+        {"mul_grad", NG_OPS::BuildMulGradNode},
+        {"softmax", NG_OPS::BuildSoftmaxNode},
+        {"softmax_grad", NG_OPS::BuildSoftmaxGradNode},
+        {"scale", NG_OPS::BuildScaleNode},
+        {"relu", NG_OPS::BuildUnaryNode<ngraph::op::Relu>},
+        {"tanh", NG_OPS::BuildUnaryNode<ngraph::op::Tanh>},
+        {"top_k", NG_OPS::BuildTopKNode}};

-void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
+void NgraphBridge::BuildNgNode(
+    const std::shared_ptr<framework::OperatorBase>& op) {
   auto& op_type = op->Type();
   NG_NODE_MAP[op_type](op, ngb_node_map_);
 }

-}  // namespace framework
+}  // namespace operators
 }  // namespace paddle
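NG_NODE_MAP is a string-keyed dispatch table: each op type maps to a builder callback. A minimal standalone sketch of the same pattern (hypothetical callback signature, not Paddle's):

#include <functional>
#include <iostream>
#include <map>
#include <string>

using Builder = std::function<void(const std::string& op_type)>;

int main() {
  std::map<std::string, Builder> node_map = {
      {"relu", [](const std::string& op) { std::cout << "build " << op << "\n"; }},
      {"softmax", [](const std::string& op) { std::cout << "build " << op << "\n"; }},
  };
  const std::string op_type = "relu";
  // Unsupported ops are simply absent from the map, which is how the
  // subgraph-detection pass below decides what can be offloaded.
  if (node_map.find(op_type) != node_map.end()) {
    node_map[op_type](op_type);
  }
  return 0;
}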
paddle/fluid/framework/ngraph_bridge.h → paddle/fluid/operators/ngraph/ngraph_bridge.h
...
@@ -21,16 +21,16 @@ limitations under the License. */
 #include "ngraph/node.hpp"
+#include "paddle/fluid/framework/operator.h"

 namespace paddle {
-namespace framework {
-class OperatorBase;
+namespace operators {

 class NgraphBridge {
  public:
   static std::map<
       std::string,
-      std::function<void(const std::shared_ptr<OperatorBase>&,
+      std::function<void(const std::shared_ptr<framework::OperatorBase>&,
                          std::shared_ptr<std::unordered_map<
                              std::string, std::shared_ptr<ngraph::Node>>>)>>
       NG_NODE_MAP;
...
@@ -41,7 +41,7 @@ class NgraphBridge {
           var_node_map)
       : ngb_node_map_(var_node_map) {}

-  void BuildNgNode(const std::shared_ptr<OperatorBase>& op);
+  void BuildNgNode(const std::shared_ptr<framework::OperatorBase>& op);

  private:
   std::shared_ptr<
...
@@ -49,5 +49,5 @@ class NgraphBridge {
       ngb_node_map_;
 };

-}  // namespace framework
+}  // namespace operators
 }  // namespace paddle
paddle/fluid/framework/ngraph_operator.cc → paddle/fluid/operators/ngraph/ngraph_engine.cc
...
@@ -16,22 +16,25 @@ limitations under the License. */

 #include <algorithm>
 #include <map>
+#include <string>
+#include <vector>

+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/ngraph_bridge.h"
-#include "paddle/fluid/framework/ngraph_operator.h"
-#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
+#include "paddle/fluid/operators/ngraph/ngraph_engine.h"

 namespace paddle {
-namespace framework {
+namespace operators {

-static ngraph::Shape Ddim2Shape(const DDim& dims) {
+static ngraph::Shape Ddim2Shape(const framework::DDim& dims) {
   ngraph::Shape sp;
   for (int i = 0; i < dims.size(); ++i) {
     int k = dims[i];
@@ -41,117 +44,39 @@ static ngraph::Shape Ddim2Shape(const DDim& dims) {
   return sp;
 }

-static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
-    {proto::VarType::FP32, ngraph::element::f32},
-    {proto::VarType::FP64, ngraph::element::f64},
-    {proto::VarType::INT32, ngraph::element::i32},
-    {proto::VarType::INT64, ngraph::element::i64},
-    {proto::VarType::BOOL, ngraph::element::boolean},
-};
+static std::map<framework::proto::VarType::Type, ngraph::element::Type>
+    pd2ng_type_map = {
+        {framework::proto::VarType::FP32, ngraph::element::f32},
+        {framework::proto::VarType::FP64, ngraph::element::f64},
+        {framework::proto::VarType::INT32, ngraph::element::i32},
+        {framework::proto::VarType::INT64, ngraph::element::i64},
+        {framework::proto::VarType::BOOL, ngraph::element::boolean},
+};

-typedef enum {        /* nGraph support state on ops          */
-  FULL_TRAIN,         /* Support full ops for train           */
-  PARTIAL_TRAIN,      /* Support partial ops for train        */
-  FULL_TEST,          /* Support full list of ops for test    */
-  PARTIAL_TEST        /* Support partial list of ops for test */
-} op_state;
+std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
+    NgraphEngine::func_cache_ = {};

-// perform graph build through bridge and execute computation
-class NgraphEngine {
- public:
-  explicit NgraphEngine(
-      const Scope& scope, const platform::Place& place,
-      const std::vector<std::shared_ptr<OperatorBase>>& ops,
-      const std::unordered_map<std::string, ngraph::element::Type>&
-          var_type_map,
-      const std::unordered_set<std::string>& persist,
-      const std::unordered_set<std::string>& fetches,
-      const std::unordered_set<std::string>& post_op_inputs,
-      op_state ng_op_state)
-      : scope_(scope),
-        place_(place),
-        fused_ops_(ops),
-        var_type_map_(var_type_map),
-        persistables_(persist),
-        fetches_(fetches),
-        post_op_inputs_(post_op_inputs),
-        ng_op_state_(ng_op_state) {
-    var_in_node_map_ = std::make_shared<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
-    var_node_map_ = std::make_shared<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
-    BuildNgIO();
-    GetNgFunction();
-  }
-
-  void Run(const Scope& scope, const platform::Place& place) const;
-
- private:
-  static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
-      func_cache_;
-  const Scope& scope_;
-  const platform::Place& place_;
-  std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
-  std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
-  std::unordered_set<std::string> persistables_;
-  std::unordered_set<std::string> fetches_;
-  std::unordered_set<std::string> post_op_inputs_;
-  op_state ng_op_state_;
-
-  // ngraph backend eg. CPU
-  static std::shared_ptr<ngraph::runtime::Backend> backend_;
-  // ngraph function to call and execute
-  std::shared_ptr<ngraph::Function> ngraph_function_;
-  // var_name of inputs
-  std::vector<std::string> var_in_;
-  // var_name of outputs from fetch in order
-  std::vector<std::string> var_out_;
-  // map input vars to nodes
-  std::shared_ptr<
-      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-      var_in_node_map_;
-  // map each var name with a ngraph node
-  std::shared_ptr<
-      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-      var_node_map_;
-  // cache key to check if function is cached
-  std::shared_ptr<std::string> GetCacheKey();
-  // get ngraph input and define ngraph input parameters
-  void GetNgInputShape(std::shared_ptr<OperatorBase> op);
-  // Call ngraph bridge to map ops
-  void BuildNgNodes();
-  // get the ngraph input and output var list
-  void BuildNgIO();
-  // build ngraph function call
-  void BuildNgFunction();
-  // Check cache for ngraph function or otherwise build the function
-  void GetNgFunction();
-};
+std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
+    ngraph::runtime::Backend::create("CPU");

-std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
-NgraphOperator::NgraphOpIntervals(
-    std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops) {
-  std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
-      intervals;
-  if (ops->empty()) {
-    return intervals;
-  }
-  size_t size = ops->size();
-  size_t left = 0;
-  while (left < size && ops->at(left)->Type() != kFeedOpType) {
+static std::vector<std::vector<int>> NgraphOpIntervals(
+    framework::BlockDesc* block) {
+  std::vector<std::vector<int>> intervals;
+  auto ops = block->AllOps();
+  int size = ops.size();
+  int left = 0;
+  while (left < size && ops.at(left)->Type() != framework::kFeedOpType) {
     ++left;
   }
   if (left == size) {
     return intervals;
   }
-  while (left < size && ops->at(left)->Type() == kFeedOpType) {
+  while (left < size && ops.at(left)->Type() == framework::kFeedOpType) {
     ++left;
   }
-  size_t right = left;
-  while (right < size && ops->at(right)->Type() != kFetchOpType) {
+  int right = left;
+  while (right < size && ops.at(right)->Type() != framework::kFetchOpType) {
     ++right;
   }
   if (right == size) {
@@ -160,66 +85,89 @@ NgraphOperator::NgraphOpIntervals(
   if (left >= right) return intervals;

   // (left, right - 1) represents indices between feed and fetch
-  size_t pivot = left;
+  int pivot = left;
   while (pivot < right) {
-    auto op_type = ops->at(pivot)->Type();
-    if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) ==
-        paddle::framework::NgraphBridge::NG_NODE_MAP.end()) {
+    auto op_type = ops.at(pivot)->Type();
+    if (NgraphBridge::NG_NODE_MAP.find(op_type) ==
+        NgraphBridge::NG_NODE_MAP.end()) {
       ++pivot;
     } else {
-      size_t start = pivot, end = start;
+      int start = pivot, end = start;
       while (pivot < right &&
-             (paddle::framework::NgraphBridge::NG_NODE_MAP.find(
-                  ops->at(pivot)->Type()) !=
-              paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
+             (NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) !=
+              NgraphBridge::NG_NODE_MAP.end())) {
         ++pivot;
         ++end;
       }
-      std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>
-          interval = {ops->begin() + start, ops->begin() + end};
+      std::vector<int> interval = {start, end};
       intervals.push_back(interval);
     }
   }  // end while
   return intervals;
 }

-NgraphOperator::NgraphOperator(
-    const ProgramDesc& prog, size_t block_id,
-    std::vector<std::unique_ptr<OperatorBase>>::iterator start,
-    std::vector<std::unique_ptr<OperatorBase>>::iterator end,
-    const std::string& type, const VariableNameMap& inputs,
-    const VariableNameMap& outputs, const AttributeMap& attrs)
-    : OperatorBase(type, inputs, outputs, attrs),
-      pdesc_(prog),
-      block_(block_id) {
-  for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
-       it != end; ++it) {
-    fused_ops_.push_back(std::move(*it));
-  }
-  for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = end;
-       (*it)->Type() != kFetchOpType; ++it) {
-    for (auto& var_name_item : (*it)->Inputs()) {
-      for (auto& var_name : var_name_item.second) {
-        post_op_inputs_.insert(var_name);
-      }
-    }
-  }
-  if ((*(start - 1))->Type() == kFeedOpType &&
-      (*end)->Type() == kFetchOpType) {
-    is_full_ = true;
-  }
-}
+static void SubstituteNgraphOp(framework::BlockDesc* block,
+                               std::string block_str,
+                               std::vector<int> interval) {
+  framework::ProgramDesc program;
+  block->RemoveOp(interval.at(0), interval.at(1));
+  auto* ng_op = block->InsertOp(interval.at(0));
+  ng_op->SetType("ngraph_engine");
+  ng_op->SetAttr("interval", interval);
+  ng_op->SetAttr("graph", block_str);
+}
+
+// TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089
+void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) {
+#ifdef PADDLE_WITH_NGRAPH
+  VLOG(4) << "use_ngraph=True";
+  for (size_t bid = 0; bid < program.Size(); ++bid) {
+    // TODO(baojun-nervana): Remove the const_cast
+    auto* block =
+        const_cast<framework::ProgramDesc&>(program).MutableBlock(bid);
+    std::string block_str = block->Proto()->SerializeAsString();
+    auto intervals = NgraphOpIntervals(block);
+    for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
+      SubstituteNgraphOp(block, block_str, *it);
+    }
+  }
+#else
+  LOG(WARNING)
+      << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
+#endif
+}
+
+NgraphEngine::NgraphEngine(const framework::Scope& scope,
+                           const platform::Place& place,
+                           const std::string& serialized_graph,
+                           const std::vector<int>& interval)
+    : scope_(scope), place_(place) {
+  var_in_node_map_ = std::make_shared<
+      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
+  var_node_map_ = std::make_shared<
+      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
+
+  func_cache_key_ = std::to_string(interval[0]) +
+                    std::to_string(interval[1]) + serialized_graph;
+
+  framework::proto::BlockDesc bdesc;
+  bdesc.ParseFromString(serialized_graph);
+  framework::BlockDesc block(nullptr, &bdesc);
+
+  Prepare(block, interval);
+  BuildNgIO();
+  GetNgFunction();
+}

-void NgraphOperator::Process() {
-  auto& bdesc = pdesc_.Block(block_);
-  for (auto& var : bdesc.AllVars()) {
-    if (!(var->GetType() == proto::VarType::SELECTED_ROWS ||
-          var->GetType() == proto::VarType::LOD_TENSOR ||
-          var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) {
+void NgraphEngine::Prepare(const framework::BlockDesc& block,
+                           const std::vector<int>& interval) {
+  for (auto& var : block.AllVars()) {
+    if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS ||
+          var->GetType() == framework::proto::VarType::LOD_TENSOR ||
+          var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) {
       continue;
     }
@@ -228,7 +176,8 @@ void NgraphOperator::Process() {
       continue;
     }
-    if (var_name != "fetch" && var_name != "feed") {
+    if (var_name != framework::kFeedOpType &&
+        var_name != framework::kFetchOpType) {
       auto pd_type = var->GetDataType();
       if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) {
         PADDLE_THROW("Data type of var %s not found in pd2ng_type_map",
@@ -242,53 +191,66 @@ void NgraphOperator::Process() {
     }
   }

-  for (auto* op : bdesc.AllOps()) {
-    if (op->Type() == kFetchOpType) {
-      std::string fetch_target_name = op->Input("X")[0];
-      fetches_.insert(fetch_target_name);
-    }
-  }
-}
-
-void NgraphOperator::RunImpl(const Scope& scope,
-                             const platform::Place& place) const {
-  op_state ng_op_state = PARTIAL_TEST;
-  auto& bdesc = pdesc_.Block(block_);
-  for (auto* op : bdesc.AllOps()) {
-    if (op->Type().find("_grad") != std::string::npos) {
-      ng_op_state = PARTIAL_TRAIN;
-      break;
-    }
-  }
-
-  if (is_full_) {
-    ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
-  }
-
-  NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_,
-                             persistables_, fetches_, post_op_inputs_,
-                             ng_op_state);
-  ngraph_engine.Run(scope, place);
-}
-
-std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
-    NgraphEngine::func_cache_ = {};
-
-std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
-    ngraph::runtime::Backend::create("CPU");
+  auto ops_desc = block.AllOps();
+  int idx = interval[0];
+  while (idx < interval[1]) {
+    auto op_desc = ops_desc.at(idx);
+    auto op = framework::OpRegistry::CreateOp(*op_desc);
+    fused_ops_.push_back(std::move(op));
+    ++idx;
+  }
+
+  while (ops_desc.at(idx)->Type() != framework::kFetchOpType) {
+    auto op_desc = ops_desc.at(idx);
+    for (auto& var_name_item : op_desc->Inputs()) {
+      for (auto& var_name : var_name_item.second) {
+        post_op_inputs_.insert(var_name);
+      }
+    }
+    ++idx;
+  }
+
+  while (idx < static_cast<int>(ops_desc.size()) &&
+         ops_desc.at(idx)->Type() == framework::kFetchOpType) {
+    std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0];
+    fetches_.insert(fetch_target_name);
+    ++idx;
+  }
+
+  if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType &&
+      ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) {
+    ng_op_state_ = OpState::FULL;
+  }
+
+  for (auto* op_desc : ops_desc) {
+    if (op_desc->Type().find("_grad") != std::string::npos) {
+      ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN
+                                                   : OpState::PARTIAL_TRAIN;
+      break;
+    }
+  }
+
+  if (ng_op_state_ != OpState::FULL_TRAIN &&
+      ng_op_state_ != OpState::PARTIAL_TRAIN) {
+    ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TEST
+                                                 : OpState::PARTIAL_TEST;
+  }
+}

-void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
-  RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
+void NgraphEngine::GetNgInputShape(
+    std::shared_ptr<framework::OperatorBase> op) {
+  framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
   op->RuntimeInferShape(scope_, place_, ctx);
   for (auto& var_name_item : op->Inputs()) {
     for (auto& var_name : var_name_item.second) {
       auto* var = scope_.FindVar(var_name);
-      if (var && var->IsType<LoDTensor>()) {
+      if (var && var->IsType<framework::LoDTensor>()) {
         auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
         auto sp = Ddim2Shape(tensor_pd->dims());
         if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
             var_in_.end()) {
           if (var_node_map_->find(var_name) == var_node_map_->end()) {
+            // auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var));
             auto ng_type = var_type_map_.at(var_name);
             auto prm =
                 std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
@@ -302,22 +264,25 @@ void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
 }

 void NgraphEngine::BuildNgNodes() {
-  for (auto& var_name : var_out_) {
-    if (var_node_map_->find(var_name) == var_node_map_->end()) {
-      auto* var = scope_.FindVar(var_name);
-      if (var && var->IsType<LoDTensor>()) {
-        auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
-        auto& ddim = tensor_pd->dims();
-        auto ng_shape = Ddim2Shape(ddim);
-        auto ng_type = var_type_map_.at(var_name);
-        auto prm =
-            std::make_shared<ngraph::op::Parameter>(ng_type, ng_shape, true);
-        (*var_node_map_)[var_name] = prm;
+  for (auto& op : fused_ops_) {
+    for (auto& var_name_item : op->Outputs()) {
+      for (auto& var_name : var_name_item.second) {
+        if (var_node_map_->find(var_name) == var_node_map_->end()) {
+          auto* var = scope_.FindVar(var_name);
+          if (var && var->IsType<framework::LoDTensor>()) {
+            auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+            auto& ddim = tensor_pd->dims();
+            auto ng_shape = Ddim2Shape(ddim);
+            auto ng_type = var_type_map_.at(var_name);
+            auto prm = std::make_shared<ngraph::op::Parameter>(ng_type,
+                                                               ng_shape, true);
+            (*var_node_map_)[var_name] = prm;
+          }
+        }
       }
     }
   }
-  paddle::framework::NgraphBridge ngb(var_node_map_);
+  NgraphBridge ngb(var_node_map_);
   for (auto& op : fused_ops_) {
     ngb.BuildNgNode(op);
   }
@@ -363,25 +328,25 @@ void NgraphEngine::BuildNgIO() {
                    op->Type());
     for (auto& var_name : var_name_item.second) {
       switch (ng_op_state_) {
-        case PARTIAL_TEST:
+        case OpState::PARTIAL_TEST:
           if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
               fetches_.find(var_name) != fetches_.end()) {
             var_out_.push_back(var_name);
           }
           break;
-        case FULL_TEST:
+        case OpState::FULL_TEST:
           if (fetches_.find(var_name) != fetches_.end()) {
             var_out_.push_back(var_name);
           }
           break;
-        case PARTIAL_TRAIN:
+        case OpState::PARTIAL_TRAIN:
          if (fetches_.find(var_name) != fetches_.end() ||
              post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
              persistables_.find(var_name) != persistables_.end()) {
            var_out_.push_back(var_name);
          }
          break;
-        case FULL_TRAIN:
+        case OpState::FULL_TRAIN:
          if (fetches_.find(var_name) != fetches_.end() ||
              persistables_.find(var_name) != persistables_.end()) {
            var_out_.push_back(var_name);
@@ -416,50 +381,30 @@ void NgraphEngine::BuildNgFunction() {
       std::make_shared<ngraph::Function>(func_outputs, func_inputs);
 }

-std::shared_ptr<std::string> NgraphEngine::GetCacheKey() {
-  auto cache_key = std::make_shared<std::string>("");
-  *cache_key += std::to_string(fused_ops_.size());
-  for (auto& op : fused_ops_) {
-    *cache_key += op->Type();
-  }
-  for (auto& var_name : var_in_) {
-    auto shape = var_node_map_->at(var_name)->get_shape();
-    *cache_key += var_name;
-    *cache_key += var_type_map_.at(var_name).c_type_string();
-    for (size_t i = 0; i < shape.size(); ++i) {
-      *cache_key += std::to_string(shape.at(i));
-    }
-  }
-  for (auto& var_name : var_out_) {
-    auto* var = scope_.FindVar(var_name);
-    if (var && var->IsType<LoDTensor>()) {
-      auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
-      auto& ddim = tensor_pd->dims();
-      for (int i = 0; i < ddim.size(); ++i) {
-        *cache_key += std::to_string(ddim[i]);
-      }
-    }
-  }
-  return cache_key;
-}
-
 void NgraphEngine::GetNgFunction() {
   bool cache_on = true;
   if (cache_on) {
-    std::string cache_key_val = *GetCacheKey();
-    if (func_cache_.find(cache_key_val) != func_cache_.end()) {
-      ngraph_function_ = func_cache_.at(cache_key_val);
+    std::string input_shape_str;
+    for (auto& var_name : var_in_) {
+      auto shape = var_node_map_->at(var_name)->get_shape();
+      for (size_t i = 0; i < shape.size(); ++i) {
+        input_shape_str += std::to_string(shape.at(i));
+      }
+    }
+    func_cache_key_ = input_shape_str + func_cache_key_;
+    if (func_cache_.find(func_cache_key_) != func_cache_.end()) {
+      ngraph_function_ = func_cache_.at(func_cache_key_);
     } else {
       BuildNgFunction();
-      func_cache_[cache_key_val] = ngraph_function_;
+      func_cache_[func_cache_key_] = ngraph_function_;
     }
   } else {
     BuildNgFunction();
  }
 }

-void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
+void NgraphEngine::Run(const framework::Scope& scope,
+                       const platform::Place& place) const {
   std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
   std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;
@@ -468,37 +413,35 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
     auto sp = var_node_map_->at(vi)->get_shape();
     std::shared_ptr<ngraph::runtime::Tensor> ti;
     auto* var = scope.FindVar(vi);
-    if (var && var->IsType<LoDTensor>()) {
-      auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+    if (var && var->IsType<framework::LoDTensor>()) {
+      auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
       PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
                      "Ensure ngraph tensor layout align with paddle tensor");
-      if (tensor_pd->type() == proto::VarType::FP32) {
-        const float* arr = tensor_pd->data<float>();
-        ti = backend_->create_tensor(ngraph::element::f32, sp,
-                                     const_cast<float*>(arr));
-      } else if (tensor_pd->type() == proto::VarType::INT32) {
+      auto ng_type = var_type_map_.at(vi);
+      if (ng_type == ngraph::element::f32) {
+        auto pd_arr = tensor_pd->mutable_data<float>(place);
+        ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
+      } else if (ng_type == ngraph::element::i32) {
         const int* arr = tensor_pd->data<int>();
         ti = backend_->create_tensor(ngraph::element::i32, sp,
                                      const_cast<int*>(arr));
-      } else if (tensor_pd->type() == proto::VarType::INT64) {
-        const int64_t* arr = tensor_pd->data<int64_t>();
-        ti = backend_->create_tensor(ngraph::element::i64, sp,
-                                     const_cast<int64_t*>(arr));
-      } else if (tensor_pd->type() == proto::VarType::FP64) {
-        const double* arr = tensor_pd->data<double>();
-        ti = backend_->create_tensor(ngraph::element::f64, sp,
-                                     const_cast<double*>(arr));
-      } else if (tensor_pd->type() == proto::VarType::BOOL) {
-        const bool* arr = tensor_pd->data<bool>();
-        ti = backend_->create_tensor(ngraph::element::boolean, sp,
-                                     const_cast<bool*>(arr));
+      } else if (ng_type == ngraph::element::i64) {
+        auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
+        ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
+      } else if (ng_type == ngraph::element::f64) {
+        auto pd_arr = tensor_pd->mutable_data<double>(place);
+        ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
+      } else if (ng_type == ngraph::element::boolean) {
+        auto pd_arr = tensor_pd->mutable_data<bool>(place);
+        ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
      } else {
        PADDLE_THROW("Data type not handling for var %s", vi);
      }
    } else {
      PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
    }
-    bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST)
-                       ? true
-                       : false;
+    bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST ||
+                    ng_op_state_ == OpState::FULL_TEST)
+                       ? true
+                       : false;
    bool is_persistable =
@@ -510,36 +453,39 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
  }

  for (size_t i = 0; i < var_out_.size(); ++i) {
-    auto var_name = var_out_[i];
-    auto* var = scope.FindVar(var_name);
+    auto vo = var_out_[i];
+    auto* var = scope.FindVar(vo);
    std::shared_ptr<ngraph::runtime::Tensor> to;
-    if (var && var->IsType<LoDTensor>()) {
+    if (var && var->IsType<framework::LoDTensor>()) {
      auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
      auto dd = tensor_pd->dims();
      ngraph::Shape sp = Ddim2Shape(dd);
-      auto ng_type = var_type_map_.at(var_name);
+      auto ng_type = var_type_map_.at(vo);
      if (ng_type == ngraph::element::f32) {
        auto pd_arr = tensor_pd->mutable_data<float>(place);
-        to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
+        to = backend_->create_tensor(ng_type, sp, pd_arr);
      } else if (ng_type == ngraph::element::i64) {
        auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
-        to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
+        to = backend_->create_tensor(ng_type, sp, pd_arr);
+      } else if (ng_type == ngraph::element::i32) {
+        auto pd_arr = tensor_pd->mutable_data<int>(place);
+        to = backend_->create_tensor(ng_type, sp, pd_arr);
      } else if (ng_type == ngraph::element::f64) {
        auto pd_arr = tensor_pd->mutable_data<double>(place);
-        to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
+        to = backend_->create_tensor(ng_type, sp, pd_arr);
      } else if (ng_type == ngraph::element::boolean) {
        auto pd_arr = tensor_pd->mutable_data<bool>(place);
-        to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
+        to = backend_->create_tensor(ng_type, sp, pd_arr);
      } else {
-        PADDLE_THROW("Data type not handled in for var %s", var_name);
+        PADDLE_THROW("Data type not handled in for var %s", vo);
      }
      t_out.push_back(to);
    } else {
-      PADDLE_THROW("Cannot find var or tensor with var name %s", var_name);
+      PADDLE_THROW("Cannot find var or tensor with var name %s", vo);
    }
  }

  backend_->call(backend_->compile(ngraph_function_), t_out, t_in);
-}  // NgraphEngine::RunImpl
+}  // NgraphEngine::Run

-}  // namespace framework
+}  // namespace operators
 }  // namespace paddle
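The core of the refactor above is the interval-finding step: scan the ops between feed and fetch and group maximal runs of ngraph-supported ops into intervals that one ngraph_engine op can replace. A standalone sketch of that idea with hypothetical types (plain strings instead of BlockDesc/OpDesc):

#include <set>
#include <string>
#include <vector>

std::vector<std::vector<int>> FindSupportedIntervals(
    const std::vector<std::string>& op_types,
    const std::set<std::string>& supported) {
  std::vector<std::vector<int>> intervals;
  int size = static_cast<int>(op_types.size());
  int pivot = 0;
  while (pivot < size) {
    if (supported.count(op_types[pivot]) == 0) {
      ++pivot;  // unsupported op stays on the regular executor
    } else {
      int start = pivot;
      while (pivot < size && supported.count(op_types[pivot]) != 0) {
        ++pivot;
      }
      intervals.push_back({start, pivot});  // [start, pivot) forms one subgraph
    }
  }
  return intervals;
}

Replacing intervals in reverse order, as EnableNgraph does, keeps earlier interval indices valid while later ops are removed and reinserted.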
paddle/fluid/operators/ngraph/ngraph_engine.h
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"

#include "ngraph/ngraph.hpp"

namespace paddle {
namespace operators {

enum class OpState {    /* nGraph support state on ops          */
  FULL_TRAIN,           /* Support full ops for train           */
  PARTIAL_TRAIN,        /* Support partial ops for train        */
  FULL_TEST,            /* Support full list of ops for test    */
  PARTIAL_TEST,         /* Support partial list of ops for test */
  FULL,                 /* All ops supported from feed to fetch */
  UNKNOWN               /* Output all for debug purpose         */
};

// perform graph build through bridge and execute computation
class NgraphEngine {
 public:
  explicit NgraphEngine(const framework::Scope& scope,
                        const platform::Place& place,
                        const std::string& serialized_graph,
                        const std::vector<int>& interval);

  void Run(const framework::Scope& scope, const platform::Place& place) const;

  static void EnableNgraph(const framework::ProgramDesc& program);

 private:
  static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
      func_cache_;
  const framework::Scope& scope_;
  const platform::Place& place_;
  std::vector<std::shared_ptr<framework::OperatorBase>> fused_ops_;
  std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
  std::unordered_set<std::string> persistables_;
  std::unordered_set<std::string> fetches_;
  std::unordered_set<std::string> post_op_inputs_;
  OpState ng_op_state_ = OpState::UNKNOWN;
  std::string func_cache_key_;

  // ngraph backend eg. CPU
  static std::shared_ptr<ngraph::runtime::Backend> backend_;
  // ngraph function to call and execute
  std::shared_ptr<ngraph::Function> ngraph_function_;
  // var_name of inputs
  std::vector<std::string> var_in_;
  // var_name of outputs from fetch in order
  std::vector<std::string> var_out_;
  // map input vars to nodes
  std::shared_ptr<
      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
      var_in_node_map_;
  // map each var name with a ngraph node
  std::shared_ptr<
      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
      var_node_map_;
  // prepare info for ngraph engine
  void Prepare(const framework::BlockDesc& block,
               const std::vector<int>& interval);
  // get ngraph input and define ngraph input parameters
  void GetNgInputShape(std::shared_ptr<framework::OperatorBase> op);
  // Call ngraph bridge to map ops
  void BuildNgNodes();
  // get the ngraph input and output var list
  void BuildNgIO();
  // build ngraph function call
  void BuildNgFunction();
  // Check cache for ngraph function or otherwise build the function
  void GetNgFunction();
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/ngraph/ngraph_engine_op.cc
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/ngraph/ngraph_engine_op.h"

namespace paddle {
namespace operators {

class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Xs", "A list of inputs.").AsDispensable();
    AddOutput("Ys", "A list of outputs").AsDispensable();
    AddAttr<std::string>("graph", "the graph.");
    AddAttr<std::vector<int>>("interval", "op interval supported by ngraph");
    AddComment("ngraph engine operator.");
  }
};

class NgraphEngineInferVarType : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
                  framework::BlockDesc* block) const override {}
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(ngraph_engine, ops::NgraphEngineOp, ops::NgraphEngineOpMaker,
                  ops::NgraphEngineOpMaker);
REGISTER_OP_CPU_KERNEL(
    ngraph_engine,
    ops::NgraphEngineKernel<paddle::platform::CPUDeviceContext, float>);
paddle/fluid/operators/ngraph/ngraph_engine_op.h
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <string>
#include <vector>

#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace operators {

class NgraphEngineOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(framework::InferShapeContext* ctx) const override {}

  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    framework::OpKernelType kt = framework::OpKernelType(
        framework::proto::VarType::FP32, ctx.GetPlace());
    return kt;
  }
};

template <typename DeviceContext, typename T>
class NgraphEngineKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& scope = ctx.scope();
    auto place = ctx.GetPlace();
    std::string serialized_graph = ctx.Attr<std::string>("graph");
    auto interval = ctx.Attr<std::vector<int>>("interval");

    NgraphEngine ngraph_engine(scope, place, serialized_graph, interval);
    ngraph_engine.Run(scope, place);
  }
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/reader/create_ctr_reader_op.cc
...
@@ -41,13 +41,19 @@ class CreateCTRReaderOp : public framework::OperatorBase {
     auto* queue_holder =
         queue_holder_var->template GetMutable<LoDTensorBlockingQueueHolder>();

-    int thread_num = Attr<int>("thread_num");
-    std::vector<std::string> slots = Attr<std::vector<std::string>>("slots");
-    int batch_size = Attr<int>("batch_size");
-    std::vector<std::string> file_list =
-        Attr<std::vector<std::string>>("file_list");
-    out->Reset(std::make_shared<CTRReader>(queue_holder->GetQueue(), batch_size,
-                                           thread_num, slots, file_list));
+    auto thread_num = Attr<int>("thread_num");
+    auto sparse_slots = Attr<std::vector<std::string>>("sparse_slots");
+    auto dense_slot_index = Attr<std::vector<int>>("dense_slot_index");
+    auto sparse_slot_index = Attr<std::vector<int>>("sparse_slot_index");
+    auto batch_size = Attr<int>("batch_size");
+    auto file_type = Attr<std::string>("file_type");
+    auto file_format = Attr<std::string>("file_format");
+    auto file_list = Attr<std::vector<std::string>>("file_list");
+    DataDesc data_desc(batch_size, file_list, file_type, file_format,
+                       dense_slot_index, sparse_slot_index, sparse_slots);
+    VLOG(1) << data_desc;
+    out->Reset(std::make_shared<CTRReader>(queue_holder->GetQueue(),
+                                           thread_num, data_desc));
   }
 };
...
@@ -58,10 +64,22 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase {
              "Name of the `LoDTensorBlockingQueueHolder` variable");
     AddAttr<int>("thread_num", "the thread num to read data");
     AddAttr<int>("batch_size", "the batch size of read data");
+    AddAttr<std::string>("file_type", "plain or gzip").SetDefault("plain");
+    AddAttr<std::string>("file_format", "svm or csv").SetDefault("csv");
     AddAttr<std::vector<std::string>>("file_list",
                                       "The list of files that need to read");
-    AddAttr<std::vector<std::string>>(
-        "slots", "the slots that should be extract from file");
+    AddAttr<std::vector<int>>(
+        "dense_slot_index",
+        "the dense slots id that should be extract from file")
+        .SetDefault({});
+    AddAttr<std::vector<int>>(
+        "sparse_slot_index",
+        "the sparse slots id that should be extract from file")
+        .SetDefault({});
+    AddAttr<std::vector<std::string>>("sparse_slots",
+                                      "the sparse slots id that should be "
+                                      "extract from file, used when file "
+                                      "format is svm");
     AddComment(R"DOC(
       Create CTRReader to support read ctr data with cpp.
...
paddle/fluid/operators/reader/ctr_reader.cc
@@ -73,6 +73,9 @@ static inline void parse_line(
   }
 }

+// label slot1:fea_sign slot2:fea_sign slot1:fea_sign
+static inline void parse_svm_line(const std::string& line) {}
+
 class Reader {
  public:
  virtual ~Reader() {}
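As a concrete illustration of the svm line format documented above (the slot ids and feature signs here are made-up values, not from the commit):

// One sample per line: "<label> <slot>:<fea_sign> <slot>:<fea_sign> ..."
// A slot id may repeat; its feature signs are grouped together on parse.
const std::string svm_line = "1 6002:100 6003:200 6002:101";
// Expected grouping after parse_line: label = 1,
//   slot "6002" -> {100, 101}, slot "6003" -> {200}.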
@@ -95,11 +98,27 @@ class GzipReader : public Reader {
   igzstream gzstream_;
 };

-class MultiGzipReader : public Reader {
+class PlainFileReader : public Reader {
  public:
-  explicit MultiGzipReader(const std::vector<std::string>& file_list) {
+  explicit PlainFileReader(const std::string& file_name)
+      : stream_(file_name.c_str()) {}
+
+  ~PlainFileReader() {}
+
+  bool HasNext() override { return stream_.peek() != EOF; }
+
+  void NextLine(std::string* line) override { std::getline(stream_, *line); }
+
+ private:
+  std::ifstream stream_;
+};
+
+template <typename SingleFileReader>
+class MultiFileReader : public Reader {
+ public:
+  explicit MultiFileReader(const std::vector<std::string>& file_list) {
     for (auto& file : file_list) {
-      readers_.emplace_back(std::make_shared<GzipReader>(file));
+      readers_.emplace_back(std::make_shared<SingleFileReader>(file));
     }
   }
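A minimal usage sketch of the new template, assuming the HasNext()/NextLine() overrides elided between these hunks keep the old round-robin behavior (file names are hypothetical):

// Read every line across several plain-text files through one Reader.
std::vector<std::string> files = {"part-0.txt", "part-1.txt"};
MultiFileReader<PlainFileReader> reader(files);
std::string line;
while (reader.HasNext()) {
  reader.NextLine(&line);
  // consume line ...
}

Swapping PlainFileReader for GzipReader reuses the same multiplexing logic, which is the point of making the single-file reader a template parameter.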
@@ -119,46 +138,35 @@ class MultiGzipReader : public Reader {
   }

  private:
-  std::vector<std::shared_ptr<GzipReader>> readers_;
+  std::vector<std::shared_ptr<SingleFileReader>> readers_;
   size_t current_reader_index_ = 0;
 };

 void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
                    std::shared_ptr<LoDTensorBlockingQueue> queue) {
-  VLOG(30) << "monitor thread in";
+  VLOG(3) << "monitor thread in";
   bool reader_thread_is_running = true;
   while (reader_thread_is_running) {
-    VLOG(30) << "reader_thread_is_running";
+    VLOG(3) << "reader_thread_is_running";
     reader_thread_is_running = false;
     for (size_t i = 0; i < (*thread_status).size(); ++i) {
       if ((*thread_status)[i] == Running) {
-        VLOG(30) << "reader is running!";
+        VLOG(3) << "reader is running!";
         reader_thread_is_running = true;
       }
     }
     std::this_thread::sleep_for(std::chrono::milliseconds(1000));
   }
-  VLOG(30) << "all reader thread is stopped, push empty data into queue";
-  queue->Push({});
-  VLOG(30) << "monitor thread exited";
+  VLOG(3) << "all reader thread is stopped, close the queue";
+  queue->Close();
+  VLOG(3) << "monitor thread exited";
 }

-void ReadThread(const std::vector<std::string>& file_list,
-                const std::vector<std::string>& slots, int batch_size,
-                int thread_id, std::vector<ReaderThreadStatus>* thread_status,
-                std::shared_ptr<LoDTensorBlockingQueue> queue) {
-  VLOG(30) << "[" << thread_id << "]"
-           << " reader thread start! thread_id = " << thread_id;
-  for (auto& file : file_list) {
-    VLOG(30) << "[" << thread_id << "]"
-             << " file " << file;
-  }
-  (*thread_status)[thread_id] = Running;
-  VLOG(30) << "set status to running";
-
+void ReadSvmData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
+                 std::shared_ptr<LoDTensorBlockingQueue> queue) {
   std::unordered_map<std::string, size_t> slot_to_index;
-  for (size_t i = 0; i < slots.size(); ++i) {
-    slot_to_index[slots[i]] = i;
+  for (size_t i = 0; i < data_desc.sparse_slot_ids_.size(); ++i) {
+    slot_to_index[data_desc.sparse_slot_ids_[i]] = i;
   }

   std::string line;
@@ -166,21 +174,17 @@ void ReadThread(const std::vector<std::string>& file_list,
   std::vector<std::unordered_map<std::string, std::vector<int64_t>>>
       batch_data;
   std::vector<int64_t> batch_label;

-  MultiGzipReader reader(file_list);
-
-  VLOG(30) << "reader inited";
-
-  while (reader.HasNext()) {
+  while (reader->HasNext()) {
     batch_data.clear();
-    batch_data.reserve(batch_size);
+    batch_data.reserve(data_desc.batch_size_);
     batch_label.clear();
-    batch_label.reserve(batch_size);
+    batch_label.reserve(data_desc.batch_size_);

     // read batch_size data
-    for (int i = 0; i < batch_size; ++i) {
-      if (reader.HasNext()) {
-        reader.NextLine(&line);
+    for (int i = 0; i < data_desc.batch_size_; ++i) {
+      if (reader->HasNext()) {
+        reader->NextLine(&line);
         std::unordered_map<std::string, std::vector<int64_t>> slot_to_data;
         int64_t label;
         parse_line(line, slot_to_index, &label, &slot_to_data);
@@ -193,8 +197,8 @@ void ReadThread(const std::vector<std::string>& file_list,
     std::vector<framework::LoDTensor> lod_datas;

-    // first insert tensor for each slots
-    for (auto& slot : slots) {
+    // first insert tensor for each sparse_slots
+    for (auto& slot : data_desc.sparse_slot_ids_) {
       std::vector<size_t> lod_data{0};
       std::vector<int64_t> batch_feasign;
@@ -226,11 +230,167 @@ void ReadThread(const std::vector<std::string>& file_list,
     lod_datas.push_back(label_tensor);

     queue->Push(lod_datas);
-    VLOG(40) << "push one data, queue_size=" << queue->Size();
+    VLOG(4) << "push one data, queue_size=" << queue->Size();
   }
 }

+// label dense_fea,dense_fea sparse_fea,sparse_fea
+static inline void parse_csv_line(
+    const std::string& line, const DataDesc& data_desc, int64_t* label,
+    std::vector<std::vector<float>>* dense_datas,
+    std::vector<std::vector<int64_t>>* sparse_datas) {
+  std::vector<std::string> ret;
+  string_split(line, ' ', &ret);
+  *label = std::stol(ret[0]);
+  dense_datas->resize(data_desc.dense_slot_index_.size());
+  for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) {
+    int slot_idx = data_desc.dense_slot_index_[i];
+    auto& slot_data = ret[slot_idx];
+    std::vector<std::string> data_in_slot_str;
+    string_split(slot_data, ',', &data_in_slot_str);
+    std::vector<float> data_in_slot;
+    for (auto& data_str : data_in_slot_str) {
+      (*dense_datas)[i].push_back(std::stof(data_str));
+    }
+  }
+  sparse_datas->resize(data_desc.sparse_slot_index_.size());
+  for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) {
+    int slot_idx = data_desc.sparse_slot_index_[i];
+    auto& slot_data = ret[slot_idx];
+    std::vector<std::string> data_in_slot_str;
+    string_split(slot_data, ',', &data_in_slot_str);
+    std::vector<int64_t> data_in_slot;
+    for (auto& data_str : data_in_slot_str) {
+      auto id = std::stol(data_str);
+      (*sparse_datas)[i].push_back(id);
+    }
+  }
+}
+
+void ReadCsvData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
+                 std::shared_ptr<LoDTensorBlockingQueue> queue) {
+  std::string line;
+  while (reader->HasNext()) {
+    std::vector<int64_t> batch_label;
+    batch_label.reserve(data_desc.batch_size_);
+
+    std::vector<std::vector<std::vector<float>>> batch_dense_data;
+    batch_dense_data.reserve(data_desc.batch_size_);
+
+    std::vector<std::vector<std::vector<int64_t>>> batch_sparse_data;
+    batch_sparse_data.reserve(data_desc.batch_size_);
+
+    // read batch_size data
+    for (int i = 0; i < data_desc.batch_size_; ++i) {
+      if (reader->HasNext()) {
+        reader->NextLine(&line);
+        int64_t label;
+        std::vector<std::vector<float>> dense_datas;
+        std::vector<std::vector<int64_t>> sparse_datas;
+        parse_csv_line(line, data_desc, &label, &dense_datas, &sparse_datas);
+        batch_label.push_back(label);
+        if (!batch_dense_data.empty()) {
+          PADDLE_ENFORCE_EQ(batch_dense_data[0].size(), dense_datas.size(),
+                            "dense data should have the same shape");
+        }
+        batch_dense_data.push_back(dense_datas);
+        batch_sparse_data.push_back(sparse_datas);
+      } else {
+        break;
+      }
+    }
+
+    // the order of output data is label, dense_datas, sparse_datas
+    std::vector<framework::LoDTensor> lod_datas;
+
+    // insert label tensor
+    framework::LoDTensor label_tensor;
+    auto* label_tensor_data = label_tensor.mutable_data<int64_t>(
+        framework::make_ddim({static_cast<int64_t>(batch_label.size()), 1}),
+        platform::CPUPlace());
+    memcpy(label_tensor_data, batch_label.data(),
+           batch_label.size() * sizeof(int64_t));
+    lod_datas.push_back(label_tensor);
+
+    // insert tensor for each dense_slots
+    for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) {
+      framework::LoDTensor lod_tensor;
+      size_t width = batch_dense_data[0][i].size();
+      auto* tensor_data = lod_tensor.mutable_data<float>(
+          framework::make_ddim(
+              {static_cast<int64_t>(batch_dense_data.size()),  // batch_size
+               static_cast<int64_t>(width)}),
+          platform::CPUPlace());
+
+      for (size_t j = 0; j < batch_dense_data.size(); ++j) {
+        auto& dense_data_row = batch_dense_data[j][i];
+        memcpy(tensor_data + j * width, dense_data_row.data(),
+               width * sizeof(float));
+      }
+
+      lod_datas.push_back(lod_tensor);
+    }
+
+    // insert tensor for each sparse_slots
+    for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) {
+      std::vector<size_t> lod_data{0};
+      std::vector<int64_t> batch_feasign;
+
+      for (size_t row_idx = 0; row_idx < batch_sparse_data.size(); ++row_idx) {
+        auto& sparse_ids = batch_sparse_data[row_idx][i];
+        lod_data.push_back(lod_data.back() + sparse_ids.size());
+        batch_feasign.insert(batch_feasign.end(), sparse_ids.begin(),
+                             sparse_ids.end());
+      }
+
+      framework::LoDTensor lod_tensor;
+      framework::LoD lod{lod_data};
+      lod_tensor.set_lod(lod);
+      int64_t* tensor_data = lod_tensor.mutable_data<int64_t>(
+          framework::make_ddim({static_cast<int64_t>(batch_feasign.size()), 1}),
+          platform::CPUPlace());
+      memcpy(tensor_data, batch_feasign.data(),
+             batch_feasign.size() * sizeof(int64_t));
+      lod_datas.push_back(lod_tensor);
+    }
+
+    queue->Push(lod_datas);
+    VLOG(4) << "push one data, queue_size=" << queue->Size();
+  }
+}
+
+void ReadThread(const std::vector<std::string>& file_list,
+                const DataDesc& data_desc, int thread_id,
+                std::vector<ReaderThreadStatus>* thread_status,
+                std::shared_ptr<LoDTensorBlockingQueue> queue) {
+  VLOG(3) << "[" << thread_id << "]"
+          << " reader thread start! thread_id = " << thread_id;
+  for (auto& file : file_list) {
+    VLOG(3) << "[" << thread_id << "]"
+            << " file " << file;
+  }
+  (*thread_status)[thread_id] = Running;
+  VLOG(3) << "set status to running";
+
+  std::shared_ptr<Reader> reader;
+  if (data_desc.file_type_ == "gzip") {
+    reader.reset(new MultiFileReader<GzipReader>(file_list));
+  } else if (data_desc.file_type_ == "plain") {
+    reader.reset(new MultiFileReader<PlainFileReader>(file_list));
+  } else {
+    PADDLE_THROW("do not support file format %s", data_desc.file_type_);
+  }
+
+  VLOG(3) << "reader inited";
+
+  if (data_desc.file_format_ == "svm") {
+    ReadSvmData(data_desc, reader, queue);
+  } else if (data_desc.file_format_ == "csv") {
+    ReadCsvData(data_desc, reader, queue);
+  }
+
   (*thread_status)[thread_id] = Stopped;
-  VLOG(30) << "set status to stopped, thread " << thread_id << " exited";
+  VLOG(3) << "set status to stopped, thread " << thread_id << " exited";
 }

 }  // namespace reader
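A worked example of the csv path, following the line format in the comment above ("label dense_fea,dense_fea sparse_fea,sparse_fea"); the numbers match the test data added below:

// line = "1 1.1,1.1 1,1,1,1", dense_slot_index_ = {1}, sparse_slot_index_ = {2}
// string_split(line, ' ', &ret) -> ret = {"1", "1.1,1.1", "1,1,1,1"}
// *label        -> std::stol(ret[0]) == 1
// dense slot 0  -> ret[1] split on ',' -> {1.1f, 1.1f}
// sparse slot 0 -> ret[2] split on ',' -> {1, 1, 1, 1}
// With batch_size_ = 3, ReadCsvData pushes a [3, 1] int64 label tensor,
// a [3, 2] float dense tensor, and a sparse LoDTensor with lod {0, 4, 8, 12}.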
paddle/fluid/operators/reader/ctr_reader.h
@@ -36,9 +36,63 @@ namespace reader {
 enum ReaderThreadStatus { Running, Stopped };

+struct DataDesc {
+  DataDesc(int batch_size, const std::vector<std::string>& file_names,
+           const std::string& file_type, const std::string& file_format,
+           const std::vector<int>& dense_slot_index,
+           const std::vector<int>& sparse_slot_index,
+           const std::vector<std::string>& sparse_slot_ids)
+      : batch_size_(batch_size),
+        file_names_(file_names),
+        file_type_(file_type),
+        file_format_(file_format),
+        dense_slot_index_(dense_slot_index),
+        sparse_slot_index_(sparse_slot_index),
+        sparse_slot_ids_(sparse_slot_ids) {}
+
+  const int batch_size_;
+  const std::vector<std::string> file_names_;
+  const std::string file_type_;    // gzip or plain
+  const std::string file_format_;  // csv or svm
+  // used for csv data format
+  const std::vector<int> dense_slot_index_;
+  const std::vector<int> sparse_slot_index_;
+  // used for svm data format
+  const std::vector<std::string> sparse_slot_ids_;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const DataDesc& data_desc) {
+  os << "data_desc:\n";
+  os << "\tbatch_size -> " << data_desc.batch_size_ << "\n";
+  os << "\tfile_type -> " << data_desc.file_type_ << "\n";
+  os << "\tfile_format -> " << data_desc.file_format_ << "\n";
+  os << "\tfile_names -> {";
+  for (auto& file_name : data_desc.file_names_) {
+    os << file_name << ",";
+  }
+  os << "}\n";
+  os << "\tdense_slot_index -> {";
+  for (auto& slot : data_desc.dense_slot_index_) {
+    os << slot << ",";
+  }
+  os << "}\n";
+  os << "\tsparse_slot_index_ -> {";
+  for (auto& slot : data_desc.sparse_slot_index_) {
+    os << slot << ",";
+  }
+  os << "}\n";
+  os << "\tsparse_slot_ids_ -> {";
+  for (auto& slot : data_desc.sparse_slot_ids_) {
+    os << slot << ",";
+  }
+  os << "}\n";
+
+  return os;
+}
+
 void ReadThread(const std::vector<std::string>& file_list,
-                const std::vector<std::string>& slots, int batch_size,
-                int thread_id,
+                const DataDesc& data_desc, int thread_id,
                 std::vector<ReaderThreadStatus>* thread_status,
                 std::shared_ptr<LoDTensorBlockingQueue> queue);

 // monitor all running thread, if they are all stopped,
@@ -48,15 +102,15 @@ void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
 class CTRReader : public framework::FileReader {
  public:
-  explicit CTRReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue,
-                     int batch_size, size_t thread_num,
-                     const std::vector<std::string>& slots,
-                     const std::vector<std::string>& file_list)
-      : batch_size_(batch_size), slots_(slots), file_list_(file_list) {
+  CTRReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue,
+            int thread_num, const DataDesc& data_desc)
+      : data_desc_(data_desc) {
     PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!");
     PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
-    PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty");
-    thread_num_ = std::min<size_t>(file_list_.size(), thread_num);
+    PADDLE_ENFORCE_GT(data_desc_.file_names_.size(), 0,
+                      "file list should not be empty");
+    thread_num_ = std::min<size_t>(data_desc_.file_names_.size(), thread_num);
     queue_ = queue;
     SplitFiles();
     for (size_t i = 0; i < thread_num_; ++i) {

@@ -64,7 +118,7 @@ class CTRReader : public framework::FileReader {
     }
   }

-  ~CTRReader() {}
+  ~CTRReader() { Shutdown(); }

   void ReadNext(std::vector<framework::LoDTensor>* out) override {
     bool success;

@@ -81,7 +135,10 @@ class CTRReader : public framework::FileReader {
     for (auto& read_thread : read_threads_) {
       read_thread->join();
     }
-    monitor_thread_->join();
+
+    if (monitor_thread_) {
+      monitor_thread_->join();
+    }

     read_threads_.clear();
     monitor_thread_.reset(nullptr);

@@ -95,9 +152,9 @@ class CTRReader : public framework::FileReader {
     queue_->ReOpen();
     VLOG(3) << "reopen success";
     VLOG(3) << "thread_num " << thread_num_;
-    for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) {
+    for (int thread_id = 0; thread_id < thread_num_; thread_id++) {
       read_threads_.emplace_back(new std::thread(std::bind(
-          &ReadThread, file_groups_[thread_id], slots_, batch_size_,
+          &ReadThread, file_groups_[thread_id], data_desc_,
           static_cast<int>(thread_id), &read_thread_status_, queue_)));
     }
     monitor_thread_.reset(new std::thread(

@@ -108,8 +165,8 @@ class CTRReader : public framework::FileReader {
  private:
   void SplitFiles() {
     file_groups_.resize(thread_num_);
-    for (size_t i = 0; i < file_list_.size(); ++i) {
-      auto& file_name = file_list_[i];
+    for (size_t i = 0; i < data_desc_.file_names_.size(); ++i) {
+      auto& file_name = data_desc_.file_names_[i];
       std::ifstream f(file_name.c_str());
       PADDLE_ENFORCE(f.good(), "file %s not exist!", file_name);
       file_groups_[i % thread_num_].push_back(file_name);

@@ -118,9 +175,7 @@ class CTRReader : public framework::FileReader {
  private:
   size_t thread_num_;
-  const int batch_size_;
-  const std::vector<std::string> slots_;
-  const std::vector<std::string> file_list_;
+  const DataDesc data_desc_;
   std::shared_ptr<LoDTensorBlockingQueue> queue_;
   std::vector<std::unique_ptr<std::thread>> read_threads_;
   std::unique_ptr<std::thread> monitor_thread_;
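A minimal sketch of wiring the new struct into a reader, mirroring the updated test below (queue setup elided, file name hypothetical):

std::vector<std::string> files = {"part-0.csv"};
// csv file: dense values in column 1, sparse ids in column 2.
DataDesc data_desc(/*batch_size=*/3, files, /*file_type=*/"plain",
                   /*file_format=*/"csv", /*dense_slot_index=*/{1},
                   /*sparse_slot_index=*/{2}, /*sparse_slot_ids=*/{});
VLOG(1) << data_desc;  // the operator<< above dumps every field for debugging
CTRReader reader(queue, /*thread_num=*/1, data_desc);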
paddle/fluid/operators/reader/ctr_reader_test.cc
@@ -36,6 +36,7 @@ using paddle::framework::LoD;
 using paddle::framework::DDim;
 using paddle::platform::CPUPlace;
 using paddle::framework::make_ddim;
+using paddle::operators::reader::DataDesc;

 static void generatedata(const std::vector<std::string>& data,
                          const std::string& file_name) {

@@ -126,30 +127,103 @@ TEST(CTR_READER, read_data) {
   LoDTensorBlockingQueueHolder queue_holder;
   int capacity = 64;
-  queue_holder.InitOnce(capacity, {}, false);
+  queue_holder.InitOnce(capacity, false);

   std::shared_ptr<LoDTensorBlockingQueue> queue = queue_holder.GetQueue();

   int batch_size = 3;
   int thread_num = 1;
-  std::vector<std::string> slots = {"6002", "6003"};
+  std::vector<std::string> sparse_slots = {"6002", "6003"};
   std::vector<std::string> file_list;
   for (int i = 0; i < thread_num; ++i) {
     file_list.push_back(gz_file_name);
   }

-  CTRReader reader(queue, batch_size, thread_num, slots, file_list);
+  DataDesc data_desc(batch_size, file_list, "gzip", "svm", {}, {},
+                     sparse_slots);
+
+  CTRReader reader(queue, thread_num, data_desc);

   reader.Start();
   size_t batch_num =
       std::ceil(static_cast<float>(ctr_data.size()) / batch_size) * thread_num;

-  check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002,
-                 data_slot_6003, batch_num, batch_size, queue, &reader);
+  check_all_data(ctr_data, sparse_slots, label_dims, label_value,
+                 data_slot_6002, data_slot_6003, batch_num, batch_size, queue,
+                 &reader);

   reader.Shutdown();

   reader.Start();
-  check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002,
-                 data_slot_6003, batch_num, batch_size, queue, &reader);
+  check_all_data(ctr_data, sparse_slots, label_dims, label_value,
+                 data_slot_6002, data_slot_6003, batch_num, batch_size, queue,
+                 &reader);

   reader.Shutdown();
 }

+static void GenereteCsvData(const std::string& file_name,
+                            const std::vector<std::string>& data) {
+  std::ofstream out(file_name.c_str());
+  PADDLE_ENFORCE(out.good(), "open file %s failed!", file_name);
+  for (auto& c : data) {
+    out << c;
+  }
+  out.close();
+  PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name);
+}
+
+static void CheckReadCsvOut(const std::vector<LoDTensor>& out) {
+  ASSERT_EQ(out.size(), 3);
+  ASSERT_EQ(out[0].dims()[1], 1);
+  ASSERT_EQ(out[1].dims()[1], 2);
+  ASSERT_EQ(out[2].dims()[1], 1);
+  for (size_t i = 0; i < out[0].numel(); ++i) {
+    int64_t label = out[0].data<int64_t>()[i];
+    auto& dense_dim = out[1].dims();
+    for (size_t j = 0; j < dense_dim[1]; ++j) {
+      ASSERT_EQ(out[1].data<float>()[i * dense_dim[1] + j],
+                static_cast<float>(label + 0.1));
+    }
+    auto& sparse_lod = out[2].lod();
+    for (size_t j = sparse_lod[0][i]; j < sparse_lod[0][i + 1]; ++j) {
+      ASSERT_EQ(out[2].data<int64_t>()[j], label);
+    }
+  }
+}
+
+TEST(CTR_READER, read_csv_data) {
+  std::string file_name = "test_ctr_reader_data.csv";
+  const std::vector<std::string> csv_data = {
+      "0 0.1,0.1 0,0,0,0\n", "1 1.1,1.1 1,1,1,1\n", "2 2.1,2.1 2,2,2,2\n",
+      "3 3.1,3.1 3,3,3,3\n",
+  };
+  GenereteCsvData(file_name, csv_data);
+
+  LoDTensorBlockingQueueHolder queue_holder;
+  int capacity = 64;
+  queue_holder.InitOnce(capacity, false);
+
+  std::shared_ptr<LoDTensorBlockingQueue> queue = queue_holder.GetQueue();
+
+  int batch_size = 3;
+  int thread_num = 1;
+  std::vector<std::string> file_list;
+  for (int i = 0; i < thread_num; ++i) {
+    file_list.push_back(file_name);
+  }
+  DataDesc data_desc(batch_size, file_list, "plain", "csv", {1}, {2}, {});
+
+  CTRReader reader(queue, thread_num, data_desc);
+
+  for (size_t i = 0; i < 2; ++i) {
+    reader.Start();
+    std::vector<LoDTensor> out;
+    while (true) {
+      reader.ReadNext(&out);
+      if (out.empty()) {
+        break;
+      }
+      CheckReadCsvOut(out);
+    }
+    reader.Shutdown();
+  }
+}
paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
@@ -32,10 +32,8 @@ class LoDTensorBlockingQueue {
   friend class LoDTensorBlockingQueueHolder;

  private:
-  LoDTensorBlockingQueue(size_t capacity,
-                         const std::vector<framework::DDim>& dims,
-                         bool speed_test_mode = false)
-      : queue_(capacity, speed_test_mode), dims_(dims) {}
+  explicit LoDTensorBlockingQueue(size_t capacity,
+                                  bool speed_test_mode = false)
+      : queue_(capacity, speed_test_mode) {}

  public:
   bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {

@@ -65,17 +63,15 @@ class LoDTensorBlockingQueue {
  private:
   BlockingQueue<std::vector<framework::LoDTensor>> queue_;
-  std::vector<framework::DDim> dims_;
 };

 class LoDTensorBlockingQueueHolder {
  public:
-  void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims,
-                bool speed_test_mode = false) {
+  void InitOnce(size_t capacity, bool speed_test_mode = false) {
     PADDLE_ENFORCE(
         queue_ == nullptr,
         "LoDTensorBlockingQueueHolder::InitOnce() can only be called once");
-    queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode));
+    queue_.reset(new LoDTensorBlockingQueue(capacity, speed_test_mode));
   }

   inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const {
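Since the queue no longer stores per-output dims, call sites shrink to a capacity (plus the optional speed-test flag); a sketch:

LoDTensorBlockingQueueHolder holder;
holder.InitOnce(/*capacity=*/64);  // the dims argument is gone
auto queue = holder.GetQueue();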
paddle/fluid/operators/reader/read_op.cc
@@ -27,13 +27,13 @@ class ReadInferShape : public framework::InferShapeBase {
                    "The ReadOp must take a reader as input.");
     PADDLE_ENFORCE(ctx->HasOutputs("Out"),
                    "The ReadOp should be assigned with output.");
-    std::vector<framework::DDim> reader_dims = ctx->GetReaderDims("Reader");
-    std::vector<std::string> out_names = ctx->Outputs("Out");
-    PADDLE_ENFORCE_EQ(
-        reader_dims.size(), out_names.size(),
-        "The reader's dim number doesn't match the output number.");
-    ctx->SetOutputsDim("Out", reader_dims);
-    if (!ctx->IsRuntime()) {
+    if (!ctx->IsRuntime() && ctx->Attrs().Get<bool>("infer_out")) {
+      std::vector<std::string> out_names = ctx->Outputs("Out");
+      std::vector<framework::DDim> reader_dims = ctx->GetReaderDims("Reader");
+      PADDLE_ENFORCE_EQ(
+          reader_dims.size(), out_names.size(),
+          "The reader's dim number doesn't match the output number.");
+      ctx->SetOutputsDim("Out", reader_dims);
       auto in_desc =
           boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Reader")[0]);
       auto in_lod_levels = in_desc->GetLoDLevels();

@@ -53,15 +53,18 @@ class ReadInferVarType : public framework::VarTypeInference {
  public:
   void operator()(const framework::OpDesc& op_desc,
                   framework::BlockDesc* block) const override {
-    std::string reader_name = op_desc.Input("Reader")[0];
-    std::vector<std::string> out_names = op_desc.Output("Out");
-    framework::VarDesc* reader = block->FindVarRecursive(reader_name);
-    auto dtypes = reader->GetDataTypes();
-    PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
-    for (size_t i = 0; i < dtypes.size(); ++i) {
-      framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
-      out.SetType(framework::proto::VarType::LOD_TENSOR);
-      out.SetDataType(dtypes[i]);
+    bool infer_out = boost::get<bool>(op_desc.GetAttr("infer_out"));
+    if (infer_out) {
+      std::string reader_name = op_desc.Input("Reader")[0];
+      std::vector<std::string> out_names = op_desc.Output("Out");
+      framework::VarDesc* reader = block->FindVarRecursive(reader_name);
+      auto dtypes = reader->GetDataTypes();
+      PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
+      for (size_t i = 0; i < dtypes.size(); ++i) {
+        framework::VarDesc& out =
+            block->FindRecursiveOrCreateVar(out_names[i]);
+        out.SetType(framework::proto::VarType::LOD_TENSOR);
+        out.SetDataType(dtypes[i]);
+      }
     }
   }
 };

@@ -73,6 +76,7 @@ class ReadOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override {
+    VLOG(3) << "read op in";
     framework::ReaderHolder* reader =
         detail::Ref(scope.FindVar(Input("Reader")),
                     "Cannot find reader variable %s", Input("Reader"))

@@ -87,7 +91,9 @@ class ReadOp : public framework::OperatorBase {
     reader->ReadNext(&ins);
     if (ins.empty()) {
+      VLOG(3) << "read empty data in";
       if (Attr<bool>("throw_eof_exp")) {
+        VLOG(3) << "throw_eof_exp";
         PADDLE_THROW_EOF();
       } else {
         ins.resize(out_arg_names.size());

@@ -96,6 +102,7 @@ class ReadOp : public framework::OperatorBase {
           tensor.mutable_data<float>(framework::make_ddim({0}), dev_place);
         }
       }
+      VLOG(3) << "read empty data out";
     }
     PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
     for (size_t i = 0; i < out_arg_names.size(); ++i) {

@@ -120,6 +127,7 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
                   " only when the data-balance is enabled in ParallelExecutor"
                   " and it is set by ParallelExecutor instance, not users.")
         .SetDefault(true);
+    AddAttr<bool>("infer_out", "").SetDefault(true);
     AddComment(R"DOC(
       Read Operator
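Taken together with the use_data_config attribute added to the file-reader maker in the next diff, this gives readers a way to opt out of static output configuration. A sketch of the assumed wiring (the SetAttr calls are illustrative, not from the commit):

// A reader such as create_ctr_reader declares its outputs dynamically:
reader_op_desc.SetAttr("use_data_config", false);  // skip shape_concat/ranks/lod_levels
read_op_desc.SetAttr("infer_out", false);          // skip compile-time Out shape/dtype inference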
paddle/fluid/operators/reader/reader_op_registry.cc
@@ -65,6 +65,10 @@ void FileReaderMakerBase::Make() {
               "It means the reader will generate two data each time,"
               "whose shapes are [2,3,4] and [5,6] respectively.");
   AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
+  AddAttr<bool>(
+      "use_data_config",
+      "Use the config of all datas like shape_concat/ranks/lod_levels")
+      .SetDefault(true);
   Apply();
 }

@@ -75,19 +79,23 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE(ctx->HasOutput("Out"),
                  "The output file reader should not be null.");
-  const auto shape_concat = ctx->Attrs().Get<std::vector<int>>("shape_concat");
-  const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
-  std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
-  ctx->SetReaderDims("Out", shapes);
-
-  const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
-  PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
-                    "The number of 'lod_levels'(%d) doesn't match the number "
-                    "of 'shapes'(%d).",
-                    lod_levels.size(), shapes.size());
-  framework::VarDesc* reader =
-      boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
-  reader->SetLoDLevels(lod_levels);
+  bool use_data_config = ctx->Attrs().Get<bool>("use_data_config");
+  if (use_data_config) {
+    const auto shape_concat =
+        ctx->Attrs().Get<std::vector<int>>("shape_concat");
+    const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
+    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
+    ctx->SetReaderDims("Out", shapes);
+
+    const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
+    PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
+                      "The number of 'lod_levels'(%d) doesn't match the "
+                      "number of 'shapes'(%d).",
+                      lod_levels.size(), shapes.size());
+    framework::VarDesc* reader =
+        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
+    reader->SetLoDLevels(lod_levels);
+  }
 }

 void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
paddle/fluid/operators/shuffle_channel_op.cc
new file (mode 100644)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/shuffle_channel_op.h"

namespace paddle {
namespace operators {

class ShuffleChannelOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of ShuffleChannelOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of ShuffleChannelOp should not be null.");

    auto input_dims = ctx->GetInputDim("X");
    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");

    ctx->SetOutputDim("Out", input_dims);
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
                                   ctx.device_context());
  }
};

class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "(Tensor, default Tensor<float>), "
             "the input feature data of ShuffleChannelOp, the layout is NCHW.");
    AddOutput("Out",
              "(Tensor, default Tensor<float>), the output of "
              "ShuffleChannelOp. The layout is NCHW.");
    AddAttr<int>("group", "the number of groups.")
        .SetDefault(1)
        .AddCustomChecker([](const int& group) {
          PADDLE_ENFORCE_GE(group, 1, "group should be larger than 0.");
        });

    AddComment(R"DOC(
    Shuffle Channel operator
    This operator shuffles the channels of input x.
    It divides the input channels in each group into several subgroups,
    and obtains a new order by selecting elements from every subgroup one by one.

    Shuffle channel operation makes it possible to build more powerful structures
    with multiple group convolutional layers.
    Please get more information from the following paper:
    https://arxiv.org/pdf/1707.01083.pdf
    )DOC");
  }
};

class ShuffleChannelGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@Grad) should not be null");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                   "Output(X@Grad) should not be null");

    auto input_dims = ctx->GetInputDim("X");
    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");

    ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
                                   ctx.device_context());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp,
                  ops::ShuffleChannelOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);

REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp);

REGISTER_OP_CPU_KERNEL(
    shuffle_channel,
    ops::ShuffleChannelOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::ShuffleChannelOpKernel<paddle::platform::CPUDeviceContext, double>);

REGISTER_OP_CPU_KERNEL(
    shuffle_channel_grad,
    ops::ShuffleChannelGradOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::ShuffleChannelGradOpKernel<paddle::platform::CPUDeviceContext,
                                    double>);
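To make the kernels' index arithmetic concrete, a standalone sketch (not part of the commit) that prints the channel permutation for group = 2 over 6 channels, reproducing the (i * group_column + j) -> (j * group_row + i) remapping used by both the CPU and CUDA kernels:

#include <cstdio>

int main() {
  const int channels = 6, group_row = 2;
  const int group_column = channels / group_row;
  for (int i = 0; i < group_row; ++i) {
    for (int j = 0; j < group_column; ++j) {
      // input channel i*group_column+j moves to output channel j*group_row+i
      std::printf("in %d -> out %d\n", i * group_column + j,
                  j * group_row + i);
    }
  }
  return 0;
}

This prints in 0->0, 1->2, 2->4, 3->1, 4->3, 5->5: channels from different groups end up interleaved, which is exactly the ShuffleNet channel shuffle.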
paddle/fluid/operators/shuffle_channel_op.cu
new file (mode 100644)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/shuffle_channel_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;

static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaximumNumBlocks);
}

template <typename T>
__global__ void ShuffleChannel(const int nthreads, const int feature_map_size,
                               T* output, const T* input, int group_row,
                               int group_column, int len) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (size_t ii = index; ii < nthreads; ii += offset) {
    const int n = index / group_row / group_column / len;
    const int i = (index / group_column / len) % group_row;
    const int j = index / len % group_column;

    const int k = index - (n * feature_map_size + (i * group_column + j) * len);

    T* p_o = output + n * feature_map_size + (j * group_row + i) * len;
    p_o[k] = input[index];
  }
}

template <typename DeviceContext, typename T>
class ShuffleChannelOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<framework::Tensor>("X");
    auto* output = ctx.Output<framework::Tensor>("Out");
    int group = ctx.Attr<int>("group");

    auto input_dims = input->dims();
    auto num = input_dims[0];
    auto channel = input_dims[1];
    auto height = input_dims[2];
    auto weight = input_dims[3];

    auto feature_map_size = channel * height * weight;
    auto sp_sz = height * weight;
    int group_row = group;
    int group_column = channel / group_row;
    // count is the product of NCHW same as numel()
    int count = num * group_column * group_row * sp_sz;

    int blocks = NumBlocks(output->numel());
    int threads = kNumCUDAThreads;

    const T* input_data = input->data<T>();
    T* output_data = output->mutable_data<T>(ctx.GetPlace());

    ShuffleChannel<
        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
        count, feature_map_size, output_data, input_data, group_row,
        group_column, sp_sz);
  }
};

template <typename DeviceContext, typename T>
class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<framework::Tensor>("X");
    int group = ctx.Attr<int>("group");

    auto input_dims = input->dims();
    auto num = input_dims[0];
    auto channel = input_dims[1];
    auto height = input_dims[2];
    auto weight = input_dims[3];
    auto feature_map_size = channel * height * weight;
    auto sp_sz = height * weight;

    int group_row = group;
    int group_column = channel / group_row;

    auto* output_grad =
        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* input_grad =
        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
    T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
    const T* output_grad_data = output_grad->data<T>();

    int blocks = NumBlocks(output_grad->numel());
    int threads = kNumCUDAThreads;
    int count = num * group_column * group_row * sp_sz;

    ShuffleChannel<
        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
        count, feature_map_size, input_grad_data, output_grad_data, group_row,
        group_column, sp_sz);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    shuffle_channel,
    ops::ShuffleChannelOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ShuffleChannelOpCUDAKernel<paddle::platform::CUDADeviceContext,
                                    double>);

REGISTER_OP_CUDA_KERNEL(
    shuffle_channel_grad,
    ops::ShuffleChannelGradOpCUDAKernel<paddle::platform::CUDADeviceContext,
                                        float>,
    ops::ShuffleChannelGradOpCUDAKernel<paddle::platform::CUDADeviceContext,
                                        double>);
paddle/fluid/operators/shuffle_channel_op.h
new file (mode 100644)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ShuffleChannelOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<framework::Tensor>("X");
    auto* output = ctx.Output<framework::Tensor>("Out");
    int group = ctx.Attr<int>("group");

    auto input_dims = input->dims();
    auto num = input_dims[0];
    auto channel = input_dims[1];
    auto height = input_dims[2];
    auto weight = input_dims[3];

    auto feature_map_size = channel * height * weight;
    auto sp_sz = height * weight;
    int group_row = group;
    int group_column = channel / group_row;

    const T* input_data = input->data<T>();
    T* output_data = output->mutable_data<T>(ctx.GetPlace());
    for (int n = 0; n < num; ++n) {
      for (int i = 0; i < group_row; ++i) {
        for (int j = 0; j < group_column; ++j) {
          const T* p_i = input_data + n * feature_map_size +
                         (i * group_column + j) * sp_sz;
          T* p_o =
              output_data + n * feature_map_size + (j * group_row + i) * sp_sz;
          memcpy(p_o, p_i, sizeof(int) * sp_sz);
        }
      }
    }
  }
};

template <typename DeviceContext, typename T>
class ShuffleChannelGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<framework::Tensor>("X");
    int group = ctx.Attr<int>("group");

    auto input_dims = input->dims();
    auto num = input_dims[0];
    auto channel = input_dims[1];
    auto height = input_dims[2];
    auto weight = input_dims[3];
    auto feature_map_size = channel * height * weight;
    auto sp_sz = height * weight;

    int group_row = group;
    int group_column = channel / group_row;

    auto* output_grad =
        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* input_grad =
        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
    T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
    const T* output_grad_data = output_grad->data<T>();
    for (int n = 0; n < num; ++n) {
      for (int i = 0; i < group_row; ++i) {
        for (int j = 0; j < group_column; ++j) {
          const T* p_i = output_grad_data + n * feature_map_size +
                         (i * group_column + j) * sp_sz;
          T* p_o = input_grad_data + n * feature_map_size +
                   (j * group_row + i) * sp_sz;
          memcpy(p_o, p_i, sizeof(int) * sp_sz);
        }
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
@@ -29,8 +29,14 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Xs", "A list of inputs.").AsDuplicable();
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr<std::string>("subgraph", "the subgraph.");
+    AddAttr<std::string>("calibration_data", "the calibration data for int8");
+    AddAttr<std::string>(
+        "engine_key",
+        "The engine_key here is used to distinguish different TRT Engines");
     AddAttr<int>("max_batch_size", "the maximum batch size.");
     AddAttr<int>("workspace_size", "the workspace size.");
+    AddAttr<framework::BlockDesc*>("sub_block", "the trt block");
+    AddAttr<bool>("enable_int8", "whether switch to int8 mode");
     AddComment("TensorRT engine operator.");
   }
 };

@@ -47,6 +53,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference {
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
-                  ops::TensorRTEngineOpMaker);
+                  ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);

 #endif  // PADDLE_WITH_CUDA
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -17,8 +17,10 @@
 #ifdef PADDLE_WITH_CUDA

 #include <string>
+#include <unordered_map>
 #include <vector>

+#include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/inference/analysis/helper.h"

@@ -62,6 +64,9 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
 using inference::Singleton;
 using inference::tensorrt::TensorRTEngine;
+using inference::tensorrt::TRTInt8Calibrator;
+using inference::tensorrt::TRTCalibratorEngine;
+using inference::tensorrt::TRTCalibratorEngineManager;

 class TensorRTEngineOp : public framework::OperatorBase {
  private:

@@ -70,6 +75,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
   mutable std::unique_ptr<TensorRTEngine> trt_engine_;
   int max_batch_size_;
   int workspace_size_;
+  std::unique_ptr<TRTInt8Calibrator> calibrator_;
+  bool enable_int8_;
+  std::string calibration_data_;
+  std::string engine_key_;
+  bool calibration_mode_;

  public:
   TensorRTEngineOp(const std::string& type,

@@ -80,19 +90,96 @@ class TensorRTEngineOp : public framework::OperatorBase {
     input_names_ = Inputs("Xs");
     max_batch_size_ = Attr<int>("max_batch_size");
     workspace_size_ = Attr<int>("workspace_size");
+    enable_int8_ = Attr<bool>("enable_int8");
+    calibration_data_ = Attr<std::string>("calibration_data");
+    engine_key_ = Attr<std::string>("engine_key");

     auto params = Attr<std::vector<std::string>>("parameters");
     for (const auto& param : params) {
       param_names_.insert(param);
     }
+    // calibration_mode being true represents that we need to
+    // generate the calibration table data.
+    calibration_mode_ = (enable_int8_ && calibration_data_.size() == 0);
+
+    VLOG(4) << "calibration_mode: " << calibration_mode_;
+    if (enable_int8_ && calibration_data_.size()) {
+      calibrator_.reset(new TRTInt8Calibrator(calibration_data_));
+    }
   }

  protected:
+  void RunNativeImpl(const framework::Scope& scope,
+                     const platform::Place& dev_place) const {
+    framework::Executor executor(dev_place);
+    auto* block = Attr<framework::BlockDesc*>("sub_block");
+    auto* program = block->Program();
+    auto& current_scope = scope.NewScope();
+    auto ctx = executor.Prepare(*program, block->ID());
+    executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true);
+  }
+
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override {
+    if (calibration_mode_ == true) {
+      RunCalibration(scope, dev_place);
+      return;
+    }
     RunTrt(scope, dev_place);
   }

+  void RunCalibration(const framework::Scope& scope,
+                      const platform::Place& dev_place) const {
+    // This process builds a 32-bit trt engine, runs it on the calibration
+    // set, and records a histogram for each tensor of the distribution of
+    // activation values.
+    LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_
+                         << " is running calibration trt int8... ";
+    int runtime_batch = 1;
+    platform::DeviceContextPool& pool =
+        platform::DeviceContextPool::Instance();
+    auto& dev_ctx = *pool.Get(dev_place);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx).stream();
+    if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_key_)) {
+      TRTCalibratorEngine* calib_res =
+          Singleton<TRTCalibratorEngineManager>::Global().Create(engine_key_);
+      std::unordered_map<std::string, size_t> calib_buffers;
+      for (auto& x : input_names_) {
+        if (param_names_.count(x)) continue;
+        auto& t =
+            inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
+        calib_buffers[x] = t.memory_size();
+        auto t_shape = framework::vectorize(t.dims());
+        runtime_batch = t_shape[0];
+      }
+      calib_res->calib_.reset(new TRTInt8Calibrator(
+          calib_buffers, runtime_batch, engine_key_, dev_place));
+      calib_res->thr_.reset(new std::thread([&]() {
+        calib_res->engine_.reset(new TensorRTEngine(
+            max_batch_size_, workspace_size_, stream,
+            boost::get<platform::CUDAPlace>(dev_place).device, enable_int8_,
+            calib_res->calib_.get()));
+        VLOG(3) << "start the calib trt engine thread";
+        Prepare(scope, dev_place, calib_res->engine_.get());
+      }));
+    }
+
+    TRTInt8Calibrator* temp_calibrator =
+        Singleton<TRTCalibratorEngineManager>::Global()
+            .Get(engine_key_)
+            ->calib_.get();
+    std::unordered_map<std::string, void*> calib_data;
+
+    for (auto& x : Inputs("Xs")) {
+      if (param_names_.count(x)) continue;
+      auto& t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
+      calib_data.emplace(x, t.data<void>());
+    }
+    temp_calibrator->setBatch(calib_data);
+    RunNativeImpl(scope, dev_place);
+  }
+
   void RunTrt(const framework::Scope& scope,
               const platform::Place& dev_place) const {
     int runtime_batch = 1;

@@ -101,9 +188,10 @@ class TensorRTEngineOp : public framework::OperatorBase {
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx).stream();
     if (trt_engine_.get() == nullptr) {
-      trt_engine_.reset(new TensorRTEngine(
-          max_batch_size_, workspace_size_, stream,
-          boost::get<platform::CUDAPlace>(dev_place).device));
+      trt_engine_.reset(
+          new TensorRTEngine(max_batch_size_, workspace_size_, stream,
+                             boost::get<platform::CUDAPlace>(dev_place).device,
+                             enable_int8_, calibrator_.get()));
       Prepare(scope, dev_place, trt_engine_.get());
     }

@@ -173,7 +261,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
   void Prepare(const framework::Scope& scope, const platform::Place& dev_place,
                TensorRTEngine* engine) const {
-    VLOG(4) << "Prepare engine";
+    LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
+                 "kernel etc). This process may cost a lot of time.";
     framework::proto::BlockDesc block_desc;
     block_desc.ParseFromString(Attr<std::string>("subgraph"));
paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
...
@@ -96,19 +96,20 @@ TEST(TensorRTEngineOp, manual) {
  engine_op_desc.SetType("tensorrt_engine");
  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x"}));
  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));

  engine_op_desc.SetBlockAttr("sub_block", &block_desc);
  engine_op_desc.SetAttr("max_batch_size", static_cast<int>(2));
  engine_op_desc.SetAttr("workspace_size", static_cast<int>(1 << 20));
  engine_op_desc.SetAttr("parameters", std::vector<std::string>({}));
  engine_op_desc.SetAttr("engine_key", std::string("a_engine"));
  engine_op_desc.SetAttr("calibration_data", std::string(""));
  engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
  engine_op_desc.SetAttr("output_name_mapping",
                         std::vector<std::string>({"z0"}));
  engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));

  LOG(INFO) << "create engine op";
  auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
  LOG(INFO) << "engine_op " << engine_op.get();

  framework::Scope scope;
...
@@ -190,20 +191,19 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"}));
  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z3"}));

  engine_op_desc.SetBlockAttr("sub_block", &block_desc);
  engine_op_desc.SetAttr("max_batch_size", static_cast<int>(batch_size));
  engine_op_desc.SetAttr("workspace_size", static_cast<int>(1 << 20));
  engine_op_desc.SetAttr("parameters",
                         std::vector<std::string>({"y0", "y1", "y2", "y3"}));
  engine_op_desc.SetAttr("engine_key", std::string("b_engine"));
  engine_op_desc.SetAttr("calibration_data", std::string(""));
  engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
  engine_op_desc.SetAttr("output_name_mapping",
                         std::vector<std::string>({"z3"}));
  engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));

  auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);

  // Execute them.
  engine_op->Run(scope, place);
...
paddle/fluid/operators/warpctc_cudnn_op.cu.cc
...
@@ -144,19 +144,17 @@ class CudnnCTCKernel : public framework::OpKernel<T> {
        CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size));

    T* loss_data = loss->mutable_data<T>(loss_dims, ctx.GetPlace());
    math::SetConstant<DeviceContext, T>()(
        ctx.template device_context<DeviceContext>(), loss, static_cast<T>(0));

    auto temp_allocation =
        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
            workspace_size);
    void* cudnn_workspace = temp_allocation->ptr();

    CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss(
        handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data,
        warpctc_label_lengths.data(), warpctc_logits_lengths.data(), loss_data,
        cu_grad_desc, warpctc_grad_data, CUDNN_CTC_LOSS_ALGO_DETERMINISTIC,
        cu_ctcloss_desc, cudnn_workspace, workspace_size));
  }
};
...
paddle/fluid/platform/cuda_device_function.h
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <cuda.h>
// NOTE(): support float16 to half in header file.
#define PADDLE_CUDA_FP16
...
@@ -30,6 +31,34 @@ namespace platform {
  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif

inline static int RoundToPowerOfTwo(int dim) {
  if (dim > 512) {
    return 1024;
  } else if (dim > 256) {
    return 512;
  } else if (dim > 128) {
    return 256;
  } else if (dim > 64) {
    return 128;
  } else if (dim > 32) {
    return 64;
  } else {
    return 32;
  }
}

#define CUDA_LAUNCH_KERNEL_BASE(dim, ...)  \
  case (dim): {                            \
    constexpr auto kPowerOfTwoDim = (dim); \
    __VA_ARGS__;                           \
  } break

#define CUDA_LAUNCH_KERNEL_HELPER(...)         \
  CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \
  CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \
  CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__);  \
  CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__);

template <typename T>
__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
                                                 int delta, int width = 32) {
...
paddle/fluid/platform/device_context.cc
...
@@ -30,8 +30,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
  auto it = device_contexts_.find(place);
  if (it == device_contexts_.end()) {
    PADDLE_THROW(
        "Place %s is not supported, Please re-compile with WITH_GPU "
        "option",
        place);
  }
  return it->second.get().get();
}
...
paddle/fluid/platform/gpu_info.cc
...
@@ -15,6 +15,8 @@ limitations under the License. */

#include "paddle/fluid/platform/gpu_info.h"
#include <algorithm>
#include <cstdlib>
#include <string>

#include "gflags/gflags.h"
#include "paddle/fluid/platform/enforce.h"
...
@@ -58,7 +60,18 @@ DEFINE_string(selected_gpus, "",
namespace paddle {
namespace platform {

static int GetCUDADeviceCountImpl() {
  const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES");
  if (cuda_visible_devices != nullptr) {
    std::string cuda_visible_devices_str(cuda_visible_devices);
    if (std::all_of(cuda_visible_devices_str.begin(),
                    cuda_visible_devices_str.end(),
                    [](char ch) { return ch == ' '; })) {
      VLOG(2) << "CUDA_VISIBLE_DEVICES is set to be empty. No GPU detected.";
      return 0;
    }
  }

  int count;
  PADDLE_ENFORCE(
      cudaGetDeviceCount(&count),
...
@@ -66,6 +79,11 @@ int GetCUDADeviceCount() {
  return count;
}

int GetCUDADeviceCount() {
  static auto dev_cnt = GetCUDADeviceCountImpl();
  return dev_cnt;
}

int GetCUDAComputeCapability(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
  cudaDeviceProp device_prop;
...
@@ -203,13 +221,17 @@ size_t GpuMaxChunkSize() {
void GpuMemcpyAsync(void* dst, const void* src, size_t count,
                    enum cudaMemcpyKind kind, cudaStream_t stream) {
  PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
                 "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync "
                 "(%p -> %p, length: %d)",
                 src, dst, static_cast<int>(count));
}

void GpuMemcpySync(void* dst, const void* src, size_t count,
                   enum cudaMemcpyKind kind) {
  PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync (%p -> "
                 "%p, length: %d)",
                 src, dst, static_cast<int>(count));
}

void GpuMemcpyPeerAsync(void* dst, int dst_device, const void* src,
...
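Two effects of this change are visible from the Python side: the CUDA device count is computed only once per process (cached in a function-local static), and a blank CUDA_VISIBLE_DEVICES now yields zero devices instead of an error from cudaGetDeviceCount. A rough sketch of checking this, assuming the count is exposed to Python as core.get_cuda_device_count:

    import os

    # Must be set before paddle first queries CUDA, since the result
    # is now cached in a function-local static.
    os.environ['CUDA_VISIBLE_DEVICES'] = ' '

    import paddle.fluid.core as core
    print(core.get_cuda_device_count())  # expected: 0 (no GPU detected)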
paddle/fluid/pybind/imperative.cc
...
@@ -15,18 +15,38 @@ limitations under the License. */

#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/imperative/type_defs.h"

namespace paddle {
namespace pybind {

// Bind Methods
void BindTracer(pybind11::module *m) {
  pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
      .def("__init__",
           [](imperative::Tracer &self, framework::BlockDesc *root_block) {
             new (&self) imperative::Tracer(root_block);
           })
      .def("trace",
           [](imperative::Tracer &self, imperative::OpBase *op,
              const imperative::VarBasePtrMap &inputs,
              const imperative::VarBasePtrMap &outputs,
              framework::BlockDesc *block,
              const platform::CPUPlace expected_place,
              const bool stop_gradient = false) {
             self.Trace(op, inputs, outputs, block, expected_place,
                        stop_gradient);
           })
      .def("trace",
           [](imperative::Tracer &self, imperative::OpBase *op,
              const imperative::VarBasePtrMap &inputs,
              const imperative::VarBasePtrMap &outputs,
              framework::BlockDesc *block,
              const platform::CUDAPlace expected_place,
              const bool stop_gradient = false) {
             self.Trace(op, inputs, outputs, block, expected_place,
                        stop_gradient);
           })
      .def("py_trace", &imperative::Tracer::PyTrace,
           pybind11::return_value_policy::take_ownership);
}
...
paddle/fluid/pybind/inference_api.cc
...
@@ -180,8 +180,14 @@ void BindNativePredictor(py::module *m) {
}

void BindAnalysisConfig(py::module *m) {
  py::class_<AnalysisConfig> analysis_config(*m, "AnalysisConfig");

  py::enum_<AnalysisConfig::Precision>(analysis_config, "Precision")
      .value("Float32", AnalysisConfig::Precision::kFloat32)
      .value("Int8", AnalysisConfig::Precision::kInt8)
      .export_values();

  analysis_config.def(py::init<const AnalysisConfig &>())
      .def(py::init<const std::string &>())
      .def(py::init<const std::string &, const std::string &>())
      .def("set_model", (void (AnalysisConfig::*)(const std::string &)) &
...
@@ -215,7 +221,8 @@ void BindAnalysisConfig(py::module *m) {
      .def("specify_input_name", &AnalysisConfig::specify_input_name)
      .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine,
           py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
           py::arg("min_subgraph_size") = 3,
           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32)
      .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
      .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
           py::arg("x") = true)
...
浏览文件 @
c7e38680
...
@@ -138,6 +138,22 @@ PYBIND11_MODULE(core, m) {
...
@@ -138,6 +138,22 @@ PYBIND11_MODULE(core, m) {
.
def
(
"_grad_ivar"
,
.
def
(
"_grad_ivar"
,
[](
const
imperative
::
VarBase
&
self
)
{
return
self
.
grads_
;
},
[](
const
imperative
::
VarBase
&
self
)
{
return
self
.
grads_
;
},
py
::
return_value_policy
::
reference
)
py
::
return_value_policy
::
reference
)
.
def
(
"_copy_to"
,
[](
const
imperative
::
VarBase
&
self
,
const
platform
::
CPUPlace
&
place
,
bool
blocking
)
{
std
::
unique_ptr
<
imperative
::
VarBase
>
new_var
=
self
.
NewVarBase
(
place
,
blocking
);
return
new_var
.
release
();
},
py
::
return_value_policy
::
take_ownership
)
.
def
(
"_copy_to"
,
[](
const
imperative
::
VarBase
&
self
,
const
platform
::
CUDAPlace
&
place
,
bool
blocking
)
{
std
::
unique_ptr
<
imperative
::
VarBase
>
new_var
=
self
.
NewVarBase
(
place
,
blocking
);
return
new_var
.
release
();
},
py
::
return_value_policy
::
take_ownership
)
.
def
(
"value"
,
[](
const
imperative
::
VarBase
&
self
)
{
return
self
.
var_
;
},
.
def
(
"value"
,
[](
const
imperative
::
VarBase
&
self
)
{
return
self
.
var_
;
},
py
::
return_value_policy
::
reference
)
py
::
return_value_policy
::
reference
)
.
def_property
(
.
def_property
(
...
@@ -469,6 +485,7 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -469,6 +485,7 @@ All parameter, weight, gradient are variables in Paddle.
py
::
return_value_policy
::
reference
);
py
::
return_value_policy
::
reference
);
py
::
class_
<
framework
::
ReaderHolder
>
(
m
,
"Reader"
,
""
)
py
::
class_
<
framework
::
ReaderHolder
>
(
m
,
"Reader"
,
""
)
.
def
(
"start"
,
&
framework
::
ReaderHolder
::
Start
)
.
def
(
"reset"
,
&
framework
::
ReaderHolder
::
ResetAll
);
.
def
(
"reset"
,
&
framework
::
ReaderHolder
::
ResetAll
);
using
LoDTensorBlockingQueue
=
using
LoDTensorBlockingQueue
=
...
@@ -489,19 +506,12 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -489,19 +506,12 @@ All parameter, weight, gradient are variables in Paddle.
.
def
(
"is_closed"
,
&
LoDTensorBlockingQueue
::
IsClosed
);
.
def
(
"is_closed"
,
&
LoDTensorBlockingQueue
::
IsClosed
);
m
.
def
(
"init_lod_tensor_blocking_queue"
,
m
.
def
(
"init_lod_tensor_blocking_queue"
,
[](
Variable
&
var
,
size_t
capacity
,
[](
Variable
&
var
,
const
std
::
vector
<
std
::
vector
<
int64_t
>>
&
shapes
)
size_t
capacity
)
->
std
::
shared_ptr
<
LoDTensorBlockingQueue
>
{
->
std
::
shared_ptr
<
LoDTensorBlockingQueue
>
{
auto
*
holder
=
var
.
GetMutable
<
LoDTensorBlockingQueueHolder
>
();
std
::
vector
<
DDim
>
dims
(
shapes
.
size
());
holder
->
InitOnce
(
capacity
,
FLAGS_reader_queue_speed_test_mode
);
std
::
transform
(
shapes
.
begin
(),
shapes
.
end
(),
dims
.
begin
(),
return
holder
->
GetQueue
();
[](
const
std
::
vector
<
int64_t
>
&
shape
)
{
},
return
make_ddim
(
shape
);
});
auto
*
holder
=
var
.
GetMutable
<
LoDTensorBlockingQueueHolder
>
();
holder
->
InitOnce
(
capacity
,
dims
,
FLAGS_reader_queue_speed_test_mode
);
return
holder
->
GetQueue
();
},
py
::
return_value_policy
::
copy
);
py
::
return_value_policy
::
copy
);
py
::
class_
<
Scope
>
(
m
,
"_Scope"
,
R"DOC(
py
::
class_
<
Scope
>
(
m
,
"_Scope"
,
R"DOC(
...
@@ -626,7 +636,18 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -626,7 +636,18 @@ All parameter, weight, gradient are variables in Paddle.
py
::
class_
<
platform
::
Communicator
>
(
m
,
"Communicator"
).
def
(
py
::
init
<>
());
py
::
class_
<
platform
::
Communicator
>
(
m
,
"Communicator"
).
def
(
py
::
init
<>
());
#endif
#endif
py
::
class_
<
platform
::
CUDAPlace
>
(
m
,
"CUDAPlace"
)
py
::
class_
<
platform
::
CUDAPlace
>
(
m
,
"CUDAPlace"
)
.
def
(
py
::
init
<
int
>
())
.
def
(
"__init__"
,
[](
platform
::
CUDAPlace
&
self
,
int
dev_id
)
{
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE
(
dev_id
>=
0
&&
dev_id
<
platform
::
GetCUDADeviceCount
(),
"Invalid CUDAPlace(%d), must inside [0, %d)"
,
dev_id
,
platform
::
GetCUDADeviceCount
());
new
(
&
self
)
platform
::
CUDAPlace
(
dev_id
);
#else
PADDLE_THROW
(
"Cannot use CUDAPlace in CPU only version"
);
#endif
})
.
def
(
"__str__"
,
string
::
to_string
<
const
platform
::
CUDAPlace
&>
);
.
def
(
"__str__"
,
string
::
to_string
<
const
platform
::
CUDAPlace
&>
);
py
::
class_
<
paddle
::
platform
::
CPUPlace
>
(
m
,
"CPUPlace"
)
py
::
class_
<
paddle
::
platform
::
CPUPlace
>
(
m
,
"CPUPlace"
)
...
@@ -634,7 +655,12 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -634,7 +655,12 @@ All parameter, weight, gradient are variables in Paddle.
.
def
(
"__str__"
,
string
::
to_string
<
const
platform
::
CPUPlace
&>
);
.
def
(
"__str__"
,
string
::
to_string
<
const
platform
::
CPUPlace
&>
);
py
::
class_
<
paddle
::
platform
::
CUDAPinnedPlace
>
(
m
,
"CUDAPinnedPlace"
)
py
::
class_
<
paddle
::
platform
::
CUDAPinnedPlace
>
(
m
,
"CUDAPinnedPlace"
)
.
def
(
py
::
init
<>
())
.
def
(
"__init__"
,
[](
platform
::
CUDAPinnedPlace
&
)
{
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW
(
"Cannot use CUDAPinnedPlace in CPU only version"
);
#endif
})
.
def
(
"__str__"
,
string
::
to_string
<
const
platform
::
CUDAPinnedPlace
&>
);
.
def
(
"__str__"
,
string
::
to_string
<
const
platform
::
CUDAPinnedPlace
&>
);
py
::
class_
<
platform
::
Place
>
(
m
,
"Place"
)
py
::
class_
<
platform
::
Place
>
(
m
,
"Place"
)
...
@@ -1005,7 +1031,7 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -1005,7 +1031,7 @@ All parameter, weight, gradient are variables in Paddle.
PADDLE_ENFORCE
(
!
self
.
IsFinalized
(),
"BuildStrategy is finlaized."
);
PADDLE_ENFORCE
(
!
self
.
IsFinalized
(),
"BuildStrategy is finlaized."
);
self
.
remove_unnecessary_lock_
=
b
;
self
.
remove_unnecessary_lock_
=
b
;
},
},
R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default
Fals
e.)DOC"
)
R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default
Tru
e.)DOC"
)
.
def_property
(
.
def_property
(
"num_trainers"
,
"num_trainers"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
num_trainers_
;
},
[](
const
BuildStrategy
&
self
)
{
return
self
.
num_trainers_
;
},
...
...
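The custom CUDAPlace __init__ turns an out-of-range device id into an immediate error at construction time rather than a crash at first use. A sketch of the new behavior on a single-GPU machine:

    import paddle.fluid as fluid

    place = fluid.CUDAPlace(0)  # ok: device 0 exists
    # fluid.CUDAPlace(8)        # now raises: Invalid CUDAPlace(8), must inside [0, 1)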
paddle/scripts/paddle_build.sh
...
@@ -173,7 +173,6 @@ function cmake_gen() {
        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
        ${PYTHON_FLAGS}
        -DWITH_DSO=ON
        -DWITH_GPU=${WITH_GPU:-OFF}
        -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
        -DWITH_DISTRIBUTE=${distibuted_flag}
...
@@ -208,7 +207,6 @@ EOF
        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
        ${PYTHON_FLAGS} \
        -DWITH_DSO=ON \
        -DWITH_GPU=${WITH_GPU:-OFF} \
        -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
        -DWITH_DISTRIBUTE=${distibuted_flag} \
...
@@ -328,7 +326,8 @@ function run_brpc_test() {
    ========================================
EOF
    set +x
    declare -a other_tests=("test_listen_and_serv_op" "system_allocator_test" \
        "rpc_server_test" "varhandle_test" "collective_server_test" "brpc_serde_test")
    all_tests=`ctest -N`

    for t in "${other_tests[@]}"
...
@@ -527,31 +526,6 @@ function bind_test() {
    wait
}

function gen_doc_lib() {
    mkdir -p ${PADDLE_ROOT}/build
    cd ${PADDLE_ROOT}/build
...
@@ -563,7 +537,6 @@ function gen_doc_lib() {
EOF
    cmake .. \
        -DCMAKE_BUILD_TYPE=Release \
        -DWITH_GPU=OFF \
        -DWITH_MKL=OFF \
        -DWITH_FLUID_ONLY=ON
...
@@ -802,9 +775,6 @@ function main() {
        bind_test)
            bind_test
            ;;
        gen_doc_lib)
            gen_doc_lib $2
            ;;
...
python/paddle/fluid/contrib/__init__.py
...
@@ -22,6 +22,8 @@ from . import op_frequence
from .op_frequence import *
from . import quantize
from .quantize import *
from . import reader
from .reader import *
from . import slim
from .slim import *
from . import utils
...
@@ -32,5 +34,6 @@ __all__ += decoder.__all__
__all__ += memory_usage_calc.__all__
__all__ += op_frequence.__all__
__all__ += quantize.__all__
__all__ += reader.__all__
__all__ += slim.__all__
__all__ += utils.__all__
python/paddle/fluid/contrib/int8_inference/utility.py
...
@@ -32,10 +32,13 @@ class Calibrator(object):
    def __init__(self, *args, **kwargs):
        self.program = kwargs['program']
        self.pretrained_model = kwargs['pretrained_model']
        self.debug = kwargs['debug'] if 'debug' in kwargs else False
        self.algo = kwargs['algo']
        self.output = kwargs['output']
        self.feed_var_names = kwargs['feed_var_names']
        self.fetch_list = kwargs['fetch_list']
        self.exe = kwargs['exe']

        self._conv_input_var_name = []
        self._conv_output_var_name = []
...
@@ -54,17 +57,38 @@ class Calibrator(object):
        self._u8_output_var = []
        self._s8_output_var = []
        self._persistable_vars = []
        self._sampling_data = {}

    def generate_sampling_program(self):
        self.__init_analysis()
        self.__generate_output_program()

    def save_int8_model(self):
        self.__sampling(self._sampling_data)
        self.__save_scale()
        self.__update_program()
        self.__update_output_program_attr()
        self.__display_debug()
        self.__save_offline_model()

    def sample_data(self):
        '''
        Sampling the tensor data of variable.
        '''
        for i in self.sampling_program.list_vars():
            if i.name in self.sampling_vars:
                np_data = np.array(fluid.global_scope().find_var(i.name)
                                   .get_tensor())
                if i.name not in self._sampling_data:
                    self._sampling_data[i.name] = []
                self._sampling_data[i.name].append(np_data)

    def __save_offline_model(self):
        '''
        Save the quantized model to the disk.
        '''
        fluid.io.save_inference_model(self.output, self.feed_var_names,
                                      self.fetch_list, self.exe,
                                      self.sampling_program)

    def __display_debug(self):
        if self.debug:
...
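Together with the test changes further below, the reworked Calibrator now owns its sampled data: the caller runs the sampling program batch by batch, calls sample_data() after each run, and finishes with save_int8_model(). A rough usage sketch; the program, executor, reader, and feed/fetch variables are assumed to be prepared elsewhere:

    import paddle.fluid as fluid
    import int8_inference.utility as int8_utility

    calibrator = int8_utility.Calibrator(
        program=infer_program,
        pretrained_model=model_path,
        algo='KL',
        exe=exe,
        output='calibration_out',
        feed_var_names=feed_names,
        fetch_list=fetch_targets)
    calibrator.generate_sampling_program()
    for batch_id, data in enumerate(val_reader()):
        exe.run(calibrator.sampling_program,
                feed={feed_names[0]: data},
                fetch_list=fetch_targets)
        calibrator.sample_data()  # accumulate activation tensors for this batch
    calibrator.save_int8_model()  # quantize and write the model to `output`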
python/paddle/fluid/contrib/reader/README.md
0 → 100644
## CTR READER

A multi-threaded C++ reader that has the same interface as py_reader. It
uses C++ threads to read files and is much faster than the Python read
thread in py_reader.

Currently, it supports two types of file:

- gzip
- plain text file

and two types of data format:

- the csv data format is:
  * label dense_fea,dense_fea sparse_fea,sparse_fea
- the svm data format is:
  * label slot1:fea_sign slot2:fea_sign slot1:fea_sign
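For illustration, one hypothetical record per format might look as follows (all feature values invented):

- csv: `1 0.5,1.3 102,3502`
- svm: `0 1:8935 2:142 1:8936`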
python/paddle/fluid/contrib/reader/__init__.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

from . import ctr_reader

__all__ = ctr_reader.__all__
python/paddle/fluid/contrib/reader/ctr_reader.py
...
@@ -20,6 +20,8 @@ from paddle.fluid.framework import default_main_program, \
    default_startup_program, Variable
from paddle.fluid.unique_name import generate as unique_name

__all__ = ['ctr_reader']


def monkey_patch_reader_methods(reader):
    def __get_reader__():
...
@@ -30,7 +32,11 @@ def monkey_patch_reader_methods(reader):
    def reset():
        return __get_reader__().reset()

    def start():
        return __get_reader__().start()

    reader.reset = reset
    reader.start = start
    reader.stop_gradient = True
    reader.persistable = True
    return reader
...
@@ -44,13 +50,18 @@ def _copy_reader_var_(block, var):
    return new_var


def ctr_reader(feed_dict,
               file_type,  # gzip or plain
               file_format,  # csv or svm
               dense_slot_index,
               sparse_slot_index,
               capacity,
               thread_num,
               batch_size,
               file_list,
               slots,
               name=None):
    """
    Create a CTR reader for data feeding in Python
...
@@ -67,12 +78,21 @@ def ctr_reader(feed_data,
    Note that :code:`Program.clone()` method cannot clone :code:`py_reader`.

    Args:
        feed_dict(list(variable)): a list of data variables.
        file_type('gzip'|'plain'): the type of the data file.
        file_format('csv'|'svm'): csv data or svm data format.
            the csv data format is:
                label dense_fea,dense_fea sparse_fea,sparse_fea
            the svm data format is:
                label slot1:fea_sign slot2:fea_sign slot1:fea_sign
        dense_slot_index(list(int)): the index of dense slots.
        sparse_slot_index(list(int)): the index of sparse slots.
        capacity(int): The buffer capacity maintained by :code:`py_reader`.
        thread_num(int): the thread num to read files by cpp reader.
        batch_size(int): batch size of data.
        file_list(list(str)): List of file names that need to read.
        slots(list(int64)): list of slot id.
        name(string): The prefix Python queue name and Reader name. None will
            be generated automatically.

    Returns:
...
@@ -80,7 +100,15 @@ def ctr_reader(feed_data,
    Examples:

        1. The basic usage of :code:`ctr_reader` is as follows:

        .. code-block:: python

            py_reader = fluid.contrib.ctr_reader.ctr_reader(
                feed_dict=datas, file_type='plain', file_format='csv',
                file_list=file_list, dense_slot_index=[1, 2, 3, 4], sparse_slot_index=[],
                capacity=64, thread_num=20, batch_size=1000, slots=[], name='ctr_reader')
    """
    if name is None:
        queue_name = unique_name('lod_tensor_blocking_queue')
...
@@ -90,7 +118,7 @@ def ctr_reader(feed_data,
    reader_name = "_".join([name, "reader"])
    var = global_scope().var(queue_name)
    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity)
    startup_blk = default_startup_program().current_block()
    reader_var = startup_blk.create_var(name=reader_name)
...
@@ -99,12 +127,22 @@ def ctr_reader(feed_data,
        inputs={'blocking_queue': [queue_name]},
        outputs={'Out': [reader_var]},
        attrs={
            'use_data_config': False,
            'thread_num': thread_num,
            'batch_size': batch_size,
            'file_list': file_list,
            'file_type': file_type,
            'file_format': file_format,
            'dense_slot_index': dense_slot_index,
            'sparse_slot_index': sparse_slot_index,
            'sparse_slots': slots,
            'ranks': [],
            'lod_levels': [],
            'shape_concat': []
        })

    dtypes = [data.dtype for data in feed_dict]
    reader_var.desc.set_dtypes(dtypes)
    reader_var.persistable = True

    main_prog_reader_var = _copy_reader_var_(
...
@@ -118,6 +156,9 @@ def ctr_reader(feed_data,
    main_blk = default_main_program().current_block()
    main_blk.append_op(
        type='read',
        inputs={'Reader': [reader]},
        attrs={'infer_out': False},
        outputs={'Out': feed_dict})
    return reader
python/paddle/fluid/contrib/tests/test_calibration.py
...
@@ -23,10 +23,11 @@ import argparse
import functools
import contextlib
import paddle.fluid.profiler as profiler
from paddle.dataset.common import download
from PIL import Image, ImageEnhance
import math
sys.path.append('..')
import int8_inference.utility as int8_utility

random.seed(0)
np.random.seed(0)
...
@@ -116,27 +117,43 @@ def val(data_dir=DATA_DIR):
    return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir)


class TestCalibrationForResnet50(unittest.TestCase):
    def setUp(self):
        self.int8_download = 'int8/download'
        self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
                                               self.int8_download)

        data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz'
        data_md5 = '1b6c1c434172cca1bf9ba1e4d7a3157d'
        self.data_cache_folder = self.download_data(data_url, data_md5, "data")

        # reader/decorator.py requires the relative path to the data folder
        cmd = 'rm -rf {0} && ln -s {1} {0}'.format("data",
                                                   self.data_cache_folder)
        os.system(cmd)

        self.iterations = 50
        self.skip_batch_num = 5

    def cache_unzipping(self, target_folder, zip_path):
        if not os.path.exists(target_folder):
            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
                                                          zip_path)
            os.system(cmd)

    def download_data(self, data_url, data_md5, folder_name):
        download(data_url, self.int8_download, data_md5)
        data_cache_folder = os.path.join(self.cache_folder, folder_name)
        file_name = data_url.split('/')[-1]
        zip_path = os.path.join(self.cache_folder, file_name)
        self.cache_unzipping(data_cache_folder, zip_path)
        return data_cache_folder

    def download_resnet50_model(self):
        # resnet50 fp32 data
        data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz'
        data_md5 = '4a5194524823d9b76da6e738e1367881'
        self.model_cache_folder = self.download_data(data_url, data_md5,
                                                     "resnet50_fp32")

    def run_program(self, model_path, generate_int8=False, algo='direct'):
        image_shape = [3, 224, 224]
...
@@ -163,16 +180,15 @@ class TestCalibration(unittest.TestCase):
        print("Start calibration ...")

        calibrator = int8_utility.Calibrator(
            program=infer_program,
            pretrained_model=model_path,
            algo=algo,
            exe=exe,
            output=int8_model,
            feed_var_names=feed_dict,
            fetch_list=fetch_targets)
        calibrator.generate_sampling_program()

        test_info = []
        cnt = 0
        for batch_id, data in enumerate(val_reader()):
...
@@ -192,13 +208,7 @@ class TestCalibration(unittest.TestCase):
                          feed_dict[1]: label},
                    fetch_list=fetch_targets)
            if generate_int8:
                calibrator.sample_data()

            test_info.append(np.mean(acc1) * len(data))
            cnt += len(data)
...
@@ -209,18 +219,35 @@ class TestCalibration(unittest.TestCase):
                break

        if generate_int8:
            calibrator.save_int8_model()

            print(
                "Calibration is done and the corresponding files are generated at {}".
                format(os.path.abspath("calibration_out")))
        else:
            return np.sum(test_info) / cnt

    def test_calibration(self):
        self.download_resnet50_model()
        fp32_acc1 = self.run_program(self.model_cache_folder + "/model")
        self.run_program(self.model_cache_folder + "/model", True)
        int8_acc1 = self.run_program("calibration_out")
        delta_value = np.abs(fp32_acc1 - int8_acc1)
        self.assertLess(delta_value, 0.01)


class TestCalibrationForMobilenetv1(TestCalibrationForResnet50):
    def download_mobilenetv1_model(self):
        # mobilenetv1 fp32 data
        data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
        data_md5 = '13892b0716d26443a8cdea15b3c6438b'
        self.model_cache_folder = self.download_data(data_url, data_md5,
                                                     "mobilenetv1_fp32")

    def test_calibration(self):
        self.download_mobilenetv1_model()
        fp32_acc1 = self.run_program(self.model_cache_folder + "/model")
        self.run_program(self.model_cache_folder + "/model", True, algo='KL')
        int8_acc1 = self.run_program("calibration_out")
        delta_value = np.abs(fp32_acc1 - int8_acc1)
        self.assertLess(delta_value, 0.01)
...
python/paddle/fluid/framework.py
...
@@ -70,6 +70,7 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()

_imperative_tracer_ = None
_imperative_current_expected_place_ = None


def _in_imperative_mode():
...
@@ -80,6 +81,10 @@ def _imperative_tracer():
    return _imperative_tracer_


def _current_expected_place():
    return _imperative_current_expected_place_


class NameScope(object):
    def __init__(self, name="", parent=None):
        self._children = dict()
...
@@ -383,8 +388,8 @@ class Variable(object):
            self._ivar.stop_gradient = stop_gradient

    def _numpy(self):
        new_ivar = self._ivar._copy_to(core.CPUPlace(), True)
        return np.array(new_ivar.value().get_tensor())

    def _backward(self):
        self._ivar._run_backward()
...
@@ -1311,6 +1316,7 @@ class Block(object):
    def _trace_op(self, op, stop_gradient=False):
        if _in_imperative_mode():
            _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc,
                                       _imperative_current_expected_place_,
                                       stop_gradient)

    def _insert_op(self, index, *args, **kwargs):
...
@@ -2502,5 +2508,18 @@ def _imperative_guard(tracer):
    global _imperative_tracer_
    tmp_trace = _imperative_tracer_
    _imperative_tracer_ = tracer

    yield

    _imperative_tracer_ = tmp_trace


@contextlib.contextmanager
def _imperative_place_guard(place):
    global _imperative_current_expected_place_
    tmp_place = _imperative_current_expected_place_
    _imperative_current_expected_place_ = place

    yield

    _imperative_current_expected_place_ = tmp_place
python/paddle/fluid/imperative/base.py
...
@@ -25,18 +25,28 @@ def enabled():
@contextlib.contextmanager
def guard(place=None):
    train = framework.Program()
    startup = framework.Program()
    tracer = core.Tracer(train.current_block().desc)

    if place is None:
        if core.is_compiled_with_cuda():
            place = core.CUDAPlace(0)
        else:
            place = core.CPUPlace()

    with framework.program_guard(train, startup):
        with framework.unique_name.guard():
            with framework._imperative_guard(tracer):
                with framework._imperative_place_guard(place):
                    yield


def to_variable(value, block=None):
    if isinstance(value, np.ndarray):
        assert enabled(), "to_variable could only be called in imperative mode"

        if not block:
            block = framework.default_main_program().current_block()
        py_var = framework.Variable(
...
@@ -47,9 +57,7 @@ def to_variable(value, block=None):
            dtype=value.dtype)
        var = py_var._ivar.value()
        tensor = var.get_tensor()
        tensor.set(value, framework._current_expected_place())
        return py_var
    elif isinstance(value, framework.Variable):
        return value
    else:
        raise ValueError("Unsupported type %s" % type(value))
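guard() now takes an optional place and defaults to the GPU when Paddle is built with CUDA. A minimal sketch of pinning an imperative run to the CPU (the data is illustrative):

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid import core

    with fluid.imperative.guard(place=core.CPUPlace()):
        # tensors created inside the guard are placed on the chosen device
        x = fluid.imperative.to_variable(np.ones([2, 2], dtype='float32'))
        print(x._numpy())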
python/paddle/fluid/imperative/nn.py
浏览文件 @
c7e38680
...
@@ -27,6 +27,7 @@ __all__ = [
...
@@ -27,6 +27,7 @@ __all__ = [
'Conv2D'
,
'Conv2D'
,
'Pool2D'
,
'Pool2D'
,
'FC'
,
'FC'
,
'BatchNorm'
,
]
]
...
@@ -55,7 +56,8 @@ class Conv2D(layers.Layer):
...
@@ -55,7 +56,8 @@ class Conv2D(layers.Layer):
param_attr
=
param_attr
,
param_attr
=
param_attr
,
bias_attr
=
bias_attr
,
bias_attr
=
bias_attr
,
dtype
=
dtype
,
dtype
=
dtype
,
name
=
name
)
name
=
name
,
act
=
act
)
self
.
_groups
=
groups
self
.
_groups
=
groups
self
.
_stride
=
utils
.
convert_to_list
(
stride
,
2
,
'stride'
)
self
.
_stride
=
utils
.
convert_to_list
(
stride
,
2
,
'stride'
)
...
@@ -141,6 +143,7 @@ class Conv2D(layers.Layer):
...
@@ -141,6 +143,7 @@ class Conv2D(layers.Layer):
outputs
=
{
'Out'
:
[
pre_act
]},
outputs
=
{
'Out'
:
[
pre_act
]},
attrs
=
{
'axis'
:
1
})
attrs
=
{
'axis'
:
1
})
# Currently, we don't support inplace in imperative mode
return
self
.
_helper
.
append_activation
(
pre_act
)
return
self
.
_helper
.
append_activation
(
pre_act
)
...
@@ -216,6 +219,7 @@ class FC(layers.Layer):
...
@@ -216,6 +219,7 @@ class FC(layers.Layer):
act
=
None
,
act
=
None
,
name
=
None
):
name
=
None
):
super
(
FC
,
self
).
__init__
()
super
(
FC
,
self
).
__init__
()
self
.
_size
=
size
self
.
_size
=
size
self
.
_num_flatten_dims
=
num_flatten_dims
self
.
_num_flatten_dims
=
num_flatten_dims
self
.
_dtype
=
dtype
self
.
_dtype
=
dtype
...
@@ -241,6 +245,16 @@ class FC(layers.Layer):
...
@@ -241,6 +245,16 @@ class FC(layers.Layer):
dtype
=
self
.
_dtype
,
dtype
=
self
.
_dtype
,
is_bias
=
False
)
is_bias
=
False
)
if
self
.
_helper
.
bias_attr
:
size
=
list
([
self
.
_size
])
self
.
_b
=
self
.
_helper
.
create_parameter
(
attr
=
self
.
_helper
.
bias_attr
,
shape
=
size
,
dtype
=
self
.
_dtype
,
is_bias
=
True
)
else
:
self
.
_b
=
None
def
forward
(
self
,
input
):
def
forward
(
self
,
input
):
tmp
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dtype
)
tmp
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dtype
)
self
.
_helper
.
append_op
(
self
.
_helper
.
append_op
(
...
@@ -253,28 +267,155 @@ class FC(layers.Layer):
...
@@ -253,28 +267,155 @@ class FC(layers.Layer):
"y_num_col_dims"
:
1
"y_num_col_dims"
:
1
})
})
out
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dtype
)
pre_bias
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dtype
)
self
.
_helper
.
append_op
(
self
.
_helper
.
append_op
(
type
=
"sum"
,
type
=
"sum"
,
inputs
=
{
"X"
:
[
tmp
]},
inputs
=
{
"X"
:
[
tmp
]},
outputs
=
{
"Out"
:
out
},
outputs
=
{
"Out"
:
pre_bias
},
attrs
=
{
"use_mkldnn"
:
False
})
attrs
=
{
"use_mkldnn"
:
False
})
bias_attr
=
self
.
_helper
.
bias_attr
if
self
.
_b
:
if
bias_attr
:
pre_activation
=
self
.
_helper
.
create_variable_for_type_inference
(
# add bias
dtype
=
self
.
_dtype
)
size
=
list
(
out
.
shape
[
1
:])
if
not
self
.
_built
:
self
.
_b
=
self
.
_helper
.
create_parameter
(
attr
=
bias_attr
,
shape
=
size
,
dtype
=
out
.
dtype
,
is_bias
=
True
)
bias_out
=
self
.
_helper
.
create_variable_for_type_inference
(
dtype
=
out
.
dtype
)
self
.
_helper
.
append_op
(
self
.
_helper
.
append_op
(
type
=
'elementwise_add'
,
type
=
'elementwise_add'
,
inputs
=
{
'X'
:
[
out
],
inputs
=
{
'X'
:
[
pre_bias
],
'Y'
:
[
self
.
_b
]},
'Y'
:
[
self
.
_b
]},
outputs
=
{
'Out'
:
[
bias_out
]},
outputs
=
{
'Out'
:
[
pre_activation
]},
attrs
=
{
'axis'
:
1
})
attrs
=
{
'axis'
:
self
.
_num_flatten_dims
})
out
=
bias_out
else
:
# add activation
pre_activation
=
pre_bias
return
self
.
_helper
.
append_activation
(
out
)
# Currently, we don't support inplace in imperative mode
return
self
.
_helper
.
append_activation
(
pre_activation
)
class BatchNorm(layers.Layer):
    def __init__(self,
                 num_channels,
                 act=None,
                 is_test=False,
                 momentum=0.9,
                 epsilon=1e-05,
                 param_attr=None,
                 bias_attr=None,
                 dtype=core.VarDesc.VarType.FP32,
                 data_layout='NCHW',
                 in_place=False,
                 name=None,
                 moving_mean_name=None,
                 moving_variance_name=None,
                 do_model_average_for_mean_and_var=False,
                 fuse_with_relu=False,
                 use_global_stats=False):
        super(BatchNorm, self).__init__()

        assert bias_attr is not False, "bias_attr should not be False in batch_norm."

        from ..layer_helper import LayerHelper
        self._helper = LayerHelper(
            'batch_norm',
            param_attr=param_attr,
            bias_attr=bias_attr,
            name=name,
            act=act)

        if dtype == core.VarDesc.VarType.FP16:
            self._dtype = core.VarDesc.VarType.FP32
        else:
            self._dtype = dtype

        param_shape = [num_channels]

        # create parameter
        self._scale = self._helper.create_parameter(
            attr=self._helper.param_attr,
            shape=param_shape,
            dtype=self._dtype,
            default_initializer=Constant(1.0))
        # TODO(minqiyang): change stop_gradient sign to trainable to align with static graph
        # # setting stop_gradient=True to reduce computation
        # if use_global_stats and self._helper.param_attr.learning_rate == 0.:
        #     self._scale.stop_gradient = True

        self._bias = self._helper.create_parameter(
            attr=self._helper.bias_attr,
            shape=param_shape,
            dtype=self._dtype,
            is_bias=True)
        # TODO(minqiyang): change stop_gradient sign to trainable to align with static graph
        # # setting stop_gradient=True to reduce computation
        # if use_global_stats and self._helper.bias_attr.learning_rate == 0.:
        #     self._bias.stop_gradient = True

        self._mean = self._helper.create_parameter(
            attr=ParamAttr(
                name=moving_mean_name,
                initializer=Constant(0.0),
                trainable=False,
                do_model_average=do_model_average_for_mean_and_var),
            shape=param_shape,
            dtype=self._dtype)
        self._mean.stop_gradient = True

        self._variance = self._helper.create_parameter(
            attr=ParamAttr(
                name=moving_variance_name,
                initializer=Constant(1.0),
                trainable=False,
                do_model_average=do_model_average_for_mean_and_var),
            shape=param_shape,
            dtype=self._dtype)
        self._variance.stop_gradient = True

        self._in_place = in_place
        self._momentum = momentum
        self._epsilon = epsilon
        self._is_test = is_test
        self._fuse_with_relu = fuse_with_relu
        self._use_global_stats = use_global_stats

    def _build_once(self, input):
        pass

    def forward(self, input):
        # create output
        # mean and mean_out share the same memory
        mean_out = self._mean
        # variance and variance out share the same memory
        variance_out = self._variance

        saved_mean = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True)
        saved_variance = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True)
        batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference(
            self._dtype)

        self._helper.append_op(
            type="batch_norm",
            inputs={
                "X": input,
                "Scale": self._scale,
                "Bias": self._bias,
                "Mean": self._mean,
                "Variance": self._variance
            },
            outputs={
                "Y": batch_norm_out,
                "MeanOut": mean_out,
                "VarianceOut": variance_out,
                "SavedMean": saved_mean,
                "SavedVariance": saved_variance
            },
            attrs={
                "momentum": self._momentum,
                "epsilon": self._epsilon,
                "is_test": self._is_test,
                "use_mkldnn": False,
                "fuse_with_relu": self._fuse_with_relu,
                "use_global_stats": self._use_global_stats
            })

        # Currently, we don't support inplace in imperative mode
        return self._helper.append_activation(batch_norm_out)
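A minimal smoke test of the new layer under the imperative guard (a sketch: the shapes and the relu choice are illustrative; to_variable is the array-to-ivar helper used by the tests later in this commit):

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.imperative.nn import BatchNorm
    from paddle.fluid.imperative.base import to_variable

    with fluid.imperative.guard():
        x = np.random.random((2, 3, 8, 8)).astype('float32')  # NCHW input
        bn = BatchNorm(num_channels=3, act='relu')
        y = bn(to_variable(x))
        print(y._numpy().shape)  # (2, 3, 8, 8): batch norm preserves shape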
python/paddle/fluid/layer_helper.py
...
@@ -435,7 +435,10 @@ class LayerHelper(object):
         act_type = act.pop('type')
         tmp = input_var
         # NOTE(dzhwinter): some activation support inplace compution.
-        if not core.IsInplace(act_type):
+        # NOTE(minqiyang): currently, we don't support inplace in imperative mode
+        if not imperative_base.enabled() and core.IsInplace(act_type):
+            tmp = input_var
+        else:
             tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
         self.append_op(
             type=act_type,
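Read carefully, the new condition only reuses input_var when imperative mode is off and the activation supports in-place writes; otherwise a fresh output variable is created. A restatement of the same branch (a sketch, not the helper itself):

    # equivalent decision table for the activation's output variable
    can_inplace = core.IsInplace(act_type)      # op supports writing in place
    imperative = imperative_base.enabled()      # tracing eagerly?
    if not imperative and can_inplace:
        tmp = input_var                         # static graph: reuse storage
    else:
        tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)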
...
python/paddle/fluid/layers/io.py
...
@@ -523,7 +523,7 @@ def _py_reader(capacity,
         double_buffer_name = "_".join([name, "double_buffer"])

     var = global_scope().var(queue_name)
-    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)
+    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity)

     startup_blk = default_startup_program().current_block()
     startup_var = startup_blk.create_var(name=reader_name)
...
python/paddle/fluid/layers/learning_rate_scheduler.py
...
@@ -321,7 +321,7 @@ def append_LARS(params_grads, learning_rate, weight_decay):
         The decayed learning rate
     Examples:
         .. code-block:: python

             learning_rate *= local_gw_ratio * sqrt(sumsq(param))
                         / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
     """
...
python/paddle/fluid/layers/nn.py
...
@@ -179,6 +179,7 @@ __all__ = [
     'merge_selected_rows',
     'get_tensor_from_selected_rows',
     'lstm',
+    'shuffle_channel',
     'py_func',
     'psroi_pool',
     'teacher_student_sigmoid_loss',
...
@@ -2874,7 +2875,7 @@ def batch_norm(input,
         attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
     # setting stop_gradient=True to reduce computation
     if use_global_stats and helper.bias_attr.learning_rate == 0.:
-        scale.stop_gradient = True
+        bias.stop_gradient = True

     mean = helper.create_parameter(
         attr=ParamAttr(
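The one-word fix matters: this guard tests bias_attr's learning rate but previously froze scale, leaving the bias trainable. What the pair of freeze guards presumably looks like after the fix (the scale/param_attr guard sits above this hunk and is assumed here to mirror the bias one):

    # sketch of both guards; the param_attr branch is not shown in the hunk
    if use_global_stats and helper.param_attr.learning_rate == 0.:
        scale.stop_gradient = True   # freeze gamma
    if use_global_stats and helper.bias_attr.learning_rate == 0.:
        bias.stop_gradient = True    # freeze beta (was: scale.stop_gradient = True)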
...
@@ -3875,6 +3876,7 @@ def beam_search(pre_ids,
                 beam_size,
                 end_id,
                 level=0,
+                is_accumulated=True,
                 name=None):
     """
     Beam search is a classical algorithm for selecting candidate words in a
...
@@ -3887,14 +3889,17 @@ def beam_search(pre_ids,
     selects the top-K candidate word ids of current step from :attr:`ids`
     according to their :attr:`scores` for all source sentences, where K is
     :attr:`beam_size` and :attr:`ids, scores` are predicted results from the
-    computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are
-    the output of beam_search at previous step, they are needed for special use
-    to handle ended candidate translations.
+    computation cell. If :attr:`ids` is not set, it will be calculated out
+    according to :attr:`scores`. Additionally, :attr:`pre_ids` and
+    :attr:`pre_scores` are the output of beam_search at previous step, they
+    are needed for special use to handle ended candidate translations.

-    Note that the :attr:`scores` passed in should be accumulated scores, and
-    length penalty should be done with extra operators before calculating the
-    accumulated scores if needed, also suggest finding top-K before it and
-    using the top-K candidates following.
+    Note that if :attr:`is_accumulated` is :attr:`True`, the :attr:`scores`
+    passed in should be accumulated scores. Else, the :attr:`scores` are
+    considered as the straightforward scores and will be transformed to the
+    log field and accumulated the :attr:`pre_scores` in this operator.
+    Length penalty should be done with extra operators before calculating the
+    accumulated scores if needed.

     Please see the following demo for a fully beam search usage example:
...
@@ -3924,6 +3929,8 @@ def beam_search(pre_ids,
             describes how these candidates belong to the prefix. The paths
             linking prefixes and selected candidates are organized and reserved
             in lod.
+        is_accumulated(bool, default True): Whether the input :attr:`score` is
+            accumulated scores.
         name(str|None): A name for this layer(optional). If set None, the layer
             will be named automatically.
...
@@ -3952,8 +3959,12 @@ def beam_search(pre_ids,
             end_id=end_id)
     """
     helper = LayerHelper('beam_search', **locals())
-    score_type = scores.dtype
-    id_type = ids.dtype
+    score_type = pre_scores.dtype
+    id_type = pre_ids.dtype
+
+    inputs = {"pre_ids": pre_ids, "pre_scores": pre_scores, "scores": scores}
+    if ids is not None:
+        inputs["ids"] = ids

     selected_scores = helper.create_variable_for_type_inference(
         dtype=score_type)
...
@@ -3961,12 +3972,7 @@ def beam_search(pre_ids,
     helper.append_op(
         type='beam_search',
-        inputs={
-            'pre_ids': pre_ids,
-            'pre_scores': pre_scores,
-            'ids': ids,
-            'scores': scores,
-        },
+        inputs=inputs,
         outputs={
             'selected_ids': selected_ids,
             'selected_scores': selected_scores,
...
@@ -3976,6 +3982,7 @@ def beam_search(pre_ids,
             'level': level,
             'beam_size': beam_size,
             'end_id': end_id,
+            'is_accumulated': is_accumulated,
         })

     return selected_ids, selected_scores
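With the new inputs dict, callers may omit ids entirely, or feed raw per-step scores and let the op accumulate. A sketch of one decoding step under the new signature (pre_ids, pre_scores, probs, beam_size and end_id are assumed to come from the surrounding decoder loop; they are not defined in this commit):

    # one step of beam search with non-accumulated scores (illustrative)
    topk_scores, topk_indices = fluid.layers.topk(probs, k=beam_size)
    selected_ids, selected_scores = fluid.layers.beam_search(
        pre_ids=pre_ids,
        pre_scores=pre_scores,
        ids=topk_indices,
        scores=topk_scores,
        beam_size=beam_size,
        end_id=end_id,
        is_accumulated=False)  # op log-transforms scores and adds pre_scores itself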
...
@@ -5146,9 +5153,9 @@ def nce(input,
         littles = []
         for i in range(custom_dist_len):
             normal_prob = custom_dist[i] * custom_dist_len
-            if normal_prob - 1.0 > 1e-4:
+            if normal_prob - 1.0 > 0:
                 bigs.append((i, normal_prob))
-            elif 1.0 - normal_prob > 1e-4:
+            elif 1.0 - normal_prob > 0:
                 littles.append((i, normal_prob))
             else:
                 alias_probs_[i] = normal_prob
...
@@ -5164,9 +5171,9 @@ def nce(input,
             alias_probs_[little[0]] = little[1]
             alias_[little[0]] = big_idx
             big_left = big[1] + little[1] - 1
-            if big_left - 1.0 > 1e-4:
+            if big_left - 1.0 > 0:
                 bigs.append((big_idx, big_left))
-            elif 1.0 - big_left > 1e-4:
+            elif 1.0 - big_left > 0:
                 littles.append((big_idx, big_left))
             else:
                 alias_probs_[big_idx] = big_left
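Both hunks replace the 1e-4 tolerance with an exact comparison when classifying buckets for the alias-method table that nce builds for custom sampling distributions. For reference, the same construction in isolation (a generic sketch of Vose's algorithm, not the nce code itself):

    import random

    def build_alias_table(probs):
        """Vose's alias method: O(n) build, O(1) sampling (standalone sketch)."""
        n = len(probs)
        scaled = [p * n for p in probs]
        prob, alias = [0.0] * n, [0] * n
        bigs = [i for i, s in enumerate(scaled) if s - 1.0 > 0]
        littles = [i for i, s in enumerate(scaled) if s - 1.0 <= 0]
        while bigs and littles:
            small, big = littles.pop(), bigs.pop()
            prob[small], alias[small] = scaled[small], big
            scaled[big] += scaled[small] - 1.0          # big donates mass to small
            (bigs if scaled[big] - 1.0 > 0 else littles).append(big)
        for i in bigs + littles:                        # leftovers are full buckets
            prob[i] = 1.0
        return prob, alias

    def draw(prob, alias):
        i = random.randrange(len(prob))
        return i if random.random() < prob[i] else alias[i]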
...
@@ -5856,7 +5863,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
         type='increment',
         inputs={'X': [counter]},
         outputs={'Out': [counter]},
-        attrs={'step': float(step)})
+        attrs={'step': float(step)},
+        stop_gradient=True)
     counter.stop_gradient = True

     return counter
...
@@ -9475,7 +9483,7 @@ def teacher_student_sigmoid_loss(input,
                                  by the previous operator.
         label (Variable|list): the ground truth which is a 2-D tensor with
             shape [N x 1], where N is the batch size.
         soft_max_up_bound (float): if input > soft_max_up_bound, will be bound
         soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound

     Returns:
...
@@ -9639,6 +9647,79 @@ def get_tensor_from_selected_rows(x, name=None):
     return out


+def shuffle_channel(x, group, name=None):
+    """
+    **Shuffle Channel Operator**
+
+    This operator shuffles the channels of input x.
+    It divides the input channels in each group into :attr:`group` subgroups,
+    and obtains a new order by selecting one element from every subgroup in turn.
+
+    Please refer to the paper
+    https://arxiv.org/pdf/1707.01083.pdf
+
+    .. code-block:: text
+
+        Given a 4-D tensor input with the shape (N, C, H, W):
+            input.shape = (1, 4, 2, 2)
+            input.data = [[[[0.1, 0.2],
+                            [0.2, 0.3]],
+                           [[0.3, 0.4],
+                            [0.4, 0.5]],
+                           [[0.5, 0.6],
+                            [0.6, 0.7]],
+                           [[0.7, 0.8],
+                            [0.8, 0.9]]]]
+        Given group: 2
+        then we get a 4-D tensor out with the same shape as the input:
+            out.shape = (1, 4, 2, 2)
+            out.data = [[[[0.1, 0.2],
+                          [0.2, 0.3]],
+                         [[0.5, 0.6],
+                          [0.6, 0.7]],
+                         [[0.3, 0.4],
+                          [0.4, 0.5]],
+                         [[0.7, 0.8],
+                          [0.8, 0.9]]]]
+
+    Args:
+        x(Variable): The input tensor variable. It should be a 4-D tensor with shape [N, C, H, W]
+        group(int): Indicating the counts of subgroups. It should divide the number of channels.
+
+    Returns:
+        out(Variable): the channels shuffling result is a tensor variable with the
+        same shape and same type as the input.
+
+    Raises:
+        ValueError: If group is not an int type variable.
+
+    Examples:
+        .. code-block:: python
+
+            input = fluid.layers.data(name='input', shape=[4,2,2], dtype='float32')
+            out = fluid.layers.shuffle_channel(x=input, group=2)
+    """
+    helper = LayerHelper("shuffle_channel", **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+    if not isinstance(group, int):
+        raise TypeError("group must be int type")
+
+    helper.append_op(
+        type="shuffle_channel",
+        inputs={"X": x},
+        outputs={"Out": out},
+        attrs={"group": group})
+    return out
+
+
 class PyFuncRegistry(object):
     _register_funcs = []
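The op is the standard ShuffleNet reshape-transpose-reshape trick; a numpy reference (the same computation the new unit test further down uses to build its expected output):

    import numpy as np

    def shuffle_channel_ref(x, group):
        # reshape (N, C, H, W) -> (N, group, C/group, H, W), swap the two
        # channel axes, and flatten back: subgroup members end up interleaved
        n, c, h, w = x.shape
        assert c % group == 0
        return x.reshape(n, group, c // group, h, w) \
                .transpose(0, 2, 1, 3, 4) \
                .reshape(n, c, h, w)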
...
python/paddle/fluid/layers/tensor.py
...
@@ -382,7 +382,8 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
             'dtype': out.dtype,
             'value': float(value),
             'force_cpu': force_cpu or force_init_on_cpu()
-        })
+        },
+        stop_gradient=True)
     out.stop_gradient = True
     return out
...
python/paddle/fluid/optimizer.py
...
@@ -301,10 +301,10 @@ class Optimizer(object):
             no_grad_set (set|None): set of Variables should be ignored.
             callbacks (list|None): list of callables to run when appending backward
                 operator for one parameter.

         Return:
             list: list of (param, grad) pair, grad is the output of backward.

         Examples:
             See examples in `apply_gradients`.
         """
...
@@ -322,10 +322,10 @@ class Optimizer(object):
         Args:
             params_grads (list): list of (param, grad) pair to do optimization.

         Returns:
             list: A list of operators appended to the current program.

         Examples:
             .. code-block:: python
...
@@ -364,7 +364,7 @@ class Optimizer(object):
         This method combines interface `backward()` and
         `apply_gradients()` into one.

         Args:
             loss (Variable): loss variable to run optimizations.
             startup_program (Program): startup_program for initializing parameters
...
@@ -381,18 +381,21 @@ class Optimizer(object):
         optimize_ops = []
         if imperative_base.enabled():
             if parameter_list is not None:
-                params_grads = parameter_list
+                parameters = parameter_list
             else:
                 parameters = program.global_block().all_parameters()
-                params_grads = []
-                for param in parameters:
-                    # create gradient variable
-                    grad_var = Variable(
-                        block=loss.block,
-                        name=param._ivar._grad_name(),
-                        stop_gradient=True,
-                        ivar=param._ivar._grad_ivar())
-                    params_grads.append((param, grad_var))
+
+            params_grads = []
+            for param in parameters:
+                if param.stop_gradient:
+                    continue
+                # create gradient variable
+                grad_var = Variable(
+                    block=loss.block,
+                    name=param._ivar._grad_name(),
+                    stop_gradient=True,
+                    ivar=param._ivar._grad_ivar())
+                params_grads.append((param, grad_var))
             with program_guard(program, startup_program):
                 optimize_ops = self._create_optimization_pass(params_grads)
         else:
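For context, the imperative branch above is exercised by training loops like the following (a sketch: MyLayer, its input, and the loss construction are placeholders, not code from this commit):

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.optimizer import SGDOptimizer
    from paddle.fluid.imperative.base import to_variable

    with fluid.imperative.guard():
        model = MyLayer()                      # any fluid.imperative.Layer subclass
        sgd = SGDOptimizer(learning_rate=1e-3)
        x = to_variable(np.ones([2, 2], np.float32))
        loss = fluid.layers.reduce_sum(model(x))
        loss._backward()                       # fills each param's gradient ivar
        sgd.minimize(loss)                     # pairs trainable params with those ivars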
...
python/paddle/fluid/parallel_executor.py
...
@@ -159,7 +159,7 @@ class ParallelExecutor(object):
             trainers_endpoints = main._trainers_endpoints
             if num_trainers > 1 and trainers_endpoints:
                 assert num_trainers == len(
-                    trainers_endpoints), "num_trainers == len(end_points)"
+                    trainers_endpoints), "num_trainers == len(endpoints)"
                 build_strategy.trainers_endpoints = trainers_endpoints

         # step6: get persistable_vars, places. persistable_vars
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
...
@@ -84,6 +84,7 @@ list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
 list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
 list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
 list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
+list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
...
@@ -91,6 +92,8 @@ py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
 py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL)
 py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL)
+py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
+                FLAGS_cudnn_deterministic=1)
 if(WITH_DISTRIBUTE)
     py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
     set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
...
python/paddle/fluid/tests/unittests/test_dist_base.py
...
@@ -124,7 +124,7 @@ class TestDistRunnerBase(object):
             if args.batch_merge_repeat > 1:
                 pass_builder = build_stra._finalize_strategy_and_create_passes()
                 mypass = pass_builder.insert_pass(
-                    len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
+                    len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass")
                 mypass.set("num_repeats", args.batch_merge_repeat)

         if args.update_method == "nccl2":
...
python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
...
@@ -16,12 +16,17 @@ import os
 import unittest

 os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"

+# FIXME(zjl): It seems that this unittest fails randomly
+# when comparing all reduce last loss and reduce last loss
+# e.g.: AssertionError: 1.0357145 != 1.0673475 within 0.01 delta
+# Disable it temporarily.
+'''
 from test_parallel_executor_mnist import TestMNIST


 class EagerDeletionTestMNIST(TestMNIST):
     pass
+'''

 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/test_imperative.py
...
@@ -67,6 +67,18 @@ class MLP(fluid.imperative.Layer):
 class TestImperative(unittest.TestCase):
+    def test_sum_op(self):
+        x = np.ones([2, 2], np.float32)
+        with fluid.imperative.guard():
+            inputs = []
+            for _ in range(10):
+                inputs.append(fluid.imperative.base.to_variable(x))
+            ret = fluid.layers.sums(inputs)
+            loss = fluid.layers.reduce_sum(ret)
+            loss._backward()
+            self.assertTrue(np.allclose(ret._numpy(), x * 10))
+            self.assertTrue(np.allclose(inputs[0]._gradient(), x))
+
     def test_layer(self):
         with fluid.imperative.guard():
             cl = core.Layer()
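The asserted values in test_sum_op follow from a one-line derivation: with ten copies of x summed and then reduced,

    \text{loss} = \sum_{i,j}\Big(\sum_{k=1}^{10} x^{(k)}\Big)_{ij}
    \quad\Rightarrow\quad
    \frac{\partial\,\text{loss}}{\partial x^{(k)}_{ij}} = 1 \ \ \forall k, i, j,

so ret equals 10 * x, and the gradient of any input is a tensor of ones, which here equals x because x = np.ones([2, 2]).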
...
@@ -133,7 +145,8 @@ class TestImperative(unittest.TestCase):
             x = fluid.layers.reduce_sum(fluid.layers.tanh(x1))
             param_grads = fluid.backward.append_backward(
                 x, parameter_list=[x1.name])[0]
-            exe = fluid.Executor(fluid.CPUPlace())
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))

             static_out, static_grad = exe.run(
                 feed={inp.name: np_inp},
...
@@ -160,7 +173,8 @@ class TestImperative(unittest.TestCase):
             x = l(inp)[0]
             param_grads = fluid.backward.append_backward(
                 x, parameter_list=[l._x_for_debug.name])[0]
-            exe = fluid.Executor(fluid.CPUPlace())
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))

             static_out, static_grad = exe.run(
                 feed={inp.name: np_inp},
...
@@ -186,7 +200,8 @@ class TestImperative(unittest.TestCase):
             out = mlp(inp)
             param_grads = fluid.backward.append_backward(
                 out, parameter_list=[mlp._fc1._w.name])[0]
-            exe = fluid.Executor(fluid.CPUPlace())
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
             exe.run(fluid.default_startup_program())

             static_out, static_grad = exe.run(
...
python/paddle/fluid/tests/unittests/test_imperative_gan.py
...
@@ -20,6 +20,7 @@ import sys
 import paddle
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 from paddle.fluid.optimizer import SGDOptimizer
 from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC
 from test_imperative_base import new_program_scope
...
@@ -58,7 +59,7 @@ class Generator(fluid.imperative.Layer):
 class TestImperativeMnist(unittest.TestCase):
-    def test_mnist_cpu_float32(self):
+    def test_gan_float32(self):
         seed = 90
         startup = fluid.Program()
...
@@ -115,7 +116,8 @@ class TestImperativeMnist(unittest.TestCase):
             sgd = SGDOptimizer(learning_rate=1e-3)
             sgd.minimize(g_loss)

-            exe = fluid.Executor(fluid.CPUPlace())
+            exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda(
+            ) else fluid.CUDAPlace(0))
             static_params = dict()
             with fluid.scope_guard(scope):
                 img = np.ones([2, 1], np.float32)
...
python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
...
@@ -145,7 +145,8 @@ class TestImperativeMnist(unittest.TestCase):
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed

-            exe = fluid.Executor(fluid.CPUPlace())
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))

             mnist = MNIST()
             sgd = SGDOptimizer(learning_rate=1e-3)
...
python/paddle/fluid/tests/unittests/test_imperative_resnet.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import contextlib
import unittest
import numpy as np
import six

import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC
from paddle.fluid.imperative.base import to_variable
from test_imperative_base import new_program_scope

batch_size = 8
train_parameters = {
    "input_size": [3, 224, 224],
    "input_mean": [0.485, 0.456, 0.406],
    "input_std": [0.229, 0.224, 0.225],
    "learning_strategy": {
        "name": "piecewise_decay",
        "batch_size": batch_size,
        "epochs": [30, 60, 90],
        "steps": [0.1, 0.01, 0.001, 0.0001]
    },
    "batch_size": batch_size,
    "lr": 0.1,
    "total_images": 1281164,
}


def optimizer_setting(params):
    ls = params["learning_strategy"]
    if ls["name"] == "piecewise_decay":
        if "total_images" not in params:
            total_images = 1281167
        else:
            total_images = params["total_images"]
        batch_size = ls["batch_size"]
        step = int(total_images / batch_size + 1)

        bd = [step * e for e in ls["epochs"]]
        base_lr = params["lr"]
        lr = []
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
        optimizer = fluid.optimizer.SGD(learning_rate=0.01)
        # TODO(minqiyang): Add learning rate scheduler support to imperative mode
        #  optimizer = fluid.optimizer.Momentum(
        #  learning_rate=params["lr"],
        #  learning_rate=fluid.layers.piecewise_decay(
        #      boundaries=bd, values=lr),
        #  momentum=0.9,
        #  regularization=fluid.regularizer.L2Decay(1e-4))

    return optimizer


class ConvBNLayer(fluid.imperative.Layer):
    def __init__(self,
                 num_channels,
                 num_filters,
                 filter_size,
                 stride=1,
                 groups=1,
                 act=None):
        super(ConvBNLayer, self).__init__()

        self._conv = Conv2D(
            num_channels=num_channels,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            bias_attr=None)

        self._batch_norm = BatchNorm(num_filters, act=act)

    def forward(self, inputs):
        y = self._conv(inputs)
        y = self._batch_norm(y)
        return y


class BottleneckBlock(fluid.imperative.Layer):
    def __init__(self, num_channels, num_filters, stride, shortcut=True):
        super(BottleneckBlock, self).__init__()

        self.conv0 = ConvBNLayer(
            num_channels=num_channels,
            num_filters=num_filters,
            filter_size=1,
            act='relu')
        self.conv1 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu')
        self.conv2 = ConvBNLayer(
            num_channels=num_filters,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None)

        if not shortcut:
            self.short = ConvBNLayer(
                num_channels=num_channels,
                num_filters=num_filters * 4,
                filter_size=1,
                stride=stride)

        self.shortcut = shortcut
        self._num_channels_out = num_filters * 4

    def forward(self, inputs):
        y = self.conv0(inputs)
        conv1 = self.conv1(y)
        conv2 = self.conv2(conv1)

        if self.shortcut:
            short = inputs
        else:
            short = self.short(inputs)

        y = fluid.layers.elementwise_add(x=short, y=conv2)

        layer_helper = LayerHelper('elementwise_add_activation', act='relu')
        return layer_helper.append_activation(y)


class ResNet(fluid.imperative.Layer):
    def __init__(self, layers=50, class_dim=102):
        super(ResNet, self).__init__()

        self.layers = layers
        supported_layers = [50, 101, 152]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(
                supported_layers, layers)

        if layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        num_filters = [64, 128, 256, 512]

        self.conv = ConvBNLayer(
            num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
        self.pool2d_max = Pool2D(
            pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')

        self.bottleneck_block_list = []
        num_channels = 64
        for block in range(len(depth)):
            shortcut = False
            for i in range(depth[block]):
                bottleneck_block = BottleneckBlock(
                    num_channels=num_channels,
                    num_filters=num_filters[block],
                    stride=2 if i == 0 and block != 0 else 1,
                    shortcut=shortcut)
                num_channels = bottleneck_block._num_channels_out
                self.bottleneck_block_list.append(bottleneck_block)
                shortcut = True

        self.pool2d_avg = Pool2D(
            pool_size=7, pool_type='avg', global_pooling=True)

        import math
        stdv = 1.0 / math.sqrt(2048 * 1.0)

        self.out = FC(size=class_dim,
                      act='softmax',
                      param_attr=fluid.param_attr.ParamAttr(
                          initializer=fluid.initializer.Uniform(-stdv, stdv)))

    def forward(self, inputs):
        y = self.conv(inputs)
        y = self.pool2d_max(y)
        for bottleneck_block in self.bottleneck_block_list:
            y = bottleneck_block(y)
        y = self.pool2d_avg(y)
        y = self.out(y)
        return y


class TestImperativeResnet(unittest.TestCase):
    def test_resnet_float32(self):
        seed = 90

        batch_size = train_parameters["batch_size"]
        batch_num = 1
        with fluid.imperative.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            resnet = ResNet()
            optimizer = optimizer_setting(train_parameters)
            np.random.seed(seed)
            import random
            random.seed = seed
            train_reader = paddle.batch(
                paddle.dataset.flowers.train(use_xmap=False),
                batch_size=batch_size)

            dy_param_init_value = {}
            for param in fluid.default_main_program().global_block(
            ).all_parameters():
                dy_param_init_value[param.name] = param._numpy()

            for batch_id, data in enumerate(train_reader()):
                if batch_id >= batch_num:
                    break

                dy_x_data = np.array(
                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
                    batch_size, 1)

                img = to_variable(dy_x_data)
                label = to_variable(y_data)
                label._stop_gradient = True

                out = resnet(img)
                loss = fluid.layers.cross_entropy(input=out, label=label)
                avg_loss = fluid.layers.mean(x=loss)
                dy_out = avg_loss._numpy()

                if batch_id == 0:
                    for param in fluid.default_main_program().global_block(
                    ).all_parameters():
                        if param.name not in dy_param_init_value:
                            dy_param_init_value[param.name] = param._numpy()

                avg_loss._backward()

                dy_grad_value = {}
                for param in fluid.default_main_program().global_block(
                ).all_parameters():
                    if not param.stop_gradient:
                        np_array = np.array(param._ivar._grad_ivar().value()
                                            .get_tensor())
                        dy_grad_value[param.name + core.grad_var_suffix(
                        )] = np_array

                optimizer.minimize(avg_loss)

                dy_param_value = {}
                for param in fluid.default_main_program().global_block(
                ).all_parameters():
                    dy_param_value[param.name] = param._numpy()

        with new_program_scope():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            exe = fluid.Executor(fluid.CPUPlace(
            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))

            resnet = ResNet()
            optimizer = optimizer_setting(train_parameters)

            np.random.seed(seed)
            import random
            random.seed = seed
            train_reader = paddle.batch(
                paddle.dataset.flowers.train(use_xmap=False),
                batch_size=batch_size)

            img = fluid.layers.data(
                name='pixel', shape=[3, 224, 224], dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            out = resnet(img)
            loss = fluid.layers.cross_entropy(input=out, label=label)
            avg_loss = fluid.layers.mean(x=loss)
            optimizer.minimize(avg_loss)

            # initialize params and fetch them
            static_param_init_value = {}
            static_param_name_list = []
            static_grad_name_list = []
            for param in fluid.default_startup_program().global_block(
            ).all_parameters():
                static_param_name_list.append(param.name)
            for param in fluid.default_main_program().global_block(
            ).all_parameters():
                if not param.stop_gradient:
                    static_grad_name_list.append(param.name +
                                                 core.grad_var_suffix())

            out = exe.run(fluid.default_startup_program(),
                          fetch_list=static_param_name_list)

            for i in range(len(static_param_name_list)):
                static_param_init_value[static_param_name_list[i]] = out[i]

            for batch_id, data in enumerate(train_reader()):
                if batch_id >= batch_num:
                    break

                static_x_data = np.array(
                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
                    [batch_size, 1])

                fetch_list = [avg_loss.name]
                fetch_list.extend(static_param_name_list)
                fetch_list.extend(static_grad_name_list)
                out = exe.run(fluid.default_main_program(),
                              feed={"pixel": static_x_data,
                                    "label": y_data},
                              fetch_list=fetch_list)

                static_param_value = {}
                static_grad_value = {}
                static_out = out[0]
                param_start_pos = 1
                grad_start_pos = len(static_param_name_list) + param_start_pos
                for i in range(param_start_pos,
                               len(static_param_name_list) + param_start_pos):
                    static_param_value[static_param_name_list[
                        i - param_start_pos]] = out[i]
                for i in range(grad_start_pos,
                               len(static_grad_name_list) + grad_start_pos):
                    static_grad_value[static_grad_name_list[
                        i - grad_start_pos]] = out[i]

        self.assertTrue(np.allclose(static_out, dy_out))

        self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
        for key, value in six.iteritems(static_param_init_value):
            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
            self.assertTrue(np.isfinite(value.all()))
            self.assertFalse(np.isnan(value.any()))

        self.assertEqual(len(dy_grad_value), len(static_grad_value))
        for key, value in six.iteritems(static_grad_value):
            self.assertTrue(np.allclose(value, dy_grad_value[key]))
            self.assertTrue(np.isfinite(value.all()))
            self.assertFalse(np.isnan(value.any()))

        self.assertEqual(len(dy_param_value), len(static_param_value))
        for key, value in six.iteritems(static_param_value):
            self.assertTrue(np.allclose(value, dy_param_value[key]))
            self.assertTrue(np.isfinite(value.all()))
            self.assertFalse(np.isnan(value.any()))


if __name__ == '__main__':
    unittest.main()
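Although optimizer_setting computes a piecewise schedule and then shelves it for plain SGD (imperative mode did not yet support LR schedulers), the arithmetic it sets up is easy to pin down. A worked example using the defaults in train_parameters above (the Momentum optimizer that would consume these values is still commented out):

    # values derived from train_parameters above
    total_images, batch_size = 1281164, 8
    step = int(total_images / batch_size + 1)          # 160146 steps per epoch
    bd = [step * e for e in [30, 60, 90]]              # [4804380, 9608760, 14413140]
    lr = [0.1 * (0.1**i) for i in range(len(bd) + 1)]  # ~[0.1, 0.01, 0.001, 0.0001]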
python/paddle/fluid/tests/unittests/test_layers.py
...
@@ -1023,6 +1023,14 @@ class TestBook(unittest.TestCase):
             print(str(program))

+    def test_shuffle_channel(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="X", shape=[16, 4, 4], dtype="float32")
+            out = layers.shuffle_channel(x, group=4)
+            self.assertIsNotNone(out)
+        print(str(program))
+

 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
import sys
import math
from op_test import OpTest
import paddle.fluid.core as core


class TestShuffleChannelOp(OpTest):
    def setUp(self):
        self.op_type = "shuffle_channel"
        self.batch_size = 10
        self.input_channels = 16
        self.layer_h = 4
        self.layer_w = 4
        self.group = 4
        self.x = np.random.random(
            (self.batch_size, self.input_channels, self.layer_h,
             self.layer_w)).astype('float32')
        self.inputs = {'X': self.x}
        self.attrs = {'group': self.group}
        n, c, h, w = self.x.shape
        input_reshaped = np.reshape(self.x,
                                    (-1, self.group, c // self.group, h, w))
        input_transposed = np.transpose(input_reshaped, (0, 2, 1, 3, 4))
        self.outputs = {'Out': np.reshape(input_transposed, (-1, c, h, w))}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['X'], 'Out')


if __name__ == '__main__':
    unittest.main()
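One property worth noting: channel shuffle is a pure permutation, so its gradient is just the inverse permutation, and shuffling again with the co-factor group undoes it. A quick check using the same numpy reference sketched earlier (not part of the test):

    import numpy as np

    def shuffle_channel_ref(x, group):
        n, c, h, w = x.shape
        return x.reshape(n, group, c // group, h, w) \
                .transpose(0, 2, 1, 3, 4).reshape(n, c, h, w)

    x = np.arange(16, dtype='float32').reshape(1, 16, 1, 1)
    y = shuffle_channel_ref(x, 2)
    assert (shuffle_channel_ref(y, 16 // 2) == x).all()  # group c//g inverts group g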
python/paddle/fluid/transpiler/details/checkport.py
...
@@ -16,6 +16,7 @@ import sys
 import time
 import socket
 from contextlib import closing
+from six import string_types


 def wait_server_ready(endpoints):
...
@@ -32,6 +33,7 @@ def wait_server_ready(endpoints):
         wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])
     """
+    assert not isinstance(endpoints, string_types)
     while True:
         all_ok = True
         not_ready_endpoints = []
...
@@ -45,7 +47,7 @@ def wait_server_ready(endpoints):
                 all_ok = False
                 not_ready_endpoints.append(ep)
         if not all_ok:
-            sys.stderr.write("pserver not ready, wait 3 sec to retry...\n")
+            sys.stderr.write("server not ready, wait 3 sec to retry...\n")
             sys.stderr.write("not ready endpoints:" + str(not_ready_endpoints) +
                              "\n")
             sys.stderr.flush()
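The probe itself sits in the collapsed context above; it is a plain TCP connect test, which is what the `closing` import supports. A self-contained sketch of the pattern (the real helper may differ in details):

    import socket
    from contextlib import closing

    def port_open(ip, port, timeout=3):
        # try a TCP connect; connect_ex returns 0 when something is listening
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.settimeout(timeout)
            return s.connect_ex((ip, int(port))) == 0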
...
python/paddle/fluid/transpiler/distribute_transpiler.py
...
@@ -477,13 +477,16 @@ class DistributeTranspiler(object):
                          trainer_id,
                          trainers,
                          current_endpoint,
-                         startup_program=None):
+                         startup_program=None,
+                         wait_port=True):
         if not startup_program:
             startup_program = default_startup_program()
         if trainer_id >= 0:
             worker_endpoints = trainers.split(",")
             # send NCCL_ID to others or recv from trainer 0
             worker_endpoints.remove(current_endpoint)
+            if trainer_id == 0 and wait_port:
+                wait_server_ready(worker_endpoints)

             nccl_id_var = startup_program.global_block().create_var(
                 name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
...
@@ -564,11 +567,13 @@ class DistributeTranspiler(object):
         if self.config.mode == "nccl2":
             assert (isinstance(trainers, str))
+            self.origin_program._trainers_endpoints = trainers.split(",")
             self._transpile_nccl2(
                 trainer_id,
                 trainers,
                 current_endpoint,
-                startup_program=startup_program)
+                startup_program=startup_program,
+                wait_port=self.config.wait_port)
             return

         self.trainer_num = trainers
...
python/setup.py.in
...
@@ -109,6 +109,7 @@ packages=['paddle',
       'paddle.fluid.contrib',
       'paddle.fluid.contrib.decoder',
       'paddle.fluid.contrib.quantize',
+      'paddle.fluid.contrib.reader',
       'paddle.fluid.contrib.slim',
       'paddle.fluid.contrib.slim.core',
       'paddle.fluid.contrib.slim.graph',
...