Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
5c7a8aee
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5c7a8aee
编写于
12月 19, 2018
作者:
H
heqiaozhi
浏览文件
操作
浏览文件
下载
差异文件
merge upstream to my develop
test=develop Merge remote-tracking branch 'upstream/develop' into develop
上级
b3cf476d
aa6e9c30
变更
90
显示空白变更内容
内联
并排
Showing
90 changed file
with
3509 addition
and
1557 deletion
+3509
-1557
CMakeLists.txt
CMakeLists.txt
+2
-2
cmake/external/python.cmake
cmake/external/python.cmake
+2
-3
cmake/external/warpctc.cmake
cmake/external/warpctc.cmake
+25
-5
cmake/inference_lib.cmake
cmake/inference_lib.cmake
+7
-0
cmake/operators.cmake
cmake/operators.cmake
+1
-1
paddle/fluid/API.spec
paddle/fluid/API.spec
+2
-1
paddle/fluid/framework/details/CMakeLists.txt
paddle/fluid/framework/details/CMakeLists.txt
+11
-3
paddle/fluid/framework/details/analysis_var_pass.cc
paddle/fluid/framework/details/analysis_var_pass.cc
+656
-0
paddle/fluid/framework/details/analysis_var_pass.h
paddle/fluid/framework/details/analysis_var_pass.h
+120
-0
paddle/fluid/framework/details/analysis_var_pass_test.cc
paddle/fluid/framework/details/analysis_var_pass_test.cc
+470
-0
paddle/fluid/framework/details/build_strategy.cc
paddle/fluid/framework/details/build_strategy.cc
+31
-3
paddle/fluid/framework/details/build_strategy.h
paddle/fluid/framework/details/build_strategy.h
+11
-0
paddle/fluid/framework/details/early_delete_op_handle.h
paddle/fluid/framework/details/early_delete_op_handle.h
+140
-0
paddle/fluid/framework/details/memory_early_delete_pass.cc
paddle/fluid/framework/details/memory_early_delete_pass.cc
+117
-0
paddle/fluid/framework/details/memory_early_delete_pass.h
paddle/fluid/framework/details/memory_early_delete_pass.h
+32
-0
paddle/fluid/framework/details/memory_reuse_types.cc
paddle/fluid/framework/details/memory_reuse_types.cc
+155
-0
paddle/fluid/framework/details/memory_reuse_types.h
paddle/fluid/framework/details/memory_reuse_types.h
+87
-0
paddle/fluid/framework/details/memory_reuse_types_test.cc
paddle/fluid/framework/details/memory_reuse_types_test.cc
+99
-0
paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
...fluid/framework/details/multi_devices_graph_print_pass.cc
+2
-1
paddle/fluid/framework/details/multi_devices_graph_print_pass.h
.../fluid/framework/details/multi_devices_graph_print_pass.h
+4
-1
paddle/fluid/framework/details/op_handle_base.h
paddle/fluid/framework/details/op_handle_base.h
+1
-1
paddle/fluid/framework/executor.cc
paddle/fluid/framework/executor.cc
+1
-1
paddle/fluid/framework/ir/CMakeLists.txt
paddle/fluid/framework/ir/CMakeLists.txt
+1
-0
paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
+91
-0
paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
+33
-0
paddle/fluid/framework/ir/graph.cc
paddle/fluid/framework/ir/graph.cc
+4
-1
paddle/fluid/framework/ir/graph_helper.cc
paddle/fluid/framework/ir/graph_helper.cc
+2
-1
paddle/fluid/framework/ir/graph_helper.h
paddle/fluid/framework/ir/graph_helper.h
+1
-0
paddle/fluid/framework/ir/graph_pattern_detector.cc
paddle/fluid/framework/ir/graph_pattern_detector.cc
+27
-1
paddle/fluid/framework/ir/graph_pattern_detector.h
paddle/fluid/framework/ir/graph_pattern_detector.h
+18
-0
paddle/fluid/framework/ir/node.cc
paddle/fluid/framework/ir/node.cc
+8
-0
paddle/fluid/framework/ir/node.h
paddle/fluid/framework/ir/node.h
+4
-1
paddle/fluid/framework/ngraph_operator.cc
paddle/fluid/framework/ngraph_operator.cc
+2
-1
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+89
-25
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+46
-16
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+16
-3
paddle/fluid/framework/tensor_test.cc
paddle/fluid/framework/tensor_test.cc
+16
-0
paddle/fluid/framework/type_defs.h
paddle/fluid/framework/type_defs.h
+3
-0
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
...id/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+0
-3
paddle/fluid/inference/api/paddle_pass_builder.h
paddle/fluid/inference/api/paddle_pass_builder.h
+1
-0
paddle/fluid/inference/tensorrt/convert/op_converter.h
paddle/fluid/inference/tensorrt/convert/op_converter.h
+2
-0
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+16
-2
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+13
-11
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+1
-0
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+1
-3
paddle/fluid/operators/beam_search_decode_op.cc
paddle/fluid/operators/beam_search_decode_op.cc
+2
-1
paddle/fluid/operators/controlflow/parallel_do_op.cc
paddle/fluid/operators/controlflow/parallel_do_op.cc
+0
-426
paddle/fluid/operators/hierarchical_sigmoid_op.h
paddle/fluid/operators/hierarchical_sigmoid_op.h
+18
-10
paddle/fluid/operators/huber_loss_op.cc
paddle/fluid/operators/huber_loss_op.cc
+4
-3
paddle/fluid/operators/math/blas.h
paddle/fluid/operators/math/blas.h
+8
-0
paddle/fluid/operators/math/blas_impl.h
paddle/fluid/operators/math/blas_impl.h
+21
-0
paddle/fluid/operators/math/matrix_bit_code.cc
paddle/fluid/operators/math/matrix_bit_code.cc
+321
-159
paddle/fluid/operators/math/matrix_bit_code.h
paddle/fluid/operators/math/matrix_bit_code.h
+37
-50
paddle/fluid/operators/optimizers/adam_op.cc
paddle/fluid/operators/optimizers/adam_op.cc
+5
-0
paddle/fluid/operators/optimizers/adam_op.h
paddle/fluid/operators/optimizers/adam_op.h
+34
-12
paddle/fluid/operators/tensorrt/CMakeLists.txt
paddle/fluid/operators/tensorrt/CMakeLists.txt
+1
-1
paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+1
-4
paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc
paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc
+0
-24
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+75
-92
paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+1
-2
paddle/fluid/operators/transpose_mkldnn_op.cc
paddle/fluid/operators/transpose_mkldnn_op.cc
+124
-0
paddle/fluid/operators/transpose_op.cc
paddle/fluid/operators/transpose_op.cc
+47
-2
paddle/fluid/platform/dynload/CMakeLists.txt
paddle/fluid/platform/dynload/CMakeLists.txt
+0
-2
paddle/fluid/platform/dynload/cudnn.h
paddle/fluid/platform/dynload/cudnn.h
+1
-1
paddle/fluid/platform/dynload/dynamic_loader.cc
paddle/fluid/platform/dynload/dynamic_loader.cc
+2
-0
paddle/fluid/platform/dynload/dynamic_loader.h
paddle/fluid/platform/dynload/dynamic_loader.h
+6
-0
paddle/fluid/platform/dynload/mklml.h
paddle/fluid/platform/dynload/mklml.h
+3
-1
paddle/fluid/platform/dynload/tensorrt.h
paddle/fluid/platform/dynload/tensorrt.h
+1
-1
paddle/fluid/platform/dynload/warpctc.h
paddle/fluid/platform/dynload/warpctc.h
+1
-1
paddle/fluid/platform/port.h
paddle/fluid/platform/port.h
+0
-1
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+8
-0
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+9
-3
python/paddle/fluid/backward.py
python/paddle/fluid/backward.py
+2
-77
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+15
-7
python/paddle/fluid/layers/control_flow.py
python/paddle/fluid/layers/control_flow.py
+2
-153
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+67
-18
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+12
-4
python/paddle/fluid/tests/book/notest_understand_sentiment.py
...on/paddle/fluid/tests/book/notest_understand_sentiment.py
+1
-17
python/paddle/fluid/tests/book/test_recognize_digits.py
python/paddle/fluid/tests/book/test_recognize_digits.py
+2
-15
python/paddle/fluid/tests/book/test_word2vec.py
python/paddle/fluid/tests/book/test_word2vec.py
+2
-14
python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
.../tests/book_memory_optimization/test_memopt_fit_a_line.py
+0
-87
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
...ddle/fluid/tests/unittests/parallel_executor_test_base.py
+2
-0
python/paddle/fluid/tests/unittests/test_adam_op.py
python/paddle/fluid/tests/unittests/test_adam_op.py
+32
-20
python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
...d/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+1
-0
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
...dle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
+123
-0
python/paddle/fluid/tests/unittests/test_parallel_op.py
python/paddle/fluid/tests/unittests/test_parallel_op.py
+0
-235
python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
.../paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
+76
-0
python/paddle/fluid/tests/unittests/test_transpose_op.py
python/paddle/fluid/tests/unittests/test_transpose_op.py
+11
-2
python/paddle/fluid/transpiler/memory_optimization_transpiler.py
...paddle/fluid/transpiler/memory_optimization_transpiler.py
+55
-18
python/setup.py.in
python/setup.py.in
+5
-4
未找到文件。
CMakeLists.txt
浏览文件 @
5c7a8aee
...
...
@@ -208,10 +208,10 @@ include(external/xxhash) # download xxhash
include
(
external/dlpack
)
include
(
external/snappy
)
# download snappy
include
(
external/snappystream
)
# download snappystream
include
(
external/warpctc
)
# download, build, install warpctc
if
(
NOT WIN32
)
# there is no official support of warpctc, nccl, cupti in windows
include
(
external/warpctc
)
# download, build, install warpctc
# there is no official support of nccl, cupti in windows
include
(
cupti
)
include
(
external/gzstream
)
endif
(
NOT WIN32
)
...
...
cmake/external/python.cmake
浏览文件 @
5c7a8aee
...
...
@@ -18,8 +18,8 @@ ENDIF()
INCLUDE
(
python_module
)
FIND_PACKAGE
(
PythonInterp
${
PY_VERSION
}
)
FIND_PACKAGE
(
PythonLibs
${
PY_VERSION
}
)
FIND_PACKAGE
(
PythonInterp
${
PY_VERSION
}
REQUIRED
)
FIND_PACKAGE
(
PythonLibs
${
PY_VERSION
}
REQUIRED
)
if
(
WIN32
)
execute_process
(
COMMAND
"
${
PYTHON_EXECUTABLE
}
"
"-c"
...
...
@@ -79,6 +79,5 @@ IF(PYTHONINTERP_FOUND)
"please use pip to upgrade protobuf. pip install -U protobuf"
)
ENDIF
()
ENDIF
(
PYTHONINTERP_FOUND
)
INCLUDE_DIRECTORIES
(
${
PYTHON_INCLUDE_DIR
}
)
INCLUDE_DIRECTORIES
(
${
PYTHON_NUMPY_INCLUDE_DIR
}
)
cmake/external/warpctc.cmake
浏览文件 @
5c7a8aee
...
...
@@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
# Used in unit test test_WarpCTCLayer
SET
(
WARPCTC_LIB_DIR
"
${
WARPCTC_INSTALL_DIR
}
/lib"
CACHE PATH
"Warp-ctc Library Directory"
FORCE
)
SET
(
WARPCTC_LIBRARIES
"
${
WARPCTC_INSTALL_DIR
}
/lib/libwarpctc
${
CMAKE_SHARED_LIBRARY_SUFFIX
}
"
CACHE FILEPATH
"Warp-ctc Library"
FORCE
)
IF
(
CMAKE_CXX_COMPILER_ID STREQUAL
"Clang"
OR CMAKE_CXX_COMPILER_ID STREQUAL
"AppleClang"
)
IF
(
CMAKE_CXX_COMPILER_ID STREQUAL
"Clang"
OR CMAKE_CXX_COMPILER_ID STREQUAL
"AppleClang"
OR WIN32
)
SET
(
USE_OMP OFF
)
ELSE
()
SET
(
USE_OMP ON
)
ENDIF
()
IF
(
WIN32
)
SET
(
WARPCTC_REPOSITORY
"https://github.com/wopeizl/warp-ctc.git"
)
ELSE
()
SET
(
WARPCTC_REPOSITORY
"https://github.com/dzhwinter/warp-ctc.git"
)
ENDIF
()
ExternalProject_Add
(
extern_warpctc
${
EXTERNAL_PROJECT_LOG_ARGS
}
GIT_REPOSITORY
"https://github.com/dzhwinter/warp-ctc.git"
GIT_REPOSITORY
${
WARPCTC_REPOSITORY
}
PREFIX
${
WARPCTC_SOURCES_DIR
}
UPDATE_COMMAND
""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=
${
CMAKE_CXX_COMPILER
}
-DCMAKE_C_COMPILER=
${
CMAKE_C_COMPILER
}
-DCMAKE_CXX_FLAGS=
${
CMAKE_CXX_FLAGS
}
-DCMAKE_C_FLAGS=
${
CMAKE_C_FLAGS
}
-DCMAKE_C_FLAGS_DEBUG=
${
CMAKE_C_FLAGS_DEBUG
}
-DCMAKE_C_FLAGS_RELEASE=
${
CMAKE_C_FLAGS_RELEASE
}
-DCMAKE_CXX_FLAGS=
${
CMAKE_CXX_FLAGS
}
-DCMAKE_CXX_FLAGS_RELEASE=
${
CMAKE_CXX_FLAGS_RELEASE
}
-DCMAKE_CXX_FLAGS_DEBUG=
${
CMAKE_CXX_FLAGS_DEBUG
}
-DCMAKE_INSTALL_PREFIX=
${
WARPCTC_INSTALL_DIR
}
-DWITH_GPU=
${
WITH_GPU
}
-DWITH_OMP=
${
USE_OMP
}
...
...
@@ -59,6 +67,18 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:PATH=
${
WARPCTC_INSTALL_DIR
}
)
IF
(
WIN32
)
IF
(
NOT EXISTS
"
${
WARPCTC_INSTALL_DIR
}
/lib/warpctc
${
CMAKE_SHARED_LIBRARY_SUFFIX
}
"
)
add_custom_command
(
TARGET extern_warpctc POST_BUILD
COMMAND cmake -E copy
${
WARPCTC_INSTALL_DIR
}
/bin/warpctc
${
CMAKE_SHARED_LIBRARY_SUFFIX
}
${
WARPCTC_INSTALL_DIR
}
/lib/warpctc
${
CMAKE_SHARED_LIBRARY_SUFFIX
}
)
ENDIF
()
SET
(
WARPCTC_LIBRARIES
"
${
WARPCTC_INSTALL_DIR
}
/lib/warpctc
${
CMAKE_SHARED_LIBRARY_SUFFIX
}
"
CACHE FILEPATH
"Warp-ctc Library"
FORCE
)
else
(
WIN32
)
SET
(
WARPCTC_LIBRARIES
"
${
WARPCTC_INSTALL_DIR
}
/lib/libwarpctc
${
CMAKE_SHARED_LIBRARY_SUFFIX
}
"
CACHE FILEPATH
"Warp-ctc Library"
FORCE
)
ENDIF
(
WIN32
)
MESSAGE
(
STATUS
"warp-ctc library:
${
WARPCTC_LIBRARIES
}
"
)
INCLUDE_DIRECTORIES
(
${
WARPCTC_INCLUDE_DIR
}
)
# For warpctc code to include its headers.
...
...
cmake/inference_lib.cmake
浏览文件 @
5c7a8aee
...
...
@@ -200,6 +200,13 @@ if (WITH_ANAKIN AND WITH_MKL)
list
(
APPEND inference_deps anakin_inference_lib
)
endif
()
if
(
TENSORRT_FOUND
)
copy
(
tensorrt_lib DEPS
${
inference_deps
}
SRCS
${
TENSORRT_ROOT
}
/include/Nv*.h
${
TENSORRT_ROOT
}
/lib/libnvinfer*
DSTS
${
FLUID_INSTALL_DIR
}
/third_party/install/tensorrt/include
${
FLUID_INSTALL_DIR
}
/third_party/install/tensorrt/lib
)
endif
()
set
(
module
"inference"
)
if
(
WIN32
)
set
(
paddle_fluid_lib
${
PADDLE_BINARY_DIR
}
/paddle/fluid/inference/
${
CMAKE_BUILD_TYPE
}
/libpaddle_fluid.*
)
...
...
cmake/operators.cmake
浏览文件 @
5c7a8aee
...
...
@@ -84,7 +84,7 @@ function(op_library TARGET)
endif
()
if
(
WIN32
)
# remove windows unsupported op, because windows has no nccl, no warpctc such ops.
foreach
(
windows_unsupport_op
"nccl_op"
"gen_nccl_id_op"
"warpctc_op"
)
foreach
(
windows_unsupport_op
"nccl_op"
"gen_nccl_id_op"
)
if
(
"
${
TARGET
}
"
STREQUAL
"
${
windows_unsupport_op
}
"
)
return
()
endif
()
...
...
paddle/fluid/API.spec
浏览文件 @
5c7a8aee
...
...
@@ -209,6 +209,7 @@ paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None
paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
...
...
@@ -375,7 +376,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin
paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'
], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, Non
e))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'
, 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, Fals
e))
paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
...
...
paddle/fluid/framework/details/CMakeLists.txt
浏览文件 @
5c7a8aee
...
...
@@ -50,8 +50,10 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library
(
gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor
)
cc_library
(
fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope
)
cc_library
(
memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass
)
cc_library
(
modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper
)
cc_library
(
memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass
)
cc_library
(
reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle
)
cc_library
(
eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper
)
cc_library
(
eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass
)
...
...
@@ -63,7 +65,12 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
cc_library
(
multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle
)
set
(
SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass
)
set
(
SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass
)
if
(
WITH_GPU
)
list
(
APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass
)
endif
()
cc_test
(
memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph
)
cc_test
(
analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass
)
cc_library
(
ssa_graph_executor SRCS ssa_graph_executor.cc DEPS
${
SSA_GRAPH_EXECUTOR_DEPS
}
)
...
...
@@ -84,4 +91,5 @@ cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fuse
cc_library
(
build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass
)
fuse_elewise_add_act_pass multi_batch_merge_pass
memory_optimize_pass
)
paddle/fluid/framework/details/analysis_var_pass.cc
0 → 100644
浏览文件 @
5c7a8aee
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/analysis_var_pass.h"
#include <algorithm>
#include <atomic>
#include <deque>
#include <fstream>
#include <iostream>
#include <iterator>
#include <memory>
#include <queue>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
DEFINE_bool
(
enable_subgraph_optimize
,
false
,
"SubGraph also reuse global graph variables, it will reduce the "
"memory occupation"
"but a higher risk of memory reuse error. default disabled."
);
DEFINE_string
(
memory_optimize_debug
,
""
,
"debug the operator output variable when do the variable reuse."
"memory reuse pass."
"only for debug, default disabled."
);
namespace
paddle
{
namespace
framework
{
namespace
details
{
static
inline
bool
IsSameDesc
(
OpDesc
*
op1
,
OpDesc
*
op2
)
{
return
op1
->
Type
()
==
op2
->
Type
()
&&
op1
->
Inputs
()
==
op2
->
Inputs
()
&&
op1
->
Outputs
()
==
op2
->
Outputs
();
}
template
<
typename
Container
,
typename
Callback
>
class
FilterVariableImpl
{
public:
void
operator
()(
const
Container
&
nodes
,
Callback
callback
)
{
for
(
auto
*
node
:
nodes
)
{
callback
(
node
);
}
}
};
// filter var node for op->inputs/outputs
template
<
typename
Callback
>
class
FilterVariableImpl
<
std
::
vector
<
ir
::
Node
*>
,
Callback
>
{
public:
void
operator
()(
const
std
::
vector
<
ir
::
Node
*>&
nodes
,
Callback
callback
)
{
for
(
auto
*
var
:
nodes
)
{
if
(
var
->
IsVar
()
&&
!
var
->
IsCtrlVar
())
{
callback
(
var
);
}
}
}
};
template
<
typename
Container
,
typename
Callback
>
void
FilterVariables
(
const
Container
&
nodes
,
Callback
callback
)
{
FilterVariableImpl
<
Container
,
Callback
>
()(
nodes
,
callback
);
}
std
::
unique_ptr
<
ir
::
Graph
>
AnalysisVarPass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
auto
nodes
=
graph
->
Nodes
();
auto
subblock_vars
=
GetSubBlockVars
(
nodes
);
skip_set_
.
insert
(
subblock_vars
.
begin
(),
subblock_vars
.
end
());
cfg_
.
reset
(
new
details
::
ControlFlowGraph
(
*
graph
));
cfg_
->
LiveVariableAnalysis
();
InitSSAGraphNodes
();
int
reuse_id
=
0
;
for
(
size_t
idx
=
0
;
idx
<
cfg_
->
Ops
().
size
();
++
idx
)
{
auto
&
op
=
cfg_
->
Ops
()[
idx
];
auto
*
op_desc
=
op
->
Op
();
// some op in graph has no op desc
if
(
op_desc
==
nullptr
)
continue
;
if
(
OpHasSubBlock
(
op_desc
))
{
if
(
FLAGS_enable_subgraph_optimize
)
{
SubGraphOptimize
(
op_desc
);
}
else
{
VLOG
(
3
)
<<
op
->
Name
()
<<
" has subblock, but disable subgraph optimize. skipped."
;
continue
;
}
}
for
(
auto
&
var
:
op
->
outputs
)
{
if
(
NodeCanReused
(
var
)
&&
cfg_
->
Use
(
op
).
count
(
var
->
Name
())
==
0
)
{
ir
::
Node
*
cache
=
pool_
.
NodeMatch
(
var
);
if
(
var
->
Name
()
==
FLAGS_memory_optimize_debug
)
{
VLOG
(
3
)
<<
"start match var "
<<
DebugString
(
var
)
<<
" of op "
<<
op
->
Name
();
VLOG
(
3
)
<<
pool_
.
ToString
();
VLOG
(
3
)
<<
"matched in pool : "
<<
((
cache
==
nullptr
)
?
"False"
:
"True"
);
}
if
(
cache
!=
nullptr
)
{
if
(
var
->
Name
()
==
cache
->
Name
())
{
VLOG
(
3
)
<<
"The same cache variable is cascade reused."
<<
var
->
Name
()
<<
" is re-filled to the pool after"
<<
"the reused op is finished. Current op can not "
<<
"replace it again. Skip this candidate."
;
continue
;
}
int
node_idx_in_pool
=
pool_
.
GetIndex
(
cache
);
VLOG
(
3
)
<<
string
::
Sprintf
(
"!!! %s, %s => %s, cache idx %d, pool size %d"
,
std
::
to_string
(
reuse_id
++
),
DebugString
(
var
),
DebugString
(
cache
),
node_idx_in_pool
,
static_cast
<
int
>
(
pool_
.
size
()));
// update CFG Graph on the fly.
// reused var maybe re-fill into the pool
cfg_
->
RenameVarInCFGGraph
(
var
->
Name
(),
cache
->
Name
(),
idx
);
// NOTE(dzhwinter): we need to both update the ProgramDesc
// and IR Graph. because op_desc/var_desc is used in CreateOp,
// CreateVar when running happens. But IR Graph
// define the dependence relationship between nodes.
RenameVarInGraphDesc
(
var
->
Name
(),
cache
->
Name
(),
idx
);
RenameVarInGraphNode
(
var
->
Name
(),
cache
->
Name
(),
idx
,
graph
.
get
());
pool_
.
Erase
(
cache
);
}
}
}
// fill the pool
for
(
auto
var
:
cfg_
->
LiveIn
(
op
))
{
if
(
cfg_
->
LiveOut
(
op
).
count
(
var
)
==
0
)
{
ir
::
Node
*
var_node
=
cfg_
->
GetNodeFromVarName
(
var
,
op
);
if
(
var_node
==
nullptr
)
continue
;
if
(
NodeCanReused
(
var_node
)
&&
!
pool_
.
Has
(
var_node
))
{
pool_
.
Insert
(
var_node
,
op
);
}
}
}
}
graph
->
ResolveHazard
(
var_nodes_
);
// For early delete pass. use GraphNodePool load the unlived vars.
// 1. find all deps op for each unlived var in memory pool.
for
(
auto
&
op
:
graph
->
Nodes
())
{
for
(
auto
&
var
:
op
->
inputs
)
{
if
(
pool_
.
Has
(
var
))
{
pool_
.
Insert
(
var
,
op
);
}
}
}
// 2. convert ir node based memory pool to graph node
// because Node* maybe released bettwen passes.
auto
&
graph_pool
=
graph
->
Get
<
GraphNodePool
>
(
kGraphNodePool
);
for
(
auto
it
=
pool_
.
begin
();
it
!=
pool_
.
end
();
++
it
)
{
std
::
unordered_set
<
OpDesc
*>
descs
;
for
(
auto
&
op
:
it
->
second
)
{
PADDLE_ENFORCE
(
op
->
IsOp
());
descs
.
insert
(
op
->
Op
());
}
graph_pool
.
push_back
(
std
::
make_pair
(
it
->
first
->
Name
(),
descs
));
}
return
graph
;
}
void
AnalysisVarPass
::
SubGraphOptimize
(
OpDesc
*
op_desc
)
const
{
// conditional block, while op and their grad op
auto
*
sub_block_desc
=
AttrReader
(
op_desc
->
GetAttrMap
()).
Get
<
BlockDesc
*>
(
"sub_block"
);
// create a mirror block to construct an IR Graph.
ProgramDesc
prog
;
auto
*
copy_block
=
prog
.
MutableBlock
(
0
);
for
(
auto
*
op
:
sub_block_desc
->
AllOps
())
{
auto
*
copy_op
=
copy_block
->
AppendOp
();
copy_op
->
CopyFrom
(
*
op
);
copy_op
->
Flush
();
}
for
(
auto
*
var
:
sub_block_desc
->
AllVars
())
{
auto
*
copy_var
=
copy_block
->
Var
(
var
->
Name
());
copy_var
->
SetDataType
(
var
->
GetDataType
());
// only lod tensor can be reused. So ignore the multiple dims case.
copy_var
->
SetType
(
var
->
GetType
());
copy_var
->
SetShape
(
var
->
GetShape
());
copy_var
->
SetPersistable
(
var
->
Persistable
());
}
ir
::
Graph
sub_graph
(
prog
);
std
::
unordered_set
<
ir
::
Node
*>
sub_graph_all_ops
;
FilterVariables
(
sub_graph
.
Nodes
(),
[
&
](
ir
::
Node
*
var
)
{
// sub_graph_all_ops.emplace(var);
if
(
var
->
IsVar
()
&&
!
var
->
IsCtrlVar
())
{
sub_graph_all_ops
.
emplace
(
var
);
}
});
int
sub_reuse_id
=
0
;
// subgraph nodes is unordered, reuse need to follow the desc order.
// find the right op node through the descs
for
(
auto
*
sub_op_desc
:
sub_block_desc
->
AllOps
())
{
ir
::
Node
*
sub_op
=
nullptr
;
for
(
auto
*
node
:
sub_graph_all_ops
)
{
if
(
node
->
Op
()
==
sub_op_desc
)
{
sub_op
=
node
;
break
;
}
}
PADDLE_ENFORCE
(
sub_op
!=
nullptr
);
for
(
auto
*
var
:
sub_op
->
outputs
)
{
if
(
NodeCanReused
(
var
))
{
ir
::
Node
*
cache
=
pool_
.
NodeMatch
(
var
);
if
(
cache
!=
nullptr
)
{
if
(
var
->
Var
()
->
GetDataType
()
!=
cache
->
Var
()
->
GetDataType
())
{
continue
;
}
int
node_idx_in_pool
=
pool_
.
GetIndex
(
cache
);
VLOG
(
3
)
<<
string
::
Sprintf
(
"!!! %s, %s => %s, cache idx %d, pool size %d"
,
std
::
to_string
(
sub_reuse_id
++
),
DebugString
(
var
),
DebugString
(
cache
),
node_idx_in_pool
,
static_cast
<
int
>
(
pool_
.
size
()));
// NOTE(dzh): subblock is not in IR graph. Modify the block_desc
// immediately to make the subblock variable reuse strategy take
// effect. Because it is a single op in graph. No need to
// update the ir nodes.
sub_op_desc
->
Rename
(
var
->
Name
(),
cache
->
Name
());
if
(
sub_op_desc
->
Block
()
->
HasVar
(
var
->
Name
()))
{
sub_op_desc
->
Block
()
->
RemoveVar
(
var
->
Name
());
}
}
}
}
}
}
std
::
unordered_set
<
std
::
string
>
AnalysisVarPass
::
GetSubBlockVars
(
const
std
::
unordered_set
<
ir
::
Node
*>&
nodes
)
const
{
std
::
unordered_set
<
std
::
string
>
vars
;
for
(
auto
&
op
:
nodes
)
{
if
(
!
op
->
IsOp
()
||
op
->
Op
()
==
nullptr
)
continue
;
auto
*
op_desc
=
op
->
Op
();
if
(
OpHasSubBlock
(
op_desc
))
{
auto
inputs
=
op_desc
->
InputArgumentNames
();
auto
outputs
=
op_desc
->
OutputArgumentNames
();
vars
.
insert
(
inputs
.
begin
(),
inputs
.
end
());
vars
.
insert
(
outputs
.
begin
(),
outputs
.
end
());
}
}
return
vars
;
}
void
AnalysisVarPass
::
RenameVarInGraphDesc
(
const
std
::
string
&
var
,
const
std
::
string
&
cache_var
,
size_t
idx
)
const
{
for
(
size_t
i
=
idx
;
i
<
cfg_
->
Ops
().
size
();
++
i
)
{
auto
*
op
=
cfg_
->
Ops
()[
i
];
PADDLE_ENFORCE
(
op
->
IsOp
()
&&
op
->
Op
());
auto
*
op_desc
=
op
->
Op
();
op_desc
->
RenameInput
(
var
,
cache_var
);
op_desc
->
RenameOutput
(
var
,
cache_var
);
if
(
op_desc
->
Block
()
->
HasVar
(
var
))
op_desc
->
Block
()
->
RemoveVar
(
var
);
op_desc
->
Flush
();
}
}
void
AnalysisVarPass
::
InitSSAGraphNodes
()
const
{
std
::
unordered_map
<
std
::
string
,
std
::
unordered_set
<
ir
::
Node
*>>
all_vars
;
if
(
var_nodes_
.
empty
())
{
for
(
auto
*
op
:
cfg_
->
Ops
())
{
for
(
auto
*
node
:
op
->
inputs
)
{
if
(
all_vars
[
node
->
Name
()].
count
(
node
)
==
0
)
{
all_vars
[
node
->
Name
()].
emplace
(
node
);
var_nodes_
[
node
->
Name
()].
emplace_back
(
node
);
}
}
for
(
auto
*
node
:
op
->
outputs
)
{
if
(
all_vars
[
node
->
Name
()].
count
(
node
)
==
0
)
{
all_vars
[
node
->
Name
()].
emplace
(
node
);
var_nodes_
[
node
->
Name
()].
emplace_back
(
node
);
}
}
}
}
}
void
AnalysisVarPass
::
RenameVarInGraphNode
(
const
std
::
string
&
var
,
const
std
::
string
&
cache_var
,
size_t
idx
,
ir
::
Graph
*
graph
)
const
{
// if replace happens, we need to create a newer version cache_var
// but use the same dims/data_type with var.
PADDLE_ENFORCE
(
var_nodes_
[
var
].
size
()
>=
1
&&
var_nodes_
[
var
].
at
(
0
)
->
Var
()
!=
nullptr
);
std
::
unique_ptr
<
VarDesc
>
var_desc
(
new
VarDesc
(
*
var_nodes_
[
var
].
at
(
0
)
->
Var
()));
var_desc
->
SetName
(
cache_var
);
for
(
size_t
i
=
idx
;
i
<
cfg_
->
Ops
().
size
();
++
i
)
{
auto
*
op
=
cfg_
->
Ops
()[
i
];
// redirect the input to the latest version of cache_var
for
(
auto
*
node
:
op
->
inputs
)
{
if
(
node
->
Name
()
==
var
)
{
ir
::
Node
*
cache_node
=
graph
->
CreateVarNode
(
var_desc
.
get
());
var_nodes_
[
cache_var
].
emplace_back
(
cache_node
);
// swap node to cache_node
cache_node
->
outputs
.
insert
(
cache_node
->
outputs
.
end
(),
node
->
outputs
.
begin
(),
node
->
outputs
.
end
());
PADDLE_ENFORCE
(
node
->
inputs
.
size
()
==
1
&&
node
->
inputs
[
0
]
->
IsOp
());
auto
*
prev_op
=
node
->
inputs
[
0
];
std
::
replace
(
prev_op
->
outputs
.
begin
(),
prev_op
->
outputs
.
end
(),
node
,
cache_node
);
cache_node
->
inputs
.
emplace_back
(
prev_op
);
for
(
auto
*
next_op
:
node
->
outputs
)
{
std
::
replace
(
next_op
->
inputs
.
begin
(),
next_op
->
inputs
.
end
(),
node
,
cache_node
);
}
}
}
// if we need to rename the output,
// always create a newer version of cache_var
for
(
auto
*
node
:
op
->
outputs
)
{
if
(
node
->
Name
()
==
var
)
{
ir
::
Node
*
cache_node
=
graph
->
CreateVarNode
(
var_desc
.
get
());
var_nodes_
[
cache_var
].
emplace_back
(
cache_node
);
// swap node to cache node
cache_node
->
outputs
.
insert
(
cache_node
->
outputs
.
end
(),
node
->
outputs
.
begin
(),
node
->
outputs
.
end
());
cache_node
->
inputs
.
emplace_back
(
op
);
std
::
replace
(
op
->
outputs
.
begin
(),
op
->
outputs
.
end
(),
node
,
cache_node
);
for
(
auto
*
next_op
:
node
->
outputs
)
{
std
::
replace
(
next_op
->
inputs
.
begin
(),
next_op
->
inputs
.
end
(),
node
,
cache_node
);
}
}
}
}
// release node of unused var in graph
for
(
auto
*
node
:
var_nodes_
[
var
])
{
graph
->
RemoveNode
(
node
);
}
var_nodes_
.
at
(
var
).
clear
();
}
bool
AnalysisVarPass
::
NodeCanReused
(
ir
::
Node
*
node
)
const
{
if
(
!
node
->
IsVar
()
||
node
->
IsCtrlVar
())
return
false
;
auto
*
desc
=
node
->
Var
();
auto
type
=
desc
->
GetType
();
if
(
desc
->
Persistable
()
||
type
!=
proto
::
VarType
::
LOD_TENSOR
||
desc
->
GetShape
().
empty
())
{
return
false
;
}
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
std
::
string
name
=
node
->
Name
();
if
(
!
name
.
empty
()
&&
name
[
0
]
==
'@'
&&
name
[
name
.
size
()
-
1
]
==
'@'
)
return
false
;
if
(
skip_set_
.
count
(
name
))
return
false
;
for
(
auto
*
op
:
node
->
inputs
)
{
if
(
op
->
Op
()
->
HasAttr
(
"force_cpu"
))
{
// op output force generated in cpu, can not be reused.
return
framework
::
AttrReader
(
op
->
Op
()
->
GetAttrMap
())
.
Get
<
bool
>
(
"force_cpu"
)
==
0
;
}
}
return
true
;
}
bool
AnalysisVarPass
::
OpHasSubBlock
(
OpDesc
*
desc
)
const
{
const
AttributeMap
&
attrs
=
desc
->
GetAttrMap
();
for
(
auto
&
attr
:
attrs
)
{
if
(
attr
.
second
.
type
()
==
typeid
(
BlockDesc
*
)
||
// NOLINT
attr
.
second
.
type
()
==
typeid
(
std
::
vector
<
BlockDesc
*>
))
// NOLINT
return
true
;
}
return
false
;
}
std
::
vector
<
ir
::
Node
*>
SortOpLikeDescOrder
(
const
ir
::
Graph
&
graph
)
{
PADDLE_ENFORCE
(
graph
.
Has
(
kAllOpDescs
),
"Graph has no attribute of kAllOpDescs."
);
// 1. get op desc order
auto
&
op_descs
=
graph
.
Get
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
);
// 2. topology sort order
auto
nodes
=
graph
.
Nodes
();
std
::
deque
<
ir
::
Node
*>
ops
;
FilterVariables
(
nodes
,
[
&
](
ir
::
Node
*
op
)
{
if
(
op
->
IsOp
()
&&
op
->
Op
()
!=
nullptr
)
{
ops
.
emplace_back
(
op
);
}
});
std
::
unordered_map
<
ir
::
Node
*
,
size_t
>
op_deps
;
std
::
list
<
ir
::
Node
*>
ready_ops
;
std
::
unordered_map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
pending_ops
;
for
(
auto
*
op
:
ops
)
{
std
::
unordered_set
<
ir
::
Node
*>
preceding_op
;
for
(
auto
*
in
:
op
->
inputs
)
{
if
(
in
->
inputs
.
empty
())
continue
;
PADDLE_ENFORCE
(
in
->
inputs
.
size
()
==
1
&&
in
->
inputs
[
0
]
->
IsOp
());
preceding_op
.
emplace
(
in
->
inputs
[
0
]);
pending_ops
[
in
->
inputs
[
0
]].
emplace
(
op
);
}
op_deps
[
op
]
=
preceding_op
.
size
();
if
(
preceding_op
.
empty
())
{
ready_ops
.
emplace_back
(
op
);
}
}
// 3. generated op list based desc order and the topology order
std
::
vector
<
ir
::
Node
*>
ret
;
std
::
list
<
OpDesc
*>
op_descs_list
(
op_descs
.
begin
(),
op_descs
.
end
());
auto
update_by_found_node
=
[
&
](
ir
::
Node
*
found_node
)
{
for
(
auto
*
pending_op
:
pending_ops
[
found_node
])
{
if
(
--
op_deps
[
pending_op
]
==
0
)
{
ready_ops
.
emplace_back
(
pending_op
);
}
}
ready_ops
.
remove
(
found_node
);
ret
.
emplace_back
(
found_node
);
};
while
(
!
ready_ops
.
empty
())
{
bool
all_of_ready_op_unmatched
=
true
;
for
(
auto
it
=
op_descs_list
.
begin
();
it
!=
op_descs_list
.
end
();)
{
auto
op_desc
=
*
it
;
ir
::
Node
*
found_node
=
nullptr
;
for
(
auto
*
op
:
ready_ops
)
{
if
(
IsSameDesc
(
op
->
Op
(),
op_desc
))
{
found_node
=
op
;
break
;
}
}
// 3.1 op desc deleted by other pass
if
(
found_node
==
nullptr
)
{
++
it
;
continue
;
}
else
{
all_of_ready_op_unmatched
=
false
;
it
=
op_descs_list
.
erase
(
it
);
}
update_by_found_node
(
found_node
);
}
// 3.2 op descs are added by other pass
// preceding op non empty means some new op descs are
// created, but not contained in return node list.
// these new op desc may depend on each other.
std
::
list
<
ir
::
Node
*>
prev_ready_ops
(
ready_ops
);
if
(
all_of_ready_op_unmatched
)
{
for
(
auto
op
:
prev_ready_ops
)
{
update_by_found_node
(
op
);
}
}
}
PADDLE_ENFORCE
(
std
::
all_of
(
op_deps
.
begin
(),
op_deps
.
end
(),
[
&
](
const
std
::
pair
<
ir
::
Node
*
,
size_t
>&
p
)
{
return
p
.
second
==
0
;
}));
return
ret
;
}
ControlFlowGraph
::
ControlFlowGraph
(
const
ir
::
Graph
&
graph
)
{
ops_
=
SortOpLikeDescOrder
(
graph
);
ConnectNodes
();
}
void
ControlFlowGraph
::
BuildCFGGraph
()
{
// FIXME(dzh): same effect with ConnectNodes, but use the control
// link to build dependency graph, it goes wrong in transformer.
for
(
ir
::
Node
*
op
:
ops_
)
{
for
(
auto
&
input_var
:
op
->
inputs
)
{
if
(
!
input_var
->
inputs
.
empty
())
{
PADDLE_ENFORCE
(
input_var
->
inputs
.
size
()
==
1
&&
input_var
->
inputs
[
0
]
->
IsOp
(),
"Preceding Op Node of Var Node must be unique"
);
auto
*
pred_op
=
input_var
->
inputs
[
0
];
if
(
pred_op
->
Op
()
!=
nullptr
)
{
predecessors_
[
op
].
insert
(
pred_op
);
successors_
[
pred_op
].
insert
(
op
);
}
}
if
(
input_var
->
IsVar
()
&&
!
input_var
->
IsCtrlVar
())
{
uses_
[
op
].
insert
(
input_var
->
Name
());
}
}
for
(
auto
&
output_var
:
op
->
outputs
)
{
// output var may be used by many op
for
(
auto
*
succ_op
:
output_var
->
outputs
)
{
if
(
succ_op
->
Op
()
!=
nullptr
)
{
successors_
[
op
].
insert
(
succ_op
);
predecessors_
[
succ_op
].
insert
(
op
);
}
}
if
(
output_var
->
IsVar
()
&&
!
output_var
->
IsCtrlVar
())
{
defs_
[
op
].
insert
(
output_var
->
Name
());
}
}
}
}
void
ControlFlowGraph
::
ConnectNodes
()
{
for
(
size_t
i
=
0
;
i
<
ops_
.
size
();
++
i
)
{
auto
&
op
=
ops_
[
i
];
try
{
auto
&
next_op
=
ops_
.
at
(
i
+
1
);
successors_
[
op
].
insert
(
next_op
);
predecessors_
[
next_op
].
insert
(
op
);
}
catch
(...)
{
// do nothing
}
FilterVariables
(
op
->
inputs
,
[
&
](
ir
::
Node
*
var
)
{
uses_
[
op
].
emplace
(
var
->
Name
());
});
FilterVariables
(
op
->
outputs
,
[
&
](
ir
::
Node
*
var
)
{
defs_
[
op
].
emplace
(
var
->
Name
());
});
}
}
void
ControlFlowGraph
::
LiveVariableAnalysis
()
{
// NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
// compute the liveness of for each variable though reversed_ops algorithm.
// It iterates the operators from end to begin, compute the live in/live out
// variable set for each op, then the diff between in/out will be used for
// the variable reuse. For detail refer to
// http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
std
::
list
<
ir
::
Node
*>
work_list
(
ops_
.
rbegin
(),
ops_
.
rend
());
while
(
!
work_list
.
empty
())
{
ir
::
Node
*
op
=
work_list
.
front
();
work_list
.
pop_front
();
// get the live_in calculated before. Empty if first.
auto
prev_live_in
=
std
::
move
(
live_in_
[
op
]);
for
(
auto
&
s
:
successors_
[
op
])
{
for
(
auto
&
var
:
live_in_
[
s
])
{
live_out_
[
op
].
insert
(
var
);
}
}
for
(
auto
&
var
:
uses_
[
op
])
{
live_in_
[
op
].
insert
(
var
);
}
for
(
auto
&
var
:
live_out_
[
op
])
{
live_in_
[
op
].
insert
(
var
);
}
for
(
auto
&
var
:
defs_
[
op
])
{
live_in_
[
op
].
erase
(
var
);
}
// If the live_in is not changed, then the liveness analysis of
// predecessors is completed.
//
// Otherwise, recalculate the predecessors liveness
if
(
live_in_
[
op
]
!=
prev_live_in
)
{
for
(
auto
&
pre
:
predecessors_
[
op
])
{
work_list
.
push_back
(
pre
);
}
}
}
}
void
ControlFlowGraph
::
RenameVarInCFGGraph
(
const
std
::
string
&
old_node
,
const
std
::
string
&
new_node
,
int
begin_idx
)
{
// update graph from begin idx to the end
for
(
size_t
i
=
begin_idx
;
i
!=
ops_
.
size
();
++
i
)
{
auto
*
op
=
ops_
[
i
];
if
(
uses_
[
op
].
find
(
old_node
)
!=
uses_
[
op
].
end
())
{
uses_
[
op
].
erase
(
old_node
);
uses_
[
op
].
insert
(
new_node
);
}
if
(
defs_
[
op
].
find
(
old_node
)
!=
defs_
[
op
].
end
())
{
defs_
[
op
].
erase
(
old_node
);
defs_
[
op
].
insert
(
new_node
);
}
if
(
live_in_
[
op
].
find
(
old_node
)
!=
live_in_
[
op
].
end
())
{
live_in_
[
op
].
erase
(
old_node
);
live_in_
[
op
].
insert
(
new_node
);
}
if
(
live_out_
[
op
].
find
(
old_node
)
!=
live_out_
[
op
].
end
())
{
live_out_
[
op
].
erase
(
old_node
);
live_out_
[
op
].
insert
(
new_node
);
}
}
}
const
std
::
set
<
std
::
string
>
ControlFlowGraph
::
LiveIn
(
ir
::
Node
*
op
)
const
{
auto
it
=
live_in_
.
find
(
op
);
PADDLE_ENFORCE
(
it
!=
live_in_
.
end
(),
string
::
Sprintf
(
"Expect %s in live_in, but Not Found."
,
op
->
Name
()));
return
it
->
second
;
}
const
std
::
set
<
std
::
string
>
ControlFlowGraph
::
LiveOut
(
ir
::
Node
*
op
)
const
{
auto
it
=
live_out_
.
find
(
op
);
PADDLE_ENFORCE
(
it
!=
live_out_
.
end
(),
string
::
Sprintf
(
"Expect %s in live_out, but Not Found."
,
op
->
Name
()));
return
it
->
second
;
}
const
std
::
set
<
std
::
string
>
ControlFlowGraph
::
Use
(
ir
::
Node
*
op
)
const
{
auto
it
=
uses_
.
find
(
op
);
PADDLE_ENFORCE
(
it
!=
uses_
.
end
(),
string
::
Sprintf
(
"Expect %s in live_out, but Not Found."
,
op
->
Name
()));
return
it
->
second
;
}
const
std
::
vector
<
ir
::
Node
*>
ControlFlowGraph
::
Ops
()
const
{
return
ops_
;
}
std
::
vector
<
ir
::
Node
*>&
ControlFlowGraph
::
Ops
()
{
return
ops_
;
}
ir
::
Node
*
ControlFlowGraph
::
GetNodeFromVarName
(
const
std
::
string
&
name
,
ir
::
Node
*
op
)
const
{
// in ssa-graph, different version nodes have same name,
// this function get the latest version var before target op
// It may return nullptr, such as data node.
ir
::
Node
*
found_node
=
nullptr
;
for
(
auto
*
node
:
ops_
)
{
if
(
node
==
op
)
break
;
for
(
auto
&
output
:
node
->
outputs
)
{
if
(
output
->
Name
()
==
name
)
{
found_node
=
output
;
}
}
}
return
found_node
;
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
REGISTER_PASS
(
analysis_var_pass
,
paddle
::
framework
::
details
::
AnalysisVarPass
)
.
RequireGraphAttr
(
paddle
::
framework
::
details
::
kGraphNodePool
)
.
RequireGraphAttr
(
paddle
::
framework
::
details
::
kAllOpDescs
);
paddle/fluid/framework/details/analysis_var_pass.h
0 → 100644
浏览文件 @
5c7a8aee
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <list>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
constexpr
char
kAllOpDescs
[]
=
"all_op_descs"
;
std
::
vector
<
ir
::
Node
*>
SortOpLikeDescOrder
(
const
ir
::
Graph
&
graph
);
// sort op in bfs order
std
::
vector
<
ir
::
Node
*>
BFSSortGraphOps
(
const
ir
::
Graph
&
graph
);
class
ControlFlowGraph
;
class
AnalysisVarPass
:
public
ir
::
Pass
{
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
;
private:
// fill the variable map(var_nodes) by version.
void
InitSSAGraphNodes
()
const
;
// update program descs
void
RenameVarInGraphDesc
(
const
std
::
string
&
var
,
const
std
::
string
&
cache_var
,
size_t
idx
)
const
;
// update ir nodes
void
RenameVarInGraphNode
(
const
std
::
string
&
var
,
const
std
::
string
&
cache_var
,
size_t
idx
,
ir
::
Graph
*
graph
)
const
;
void
SubGraphOptimize
(
OpDesc
*
op_desc
)
const
;
// valid a tensor can be reuse or not
bool
NodeCanReused
(
ir
::
Node
*
node
)
const
;
// scan subblock and collect the output/input variables.
std
::
unordered_set
<
std
::
string
>
GetSubBlockVars
(
const
std
::
unordered_set
<
ir
::
Node
*>&
)
const
;
// check op has subblock or not
bool
OpHasSubBlock
(
OpDesc
*
desc
)
const
;
private:
// Reuse Node Pool, Owned.
mutable
OrderedNodePairPool
pool_
;
// controlflow Graph
mutable
std
::
unique_ptr
<
ControlFlowGraph
>
cfg_
;
// skip set
mutable
std
::
unordered_set
<
std
::
string
>
skip_set_
;
// var nodes
mutable
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
var_nodes_
;
};
class
ControlFlowGraph
{
public:
ControlFlowGraph
()
=
default
;
// For IR Graph in parallelexecutor
explicit
ControlFlowGraph
(
const
ir
::
Graph
&
graph
);
void
LiveVariableAnalysis
();
void
RenameVarInCFGGraph
(
const
std
::
string
&
old_node
,
const
std
::
string
&
new_node
,
int
begin_idx
);
const
std
::
set
<
std
::
string
>
LiveIn
(
ir
::
Node
*
op
)
const
;
const
std
::
set
<
std
::
string
>
LiveOut
(
ir
::
Node
*
op
)
const
;
const
std
::
set
<
std
::
string
>
Use
(
ir
::
Node
*
op
)
const
;
const
std
::
vector
<
ir
::
Node
*>
Ops
()
const
;
std
::
vector
<
ir
::
Node
*>&
Ops
();
// for ssa-graph nodes
ir
::
Node
*
GetNodeFromVarName
(
const
std
::
string
&
name
,
ir
::
Node
*
op
)
const
;
private:
void
BuildCFGGraph
();
void
ConnectNodes
();
using
NodeListMap
=
std
::
unordered_map
<
ir
::
Node
*
,
std
::
set
<
ir
::
Node
*>>
;
using
VarSetMap
=
std
::
map
<
ir
::
Node
*
,
std
::
set
<
std
::
string
>>
;
// successors ops use the output variables.
NodeListMap
successors_
;
// predecessors ops generated input variables.
NodeListMap
predecessors_
;
// variables lived before run current op.
VarSetMap
live_in_
;
// variables lived after run current op.
VarSetMap
live_out_
;
VarSetMap
uses_
;
// op inputs
VarSetMap
defs_
;
// op outputs
std
::
vector
<
ir
::
Node
*>
ops_
;
// op sequence by topology sort
};
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/analysis_var_pass_test.cc
0 → 100644
浏览文件 @
5c7a8aee
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/analysis_var_pass.h"
#include <algorithm>
#include <iostream>
#include <iterator>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
namespace
paddle
{
namespace
framework
{
class
DummyOp
:
public
OperatorBase
{
public:
DummyOp
(
const
std
::
string
&
type
,
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
AttributeMap
&
attrs
)
:
OperatorBase
(
type
,
inputs
,
outputs
,
attrs
)
{}
private:
void
RunImpl
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
override
{}
};
class
SumOpMaker
:
public
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
AddInput
(
"X"
,
""
).
AsDuplicable
();
AddOutput
(
"Out"
,
""
);
AddComment
(
""
);
}
};
class
AssignOpMaker
:
public
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
AddInput
(
"X"
,
""
).
AsDuplicable
();
AddOutput
(
"Out"
,
""
);
AddComment
(
""
);
}
};
class
DummyVarTypeInference
:
public
VarTypeInference
{
public:
void
operator
()(
const
OpDesc
&
op_desc
,
BlockDesc
*
block
)
const
override
{
auto
&
inputs
=
op_desc
.
Input
(
"X"
);
auto
type
=
block
->
Var
(
inputs
.
front
())
->
GetType
();
auto
out_var_name
=
op_desc
.
Output
(
"Out"
).
front
();
block
->
Var
(
out_var_name
)
->
SetType
(
type
);
}
};
}
// namespace framework
}
// namespace paddle
REGISTER_OPERATOR
(
sum
,
paddle
::
framework
::
DummyOp
,
paddle
::
framework
::
SumOpMaker
,
paddle
::
framework
::
DummyVarTypeInference
);
REGISTER_OPERATOR
(
assign
,
paddle
::
framework
::
DummyOp
,
paddle
::
framework
::
AssignOpMaker
,
paddle
::
framework
::
DummyVarTypeInference
);
REGISTER_OPERATOR
(
dummy
,
paddle
::
framework
::
DummyOp
,
paddle
::
framework
::
SumOpMaker
,
paddle
::
framework
::
DummyVarTypeInference
);
/*
https://en.wikipedia.org/wiki/Live_variable_analysis
Create a customed classical dependency graph, left row is the instruction
number.
1. a = 1
2. b = a
3. c = a
4. d = b + c
5. e = d
a--------+
| |
b c
| |
d--------+
|
e
Then analysis these variable's liveness range
*/
namespace
paddle
{
namespace
framework
{
namespace
details
{
static
inline
bool
IsSameDesc
(
OpDesc
*
op1
,
OpDesc
*
op2
)
{
return
op1
->
Type
()
==
op2
->
Type
()
&&
op1
->
Inputs
()
==
op2
->
Inputs
()
&&
op1
->
Outputs
()
==
op2
->
Outputs
();
}
inline
static
ProgramDesc
FillProgramDesc
()
{
ProgramDesc
prog
;
prog
.
MutableBlock
(
0
)
->
Var
(
"a"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"b"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"c"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"d"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"e"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"assign"
);
op
->
SetInput
(
"X"
,
{
"a"
});
op
->
SetOutput
(
"Out"
,
{
"b"
});
}
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"assign"
);
op
->
SetInput
(
"X"
,
{
"a"
});
op
->
SetOutput
(
"Out"
,
{
"c"
});
}
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"sum"
);
op
->
SetInput
(
"X"
,
{
"b"
,
"c"
});
op
->
SetOutput
(
"Out"
,
{
"d"
});
}
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"assign"
);
op
->
SetInput
(
"X"
,
{
"d"
});
op
->
SetOutput
(
"Out"
,
{
"e"
});
}
return
prog
;
}
template
<
typename
Container
>
inline
static
std
::
string
DebugString
(
const
Container
&
c
)
{
std
::
stringstream
ss
;
for
(
auto
&
item
:
c
)
{
ss
<<
item
<<
" "
;
}
return
ss
.
str
();
}
TEST
(
CFGGraph
,
IRGraph
)
{
// prepare ir graph
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
ControlFlowGraph
cfg
(
graph
);
cfg
.
LiveVariableAnalysis
();
// test assign op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"a"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
0
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"a"
,
"b"
}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
0
])));
// test assign op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"a"
,
"b"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
1
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"b"
,
"c"
}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
1
])));
// test sum op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"b"
,
"c"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
2
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"d"
}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
2
])));
// test assign op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"d"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
3
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
3
])));
}
// 1. normal test
TEST
(
SortOpLikeDescOrder
,
NormalTest
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
nodes
=
SortOpLikeDescOrder
(
graph
);
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
for
(
size_t
i
=
0
;
i
<
nodes
.
size
();
++
i
)
{
auto
node
=
nodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
// 2. remove some op_desc
TEST
(
SortOpLikeDescOrder
,
RemoveOpDesc
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
nodes
=
graph
.
Nodes
();
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
ir
::
Node
*
found_node
=
nullptr
;
for
(
auto
node
:
nodes
)
{
if
(
node
->
IsOp
()
&&
node
->
outputs
.
back
()
->
Name
()
==
"e"
)
{
found_node
=
node
;
break
;
}
}
PADDLE_ENFORCE
(
found_node
!=
nullptr
);
for
(
auto
it
=
op_descs
.
begin
();
it
!=
op_descs
.
end
();)
{
if
(
IsSameDesc
(
*
it
,
found_node
->
Op
()))
{
it
=
op_descs
.
erase
(
it
);
}
else
{
++
it
;
}
}
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
for
(
auto
n
:
graph
.
Nodes
())
{
if
(
n
->
Name
()
==
s
)
{
ret
=
n
;
break
;
}
}
PADDLE_ENFORCE
(
ret
!=
nullptr
);
return
ret
;
};
ir
::
Node
*
e
=
find_node_in_graph
(
"e"
);
ir
::
Node
*
d
=
find_node_in_graph
(
"d"
);
std
::
remove
(
d
->
outputs
.
begin
(),
d
->
outputs
.
end
(),
found_node
);
graph
.
RemoveNode
(
found_node
);
graph
.
RemoveNode
(
e
);
// other node keeps the same order
auto
remain_nodes
=
SortOpLikeDescOrder
(
graph
);
for
(
size_t
i
=
0
;
i
<
remain_nodes
.
size
();
++
i
)
{
auto
node
=
remain_nodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
// 3. add some op_desc
TEST
(
SortOpLikeDescOrder
,
AddOpDesc
)
{
auto
prog
=
FillProgramDesc
();
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
ir
::
Graph
graph
(
prog
);
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
for
(
auto
n
:
graph
.
Nodes
())
{
if
(
n
->
Name
()
==
s
)
{
ret
=
n
;
break
;
}
}
PADDLE_ENFORCE
(
ret
!=
nullptr
);
return
ret
;
};
// cached desc different with real one
// mimic the intermidiete pass modify the programdesc.
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
auto
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
op
->
SetType
(
"sum"
);
op
->
SetInput
(
"X"
,
{
"b"
,
"c"
});
op
->
SetOutput
(
"Out"
,
{
"d1"
});
ir
::
Node
*
node
=
graph
.
CreateOpNode
(
op
);
ir
::
Node
*
d1
=
graph
.
CreateVarNode
(
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
));
ir
::
Node
*
b
=
find_node_in_graph
(
"b"
);
ir
::
Node
*
c
=
find_node_in_graph
(
"c"
);
node
->
outputs
.
emplace_back
(
d1
);
node
->
inputs
.
emplace_back
(
b
);
node
->
inputs
.
emplace_back
(
c
);
d1
->
inputs
.
emplace_back
(
node
);
b
->
outputs
.
emplace_back
(
node
);
c
->
outputs
.
emplace_back
(
node
);
op_descs
.
insert
(
op_descs
.
begin
()
+
4
,
op
);
auto
nodes
=
SortOpLikeDescOrder
(
graph
);
for
(
size_t
i
=
0
;
i
<
nodes
.
size
();
++
i
)
{
auto
node
=
nodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
// 4. add and delete some op_desc
TEST
(
SortOpLikeDescOrder
,
AddAndDeleteOpDesc
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
for
(
auto
n
:
graph
.
Nodes
())
{
if
(
n
->
Name
()
==
s
)
{
ret
=
n
;
break
;
}
}
PADDLE_ENFORCE
(
ret
!=
nullptr
);
return
ret
;
};
// remove sum node
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
ir
::
Node
*
found_node
=
nullptr
;
auto
nodes
=
graph
.
Nodes
();
for
(
auto
node
:
nodes
)
{
if
(
node
->
Name
()
==
"sum"
)
{
found_node
=
node
;
break
;
}
}
PADDLE_ENFORCE
(
found_node
!=
nullptr
);
for
(
auto
it
=
op_descs
.
begin
();
it
!=
op_descs
.
end
();)
{
if
(
IsSameDesc
(
*
it
,
found_node
->
Op
()))
{
it
=
op_descs
.
erase
(
it
);
}
else
{
++
it
;
}
}
{
ir
::
Node
*
d
=
find_node_in_graph
(
"d"
);
ir
::
Node
*
c
=
find_node_in_graph
(
"c"
);
ir
::
Node
*
e
=
find_node_in_graph
(
"e"
);
std
::
remove
(
d
->
outputs
.
begin
(),
d
->
outputs
.
end
(),
found_node
);
std
::
remove
(
c
->
outputs
.
begin
(),
c
->
outputs
.
end
(),
found_node
);
ir
::
Node
*
pending_op
=
found_node
->
outputs
[
0
]
->
outputs
[
0
];
graph
.
RemoveNode
(
e
);
graph
.
RemoveNode
(
pending_op
);
graph
.
RemoveNode
(
found_node
);
}
// add node
auto
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
op
->
SetType
(
"sum"
);
op
->
SetInput
(
"X"
,
{
"b"
,
"c"
});
op
->
SetOutput
(
"Out"
,
{
"d1"
});
{
ir
::
Node
*
node
=
graph
.
CreateOpNode
(
op
);
ir
::
Node
*
d1
=
graph
.
CreateVarNode
(
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
));
ir
::
Node
*
b
=
find_node_in_graph
(
"b"
);
ir
::
Node
*
c
=
find_node_in_graph
(
"c"
);
node
->
outputs
.
emplace_back
(
d1
);
node
->
inputs
.
emplace_back
(
b
);
node
->
inputs
.
emplace_back
(
c
);
b
->
outputs
.
emplace_back
(
node
);
c
->
outputs
.
emplace_back
(
node
);
}
op_descs
.
insert
(
op_descs
.
begin
()
+
2
,
op
);
// check the order
auto
mynodes
=
SortOpLikeDescOrder
(
graph
);
for
(
size_t
i
=
0
;
i
<
mynodes
.
size
();
++
i
)
{
auto
node
=
mynodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
// 5. add and replace some op_desc inplace.
TEST
(
SortOpLikeDescOrder
,
AddAndReplaceOpDescInplace
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
for
(
auto
n
:
graph
.
Nodes
())
{
if
(
n
->
Name
()
==
s
)
{
ret
=
n
;
break
;
}
}
PADDLE_ENFORCE
(
ret
!=
nullptr
);
return
ret
;
};
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
// add node
auto
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
op
->
SetType
(
"sum"
);
op
->
SetInput
(
"X"
,
{
"b"
,
"c"
});
op
->
SetOutput
(
"Out"
,
{
"d1"
});
{
ir
::
Node
*
node
=
graph
.
CreateOpNode
(
op
);
ir
::
Node
*
d1
=
graph
.
CreateVarNode
(
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
));
ir
::
Node
*
b
=
find_node_in_graph
(
"b"
);
ir
::
Node
*
c
=
find_node_in_graph
(
"c"
);
node
->
outputs
.
emplace_back
(
d1
);
node
->
inputs
.
emplace_back
(
b
);
node
->
inputs
.
emplace_back
(
c
);
d1
->
inputs
.
emplace_back
(
node
);
b
->
outputs
.
emplace_back
(
node
);
c
->
outputs
.
emplace_back
(
node
);
}
op_descs
.
emplace_back
(
op
);
// replace op_desc inplace
auto
nodes
=
graph
.
Nodes
();
ir
::
Node
*
found_node
=
nullptr
;
for
(
auto
node
:
nodes
)
{
if
(
node
->
IsOp
()
&&
node
->
Op
()
&&
node
->
Name
()
==
"assign"
)
{
if
(
node
->
outputs
.
size
()
==
1
&&
node
->
outputs
[
0
]
->
Name
()
==
"e"
)
{
found_node
=
node
;
break
;
}
}
}
{
ir
::
Node
*
d
=
find_node_in_graph
(
"d"
);
ir
::
Node
*
e
=
find_node_in_graph
(
"e"
);
std
::
remove
(
d
->
outputs
.
begin
(),
d
->
outputs
.
end
(),
found_node
);
std
::
remove
(
e
->
inputs
.
begin
(),
e
->
inputs
.
end
(),
found_node
);
graph
.
RemoveNode
(
found_node
);
}
op_descs
.
erase
(
op_descs
.
begin
()
+
3
);
auto
replace_op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
replace_op
->
SetType
(
"sum"
);
replace_op
->
SetInput
(
"X"
,
{
"d"
,
"d1"
});
replace_op
->
SetOutput
(
"Out"
,
{
"e"
});
{
ir
::
Node
*
sum2
=
graph
.
CreateOpNode
(
replace_op
);
ir
::
Node
*
e
=
find_node_in_graph
(
"e"
);
ir
::
Node
*
d
=
find_node_in_graph
(
"d"
);
ir
::
Node
*
d1
=
find_node_in_graph
(
"d1"
);
sum2
->
inputs
.
emplace_back
(
d
);
sum2
->
inputs
.
emplace_back
(
d1
);
sum2
->
outputs
.
emplace_back
(
e
);
e
->
inputs
.
emplace_back
(
sum2
);
d
->
outputs
.
emplace_back
(
sum2
);
d1
->
outputs
.
emplace_back
(
sum2
);
}
op_descs
.
emplace_back
(
replace_op
);
// compare op order
auto
graph_nodes
=
SortOpLikeDescOrder
(
graph
);
for
(
size_t
i
=
0
;
i
<
graph_nodes
.
size
();
++
i
)
{
auto
node
=
graph_nodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/build_strategy.cc
浏览文件 @
5c7a8aee
...
...
@@ -14,11 +14,16 @@ limitations under the License. */
#include "paddle/fluid/framework/details/build_strategy.h"
#include <glog/logging.h>
#include <memory>
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace
paddle
{
...
...
@@ -69,6 +74,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
}
VLOG
(
1
)
<<
"CollectiveContext:"
<<
context
->
String
();
// NOTE(dzh): memory optimize should be a runtime pass.
// However, after multi_devices_pass, VarHandle, OpHandle is
// the de-fact IR, any reuse on Graph is meaningless.
// A side-effect of that, memory optimize cannot forsee the fetched vars
// , so fetchlist should be set persistable before call the Run interface.
if
(
strategy
.
memory_optimize_
)
{
auto
analysis_var_pass
=
AppendPass
(
"analysis_var_pass"
);
}
// Convert graph to run on multi-devices.
auto
multi_devices_pass
=
AppendPass
(
"multi_devices_pass"
);
multi_devices_pass
->
SetNotOwned
<
const
BuildStrategy
>
(
"strategy"
,
...
...
@@ -79,8 +92,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// Add a graph print pass to record a graph with device info.
if
(
!
strategy_
.
debug_graphviz_path_
.
empty
())
{
auto
multi_devices_print_pass
=
AppendPass
(
"multi_devices_print_pass"
);
multi_devices_print_pass
->
SetNotOwned
<
const
std
::
string
>
(
"debug_graphviz_path"
,
&
strategy_
.
debug_graphviz_path_
);
const
std
::
string
graph_path
=
string
::
Sprintf
(
"%s%s"
,
strategy_
.
debug_graphviz_path_
.
c_str
(),
"_multi_devices_graph"
);
multi_devices_print_pass
->
Set
<
std
::
string
>
(
kGraphvizPath
,
new
std
::
string
(
graph_path
));
multi_devices_print_pass
->
Set
<
details
::
GraphvizSSAGraphPrinter
>
(
"graph_printer"
,
new
details
::
GraphvizSSAGraphPrinter
);
}
...
...
@@ -127,7 +143,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
CreatePassesFromStrategy
(
false
);
std
::
unique_ptr
<
ir
::
Graph
>
graph
(
new
ir
::
Graph
(
main_program
));
for
(
std
::
shared_ptr
<
ir
::
Pass
>
&
pass
:
pass_builder_
->
AllPasses
())
{
if
(
pass
->
Type
()
==
"multi_devices_pass"
)
{
pass
->
Erase
(
"places"
);
...
...
@@ -145,6 +160,17 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass
->
Erase
(
"nccl_ctxs"
);
pass
->
SetNotOwned
<
platform
::
NCCLContextMap
>
(
"nccl_ctxs"
,
nctx
);
#endif
}
else
if
(
pass
->
Type
()
==
"analysis_var_pass"
)
{
const
std
::
vector
<
OpDesc
*>
*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
main_program
.
Block
(
0
).
AllOps
());
graph
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
all_op_descs
);
// take ownership
graph
->
Set
<
GraphNodePool
>
(
kGraphNodePool
,
new
GraphNodePool
);
// take ownership
pass
->
Erase
(
kAllOpDescs
);
pass
->
SetNotOwned
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
all_op_descs
);
}
else
if
(
pass
->
Type
()
==
"sequential_execution_pass"
)
{
LOG
(
INFO
)
<<
"set enable_sequential_execution:"
<<
enable_sequential_execution_
;
...
...
@@ -166,6 +192,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
}
return
graph
;
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
...
...
@@ -176,6 +203,7 @@ USE_PASS(multi_batch_merge_pass);
USE_PASS
(
multi_devices_pass
);
USE_PASS
(
multi_devices_check_pass
);
USE_PASS
(
multi_devices_print_pass
);
USE_PASS
(
analysis_var_pass
);
USE_PASS
(
sequential_execution_pass
);
USE_PASS
(
all_reduce_deps_pass
);
USE_PASS
(
modify_op_lock_and_record_event_pass
);
paddle/fluid/framework/details/build_strategy.h
浏览文件 @
5c7a8aee
...
...
@@ -60,8 +60,15 @@ struct BuildStrategy {
kCustomized
=
2
,
};
enum
class
OptimizeStrategy
{
// To be Implemented,bruteforce, recursive compute unused var names.
kBruteForce
=
0
,
kControlFlowGraph
=
1
,
// use cfg_graph algorithm, faster speed.
};
ReduceStrategy
reduce_
{
ReduceStrategy
::
kAllReduce
};
GradientScaleStrategy
gradient_scale_
{
GradientScaleStrategy
::
kCoeffNumDevice
};
OptimizeStrategy
strategy_
{
OptimizeStrategy
::
kControlFlowGraph
};
std
::
string
debug_graphviz_path_
{
""
};
...
...
@@ -69,6 +76,10 @@ struct BuildStrategy {
bool
enable_data_balance_
{
false
};
bool
memory_optimize_
{
false
};
bool
memory_early_delete_
{
false
};
bool
enable_sequential_execution_
{
false
};
bool
fuse_broadcast_op_
{
false
};
...
...
paddle/fluid/framework/details/early_delete_op_handle.h
0 → 100644
浏览文件 @
5c7a8aee
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
class
EarlyDeleteOpHandle
:
public
OpHandleBase
{
public:
EarlyDeleteOpHandle
(
ir
::
Node
*
node
,
const
Scope
*
scope
,
const
platform
::
Place
&
place
,
const
std
::
vector
<
std
::
string
>&
names
,
GarbageCollector
*
gc
)
:
OpHandleBase
(
node
),
scope_
(
scope
),
place_
(
place
),
names_
(
names
),
gc_
(
gc
)
{
#ifdef PADDLE_WITH_CUDA
if
(
IsStreamGarabageCollector
())
{
auto
gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
place
);
PADDLE_ENFORCE
(
cudaSetDevice
(
gpu_place
.
device
));
PADDLE_ENFORCE
(
cudaEventCreateWithFlags
(
&
event_
,
cudaEventDisableTiming
));
}
#endif
}
~
EarlyDeleteOpHandle
()
{
#ifdef PADDLE_WITH_CUDA
if
(
IsStreamGarabageCollector
())
{
auto
gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dev_ctx_
->
GetPlace
());
PADDLE_ENFORCE
(
cudaSetDevice
(
gpu_place
.
device
));
PADDLE_ENFORCE
(
cudaEventDestroy
(
event_
));
}
#endif
}
std
::
string
Name
()
const
override
{
return
"early_delete"
;
}
protected:
void
RunImpl
()
override
{
std
::
vector
<
std
::
shared_ptr
<
memory
::
Allocation
>>
tensors
;
auto
*
local_scope
=
scope_
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
();
for
(
auto
&
var_name
:
names_
)
{
auto
*
var
=
local_scope
->
FindVar
(
var_name
);
PADDLE_ENFORCE
(
var
!=
nullptr
,
string
::
Sprintf
(
"Local Scope not has var %s"
,
var_name
));
if
(
var
->
IsType
<
LoDTensor
>
())
{
tensors
.
emplace_back
(
var
->
GetMutable
<
LoDTensor
>
()
->
MoveMemoryHolder
());
}
else
if
(
var
->
IsType
<
SelectedRows
>
())
{
tensors
.
emplace_back
(
var
->
GetMutable
<
SelectedRows
>
()
->
mutable_value
()
->
MoveMemoryHolder
());
}
else
if
(
var
->
IsType
<
LoDTensorArray
>
())
{
LoDTensorArray
*
tensor_array
=
var
->
GetMutable
<
LoDTensorArray
>
();
for
(
auto
&
tensor
:
*
tensor_array
)
{
tensors
.
emplace_back
(
tensor
.
MoveMemoryHolder
());
}
}
}
if
(
!
tensors
.
empty
())
{
ClearTensors
(
tensors
);
}
}
private:
void
ClearTensors
(
const
std
::
vector
<
std
::
shared_ptr
<
memory
::
Allocation
>>&
tensors
)
{
if
(
platform
::
is_cpu_place
(
place_
))
{
ClearCPUTensors
(
tensors
);
}
else
{
ClearGPUTensors
(
tensors
);
}
}
void
ClearCPUTensors
(
const
std
::
vector
<
std
::
shared_ptr
<
memory
::
Allocation
>>&
tensors
)
{
auto
*
gc
=
dynamic_cast
<
CPUGarbageCollector
*>
(
gc_
);
if
(
gc
!=
nullptr
)
{
gc
->
Add
(
tensors
);
}
}
void
ClearGPUTensors
(
const
std
::
vector
<
std
::
shared_ptr
<
memory
::
Allocation
>>&
tensors
)
{
#ifdef PADDLE_WITH_CUDA
auto
*
gc
=
dynamic_cast
<
StreamGarbageCollector
*>
(
gc_
);
if
(
gc
!=
nullptr
)
{
auto
compute_stream
=
dev_ctx_
->
stream
();
auto
callback_stream
=
gc
->
stream
();
auto
callback_func
=
[
=
]()
{
PADDLE_ENFORCE
(
cudaEventRecord
(
event_
,
compute_stream
));
PADDLE_ENFORCE
(
cudaStreamWaitEvent
(
callback_stream
,
event_
,
0
));
};
gc_
->
Add
(
tensors
,
callback_func
);
}
else
{
gc_
->
Add
(
tensors
);
}
}
bool
IsStreamGarabageCollector
()
const
{
return
dynamic_cast
<
const
StreamGarbageCollector
*>
(
gc_
)
!=
nullptr
;
#endif
}
const
Scope
*
scope_
;
const
platform
::
Place
place_
;
std
::
vector
<
std
::
string
>
names_
;
GarbageCollector
*
gc_
;
#ifdef PADDLE_WITH_CUDA
platform
::
CUDADeviceContext
*
dev_ctx_
;
cudaEvent_t
event_
;
#endif
};
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/memory_early_delete_pass.cc
0 → 100644
浏览文件 @
5c7a8aee
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_early_delete_pass.h"
#include <queue>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
static
ComputationOpHandle
*
FindNextComputationOpHandle
(
VarHandle
*
var_in
)
{
std
::
queue
<
VarHandleBase
*>
queue
;
queue
.
push
(
var_in
);
do
{
auto
*
var
=
queue
.
front
();
queue
.
pop
();
for
(
auto
*
op
:
var
->
PendingOps
())
{
auto
*
compute_op
=
dynamic_cast
<
ComputationOpHandle
*>
(
op
);
if
(
compute_op
!=
nullptr
&&
compute_op
->
GetPlace
()
==
var_in
->
place_
)
{
return
compute_op
;
}
for
(
auto
*
out_var
:
op
->
Outputs
())
{
queue
.
push
(
out_var
);
}
}
}
while
(
!
queue
.
empty
());
return
nullptr
;
}
std
::
unique_ptr
<
ir
::
Graph
>
MemoryEarlyDeletePass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
auto
&
graph_pool
=
Get
<
GraphNodePool
>
(
kGraphNodePool
);
auto
&
gcs
=
Get
<
GarbageCollectorMap
>
(
kGarbageCollector
);
std
::
unordered_map
<
std
::
string
,
std
::
unordered_set
<
OpDesc
*>>
unlived_vars
;
unlived_vars
.
reserve
(
graph_pool
.
size
());
for
(
auto
&
pair
:
graph_pool
)
{
unlived_vars
.
insert
(
std
::
make_pair
(
pair
.
first
,
pair
.
second
));
}
auto
compare_and_insert_early_delete_op
=
[
&
](
OpHandleBase
*
op
,
const
std
::
vector
<
VarHandleBase
*>&
vars
)
{
if
(
unlived_vars
.
empty
())
return
;
// unlived vars can be deleted after the last used op has finished.
auto
*
compute_op
=
dynamic_cast
<
ComputationOpHandle
*>
(
op
);
const
auto
&
places
=
Get
<
std
::
vector
<
platform
::
Place
>>
(
kAllPlaces
);
for
(
auto
&
var
:
vars
)
{
auto
*
var_handle
=
dynamic_cast
<
VarHandle
*>
(
var
);
auto
var_name
=
var
->
Node
()
->
Name
();
auto
&
var_place
=
var_handle
->
place_
;
if
(
unlived_vars
.
count
(
var_name
)
==
0
)
continue
;
if
(
!
unlived_vars
[
var_name
].
empty
())
{
if
(
compute_op
!=
nullptr
&&
unlived_vars
[
var_name
].
count
(
compute_op
->
Node
()
->
Op
())
!=
0
)
{
unlived_vars
[
var_name
].
erase
(
compute_op
->
Node
()
->
Op
());
}
continue
;
}
if
(
var_handle
==
nullptr
||
!
var_handle
->
Node
()
->
IsVar
()
||
var_handle
->
Node
()
->
IsCtrlVar
())
continue
;
// shameless copyed from reference count pass.
if
(
compute_op
==
nullptr
)
{
// use next computation op scope
compute_op
=
FindNextComputationOpHandle
(
var_handle
);
}
auto
*
early_delete_node
=
graph
->
CreateEmptyNode
(
"early_delete"
,
ir
::
Node
::
Type
::
kOperation
);
GarbageCollector
*
gc
=
gcs
.
at
(
places
[
compute_op
->
GetScopeIdx
()]).
get
();
auto
*
early_delete_handle
=
new
EarlyDeleteOpHandle
(
early_delete_node
,
compute_op
->
GetScope
(),
var_place
,
{
var_name
},
gc
);
if
(
compute_op
->
Outputs
().
empty
())
{
auto
*
dep_var
=
new
DummyVarHandle
(
graph
->
CreateControlDepVar
());
compute_op
->
AddOutput
(
dep_var
);
graph
->
Get
<
GraphDepVars
>
(
kGraphDepVars
).
emplace
(
dep_var
);
}
early_delete_handle
->
AddInput
(
compute_op
->
Outputs
().
front
());
VLOG
(
5
)
<<
"Add early delete op "
<<
var_name
<<
" to Operator"
<<
compute_op
->
Name
();
}
};
auto
all_ops
=
ir
::
FilterByNodeWrapper
<
OpHandleBase
>
(
*
graph
);
for
(
auto
&
op
:
all_ops
)
{
compare_and_insert_early_delete_op
(
op
,
op
->
Inputs
());
compare_and_insert_early_delete_op
(
op
,
op
->
Outputs
());
}
return
graph
;
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
REGISTER_PASS
(
memory_early_delete_pass
,
paddle
::
framework
::
details
::
MemoryEarlyDeletePass
)
.
RequireGraphAttr
(
paddle
::
framework
::
details
::
kGraphNodePool
)
.
RequireGraphAttr
(
paddle
::
framework
::
details
::
kGarbageCollector
);
paddle/fluid/framework/details/memory_early_delete_pass.h
0 → 100644
浏览文件 @
5c7a8aee
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/early_delete_op_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
class
MemoryEarlyDeletePass
:
public
ir
::
Pass
{
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
;
};
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/memory_reuse_types.cc
0 → 100644
浏览文件 @
5c7a8aee
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include <iostream>
#include <sstream>
#include <string>
namespace
paddle
{
namespace
framework
{
namespace
details
{
size_t
NodeSizeInBytes
(
ir
::
Node
*
n
)
{
auto
*
desc
=
FindVarDescInBlock
(
n
);
auto
shape
=
desc
->
GetShape
();
size_t
type_size
=
SizeOfType
(
desc
->
GetDataType
());
int
size
=
1
;
for
(
auto
&
s
:
shape
)
{
size
*=
s
;
}
return
type_size
*
std
::
abs
(
size
);
}
std
::
string
DebugStringImpl
(
VarDesc
*
var
)
{
std
::
stringstream
ss
;
ss
<<
var
->
Name
();
ss
<<
"["
;
try
{
auto
shape
=
var
->
GetShape
();
for
(
size_t
i
=
0
;
i
<
shape
.
size
();
++
i
)
{
if
(
i
!=
shape
.
size
()
-
1
)
{
ss
<<
shape
[
i
]
<<
","
;
}
else
{
ss
<<
shape
[
i
];
}
}
ss
<<
"]"
;
}
catch
(...)
{
ss
<<
"Var has no VarDesc !!! Name:"
<<
var
->
Name
();
}
return
ss
.
str
();
}
std
::
string
DebugString
(
ir
::
Node
*
var
)
{
return
DebugStringImpl
(
FindVarDescInBlock
(
var
));
}
// return DebugString(var->Var()); }
// NOTE(dzh): based ir node, if a large node has been reused
// by a small size node, then next time it appear in pool, it will
// have the small size. Find the original node shap from blockdesc.
VarDesc
*
FindVarDescInBlock
(
ir
::
Node
*
n
)
{
PADDLE_ENFORCE
(
n
->
IsVar
()
&&
!
n
->
IsCtrlVar
()
&&
n
->
inputs
.
size
()
==
1
);
BlockDesc
*
block
=
n
->
inputs
[
0
]
->
Op
()
->
Block
();
PADDLE_ENFORCE
(
block
->
HasVar
(
n
->
Name
()),
string
::
Sprintf
(
"Block do not has var %s"
,
n
->
Name
()));
return
block
->
FindVar
(
n
->
Name
());
}
struct
NodeComparator
{
bool
operator
()(
ir
::
Node
*
lhs
,
ir
::
Node
*
rhs
)
const
{
auto
*
lhs_desc
=
FindVarDescInBlock
(
lhs
);
auto
*
rhs_desc
=
FindVarDescInBlock
(
rhs
);
auto
lhs_shape
=
lhs_desc
->
GetShape
();
auto
rhs_shape
=
rhs_desc
->
GetShape
();
if
((
lhs_shape
[
0
]
==
-
1
&&
rhs_shape
[
0
]
==
-
1
)
||
(
lhs_shape
[
0
]
!=
-
1
&&
rhs_shape
[
0
]
!=
-
1
))
{
return
NodeSizeInBytes
(
lhs
)
<=
NodeSizeInBytes
(
rhs
);
}
else
{
return
false
;
}
}
};
void
OrderedNodePairPool
::
Insert
(
ir
::
Node
*
var
,
ir
::
Node
*
op
)
{
PADDLE_ENFORCE
(
var
->
IsVar
()
&&
!
var
->
IsCtrlVar
());
PADDLE_ENFORCE
(
op
->
IsOp
());
if
(
mark_table_
.
count
(
var
->
Name
())
!=
0
)
{
mark_table_
[
var
->
Name
()]
->
second
.
insert
(
op
);
return
;
}
auto
*
var_desc
=
FindVarDescInBlock
(
var
);
auto
var_shape
=
var_desc
->
GetShape
();
int
batch_size
=
static_cast
<
int
>
(
var_shape
[
0
]);
NodeComparator
compare_node
;
Iter
it
=
nodes_
.
begin
();
while
(
it
!=
nodes_
.
end
())
{
auto
*
cache_desc
=
FindVarDescInBlock
(
it
->
first
);
int
cache_batch_size
=
cache_desc
->
GetShape
()[
0
];
if
((
cache_batch_size
==
-
1
&&
batch_size
==
-
1
)
||
(
cache_batch_size
!=
-
1
&&
batch_size
!=
-
1
))
{
if
(
compare_node
(
it
->
first
,
var
))
{
++
it
;
}
else
{
break
;
}
}
else
if
(
cache_batch_size
==
-
1
&&
batch_size
!=
-
1
)
{
++
it
;
}
else
if
(
cache_batch_size
!=
-
1
&&
batch_size
==
-
1
)
{
break
;
}
}
it
=
nodes_
.
insert
(
it
,
std
::
make_pair
(
var
,
std
::
unordered_set
<
ir
::
Node
*>
{
op
}));
mark_table_
[
var
->
Name
()]
=
it
;
}
int
OrderedNodePairPool
::
GetIndex
(
ir
::
Node
*
var
)
{
return
std
::
distance
(
nodes_
.
begin
(),
mark_table_
[
var
->
Name
()]);
}
ir
::
Node
*
OrderedNodePairPool
::
NodeMatch
(
ir
::
Node
*
var
)
const
{
ir
::
Node
*
found_node
=
nullptr
;
NodeComparator
compare_node
;
for
(
auto
it
=
nodes_
.
begin
();
it
!=
nodes_
.
end
();
++
it
)
{
if
(
compare_node
(
var
,
it
->
first
))
{
found_node
=
it
->
first
;
break
;
}
}
return
found_node
;
}
void
OrderedNodePairPool
::
Erase
(
ir
::
Node
*
var
)
{
PADDLE_ENFORCE
(
mark_table_
.
count
(
var
->
Name
()));
nodes_
.
erase
(
mark_table_
[
var
->
Name
()]);
mark_table_
.
erase
(
var
->
Name
());
}
std
::
string
OrderedNodePairPool
::
ToString
()
const
{
std
::
stringstream
ss
;
for
(
auto
it
=
nodes_
.
begin
();
it
!=
nodes_
.
end
();
++
it
)
{
ss
<<
DebugString
(
it
->
first
)
<<
" "
;
}
return
ss
.
str
();
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/memory_reuse_types.h
0 → 100644
浏览文件 @
5c7a8aee
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <iostream>
#include <iterator>
#include <list>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
constexpr
char
kFetchedVars
[]
=
"fetched_vars"
;
constexpr
char
kGraphNodePool
[]
=
"graph_node_pool"
;
// NOTE(dzh): Variable and the operators use the var.
// for early delete pass.
// Because analysis var pass build base on ir::Node, which maybe released
// or modified between passes, so we use OpDesc* to mark ops.
using
GraphNodePool
=
std
::
vector
<
std
::
pair
<
std
::
string
/*var node*/
,
std
::
unordered_set
<
OpDesc
*>
/* ops */
>>
;
// NOTE(dzh): by default, it sort node in ascend order(by node bytes size).
// in fluid, -1 means the batch_size is determined in runtime.
// the node batch_size equal -1 always ranking in the front than the node not.
// For example,
// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
// O(1) insert, delete
class
OrderedNodePairPool
{
public:
using
NodePair
=
std
::
pair
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
;
using
Iter
=
typename
std
::
list
<
NodePair
>::
iterator
;
using
ConstIter
=
typename
std
::
list
<
NodePair
>::
const_iterator
;
void
Insert
(
ir
::
Node
*
var
,
ir
::
Node
*
op
);
void
Erase
(
ir
::
Node
*
var
);
bool
Has
(
ir
::
Node
*
var
)
{
return
mark_table_
.
count
(
var
->
Name
());
}
ir
::
Node
*
NodeMatch
(
ir
::
Node
*
var
)
const
;
// map store non-const iterator, can not promise const
int
GetIndex
(
ir
::
Node
*
var
);
// pool all node to string
std
::
string
ToString
()
const
;
Iter
begin
()
{
return
nodes_
.
begin
();
}
Iter
end
()
{
return
nodes_
.
end
();
}
ConstIter
begin
()
const
{
return
nodes_
.
begin
();
}
ConstIter
end
()
const
{
return
nodes_
.
end
();
}
size_t
size
()
const
{
return
nodes_
.
size
();
}
private:
// for searching.
std
::
unordered_map
<
std
::
string
,
Iter
>
mark_table_
;
// node swap pairs. var -> ops dep var
std
::
list
<
NodePair
>
nodes_
;
};
// node memory size in bytes
size_t
NodeSizeInBytes
(
ir
::
Node
*
n
);
std
::
string
DebugString
(
ir
::
Node
*
var
);
// std::string DebugString(VarDesc* var);
VarDesc
*
FindVarDescInBlock
(
ir
::
Node
*
n
);
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/memory_reuse_types_test.cc
0 → 100644
浏览文件 @
5c7a8aee
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "glog/logging.h"
#include "gtest/gtest.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
TEST
(
OrderedNodePairPool
,
Normal
)
{
OrderedNodePairPool
pool
;
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>>
nodes
;
// clang-format off
std
::
vector
<
std
::
vector
<
int64_t
>>
shapes
=
{{
-
1
,
10
},
{
-
1
,
20
},
{
1
,
2
},
{
5
,
2
},
{
10
,
20
},
{
-
1
,
2
,
5
},
{
-
1
,
1
,
5
},
{
-
1
,
1
}};
// clang-format on
const
int
COUNT
=
shapes
.
size
();
ProgramDesc
prog
;
BlockDesc
*
block_desc
=
prog
.
MutableBlock
(
0
);
auto
*
op_desc
=
block_desc
->
AppendOp
();
op_desc
->
SetType
(
"dummy"
);
std
::
unique_ptr
<
ir
::
Node
>
op
=
ir
::
CreateNodeForTest
(
op_desc
);
for
(
int
i
=
0
;
i
<
COUNT
;
++
i
)
{
auto
desc
=
block_desc
->
Var
(
std
::
to_string
(
i
));
desc
->
SetShape
(
shapes
[
i
]);
std
::
unique_ptr
<
ir
::
Node
>
node
=
ir
::
CreateNodeForTest
(
desc
);
node
->
inputs
.
emplace_back
(
op
.
get
());
nodes
.
emplace_back
(
std
::
move
(
node
));
}
for
(
auto
&
node
:
nodes
)
{
pool
.
Insert
(
node
.
get
(),
op
.
get
());
}
// assert its order and interface.
std
::
cout
<<
pool
.
ToString
()
<<
std
::
endl
;
pool
.
Erase
(
nodes
.
front
().
get
());
std
::
cout
<<
pool
.
ToString
()
<<
std
::
endl
;
ASSERT_EQ
(
pool
.
size
(),
static_cast
<
size_t
>
(
COUNT
-
1
));
ASSERT_EQ
(
pool
.
GetIndex
(
nodes
.
back
().
get
()),
0
);
{
auto
v1
=
block_desc
->
Var
(
"11"
);
v1
->
SetShape
({
-
1
,
256
,
56
,
56
});
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v1
);
node1
->
inputs
.
emplace_back
(
op
.
get
());
auto
*
cache
=
pool
.
NodeMatch
(
node1
.
get
());
ASSERT_EQ
(
cache
,
nullptr
);
}
{
auto
v2
=
block_desc
->
Var
(
"12"
);
v2
->
SetShape
({
-
1
,
2
,
5
});
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v2
);
node1
->
inputs
.
emplace_back
(
op
.
get
());
auto
*
cache
=
pool
.
NodeMatch
(
node1
.
get
());
ASSERT_EQ
(
pool
.
GetIndex
(
cache
),
2
);
// match 6:[-1,2,5]
}
{
auto
v3
=
block_desc
->
Var
(
"13"
);
v3
->
SetShape
({
2
,
5
});
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v3
);
node1
->
inputs
.
emplace_back
(
op
.
get
());
auto
*
cache
=
pool
.
NodeMatch
(
node1
.
get
());
ASSERT_EQ
(
pool
.
GetIndex
(
cache
),
5
);
// match 4:[5,2]
}
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
浏览文件 @
5c7a8aee
...
...
@@ -85,4 +85,5 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
}
// namespace paddle
REGISTER_PASS
(
multi_devices_print_pass
,
paddle
::
framework
::
details
::
SSAGraghBuilderWithPrinter
);
paddle
::
framework
::
details
::
SSAGraghBuilderWithPrinter
)
.
RequirePassAttr
(
paddle
::
framework
::
details
::
kGraphvizPath
);
paddle/fluid/framework/details/multi_devices_graph_print_pass.h
浏览文件 @
5c7a8aee
...
...
@@ -14,6 +14,7 @@
#pragma once
#include <glog/logging.h>
#include <fstream>
#include <iosfwd>
#include <ostream>
...
...
@@ -24,6 +25,8 @@ namespace paddle {
namespace
framework
{
namespace
details
{
constexpr
char
kGraphvizPath
[]
=
"debug_graphviz_path"
;
class
SSAGraphPrinter
{
public:
virtual
~
SSAGraphPrinter
()
{}
...
...
@@ -40,7 +43,7 @@ class SSAGraghBuilderWithPrinter : public ir::Pass {
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
{
std
::
unique_ptr
<
std
::
ostream
>
fout
(
new
std
::
ofstream
(
Get
<
const
std
::
string
>
(
"debug_graphviz_path"
)));
new
std
::
ofstream
(
Get
<
std
::
string
>
(
kGraphvizPath
)));
PADDLE_ENFORCE
(
fout
->
good
());
Get
<
GraphvizSSAGraphPrinter
>
(
"graph_printer"
).
Print
(
*
graph
,
*
fout
);
return
graph
;
...
...
paddle/fluid/framework/details/op_handle_base.h
浏览文件 @
5c7a8aee
...
...
@@ -25,7 +25,7 @@ namespace paddle {
namespace
framework
{
namespace
details
{
constexpr
char
kLocalExecScopeName
[]
=
"@L
CO
AL_SCOPE@"
;
constexpr
char
kLocalExecScopeName
[]
=
"@L
OC
AL_SCOPE@"
;
// Wraps ir::Node and provide helper utilities.
// It's responsible for populating necessary fields of ir::Node.
...
...
paddle/fluid/framework/executor.cc
浏览文件 @
5c7a8aee
...
...
@@ -100,7 +100,7 @@ static void DeleteUnusedTensors(
continue
;
}
auto
*
var
=
scope
.
FindVar
(
name
);
if
(
var
!
=
nullptr
)
{
if
(
var
=
=
nullptr
)
{
continue
;
}
...
...
paddle/fluid/framework/ir/CMakeLists.txt
浏览文件 @
5c7a8aee
...
...
@@ -44,6 +44,7 @@ pass_library(seqconv_eltadd_relu_fuse_pass inference)
pass_library
(
is_test_pass base
)
pass_library
(
conv_elementwise_add_act_fuse_pass inference
)
pass_library
(
conv_elementwise_add2_act_fuse_pass inference
)
pass_library
(
conv_elementwise_add_fuse_pass inference
)
if
(
WITH_MKLDNN
)
pass_library
(
mkldnn_placement_pass base
)
pass_library
(
depthwise_conv_mkldnn_pass base
)
...
...
paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
0 → 100644
浏览文件 @
5c7a8aee
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
#define GET_NODES \
GET_IR_NODE(conv_op); \
GET_IR_NODE(conv_out); \
GET_IR_NODE(conv_filter); \
GET_IR_NODE(elementwise_add_op); \
GET_IR_NODE(elementwise_add_in_y); \
GET_IR_NODE(elementwise_add_out);
std
::
unique_ptr
<
ir
::
Graph
>
ConvElementwiseAddFusePass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
const
std
::
string
pattern_name
=
"conv_elementwise_add_fuse"
;
FusePassBase
::
Init
(
pattern_name
,
graph
.
get
());
GraphPatternDetector
gpd
;
auto
*
x
=
gpd
.
mutable_pattern
()
->
NewNode
(
"x"
)
->
assert_is_op_input
(
"conv2d"
,
"Input"
)
->
AsInput
();
patterns
::
ConvElementwiseadd
pattern
(
gpd
.
mutable_pattern
(),
pattern_name
);
pattern
(
x
);
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
GET_NODES
;
auto
base_op_desc
=
*
conv_op
->
Op
()
->
Proto
();
std
::
string
bias_name
=
elementwise_add_in_y
->
Name
();
std
::
string
output_name
=
elementwise_add_out
->
Name
();
std
::
string
act_type
=
"identity"
;
framework
::
OpDesc
new_op_desc
(
base_op_desc
,
nullptr
);
new_op_desc
.
SetType
(
"conv2d_fusion"
);
new_op_desc
.
SetInput
(
"Bias"
,
{
bias_name
});
new_op_desc
.
SetInput
(
"ResidualData"
,
{});
new_op_desc
.
SetAttr
(
"activation"
,
act_type
);
new_op_desc
.
SetOutput
(
"Output"
,
{
output_name
});
new_op_desc
.
SetAttr
(
"is_test"
,
true
);
new_op_desc
.
SetAttr
(
"use_cudnn"
,
false
);
new_op_desc
.
Flush
();
// Create a new node for the fused op.
auto
*
new_conv_op
=
graph
->
CreateOpNode
(
&
new_op_desc
);
// Link inputs and outputs.
PADDLE_ENFORCE
(
subgraph
.
count
(
x
));
auto
*
conv_in_node
=
subgraph
.
at
(
x
);
IR_NODE_LINK_TO
(
conv_in_node
,
new_conv_op
);
// Input
IR_NODE_LINK_TO
(
conv_filter
,
new_conv_op
);
// Filter
IR_NODE_LINK_TO
(
elementwise_add_in_y
,
new_conv_op
);
// Bias
IR_NODE_LINK_TO
(
new_conv_op
,
elementwise_add_out
);
// Output
// Delete the unneeded nodes.
GraphSafeRemoveNodes
(
graph
.
get
(),
{
conv_op
,
conv_out
,
elementwise_add_op
});
};
gpd
(
graph
.
get
(),
handler
);
return
graph
;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
REGISTER_PASS
(
conv_elementwise_add_fuse_pass
,
paddle
::
framework
::
ir
::
ConvElementwiseAddFusePass
);
paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
0 → 100644
浏览文件 @
5c7a8aee
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
class
ConvElementwiseAddFusePass
:
public
FusePassBase
{
public:
virtual
~
ConvElementwiseAddFusePass
()
{}
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
;
};
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph.cc
浏览文件 @
5c7a8aee
...
...
@@ -162,7 +162,10 @@ void Graph::ResolveHazard(
(
*
it_new
)
->
inputs
.
empty
()
?
nullptr
:
(
*
it_new
)
->
inputs
[
0
];
const
auto
&
read_ops
=
(
*
it_old
)
->
outputs
;
PADDLE_ENFORCE
(
write_op
,
"The write_op should not be empty."
);
PADDLE_ENFORCE
(
write_op
,
string
::
Sprintf
(
"The write_op of var %s should not be empty."
,
(
*
it_new
)
->
Name
()));
// Add write after write dependence
ir
::
Node
*
upstream_op
=
...
...
paddle/fluid/framework/ir/graph_helper.cc
浏览文件 @
5c7a8aee
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <fstream>
#include <iosfwd>
#include <ostream>
#include <unordered_map>
#include <unordered_set>
DEFINE_string
(
print_sub_graph_dir
,
""
,
...
...
@@ -121,7 +122,7 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
}
size_t
GraphNum
(
const
Graph
&
graph
)
{
std
::
unordered_set
<
ir
::
Node
*>
nodes
=
graph
.
Nodes
(
);
std
::
unordered_set
<
ir
::
Node
*>
nodes
(
graph
.
Nodes
()
);
std
::
unordered_set
<
ir
::
Node
*>
visited_nodes
;
visited_nodes
.
reserve
(
nodes
.
size
());
std
::
deque
<
ir
::
Node
*>
q_nodes
;
...
...
paddle/fluid/framework/ir/graph_helper.h
浏览文件 @
5c7a8aee
...
...
@@ -24,6 +24,7 @@ limitations under the License. */
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// Test if the graph contains circle.
bool
HasCircle
(
const
Graph
&
graph
);
...
...
paddle/fluid/framework/ir/graph_pattern_detector.cc
浏览文件 @
5c7a8aee
...
...
@@ -17,7 +17,6 @@
#include <string>
#include <vector>
#include "graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
...
...
@@ -1210,6 +1209,33 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
return
act_out
;
}
PDNode
*
patterns
::
ConvElementwiseadd
::
operator
()(
PDNode
*
conv_in
)
{
conv_in
->
AsInput
();
auto
conv_op
=
pattern
->
NewNode
(
conv_op_repr
())
->
assert_is_op
(
"conv2d"
);
auto
conv_out
=
pattern
->
NewNode
(
conv_out_repr
())
->
assert_is_op_output
(
"conv2d"
)
->
assert_is_op_input
(
"elementwise_add"
,
"X"
)
->
AsIntermediate
();
auto
conv_filter
=
pattern
->
NewNode
(
conv_filter_repr
())
->
assert_is_op_input
(
"conv2d"
,
"Filter"
)
->
AsInput
();
auto
elementwise_add_op
=
pattern
->
NewNode
(
elementwise_add_op_repr
())
->
assert_is_op
(
"elementwise_add"
);
auto
elementwise_add_in_y
=
pattern
->
NewNode
(
elementwise_add_in_y_repr
())
->
assert_is_op_input
(
"elementwise_add"
,
"Y"
)
->
AsInput
();
auto
elementwise_add_out
=
pattern
->
NewNode
(
elementwise_add_out_repr
())
->
assert_is_op_output
(
"elementwise_add"
)
->
AsOutput
();
conv_op
->
LinksFrom
({
conv_in
,
conv_filter
});
conv_out
->
LinksFrom
({
conv_op
});
elementwise_add_op
->
LinksFrom
({
conv_out
,
elementwise_add_in_y
})
.
LinksTo
({
elementwise_add_out
});
return
elementwise_add_out
;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph_pattern_detector.h
浏览文件 @
5c7a8aee
...
...
@@ -716,6 +716,24 @@ struct ConvElementwiseadd2Act : public PatternBase {
PATTERN_DECL_NODE
(
act_out
);
};
// Conv + ElementwiseAdd
// This pattern should be used after ConvElementwiseadd2Act or
// ConvElementwiseadd pass
struct
ConvElementwiseadd
:
public
PatternBase
{
ConvElementwiseadd
(
PDPattern
*
pattern
,
const
std
::
string
&
name_scope
)
:
PatternBase
(
pattern
,
name_scope
,
"conv_elementwiseadd"
)
{}
PDNode
*
operator
()(
PDNode
*
conv_in
);
PATTERN_DECL_NODE
(
conv_op
);
PATTERN_DECL_NODE
(
conv_out
);
PATTERN_DECL_NODE
(
conv_filter
);
PATTERN_DECL_NODE
(
elementwise_add_op
);
PATTERN_DECL_NODE
(
elementwise_add_in_y
);
PATTERN_DECL_NODE
(
elementwise_add_out
);
};
}
// namespace patterns
// Link two ir::Nodes from each other.
...
...
paddle/fluid/framework/ir/node.cc
浏览文件 @
5c7a8aee
...
...
@@ -30,6 +30,14 @@ std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
return
std
::
unique_ptr
<
Node
>
(
new
Node
(
name
,
type
));
}
std
::
unique_ptr
<
Node
>
CreateNodeForTest
(
VarDesc
*
var_desc
)
{
return
std
::
unique_ptr
<
Node
>
(
new
Node
(
var_desc
));
}
std
::
unique_ptr
<
Node
>
CreateNodeForTest
(
OpDesc
*
op_desc
)
{
return
std
::
unique_ptr
<
Node
>
(
new
Node
(
op_desc
));
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/node.h
浏览文件 @
5c7a8aee
...
...
@@ -18,7 +18,6 @@ limitations under the License. */
#include <typeindex>
#include <typeinfo>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/macros.h"
...
...
@@ -125,6 +124,8 @@ class Node {
friend
class
Graph
;
friend
std
::
unique_ptr
<
Node
>
CreateNodeForTest
(
const
std
::
string
&
name
,
Node
::
Type
type
);
friend
std
::
unique_ptr
<
Node
>
CreateNodeForTest
(
VarDesc
*
var_desc
);
friend
std
::
unique_ptr
<
Node
>
CreateNodeForTest
(
OpDesc
*
op_desc
);
explicit
Node
(
const
std
::
string
&
name
,
Type
type
)
:
name_
(
name
),
var_desc_
(
nullptr
),
op_desc_
(
nullptr
),
type_
(
type
)
{}
...
...
@@ -152,7 +153,9 @@ class Node {
std
::
unique_ptr
<
Node
>
CreateNodeForTest
(
const
std
::
string
&
name
,
Node
::
Type
type
);
std
::
unique_ptr
<
Node
>
CreateNodeForTest
(
VarDesc
*
var_desc
);
std
::
unique_ptr
<
Node
>
CreateNodeForTest
(
OpDesc
*
op_desc
);
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ngraph_operator.cc
浏览文件 @
5c7a8aee
...
...
@@ -278,7 +278,8 @@ std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
ngraph
::
runtime
::
Backend
::
create
(
"CPU"
);
void
NgraphEngine
::
GetNgInputShape
(
std
::
shared_ptr
<
OperatorBase
>
op
)
{
op
->
RuntimeInferShape
(
scope_
,
place_
);
RuntimeContext
ctx
(
op
->
Inputs
(),
op
->
Outputs
(),
scope_
);
op
->
RuntimeInferShape
(
scope_
,
place_
,
ctx
);
for
(
auto
&
var_name_item
:
op
->
Inputs
())
{
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
auto
*
var
=
scope_
.
FindVar
(
var_name
);
...
...
paddle/fluid/framework/operator.cc
浏览文件 @
5c7a8aee
...
...
@@ -137,6 +137,23 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
}
RuntimeContext
::
RuntimeContext
(
const
VariableNameMap
&
innames
,
const
VariableNameMap
&
outnames
,
const
Scope
&
scope
)
{
for
(
auto
&
var_name_item
:
innames
)
{
std
::
vector
<
Variable
*>&
input_vars
=
inputs
[
var_name_item
.
first
];
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
input_vars
.
push_back
(
scope
.
FindVar
(
var_name
));
}
}
for
(
auto
&
var_name_item
:
outnames
)
{
std
::
vector
<
Variable
*>&
output_vars
=
outputs
[
var_name_item
.
first
];
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
output_vars
.
push_back
(
scope
.
FindVar
(
var_name
));
}
}
}
void
OperatorBase
::
Run
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
{
VLOG
(
4
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
if
(
platform
::
is_gpu_place
(
place
))
{
...
...
@@ -412,11 +429,48 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
return
var
!=
nullptr
;
}
const
Variable
*
ExecutionContext
::
InputVar
(
const
std
::
string
&
name
)
const
{
auto
it
=
ctx_
.
inputs
.
find
(
name
);
if
(
it
==
ctx_
.
inputs
.
end
())
return
nullptr
;
PADDLE_ENFORCE_LE
(
it
->
second
.
size
(),
1UL
,
"Operator %s's input %s should contain only one variable."
,
op_
.
Type
(),
name
);
return
it
->
second
.
empty
()
?
nullptr
:
it
->
second
[
0
];
}
const
Variable
*
ExecutionContext
::
LegacyInputVar
(
const
std
::
string
&
name
)
const
{
auto
ipt
=
op_
.
Input
(
name
);
return
ipt
==
kEmptyVarName
?
nullptr
:
scope_
.
FindVar
(
ipt
);
}
Variable
*
ExecutionContext
::
OutputVar
(
const
std
::
string
&
name
)
const
{
auto
it
=
ctx_
.
outputs
.
find
(
name
);
if
(
it
==
ctx_
.
outputs
.
end
())
return
nullptr
;
PADDLE_ENFORCE_LE
(
it
->
second
.
size
(),
1UL
,
"Operator %s's output %s should contain only one variable."
,
op_
.
Type
(),
name
);
return
it
->
second
.
empty
()
?
nullptr
:
it
->
second
[
0
];
}
Variable
*
ExecutionContext
::
LegacyOutputVar
(
const
std
::
string
&
name
)
const
{
auto
opt
=
op_
.
Output
(
name
);
return
opt
==
kEmptyVarName
?
nullptr
:
scope_
.
FindVar
(
opt
);
}
template
<
>
const
Tensor
*
ExecutionContext
::
Input
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
return
Input
<
LoDTensor
>
(
name
);
}
template
<
>
const
Tensor
*
ExecutionContext
::
LegacyInput
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
return
LegacyInput
<
LoDTensor
>
(
name
);
}
template
<
>
const
std
::
vector
<
const
Tensor
*>
ExecutionContext
::
MultiInput
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
...
...
@@ -441,6 +495,11 @@ Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
return
Output
<
LoDTensor
>
(
name
);
}
template
<
>
Tensor
*
ExecutionContext
::
LegacyOutput
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
return
LegacyOutput
<
LoDTensor
>
(
name
);
}
template
<
>
std
::
vector
<
Tensor
*>
ExecutionContext
::
MultiOutput
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
...
...
@@ -477,23 +536,22 @@ bool OpSupportGPU(const std::string& op_type) {
class
RuntimeInferShapeContext
:
public
InferShapeContext
{
public:
RuntimeInferShapeContext
(
const
OperatorBase
&
op
,
const
Scope
&
scope
)
:
op_
(
op
),
scope_
(
scope
)
{}
RuntimeInferShapeContext
(
const
OperatorBase
&
op
,
const
Scope
&
scope
,
const
RuntimeContext
&
ctx
)
:
op_
(
op
),
scope_
(
scope
),
ctx_
(
ctx
)
{}
bool
HasInput
(
const
std
::
string
&
name
)
const
override
{
// has only one input
const
auto
&
ins
=
op_
.
Inputs
()
;
const
auto
&
ins
=
ctx_
.
inputs
;
auto
it
=
ins
.
find
(
name
);
if
(
it
==
ins
.
end
())
{
return
false
;
}
const
auto
&
in
=
it
->
second
;
if
(
in
.
size
()
==
0
||
in
[
0
]
==
kEmptyVarName
)
{
return
false
;
}
if
(
in
.
size
()
==
0
)
return
false
;
PADDLE_ENFORCE_EQ
(
in
.
size
(),
1UL
,
"Input %s should not have more than one inputs"
,
name
);
return
scope_
.
FindVar
(
in
[
0
])
!=
nullptr
;
return
in
[
0
]
!=
nullptr
;
}
bool
HasOutput
(
const
std
::
string
&
name
)
const
override
{
...
...
@@ -678,6 +736,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
private:
const
OperatorBase
&
op_
;
const
Scope
&
scope_
;
const
RuntimeContext
&
ctx_
;
};
static
void
CheckTensorNANOrInf
(
const
std
::
string
&
name
,
...
...
@@ -696,15 +755,15 @@ static void CheckTensorNANOrInf(const std::string& name,
}
void
OperatorWithKernel
::
RuntimeInferShape
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
{
RuntimeInferShapeContext
infer_shape_ctx
(
*
this
,
scope
);
const
platform
::
Place
&
place
,
const
RuntimeContext
&
ctx
)
const
{
RuntimeInferShapeContext
infer_shape_ctx
(
*
this
,
scope
,
ctx
);
this
->
InferShape
(
&
infer_shape_ctx
);
}
void
OperatorWithKernel
::
RunImpl
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
{
RuntimeInferShapeContext
infer_shape_ctx
(
*
this
,
scope
);
this
->
InferShape
(
&
infer_shape_ctx
);
RuntimeContext
ctx
(
Inputs
(),
Outputs
(),
scope
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
pool
.
Get
(
place
);
...
...
@@ -718,15 +777,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
OpKernelMap
&
kernels
=
kernels_iter
->
second
;
// TODO(dzhwinter) : kernel fallback mechanism will be added when all the
// transform functions are ready.
// for (auto& candidate : kKernelPriority) {
// Do selection
// }
auto
expected_kernel_key
=
this
->
GetExpectedKernelType
(
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
));
auto
expected_kernel_key
=
this
->
GetExpectedKernelType
(
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
,
ctx
));
VLOG
(
3
)
<<
"expected_kernel_key:"
<<
expected_kernel_key
;
auto
kernel_iter
=
kernels
.
find
(
expected_kernel_key
);
...
...
@@ -748,7 +800,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// do data transformScope &transfer_scope;
std
::
vector
<
std
::
string
>
transfered_inplace_vars
;
auto
*
transfer_scope
=
TryTransferData
(
scope
,
expected_kernel_key
,
&
transfered_inplace_vars
);
PrepareData
(
scope
,
expected_kernel_key
,
&
transfered_inplace_vars
,
&
ctx
);
// exec scope is the scope that kernel actually executed on.
const
Scope
&
exec_scope
=
...
...
@@ -758,7 +810,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
dev_ctx
=
pool
.
Get
(
expected_kernel_key
.
place_
);
}
kernel_iter
->
second
(
ExecutionContext
(
*
this
,
exec_scope
,
*
dev_ctx
));
RuntimeInferShapeContext
infer_shape_ctx
(
*
this
,
exec_scope
,
ctx
);
this
->
InferShape
(
&
infer_shape_ctx
);
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs.
kernel_iter
->
second
(
ExecutionContext
(
*
this
,
exec_scope
,
*
dev_ctx
,
ctx
));
if
(
!
transfered_inplace_vars
.
empty
())
{
// there is inplace variable has been transfered.
...
...
@@ -782,6 +838,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
}
}
}
void
OperatorWithKernel
::
TransferInplaceVarsBack
(
const
Scope
&
scope
,
const
std
::
vector
<
std
::
string
>&
inplace_vars
,
const
Scope
&
transfer_scope
)
const
{
...
...
@@ -797,13 +854,19 @@ void OperatorWithKernel::TransferInplaceVarsBack(
}
}
Scope
*
OperatorWithKernel
::
TryTransfer
Data
(
Scope
*
OperatorWithKernel
::
Prepare
Data
(
const
Scope
&
scope
,
const
OpKernelType
&
expected_kernel_key
,
std
::
vector
<
std
::
string
>*
transfered_inplace_vars
)
const
{
std
::
vector
<
std
::
string
>*
transfered_inplace_vars
,
RuntimeContext
*
ctx
)
const
{
Scope
*
new_scope
=
nullptr
;
for
(
auto
&
var_name_item
:
Inputs
())
{
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
std
::
vector
<
Variable
*>&
input_vars
=
ctx
->
inputs
[
var_name_item
.
first
];
for
(
size_t
i
=
0
;
i
<
var_name_item
.
second
.
size
();
++
i
)
{
auto
&
var_name
=
var_name_item
.
second
[
i
];
auto
*
var
=
scope
.
FindVar
(
var_name
);
input_vars
[
i
]
=
var
;
// Only tensor can be tranfer to another device.
if
(
var
==
nullptr
||
!
VarIsTensor
(
*
var
))
{
continue
;
...
...
@@ -851,6 +914,7 @@ Scope* OperatorWithKernel::TryTransferData(
}
auto
*
trans_var
=
new_scope
->
Var
(
var_name
);
input_vars
[
i
]
=
trans_var
;
Tensor
out
;
TransformData
(
expected_kernel_key
,
kernel_type_for_var
,
*
tensor_in
,
&
out
);
...
...
paddle/fluid/framework/operator.h
浏览文件 @
5c7a8aee
...
...
@@ -70,6 +70,15 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
class
OperatorBase
;
class
ExecutionContext
;
class
RuntimeContext
{
public:
RuntimeContext
(
const
VariableNameMap
&
innames
,
const
VariableNameMap
&
outnames
,
const
Scope
&
scope
);
VariableValueMap
inputs
;
VariableValueMap
outputs
;
};
/**
* OperatorBase has the basic elements that Net will call to do computation.
* Only CreateOperator from OpRegistry will new Operator directly. User
...
...
@@ -129,7 +138,8 @@ class OperatorBase {
void
SetIsCalledByExecutor
(
bool
x
)
{
run_by_executor_
=
x
;
}
virtual
void
RuntimeInferShape
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
{}
const
platform
::
Place
&
place
,
const
RuntimeContext
&
ctx
)
const
{}
protected:
std
::
string
type_
;
...
...
@@ -156,8 +166,9 @@ class OperatorBase {
class
ExecutionContext
{
public:
ExecutionContext
(
const
OperatorBase
&
op
,
const
Scope
&
scope
,
const
platform
::
DeviceContext
&
device_context
)
:
op_
(
op
),
scope_
(
scope
),
device_context_
(
device_context
)
{}
const
platform
::
DeviceContext
&
device_context
,
const
RuntimeContext
&
ctx
)
:
op_
(
op
),
scope_
(
scope
),
device_context_
(
device_context
),
ctx_
(
ctx
)
{}
const
OperatorBase
&
op
()
const
{
return
op_
;
}
...
...
@@ -180,15 +191,9 @@ class ExecutionContext {
return
op_
.
Outputs
(
name
).
size
();
}
const
Variable
*
InputVar
(
const
std
::
string
&
name
)
const
{
auto
ipt
=
op_
.
Input
(
name
);
return
ipt
==
kEmptyVarName
?
nullptr
:
scope_
.
FindVar
(
ipt
);
}
const
Variable
*
InputVar
(
const
std
::
string
&
name
)
const
;
Variable
*
OutputVar
(
const
std
::
string
&
name
)
const
{
auto
opt
=
op_
.
Output
(
name
);
return
opt
==
kEmptyVarName
?
nullptr
:
scope_
.
FindVar
(
opt
);
}
Variable
*
OutputVar
(
const
std
::
string
&
name
)
const
;
const
std
::
vector
<
const
Variable
*>
MultiInputVar
(
const
std
::
string
&
name
)
const
{
...
...
@@ -227,6 +232,22 @@ class ExecutionContext {
return
var
==
nullptr
?
nullptr
:
var
->
GetMutable
<
T
>
();
}
template
<
typename
T
>
const
T
*
LegacyInput
(
const
std
::
string
&
name
)
const
{
auto
*
var
=
LegacyInputVar
(
name
);
return
var
==
nullptr
?
nullptr
:
&
var
->
Get
<
T
>
();
}
template
<
typename
T
>
T
*
LegacyOutput
(
const
std
::
string
&
name
)
const
{
auto
var
=
LegacyOutputVar
(
name
);
return
var
==
nullptr
?
nullptr
:
var
->
GetMutable
<
T
>
();
}
const
Variable
*
LegacyInputVar
(
const
std
::
string
&
name
)
const
;
Variable
*
LegacyOutputVar
(
const
std
::
string
&
name
)
const
;
template
<
typename
T
>
const
std
::
vector
<
const
T
*>
MultiInput
(
const
std
::
string
&
name
)
const
{
auto
names
=
op_
.
Inputs
(
name
);
...
...
@@ -286,11 +307,16 @@ class ExecutionContext {
const
OperatorBase
&
op_
;
const
Scope
&
scope_
;
const
platform
::
DeviceContext
&
device_context_
;
const
RuntimeContext
&
ctx_
;
};
template
<>
const
Tensor
*
ExecutionContext
::
Input
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
const
Tensor
*
ExecutionContext
::
LegacyInput
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
const
std
::
vector
<
const
Tensor
*>
ExecutionContext
::
MultiInput
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
...
...
@@ -298,6 +324,9 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
template
<>
Tensor
*
ExecutionContext
::
Output
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
Tensor
*
ExecutionContext
::
LegacyOutput
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
std
::
vector
<
Tensor
*>
ExecutionContext
::
MultiOutput
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
...
...
@@ -350,8 +379,8 @@ class OperatorWithKernel : public OperatorBase {
OpInfoMap
::
Instance
().
Get
(
Type
()).
infer_shape_
(
ctx
);
}
void
RuntimeInferShape
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
override
;
void
RuntimeInferShape
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
,
const
RuntimeContext
&
ctx
)
const
override
;
protected:
virtual
OpKernelType
GetExpectedKernelType
(
const
ExecutionContext
&
ctx
)
const
;
...
...
@@ -371,9 +400,10 @@ class OperatorWithKernel : public OperatorBase {
*
* * transfered_inplace_vars is a output vector.
*/
Scope
*
TryTransferData
(
const
Scope
&
scope
,
const
OpKernelType
&
expected_kernel_key
,
std
::
vector
<
std
::
string
>*
transfered_inplace_vars
)
const
;
Scope
*
PrepareData
(
const
Scope
&
scope
,
const
OpKernelType
&
expected_kernel_key
,
std
::
vector
<
std
::
string
>*
transfered_inplace_vars
,
RuntimeContext
*
ctx
)
const
;
void
TransferInplaceVarsBack
(
const
Scope
&
scope
,
const
std
::
vector
<
std
::
string
>&
inplace_vars
,
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
5c7a8aee
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h"
#include <algorithm>
#include <string>
#include <tuple>
#include <vector>
...
...
@@ -93,6 +94,7 @@ class ParallelExecutorPrivate {
}
}
BuildStrategy
build_strategy_
;
std
::
vector
<
platform
::
Place
>
places_
;
std
::
vector
<
Scope
*>
local_scopes_
;
Scope
*
global_scope_
;
// not owned
...
...
@@ -169,6 +171,14 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
eager_deletion_pass
->
SetNotOwned
(
details
::
kAllPlaces
,
&
places_
);
graph
=
eager_deletion_pass
->
Apply
(
std
::
move
(
graph
));
VLOG
(
10
)
<<
"EagerDeletionPass Applied"
;
if
(
build_strategy_
.
memory_early_delete_
)
{
auto
early_delete_pass
=
ir
::
PassRegistry
::
Instance
().
Get
(
"memory_early_delete_pass"
);
early_delete_pass
->
SetNotOwned
(
details
::
kGarbageCollector
,
&
gcs_
);
graph
=
early_delete_pass
->
Apply
(
std
::
move
(
graph
));
}
VLOG
(
10
)
<<
"MemoryEarlyDeletePass Applied."
;
}
return
graph
;
...
...
@@ -189,6 +199,7 @@ ParallelExecutor::ParallelExecutor(
:
member_
(
new
ParallelExecutorPrivate
(
places
))
{
member_
->
global_scope_
=
scope
;
member_
->
use_cuda_
=
exec_strategy
.
use_cuda_
;
member_
->
build_strategy_
=
build_strategy
;
member_
->
use_all_reduce_
=
build_strategy
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kAllReduce
;
...
...
@@ -245,7 +256,6 @@ ParallelExecutor::ParallelExecutor(
build_strategy
.
Apply
(
main_program
,
member_
->
places_
,
loss_var_name
,
params
,
member_
->
local_scopes_
,
member_
->
use_cuda_
);
#endif
auto
max_memory_size
=
GetEagerDeletionThreshold
();
if
(
max_memory_size
>=
0
)
{
graph
=
member_
->
PrepareGCAndRefCnts
(
std
::
move
(
graph
),
...
...
@@ -280,10 +290,12 @@ ParallelExecutor::ParallelExecutor(
if
(
exec_strategy
.
type_
==
ExecutionStrategy
::
kDefault
)
{
member_
->
executor_
.
reset
(
new
details
::
ThreadedSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
places
,
std
::
move
(
graph
)));
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
std
::
move
(
graph
)));
}
else
{
member_
->
executor_
.
reset
(
new
details
::
FastThreadedSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
places
,
std
::
move
(
graph
)));
exec_strategy
,
member_
->
local_scopes_
,
member_
->
places_
,
std
::
move
(
graph
)));
}
member_
->
executor_
.
reset
(
new
details
::
ScopeBufferedSSAGraphExecutor
(
...
...
@@ -423,5 +435,6 @@ ParallelExecutor::~ParallelExecutor() {
}
// namespace framework
}
// namespace paddle
USE_PASS
(
memory_early_delete_pass
);
USE_PASS
(
reference_count_pass
);
USE_PASS
(
eager_deletion_pass
);
paddle/fluid/framework/tensor_test.cc
浏览文件 @
5c7a8aee
...
...
@@ -74,6 +74,22 @@ TEST(Tensor, MutableData) {
p2
=
src_tensor
.
mutable_data
<
float
>
(
framework
::
make_ddim
({
2
,
2
}),
platform
::
CPUPlace
());
EXPECT_EQ
(
p1
,
p2
);
float
*
p3
=
nullptr
;
float
*
p4
=
nullptr
;
// set src_tensor a different type but smaller size.
// memory block is supposed to be unchanged.
auto
*
tmp
=
src_tensor
.
mutable_data
<
uint8_t
>
(
framework
::
make_ddim
({
2
,
2
}),
platform
::
CPUPlace
());
p3
=
reinterpret_cast
<
float
*>
(
tmp
);
EXPECT_EQ
(
p1
,
p3
);
// set src_tensor a different type but bigger size.
// memory block is supposed to be changed.
auto
*
tmp2
=
src_tensor
.
mutable_data
<
double
>
(
framework
::
make_ddim
({
2
,
2
,
3
}),
platform
::
CPUPlace
());
p4
=
reinterpret_cast
<
float
*>
(
tmp2
);
EXPECT_NE
(
p1
,
p4
);
}
// Not sure if it's desired, but currently, Tensor type can be changed.
{
...
...
paddle/fluid/framework/type_defs.h
浏览文件 @
5c7a8aee
...
...
@@ -28,8 +28,11 @@ class OperatorBase;
class
OpDesc
;
class
InferShapeContext
;
class
BlockDesc
;
class
Variable
;
using
VariableNameMap
=
std
::
map
<
std
::
string
,
std
::
vector
<
std
::
string
>>
;
// TODO(panyx0718): Replace vector with something like gtl::Vector.
using
VariableValueMap
=
std
::
map
<
std
::
string
,
std
::
vector
<
Variable
*>>
;
// The order should be as same as framework.proto
using
Attribute
=
...
...
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
浏览文件 @
5c7a8aee
...
...
@@ -63,7 +63,6 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
void
TensorRtSubgraphPass
::
CreateTensorRTOp
(
framework
::
ir
::
Node
*
node
,
Graph
*
graph
)
const
{
auto
*
op_desc
=
node
->
Op
();
static
int
counter
{
0
};
auto
&
subgraph
=
*
Agent
(
node
).
subgraph
();
PADDLE_ENFORCE
(
!
subgraph
.
empty
());
...
...
@@ -192,8 +191,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
block_desc
.
Proto
()
->
SerializeAsString
());
SetAttr
(
op_desc
->
Proto
(),
"max_batch_size"
,
Get
<
int
>
(
"max_batch_size"
));
SetAttr
(
op_desc
->
Proto
(),
"workspace_size"
,
Get
<
int
>
(
"workspace_size"
));
SetAttr
(
op_desc
->
Proto
(),
"engine_uniq_key"
,
"trt-"
+
std
::
to_string
(
counter
++
));
SetAttr
(
op_desc
->
Proto
(),
"parameters"
,
ExtractParameters
(
graph
->
Nodes
()));
SetAttr
(
op_desc
->
Proto
(),
"output_name_mapping"
,
output_mapping
);
}
...
...
paddle/fluid/inference/api/paddle_pass_builder.h
浏览文件 @
5c7a8aee
...
...
@@ -122,6 +122,7 @@ class GpuPassStrategy : public PassStrategy {
"conv_bn_fuse_pass"
,
//
"conv_elementwise_add_act_fuse_pass"
,
//
"conv_elementwise_add2_act_fuse_pass"
,
//
"conv_elementwise_add_fuse_pass"
,
//
});
}
...
...
paddle/fluid/inference/tensorrt/convert/op_converter.h
浏览文件 @
5c7a8aee
...
...
@@ -103,6 +103,7 @@ class OpConverter {
void
ConvertBlock
(
const
framework
::
proto
::
BlockDesc
&
block
,
const
std
::
unordered_set
<
std
::
string
>&
parameters
,
const
framework
::
Scope
&
scope
,
TensorRTEngine
*
engine
)
{
std
::
unique_lock
<
std
::
mutex
>
lk
(
mut_
);
for
(
int
i
=
0
;
i
<
block
.
ops_size
();
i
++
)
{
const
auto
&
op
=
block
.
ops
(
i
);
ConvertOp
(
op
,
parameters
,
scope
,
engine
);
...
...
@@ -125,6 +126,7 @@ class OpConverter {
std
::
unordered_map
<
std
::
string
,
OpConverter
*>
converters_
;
// fluid inference scope
framework
::
Scope
*
scope_
{
nullptr
};
std
::
mutex
mut_
;
};
}
// namespace tensorrt
...
...
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
5c7a8aee
...
...
@@ -30,6 +30,13 @@ function(inference_analysis_api_test_with_fake_data target install_dir filename
ARGS --infer_model=
${
install_dir
}
/model
)
endfunction
()
function
(
inference_analysis_api_test_with_refer_result target install_dir filename
)
inference_analysis_test
(
${
target
}
SRCS
${
filename
}
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
install_dir
}
/model --infer_data=
${
install_dir
}
/data.txt
--refer_result=
${
install_dir
}
/result.txt
)
endfunction
()
# RNN1
if
(
NOT APPLE AND WITH_MKLML
)
set
(
RNN1_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/rnn1"
)
...
...
@@ -83,14 +90,21 @@ set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if
(
NOT EXISTS
${
OCR_INSTALL_DIR
}
)
inference_download_and_uncompress
(
${
OCR_INSTALL_DIR
}
"http://paddlemodels.cdn.bcebos.com/"
"inference-vis-demos%2Focr.tar.gz"
)
endif
()
inference_analysis_api_test
(
test_analyzer_ocr
${
OCR_INSTALL_DIR
}
analyzer_vis_tester.cc
)
inference_analysis_api_test_with_refer_result
(
test_analyzer_ocr
${
OCR_INSTALL_DIR
}
analyzer_vis_tester.cc
)
# mobilenet with transpose op
set
(
MOBILENET_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/mobilenet"
)
if
(
NOT EXISTS
${
MOBILENET_INSTALL_DIR
}
)
inference_download_and_uncompress
(
${
MOBILENET_INSTALL_DIR
}
"http://paddlemodels.cdn.bcebos.com/"
"inference-vis-demos%2Fmobilenet.tar.gz"
)
endif
()
inference_analysis_api_test_with_refer_result
(
test_analyzer_mobilenet_transpose
${
MOBILENET_INSTALL_DIR
}
analyzer_vis_tester.cc
)
# resnet50
inference_analysis_api_test_with_fake_data
(
test_analyzer_resnet50
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/resnet50"
analyzer_resnet50_tester.cc
"resnet50_model.tar.gz"
)
# mobilenet with depthwise_conv op
inference_analysis_api_test_with_fake_data
(
test_analyzer_mobilenet
inference_analysis_api_test_with_fake_data
(
test_analyzer_mobilenet
_depthwise_conv
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/mobilenet_depthwise_conv"
analyzer_resnet50_tester.cc
"mobilenet_model.tar.gz"
)
# anakin
...
...
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
浏览文件 @
5c7a8aee
...
...
@@ -93,18 +93,20 @@ void profile(bool use_mkldnn = false) {
SetInput
(
&
input_slots_all
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
if
(
FLAGS_num_threads
==
1
&&
!
FLAGS_test_all_data
)
{
const
float
ocr_result_data
[]
=
{
5.273636460856323538e-08
,
3.296741795111302054e-07
,
1.873261190610264748e-08
,
3.403730275408634043e-08
,
3.383312474625199684e-08
};
PADDLE_ENFORCE_EQ
(
outputs
.
size
(),
1UL
);
size_t
size
=
GetSize
(
outputs
[
0
]);
PADDLE_ENFORCE_GT
(
size
,
0
);
float
*
result
=
static_cast
<
float
*>
(
outputs
[
0
].
data
.
data
());
for
(
size_t
i
=
0
;
i
<
std
::
min
(
5UL
,
size
);
i
++
)
{
EXPECT_NEAR
(
result
[
i
],
ocr_result_data
[
i
],
1e-3
);
std
::
string
line
;
std
::
ifstream
file
(
FLAGS_refer_result
);
std
::
getline
(
file
,
line
);
auto
refer
=
ProcessALine
(
line
);
file
.
close
();
auto
&
output
=
outputs
.
front
();
size_t
numel
=
output
.
data
.
length
()
/
PaddleDtypeSize
(
output
.
dtype
);
CHECK_EQ
(
numel
,
refer
.
data
.
size
());
for
(
size_t
i
=
0
;
i
<
numel
;
++
i
)
{
CHECK_LT
(
fabs
(
static_cast
<
float
*>
(
output
.
data
.
data
())[
i
]
-
refer
.
data
[
i
]),
1e-5
);
}
}
}
...
...
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
5c7a8aee
...
...
@@ -36,6 +36,7 @@
DEFINE_string
(
model_name
,
""
,
"model name"
);
DEFINE_string
(
infer_model
,
""
,
"model path"
);
DEFINE_string
(
infer_data
,
""
,
"data file"
);
DEFINE_string
(
refer_result
,
""
,
"reference result for comparison"
);
DEFINE_int32
(
batch_size
,
1
,
"batch size."
);
DEFINE_int32
(
repeat
,
1
,
"Running the inference program repeat times."
);
DEFINE_bool
(
test_all_data
,
false
,
"Test the all dataset in data file."
);
...
...
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
5c7a8aee
...
...
@@ -64,9 +64,7 @@ endif()
set
(
COMMON_OP_DEPS
${
OP_HEADER_DEPS
}
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor
)
if
(
NOT WIN32
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
dynload_warpctc
)
endif
()
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
dynload_warpctc
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions
)
if
(
WITH_GPU
)
...
...
paddle/fluid/operators/beam_search_decode_op.cc
浏览文件 @
5c7a8aee
...
...
@@ -122,7 +122,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
framework
::
ExecutionContext
ctx
(
*
this
,
scope
,
dev_ctx
);
framework
::
RuntimeContext
run_ctx
(
Inputs
(),
Outputs
(),
scope
);
framework
::
ExecutionContext
ctx
(
*
this
,
scope
,
dev_ctx
,
run_ctx
);
const
LoDTensorArray
*
ids
=
ctx
.
Input
<
LoDTensorArray
>
(
"Ids"
);
const
LoDTensorArray
*
scores
=
ctx
.
Input
<
LoDTensorArray
>
(
"Scores"
);
...
...
paddle/fluid/operators/controlflow/parallel_do_op.cc
已删除
100644 → 0
浏览文件 @
b3cf476d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
namespace
paddle
{
namespace
operators
{
static
constexpr
char
kInputs
[]
=
"inputs"
;
static
constexpr
char
kParameters
[]
=
"parameters"
;
static
constexpr
char
kPlaces
[]
=
"places"
;
static
constexpr
char
kOutputs
[]
=
"outputs"
;
static
constexpr
char
kParallelScopes
[]
=
"parallel_scopes"
;
static
constexpr
char
kParallelBlock
[]
=
"sub_block"
;
static
constexpr
char
kUseNCCL
[]
=
"use_nccl"
;
using
LoDTensor
=
framework
::
LoDTensor
;
using
SelectedRows
=
framework
::
SelectedRows
;
static
void
SplitTensorAndMoveTensorToScopes
(
const
framework
::
Scope
&
scope
,
std
::
vector
<
framework
::
Scope
*>
*
sub_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
std
::
string
>
&
names
)
{
size_t
num_sub_scopes
=
0
;
for
(
auto
&
argu
:
names
)
{
const
auto
&
tensor
=
detail
::
Ref
(
scope
.
FindVar
(
argu
),
"Cannot find variable %s in the parent scope"
,
argu
)
.
Get
<
LoDTensor
>
();
auto
lod_tensors
=
tensor
.
SplitLoDTensor
(
places
);
for
(
auto
&
lod
:
lod_tensors
)
{
VLOG
(
3
)
<<
lod
.
dims
();
}
if
(
num_sub_scopes
==
0
)
{
num_sub_scopes
=
lod_tensors
.
size
();
}
else
{
PADDLE_ENFORCE_EQ
(
num_sub_scopes
,
lod_tensors
.
size
());
}
PADDLE_ENFORCE_NE
(
num_sub_scopes
,
0
);
if
(
sub_scopes
->
size
()
==
0
)
{
sub_scopes
->
reserve
(
num_sub_scopes
);
for
(
size_t
i
=
0
;
i
<
num_sub_scopes
;
++
i
)
{
sub_scopes
->
emplace_back
(
&
scope
.
NewScope
());
}
}
for
(
size_t
i
=
0
;
i
<
lod_tensors
.
size
();
++
i
)
{
*
detail
::
Ref
(
sub_scopes
->
at
(
i
)
->
Var
(
argu
),
"Cannot find variable in the sub-scope"
,
argu
)
.
GetMutable
<
LoDTensor
>
()
=
lod_tensors
[
i
];
}
}
}
inline
void
CopyOrShare
(
const
framework
::
Variable
&
src
,
const
platform
::
Place
&
dst_place
,
framework
::
Variable
*
dst
)
{
if
(
src
.
IsType
<
LoDTensor
>
())
{
if
(
src
.
Get
<
LoDTensor
>
().
place
()
==
dst_place
)
{
dst
->
GetMutable
<
LoDTensor
>
()
->
ShareDataWith
(
src
.
Get
<
LoDTensor
>
());
dst
->
GetMutable
<
LoDTensor
>
()
->
set_lod
(
src
.
Get
<
LoDTensor
>
().
lod
());
}
else
{
TensorCopy
(
src
.
Get
<
LoDTensor
>
(),
dst_place
,
dst
->
GetMutable
<
LoDTensor
>
());
}
}
else
if
(
src
.
IsType
<
SelectedRows
>
())
{
auto
&
src_sr
=
src
.
Get
<
SelectedRows
>
();
auto
*
dst_sr
=
dst
->
GetMutable
<
SelectedRows
>
();
dst_sr
->
set_height
(
src_sr
.
height
());
if
(
src_sr
.
value
().
place
()
==
dst_place
)
{
dst_sr
->
mutable_value
()
->
ShareDataWith
(
src_sr
.
value
());
dst_sr
->
set_rows
(
src_sr
.
rows
());
}
else
{
TensorCopy
(
src_sr
.
value
(),
dst_place
,
dst_sr
->
mutable_value
());
}
}
else
{
PADDLE_THROW
(
"Expect LoDTensor/SelectedRows, get %s"
,
src
.
Type
().
name
());
}
}
void
WaitOnPlace
(
const
platform
::
Place
place
)
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
dev_ctx
.
Wait
();
}
void
WaitOnPlaces
(
const
std
::
vector
<
platform
::
Place
>
places
)
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
for
(
auto
&
place
:
places
)
{
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
dev_ctx
.
Wait
();
}
}
class
ParallelDoOp
:
public
framework
::
OperatorBase
{
public:
ParallelDoOp
(
const
std
::
string
&
type
,
const
framework
::
VariableNameMap
&
inputs
,
const
framework
::
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
)
:
framework
::
OperatorBase
(
type
,
inputs
,
outputs
,
attrs
)
{}
private:
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
override
{
// get device context from pool
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
place
);
auto
*
block
=
Attr
<
framework
::
BlockDesc
*>
(
kParallelBlock
);
auto
*
program
=
block
->
Program
();
auto
&
places
=
scope
.
FindVar
(
Input
(
kPlaces
))
->
Get
<
platform
::
PlaceList
>
();
auto
&
sub_scopes
=
*
scope
.
FindVar
(
Output
(
kParallelScopes
))
->
GetMutable
<
std
::
vector
<
framework
::
Scope
*>>
();
// split input
SplitTensorAndMoveTensorToScopes
(
scope
,
&
sub_scopes
,
places
,
Inputs
(
kInputs
));
// copy parameter
for
(
auto
&
param
:
Inputs
(
kParameters
))
{
PADDLE_ENFORCE
(
scope
.
FindVar
(
param
)
->
IsType
<
LoDTensor
>
(),
"Only support parameter type as LoDTensor"
);
auto
&
src
=
scope
.
FindVar
(
param
)
->
Get
<
LoDTensor
>
();
auto
*
sub_scope0
=
sub_scopes
[
0
];
auto
*
dst0
=
sub_scope0
->
Var
(
param
)
->
GetMutable
<
LoDTensor
>
();
dst0
->
ShareDataWith
(
src
);
for
(
size_t
i
=
1
;
i
<
sub_scopes
.
size
();
++
i
)
{
auto
&
place
=
places
[
i
];
auto
*
sub_scope
=
sub_scopes
[
i
];
auto
*
dst
=
sub_scope
->
Var
(
param
)
->
GetMutable
<
LoDTensor
>
();
framework
::
TensorCopy
(
src
,
place
,
dst
);
}
}
WaitOnPlaces
(
places
);
std
::
vector
<
std
::
future
<
void
>>
workers
;
workers
.
reserve
(
places
.
size
());
for
(
size_t
place_idx
=
0
;
place_idx
<
sub_scopes
.
size
();
++
place_idx
)
{
auto
&
place
=
places
[
place_idx
];
auto
*
cur_scope
=
sub_scopes
[
place_idx
];
workers
.
emplace_back
(
framework
::
Async
([
program
,
cur_scope
,
place
,
block
]
{
framework
::
Executor
executor
(
place
);
executor
.
Run
(
*
program
,
cur_scope
,
block
->
ID
(),
false
/*create_local_scope*/
);
}));
}
for
(
auto
&
worker
:
workers
)
{
worker
.
wait
();
}
WaitOnPlaces
(
places
);
// merge output
for
(
auto
&
o_name
:
Outputs
(
kOutputs
))
{
std
::
vector
<
const
framework
::
LoDTensor
*>
lod_tensors
;
lod_tensors
.
reserve
(
sub_scopes
.
size
());
for
(
auto
*
sub_scope
:
sub_scopes
)
{
lod_tensors
.
emplace_back
(
&
sub_scope
->
FindVar
(
o_name
)
->
Get
<
LoDTensor
>
());
}
auto
*
lod_tensor_to_be_merged
=
scope
.
FindVar
(
o_name
)
->
GetMutable
<
LoDTensor
>
();
lod_tensor_to_be_merged
->
MergeLoDTensor
(
lod_tensors
,
dev_ctx
.
GetPlace
());
}
WaitOnPlaces
(
places
);
}
};
class
ParallelDoOpProtoMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
kInputs
,
""
).
AsDuplicable
();
AddInput
(
kParameters
,
""
).
AsDuplicable
();
AddInput
(
kPlaces
,
""
);
AddOutput
(
kOutputs
,
""
).
AsDuplicable
();
AddOutput
(
kParallelScopes
,
""
);
AddAttr
<
framework
::
BlockDesc
*>
(
kParallelBlock
,
""
);
AddAttr
<
bool
>
(
kUseNCCL
,
"true if we use nccl on backward"
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
ParallelDo Operator.
)DOC"
);
}
};
class
ParallelDoGradOp
:
public
framework
::
OperatorBase
{
public:
ParallelDoGradOp
(
const
std
::
string
&
type
,
const
framework
::
VariableNameMap
&
inputs
,
const
framework
::
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
)
:
framework
::
OperatorBase
(
type
,
inputs
,
outputs
,
attrs
)
{}
private:
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
override
{
auto
*
block
=
Attr
<
framework
::
BlockDesc
*>
(
kParallelBlock
);
auto
*
program
=
block
->
Program
();
auto
&
sub_scopes
=
scope
.
FindVar
(
Input
(
kParallelScopes
))
->
Get
<
std
::
vector
<
framework
::
Scope
*>>
();
auto
&
places
=
scope
.
FindVar
(
Input
(
kPlaces
))
->
Get
<
platform
::
PlaceList
>
();
// feed output@grad
SplitTensorAndMoveTensorToScopes
(
scope
,
const_cast
<
std
::
vector
<
framework
::
Scope
*>
*>
(
&
sub_scopes
),
places
,
Inputs
(
framework
::
GradVarName
(
kOutputs
)));
WaitOnPlaces
(
places
);
// exe run
std
::
vector
<
std
::
future
<
void
>>
workers
;
for
(
size_t
i
=
0
;
i
<
sub_scopes
.
size
();
++
i
)
{
auto
&
place
=
places
[
i
];
auto
*
cur_scope
=
sub_scopes
[
i
];
// execute
workers
.
emplace_back
(
framework
::
Async
([
program
,
cur_scope
,
place
,
block
]
{
framework
::
Executor
executor
(
place
);
executor
.
Run
(
*
program
,
cur_scope
,
block
->
ID
(),
false
/*create_local_scope*/
);
}));
}
for
(
auto
&
worker
:
workers
)
{
worker
.
wait
();
}
WaitOnPlaces
(
places
);
// NCCL allreduce op will be added by backward,
// so no need to explicitly accumulate grad
if
(
!
(
Attr
<
bool
>
(
kUseNCCL
)))
{
AccumulateGrad
(
scope
,
place
,
sub_scopes
,
places
);
}
else
{
for
(
auto
&
place
:
places
)
{
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
place
),
"NCCL only supports cuda place"
);
}
}
for
(
auto
&
s
:
Outputs
(
framework
::
GradVarName
(
kParameters
)))
{
if
(
s
==
framework
::
kEmptyVarName
)
{
continue
;
}
VLOG
(
3
)
<<
"Moving "
<<
s
;
CopyOrShare
(
*
sub_scopes
[
0
]
->
FindVar
(
s
),
place
,
scope
.
FindVar
(
s
));
}
WaitOnPlaces
(
places
);
}
void
AccumulateGrad
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
place
,
const
std
::
vector
<
framework
::
Scope
*>
&
sub_scopes
,
const
platform
::
PlaceList
&
places
)
const
{
for
(
auto
&
s
:
Outputs
(
framework
::
GradVarName
(
kParameters
)))
{
if
(
s
==
framework
::
kEmptyVarName
)
{
continue
;
}
VLOG
(
3
)
<<
"Accumulating "
<<
s
;
if
(
s
==
framework
::
kEmptyVarName
)
continue
;
std
::
string
tmp_name
;
auto
*
tmp
=
sub_scopes
[
0
]
->
Var
(
&
tmp_name
);
for
(
size_t
i
=
1
;
i
<
sub_scopes
.
size
();
++
i
)
{
CopyOrShare
(
*
sub_scopes
[
i
]
->
FindVar
(
s
),
places
[
0
],
tmp
);
WaitOnPlaces
(
places
);
auto
sum_op
=
framework
::
OpRegistry
::
CreateOp
(
"sum"
,
{{
"X"
,
{
s
,
tmp_name
}}},
{{
"Out"
,
{
s
}}},
framework
::
AttributeMap
{{
"use_mkldnn"
,
{
false
}}});
VLOG
(
10
)
<<
sum_op
->
DebugStringEx
(
sub_scopes
[
0
]);
sum_op
->
Run
(
*
sub_scopes
[
0
],
places
[
0
]);
WaitOnPlace
(
places
[
0
]);
}
CopyOrShare
(
*
sub_scopes
[
0
]
->
FindVar
(
s
),
place
,
scope
.
FindVar
(
s
));
}
WaitOnPlaces
(
places
);
}
};
std
::
ostream
&
operator
<<
(
std
::
ostream
&
sout
,
const
std
::
vector
<
std
::
string
>
&
strs
)
{
std
::
copy
(
strs
.
begin
(),
strs
.
end
(),
std
::
ostream_iterator
<
std
::
string
>
(
sout
,
","
));
return
sout
;
}
class
ParallelDoGradOpDescMaker
:
public
framework
::
SingleGradOpDescMaker
{
public:
using
framework
::
SingleGradOpDescMaker
::
SingleGradOpDescMaker
;
protected:
virtual
std
::
unique_ptr
<
framework
::
OpDesc
>
Apply
()
const
{
auto
*
grad
=
new
framework
::
OpDesc
();
grad
->
SetType
(
"parallel_do_grad"
);
for
(
auto
&
input_param
:
this
->
InputNames
())
{
VLOG
(
3
)
<<
input_param
;
grad
->
SetInput
(
input_param
,
this
->
Input
(
input_param
));
if
(
input_param
!=
kPlaces
)
{
grad
->
SetOutput
(
framework
::
GradVarName
(
input_param
),
this
->
InputGrad
(
input_param
,
false
));
}
}
auto
*
g_block
=
this
->
grad_block_
[
0
];
// All variable name that needed by gradient operators
std
::
unordered_set
<
std
::
string
>
all_inputs_in_grad_blocks
;
for
(
size_t
i
=
0
;
i
<
g_block
->
OpSize
();
++
i
)
{
auto
*
op
=
g_block
->
Op
(
i
);
for
(
auto
&
var_name
:
op
->
InputArgumentNames
())
{
all_inputs_in_grad_blocks
.
insert
(
var_name
);
}
}
for
(
auto
&
output_param
:
this
->
OutputNames
())
{
if
(
output_param
==
kParallelScopes
)
{
grad
->
SetInput
(
output_param
,
this
->
Output
(
output_param
));
grad
->
SetInput
(
framework
::
GradVarName
(
output_param
),
this
->
Output
(
output_param
));
}
else
{
grad
->
SetInput
(
output_param
,
this
->
Output
(
output_param
));
std
::
vector
<
std
::
string
>
og_names
;
for
(
auto
&
og_name
:
this
->
OutputGrad
(
output_param
))
{
if
(
all_inputs_in_grad_blocks
.
count
(
og_name
)
!=
0
)
{
// there are some gradient operators who need the OG. So make this
// OG as an input of parallel.do
og_names
.
push_back
(
og_name
);
}
// else, there is no operator who need the OG. Do not use this OG as
// an input
}
grad
->
SetInput
(
framework
::
GradVarName
(
output_param
),
og_names
);
}
}
grad
->
SetInput
(
"Communicator"
,
{
"nccl_com__do_not_change_"
});
grad
->
SetAttrMap
(
this
->
Attrs
());
grad
->
SetBlockAttr
(
kParallelBlock
,
grad_block_
[
0
]);
return
std
::
unique_ptr
<
framework
::
OpDesc
>
(
grad
);
}
};
class
ParallelDoGradOpShapeInference
:
public
framework
::
InferShapeBase
{
public:
void
operator
()(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInputs
(
kParameters
));
PADDLE_ENFORCE
(
ctx
->
HasInputs
(
kInputs
));
PADDLE_ENFORCE
(
ctx
->
HasInputs
(
kOutputs
));
ctx
->
SetOutputsDim
(
framework
::
GradVarName
(
kParameters
),
ctx
->
GetInputsDim
(
kParameters
));
auto
i_dims
=
ctx
->
GetInputsDim
(
kInputs
);
auto
ig_names
=
ctx
->
Outputs
(
framework
::
GradVarName
(
kInputs
));
for
(
size_t
i
=
0
;
i
<
ig_names
.
size
();
++
i
)
{
auto
&
ig_name
=
ig_names
[
i
];
if
(
ig_name
==
framework
::
kEmptyVarName
)
{
continue
;
}
ctx
->
SetDims
({
ig_name
},
{
i_dims
[
i
]});
}
auto
p_dims
=
ctx
->
GetInputsDim
(
kParameters
);
auto
pg_names
=
ctx
->
Outputs
(
framework
::
GradVarName
(
kParameters
));
for
(
size_t
i
=
0
;
i
<
pg_names
.
size
();
++
i
)
{
auto
&
pg_name
=
pg_names
[
i
];
if
(
pg_name
==
framework
::
kEmptyVarName
)
{
continue
;
}
ctx
->
SetDims
({
pg_name
},
{
p_dims
[
i
]});
}
}
};
class
ParallelDoGradOpVarTypeInference
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
framework
::
BlockDesc
*
sub_block
=
boost
::
get
<
framework
::
BlockDesc
*>
(
op_desc
.
GetAttr
(
kParallelBlock
));
for
(
auto
&
out_vars
:
op_desc
.
Outputs
())
{
for
(
auto
&
out_var
:
out_vars
.
second
)
{
auto
&
var
=
block
->
FindRecursiveOrCreateVar
(
out_var
);
auto
sub_var
=
sub_block
->
FindRecursiveOrCreateVar
(
out_var
);
if
(
sub_var
.
GetType
()
!=
var
.
GetType
())
{
var
.
SetType
(
sub_var
.
GetType
());
}
}
}
}
};
}
// namespace operators
}
// namespace paddle
REGISTER_OPERATOR
(
parallel_do
,
paddle
::
operators
::
ParallelDoOp
,
paddle
::
operators
::
ParallelDoOpProtoMaker
,
paddle
::
operators
::
ParallelDoGradOpDescMaker
);
REGISTER_OPERATOR
(
parallel_do_grad
,
paddle
::
operators
::
ParallelDoGradOp
,
paddle
::
operators
::
ParallelDoGradOpShapeInference
,
paddle
::
operators
::
ParallelDoGradOpVarTypeInference
);
paddle/fluid/operators/hierarchical_sigmoid_op.h
浏览文件 @
5c7a8aee
...
...
@@ -150,19 +150,27 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
label
.
data
<
int64_t
>
()));
}
auto
&
place
=
*
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
auto
pre_out_mat
=
EigenMatrix
<
T
>::
From
(
pre_out
);
auto
pre_out_grad_mat
=
EigenMatrix
<
T
>::
From
(
pre_out_grad
);
auto
out_grad_mat
=
EigenMatrix
<
T
>::
From
(
out_grad
);
// softrelu derivative
Eigen
::
array
<
int
,
2
>
bcast
{
1
,
static_cast
<
int
>
(
pre_out_grad
.
dims
()[
1
])}
;
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
ctx
)
;
// softrelu derivative
pre_out_grad_mat
.
device
(
place
)
=
static_cast
<
T
>
(
1.0
)
-
static_cast
<
T
>
(
1.0
)
/
pre_out_mat
.
exp
();
auto
*
pre_out_grad_data
=
pre_out_grad
.
data
<
T
>
();
auto
*
pre_out_data
=
pre_out
.
data
<
T
>
();
auto
n
=
pre_out
.
numel
();
blas
.
VEXP
(
n
,
pre_out_data
,
pre_out_grad_data
);
blas
.
VINV
(
n
,
pre_out_grad_data
,
pre_out_grad_data
);
for
(
int64_t
i
=
0
;
i
<
n
;
++
i
)
{
pre_out_grad_data
[
i
]
=
1.0
-
pre_out_grad_data
[
i
];
}
bit_code
->
Sub
(
&
pre_out_grad
);
// the gradient of clip(w * x + b)
pre_out_grad_mat
.
device
(
place
)
=
pre_out_grad_mat
*
out_grad_mat
.
broadcast
(
bcast
);
auto
*
out_grad_data
=
out_grad
.
data
<
T
>
();
int64_t
dim0
=
pre_out_grad
.
dims
()[
0
];
int64_t
dim1
=
pre_out_grad
.
dims
()[
1
];
for
(
int64_t
i
=
0
;
i
<
dim0
;
++
i
)
{
T
tmp
=
out_grad_data
[
i
];
blas
.
SCAL
(
dim1
,
tmp
,
pre_out_grad_data
+
i
*
dim1
);
}
// TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
// be consistent with the clipping in forward.
...
...
paddle/fluid/operators/huber_loss_op.cc
浏览文件 @
5c7a8aee
...
...
@@ -124,8 +124,9 @@ REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
huber_loss_grad
,
ops
::
HuberLossGradOp
);
REGISTER_OP_CPU_KERNEL
(
huber_loss
,
ops
::
HuberLossKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
);
huber_loss
,
ops
::
HuberLossKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
HuberLossKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
REGISTER_OP_CPU_KERNEL
(
huber_loss_grad
,
ops
::
HuberLossGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
);
ops
::
HuberLossGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
HuberLossGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
paddle/fluid/operators/math/blas.h
浏览文件 @
5c7a8aee
...
...
@@ -181,6 +181,9 @@ class Blas {
const
framework
::
Tensor
&
mat_b
,
const
MatDescriptor
&
dim_b
,
T
alpha
,
framework
::
Tensor
*
mat_out
,
T
beta
)
const
;
template
<
typename
T
>
void
VINV
(
int
n
,
const
T
*
a
,
T
*
y
)
const
;
private:
const
DeviceContext
&
context_
;
};
...
...
@@ -282,6 +285,11 @@ class BlasT : private Blas<DeviceContext> {
Base
()
->
template
BatchedGEMM
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
VINV
(
ARGS
...
args
)
const
{
Base
()
->
template
VINV
<
T
>(
args
...);
}
private:
const
Blas
<
DeviceContext
>*
Base
()
const
{
return
static_cast
<
const
Blas
<
DeviceContext
>*>
(
this
);
...
...
paddle/fluid/operators/math/blas_impl.h
浏览文件 @
5c7a8aee
...
...
@@ -118,6 +118,11 @@ struct CBlas<float> {
static
void
VPOW
(
ARGS
...
args
)
{
platform
::
dynload
::
vsPowx
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
VINV
(
ARGS
...
args
)
{
platform
::
dynload
::
vsInv
(
args
...);
}
};
template
<
>
...
...
@@ -213,6 +218,11 @@ struct CBlas<double> {
static
void
VPOW
(
ARGS
...
args
)
{
platform
::
dynload
::
vdPowx
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
VINV
(
ARGS
...
args
)
{
platform
::
dynload
::
vdInv
(
args
...);
}
};
#else
...
...
@@ -603,6 +613,17 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
dim_a
.
stride_
,
dim_b
.
stride_
);
}
}
template
<
typename
DeviceContext
>
template
<
typename
T
>
void
Blas
<
DeviceContext
>::
VINV
(
int
n
,
const
T
*
a
,
T
*
y
)
const
{
#ifdef PADDLE_WITH_MKLML
CBlas
<
T
>::
VINV
(
n
,
a
,
y
);
#else
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
1.0
/
a
[
i
];
}
#endif
}
}
// namespace math
}
// namespace operators
...
...
paddle/fluid/operators/math/matrix_bit_code.cc
浏览文件 @
5c7a8aee
...
...
@@ -14,195 +14,334 @@ limitations under the License. */
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include <iostream>
#include <map>
namespace
paddle
{
namespace
operators
{
namespace
math
{
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Add
(
const
framework
::
Tensor
&
vec
,
framework
::
Tensor
*
tmat
)
{
size_t
batch_size
=
tmat
->
dims
()[
0
];
size_t
width
=
tmat
->
dims
()[
1
];
struct
MatrixBitCodeFunctorAdd
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
vec_
;
framework
::
Tensor
*
tmat_
;
MatrixBitCodeFunctorAdd
(
const
framework
::
Tensor
&
vec
,
framework
::
Tensor
*
tmat
)
:
vec_
(
vec
),
tmat_
(
tmat
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
size_t
batch_size
=
tmat_
->
dims
()[
0
];
size_t
width
=
tmat_
->
dims
()[
1
];
auto
*
tmat_data
=
tmat_
->
data
<
T
>
();
auto
*
vec_data
=
vec_
.
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
->
calc_index
(
j
);
tmat
->
data
<
T
>
()[
i
*
width
+
j
]
+=
vec
.
data
<
T
>
()
[
index
];
size_t
index
=
code
.
calc_index
(
j
);
tmat_data
[
i
*
width
+
j
]
+=
vec_data
[
index
];
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Add
(
const
framework
::
Tensor
&
vec
,
framework
::
Tensor
*
tmat
)
{
MatrixBitCodeFunctorAdd
<
T
>
func
(
vec
,
tmat
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
vec
)
{
size_t
batch_size
=
tmat
.
dims
()[
0
];
size_t
width
=
tmat
.
dims
()[
1
];
struct
MatrixBitCodeFunctorAddGrad
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
tmat_
;
framework
::
Tensor
*
vec_
;
MatrixBitCodeFunctorAddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
vec
)
:
tmat_
(
tmat
),
vec_
(
vec
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
table
)
{
size_t
batch_size
=
tmat_
.
dims
()[
0
];
size_t
width
=
tmat_
.
dims
()[
1
];
auto
*
vec_data
=
vec_
->
data
<
T
>
();
auto
*
tmat_data
=
tmat_
.
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
auto
code
=
table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
->
calc_index
(
j
);
vec
->
data
<
T
>
()[
index
]
+=
tmat
.
data
<
T
>
()[
i
*
width
+
j
];
size_t
index
=
code
.
calc_index
(
j
);
vec_data
[
index
]
+=
tmat_data
[
i
*
width
+
j
];
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
vec
)
{
MatrixBitCodeFunctorAddGrad
<
T
>
func
(
tmat
,
vec
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
vec
)
{
size_t
batch_size
=
tmat
.
dims
()[
0
];
size_t
width
=
tmat
.
dims
()[
1
];
struct
MatrixBitCodeFunctorSelectedRowsAddGrad
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
tmat_
;
framework
::
SelectedRows
*
vec_
;
MatrixBitCodeFunctorSelectedRowsAddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
vec
)
:
tmat_
(
tmat
),
vec_
(
vec
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
size_t
batch_size
=
tmat_
.
dims
()[
0
];
size_t
width
=
tmat_
.
dims
()[
1
];
auto
*
vec_data
=
vec_
->
mutable_value
()
->
template
data
<
T
>();
auto
*
tmat_data
=
tmat_
.
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
->
calc_index
(
j
);
int64_t
row_index
=
vec
->
GetIndexFromId
(
static_cast
<
int64_t
>
(
index
));
vec
->
mutable_value
()
->
data
<
T
>
()[
row_index
]
+=
tmat
.
data
<
T
>
()[
i
*
width
+
j
];
size_t
index
=
code
.
calc_index
(
j
);
int64_t
row_index
=
vec_
->
GetIndexFromId
(
static_cast
<
int64_t
>
(
index
));
vec_data
[
row_index
]
+=
tmat_data
[
i
*
width
+
j
];
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
vec
)
{
MatrixBitCodeFunctorSelectedRowsAddGrad
<
T
>
func
(
tmat
,
vec
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Sum
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
sum
,
T
scale_sum
)
{
size_t
num_samples
=
tmat
.
dims
()[
0
];
size_t
o_width
=
tmat
.
dims
()[
1
];
struct
MatrixBitCodeFunctorSum
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
tmat_
;
framework
::
Tensor
*
sum_
;
T
scale_sum_
;
MatrixBitCodeFunctorSum
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
sum
,
T
scale_sum
)
:
tmat_
(
tmat
),
sum_
(
sum
),
scale_sum_
(
scale_sum
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
size_t
num_samples
=
tmat_
.
dims
()[
0
];
size_t
o_width
=
tmat_
.
dims
()[
1
];
auto
*
tmat_data
=
tmat_
.
data
<
T
>
();
auto
*
sum_data
=
sum_
->
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
T
sm
=
static_cast
<
T
>
(
0.0
);
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
if
(
code
->
calc_bit
(
j
))
{
// calc_bit starts from right most bit, while data in tmat[i] is in the
if
(
code
.
calc_bit
(
j
))
{
// calc_bit starts from right most bit, while data in tmat[i] is in
// the
// reverse order.
sm
+=
tmat
.
data
<
T
>
()
[
i
*
o_width
+
j
];
sm
+=
tmat_data
[
i
*
o_width
+
j
];
}
}
sum
->
data
<
T
>
()[
i
]
=
scale_sum
*
sm
;
sum_data
[
i
]
=
scale_sum_
*
sm
;
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Sum
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
sum
,
T
scale_sum
)
{
MatrixBitCodeFunctorSum
<
T
>
func
(
tmat
,
sum
,
scale_sum
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Mul
(
framework
::
Tensor
*
tmat
,
const
framework
::
Tensor
&
weight
,
const
framework
::
Tensor
&
input
)
{
struct
MatrixBitCodeFunctorMul
:
public
boost
::
static_visitor
<
void
>
{
framework
::
Tensor
*
tmat_
;
const
framework
::
Tensor
&
weight_
;
const
framework
::
Tensor
&
input_
;
MatrixBitCodeFunctorMul
(
framework
::
Tensor
*
tmat
,
const
framework
::
Tensor
&
weight
,
const
framework
::
Tensor
&
input
)
:
tmat_
(
tmat
),
weight_
(
weight
),
input_
(
input
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
auto
blas
=
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
platform
::
CPUDeviceContext
());
size_t
num_samples
=
tmat
->
dims
()[
0
];
size_t
tmat_width
=
tmat
->
dims
()[
1
];
size_t
input_width
=
input
.
dims
()[
1
];
size_t
weight_width
=
weight
.
dims
()[
1
];
auto
tmat_value
=
tmat
->
data
<
T
>
();
auto
weight_value
=
weight
.
data
<
T
>
();
auto
input_value
=
input
.
data
<
T
>
();
size_t
num_samples
=
tmat_
->
dims
()[
0
];
size_t
tmat_width
=
tmat_
->
dims
()[
1
];
size_t
input_width
=
input_
.
dims
()[
1
];
size_t
weight_width
=
weight_
.
dims
()[
1
];
auto
tmat_value
=
tmat_
->
data
<
T
>
();
auto
weight_value
=
weight_
.
data
<
T
>
();
auto
input_value
=
input_
.
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
const
T
*
input_row
=
input_value
+
input_width
*
i
;
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
const
T
*
input_row
=
input_value
+
input_width
*
i
;
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
->
calc_index
(
j
);
const
T
*
weight_row
=
weight_value
+
weight_width
*
index
;
T
sum
=
static_cast
<
T
>
(
0.0
);
sum
=
blas
.
DOT
(
input_width
,
weight_row
,
input_row
);
size_t
index
=
code
.
calc_index
(
j
);
const
T
*
weight_row
=
weight_value
+
weight_width
*
index
;
T
sum
=
blas
.
DOT
(
input_width
,
weight_row
,
input_row
);
tmat_value
[
i
*
tmat_width
+
j
]
+=
sum
;
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Mul
(
framework
::
Tensor
*
tmat
,
const
framework
::
Tensor
&
weight
,
const
framework
::
Tensor
&
input
)
{
MatrixBitCodeFunctorMul
<
T
>
func
(
tmat
,
weight
,
input
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
,
size_t
N
>
class
ReservedVector
:
public
std
::
vector
<
T
>
{
public:
ReservedVector
()
{
this
->
reserve
(
N
);
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
MulGradWeight
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
weight
,
const
framework
::
Tensor
&
input
)
{
struct
MatrixBitCodeFunctorMulGradWeight
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
tmat_
;
framework
::
Tensor
*
weight_
;
const
framework
::
Tensor
&
input_
;
MatrixBitCodeFunctorMulGradWeight
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
weight
,
const
framework
::
Tensor
&
input
)
:
tmat_
(
tmat
),
weight_
(
weight
),
input_
(
input
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
auto
blas
=
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
platform
::
CPUDeviceContext
());
size_t
num_samples
=
tmat
.
dims
()[
0
];
size_t
input_width
=
input
.
dims
()[
1
];
size_t
tmat_width
=
tmat
.
dims
()[
1
];
size_t
weight_width
=
weight
->
dims
()[
1
];
auto
tmat_value
=
tmat
.
data
<
T
>
();
auto
weight_value
=
weight
->
data
<
T
>
();
auto
input_value
=
input
.
data
<
T
>
();
std
::
unordered_map
<
int
,
std
::
vector
<
std
::
pair
<
T
,
const
T
*>>>
ops
;
size_t
num_samples
=
tmat_
.
dims
()[
0
];
size_t
input_width
=
input_
.
dims
()[
1
];
size_t
tmat_width
=
tmat_
.
dims
()[
1
];
size_t
weight_width
=
weight_
->
dims
()[
1
];
auto
tmat_value
=
tmat_
.
data
<
T
>
();
auto
weight_value
=
weight_
->
data
<
T
>
();
auto
input_value
=
input_
.
data
<
T
>
();
std
::
map
<
int
,
ReservedVector
<
std
::
pair
<
T
,
const
T
*>
,
8u
>>
ops
;
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
const
T
*
input_value_row
=
input_value
+
input_width
*
i
;
const
T
*
tmat_row
=
tmat_value
+
i
*
tmat_width
;
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
const
T
*
input_value_row
=
input_value
+
input_width
*
i
;
const
T
*
tmat_row
=
tmat_value
+
i
*
tmat_width
;
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
ops
[
code
->
calc_index
(
j
)].
emplace_back
(
tmat_row
[
j
],
input_value_row
);
ops
[
code
.
calc_index
(
j
)].
emplace_back
(
tmat_row
[
j
],
input_value_row
);
}
}
for
(
auto
&
op
:
ops
)
{
auto
&
op_in_row
=
op
.
second
;
for
(
auto
&
pair
:
op_in_row
)
{
auto
&
scale
=
pair
.
first
;
auto
*
input_row
=
pair
.
second
;
T
*
weight_row
=
weight_value
+
op
.
first
*
weight_width
;
for
(
auto
&
op
:
ops
)
{
auto
&
op_in_row
=
op
.
second
;
for
(
auto
&
pair
:
op_in_row
)
{
auto
&
scale
=
pair
.
first
;
auto
*
input_row
=
pair
.
second
;
T
*
weight_row
=
weight_value
+
op
.
first
*
weight_width
;
blas
.
AXPY
(
input_width
,
scale
,
input_row
,
weight_row
);
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
MulGradWeight
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
weight
,
const
framework
::
Tensor
&
input
)
{
MatrixBitCodeFunctorMulGradWeight
<
T
>
func
(
tmat
,
weight
,
input
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
MulGradWeight
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
weight
,
const
framework
::
Tensor
&
input
)
{
struct
MatrixBitCodeFunctorMulGradWeightSR
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
tmat_
;
framework
::
SelectedRows
*
weight_
;
const
framework
::
Tensor
&
input_
;
MatrixBitCodeFunctorMulGradWeightSR
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
weight
,
const
framework
::
Tensor
&
input
)
:
tmat_
(
tmat
),
weight_
(
weight
),
input_
(
input
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
auto
blas
=
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
platform
::
CPUDeviceContext
());
size_t
num_samples
=
tmat
.
dims
()[
0
];
size_t
input_width
=
input
.
dims
()[
1
];
size_t
tmat_width
=
tmat
.
dims
()[
1
];
size_t
weight_width
=
weight
->
value
().
dims
()[
1
];
auto
tmat_value
=
tmat
.
data
<
T
>
();
auto
weight_value
=
weight
->
mutable_value
()
->
data
<
T
>
();
auto
input_value
=
input
.
data
<
T
>
();
size_t
num_samples
=
tmat_
.
dims
()[
0
];
size_t
input_width
=
input_
.
dims
()[
1
];
size_t
tmat_width
=
tmat_
.
dims
()[
1
];
size_t
weight_width
=
weight_
->
value
().
dims
()[
1
];
auto
tmat_value
=
tmat_
.
data
<
T
>
();
auto
weight_value
=
weight_
->
mutable_value
()
->
data
<
T
>
();
auto
input_value
=
input_
.
data
<
T
>
();
std
::
unordered_map
<
int
,
std
::
vector
<
std
::
pair
<
T
,
const
T
*>>>
ops
;
ops
.
reserve
(
weight
->
rows
().
size
());
std
::
unordered_map
<
int
,
std
::
vector
<
std
::
pair
<
T
,
const
T
*>>>
ops
;
ops
.
reserve
(
weight_
->
rows
().
size
());
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
const
T
*
input_value_row
=
input_value
+
input_width
*
i
;
const
T
*
tmat_row
=
tmat_value
+
i
*
tmat_width
;
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
const
T
*
input_value_row
=
input_value
+
input_width
*
i
;
const
T
*
tmat_row
=
tmat_value
+
i
*
tmat_width
;
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
ops
[
code
->
calc_index
(
j
)].
emplace_back
(
tmat_row
[
j
],
input_value_row
);
ops
[
code
.
calc_index
(
j
)].
emplace_back
(
tmat_row
[
j
],
input_value_row
);
}
}
for
(
auto
&
row
:
weight
->
rows
())
{
auto
&
op_in_row
=
ops
[
row
];
for
(
auto
&
pair
:
op_in_row
)
{
auto
&
scale
=
pair
.
first
;
auto
*
input_row
=
pair
.
second
;
for
(
auto
&
row
:
weight_
->
rows
())
{
auto
&
op_in_row
=
ops
[
row
];
for
(
auto
&
pair
:
op_in_row
)
{
auto
&
scale
=
pair
.
first
;
auto
*
input_row
=
pair
.
second
;
blas
.
AXPY
(
input_width
,
scale
,
input_row
,
weight_value
);
}
weight_value
+=
weight_width
;
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
MulGradWeight
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
weight
,
const
framework
::
Tensor
&
input
)
{
MatrixBitCodeFunctorMulGradWeightSR
<
T
>
func
(
tmat
,
weight
,
input
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
MulGradError
(
const
framework
::
Tensor
&
tmat
,
const
framework
::
Tensor
&
weight
,
framework
::
Tensor
*
input
)
{
size_t
num_samples
=
tmat
.
dims
()[
0
];
size_t
tmat_width
=
tmat
.
dims
()[
1
];
size_t
input_width
=
input
->
dims
()[
1
];
size_t
weight_width
=
weight
.
dims
()[
1
];
auto
tmat_value
=
tmat
.
data
<
T
>
();
auto
weight_value
=
weight
.
data
<
T
>
();
auto
input_value
=
input
->
data
<
T
>
();
struct
MatrixBitCodeFunctorMulGradError
:
public
boost
::
static_visitor
<
void
>
{
const
framework
::
Tensor
&
tmat_
;
const
framework
::
Tensor
&
weight_
;
framework
::
Tensor
*
input_
;
MatrixBitCodeFunctorMulGradError
(
const
framework
::
Tensor
&
tmat
,
const
framework
::
Tensor
&
weight
,
framework
::
Tensor
*
input
)
:
tmat_
(
tmat
),
weight_
(
weight
),
input_
(
input
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
size_t
num_samples
=
tmat_
.
dims
()[
0
];
size_t
tmat_width
=
tmat_
.
dims
()[
1
];
size_t
input_width
=
input_
->
dims
()[
1
];
size_t
weight_width
=
weight_
.
dims
()[
1
];
auto
tmat_value
=
tmat_
.
data
<
T
>
();
auto
weight_value
=
weight_
.
data
<
T
>
();
auto
input_value
=
input_
->
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
->
calc_index
(
j
);
size_t
index
=
code
.
calc_index
(
j
);
for
(
size_t
k
=
0
;
k
<
input_width
;
++
k
)
{
input_value
[
input_width
*
i
+
k
]
+=
...
...
@@ -211,21 +350,44 @@ void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
}
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
MulGradError
(
const
framework
::
Tensor
&
tmat
,
const
framework
::
Tensor
&
weight
,
framework
::
Tensor
*
input
)
{
MatrixBitCodeFunctorMulGradError
<
T
>
func
(
tmat
,
weight
,
input
);
code_table_
.
apply_visitor
(
func
);
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Sub
(
framework
::
Tensor
*
tmat
)
{
size_t
num_samples
=
tmat
->
dims
()[
0
];
size_t
o_width
=
tmat
->
dims
()[
1
];
struct
MatrixBitCodeFunctorSub
:
public
boost
::
static_visitor
<
void
>
{
framework
::
Tensor
*
tmat_
;
explicit
MatrixBitCodeFunctorSub
(
framework
::
Tensor
*
tmat
)
:
tmat_
(
tmat
)
{}
template
<
typename
CodeTable
>
void
operator
()(
const
CodeTable
&
code_table
)
{
size_t
num_samples
=
tmat_
->
dims
()[
0
];
size_t
o_width
=
tmat_
->
dims
()[
1
];
auto
*
tmat_data
=
tmat_
->
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
auto
code
=
code_table
.
get_code
(
i
);
int
code_length
=
code
.
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
if
(
code
->
calc_bit
(
j
))
{
tmat
->
data
<
T
>
()[
i
*
o_width
+
j
]
-=
1
;
if
(
code
.
calc_bit
(
j
))
{
tmat_data
[
i
*
o_width
+
j
]
-=
1
;
}
}
}
}
};
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Sub
(
framework
::
Tensor
*
tmat
)
{
MatrixBitCodeFunctorSub
<
T
>
func
(
tmat
);
code_table_
.
apply_visitor
(
func
);
}
template
class
MatrixBitCodeFunctor
<
float
>;
...
...
paddle/fluid/operators/math/matrix_bit_code.h
浏览文件 @
5c7a8aee
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <unordered_map>
#include <utility>
#include <vector>
...
...
@@ -22,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/variant.h"
#if defined(_WIN32)
#include <intrin.h>
...
...
@@ -98,24 +100,7 @@ inline int clz(const T& value) {
inline
size_t
FindLastSet
(
size_t
x
)
{
return
sizeof
(
size_t
)
*
8
-
clz
(
x
);
}
#endif // !_WIN32
// set a code interface to create multiple code
class
Code
{
public:
virtual
~
Code
()
{}
virtual
size_t
calc_index
(
int
bit
)
const
=
0
;
virtual
bool
calc_bit
(
int
bit
)
const
=
0
;
virtual
int
get_length
()
const
=
0
;
};
// set a CodeTable interface to create multiple code table
class
CodeTable
{
public:
virtual
std
::
unique_ptr
<
Code
>
get_code
(
int64_t
code
)
const
=
0
;
virtual
size_t
size
()
const
=
0
;
virtual
int
get_max_code_length
()
const
=
0
;
virtual
~
CodeTable
()
{}
};
class
SimpleCode
:
public
Code
{
class
SimpleCode
{
public:
SimpleCode
(
size_t
code
,
size_t
num_classes
,
const
int64_t
*
ids
)
:
c_
(
static_cast
<
size_t
>
(
ids
[
code
])
+
num_classes
)
{}
...
...
@@ -137,16 +122,16 @@ class SimpleCode : public Code {
};
template
<
typename
T
>
class
CustomCode
:
public
Code
{
class
CustomCode
{
public:
CustomCode
(
const
framework
::
Tensor
&
ptable
,
const
framework
::
Tensor
&
pcode
,
const
int64_t
*
ids
,
int
index
)
:
ids_
(
ids
),
index_
(
index
)
{
ptable_
=
ptable
.
Slice
(
index
,
index
+
1
)
;
pcode_
=
pcode
.
Slice
(
index
,
index
+
1
)
;
const
int64_t
*
ids
,
int
index
)
{
seq_len_
=
ptable
.
dims
()[
1
];
ptable_
data_
=
ptable
.
data
<
T
>
()
+
seq_len_
*
index
;
pcode_
data_
=
pcode
.
data
<
T
>
()
+
seq_len_
*
index
;
}
/**
* Here the id of root shoud be 1 rather than 0, thus the encoding of class c
* Here the id of root shou
l
d be 1 rather than 0, thus the encoding of class c
* is `c + num_classes` and all siblings can get the same weight indice using
* prefixes.
* Weight index is the prefixes of encoding, thus leave out the right most
...
...
@@ -154,36 +139,37 @@ class CustomCode : public Code {
* Binary classification path is the suffixes of encoding, thus leave out the
* left most bit in calc_bit.
*/
size_t
calc_index
(
int
bit
)
const
{
return
ptable_
.
data
<
T
>
()[
bit
];
}
bool
calc_bit
(
int
bit
)
const
{
return
pcode_
.
data
<
T
>
()[
bit
];
}
int
get_length
()
const
{
int
length
=
0
;
size_t
calc_index
(
int
bit
)
const
{
return
ptable_data_
[
bit
];
}
bool
calc_bit
(
int
bit
)
const
{
return
pcode_data_
[
bit
];
}
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
ptable_
.
dims
()[
1
]);
i
++
)
{
if
(
ptable_
.
data
<
T
>
()[
i
]
>=
0
)
{
length
++
;
}
else
{
return
length
;
}
// NOTE: this function is not thread-safe.
int
get_length
()
const
{
if
(
length_
<
0
)
{
auto
len
=
seq_len_
;
length_
=
static_cast
<
int
>
(
std
::
find_if
(
ptable_data_
,
ptable_data_
+
len
,
[](
const
T
&
val
)
{
return
val
<
0
;
})
-
ptable_data_
);
}
return
length
;
return
length
_
;
}
private:
framework
::
Tensor
ptable
_
;
framework
::
Tensor
pcode
_
;
const
int64_t
*
ids
_
;
const
int
index_
;
int64_t
seq_len
_
;
const
T
*
ptable_data
_
;
const
T
*
pcode_data
_
;
mutable
int
length_
{
-
1
}
;
};
class
SimpleCodeTable
:
public
CodeTable
{
class
SimpleCodeTable
{
public:
SimpleCodeTable
(
size_t
num_classes
,
const
int64_t
*
ids
)
:
num_classes_
(
num_classes
),
ids_
(
ids
)
{}
std
::
unique_ptr
<
Code
>
get_code
(
int64_t
code
)
const
{
std
::
unique_ptr
<
Code
>
coder
(
new
SimpleCode
(
code
,
num_classes_
,
ids_
));
return
coder
;
SimpleCode
get_code
(
int64_t
code
)
const
{
return
SimpleCode
(
code
,
num_classes_
,
ids_
)
;
}
size_t
size
()
const
{
return
num_classes_
;
}
int
get_max_code_length
()
const
{
return
FindLastSet
(
num_classes_
-
1
);
}
...
...
@@ -193,15 +179,14 @@ class SimpleCodeTable : public CodeTable {
};
template
<
typename
T
>
class
CustomCodeTable
:
public
CodeTable
{
class
CustomCodeTable
{
public:
CustomCodeTable
(
const
framework
::
Tensor
&
ptable
,
const
framework
::
Tensor
&
pcode
,
const
int64_t
*
ids
)
:
ptable_
(
ptable
),
pcode_
(
pcode
),
ids_
(
ids
)
{}
std
::
unique_ptr
<
Code
>
get_code
(
int64_t
code
)
const
{
std
::
unique_ptr
<
Code
>
coder
(
new
CustomCode
<
T
>
(
ptable_
,
pcode_
,
ids_
,
code
));
return
coder
;
CustomCode
<
T
>
get_code
(
int64_t
code
)
const
{
return
CustomCode
<
T
>
(
ptable_
,
pcode_
,
ids_
,
code
);
}
size_t
size
()
const
{
return
static_cast
<
size_t
>
(
ptable_
.
dims
()[
1
]);
}
...
...
@@ -215,19 +200,21 @@ class CustomCodeTable : public CodeTable {
const
int64_t
*
ids_
;
};
using
CodeTable
=
boost
::
variant
<
SimpleCodeTable
,
CustomCodeTable
<
int64_t
>>
;
template
<
typename
T
>
class
MatrixBitCodeFunctor
{
public:
MatrixBitCodeFunctor
(
size_t
num_classes
,
const
int64_t
*
ids
)
:
num_classes_
(
num_classes
),
ids_
(
ids
),
code_table_
(
new
SimpleCodeTable
(
num_classes
,
ids
))
{}
code_table_
(
SimpleCodeTable
(
num_classes
,
ids
))
{}
MatrixBitCodeFunctor
(
const
framework
::
Tensor
&
ptable
,
const
framework
::
Tensor
&
pcode
,
const
int64_t
*
ids
)
:
num_classes_
(
static_cast
<
size_t
>
(
ptable
.
dims
()[
1
])),
ids_
(
ids
),
code_table_
(
new
CustomCodeTable
<
int64_t
>
(
ptable
,
pcode
,
ids
))
{}
code_table_
(
CustomCodeTable
<
int64_t
>
(
ptable
,
pcode
,
ids
))
{}
/* For j < code_length
tmat(i, j) += vec(0, index(i, j))
*/
...
...
@@ -277,7 +264,7 @@ class MatrixBitCodeFunctor {
size_t
num_classes_
;
const
int64_t
*
ids_
;
std
::
unique_ptr
<
CodeTable
>
code_table_
;
CodeTable
code_table_
;
};
}
// namespace math
}
// namespace operators
...
...
paddle/fluid/operators/optimizers/adam_op.cc
浏览文件 @
5c7a8aee
...
...
@@ -109,6 +109,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
"(float, default 1.0e-8) "
"Constant for numerical stability"
)
.
SetDefault
(
1.0e-8
f
);
AddAttr
<
bool
>
(
"lazy_mode"
,
"(bool, default false) "
"only update the parameter that has gradient in sparse update"
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
Adam Optimizer.
...
...
paddle/fluid/operators/optimizers/adam_op.h
浏览文件 @
5c7a8aee
...
...
@@ -177,12 +177,13 @@ struct SparseAdamFunctor {
const
int64_t
*
rows_
;
int64_t
row_numel_
;
int64_t
row_count_
;
bool
lazy_mode_
;
SparseAdamFunctor
(
T
beta1
,
T
beta2
,
T
epsilon
,
const
T
*
beta1_pow
,
const
T
*
beta2_pow
,
const
T
*
mom1
,
T
*
mom1_out
,
const
T
*
mom2
,
T
*
mom2_out
,
const
T
*
lr
,
const
T
*
grad
,
const
T
*
param
,
T
*
param_out
,
const
int64_t
*
rows
,
int64_t
row_numel
,
int64_t
row_count
)
int64_t
row_numel
,
int64_t
row_count
,
bool
lazy_mode
)
:
beta1_
(
beta1
),
beta2_
(
beta2
),
epsilon_
(
epsilon
),
...
...
@@ -198,13 +199,10 @@ struct SparseAdamFunctor {
param_out_
(
param_out
),
rows_
(
rows
),
row_numel_
(
row_numel
),
row_count_
(
row_count
)
{}
inline
HOSTDEVICE
void
operator
()(
size_t
i
)
const
{
auto
row_idx
=
math
::
BinarySearch
<
int64_t
>
(
rows_
,
row_count_
,
i
/
row_numel_
);
T
g
=
row_idx
>=
0
?
grad_
[
row_idx
*
row_numel_
+
i
%
row_numel_
]
:
0
;
row_count_
(
row_count
),
lazy_mode_
(
lazy_mode
)
{}
inline
HOSTDEVICE
void
adam_update
(
size_t
i
,
T
g
)
const
{
// The following code is the same as dense
T
mom1
=
moment1_
[
i
];
T
mom2
=
moment2_
[
i
];
...
...
@@ -225,6 +223,17 @@ struct SparseAdamFunctor {
moment2_out_
[
i
]
=
mom2
;
param_out_
[
i
]
=
p
;
}
inline
HOSTDEVICE
void
operator
()(
size_t
i
)
const
{
auto
row_idx
=
math
::
BinarySearch
<
int64_t
>
(
rows_
,
row_count_
,
i
/
row_numel_
);
if
(
lazy_mode_
&&
row_idx
<
0
)
{
return
;
}
else
{
T
g
=
row_idx
>=
0
?
grad_
[
row_idx
*
row_numel_
+
i
%
row_numel_
]
:
0
;
adam_update
(
i
,
g
);
}
}
};
template
<
typename
DeviceContext
,
typename
T
>
...
...
@@ -240,6 +249,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
using
paddle
::
framework
::
LoDTensor
;
using
paddle
::
operators
::
detail
::
Ref
;
bool
lazy_mode
=
ctx
.
Attr
<
bool
>
(
"lazy_mode"
);
T
beta1
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"beta1"
));
T
beta2
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"beta2"
));
T
epsilon
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"epsilon"
));
...
...
@@ -351,11 +361,23 @@ class AdamOpKernel : public framework::OpKernel<T> {
mom2_out
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
lr
.
template
data
<
T
>(),
grad_data
,
param
.
template
data
<
T
>(),
param_out
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
rows
,
row_numel
,
grad_merge
.
rows
().
size
());
grad_merge
.
rows
().
size
(),
lazy_mode
);
VLOG
(
3
)
<<
"lazy_mode :"
<<
lazy_mode
;
if
(
lazy_mode
&&
platform
::
is_cpu_place
(
ctx
.
GetPlace
()))
{
size_t
row_count
=
grad_merge
.
rows
().
size
();
std
::
vector
<
int64_t
>
cpu_rows
(
grad_merge
.
rows
());
for
(
size_t
row_index
=
0
;
row_index
<
row_count
;
++
row_index
)
{
for
(
size_t
offset
=
0
;
offset
<
row_numel
;
++
offset
)
{
size_t
i
=
cpu_rows
[
row_index
]
*
row_numel
+
offset
;
functor
.
adam_update
(
i
,
grad_data
[
row_index
*
row_numel
+
offset
]);
}
}
}
else
{
platform
::
ForRange
<
DeviceContext
>
for_range
(
static_cast
<
const
DeviceContext
&>
(
ctx
.
device_context
()),
param
.
numel
());
for_range
(
functor
);
}
}
else
{
PADDLE_THROW
(
"Variable type not supported by adam_op"
);
}
...
...
paddle/fluid/operators/tensorrt/CMakeLists.txt
浏览文件 @
5c7a8aee
op_library
(
tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter
)
file
(
APPEND
${
pybind_file
}
"USE_
CUDA_ONLY
_OP(tensorrt_engine);
\n
"
)
file
(
APPEND
${
pybind_file
}
"USE_
NO_KERNEL
_OP(tensorrt_engine);
\n
"
)
nv_test
(
test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
DEPS tensorrt_engine_op
analysis
)
paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
浏览文件 @
5c7a8aee
...
...
@@ -21,8 +21,6 @@
namespace
paddle
{
DEFINE_int32
(
tensorrt_engine_batch_size
,
1
,
"the batch_size of TensorRT"
);
namespace
operators
{
class
TensorRTEngineOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
...
...
@@ -31,7 +29,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput
(
"Xs"
,
"A list of inputs."
).
AsDuplicable
();
AddOutput
(
"Ys"
,
"A list of outputs"
).
AsDuplicable
();
AddAttr
<
std
::
string
>
(
"subgraph"
,
"the subgraph."
);
AddAttr
<
std
::
string
>
(
"engine_uniq_key"
,
"unique key for the TRT engine."
);
AddAttr
<
int
>
(
"max_batch_size"
,
"the maximum batch size."
);
AddAttr
<
int
>
(
"workspace_size"
,
"the workspace size."
);
AddComment
(
"TensorRT engine operator."
);
...
...
@@ -50,6 +47,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference {
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
tensorrt_engine
,
ops
::
TensorRTEngineOp
,
ops
::
TensorRTEngineOpMaker
,
ops
::
TensorRTEngineOpMaker
);
ops
::
TensorRTEngineOpMaker
);
#endif // PADDLE_WITH_CUDA
paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc
已删除
100644 → 0
浏览文件 @
b3cf476d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
tensorrt_engine
,
ops
::
TensorRTEngineKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
TensorRTEngineKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
TensorRTEngineKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
TensorRTEngineKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
浏览文件 @
5c7a8aee
...
...
@@ -27,8 +27,6 @@
namespace
paddle
{
DECLARE_int32
(
tensorrt_engine_batch_size
);
namespace
operators
{
using
FluidDT
=
framework
::
proto
::
VarType_Type
;
...
...
@@ -49,7 +47,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) {
return
TRT_DT
::
kINT32
;
}
nvinfer1
::
Dims
Vec2TRT_Dims
(
const
std
::
vector
<
int64_t
>
&
shape
)
{
nvinfer1
::
Dims
Vec2TRT_Dims
(
const
std
::
vector
<
int64_t
>
&
shape
)
{
PADDLE_ENFORCE_GT
(
shape
.
size
(),
1UL
,
"TensorRT' tensor input requires at least 2 dimensions"
);
PADDLE_ENFORCE_LE
(
shape
.
size
(),
4UL
,
...
...
@@ -63,128 +61,119 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
}
// namespace // NOLINT
using
inference
::
Singleton
;
using
inference
::
tensorrt
::
TRT_EngineManager
;
using
inference
::
tensorrt
::
TensorRTEngine
;
class
TensorRTEngineOp
:
public
framework
::
OperatorBase
{
private:
std
::
vector
<
std
::
string
>
input_names_
;
std
::
unordered_set
<
std
::
string
>
param_names_
;
mutable
std
::
unique_ptr
<
TensorRTEngine
>
trt_engine_
;
int
max_batch_size_
;
int
workspace_size_
;
class
TensorRTEngineOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
TensorRTEngineOp
(
const
std
::
string
&
type
,
const
framework
::
VariableNameMap
&
inputs
,
const
framework
::
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
)
:
framework
::
OperatorBase
(
type
,
inputs
,
outputs
,
attrs
)
{
input_names_
=
Inputs
(
"Xs"
);
max_batch_size_
=
Attr
<
int
>
(
"max_batch_size"
);
workspace_size_
=
Attr
<
int
>
(
"workspace_size"
);
auto
params
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"parameters"
);
for
(
const
auto
&
param
:
params
)
{
param_names_
.
insert
(
param
);
}
}
protected:
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{}
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
input0
=
ctx
.
Inputs
(
"Xs"
).
front
();
framework
::
OpKernelType
kt
=
framework
::
OpKernelType
(
ctx
.
scope
().
FindVar
(
input0
)
->
GetMutable
<
framework
::
LoDTensor
>
()
->
type
(),
ctx
.
GetPlace
());
return
kt
;
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
override
{
RunTrt
(
scope
,
dev_place
);
}
};
template
<
typename
DeviceContext
,
typename
T
>
class
TensorRTEngineKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
engine_name
=
context
.
Attr
<
std
::
string
>
(
"engine_uniq_key"
);
int
max_batch_size
=
context
.
Attr
<
int
>
(
"max_batch_size"
);
if
(
!
Singleton
<
TRT_EngineManager
>::
Global
().
HasEngine
(
engine_name
))
{
Prepare
(
context
);
void
RunTrt
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
{
int
runtime_batch
=
1
;
if
(
trt_engine_
.
get
()
==
nullptr
)
{
trt_engine_
.
reset
(
new
TensorRTEngine
(
max_batch_size_
,
workspace_size_
,
nullptr
,
boost
::
get
<
platform
::
CUDAPlace
>
(
dev_place
).
device
));
Prepare
(
scope
,
dev_place
,
trt_engine_
.
get
()
);
}
auto
*
engine
=
Singleton
<
TRT_EngineManager
>::
Global
().
Get
(
engine_name
);
auto
input_names
=
context
.
op
().
Inputs
(
"Xs"
);
PADDLE_ENFORCE
(
!
input_names
.
empty
(),
"should pass more than one inputs"
);
PADDLE_ENFORCE_LE
(
FLAGS_tensorrt_engine_batch_size
,
max_batch_size
);
auto
*
engine
=
trt_engine_
.
get
();
PADDLE_ENFORCE
(
!
input_names_
.
empty
(),
"should pass more than one inputs"
);
std
::
vector
<
std
::
string
>
output_maps
=
context
.
Attr
<
std
::
vector
<
std
::
string
>>
(
"output_name_mapping"
);
Attr
<
std
::
vector
<
std
::
string
>>
(
"output_name_mapping"
);
auto
params
=
context
.
Attr
<
std
::
vector
<
std
::
string
>>
(
"parameters"
);
std
::
unordered_set
<
std
::
string
>
parameters
;
for
(
const
auto
&
param
:
params
)
{
parameters
.
insert
(
param
);
}
// Convert input tensor from fluid to engine.
for
(
const
auto
&
x
:
context
.
Inputs
(
"Xs"
))
{
if
(
param
eters
.
count
(
x
))
continue
;
for
(
const
auto
&
x
:
Inputs
(
"Xs"
))
{
if
(
param
_names_
.
count
(
x
))
continue
;
// convert input and copy to TRT engine's buffer
auto
&
t
=
inference
::
analysis
::
GetFromScope
<
framework
::
LoDTensor
>
(
context
.
scope
(),
x
);
auto
&
t
=
inference
::
analysis
::
GetFromScope
<
framework
::
LoDTensor
>
(
scope
,
x
);
auto
t_shape
=
framework
::
vectorize
(
t
.
dims
());
runtime_batch
=
t_shape
[
0
];
if
(
platform
::
is_cpu_place
(
t
.
place
()))
{
engine
->
SetInputFromCPU
(
x
,
static_cast
<
const
void
*>
(
t
.
data
<
void
>
()),
engine
->
SetInputFromCPU
(
x
,
static_cast
<
const
void
*>
(
t
.
data
<
void
>
()),
t
.
memory_size
());
}
else
{
engine
->
SetInputFromGPU
(
x
,
static_cast
<
const
void
*>
(
t
.
data
<
void
>
()),
engine
->
SetInputFromGPU
(
x
,
static_cast
<
const
void
*>
(
t
.
data
<
void
>
()),
t
.
memory_size
());
}
}
PADDLE_ENFORCE_LE
(
runtime_batch
,
max_batch_size_
);
// Execute the engine.
PADDLE_ENFORCE_GT
(
FLAGS_tensorrt_engine_batch_size
,
0
);
engine
->
Execute
(
FLAGS_tensorrt_engine_batch_size
);
engine
->
Execute
(
runtime_batch
);
// Convert output tensor from engine to fluid
int
output_index
=
0
;
VLOG
(
4
)
<<
"TensorRT Engine Op Outputs:"
;
for
(
const
auto
&
y
:
context
.
Outputs
(
"Ys"
))
{
for
(
const
auto
&
y
:
Outputs
(
"Ys"
))
{
VLOG
(
4
)
<<
y
;
// convert output and copy to fluid.
nvinfer1
::
ITensor
*
trt_t
=
engine
->
GetITensor
(
output_maps
[
output_index
]);
nvinfer1
::
ITensor
*
trt_t
=
engine
->
GetITensor
(
output_maps
[
output_index
]);
auto
dims
=
trt_t
->
getDimensions
();
// Use the output ITensor's dims to reshape the Fluid Tensor.
// The ITensor doesn't contain the batch size dim.
std
::
vector
<
int
>
ddim
;
ddim
.
push_back
(
FLAGS_tensorrt_engine_batch_size
);
ddim
.
push_back
(
runtime_batch
);
for
(
int
i
=
0
;
i
<
dims
.
nbDims
;
i
++
)
{
ddim
.
push_back
(
dims
.
d
[
i
]);
}
auto
*
fluid_v
=
context
.
scope
()
.
FindVar
(
y
);
auto
*
fluid_v
=
scope
.
FindVar
(
y
);
PADDLE_ENFORCE_NOT_NULL
(
fluid_v
,
"no output variable called %s"
,
y
);
auto
*
fluid_t
=
fluid_v
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
fluid_t
=
fluid_v
->
GetMutable
<
framework
::
LoDTensor
>
();
fluid_t
->
Resize
(
framework
::
make_ddim
(
ddim
));
// TODO(Superjomn) find some way to determine which device to output the
// tensor.
// if (platform::is_cpu_place(fluid_t->place())) {
// TODO(Superjomn) change this float to dtype size.
auto
size
=
inference
::
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
)
*
FLAGS_tensorrt_engine_batch_size
;
auto
size
=
inference
::
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
)
*
runtime_batch
;
engine
->
GetOutputInGPU
(
output_maps
[
output_index
],
fluid_t
->
mutable_data
<
float
>
(
platform
::
CUDAPlace
(
boost
::
get
<
platform
::
CUDAPlace
>
(
context
.
GetPlace
()
).
device
)),
boost
::
get
<
platform
::
CUDAPlace
>
(
dev_place
).
device
)),
size
*
sizeof
(
float
));
output_index
+=
1
;
}
cudaStreamSynchronize
(
*
engine
->
stream
());
}
protected:
void
Prepare
(
const
framework
::
ExecutionContext
&
context
)
const
{
void
Prepare
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
,
TensorRTEngine
*
engine
)
const
{
VLOG
(
4
)
<<
"Prepare engine"
;
// Get the ProgramDesc and pass to convert.
framework
::
proto
::
BlockDesc
block_desc
;
block_desc
.
ParseFromString
(
context
.
Attr
<
std
::
string
>
(
"subgraph"
));
int
max_batch_size
=
context
.
Attr
<
int
>
(
"max_batch_size"
);
int
workspace_size
=
context
.
Attr
<
int
>
(
"workspace_size"
);
auto
params
=
context
.
Attr
<
std
::
vector
<
std
::
string
>>
(
"parameters"
);
std
::
unordered_set
<
std
::
string
>
parameters
;
for
(
const
auto
&
param
:
params
)
{
parameters
.
insert
(
param
);
}
block_desc
.
ParseFromString
(
Attr
<
std
::
string
>
(
"subgraph"
));
std
::
vector
<
std
::
string
>
output_maps
=
context
.
Attr
<
std
::
vector
<
std
::
string
>>
(
"output_name_mapping"
);
// TODO(Superjomn) replace this with a different stream
auto
*
engine
=
Singleton
<
TRT_EngineManager
>::
Global
().
Create
(
max_batch_size
,
workspace_size
,
nullptr
/*engine hold its own stream*/
,
context
.
Attr
<
std
::
string
>
(
"engine_uniq_key"
),
boost
::
get
<
platform
::
CUDAPlace
>
(
context
.
GetPlace
()).
device
);
Attr
<
std
::
vector
<
std
::
string
>>
(
"output_name_mapping"
);
engine
->
InitNetwork
();
...
...
@@ -192,39 +181,33 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
VLOG
(
4
)
<<
"parsed var size "
<<
block
.
AllVars
().
size
();
// Add inputs
VLOG
(
4
)
<<
"declare inputs"
;
for
(
auto
&
input
:
context
.
Inputs
(
"Xs"
))
{
if
(
param
eters
.
count
(
input
))
continue
;
for
(
auto
&
input
:
Inputs
(
"Xs"
))
{
if
(
param
_names_
.
count
(
input
))
continue
;
VLOG
(
4
)
<<
"declare input "
<<
input
;
auto
*
var
=
block
.
FindVar
(
input
);
auto
&
t
=
inference
::
analysis
::
GetFromScope
<
framework
::
LoDTensor
>
(
scope
,
input
);
auto
t_shape
=
framework
::
vectorize
(
t
.
dims
());
auto
*
var
=
block
.
FindVar
(
input
);
// TensorRT engine need to create parameters. The parameter's description
// should be set in
PADDLE_ENFORCE
(
var
,
"no variable called %s"
,
input
);
PADDLE_ENFORCE_EQ
(
var
->
GetType
(),
FluidDT
::
VarType_Type_LOD_TENSOR
,
"TensorRT engine only takes LoDTensor as input"
);
auto
shape
=
var
->
GetShape
();
// For the special batch_size placeholder -1, drop it and pass the real
// shape of data.
// TODO(Superjomn) fix this with batch broadcast, or it can't handle
// variational batch size.
if
(
shape
[
0
]
==
-
1
)
{
shape
[
0
]
=
FLAGS_tensorrt_engine_batch_size
;
}
engine
->
DeclareInput
(
input
,
FluidDataType2TRT
(
var
->
Proto
()
->
type
().
lod_tensor
().
tensor
().
data_type
()),
Vec2TRT_Dims
(
shape
));
Vec2TRT_Dims
(
t_
shape
));
}
inference
::
Singleton
<
inference
::
tensorrt
::
OpConverter
>::
Global
()
.
ConvertBlock
(
block_desc
,
param
eters
,
context
.
scope
()
,
engine
);
.
ConvertBlock
(
block_desc
,
param
_names_
,
scope
,
engine
);
// Add outputs
for
(
auto
&
output
:
output_maps
)
{
if
(
!
engine
->
HasDeclared
(
output
))
{
for
(
auto
&
output
:
output_maps
)
{
engine
->
DeclareOutput
(
output
);
}
}
engine
->
FreezeNetwork
();
}
};
...
...
paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
浏览文件 @
5c7a8aee
...
...
@@ -24,8 +24,7 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
USE_CUDA_ONLY_OP
(
tensorrt_engine
);
USE_NO_KERNEL_OP
(
tensorrt_engine
);
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/transpose_mkldnn_op.cc
0 → 100644
浏览文件 @
5c7a8aee
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
framework
::
DataLayout
;
template
<
typename
T
>
class
TransposeMKLDNNOpKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
paddle
::
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_ENFORCE
(
paddle
::
platform
::
is_cpu_place
(
ctx
.
GetPlace
()),
"It must use CPUPlace."
);
const
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
PADDLE_ENFORCE
(
is_test
==
true
,
"ConvTransposeMKLDNN works only for inference!. Set is_test = True"
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
paddle
::
platform
::
MKLDNNDeviceContext
>();
const
auto
&
mkldnn_engine
=
dev_ctx
.
GetEngine
();
std
::
vector
<
int
>
axis
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"axis"
);
int
ndims
=
axis
.
size
();
auto
*
input
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
*
output
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
const
T
*
input_data
=
input
->
data
<
T
>
();
if
(
ndims
==
1
)
{
output
->
ShareDataWith
(
*
input
);
return
;
}
std
::
vector
<
int
>
nchw_axis
(
ndims
,
0
);
for
(
size_t
i
=
0
;
i
<
nchw_axis
.
size
();
++
i
)
{
nchw_axis
[
i
]
=
i
;
}
std
::
vector
<
int
>
nchw_tz
=
paddle
::
framework
::
vectorize2int
(
input
->
dims
());
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
auto
src_md
=
input
->
format
()
!=
mkldnn
::
memory
::
format
::
nchw
?
platform
::
MKLDNNMemDesc
(
nchw_tz
,
platform
::
MKLDNNGetDataType
<
T
>
(),
input
->
format
())
:
Axis2MemoryDesc
(
nchw_tz
,
nchw_axis
);
this
->
TransposeKernel
(
ctx
.
GetPlace
(),
Axis2MemoryDesc
(
nchw_tz
,
axis
),
src_md
,
output
,
input_data
,
nchw_tz
,
mkldnn_engine
);
}
protected:
mkldnn
::
memory
::
desc
Axis2MemoryDesc
(
std
::
vector
<
int
>&
nchw_tz
,
std
::
vector
<
int
>&
axis
)
const
{
mkldnn_memory_desc_t
mem_fmt
;
mem_fmt
.
primitive_kind
=
mkldnn_memory
;
mem_fmt
.
ndims
=
axis
.
size
();
for
(
unsigned
int
i
=
0
;
i
<
nchw_tz
.
size
();
++
i
)
{
mem_fmt
.
dims
[
i
]
=
nchw_tz
[
i
];
// logical dimensions (nchw format,
// regardless physical layout)
}
mem_fmt
.
data_type
=
mkldnn_f32
;
mem_fmt
.
format
=
mkldnn_blocked
;
unsigned
int
total_stride
=
1
;
for
(
int
i
=
nchw_tz
.
size
()
-
1
;
i
>=
0
;
--
i
)
{
mem_fmt
.
layout_desc
.
blocking
.
padding_dims
[
i
]
=
nchw_tz
[
i
];
// logical dimensions (nchw format, regardless physical
// layout)
mem_fmt
.
layout_desc
.
blocking
.
block_dims
[
i
]
=
1
;
mem_fmt
.
layout_desc
.
blocking
.
offset_padding_to_data
[
i
]
=
0
;
// no offset
mem_fmt
.
layout_desc
.
blocking
.
strides
[
0
][
axis
[
i
]]
=
total_stride
;
mem_fmt
.
layout_desc
.
blocking
.
strides
[
1
][
axis
[
i
]]
=
1
;
total_stride
*=
nchw_tz
[
axis
[
i
]];
}
mem_fmt
.
layout_desc
.
blocking
.
offset_padding
=
0
;
// no initial offset
return
mem_fmt
;
}
void
TransposeKernel
(
platform
::
Place
place
,
mkldnn
::
memory
::
desc
md_o
,
mkldnn
::
memory
::
desc
md_i
,
Tensor
*
output
,
const
T
*
data_i
,
std
::
vector
<
int
>&
nchw_dims
,
const
mkldnn
::
engine
&
eng
)
const
{
// Make Memory primitive descriptors
auto
mpd_o
=
mkldnn
::
memory
::
primitive_desc
(
md_o
,
eng
);
auto
mpd_i
=
mkldnn
::
memory
::
primitive_desc
(
md_i
,
eng
);
auto
data_o
=
output
->
mutable_data
<
T
>
(
place
,
paddle
::
memory
::
Allocator
::
kDefault
,
mpd_o
.
get_size
());
auto
src
=
mkldnn
::
memory
(
mpd_i
,
(
T
*
)(
data_i
));
auto
dst
=
mkldnn
::
memory
(
mpd_o
,
data_o
);
auto
r
=
mkldnn
::
reorder
(
src
,
dst
);
mkldnn
::
stream
(
mkldnn
::
stream
::
kind
::
eager
).
submit
({
r
}).
wait
();
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_KERNEL
(
transpose2
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
ops
::
TransposeMKLDNNOpKernel
<
float
>
);
REGISTER_OP_KERNEL
(
transpose
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
ops
::
TransposeMKLDNNOpKernel
<
float
>
);
paddle/fluid/operators/transpose_op.cc
浏览文件 @
5c7a8aee
...
...
@@ -16,6 +16,10 @@ limitations under the License. */
#include <string>
#include <vector>
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace
paddle
{
namespace
operators
{
...
...
@@ -53,11 +57,32 @@ class TransposeOp : public framework::OperatorWithKernel {
}
ctx
->
SetOutputDim
(
"Out"
,
out_dims
);
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
framework
::
LibraryType
library_
{
framework
::
LibraryType
::
kPlain
};
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
framework
::
DataLayout
layout_
=
framework
::
StringToDataLayout
(
data_format
);
#ifdef PADDLE_WITH_MKLDNN
if
(
library_
==
framework
::
LibraryType
::
kPlain
&&
platform
::
CanMKLDNNBeUsed
(
ctx
))
{
library_
=
framework
::
LibraryType
::
kMKLDNN
;
layout_
=
framework
::
DataLayout
::
kMKLDNN
;
}
#endif
return
framework
::
OpKernelType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
(),
ctx
.
GetPlace
(),
layout_
,
library_
);
}
};
class
TransposeOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddAttr
<
bool
>
(
"is_test"
,
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true."
)
.
SetDefault
(
false
);
AddInput
(
"X"
,
"(Tensor) The input tensor, tensors with rank up to 6 are supported."
);
...
...
@@ -67,6 +92,16 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
"(vector<int>) A list of values, and the size of the list should be "
"the same with the input tensor rank. This operator permutes the input "
"tensor's axes according to the values given."
);
AddAttr
<
bool
>
(
"use_mkldnn"
,
"(bool, default false) Only used in mkldnn kernel"
)
.
SetDefault
(
false
);
AddAttr
<
std
::
string
>
(
"data_format"
,
"(string, default NCHW) Only used in "
"An optional string from:
\"
NHWC
\"
,
\"
NCHW
\"
. "
"Defaults to
\"
NHWC
\"
. Specify the data format of the output data, "
"the input will be transformed automatically. "
)
.
SetDefault
(
"AnyLayout"
);
AddComment
(
R"DOC(
Transpose Operator.
...
...
@@ -144,8 +179,18 @@ class Transpose2Op : public TransposeOp {
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"X"
)
->
type
(),
ctx
.
device_context
());
framework
::
LibraryType
library_
{
framework
::
LibraryType
::
kPlain
};
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
framework
::
DataLayout
layout_
=
framework
::
StringToDataLayout
(
data_format
);
#ifdef PADDLE_WITH_MKLDNN
if
(
library_
==
framework
::
LibraryType
::
kPlain
&&
platform
::
CanMKLDNNBeUsed
(
ctx
))
{
library_
=
framework
::
LibraryType
::
kMKLDNN
;
layout_
=
framework
::
DataLayout
::
kMKLDNN
;
}
#endif
return
framework
::
OpKernelType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
(),
ctx
.
GetPlace
(),
layout_
,
library_
);
}
};
...
...
paddle/fluid/platform/dynload/CMakeLists.txt
浏览文件 @
5c7a8aee
...
...
@@ -16,9 +16,7 @@ if (CUPTI_FOUND)
list
(
APPEND CUDA_SRCS cupti.cc
)
endif
(
CUPTI_FOUND
)
nv_library
(
dynload_cuda SRCS
${
CUDA_SRCS
}
DEPS dynamic_loader
)
if
(
NOT WIN32
)
cc_library
(
dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc
)
endif
(
NOT WIN32
)
if
(
WITH_MKLML
)
cc_library
(
dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml
)
endif
()
...
...
paddle/fluid/platform/dynload/cudnn.h
浏览文件 @
5c7a8aee
...
...
@@ -34,7 +34,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) ->
decltype(__name(args...)) {
\
auto operator()(Args... args) ->
DECLARE_TYPE(__name, args...) {
\
using cudnn_func = decltype(&::__name); \
std::call_once(cudnn_dso_flag, []() { \
cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
...
...
paddle/fluid/platform/dynload/dynamic_loader.cc
浏览文件 @
5c7a8aee
...
...
@@ -201,6 +201,8 @@ void* GetCurandDsoHandle() {
void
*
GetWarpCTCDsoHandle
()
{
#if defined(__APPLE__) || defined(__OSX__)
return
GetDsoHandleFromSearchPath
(
FLAGS_warpctc_dir
,
"libwarpctc.dylib"
);
#elif defined(_WIN32)
return
GetDsoHandleFromSearchPath
(
FLAGS_warpctc_dir
,
"warpctc.dll"
);
#else
return
GetDsoHandleFromSearchPath
(
FLAGS_warpctc_dir
,
"libwarpctc.so"
);
#endif
...
...
paddle/fluid/platform/dynload/dynamic_loader.h
浏览文件 @
5c7a8aee
...
...
@@ -18,6 +18,12 @@ namespace paddle {
namespace
platform
{
namespace
dynload
{
#ifndef _WIN32
#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__))
#else
#define DECLARE_TYPE(__name, ...) decltype(auto)
#endif
void
*
GetCublasDsoHandle
();
void
*
GetCUDNNDsoHandle
();
void
*
GetCUPTIDsoHandle
();
...
...
paddle/fluid/platform/dynload/mklml.h
浏览文件 @
5c7a8aee
...
...
@@ -34,7 +34,7 @@ extern void* mklml_dso_handle;
#define DYNAMIC_LOAD_MKLML_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) ->
decltype(__name(args...)) {
\
auto operator()(Args... args) ->
DECLARE_TYPE(__name, args...) {
\
using mklmlFunc = decltype(&::__name); \
std::call_once(mklml_dso_flag, []() { \
mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
...
...
@@ -82,6 +82,8 @@ extern void* mklml_dso_handle;
__macro(vdSqr); \
__macro(vsPowx); \
__macro(vdPowx); \
__macro(vsInv); \
__macro(vdInv); \
__macro(MKL_Set_Num_Threads)
MKLML_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_MKLML_WRAP
);
...
...
paddle/fluid/platform/dynload/tensorrt.h
浏览文件 @
5c7a8aee
...
...
@@ -33,7 +33,7 @@ extern void* tensorrt_dso_handle;
#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) ->
decltype(__name(args...)) {
\
auto operator()(Args... args) ->
DECLARE_TYPE(__name, args...) {
\
using tensorrt_func = decltype(__name(args...)) (*)(Args...); \
std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = \
...
...
paddle/fluid/platform/dynload/warpctc.h
浏览文件 @
5c7a8aee
...
...
@@ -34,7 +34,7 @@ extern void* warpctc_dso_handle;
#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) ->
decltype(__name(args...)) {
\
auto operator()(Args... args) ->
DECLARE_TYPE(__name, args...) {
\
using warpctcFunc = decltype(&::__name); \
std::call_once(warpctc_dso_flag, []() { \
warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
...
...
paddle/fluid/platform/port.h
浏览文件 @
5c7a8aee
...
...
@@ -55,7 +55,6 @@ static void *dlsym(void *handle, const char *symbol_name) {
static
void
*
dlopen
(
const
char
*
filename
,
int
flag
)
{
std
::
string
file_name
(
filename
);
file_name
.
replace
(
0
,
file_name
.
size
()
-
1
,
'/'
,
'\\'
);
HMODULE
hModule
=
LoadLibrary
(
file_name
.
c_str
());
if
(
!
hModule
)
{
throw
std
::
runtime_error
(
file_name
+
" not found."
);
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
5c7a8aee
...
...
@@ -960,6 +960,14 @@ All parameter, weight, gradient are variables in Paddle.
R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
to fuse elementwise_add_op and activation_op,
it may make the execution faster. Default False)DOC"
)
.
def_property
(
"memory_optimize"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
memory_optimize_
;
},
[](
BuildStrategy
&
self
,
bool
b
)
{
self
.
memory_optimize_
=
b
;
})
.
def_property
(
"memory_early_delete"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
memory_early_delete_
;
},
[](
BuildStrategy
&
self
,
bool
b
)
{
self
.
memory_early_delete_
=
b
;
})
.
def
(
"_finalize_strategy_and_create_passes"
,
[](
BuildStrategy
&
self
)
->
std
::
shared_ptr
<
ir
::
PassBuilder
>
{
return
self
.
CreatePassesFromStrategy
(
true
);
...
...
python/paddle/fluid/__init__.py
浏览文件 @
5c7a8aee
...
...
@@ -102,6 +102,13 @@ def __bootstrap__():
import
sys
import
os
import
platform
if
os
.
name
==
'nt'
:
third_lib_path
=
os
.
path
.
abspath
(
os
.
path
.
dirname
(
__file__
))
+
os
.
sep
+
'..'
+
os
.
sep
+
'libs'
os
.
environ
[
'path'
]
+=
';'
+
third_lib_path
sys
.
path
.
append
(
third_lib_path
)
from
.
import
core
in_test
=
'unittest'
in
sys
.
modules
...
...
@@ -128,13 +135,12 @@ def __bootstrap__():
'free_idle_memory'
,
'paddle_num_threads'
,
"dist_threadpool_size"
,
'eager_delete_tensor_gb'
,
'fast_eager_deletion_mode'
,
'allocator_strategy'
,
'reader_queue_speed_test_mode'
,
'print_sub_graph_dir'
,
'pe_profile_fname'
'print_sub_graph_dir'
,
'pe_profile_fname'
,
'warpctc_dir'
]
if
'Darwin'
not
in
sysstr
:
read_env_flags
.
append
(
'use_pinned_memory'
)
if
os
.
name
!=
'nt'
:
read_env_flags
.
append
(
'warpctc_dir'
)
read_env_flags
.
append
(
'cpu_deterministic'
)
if
core
.
is_compiled_with_dist
():
...
...
@@ -150,7 +156,7 @@ def __bootstrap__():
read_env_flags
+=
[
'fraction_of_gpu_memory_to_use'
,
'cudnn_deterministic'
,
'enable_cublas_tensor_op_math'
,
'conv_workspace_size_limit'
,
'cudnn_exhaustive_search'
,
'selected_gpus'
'cudnn_exhaustive_search'
,
'
memory_optimize_debug'
,
'
selected_gpus'
]
core
.
init_gflags
([
sys
.
argv
[
0
]]
+
...
...
python/paddle/fluid/backward.py
浏览文件 @
5c7a8aee
...
...
@@ -249,69 +249,6 @@ def serialize_op_decs(op_desc):
return
proto
.
__str__
()
def
_callback_lookup_
(
op
):
"""
Only used in _append_backward_ops_
Build and returns a callback function for certain op. For example
parallel_do: AllReduce
:param op:
:return: callback function
"""
if
op
.
type
==
'parallel_do'
and
op
.
attr
(
'use_nccl'
):
all_vars
=
op
.
block
.
vars
param_names
=
set
(
op
.
input
(
'parameters'
))
param_names
=
[
name
for
name
in
param_names
if
all_vars
[
name
].
stop_gradient
is
False
]
param_grad_names
=
[
n
+
"@GRAD"
for
n
in
param_names
]
class
ParallelDoCallBack
(
object
):
def
__init__
(
self
,
param_grad_names
,
parallel_scopes_name
):
self
.
has_inserted_nccl_init
=
False
self
.
param_grad_names
=
param_grad_names
self
.
parallel_scopes_name
=
parallel_scopes_name
def
__call__
(
self
,
block
,
context
):
if
not
self
.
has_inserted_nccl_init
:
op_desc
=
_create_op_desc_
(
"ncclInit"
,
{
"parallel_scopes"
:
self
.
parallel_scopes_name
},
{
"Communicator"
:
[
'nccl_com__do_not_change_'
]},
{})
block
.
program
.
global_block
().
desc
.
append_op
().
copy_from
(
op_desc
)
self
.
has_inserted_nccl_init
=
True
current_op_desc
=
context
[
"__current_op_desc__"
]
for
o_param
in
current_op_desc
.
output_names
():
for
o_argu
in
current_op_desc
.
output
(
o_param
):
if
o_argu
in
self
.
param_grad_names
:
allreduce_out_name
=
o_argu
+
"__nccl_all_reduce__"
op_desc
=
_create_op_desc_
(
"ncclReduce"
,
{
"X"
:
[
o_argu
],
"Communicator"
:
[
'nccl_com__do_not_change_'
]
},
{
"Out"
:
[
allreduce_out_name
]},
{
"reduction"
:
"ncclSum"
,
"root"
:
0
},
)
block
.
desc
.
append_op
().
copy_from
(
op_desc
)
op_desc
=
_create_op_desc_
(
"assign"
,
{
"X"
:
[
allreduce_out_name
]},
{
"Out"
:
[
o_argu
]},
{})
block
.
desc
.
append_op
().
copy_from
(
op_desc
)
return
ParallelDoCallBack
(
param_grad_names
,
op
.
output
(
"parallel_scopes"
))
else
:
return
None
def
_append_backward_ops_
(
block
,
ops
,
target_block
,
...
...
@@ -349,15 +286,6 @@ def _append_backward_ops_(block,
sub_block
=
program
.
block
(
op
.
_block_attr_id
(
"sub_block"
))
grad_sub_block
=
program
.
_create_block
()
grad_sub_block
.
_set_forward_block_idx
(
sub_block
.
idx
)
cb
=
_callback_lookup_
(
op
)
if
cb
is
not
None
:
if
callbacks
is
None
:
new_callbacks
=
[
cb
]
else
:
new_callbacks
=
callbacks
+
[
_callback_lookup_
(
op
)]
_append_backward_ops_
(
sub_block
,
sub_block
.
ops
,
grad_sub_block
,
no_grad_dict
,
grad_to_var
,
new_callbacks
)
else
:
_append_backward_ops_
(
sub_block
,
sub_block
.
ops
,
grad_sub_block
,
no_grad_dict
,
grad_to_var
,
callbacks
)
...
...
@@ -424,9 +352,6 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
# infer_shape and infer_type
op_desc
.
infer_var_type
(
block
.
desc
)
op_desc
.
infer_shape
(
block
.
desc
)
# ncclInit dones't need to set data_type
if
op_desc
.
type
()
==
'ncclInit'
:
continue
for
arg
in
op_desc
.
output_arg_names
():
if
arg
in
new_vars
:
_infer_var_data_type_
(
arg
,
block
)
...
...
python/paddle/fluid/framework.py
浏览文件 @
5c7a8aee
...
...
@@ -16,6 +16,7 @@ from __future__ import print_function
import
collections
import
contextlib
import
os
import
re
import
six
import
sys
...
...
@@ -27,6 +28,13 @@ from .proto import framework_pb2
try
:
from
.
import
core
except
ImportError
as
e
:
if
os
.
name
==
'nt'
:
raise
ImportError
(
"""NOTE: You may need to run
\"
set PATH=c:\python27\lib:%PATH%
\"
if you encounters
\"
mkldnn.dll not found
\"
errors. If you have python
installed in other directory, replace
\"
c:\python27\lib" with your own
directory. The original error is:
\n
"""
+
cpt
.
get_exception_message
(
e
))
else
:
raise
ImportError
(
"""NOTE: You may need to run
\"
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
\"
if you encounters
\"
libmkldnn.so not found
\"
errors. If you have python
...
...
@@ -563,8 +571,8 @@ class Operator(object):
OP_WITHOUT_KERNEL_SET
=
{
'feed'
,
'fetch'
,
'save'
,
'load'
,
'recurrent'
,
'go'
,
'rnn_memory_helper_grad'
,
'conditional_block'
,
'while'
,
'send'
,
'recv'
,
'listen_and_serv'
,
'
parallel_do'
,
'save_combine'
,
'load_combine
'
,
'
ncclInit'
,
'select'
,
'
checkpoint_notify'
,
'gen_nccl_id'
'listen_and_serv'
,
'
save_combine'
,
'load_combine'
,
'ncclInit'
,
'select
'
,
'checkpoint_notify'
,
'gen_nccl_id'
}
def
__init__
(
self
,
...
...
python/paddle/fluid/layers/control_flow.py
浏览文件 @
5c7a8aee
...
...
@@ -226,156 +226,6 @@ class BlockGuard(object):
return
True
class
ParallelDo
(
object
):
"""
ParallelDo is used to represent multi-thread data parallel processing.
Its vanilla implementation can be shown as the following (:math:`|` means
single thread and :math:`||||` means multiple threads)
.. code-block:: text
In the forward pass
| Split input onto different devices
| Copy parameter onto different devices
|||| Compute forward pass in parallel
| Merge output from different devices
In the backward pass
| Split output@grad onto different devices
|||| Compute backward pass in parallel
| accumulate param@grad from different devices to the first device
| Merge input@grad from different devices
| Copy param@grad to the place of parallel_do_op
Examples:
.. code-block:: python
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# ParallelDo version & Single-thread version
if thread_num > 1:
places = fluid.layers.get_places(thread_num)
pd = fluid.layers.control_flow.ParallelDo(places)
with pd.do():
images = pd.read_input(images)
label = pd.read_input(label)
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
pd.write_output(avg_cost)
avg_cost = pd()
avg_cost = fluid.layers.mean(avg_cost)
else:
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
.. warning::
It will be soon deprecated, please use ParallelExecutor instead.
"""
def
__init__
(
self
,
places
,
use_nccl
=
False
,
name
=
None
):
warnings
.
warn
(
"API ParallelDo is deprecated since 0.15.0. Please use ParallelExecutor instead."
,
Warning
)
self
.
helper
=
LayerHelper
(
"parallel_do"
,
name
=
name
)
self
.
inputs
=
[]
self
.
places
=
places
self
.
outputs
=
[]
self
.
status
=
StaticRNN
.
BEFORE_RNN_BLOCK
self
.
use_nccl
=
use_nccl
def
do
(
self
):
return
BlockGuardWithCompletion
(
self
)
def
parent_block
(
self
):
prog
=
self
.
helper
.
main_program
parent_idx
=
prog
.
current_block
().
parent_idx
assert
parent_idx
>=
0
parent_block
=
prog
.
block
(
parent_idx
)
return
parent_block
def
__call__
(
self
,
*
args
,
**
kwargs
):
if
self
.
status
!=
StaticRNN
.
AFTER_RNN_BLOCK
:
raise
ValueError
(
"RNN output can only be retrieved after rnn block"
)
if
len
(
self
.
outputs
)
==
0
:
raise
ValueError
(
"RNN has no output"
)
elif
len
(
self
.
outputs
)
==
1
:
return
self
.
outputs
[
0
]
else
:
return
self
.
outputs
def
read_input
(
self
,
var
):
self
.
inputs
.
append
(
var
)
return
var
def
write_output
(
self
,
var
):
self
.
outputs
.
append
(
var
)
def
get_parameters
(
self
):
main_program
=
self
.
helper
.
main_program
current_block
=
main_program
.
current_block
()
parent_block
=
self
.
parent_block
()
local_inputs
=
set
()
params
=
list
()
for
var
in
self
.
inputs
:
local_inputs
.
add
(
var
.
name
)
for
op
in
current_block
.
ops
:
for
iname
in
op
.
input_names
:
for
in_var_name
in
op
.
input
(
iname
):
if
in_var_name
not
in
local_inputs
:
params
.
append
(
in_var_name
)
for
oname
in
op
.
output_names
:
for
out_var_name
in
op
.
output
(
oname
):
local_inputs
.
add
(
out_var_name
)
params
=
list
(
set
(
params
))
return
[
parent_block
.
var
(
name
)
for
name
in
params
]
def
_complete_op
(
self
):
main_program
=
self
.
helper
.
main_program
current_block
=
main_program
.
current_block
()
parent_block
=
self
.
parent_block
()
step_scope
=
parent_block
.
create_var
(
type
=
core
.
VarDesc
.
VarType
.
STEP_SCOPES
)
self
.
outputs
=
[
parent_block
.
create_var
(
name
=
o
.
name
,
shape
=
o
.
shape
,
dtype
=
o
.
dtype
,
lod_level
=
o
.
lod_level
,
persistable
=
o
.
persistable
,
stop_gradient
=
o
.
stop_gradient
)
for
o
in
self
.
outputs
]
inputs
=
[
parent_block
.
var
(
i
.
name
)
for
i
in
self
.
inputs
]
outputs
=
[
parent_block
.
var
(
o
.
name
)
for
o
in
self
.
outputs
]
parent_block
.
append_op
(
type
=
'parallel_do'
,
inputs
=
{
'inputs'
:
inputs
,
'parameters'
:
self
.
get_parameters
(),
'places'
:
self
.
places
},
outputs
=
{
'outputs'
:
outputs
,
'parallel_scopes'
:
[
step_scope
]},
attrs
=
{
'sub_block'
:
current_block
,
'use_nccl'
:
self
.
use_nccl
})
class
BlockGuardWithCompletion
(
BlockGuard
):
"""
BlockGuardWithCompletion class.
...
...
@@ -384,9 +234,8 @@ class BlockGuardWithCompletion(BlockGuard):
"""
def
__init__
(
self
,
rnn
):
if
not
(
isinstance
(
rnn
,
StaticRNN
)
or
isinstance
(
rnn
,
ParallelDo
)):
raise
TypeError
(
"BlockGuardWithCompletion takes a StaticRNN or ParallelDo"
)
if
not
isinstance
(
rnn
,
StaticRNN
):
raise
TypeError
(
"BlockGuardWithCompletion takes a StaticRNN"
)
super
(
BlockGuardWithCompletion
,
self
).
__init__
(
rnn
.
helper
.
main_program
)
self
.
rnn
=
rnn
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
5c7a8aee
...
...
@@ -177,6 +177,7 @@ __all__ = [
'lstm'
,
'psroi_pool'
,
'teacher_student_sigmoid_loss'
,
'huber_loss'
,
]
kIgnoreIndex
=
-
100
...
...
@@ -9419,3 +9420,51 @@ def psroi_pool(input,
'pooled_width'
:
pooled_width
})
return
out
def
huber_loss
(
input
,
label
,
delta
):
"""
Huber loss is a loss function used in robust.
Huber loss can evaluate the fitness of input to label.
Different from MSE loss, Huber loss is more robust for outliers.
When the difference between input and label is large than delta
.. math::
huber\_loss = delta * (label - input) - 0.5 * delta * delta
When the difference between input and label is less than delta
.. math::
huber\_loss = 0.5 * (label - input) * (label - input)
Args:
input (Variable): This input is a probability computed by the previous operator.
The first dimension is batch size, and the last dimension is 1.
label (Variable): The groud truth whose first dimension is batch size
and last dimension is 1.
delta (float): The parameter of huber loss, which controls
the range of outliers
Returns:
huber\_loss (Variable): The huber loss with shape [batch_size, 1].
Examples:
.. code-block:: python
predictions = fluid.layers.softmax(x)
loss = fluid.layers.huber_loss(input=predictions, label=label, 1.0)
"""
helper
=
LayerHelper
(
'huber_loss'
,
**
locals
())
residual
=
helper
.
create_variable_for_type_inference
(
dtype
=
helper
.
input_dtype
())
out
=
helper
.
create_variable_for_type_inference
(
dtype
=
helper
.
input_dtype
())
helper
.
append_op
(
type
=
'huber_loss'
,
inputs
=
{
'X'
:
input
,
'Y'
:
label
},
outputs
=
{
'Out'
:
out
,
'Residual'
:
residual
},
attrs
=
{
'delta'
:
delta
})
return
out
python/paddle/fluid/optimizer.py
浏览文件 @
5c7a8aee
...
...
@@ -641,9 +641,14 @@ class AdamOptimizer(Optimizer):
beta1 (float): The exponential decay rate for the 1st moment estimates.
beta2 (float): The exponential decay rate for the 2nd moment estimates.
epsilon (float): a small float value for numerical stability.
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer.
name: A optional name prefix.
lazy_mode(bool: false): The official Adam algorithm has two moving-average accumulators
the accumulators are updated at every step. Every element of the two moving-average is updated
in both dense mode and sparse mode. If the size of parameter is very large, then the update
may be very slow. The lazy mode only update the element that has gradient is the current
mini-batch, so it will be much more faster. But this mode has different semantics with the
original Adam algorithm and may lead to different result.
Examples:
.. code-block:: python
...
...
@@ -663,7 +668,8 @@ class AdamOptimizer(Optimizer):
beta2
=
0.999
,
epsilon
=
1e-8
,
regularization
=
None
,
name
=
None
):
name
=
None
,
lazy_mode
=
False
):
assert
learning_rate
is
not
None
assert
beta1
is
not
None
assert
beta2
is
not
None
...
...
@@ -676,6 +682,7 @@ class AdamOptimizer(Optimizer):
self
.
_beta1
=
beta1
self
.
_beta2
=
beta2
self
.
_epsilon
=
epsilon
self
.
_lazy_mode
=
lazy_mode
def
_create_accumulators
(
self
,
block
,
parameters
):
assert
isinstance
(
block
,
framework
.
Block
)
...
...
@@ -729,7 +736,8 @@ class AdamOptimizer(Optimizer):
attrs
=
{
"beta1"
:
self
.
_beta1
,
"beta2"
:
self
.
_beta2
,
"epsilon"
:
self
.
_epsilon
"epsilon"
:
self
.
_epsilon
,
"lazy_mode"
:
self
.
_lazy_mode
})
return
adam_op
...
...
python/paddle/fluid/tests/book/notest_understand_sentiment.py
浏览文件 @
5c7a8aee
...
...
@@ -15,7 +15,6 @@
from
__future__
import
print_function
from
paddle.fluid.layers.device
import
get_places
from
paddle.fluid.layers.control_flow
import
ParallelDo
import
unittest
import
paddle.fluid
as
fluid
import
paddle
...
...
@@ -147,22 +146,7 @@ def train(word_dict,
cost
,
acc_out
,
prediction
=
net_method
(
data
,
label
,
input_dim
=
dict_dim
,
class_dim
=
class_dim
)
else
:
places
=
get_places
()
pd
=
ParallelDo
(
places
)
with
pd
.
do
():
cost
,
acc
,
_
=
net_method
(
pd
.
read_input
(
data
),
pd
.
read_input
(
label
),
input_dim
=
dict_dim
,
class_dim
=
class_dim
)
pd
.
write_output
(
cost
)
pd
.
write_output
(
acc
)
cost
,
acc
=
pd
()
cost
=
fluid
.
layers
.
mean
(
cost
)
acc_out
=
fluid
.
layers
.
mean
(
acc
)
prediction
=
None
assert
save_dirname
is
None
raise
NotImplementedError
()
adagrad
=
fluid
.
optimizer
.
Adagrad
(
learning_rate
=
0.002
)
adagrad
.
minimize
(
cost
)
...
...
python/paddle/fluid/tests/book/test_recognize_digits.py
浏览文件 @
5c7a8aee
...
...
@@ -25,7 +25,6 @@ import numpy
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid.layers.device
import
get_places
from
paddle.fluid.layers.control_flow
import
ParallelDo
BATCH_SIZE
=
64
...
...
@@ -82,19 +81,7 @@ def train(nn_type,
net_conf
=
conv_net
if
parallel
:
places
=
get_places
()
pd
=
ParallelDo
(
places
)
with
pd
.
do
():
img_
=
pd
.
read_input
(
img
)
label_
=
pd
.
read_input
(
label
)
prediction
,
avg_loss
,
acc
=
net_conf
(
img_
,
label_
)
for
o
in
[
avg_loss
,
acc
]:
pd
.
write_output
(
o
)
avg_loss
,
acc
=
pd
()
# get mean loss and acc through every devices.
avg_loss
=
fluid
.
layers
.
mean
(
avg_loss
)
acc
=
fluid
.
layers
.
mean
(
acc
)
raise
NotImplementedError
()
else
:
prediction
,
avg_loss
,
acc
=
net_conf
(
img
,
label
)
...
...
@@ -273,7 +260,7 @@ def inject_all_tests():
for
use_cuda
in
(
False
,
True
):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
continue
for
parallel
in
(
False
,
True
):
for
parallel
in
(
False
,
):
for
nn_type
in
(
'mlp'
,
'conv'
):
inject_test_method
(
use_cuda
,
parallel
,
nn_type
,
True
)
...
...
python/paddle/fluid/tests/book/test_word2vec.py
浏览文件 @
5c7a8aee
...
...
@@ -17,7 +17,6 @@ from __future__ import print_function
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid.layers.device
import
get_places
from
paddle.fluid.layers.control_flow
import
ParallelDo
import
unittest
import
os
import
numpy
as
np
...
...
@@ -84,18 +83,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
avg_cost
,
predict_word
=
__network__
(
[
first_word
,
second_word
,
third_word
,
forth_word
,
next_word
])
else
:
places
=
get_places
()
pd
=
ParallelDo
(
places
)
with
pd
.
do
():
avg_cost
,
predict_word
=
__network__
(
list
(
map
(
pd
.
read_input
,
[
first_word
,
second_word
,
third_word
,
forth_word
,
next_word
])))
pd
.
write_output
(
avg_cost
)
avg_cost
=
fluid
.
layers
.
mean
(
pd
())
raise
NotImplementedError
()
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.001
)
sgd_optimizer
.
minimize
(
avg_cost
)
...
...
@@ -262,7 +250,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
for
use_cuda
in
(
False
,
True
):
for
is_sparse
in
(
False
,
True
):
for
is_parallel
in
(
False
,
True
):
for
is_parallel
in
(
False
,
):
inject_test_method
(
use_cuda
,
is_sparse
,
is_parallel
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
已删除
100644 → 0
浏览文件 @
b3cf476d
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
math
import
sys
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid.layers.device
import
get_places
from
paddle.fluid.layers.control_flow
import
ParallelDo
# need to fix random seed and training data to compare the loss
# value accurately calculated by the default and the memory optimization
# version.
fluid
.
default_startup_program
().
random_seed
=
111
x
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
13
],
dtype
=
'float32'
)
y
=
fluid
.
layers
.
data
(
name
=
'y'
,
shape
=
[
1
],
dtype
=
'float32'
)
device_type
=
'CPU'
use_nccl
=
False
place
=
fluid
.
CPUPlace
()
if
fluid
.
core
.
is_compiled_with_cuda
():
device_type
=
'CUDA'
use_nccl
=
False
place
=
fluid
.
CUDAPlace
(
0
)
places
=
get_places
(
device_count
=
0
,
device_type
=
device_type
)
pd
=
ParallelDo
(
places
,
use_nccl
=
use_nccl
)
with
pd
.
do
():
x_
=
pd
.
read_input
(
x
)
y_
=
pd
.
read_input
(
y
)
y_predict
=
fluid
.
layers
.
fc
(
input
=
x_
,
size
=
1
,
act
=
None
)
cost
=
fluid
.
layers
.
square_error_cost
(
input
=
y_predict
,
label
=
y_
)
avg_cost
=
fluid
.
layers
.
mean
(
x
=
cost
)
pd
.
write_output
(
avg_cost
)
cost
=
pd
()
avg_cost
=
fluid
.
layers
.
mean
(
x
=
cost
)
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.01
)
sgd_optimizer
.
minimize
(
avg_cost
)
fluid
.
memory_optimize
(
fluid
.
default_main_program
(),
print_log
=
True
)
# fluid.release_memory(fluid.default_main_program())
BATCH_SIZE
=
200
# fix the order of training data
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
uci_housing
.
train
(),
batch_size
=
BATCH_SIZE
,
drop_last
=
False
)
# train_reader = paddle.batch(
# paddle.reader.shuffle(
# paddle.dataset.uci_housing.train(), buf_size=500),
# batch_size=BATCH_SIZE)
feeder
=
fluid
.
DataFeeder
(
place
=
place
,
feed_list
=
[
x
,
y
])
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
fluid
.
default_startup_program
())
PASS_NUM
=
100
for
pass_id
in
range
(
PASS_NUM
):
for
data
in
train_reader
():
avg_loss_value
,
=
exe
.
run
(
fluid
.
default_main_program
(),
feed
=
feeder
.
feed
(
data
),
fetch_list
=
[
avg_cost
])
if
avg_loss_value
[
0
]
<
10.0
:
exit
(
0
)
# if avg cost less than 10.0, we think our code is good.
print
(
avg_loss_value
[
0
])
if
math
.
isnan
(
float
(
avg_loss_value
)):
sys
.
exit
(
"got NaN loss, training failed."
)
exit
(
1
)
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
浏览文件 @
5c7a8aee
...
...
@@ -39,6 +39,7 @@ class TestParallelExecutorBase(unittest.TestCase):
seed
=
None
,
use_parallel_executor
=
True
,
use_reduce
=
False
,
use_ir_memory_optimize
=
False
,
fuse_elewise_add_act_ops
=
False
,
optimizer
=
fluid
.
optimizer
.
Adam
,
use_fast_executor
=
False
,
...
...
@@ -82,6 +83,7 @@ class TestParallelExecutorBase(unittest.TestCase):
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
\
if
use_reduce
else
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
build_strategy
.
fuse_elewise_add_act_ops
=
fuse_elewise_add_act_ops
build_strategy
.
memory_optimize
=
use_ir_memory_optimize
build_strategy
.
enable_sequential_execution
=
enable_sequential_execution
if
use_cuda
and
core
.
is_compiled_with_cuda
():
build_strategy
.
remove_unnecessary_lock
=
True
...
...
python/paddle/fluid/tests/unittests/test_adam_op.py
浏览文件 @
5c7a8aee
...
...
@@ -194,7 +194,8 @@ def adam_step(inputs, attributes):
return
param_out
,
moment1_out
,
moment2_out
def
adam_step_sparse
(
inputs
,
attributes
,
height
,
rows
,
row_numel
,
np_grad
):
def
adam_step_sparse
(
inputs
,
attributes
,
height
,
rows
,
row_numel
,
np_grad
,
lazy_mode
):
'''
Simulate one step of the adam optimizer
:param inputs: dict of inputs
...
...
@@ -218,19 +219,30 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
moment2_out
=
np
.
zeros
(
shape
=
[
height
,
row_numel
])
param_out
=
np
.
zeros
(
shape
=
[
height
,
row_numel
])
for
idx
,
row_id
in
enumerate
(
rows
):
def
update_row
(
row_id
,
update_value
):
moment1_out
[
row_id
]
=
beta1
*
moment1
[
row_id
]
+
(
1
-
beta1
)
*
np_grad
[
idx
]
)
*
update_value
moment2_out
[
row_id
]
=
beta2
*
moment2
[
row_id
]
+
(
1
-
beta2
)
*
np
.
square
(
np_grad
[
idx
]
)
1
-
beta2
)
*
np
.
square
(
update_value
)
lr_t
=
lr
*
np
.
sqrt
(
1
-
beta2_pow
)
/
(
1
-
beta1_pow
)
param_out
[
row_id
]
=
param
[
row_id
]
-
lr_t
*
(
moment1_out
[
row_id
]
/
(
np
.
sqrt
(
moment2_out
[
row_id
])
+
epsilon
))
if
lazy_mode
:
for
idx
,
row_id
in
enumerate
(
rows
):
update_row
(
row_id
,
np_grad
[
idx
])
else
:
for
row_id
in
range
(
param_out
.
shape
[
0
]):
update_value
=
np
.
zeros
(
np_grad
[
0
].
shape
).
astype
(
"float32"
)
if
row_id
in
rows
:
update_value
=
np_grad
[
rows
.
index
(
row_id
)]
update_row
(
row_id
,
update_value
)
return
param_out
,
moment1_out
,
moment2_out
class
TestSparseAdamOp
(
unittest
.
TestCase
):
def
setup
(
self
,
scope
,
place
):
def
setup
(
self
,
scope
,
place
,
lazy_mode
):
beta1
=
0.78
beta2
=
0.836
epsilon
=
1e-4
...
...
@@ -248,6 +260,7 @@ class TestSparseAdamOp(unittest.TestCase):
'Beta2Pow'
:
np
.
array
([
beta2
**
10
]).
astype
(
"float32"
),
"LearningRate"
:
np
.
full
((
1
),
2.0
).
astype
(
"float32"
)
}
self
.
init_output
=
np
.
full
((
height
,
row_numel
),
0.0
).
astype
(
"float32"
)
self
.
attrs
=
{
'epsilon'
:
epsilon
,
'beta1'
:
beta1
,
'beta2'
:
beta2
}
grad_selected_rows
=
scope
.
var
(
'Grad'
).
get_selected_rows
()
...
...
@@ -262,19 +275,21 @@ class TestSparseAdamOp(unittest.TestCase):
self
.
sparse_inputs
=
[
"Grad"
]
param_out
,
mom1
,
mom2
=
adam_step_sparse
(
self
.
dense_inputs
,
self
.
attrs
,
height
,
rows
,
row_numel
,
np_array
)
param_out
,
mom1
,
mom2
=
adam_step_sparse
(
self
.
dense_inputs
,
self
.
attrs
,
height
,
rows
,
row_numel
,
np_array
,
lazy_mode
)
self
.
outputs
=
{
"ParamOut"
:
param_out
,
"Moment1Out"
:
mom1
,
"Moment2Out"
:
mom2
}
def
check_with_place
(
self
,
place
):
def
check_with_place
(
self
,
place
,
lazy_mode
):
scope
=
core
.
Scope
()
self
.
setup
(
scope
,
place
)
self
.
setup
(
scope
,
place
,
lazy_mode
)
op_args
=
dict
()
op_args
[
'lazy_mode'
]
=
lazy_mode
for
key
,
np_array
in
self
.
dense_inputs
.
items
():
var
=
scope
.
var
(
key
).
get_tensor
()
var
.
set
(
np_array
,
place
)
...
...
@@ -283,7 +298,7 @@ class TestSparseAdamOp(unittest.TestCase):
op_args
[
s
]
=
s
for
s
in
self
.
outputs
:
var
=
scope
.
var
(
s
).
get_tensor
()
var
.
set
(
self
.
outputs
[
s
]
,
place
)
var
.
set
(
self
.
init_output
,
place
)
op_args
[
s
]
=
s
for
k
in
self
.
attrs
:
op_args
[
k
]
=
self
.
attrs
[
k
]
...
...
@@ -297,20 +312,17 @@ class TestSparseAdamOp(unittest.TestCase):
actual
=
np
.
array
(
out_var
)
actual
=
actual
.
reshape
([
actual
.
size
])
np_array
=
np_array
.
reshape
([
np_array
.
size
])
for
idx
,
row_id
in
enumerate
(
self
.
rows
):
j
=
0
while
j
<
self
.
row_numel
:
pos
=
row_id
*
self
.
row_numel
+
j
self
.
assertLess
((
actual
[
pos
]
-
np_array
[
pos
])
/
actual
[
pos
],
0.00001
)
j
+=
1
def
test_sparse_sgd
(
self
):
for
i
in
range
(
np_array
.
size
):
self
.
assertLess
((
actual
[
i
]
-
np_array
[
i
]),
0.00001
)
def
test_sparse_adam
(
self
):
places
=
[
core
.
CPUPlace
()]
if
core
.
is_compiled_with_cuda
():
places
.
append
(
core
.
CUDAPlace
(
0
))
for
place
in
places
:
self
.
check_with_place
(
place
)
for
lazy_mode
in
(
True
,
False
):
self
.
check_with_place
(
place
,
lazy_mode
)
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
浏览文件 @
5c7a8aee
...
...
@@ -39,6 +39,7 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
label
=
fluid
.
layers
.
data
(
name
=
"label"
,
shape
=
[
1
],
dtype
=
"int64"
)
cost
=
network
(
data
,
label
,
len
(
word_dict
))
cost
.
persistable
=
True
optimizer
=
fluid
.
optimizer
.
Adagrad
(
learning_rate
=
0.2
)
optimizer
.
minimize
(
cost
)
...
...
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
0 → 100644
浏览文件 @
5c7a8aee
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
parallel_executor_test_base
import
TestParallelExecutorBase
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
import
numpy
as
np
import
paddle
import
paddle.dataset.mnist
as
mnist
import
unittest
import
os
MNIST_RECORDIO_FILE
=
"./mnist_test_pe.recordio"
def
_feed_data_helper
(
use_feed
):
if
use_feed
:
img
=
fluid
.
layers
.
data
(
name
=
'image'
,
shape
=
[
784
],
dtype
=
'float32'
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
else
:
reader
=
fluid
.
layers
.
open_files
(
filenames
=
[
MNIST_RECORDIO_FILE
],
shapes
=
[[
-
1
,
784
],
[
-
1
,
1
]],
lod_levels
=
[
0
,
0
],
dtypes
=
[
'float32'
,
'int64'
])
reader
=
fluid
.
layers
.
io
.
double_buffer
(
reader
)
img
,
label
=
fluid
.
layers
.
read_file
(
reader
)
return
img
,
label
def
simple_fc_net
(
use_feed
):
x
,
y
=
_feed_data_helper
(
use_feed
)
hidden_layer
=
4
for
_
in
range
(
hidden_layer
):
x
=
fluid
.
layers
.
fc
(
input
=
x
,
size
=
20
,
act
=
'relu'
)
y_predict
=
fluid
.
layers
.
fc
(
input
=
x
,
size
=
10
,
act
=
'softmax'
)
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
y_predict
,
label
=
y
)
avg_cost
=
fluid
.
layers
.
mean
(
cost
)
return
avg_cost
def
fc_with_inplace_net
(
use_feed
):
x
,
y
=
_feed_data_helper
(
use_feed
)
fc
=
fluid
.
layers
.
fc
(
input
=
x
,
size
=
20
,
act
=
'relu'
)
fc
=
fluid
.
layers
.
fc
(
input
=
fc
,
size
=
10
,
act
=
'relu'
)
reshape
=
fluid
.
layers
.
reshape
(
x
=
fc
,
shape
=
[
-
1
,
2
,
5
])
reshape
=
fluid
.
layers
.
reshape
(
x
=
reshape
,
shape
=
[
-
1
,
5
,
2
])
y_predict
=
fluid
.
layers
.
fc
(
input
=
reshape
,
size
=
10
,
act
=
'softmax'
)
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
y_predict
,
label
=
y
)
avg_cost
=
fluid
.
layers
.
mean
(
cost
)
return
avg_cost
class
TestMNIST
(
TestParallelExecutorBase
):
@
classmethod
def
setUpClass
(
cls
):
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
# Convert mnist to recordio file
with
fluid
.
program_guard
(
fluid
.
Program
(),
fluid
.
Program
()):
reader
=
paddle
.
batch
(
mnist
.
train
(),
batch_size
=
4
)
feeder
=
fluid
.
DataFeeder
(
feed_list
=
[
# order is image and label
fluid
.
layers
.
data
(
name
=
'image'
,
shape
=
[
784
]),
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
),
],
place
=
fluid
.
CPUPlace
())
fluid
.
recordio_writer
.
convert_reader_to_recordio_file
(
MNIST_RECORDIO_FILE
,
reader
,
feeder
)
def
_dummy_data
(
self
):
np
.
random
.
seed
(
5
)
img
=
np
.
random
.
random
(
size
=
[
32
,
784
]).
astype
(
np
.
float32
)
label
=
np
.
ones
(
shape
=
[
32
,
1
],
dtype
=
'int64'
)
return
img
,
label
def
_compare_ir_and_python_memory_optimize
(
self
,
model
,
use_cuda
):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
return
img
,
label
=
self
.
_dummy_data
()
first_loss0
,
last_loss0
=
self
.
check_network_convergence
(
model
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
use_cuda
=
use_cuda
,
memory_opt
=
False
,
use_ir_memory_optimize
=
False
)
first_loss1
,
last_loss1
=
self
.
check_network_convergence
(
model
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
use_cuda
=
use_cuda
,
memory_opt
=
False
,
use_ir_memory_optimize
=
True
)
for
loss
in
zip
(
first_loss0
,
first_loss1
):
self
.
assertAlmostEqual
(
loss
[
0
],
loss
[
1
],
delta
=
1e-6
)
for
loss
in
zip
(
last_loss0
,
last_loss1
):
self
.
assertAlmostEqual
(
loss
[
0
],
loss
[
1
],
delta
=
1e-6
)
def
test_simple_fc_net
(
self
):
self
.
_compare_ir_and_python_memory_optimize
(
simple_fc_net
,
False
)
self
.
_compare_ir_and_python_memory_optimize
(
simple_fc_net
,
True
)
def
test_fc_with_reshape_net
(
self
):
self
.
_compare_ir_and_python_memory_optimize
(
fc_with_inplace_net
,
False
)
self
.
_compare_ir_and_python_memory_optimize
(
fc_with_inplace_net
,
True
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_parallel_op.py
已删除
100644 → 0
浏览文件 @
b3cf476d
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
paddle.fluid
as
fluid
from
paddle.fluid.layers.device
import
get_places
from
paddle.fluid.layers.control_flow
import
ParallelDo
import
paddle.fluid.profiler
as
profiler
import
numpy
import
six
class
BaseParallelForTest
(
unittest
.
TestCase
):
def
run_test
(
self
,
callback
,
feed
,
fetch
):
"""
Run the unittest for parallel.for
Args:
callback(callable): A callable function returns a generator. There
are two yields in the generator function. The first yield
returns the data layers, and the second yield returns the loss.
The modified data variables will be sent back during the first
yield.
feed(dict): The executor feeding dictionary.
fetch(list|basestr): The fetch name lists.
Returns:
None
Raises:
AssertionError when the computation of cpu, parallel.for in cpu,
gpu, parallel.for in gpu are different.
"""
cpu
=
fluid
.
CPUPlace
()
result_cpu
=
self
.
_run_test_impl_
(
callback
=
callback
,
feed
=
feed
,
fetch
=
fetch
,
place
=
cpu
,
use_parallel
=
False
)
result_cpu_parallel
=
self
.
_run_test_impl_
(
callback
=
callback
,
feed
=
feed
,
fetch
=
fetch
,
place
=
cpu
,
use_parallel
=
True
)
if
fluid
.
core
.
is_compiled_with_cuda
():
gpu
=
fluid
.
CUDAPlace
(
0
)
result_gpu
=
self
.
_run_test_impl_
(
callback
=
callback
,
feed
=
feed
,
fetch
=
fetch
,
place
=
gpu
,
use_parallel
=
False
,
use_gpu
=
True
)
result_gpu_parallel
=
self
.
_run_test_impl_
(
callback
=
callback
,
feed
=
feed
,
fetch
=
fetch
,
place
=
gpu
,
use_parallel
=
True
,
use_gpu
=
True
)
result_gpu_nccl
=
self
.
_run_test_impl_
(
callback
=
callback
,
feed
=
feed
,
fetch
=
fetch
,
place
=
gpu
,
use_parallel
=
True
,
use_nccl
=
True
,
use_gpu
=
True
)
self
.
_assert_same_
(
fetch
,
result_cpu
,
result_cpu_parallel
,
result_gpu
,
result_gpu_parallel
,
result_gpu_nccl
)
else
:
self
.
_assert_same_
(
fetch
,
result_cpu
,
result_cpu_parallel
)
def
_run_test_impl_
(
self
,
callback
,
feed
,
fetch
,
place
,
use_parallel
=
False
,
use_nccl
=
False
,
use_gpu
=
False
):
"""
Run a single test, returns the fetch values
Args:
place(Place): the computation place.
use_parallel(bool): Whether use parallel.for or not.
Returns:
Fetched numpy arrays.
"""
if
isinstance
(
fetch
,
six
.
string_types
):
fetch
=
[
fetch
]
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
# Fix seed
main
.
random_seed
=
10
startup
.
random_seed
=
10
with
fluid
.
program_guard
(
main
,
startup
):
generator
=
callback
()
# Automatically insert parallel do if use_parallel = True
if
use_parallel
:
thread_num
=
fluid
.
core
.
get_cuda_device_count
(
)
if
use_gpu
else
8
places
=
get_places
(
thread_num
)
pd
=
ParallelDo
(
places
,
use_nccl
=
use_nccl
)
data
=
next
(
generator
)
if
isinstance
(
data
,
fluid
.
framework
.
Variable
):
data
=
[
data
]
with
pd
.
do
():
ins
=
list
(
map
(
pd
.
read_input
,
data
))
if
len
(
ins
)
==
1
:
ins
=
ins
[
0
]
loss
=
generator
.
send
(
ins
)
# patch input
pd
.
write_output
(
loss
)
loss
=
pd
()
else
:
data
=
next
(
generator
)
loss
=
generator
.
send
(
data
)
self
.
assertIsNotNone
(
loss
)
avg_loss
=
fluid
.
layers
.
mean
(
loss
)
fluid
.
backward
.
append_backward
(
loss
=
avg_loss
)
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
startup
)
if
use_gpu
:
profile_type
=
'GPU'
else
:
profile_type
=
'CPU'
with
profiler
.
profiler
(
profile_type
,
'total'
,
'/tmp/profiler'
):
return
exe
.
run
(
main
,
feed
=
feed
,
fetch_list
=
fetch
)
def
_assert_same_
(
self
,
fetch
,
*
args
):
"""
Assert the return values of `run_test` are same.
Args:
fetch: Fetch list. Used for print error message
*args: The fetch result lists of each situations.
Returns:
None
Raises:
AssertionError
"""
def
_impl_
(
a
,
b
,
fetch_id
,
item_id
):
item_str
=
[
'CPU'
,
'ParallelCPU'
,
'GPU'
,
'ParallelGPU'
,
'ParallelGPUNCCL'
]
flag
=
numpy
.
allclose
(
a
,
b
,
rtol
=
0.1
,
atol
=
1e-3
)
self
.
assertTrue
(
flag
,
"The {0} are different in {1}, {2} vs {3}"
.
format
(
fetch
[
fetch_id
],
item_str
[
item_id
],
a
,
b
))
for
i
,
items
in
enumerate
(
zip
(
*
args
)):
self
.
assertGreater
(
len
(
items
),
0
)
for
j
in
range
(
1
,
len
(
items
)):
_impl_
(
items
[
0
],
items
[
j
],
fetch_id
=
i
,
item_id
=
j
)
class
ParallelOpTest
(
BaseParallelForTest
):
@
staticmethod
def
__network__
():
x
=
fluid
.
layers
.
data
(
shape
=
[
784
],
dtype
=
'float32'
,
name
=
'img'
)
x
=
yield
x
hidden
=
fluid
.
layers
.
fc
(
input
=
x
,
size
=
200
,
param_attr
=
'fc1.w'
)
hidden
=
fluid
.
layers
.
batch_norm
(
input
=
hidden
)
loss
=
fluid
.
layers
.
mean
(
hidden
)
yield
loss
def
test_simple_fc
(
self
):
self
.
run_test
(
callback
=
self
.
__network__
,
feed
=
{
'img'
:
numpy
.
random
.
random
(
size
=
(
51
,
784
)).
astype
(
'float32'
)
},
fetch
=
[
'fc1.w@GRAD'
])
def
test_fc_with_tiny_data
(
self
):
self
.
run_test
(
callback
=
self
.
__network__
,
feed
=
{
'img'
:
numpy
.
random
.
random
(
size
=
(
1
,
784
)).
astype
(
'float32'
)},
fetch
=
[
'fc1.w@GRAD'
])
class
ParallelOpTestMultipleInput
(
BaseParallelForTest
):
@
staticmethod
def
__network__
():
x
=
fluid
.
layers
.
data
(
shape
=
[
784
],
dtype
=
'float32'
,
name
=
'img1'
,
stop_gradient
=
False
)
y
=
fluid
.
layers
.
data
(
shape
=
[
784
],
dtype
=
'float32'
,
name
=
'img2'
,
stop_gradient
=
False
)
yield
[
x
,
y
]
x
=
x
+
y
hidden1
=
fluid
.
layers
.
fc
(
input
=
x
,
size
=
200
,
param_attr
=
'fc1.w'
)
hidden2
=
fluid
.
layers
.
fc
(
input
=
hidden1
,
size
=
200
,
param_attr
=
'fc2.w'
)
hidden3
=
fluid
.
layers
.
fc
(
input
=
hidden2
,
size
=
200
,
param_attr
=
'fc3.w'
)
loss
=
fluid
.
layers
.
mean
(
hidden3
)
yield
loss
def
test_simple_fc
(
self
):
self
.
run_test
(
callback
=
self
.
__network__
,
feed
=
{
'img1'
:
numpy
.
random
.
random
(
size
=
(
51
,
784
)).
astype
(
'float32'
),
'img2'
:
numpy
.
random
.
random
(
size
=
(
51
,
784
)).
astype
(
'float32'
)
},
fetch
=
[
'fc1.w@GRAD'
,
'fc2.w@GRAD'
,
'fc3.w@GRAD'
])
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
0 → 100644
浏览文件 @
5c7a8aee
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
from
test_transpose_op
import
TestTransposeOp
class
TestTransposeMKLDNN
(
TestTransposeOp
):
def
init_op_type
(
self
):
self
.
op_type
=
"transpose2"
self
.
use_mkldnn
=
True
self
.
is_test
=
True
return
def
test_check_grad
(
self
):
return
def
test_check_grad_no_input
(
self
):
return
def
test_check_grad_no_filter
(
self
):
return
class
TestCase0MKLDNN
(
TestTransposeMKLDNN
):
def
initTestCase
(
self
):
self
.
shape
=
(
3
,
)
self
.
axis
=
(
0
,
)
class
TestCase1a
(
TestTransposeMKLDNN
):
def
initTestCase
(
self
):
self
.
shape
=
(
3
,
4
,
5
)
self
.
axis
=
(
0
,
2
,
1
)
class
TestCase1b
(
TestTransposeMKLDNN
):
def
initTestCase
(
self
):
self
.
shape
=
(
3
,
4
,
5
)
self
.
axis
=
(
2
,
1
,
0
)
class
TestCase2
(
TestTransposeMKLDNN
):
def
initTestCase
(
self
):
self
.
shape
=
(
2
,
3
,
4
,
5
)
self
.
axis
=
(
0
,
2
,
3
,
1
)
class
TestCase3
(
TestTransposeMKLDNN
):
def
initTestCase
(
self
):
self
.
shape
=
(
2
,
3
,
4
,
5
,
6
)
self
.
axis
=
(
4
,
2
,
3
,
1
,
0
)
class
TestCase4
(
TestTransposeMKLDNN
):
def
initTestCase
(
self
):
self
.
shape
=
(
2
,
3
,
4
,
5
,
6
,
1
)
self
.
axis
=
(
4
,
2
,
3
,
1
,
0
,
5
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_transpose_op.py
浏览文件 @
5c7a8aee
...
...
@@ -21,15 +21,24 @@ from op_test import OpTest
class
TestTransposeOp
(
OpTest
):
def
setUp
(
self
):
self
.
init_op_type
()
self
.
initTestCase
()
self
.
op_type
=
"transpose2"
self
.
inputs
=
{
'X'
:
np
.
random
.
random
(
self
.
shape
).
astype
(
"float32"
)}
self
.
attrs
=
{
'axis'
:
list
(
self
.
axis
)}
self
.
attrs
=
{
'axis'
:
list
(
self
.
axis
),
'use_mkldnn'
:
self
.
use_mkldnn
,
'is_test'
:
self
.
is_test
,
}
self
.
outputs
=
{
'XShape'
:
np
.
random
.
random
(
self
.
shape
).
astype
(
"float32"
),
'Out'
:
self
.
inputs
[
'X'
].
transpose
(
self
.
axis
)
}
def
init_op_type
(
self
):
self
.
op_type
=
"transpose2"
self
.
use_mkldnn
=
False
self
.
is_test
=
False
def
test_check_output
(
self
):
self
.
check_output
(
no_check_set
=
[
'XShape'
])
...
...
python/paddle/fluid/transpiler/memory_optimization_transpiler.py
浏览文件 @
5c7a8aee
...
...
@@ -35,14 +35,14 @@ dtype_to_size = {
}
SUB_BLOCK_OPS
=
[
"while"
,
"while_grad"
,
"parallel_do"
,
"parallel_do_grad"
,
"conditional_block"
,
"conditional_block_grad"
"while"
,
"while_grad"
,
"conditional_block"
,
"conditional_block_grad"
]
SUB_BLOCK_PAIR
=
[(
"while"
,
"while_grad"
),
(
"parallel_do"
,
"parallel_do_grad"
),
SUB_BLOCK_PAIR
=
[(
"while"
,
"while_grad"
),
(
"conditional_block"
,
"conditional_block_grad"
)]
PRINT_LOG
=
False
FLAGS_memory_optimize
=
""
class
OrderedSet
(
MutableSet
):
...
...
@@ -121,6 +121,7 @@ class ControlFlowGraph(object):
self
.
_defs
=
defaultdict
(
OrderedSet
)
self
.
_live_in
=
defaultdict
(
OrderedSet
)
self
.
_live_out
=
defaultdict
(
OrderedSet
)
self
.
_skip_opt
=
skip_opt
self
.
pool
=
[]
...
...
@@ -144,7 +145,6 @@ class ControlFlowGraph(object):
for
i
in
range
(
self
.
op_size
):
self
.
_uses
[
i
].
update
(
self
.
_ops
[
i
].
input_arg_names
())
self
.
_defs
[
i
].
update
(
self
.
_ops
[
i
].
output_arg_names
())
self
.
_live_in
[
i
]
=
self
.
_uses
[
i
]
def
_update_graph
(
self
,
old_name
,
new_name
,
begin_idx
=
0
):
for
i
in
range
(
begin_idx
,
self
.
op_size
):
...
...
@@ -177,20 +177,52 @@ class ControlFlowGraph(object):
worklist
.
append
(
d
)
def
_fill_pool
(
self
,
i
,
is_forward
):
def
comparator
(
x
,
cache
):
x_shape
=
x
[
1
]
cache_shape
=
cache
[
1
]
x_size
=
abs
(
reduce
(
lambda
x
,
y
:
x
*
y
,
x_shape
))
cache_size
=
abs
(
reduce
(
lambda
x
,
y
:
x
*
y
,
cache_shape
))
if
(
x_shape
[
0
]
==
-
1
and
cache_shape
[
0
]
==
-
1
)
or
\
(
x_shape
[
0
]
!=
-
1
and
cache_shape
[
0
]
!=
-
1
)
:
return
x_size
<=
cache_size
else
:
return
False
def
find_var_in_block
(
x
):
known_vars
=
set
()
for
op
in
self
.
_ops
:
known_vars
.
update
(
op
.
output_arg_names
())
return
x
in
known_vars
block_desc
=
self
.
_ops
[
i
].
block
()
in_diff
,
_
=
self
.
_get_diff
(
self
.
_live_in
[
i
],
self
.
_live_out
[
i
])
# NOTE: must sort the in_diff set for cases that get different cache var.
# FIXME(typhoonzero): maybe use a "sorted set" is better than this.
can_optimize
=
[
x
for
x
in
in_diff
x
for
x
in
sorted
(
in_diff
)
if
self
.
_check_var_validity
(
block_desc
,
x
,
is_forward
)
]
if
can_optimize
:
for
var_name
in
can_optimize
:
cache
=
(
var_name
,
self
.
_find_var
(
block_desc
,
var_name
,
is_forward
).
shape
())
if
cache
not
in
self
.
pool
:
self
.
pool
.
append
(
cache
)
if
cache
not
in
self
.
pool
and
find_var_in_block
(
var_name
):
i
=
0
while
i
<
len
(
self
.
pool
):
mycache
=
self
.
pool
[
i
]
mysize
=
mycache
[
1
][
0
]
cache_size
=
cache
[
1
][
0
]
if
(
mysize
==
-
1
and
cache_size
==
-
1
)
or
\
(
mysize
!=
-
1
and
cache_size
!=
-
1
):
if
comparator
(
mycache
,
cache
):
i
+=
1
else
:
break
elif
mysize
==
-
1
and
cache_size
!=
-
1
:
i
+=
1
elif
mysize
!=
-
1
and
cache_size
==
-
1
:
break
self
.
pool
.
insert
(
i
,
cache
)
def
_get_diff
(
self
,
a
,
b
):
u
=
a
&
b
...
...
@@ -229,7 +261,7 @@ class ControlFlowGraph(object):
def
_update_skip_opt_set
(
self
):
for
i
in
range
(
self
.
op_size
):
op
=
self
.
_ops
[
i
]
if
op
.
type
()
==
"fill_constant"
and
op
.
attr
(
"force_cpu"
)
==
True
:
if
op
.
has_attr
(
"force_cpu"
)
and
op
.
attr
(
"force_cpu"
)
==
True
:
self
.
_skip_opt
.
update
(
op
.
output_arg_names
())
def
release_memory
(
self
,
skip_opt_set
=
None
):
...
...
@@ -281,6 +313,7 @@ class ControlFlowGraph(object):
# update skip set to meet users' demand
if
skip_opt_set
:
self
.
_skip_opt
.
update
(
skip_opt_set
)
counter
=
0
for
i
in
range
(
self
.
op_size
):
op
=
self
.
_ops
[
i
]
if
op
.
type
()
in
SUB_BLOCK_OPS
:
...
...
@@ -301,6 +334,9 @@ class ControlFlowGraph(object):
# If x is both in uses and defs, it can not be optimized!
if
x
in
self
.
_uses
[
i
]:
continue
if
x
==
FLAGS_memory_optimize
:
print
(
"start match var "
,
x
,
" of op "
,
op
.
type
())
print
(
self
.
pool
)
for
index
,
cache_pair
in
enumerate
(
self
.
pool
):
cache_var
=
cache_pair
[
0
]
cache_shape
=
cache_pair
[
1
]
...
...
@@ -323,15 +359,13 @@ class ControlFlowGraph(object):
if
not
compare_shape
(
x_shape
,
cache_shape
,
level
):
continue
# TODO(qijun): dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
if
x_dtype
!=
cache_dtype
:
continue
if
PRINT_LOG
:
print
((
"Hit Cache !!!! cache pool index "
"is %d, var name is %s, "
"cached var name is %s, "
"var shape is %s "
)
%
(
index
,
x
,
cache_var
,
str
(
cache_shape
)))
print
(
(
"!!! %d, %s => %s, cache idx %d, pool size %d"
%
(
counter
,
x
+
str
(
x_shape
),
cache_var
+
str
(
cache_shape
),
index
,
len
(
self
.
pool
))))
counter
+=
1
self
.
pool
.
pop
(
index
)
# Rename the var to the cache var already with
# memory allocated in order to reuse the memory.
...
...
@@ -484,7 +518,10 @@ def memory_optimize(input_program,
if
level
!=
0
and
level
!=
1
:
raise
ValueError
(
"only support opt_level 0 or 1."
)
if
skip_opt_set
is
not
None
and
not
isinstance
(
skip_opt_set
,
set
):
if
skip_opt_set
is
not
None
:
if
isinstance
(
skip_opt_set
,
set
)
or
isinstance
(
skip_opt_set
,
list
):
skip_opt_set
=
set
(
skip_opt_set
)
else
:
raise
ValueError
(
"only support skip_opt_set as set."
)
global
PRINT_LOG
PRINT_LOG
=
print_log
...
...
python/setup.py.in
浏览文件 @
5c7a8aee
...
...
@@ -160,10 +160,11 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
# put all thirdparty libraries in paddle.libs
libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
if os.name != 'nt':
package_data['paddle.libs']= []
package_data['paddle.libs']=['libwarpctc' + ext_name]
shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
package_data['paddle.libs']= []
package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_LIB}', libs_path)
shutil.copy('${MKLML_IOMP_LIB}', libs_path)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录