Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
64ad051b
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
64ad051b
编写于
12月 03, 2018
作者:
S
sneaxiy
浏览文件
操作
浏览文件
下载
差异文件
merge develop
test=develop
上级
c47c451a
0e3048db
变更
118
隐藏空白更改
内联
并排
Showing
118 changed file
with
6828 addition
and
969 deletion
+6828
-969
cmake/external/ngraph.cmake
cmake/external/ngraph.cmake
+6
-12
cmake/inference_lib.cmake
cmake/inference_lib.cmake
+9
-0
paddle/fluid/API.spec
paddle/fluid/API.spec
+10
-2
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+13
-7
paddle/fluid/framework/async_executor.cc
paddle/fluid/framework/async_executor.cc
+138
-0
paddle/fluid/framework/async_executor.h
paddle/fluid/framework/async_executor.h
+58
-0
paddle/fluid/framework/data_feed.cc
paddle/fluid/framework/data_feed.cc
+386
-0
paddle/fluid/framework/data_feed.h
paddle/fluid/framework/data_feed.h
+269
-0
paddle/fluid/framework/data_feed.proto
paddle/fluid/framework/data_feed.proto
+30
-0
paddle/fluid/framework/data_feed_factory.cc
paddle/fluid/framework/data_feed_factory.cc
+64
-0
paddle/fluid/framework/data_feed_factory.h
paddle/fluid/framework/data_feed_factory.h
+29
-0
paddle/fluid/framework/data_feed_test.cc
paddle/fluid/framework/data_feed_test.cc
+337
-0
paddle/fluid/framework/details/multi_devices_graph_pass.cc
paddle/fluid/framework/details/multi_devices_graph_pass.cc
+1
-1
paddle/fluid/framework/details/op_registry.h
paddle/fluid/framework/details/op_registry.h
+17
-3
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
...id/framework/details/scope_buffered_ssa_graph_executor.cc
+1
-1
paddle/fluid/framework/executor.cc
paddle/fluid/framework/executor.cc
+1
-30
paddle/fluid/framework/executor.h
paddle/fluid/framework/executor.h
+0
-1
paddle/fluid/framework/executor_thread_worker.cc
paddle/fluid/framework/executor_thread_worker.cc
+223
-0
paddle/fluid/framework/executor_thread_worker.h
paddle/fluid/framework/executor_thread_worker.h
+88
-0
paddle/fluid/framework/ir/is_test_pass.cc
paddle/fluid/framework/ir/is_test_pass.cc
+1
-1
paddle/fluid/framework/ir/is_test_pass_tester.cc
paddle/fluid/framework/ir/is_test_pass_tester.cc
+2
-2
paddle/fluid/framework/ir/mkldnn_placement_pass.cc
paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+1
-1
paddle/fluid/framework/ir/node.cc
paddle/fluid/framework/ir/node.cc
+25
-1
paddle/fluid/framework/ir/node.h
paddle/fluid/framework/ir/node.h
+12
-0
paddle/fluid/framework/naive_executor.cc
paddle/fluid/framework/naive_executor.cc
+1
-32
paddle/fluid/framework/ngraph_bridge.cc
paddle/fluid/framework/ngraph_bridge.cc
+85
-3
paddle/fluid/framework/ngraph_bridge.h
paddle/fluid/framework/ngraph_bridge.h
+6
-8
paddle/fluid/framework/ngraph_operator.cc
paddle/fluid/framework/ngraph_operator.cc
+339
-8
paddle/fluid/framework/ngraph_operator.h
paddle/fluid/framework/ngraph_operator.h
+1
-6
paddle/fluid/framework/op_info.h
paddle/fluid/framework/op_info.h
+7
-0
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+6
-0
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+5
-0
paddle/fluid/framework/type_defs.h
paddle/fluid/framework/type_defs.h
+2
-0
paddle/fluid/framework/variable_helper.cc
paddle/fluid/framework/variable_helper.cc
+60
-0
paddle/fluid/framework/variable_helper.h
paddle/fluid/framework/variable_helper.h
+22
-0
paddle/fluid/inference/analysis/analysis_pass.h
paddle/fluid/inference/analysis/analysis_pass.h
+0
-2
paddle/fluid/inference/analysis/passes/CMakeLists.txt
paddle/fluid/inference/analysis/passes/CMakeLists.txt
+2
-1
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
...uid/inference/analysis/passes/ir_analysis_compose_pass.cc
+1
-0
paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
...le/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+1
-6
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
...ence/analysis/passes/ir_params_sync_among_devices_pass.cc
+74
-0
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
...rence/analysis/passes/ir_params_sync_among_devices_pass.h
+39
-0
paddle/fluid/inference/analysis/passes/passes.cc
paddle/fluid/inference/analysis/passes/passes.cc
+4
-0
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+8
-4
paddle/fluid/inference/api/api_impl.cc
paddle/fluid/inference/api/api_impl.cc
+3
-3
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+11
-1
paddle/fluid/inference/api/details/reset_tensor_array.cc
paddle/fluid/inference/api/details/reset_tensor_array.cc
+23
-0
paddle/fluid/inference/api/details/reset_tensor_array.h
paddle/fluid/inference/api/details/reset_tensor_array.h
+17
-0
paddle/fluid/inference/api/paddle_pass_builder.h
paddle/fluid/inference/api/paddle_pass_builder.h
+1
-5
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+8
-1
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+44
-29
paddle/fluid/inference/utils/benchmark.cc
paddle/fluid/inference/utils/benchmark.cc
+1
-1
paddle/fluid/inference/utils/benchmark.h
paddle/fluid/inference/utils/benchmark.h
+5
-3
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+7
-1
paddle/fluid/operators/batch_norm_mkldnn_op.cc
paddle/fluid/operators/batch_norm_mkldnn_op.cc
+11
-8
paddle/fluid/operators/batch_norm_op.cc
paddle/fluid/operators/batch_norm_op.cc
+126
-50
paddle/fluid/operators/batch_norm_op.cu
paddle/fluid/operators/batch_norm_op.cu
+172
-62
paddle/fluid/operators/conv_mkldnn_op.cc
paddle/fluid/operators/conv_mkldnn_op.cc
+9
-261
paddle/fluid/operators/conv_transpose_mkldnn_op.cc
paddle/fluid/operators/conv_transpose_mkldnn_op.cc
+299
-0
paddle/fluid/operators/conv_transpose_op.cc
paddle/fluid/operators/conv_transpose_op.cc
+28
-7
paddle/fluid/operators/cudnn_lstm_op.cc
paddle/fluid/operators/cudnn_lstm_op.cc
+218
-0
paddle/fluid/operators/cudnn_lstm_op.cu.cc
paddle/fluid/operators/cudnn_lstm_op.cu.cc
+485
-0
paddle/fluid/operators/detection/box_coder_op.h
paddle/fluid/operators/detection/box_coder_op.h
+6
-0
paddle/fluid/operators/distributed/CMakeLists.txt
paddle/fluid/operators/distributed/CMakeLists.txt
+18
-17
paddle/fluid/operators/distributed/grpc_client.cc
paddle/fluid/operators/distributed/grpc_client.cc
+5
-2
paddle/fluid/operators/distributed/grpc_client.h
paddle/fluid/operators/distributed/grpc_client.h
+1
-0
paddle/fluid/operators/distributed/grpc_serde.cc
paddle/fluid/operators/distributed/grpc_serde.cc
+5
-1
paddle/fluid/operators/distributed/grpc_serde.h
paddle/fluid/operators/distributed/grpc_serde.h
+2
-1
paddle/fluid/operators/distributed/grpc_serde_test.cc
paddle/fluid/operators/distributed/grpc_serde_test.cc
+2
-1
paddle/fluid/operators/distributed/grpc_server.cc
paddle/fluid/operators/distributed/grpc_server.cc
+2
-1
paddle/fluid/operators/distributed/grpc_variable_response.cc
paddle/fluid/operators/distributed/grpc_variable_response.cc
+14
-0
paddle/fluid/operators/distributed/parameter_prefetch.cc
paddle/fluid/operators/distributed/parameter_prefetch.cc
+255
-0
paddle/fluid/operators/distributed/parameter_prefetch.h
paddle/fluid/operators/distributed/parameter_prefetch.h
+34
-0
paddle/fluid/operators/distributed/request_handler.h
paddle/fluid/operators/distributed/request_handler.h
+2
-1
paddle/fluid/operators/distributed/request_handler_impl.cc
paddle/fluid/operators/distributed/request_handler_impl.cc
+23
-10
paddle/fluid/operators/distributed/request_handler_impl.h
paddle/fluid/operators/distributed/request_handler_impl.h
+32
-8
paddle/fluid/operators/distributed/rpc_client.h
paddle/fluid/operators/distributed/rpc_client.h
+1
-1
paddle/fluid/operators/distributed/send_recv.proto.in
paddle/fluid/operators/distributed/send_recv.proto.in
+1
-0
paddle/fluid/operators/distributed/variable_response.h
paddle/fluid/operators/distributed/variable_response.h
+1
-0
paddle/fluid/operators/lookup_table_op.cc
paddle/fluid/operators/lookup_table_op.cc
+19
-0
paddle/fluid/operators/lookup_table_op.cu
paddle/fluid/operators/lookup_table_op.cu
+47
-26
paddle/fluid/operators/lookup_table_op.h
paddle/fluid/operators/lookup_table_op.h
+63
-37
paddle/fluid/operators/metrics/auc_op.h
paddle/fluid/operators/metrics/auc_op.h
+7
-2
paddle/fluid/operators/pad2d_op.cc
paddle/fluid/operators/pad2d_op.cc
+75
-13
paddle/fluid/operators/pad2d_op.cu
paddle/fluid/operators/pad2d_op.cu
+36
-5
paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
...e/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+6
-0
paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
...le/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+65
-26
paddle/fluid/operators/softmax_mkldnn_op.cc
paddle/fluid/operators/softmax_mkldnn_op.cc
+1
-1
paddle/fluid/platform/assert.h
paddle/fluid/platform/assert.h
+10
-0
paddle/fluid/platform/dynload/cudnn.h
paddle/fluid/platform/dynload/cudnn.h
+17
-1
paddle/fluid/platform/mkldnn_helper.h
paddle/fluid/platform/mkldnn_helper.h
+0
-164
paddle/fluid/platform/mkldnn_reuse.h
paddle/fluid/platform/mkldnn_reuse.h
+458
-0
paddle/fluid/pybind/CMakeLists.txt
paddle/fluid/pybind/CMakeLists.txt
+2
-2
paddle/fluid/pybind/async_executor_py.cc
paddle/fluid/pybind/async_executor_py.cc
+53
-0
paddle/fluid/pybind/async_executor_py.h
paddle/fluid/pybind/async_executor_py.h
+28
-0
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+2
-0
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+20
-2
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+9
-1
python/paddle/fluid/async_executor.py
python/paddle/fluid/async_executor.py
+151
-0
python/paddle/fluid/data_feed_desc.py
python/paddle/fluid/data_feed_desc.py
+152
-0
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+1
-0
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+283
-21
python/paddle/fluid/tests/demo/async_executor.py
python/paddle/fluid/tests/demo/async_executor.py
+100
-0
python/paddle/fluid/tests/unittests/dist_ctr.py
python/paddle/fluid/tests/unittests/dist_ctr.py
+2
-0
python/paddle/fluid/tests/unittests/op_test.py
python/paddle/fluid/tests/unittests/op_test.py
+20
-2
python/paddle/fluid/tests/unittests/test_async_executor.py
python/paddle/fluid/tests/unittests/test_async_executor.py
+142
-0
python/paddle/fluid/tests/unittests/test_batch_norm_op.py
python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+110
-23
python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
.../fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
+77
-0
python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
.../paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+6
-1
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+41
-0
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+19
-1
python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
...ddle/fluid/tests/unittests/test_lookup_remote_table_op.py
+203
-0
python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
+192
-0
python/paddle/fluid/tests/unittests/test_pad2d_op.py
python/paddle/fluid/tests/unittests/test_pad2d_op.py
+26
-1
python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
...ts/unittests/test_sigmoid_cross_entropy_with_logits_op.py
+34
-0
python/paddle/fluid/tests/unittests/testsuite.py
python/paddle/fluid/tests/unittests/testsuite.py
+6
-1
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+73
-21
python/paddle/reader/tests/decorator_test.py
python/paddle/reader/tests/decorator_test.py
+2
-2
python/setup.py.in
python/setup.py.in
+14
-9
未找到文件。
cmake/external/ngraph.cmake
浏览文件 @
64ad051b
...
@@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH})
...
@@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH})
return
()
return
()
ENDIF
()
ENDIF
()
INCLUDE
(
GNUInstallDirs
)
INCLUDE
(
ExternalProject
)
INCLUDE
(
ExternalProject
)
SET
(
NGRAPH_PROJECT
"extern_ngraph"
)
SET
(
NGRAPH_PROJECT
"extern_ngraph"
)
...
@@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
...
@@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
SET
(
NGRAPH_SOURCES_DIR
${
THIRD_PARTY_PATH
}
/ngraph
)
SET
(
NGRAPH_SOURCES_DIR
${
THIRD_PARTY_PATH
}
/ngraph
)
SET
(
NGRAPH_INSTALL_DIR
${
THIRD_PARTY_PATH
}
/install/ngraph
)
SET
(
NGRAPH_INSTALL_DIR
${
THIRD_PARTY_PATH
}
/install/ngraph
)
SET
(
NGRAPH_INC_DIR
${
NGRAPH_INSTALL_DIR
}
/include
)
SET
(
NGRAPH_INC_DIR
${
NGRAPH_INSTALL_DIR
}
/include
)
SET
(
NGRAPH_LIB_DIR
${
NGRAPH_INSTALL_DIR
}
/
${
CMAKE_INSTALL_LIBDIR
}
)
SET
(
NGRAPH_SHARED_LIB_NAME libngraph.so.
${
NGRAPH_VERSION
}
)
SET
(
NGRAPH_SHARED_LIB_NAME libngraph.so.
${
NGRAPH_VERSION
}
)
SET
(
NGRAPH_CPU_LIB_NAME libcpu_backend.so
)
SET
(
NGRAPH_CPU_LIB_NAME libcpu_backend.so
)
SET
(
NGRAPH_TBB_LIB_NAME libtbb.so.2
)
SET
(
NGRAPH_TBB_LIB_NAME libtbb.so.2
)
SET
(
NGRAPH_GIT_REPO
"https://github.com/NervanaSystems/ngraph.git"
)
SET
(
NGRAPH_GIT_REPO
"https://github.com/NervanaSystems/ngraph.git"
)
SET
(
NGRAPH_SHARED_LIB
${
NGRAPH_LIB_DIR
}
/
${
NGRAPH_SHARED_LIB_NAME
}
)
SET
(
NGRAPH_CPU_LIB
${
NGRAPH_LIB_DIR
}
/
${
NGRAPH_CPU_LIB_NAME
}
)
SET
(
NGRAPH_TBB_LIB
${
NGRAPH_LIB_DIR
}
/
${
NGRAPH_TBB_LIB_NAME
}
)
ExternalProject_Add
(
ExternalProject_Add
(
${
NGRAPH_PROJECT
}
${
NGRAPH_PROJECT
}
...
@@ -63,18 +69,6 @@ ExternalProject_Add(
...
@@ -63,18 +69,6 @@ ExternalProject_Add(
CMAKE_ARGS -DMKLDNN_LIB_DIR=
${
MKLDNN_INSTALL_DIR
}
/lib
CMAKE_ARGS -DMKLDNN_LIB_DIR=
${
MKLDNN_INSTALL_DIR
}
/lib
)
)
if
(
UNIX AND NOT APPLE
)
include
(
GNUInstallDirs
)
SET
(
NGRAPH_LIB_DIR
${
NGRAPH_INSTALL_DIR
}
/
${
CMAKE_INSTALL_LIBDIR
}
)
else
()
SET
(
NGRAPH_LIB_DIR
${
NGRAPH_INSTALL_DIR
}
/lib
)
endif
()
MESSAGE
(
STATUS
"nGraph lib will be installed at:
${
NGRAPH_LIB_DIR
}
"
)
SET
(
NGRAPH_SHARED_LIB
${
NGRAPH_LIB_DIR
}
/
${
NGRAPH_SHARED_LIB_NAME
}
)
SET
(
NGRAPH_CPU_LIB
${
NGRAPH_LIB_DIR
}
/
${
NGRAPH_CPU_LIB_NAME
}
)
SET
(
NGRAPH_TBB_LIB
${
NGRAPH_LIB_DIR
}
/
${
NGRAPH_TBB_LIB_NAME
}
)
# Workaround for nGraph expecting mklml to be in mkldnn install directory.
# Workaround for nGraph expecting mklml to be in mkldnn install directory.
ExternalProject_Add_Step
(
ExternalProject_Add_Step
(
${
NGRAPH_PROJECT
}
${
NGRAPH_PROJECT
}
...
...
cmake/inference_lib.cmake
浏览文件 @
64ad051b
...
@@ -129,6 +129,15 @@ if (WITH_MKLDNN)
...
@@ -129,6 +129,15 @@ if (WITH_MKLDNN)
)
)
endif
()
endif
()
if
(
WITH_NGRAPH
)
set
(
dst_dir
"
${
FLUID_INSTALL_DIR
}
/third_party/install/ngraph"
)
copy
(
ngraph_lib
SRCS
${
NGRAPH_INC_DIR
}
${
NGRAPH_LIB_DIR
}
DSTS
${
dst_dir
}
${
dst_dir
}
DEPS ngraph
)
endif
()
if
(
NOT WIN32
)
if
(
NOT WIN32
)
if
(
NOT MOBILE_INFERENCE AND NOT RPI
)
if
(
NOT MOBILE_INFERENCE AND NOT RPI
)
set
(
dst_dir
"
${
FLUID_INSTALL_DIR
}
/third_party/install/snappy"
)
set
(
dst_dir
"
${
FLUID_INSTALL_DIR
}
/third_party/install/snappy"
)
...
...
paddle/fluid/API.spec
浏览文件 @
64ad051b
...
@@ -32,6 +32,13 @@ paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.c
...
@@ -32,6 +32,13 @@ paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.c
paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None
paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None
paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
...
@@ -69,7 +76,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name']
...
@@ -69,7 +76,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name']
paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'
], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, Non
e, False, False))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'
, 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, Fals
e, False, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
...
@@ -175,7 +182,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None,
...
@@ -175,7 +182,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None,
paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', '
name'], varargs=None, keywords=None, defaults=(None,
))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', '
ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None
))
paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
...
@@ -187,6 +194,7 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non
...
@@ -187,6 +194,7 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
...
...
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
64ad051b
...
@@ -34,6 +34,7 @@ add_subdirectory(ir)
...
@@ -34,6 +34,7 @@ add_subdirectory(ir)
add_subdirectory
(
details
)
add_subdirectory
(
details
)
# ddim lib
# ddim lib
proto_library
(
framework_proto SRCS framework.proto
)
proto_library
(
framework_proto SRCS framework.proto
)
proto_library
(
async_executor_proto SRCS data_feed.proto
)
cc_library
(
ddim SRCS ddim.cc DEPS eigen3 boost
)
cc_library
(
ddim SRCS ddim.cc DEPS eigen3 boost
)
cc_test
(
ddim_test SRCS ddim_test.cc DEPS ddim
)
cc_test
(
ddim_test SRCS ddim_test.cc DEPS ddim
)
...
@@ -126,8 +127,9 @@ cc_library(version SRCS version.cc)
...
@@ -126,8 +127,9 @@ cc_library(version SRCS version.cc)
cc_test
(
version_test SRCS version_test.cc DEPS version
)
cc_test
(
version_test SRCS version_test.cc DEPS version
)
cc_library
(
proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version
)
cc_library
(
proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version
)
cc_library
(
ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto
)
if
(
NOT WIN32
)
if
(
NOT WIN32
)
cc_library
(
ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph
)
cc_library
(
ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
cc_library
(
ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler
)
shape_inference data_transform lod_tensor profiler
)
endif
(
NOT WIN32
)
endif
(
NOT WIN32
)
...
@@ -135,7 +137,7 @@ endif(NOT WIN32)
...
@@ -135,7 +137,7 @@ endif(NOT WIN32)
cc_library
(
op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc
)
cc_library
(
op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc
)
nv_test
(
op_registry_test SRCS op_registry_test.cc DEPS op_registry
)
nv_test
(
op_registry_test SRCS op_registry_test.cc DEPS op_registry
)
py_proto_compile
(
framework_py_proto SRCS framework.proto
)
py_proto_compile
(
framework_py_proto SRCS framework.proto
data_feed.proto
)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
add_custom_target
(
framework_py_proto_init ALL COMMAND
${
CMAKE_COMMAND
}
-E touch __init__.py
)
add_custom_target
(
framework_py_proto_init ALL COMMAND
${
CMAKE_COMMAND
}
-E touch __init__.py
)
add_dependencies
(
framework_py_proto framework_py_proto_init
)
add_dependencies
(
framework_py_proto framework_py_proto_init
)
...
@@ -157,18 +159,19 @@ endif(NOT WIN32)
...
@@ -157,18 +159,19 @@ endif(NOT WIN32)
cc_library
(
lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor
)
cc_library
(
lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor
)
cc_library
(
feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog
)
cc_library
(
feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog
)
cc_library
(
variable_helper SRCS variable_helper.cc DEPS lod_tensor
)
cc_library
(
naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass
)
cc_library
(
naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass
variable_helper
)
if
(
WITH_DISTRIBUTE
)
if
(
WITH_DISTRIBUTE
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass
variable_helper
)
set
(
DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
set
(
DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
set_source_files_properties
(
executor.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
set_source_files_properties
(
executor.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
else
()
else
()
if
(
NOT WIN32
)
if
(
NOT WIN32
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator
variable_helper
)
else
(
NOT WIN32
)
else
(
NOT WIN32
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass
variable_helper
)
endif
(
NOT WIN32
)
endif
(
NOT WIN32
)
cc_test
(
test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op
)
cc_test
(
test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op
)
endif
()
endif
()
...
@@ -176,8 +179,11 @@ endif()
...
@@ -176,8 +179,11 @@ endif()
cc_library
(
parallel_executor SRCS parallel_executor.cc DEPS
cc_library
(
parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph build_strategy
graph build_strategy
fast_threaded_ssa_graph_executor
)
fast_threaded_ssa_graph_executor variable_helper
)
cc_library
(
async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper
)
cc_test
(
data_feed_test SRCS data_feed_test.cc DEPS async_executor
)
cc_library
(
prune SRCS prune.cc DEPS framework_proto
)
cc_library
(
prune SRCS prune.cc DEPS framework_proto
)
cc_test
(
prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context
)
cc_test
(
prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context
)
cc_test
(
var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
cc_test
(
var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
...
...
paddle/fluid/framework/async_executor.cc
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/async_executor.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/executor_thread_worker.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
namespace
paddle
{
namespace
framework
{
AsyncExecutor
::
AsyncExecutor
(
Scope
*
scope
,
const
platform
::
Place
&
place
)
:
root_scope_
(
scope
),
place_
(
place
)
{}
void
AsyncExecutor
::
CreateThreads
(
ExecutorThreadWorker
*
worker
,
const
ProgramDesc
&
main_program
,
const
std
::
shared_ptr
<
DataFeed
>&
reader
,
const
std
::
vector
<
std
::
string
>&
fetch_var_names
,
Scope
*
root_scope
,
const
int
thread_index
,
const
bool
debug
)
{
worker
->
SetThreadId
(
thread_index
);
worker
->
SetDebug
(
debug
);
worker
->
SetRootScope
(
root_scope
);
worker
->
CreateThreadResource
(
main_program
,
place_
);
worker
->
SetDataFeed
(
reader
);
worker
->
SetFetchVarNames
(
fetch_var_names
);
worker
->
BindingDataFeedMemory
();
}
void
PrepareReaders
(
std
::
vector
<
std
::
shared_ptr
<
DataFeed
>>&
readers
,
// NOLINT
const
int
thread_num
,
const
DataFeedDesc
&
data_feed_desc
,
const
std
::
vector
<
std
::
string
>&
filelist
)
{
readers
.
resize
(
thread_num
);
for
(
size_t
i
=
0
;
i
<
readers
.
size
();
++
i
)
{
readers
[
i
]
=
DataFeedFactory
::
CreateDataFeed
(
data_feed_desc
.
name
());
readers
[
i
]
->
Init
(
data_feed_desc
);
// set batch_size and queue_size here
}
readers
[
0
]
->
SetFileList
(
filelist
);
}
void
AsyncExecutor
::
RunFromFile
(
const
ProgramDesc
&
main_program
,
const
std
::
string
&
data_feed_desc_str
,
const
std
::
vector
<
std
::
string
>&
filelist
,
const
int
thread_num
,
const
std
::
vector
<
std
::
string
>&
fetch_var_names
,
const
bool
debug
)
{
std
::
vector
<
std
::
thread
>
threads
;
auto
&
block
=
main_program
.
Block
(
0
);
for
(
auto
var_name
:
fetch_var_names
)
{
auto
var_desc
=
block
.
FindVar
(
var_name
);
auto
shapes
=
var_desc
->
GetShape
();
PADDLE_ENFORCE
(
shapes
[
shapes
.
size
()
-
1
]
==
1
,
"var %s: Fetched var has wrong shape, "
"only variables with the last dimension size 1 supported"
,
var_name
);
}
DataFeedDesc
data_feed_desc
;
google
::
protobuf
::
TextFormat
::
ParseFromString
(
data_feed_desc_str
,
&
data_feed_desc
);
int
actual_thread_num
=
thread_num
;
int
file_cnt
=
filelist
.
size
();
PADDLE_ENFORCE
(
file_cnt
>
0
,
"File list cannot be empty"
);
if
(
actual_thread_num
>
file_cnt
)
{
VLOG
(
1
)
<<
"Thread num = "
<<
thread_num
<<
", file num = "
<<
file_cnt
<<
". Changing thread_num = "
<<
file_cnt
;
actual_thread_num
=
file_cnt
;
}
/*
readerDesc: protobuf description for reader initlization
argument: class_name, batch_size, use_slot, queue_size, buffer_size,
padding_index
reader:
1) each thread has a reader, reader will read input data and
put it into input queue
2) each reader has a Next() iterface, that can fetch an instance
from the input queue
*/
// todo: should be factory method for creating datafeed
std
::
vector
<
std
::
shared_ptr
<
DataFeed
>>
readers
;
PrepareReaders
(
readers
,
actual_thread_num
,
data_feed_desc
,
filelist
);
std
::
vector
<
std
::
shared_ptr
<
ExecutorThreadWorker
>>
workers
;
workers
.
resize
(
actual_thread_num
);
for
(
auto
&
worker
:
workers
)
{
worker
.
reset
(
new
ExecutorThreadWorker
);
}
// prepare thread resource here
for
(
int
thidx
=
0
;
thidx
<
actual_thread_num
;
++
thidx
)
{
CreateThreads
(
workers
[
thidx
].
get
(),
main_program
,
readers
[
thidx
],
fetch_var_names
,
root_scope_
,
thidx
,
debug
);
}
// start executing ops in multiple threads
for
(
int
thidx
=
0
;
thidx
<
actual_thread_num
;
++
thidx
)
{
threads
.
push_back
(
std
::
thread
(
&
ExecutorThreadWorker
::
TrainFiles
,
workers
[
thidx
].
get
()));
}
for
(
auto
&
th
:
threads
)
{
th
.
join
();
}
root_scope_
->
DropKids
();
return
;
}
}
// einit_modelnd namespace framework
}
// end namespace paddle
paddle/fluid/framework/async_executor.h
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include <string>
#include <thread> // NOLINT
#include <typeinfo>
#include <vector>
#include "paddle/fluid/framework/data_feed.pb.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/executor_thread_worker.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
namespace
paddle
{
namespace
framework
{
class
AsyncExecutor
{
public:
AsyncExecutor
(
Scope
*
scope
,
const
platform
::
Place
&
place
);
virtual
~
AsyncExecutor
()
{}
void
RunFromFile
(
const
ProgramDesc
&
main_program
,
const
std
::
string
&
data_feed_desc_str
,
const
std
::
vector
<
std
::
string
>&
filelist
,
const
int
thread_num
,
const
std
::
vector
<
std
::
string
>&
fetch_names
,
const
bool
debug
=
false
);
private:
void
CreateThreads
(
ExecutorThreadWorker
*
worker
,
const
ProgramDesc
&
main_program
,
const
std
::
shared_ptr
<
DataFeed
>&
reader
,
const
std
::
vector
<
std
::
string
>&
fetch_var_names
,
Scope
*
root_scope
,
const
int
thread_index
,
const
bool
debug
);
public:
Scope
*
root_scope_
;
platform
::
Place
place_
;
};
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/data_feed.cc
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
namespace
paddle
{
namespace
framework
{
std
::
vector
<
std
::
string
>
DataFeed
::
filelist_
;
size_t
DataFeed
::
file_idx_
;
std
::
mutex
DataFeed
::
mutex_for_pick_file_
;
bool
DataFeed
::
finish_set_filelist_
;
void
DataFeed
::
AddFeedVar
(
Variable
*
var
,
const
std
::
string
&
name
)
{
CheckInit
();
for
(
size_t
i
=
0
;
i
<
use_slots_
.
size
();
++
i
)
{
if
(
name
==
use_slots_
[
i
])
{
if
(
use_slots_is_dense_
[
i
])
{
feed_vec_
[
i
]
=
MixTensor
(
var
->
GetMutable
<
Tensor
>
());
}
else
{
feed_vec_
[
i
]
=
MixTensor
(
var
->
GetMutable
<
LoDTensor
>
());
}
}
}
}
bool
DataFeed
::
SetFileList
(
const
std
::
vector
<
std
::
string
>&
files
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_for_pick_file_
);
CheckInit
();
if
(
finish_set_filelist_
)
{
VLOG
(
3
)
<<
"info: you have set the filelist."
;
return
false
;
}
PADDLE_ENFORCE
(
files
.
size
(),
"You have set an empty filelist."
);
filelist_
.
assign
(
files
.
begin
(),
files
.
end
());
file_idx_
=
0
;
finish_set_filelist_
=
true
;
return
true
;
}
void
DataFeed
::
SetBatchSize
(
int
batch_size
)
{
PADDLE_ENFORCE
(
batch_size
>
0
,
"Illegal batch size: %d."
,
batch_size
);
default_batch_size_
=
batch_size
;
}
bool
DataFeed
::
PickOneFile
(
std
::
string
*
filename
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_for_pick_file_
);
if
(
file_idx_
==
filelist_
.
size
())
{
return
false
;
}
*
filename
=
filelist_
[
file_idx_
++
];
return
true
;
}
void
DataFeed
::
CheckInit
()
{
PADDLE_ENFORCE
(
finish_init_
,
"Initialization did not succeed."
);
}
void
DataFeed
::
CheckSetFileList
()
{
PADDLE_ENFORCE
(
finish_set_filelist_
,
"Set filelist did not succeed."
);
}
void
DataFeed
::
CheckStart
()
{
PADDLE_ENFORCE
(
finish_start_
,
"Datafeed has not started running yet."
);
}
template
<
typename
T
>
void
PrivateQueueDataFeed
<
T
>::
SetQueueSize
(
int
queue_size
)
{
PADDLE_ENFORCE
(
queue_size
>
0
,
"Illegal queue size: %d."
,
queue_size
);
queue_size_
=
queue_size
;
queue_
=
std
::
unique_ptr
<
paddle
::
operators
::
reader
::
BlockingQueue
<
T
>>
(
new
paddle
::
operators
::
reader
::
BlockingQueue
<
T
>
(
queue_size_
));
}
template
<
typename
T
>
bool
PrivateQueueDataFeed
<
T
>::
Start
()
{
CheckSetFileList
();
read_thread_
=
std
::
thread
(
&
PrivateQueueDataFeed
::
ReadThread
,
this
);
read_thread_
.
detach
();
finish_start_
=
true
;
return
true
;
}
template
<
typename
T
>
void
PrivateQueueDataFeed
<
T
>::
ReadThread
()
{
std
::
string
filename
;
while
(
PickOneFile
(
&
filename
))
{
file_
.
open
(
filename
.
c_str
());
// is_text_feed
PADDLE_ENFORCE
(
file_
.
good
(),
"Open file<%s> fail."
,
filename
.
c_str
());
T
instance
;
while
(
ParseOneInstance
(
&
instance
))
{
queue_
->
Send
(
instance
);
}
file_
.
close
();
}
queue_
->
Close
();
}
template
<
typename
T
>
int
PrivateQueueDataFeed
<
T
>::
Next
()
{
CheckStart
();
int
index
=
0
;
T
instance
;
T
ins_vec
;
while
(
index
<
default_batch_size_
)
{
if
(
!
queue_
->
Receive
(
&
instance
))
{
break
;
}
AddInstanceToInsVec
(
&
ins_vec
,
instance
,
index
++
);
}
batch_size_
=
index
;
if
(
batch_size_
!=
0
)
{
PutToFeedVec
(
ins_vec
);
}
return
batch_size_
;
}
#ifdef _WIN32
template
class
PrivateQueueDataFeed
<
std
::
vector
<
MultiSlotType
>
>
;
#endif
void
MultiSlotDataFeed
::
Init
(
const
paddle
::
framework
::
DataFeedDesc
&
data_feed_desc
)
{
finish_init_
=
false
;
finish_set_filelist_
=
false
;
finish_start_
=
false
;
PADDLE_ENFORCE
(
data_feed_desc
.
has_multi_slot_desc
(),
"Multi_slot_desc has not been set."
);
paddle
::
framework
::
MultiSlotDesc
multi_slot_desc
=
data_feed_desc
.
multi_slot_desc
();
SetBatchSize
(
data_feed_desc
.
batch_size
());
SetQueueSize
(
data_feed_desc
.
batch_size
());
size_t
all_slot_num
=
multi_slot_desc
.
slots_size
();
all_slots_
.
resize
(
all_slot_num
);
all_slots_type_
.
resize
(
all_slot_num
);
use_slots_index_
.
resize
(
all_slot_num
);
use_slots_
.
clear
();
use_slots_is_dense_
.
clear
();
for
(
size_t
i
=
0
;
i
<
all_slot_num
;
++
i
)
{
const
auto
&
slot
=
multi_slot_desc
.
slots
(
i
);
all_slots_
[
i
]
=
slot
.
name
();
all_slots_type_
[
i
]
=
slot
.
type
();
use_slots_index_
[
i
]
=
slot
.
is_used
()
?
use_slots_
.
size
()
:
-
1
;
if
(
slot
.
is_used
())
{
use_slots_
.
push_back
(
all_slots_
[
i
]);
use_slots_is_dense_
.
push_back
(
slot
.
is_dense
());
}
}
feed_vec_
.
resize
(
use_slots_
.
size
());
finish_init_
=
true
;
}
bool
MultiSlotDataFeed
::
CheckFile
(
const
char
*
filename
)
{
CheckInit
();
// get info of slots
std
::
ifstream
fin
(
filename
);
if
(
!
fin
.
good
())
{
VLOG
(
1
)
<<
"error: open file<"
<<
filename
<<
"> fail"
;
return
false
;
}
std
::
string
line
;
int
instance_cout
=
0
;
std
::
string
all_slots_alias
=
""
;
for
(
const
auto
&
alias
:
all_slots_
)
{
all_slots_alias
+=
alias
+
" "
;
}
std
::
string
use_slots_alias
=
""
;
for
(
const
auto
&
alias
:
use_slots_
)
{
use_slots_alias
+=
alias
+
" "
;
}
VLOG
(
3
)
<<
"total slots num: "
<<
all_slots_
.
size
();
VLOG
(
3
)
<<
"total slots alias: "
<<
all_slots_alias
;
VLOG
(
3
)
<<
"used slots num: "
<<
use_slots_
.
size
();
VLOG
(
3
)
<<
"used slots alias: "
<<
use_slots_alias
;
while
(
getline
(
fin
,
line
))
{
++
instance_cout
;
const
char
*
str
=
line
.
c_str
();
char
*
endptr
=
const_cast
<
char
*>
(
str
);
int
len
=
line
.
length
();
for
(
size_t
i
=
0
;
i
<
all_slots_
.
size
();
++
i
)
{
int
num
=
strtol
(
endptr
,
&
endptr
,
10
);
if
(
num
<
0
)
{
VLOG
(
0
)
<<
"error: the number of ids is a negative number: "
<<
num
;
VLOG
(
0
)
<<
"please check line<"
<<
instance_cout
<<
"> in file<"
<<
filename
<<
">"
;
return
false
;
}
else
if
(
num
==
0
)
{
VLOG
(
0
)
<<
"error: the number of ids can not be zero, you need "
"padding it in data generator; or if there is something wrong"
" with the data, please check if the data contains unresolvable "
"characters."
;
VLOG
(
0
)
<<
"please check line<"
<<
instance_cout
<<
"> in file<"
<<
filename
<<
">"
;
return
false
;
}
else
if
(
errno
==
ERANGE
||
num
>
INT_MAX
)
{
VLOG
(
0
)
<<
"error: the number of ids greater than INT_MAX"
;
VLOG
(
0
)
<<
"please check line<"
<<
instance_cout
<<
"> in file<"
<<
filename
<<
">"
;
return
false
;
}
if
(
all_slots_type_
[
i
]
==
"float"
)
{
for
(
int
i
=
0
;
i
<
num
;
++
i
)
{
strtof
(
endptr
,
&
endptr
);
if
(
errno
==
ERANGE
)
{
VLOG
(
0
)
<<
"error: the value is out of the range of "
"representable values for float"
;
VLOG
(
0
)
<<
"please check line<"
<<
instance_cout
<<
"> in file<"
<<
filename
<<
">"
;
return
false
;
}
if
(
i
+
1
!=
num
&&
endptr
-
str
==
len
)
{
VLOG
(
0
)
<<
"error: there is a wrong with the number of ids."
;
VLOG
(
0
)
<<
"please check line<"
<<
instance_cout
<<
"> in file<"
<<
filename
<<
">"
;
return
false
;
}
}
}
else
if
(
all_slots_type_
[
i
]
==
"uint64"
)
{
for
(
int
i
=
0
;
i
<
num
;
++
i
)
{
strtoull
(
endptr
,
&
endptr
,
10
);
if
(
errno
==
ERANGE
)
{
VLOG
(
0
)
<<
"error: the value is out of the range of "
"representable values for uint64_t"
;
VLOG
(
0
)
<<
"please check line<"
<<
instance_cout
<<
"> in file<"
<<
filename
<<
">"
;
return
false
;
}
if
(
i
+
1
!=
num
&&
endptr
-
str
==
len
)
{
VLOG
(
0
)
<<
"error: there is a wrong with the number of ids."
;
VLOG
(
0
)
<<
"please check line<"
<<
instance_cout
<<
"> in file<"
<<
filename
<<
">"
;
return
false
;
}
}
}
else
{
VLOG
(
0
)
<<
"error: this type<"
<<
all_slots_type_
[
i
]
<<
"> is not supported"
;
return
false
;
}
}
// It may be added '\t' character to the end of the output of reduce
// task when processes data by Hadoop(when the output of the reduce
// task of Hadoop has only one field, it will add a '\t' at the end
// of the line by default, and you can use this option to avoid it:
// `-D mapred.textoutputformat.ignoreseparator=true`), which does
// not affect the correctness of the data. Therefore, it should be
// judged that the data is not normal when the end of each line of
// data contains characters which are not spaces.
while
(
endptr
-
str
!=
len
)
{
if
(
!
isspace
(
*
(
endptr
++
)))
{
VLOG
(
0
)
<<
"error: there is some extra characters at the end of the line."
;
VLOG
(
0
)
<<
"please check line<"
<<
instance_cout
<<
"> in file<"
<<
filename
<<
">"
;
return
false
;
}
}
}
VLOG
(
3
)
<<
"instances cout: "
<<
instance_cout
;
VLOG
(
3
)
<<
"The file format is correct"
;
return
true
;
}
bool
MultiSlotDataFeed
::
ParseOneInstance
(
std
::
vector
<
MultiSlotType
>*
instance
)
{
std
::
string
line
;
if
(
getline
(
file_
,
line
))
{
int
use_slots_num
=
use_slots_
.
size
();
instance
->
resize
(
use_slots_num
);
// parse line
const
char
*
str
=
line
.
c_str
();
char
*
endptr
=
const_cast
<
char
*>
(
str
);
int
pos
=
0
;
for
(
size_t
i
=
0
;
i
<
use_slots_index_
.
size
();
++
i
)
{
int
idx
=
use_slots_index_
[
i
];
int
num
=
strtol
(
&
str
[
pos
],
&
endptr
,
10
);
PADDLE_ENFORCE
(
num
,
"The number of ids can not be zero, you need padding "
"it in data generator; or if there is something wrong with "
"the data, please check if the data contains unresolvable "
"characters.
\n
please check this error line: %s"
,
str
);
if
(
idx
!=
-
1
)
{
(
*
instance
)[
idx
].
Init
(
all_slots_type_
[
i
]);
if
((
*
instance
)[
idx
].
GetType
()[
0
]
==
'f'
)
{
// float
for
(
int
j
=
0
;
j
<
num
;
++
j
)
{
float
feasign
=
strtof
(
endptr
,
&
endptr
);
(
*
instance
)[
idx
].
AddValue
(
feasign
);
}
}
else
if
((
*
instance
)[
idx
].
GetType
()[
0
]
==
'u'
)
{
// uint64
for
(
int
j
=
0
;
j
<
num
;
++
j
)
{
uint64_t
feasign
=
(
uint64_t
)
strtoull
(
endptr
,
&
endptr
,
10
);
(
*
instance
)[
idx
].
AddValue
(
feasign
);
}
}
pos
=
endptr
-
str
;
}
else
{
for
(
int
j
=
0
;
j
<=
num
;
++
j
)
{
pos
=
line
.
find_first_of
(
' '
,
pos
+
1
);
}
}
}
}
else
{
return
false
;
}
return
true
;
}
void
MultiSlotDataFeed
::
AddInstanceToInsVec
(
std
::
vector
<
MultiSlotType
>*
ins_vec
,
const
std
::
vector
<
MultiSlotType
>&
instance
,
int
index
)
{
if
(
index
==
0
)
{
ins_vec
->
resize
(
instance
.
size
());
for
(
size_t
i
=
0
;
i
<
instance
.
size
();
++
i
)
{
(
*
ins_vec
)[
i
].
Init
(
instance
[
i
].
GetType
());
(
*
ins_vec
)[
i
].
InitOffset
();
}
}
for
(
size_t
i
=
0
;
i
<
instance
.
size
();
++
i
)
{
(
*
ins_vec
)[
i
].
AddIns
(
instance
[
i
]);
}
}
void
MultiSlotDataFeed
::
PutToFeedVec
(
const
std
::
vector
<
MultiSlotType
>&
ins_vec
)
{
for
(
size_t
i
=
0
;
i
<
use_slots_
.
size
();
++
i
)
{
const
auto
&
type
=
ins_vec
[
i
].
GetType
();
const
auto
&
offset
=
ins_vec
[
i
].
GetOffset
();
int
total_instance
=
static_cast
<
int
>
(
offset
.
back
());
if
(
type
[
0
]
==
'f'
)
{
// float
const
auto
&
feasign
=
ins_vec
[
i
].
GetFloatData
();
if
(
feed_vec_
[
i
].
IsDense
())
{
int
size_in_each_batch
=
total_instance
/
batch_size_
;
float
*
tensor_ptr
=
feed_vec_
[
i
].
GetTensor
()
->
mutable_data
<
float
>
(
{
batch_size_
,
size_in_each_batch
},
platform
::
CPUPlace
());
memcpy
(
tensor_ptr
,
&
feasign
[
0
],
total_instance
*
sizeof
(
float
));
}
else
{
float
*
tensor_ptr
=
feed_vec_
[
i
].
GetLoDTensor
()
->
mutable_data
<
float
>
(
{
total_instance
,
1
},
platform
::
CPUPlace
());
memcpy
(
tensor_ptr
,
&
feasign
[
0
],
total_instance
*
sizeof
(
float
));
LoD
data_lod
{
offset
};
feed_vec_
[
i
].
GetLoDTensor
()
->
set_lod
(
data_lod
);
}
}
else
if
(
type
[
0
]
==
'u'
)
{
// uint64
// no uint64_t type in paddlepaddle
const
auto
&
feasign
=
ins_vec
[
i
].
GetUint64Data
();
if
(
feed_vec_
[
i
].
IsDense
())
{
int
size_in_each_batch
=
total_instance
/
batch_size_
;
int64_t
*
tensor_ptr
=
feed_vec_
[
i
].
GetTensor
()
->
mutable_data
<
int64_t
>
(
{
batch_size_
,
size_in_each_batch
},
platform
::
CPUPlace
());
memcpy
(
tensor_ptr
,
&
feasign
[
0
],
total_instance
*
sizeof
(
int64_t
));
}
else
{
int64_t
*
tensor_ptr
=
feed_vec_
[
i
].
GetLoDTensor
()
->
mutable_data
<
int64_t
>
(
{
total_instance
,
1
},
platform
::
CPUPlace
());
memcpy
(
tensor_ptr
,
&
feasign
[
0
],
total_instance
*
sizeof
(
int64_t
));
LoD
data_lod
{
offset
};
feed_vec_
[
i
].
GetLoDTensor
()
->
set_lod
(
data_lod
);
}
}
}
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/data_feed.h
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <fstream>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/data_feed.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
namespace
paddle
{
namespace
framework
{
// Pack Tensor type and LoDTensor type into MixTensor type, in order
// to record either Tensor or LoDTensor information at the same time.
class
MixTensor
{
public:
MixTensor
()
{}
explicit
MixTensor
(
LoDTensor
*
lodtensor
)
{
is_dense_
=
false
;
lodtensor_
=
lodtensor
;
}
explicit
MixTensor
(
Tensor
*
tensor
)
{
is_dense_
=
true
;
tensor_
=
tensor
;
}
bool
IsDense
()
{
return
is_dense_
;
}
LoDTensor
*
GetLoDTensor
()
{
PADDLE_ENFORCE
(
!
is_dense_
,
"Let a dense var return a LoDTensor ptr."
);
return
lodtensor_
;
}
Tensor
*
GetTensor
()
{
PADDLE_ENFORCE
(
is_dense_
,
"Let a sparse var return a Tensor ptr."
);
return
tensor_
;
}
private:
bool
is_dense_
;
LoDTensor
*
lodtensor_
;
Tensor
*
tensor_
;
};
// DataFeed is the base virtual class for all ohther DataFeeds.
// It is used to read files and parse the data for subsequent trainer.
// Example:
// DataFeed* reader =
// paddle::framework::DataFeedFactory::CreateDataFeed(data_feed_name);
// reader->Init(data_feed_desc); // data_feed_desc is a protobuf object
// reader->SetFileList(filelist);
// const std::vector<std::string> & use_slot_alias =
// reader->GetUseSlotAlias();
// for (auto name: use_slot_alias){ // for binding memory
// reader->AddFeedVar(scope->Var(name), name);
// }
// reader->Start();
// while (reader->Next()) {
// // trainer do something
// }
class
DataFeed
{
public:
DataFeed
()
{}
virtual
~
DataFeed
()
{}
virtual
void
Init
(
const
paddle
::
framework
::
DataFeedDesc
&
data_feed_desc
)
=
0
;
virtual
bool
CheckFile
(
const
char
*
filename
)
{
PADDLE_THROW
(
"This function(CheckFile) is not implemented."
);
}
// Set filelist for DataFeed.
// Pay attention that it must init all readers before call this function.
// Otherwise, Init() function will init finish_set_filelist_ flag.
virtual
bool
SetFileList
(
const
std
::
vector
<
std
::
string
>&
files
);
virtual
bool
Start
()
=
0
;
// The trainer calls the Next() function, and the DataFeed will load a new
// batch to the feed_vec. The return value of this function is the batch
// size of the current batch.
virtual
int
Next
()
=
0
;
// Get all slots' alias which defined in protofile
virtual
const
std
::
vector
<
std
::
string
>&
GetAllSlotAlias
()
{
return
all_slots_
;
}
// Get used slots' alias which defined in protofile
virtual
const
std
::
vector
<
std
::
string
>&
GetUseSlotAlias
()
{
return
use_slots_
;
}
// This function is used for binding feed_vec memory
virtual
void
AddFeedVar
(
Variable
*
var
,
const
std
::
string
&
name
);
protected:
// The following three functions are used to check if it is executed in this
// order:
// Init() -> SetFileList() -> Start() -> Next()
virtual
void
CheckInit
();
virtual
void
CheckSetFileList
();
virtual
void
CheckStart
();
virtual
void
SetBatchSize
(
int
batch
);
// batch size will be set in Init() function
// This function is used to pick one file from the global filelist(thread
// safe).
virtual
bool
PickOneFile
(
std
::
string
*
filename
);
static
std
::
vector
<
std
::
string
>
filelist_
;
static
size_t
file_idx_
;
static
std
::
mutex
mutex_for_pick_file_
;
// the alias of used slots, and its order is determined by
// data_feed_desc(proto object)
std
::
vector
<
std
::
string
>
use_slots_
;
std
::
vector
<
bool
>
use_slots_is_dense_
;
// the alias of all slots, and its order is determined by data_feed_desc(proto
// object)
std
::
vector
<
std
::
string
>
all_slots_
;
std
::
vector
<
std
::
string
>
all_slots_type_
;
std
::
vector
<
int
>
use_slots_index_
;
// -1: not used; >=0: the index of use_slots_
// The data read by DataFeed will be stored here
std
::
vector
<
MixTensor
>
feed_vec_
;
// the batch size defined by user
int
default_batch_size_
;
// current batch size
int
batch_size_
;
bool
finish_init_
;
static
bool
finish_set_filelist_
;
bool
finish_start_
;
};
// PrivateQueueDataFeed is the base virtual class for ohther DataFeeds.
// It use a read-thread to read file and parse data to a private-queue
// (thread level), and get data from this queue when trainer call Next().
template
<
typename
T
>
class
PrivateQueueDataFeed
:
public
DataFeed
{
public:
PrivateQueueDataFeed
()
{}
virtual
~
PrivateQueueDataFeed
()
{}
virtual
void
Init
(
const
paddle
::
framework
::
DataFeedDesc
&
data_feed_desc
)
=
0
;
virtual
bool
Start
();
virtual
int
Next
();
protected:
// The thread implementation function for reading file and parse.
virtual
void
ReadThread
();
// This function is used to set private-queue size, and the most
// efficient when the queue size is close to the batch size.
virtual
void
SetQueueSize
(
int
queue_size
);
// The reading and parsing method called in the ReadThread.
virtual
bool
ParseOneInstance
(
T
*
instance
)
=
0
;
// This function is used to put instance to vec_ins
virtual
void
AddInstanceToInsVec
(
T
*
vec_ins
,
const
T
&
instance
,
int
index
)
=
0
;
// This function is used to put ins_vec to feed_vec
virtual
void
PutToFeedVec
(
const
T
&
ins_vec
)
=
0
;
// The thread for read files
std
::
thread
read_thread_
;
// using ifstream one line and one line parse is faster
// than using fread one buffer and one buffer parse.
// for a 601M real data:
// ifstream one line and one line parse: 6034 ms
// fread one buffer and one buffer parse: 7097 ms
std
::
ifstream
file_
;
size_t
queue_size_
;
// The queue for store parsed data
std
::
unique_ptr
<
paddle
::
operators
::
reader
::
BlockingQueue
<
T
>>
queue_
;
};
// This class define the data type of instance(ins_vec) in MultiSlotDataFeed
class
MultiSlotType
{
public:
MultiSlotType
()
{}
~
MultiSlotType
()
{}
void
Init
(
const
std
::
string
&
type
)
{
CheckType
(
type
);
if
(
type_
[
0
]
==
'f'
)
{
float_feasign_
.
clear
();
}
else
if
(
type_
[
0
]
==
'u'
)
{
uint64_feasign_
.
clear
();
}
type_
=
type
;
}
void
InitOffset
()
{
offset_
.
resize
(
1
);
// LoDTensor' lod is counted from 0, the size of lod
// is one size larger than the size of data.
offset_
[
0
]
=
0
;
}
const
std
::
vector
<
size_t
>&
GetOffset
()
const
{
return
offset_
;
}
void
AddValue
(
const
float
v
)
{
CheckFloat
();
float_feasign_
.
push_back
(
v
);
}
void
AddValue
(
const
uint64_t
v
)
{
CheckUint64
();
uint64_feasign_
.
push_back
(
v
);
}
void
AddIns
(
const
MultiSlotType
&
ins
)
{
if
(
ins
.
GetType
()[
0
]
==
'f'
)
{
// float
CheckFloat
();
auto
&
vec
=
ins
.
GetFloatData
();
offset_
.
push_back
(
offset_
.
back
()
+
vec
.
size
());
float_feasign_
.
insert
(
float_feasign_
.
end
(),
vec
.
begin
(),
vec
.
end
());
}
else
if
(
ins
.
GetType
()[
0
]
==
'u'
)
{
// uint64
CheckUint64
();
auto
&
vec
=
ins
.
GetUint64Data
();
offset_
.
push_back
(
offset_
.
back
()
+
vec
.
size
());
uint64_feasign_
.
insert
(
uint64_feasign_
.
end
(),
vec
.
begin
(),
vec
.
end
());
}
}
const
std
::
vector
<
float
>&
GetFloatData
()
const
{
return
float_feasign_
;
}
const
std
::
vector
<
uint64_t
>&
GetUint64Data
()
const
{
return
uint64_feasign_
;
}
const
std
::
string
&
GetType
()
const
{
return
type_
;
}
private:
void
CheckType
(
const
std
::
string
&
type
)
const
{
PADDLE_ENFORCE
((
type
==
"uint64"
)
||
(
type
==
"float"
),
"There is no this type<%s>."
,
type
);
}
void
CheckFloat
()
const
{
PADDLE_ENFORCE
(
type_
[
0
]
==
'f'
,
"Add %s value to float slot."
,
type_
);
}
void
CheckUint64
()
const
{
PADDLE_ENFORCE
(
type_
[
0
]
==
'u'
,
"Add %s value to uint64 slot."
,
type_
);
}
std
::
vector
<
float
>
float_feasign_
;
std
::
vector
<
uint64_t
>
uint64_feasign_
;
std
::
string
type_
;
std
::
vector
<
size_t
>
offset_
;
};
// This DataFeed is used to feed multi-slot type data.
// The format of multi-slot type data:
// [n feasign_0 feasign_1 ... feasign_n]*
class
MultiSlotDataFeed
:
public
PrivateQueueDataFeed
<
std
::
vector
<
MultiSlotType
>>
{
public:
MultiSlotDataFeed
()
{}
virtual
~
MultiSlotDataFeed
()
{}
virtual
void
Init
(
const
paddle
::
framework
::
DataFeedDesc
&
data_feed_desc
);
virtual
bool
CheckFile
(
const
char
*
filename
);
protected:
virtual
void
AddInstanceToInsVec
(
std
::
vector
<
MultiSlotType
>*
vec_ins
,
const
std
::
vector
<
MultiSlotType
>&
instance
,
int
index
);
virtual
bool
ParseOneInstance
(
std
::
vector
<
MultiSlotType
>*
instance
);
virtual
void
PutToFeedVec
(
const
std
::
vector
<
MultiSlotType
>&
ins_vec
);
};
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/data_feed.proto
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax
=
"proto2"
;
package
paddle
.
framework
;
message
Slot
{
required
string
name
=
1
;
required
string
type
=
2
;
optional
bool
is_dense
=
3
[
default
=
false
];
optional
bool
is_used
=
4
[
default
=
false
];
}
message
MultiSlotDesc
{
repeated
Slot
slots
=
1
;
}
message
DataFeedDesc
{
optional
string
name
=
1
;
optional
int32
batch_size
=
2
[
default
=
32
];
optional
MultiSlotDesc
multi_slot_desc
=
3
;
}
paddle/fluid/framework/data_feed_factory.cc
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_feed_factory.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_feed.h"
namespace
paddle
{
namespace
framework
{
typedef
std
::
shared_ptr
<
DataFeed
>
(
*
Createdata_feedFunction
)();
typedef
std
::
unordered_map
<
std
::
string
,
Createdata_feedFunction
>
data_feedMap
;
data_feedMap
g_data_feed_map
;
#define REGISTER_DATAFEED_CLASS(data_feed_class) \
namespace { \
std::shared_ptr<DataFeed> Creator_##data_feed_class() { \
return std::shared_ptr<DataFeed>(new data_feed_class); \
} \
class __Registerer_##data_feed_class { \
public: \
__Registerer_##data_feed_class() { \
g_data_feed_map[#data_feed_class] = &Creator_##data_feed_class; \
} \
}; \
__Registerer_##data_feed_class g_registerer_##data_feed_class; \
} // namespace
std
::
string
DataFeedFactory
::
DataFeedTypeList
()
{
std
::
string
data_feed_types
;
for
(
auto
iter
=
g_data_feed_map
.
begin
();
iter
!=
g_data_feed_map
.
end
();
++
iter
)
{
if
(
iter
!=
g_data_feed_map
.
begin
())
{
data_feed_types
+=
", "
;
}
data_feed_types
+=
iter
->
first
;
}
return
data_feed_types
;
}
std
::
shared_ptr
<
DataFeed
>
DataFeedFactory
::
CreateDataFeed
(
std
::
string
data_feed_class
)
{
if
(
g_data_feed_map
.
count
(
data_feed_class
)
<
1
)
{
exit
(
-
1
);
}
return
g_data_feed_map
[
data_feed_class
]();
}
REGISTER_DATAFEED_CLASS
(
MultiSlotDataFeed
);
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/data_feed_factory.h
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include "paddle/fluid/framework/data_feed.h"
namespace
paddle
{
namespace
framework
{
class
DataFeedFactory
{
public:
static
std
::
string
DataFeedTypeList
();
static
std
::
shared_ptr
<
DataFeed
>
CreateDataFeed
(
std
::
string
data_feed_class
);
};
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/data_feed_test.cc
0 → 100644
浏览文件 @
64ad051b
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/data_feed.h"
#include <fcntl.h>
#include <chrono> // NOLINT
#include <fstream>
#include <iostream>
#include <map>
#include <mutex> // NOLINT
#include <set>
#include <thread> // NOLINT
#include <utility>
#include <vector>
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
paddle
::
framework
::
DataFeedDesc
load_datafeed_param_from_file
(
const
char
*
filename
)
{
paddle
::
framework
::
DataFeedDesc
data_feed_desc
;
int
file_descriptor
=
open
(
filename
,
O_RDONLY
);
PADDLE_ENFORCE
(
file_descriptor
!=
-
1
,
"Can not open %s."
,
filename
);
google
::
protobuf
::
io
::
FileInputStream
fileInput
(
file_descriptor
);
google
::
protobuf
::
TextFormat
::
Parse
(
&
fileInput
,
&
data_feed_desc
);
close
(
file_descriptor
);
return
data_feed_desc
;
}
const
std
::
vector
<
std
::
string
>
load_filelist_from_file
(
const
char
*
filename
)
{
std
::
vector
<
std
::
string
>
filelist
;
std
::
ifstream
fin
(
filename
);
PADDLE_ENFORCE
(
fin
.
good
(),
"Can not open %s."
,
filename
);
std
::
string
line
;
while
(
getline
(
fin
,
line
))
{
filelist
.
push_back
(
line
);
}
fin
.
close
();
return
filelist
;
}
void
GenerateFileForTest
(
const
char
*
protofile
,
const
char
*
filelist
)
{
std
::
ofstream
w_protofile
(
protofile
);
w_protofile
<<
"name:
\"
MultiSlotDataFeed
\"\n
"
"batch_size: 2
\n
"
"multi_slot_desc {
\n
"
" slots {
\n
"
" name:
\"
uint64_sparse_slot
\"\n
"
" type:
\"
uint64
\"\n
"
" is_dense: false
\n
"
" is_used: true
\n
"
" }
\n
"
" slots {
\n
"
" name:
\"
float_sparse_slot
\"\n
"
" type:
\"
float
\"\n
"
" is_dense: false
\n
"
" is_used: true
\n
"
" }
\n
"
" slots {
\n
"
" name:
\"
uint64_dense_slot
\"\n
"
" type:
\"
uint64
\"\n
"
" is_dense: true
\n
"
" is_used: true
\n
"
" }
\n
"
" slots {
\n
"
" name:
\"
float_dense_slot
\"\n
"
" type:
\"
float
\"\n
"
" is_dense: true
\n
"
" is_used: true
\n
"
" }
\n
"
" slots {
\n
"
" name:
\"
not_used_slot
\"\n
"
" type:
\"
uint64
\"\n
"
" is_dense: false
\n
"
" is_used: false
\n
"
" }
\n
"
"}"
;
w_protofile
.
close
();
std
::
ofstream
w_filelist
(
filelist
);
int
total_file
=
4
;
for
(
int
i
=
0
;
i
<
total_file
;
++
i
)
{
std
::
string
filename
=
"TestMultiSlotDataFeed.data."
+
std
::
to_string
(
i
);
w_filelist
<<
filename
;
if
(
i
+
1
!=
total_file
)
{
w_filelist
<<
std
::
endl
;
}
std
::
ofstream
w_datafile
(
filename
.
c_str
());
w_datafile
<<
"3 3978 620 82 1 1926.08 1 1926 1 6.02 1 1996
\n
"
"2 1300 2983353 1 985.211 1 8 1 0.618 1 12
\n
"
"1 19260827 2 3.14 2.718 1 27 1 2.236 1 28
\n
"
;
w_datafile
.
close
();
}
w_filelist
.
close
();
}
class
MultiTypeSet
{
public:
MultiTypeSet
()
{
uint64_set_
.
clear
();
float_set_
.
clear
();
}
~
MultiTypeSet
()
{}
void
AddValue
(
uint64_t
v
)
{
uint64_set_
.
insert
(
v
);
}
void
AddValue
(
float
v
)
{
float_set_
.
insert
(
v
);
}
const
std
::
set
<
uint64_t
>&
GetUint64Set
()
const
{
return
uint64_set_
;
}
const
std
::
set
<
float
>&
GetFloatSet
()
const
{
return
float_set_
;
}
private:
std
::
set
<
uint64_t
>
uint64_set_
;
std
::
set
<
float
>
float_set_
;
};
void
GetElemSetFromReader
(
std
::
vector
<
MultiTypeSet
>*
reader_elem_set
,
const
paddle
::
framework
::
DataFeedDesc
&
data_feed_desc
,
const
std
::
vector
<
std
::
string
>&
filelist
,
const
int
thread_num
)
{
int
used_slot_num
=
0
;
for
(
auto
i
=
0
;
i
<
data_feed_desc
.
multi_slot_desc
().
slots_size
();
++
i
)
{
if
(
data_feed_desc
.
multi_slot_desc
().
slots
(
i
).
is_used
())
{
++
used_slot_num
;
}
}
reader_elem_set
->
resize
(
used_slot_num
);
std
::
vector
<
std
::
thread
>
threads
;
std
::
vector
<
std
::
shared_ptr
<
paddle
::
framework
::
DataFeed
>>
readers
;
readers
.
resize
(
thread_num
);
for
(
int
i
=
0
;
i
<
thread_num
;
++
i
)
{
readers
[
i
]
=
paddle
::
framework
::
DataFeedFactory
::
CreateDataFeed
(
data_feed_desc
.
name
());
readers
[
i
]
->
Init
(
data_feed_desc
);
}
readers
[
0
]
->
SetFileList
(
filelist
);
std
::
mutex
mu
;
for
(
int
idx
=
0
;
idx
<
thread_num
;
++
idx
)
{
threads
.
emplace_back
(
std
::
thread
([
&
,
idx
]
{
std
::
unique_ptr
<
paddle
::
framework
::
Scope
>
scope
(
new
paddle
::
framework
::
Scope
());
const
auto
&
multi_slot_desc
=
data_feed_desc
.
multi_slot_desc
();
std
::
map
<
std
::
string
,
const
paddle
::
framework
::
LoDTensor
*>
lodtensor_targets
;
std
::
map
<
std
::
string
,
const
paddle
::
framework
::
Tensor
*>
tensor_targets
;
for
(
int
i
=
0
;
i
<
multi_slot_desc
.
slots_size
();
++
i
)
{
const
auto
&
slot
=
multi_slot_desc
.
slots
(
i
);
if
(
slot
.
is_used
())
{
const
auto
&
name
=
slot
.
name
();
readers
[
idx
]
->
AddFeedVar
(
scope
->
Var
(
name
),
name
);
if
(
slot
.
is_dense
())
{
tensor_targets
[
name
]
=
&
scope
->
FindVar
(
name
)
->
Get
<
paddle
::
framework
::
Tensor
>
();
}
else
{
lodtensor_targets
[
name
]
=
&
scope
->
FindVar
(
name
)
->
Get
<
paddle
::
framework
::
LoDTensor
>
();
}
}
}
readers
[
idx
]
->
Start
();
while
(
readers
[
idx
]
->
Next
())
{
int
index
=
0
;
for
(
int
k
=
0
;
k
<
multi_slot_desc
.
slots_size
();
++
k
)
{
const
auto
&
slot
=
multi_slot_desc
.
slots
(
k
);
if
(
!
slot
.
is_used
())
{
continue
;
}
if
(
slot
.
is_dense
())
{
// dense branch
const
paddle
::
framework
::
Tensor
*
tens
=
tensor_targets
[
slot
.
name
()];
if
(
slot
.
type
()
==
"uint64"
)
{
const
int64_t
*
data
=
tens
->
data
<
int64_t
>
();
int
batch_size
=
tens
->
dims
()[
0
];
int
dim
=
tens
->
dims
()[
1
];
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
int
j
=
0
;
j
<
dim
;
++
j
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu
);
(
*
reader_elem_set
)[
index
].
AddValue
(
(
uint64_t
)
data
[
i
*
dim
+
j
]);
}
}
}
else
if
(
slot
.
type
()
==
"float"
)
{
const
float
*
data
=
tens
->
data
<
float
>
();
int
batch_size
=
tens
->
dims
()[
0
];
int
dim
=
tens
->
dims
()[
1
];
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
int
j
=
0
;
j
<
dim
;
++
j
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu
);
(
*
reader_elem_set
)[
index
].
AddValue
(
data
[
i
*
dim
+
j
]);
}
}
}
else
{
PADDLE_THROW
(
"Error type in proto file."
);
}
}
else
{
// sparse branch
const
paddle
::
framework
::
LoDTensor
*
tens
=
lodtensor_targets
[
slot
.
name
()];
if
(
slot
.
type
()
==
"uint64"
)
{
const
int64_t
*
data
=
tens
->
data
<
int64_t
>
();
for
(
size_t
i
=
0
;
i
<
tens
->
NumElements
();
++
i
)
{
std
::
pair
<
size_t
,
size_t
>
element
=
tens
->
lod_element
(
0
,
i
);
for
(
size_t
j
=
element
.
first
;
j
<
element
.
second
;
++
j
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu
);
(
*
reader_elem_set
)[
index
].
AddValue
((
uint64_t
)
data
[
j
]);
}
}
}
else
if
(
slot
.
type
()
==
"float"
)
{
const
float
*
data
=
tens
->
data
<
float
>
();
for
(
size_t
i
=
0
;
i
<
tens
->
NumElements
();
++
i
)
{
std
::
pair
<
size_t
,
size_t
>
element
=
tens
->
lod_element
(
0
,
i
);
for
(
size_t
j
=
element
.
first
;
j
<
element
.
second
;
++
j
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mu
);
(
*
reader_elem_set
)[
index
].
AddValue
(
data
[
j
]);
}
}
}
else
{
PADDLE_THROW
(
"Error type in proto file."
);
}
}
// end sparse branch
++
index
;
}
// end slots loop
}
// end while Next()
}));
// end anonymous function
}
for
(
auto
&
th
:
threads
)
{
th
.
join
();
}
}
void
CheckIsUnorderedSame
(
const
std
::
vector
<
MultiTypeSet
>&
s1
,
const
std
::
vector
<
MultiTypeSet
>&
s2
)
{
EXPECT_EQ
(
s1
.
size
(),
s2
.
size
());
for
(
size_t
i
=
0
;
i
<
s1
.
size
();
++
i
)
{
// check for uint64
const
std
::
set
<
uint64_t
>&
uint64_s1
=
s1
[
i
].
GetUint64Set
();
const
std
::
set
<
uint64_t
>&
uint64_s2
=
s2
[
i
].
GetUint64Set
();
EXPECT_EQ
(
uint64_s1
.
size
(),
uint64_s2
.
size
());
auto
uint64_it1
=
uint64_s1
.
begin
();
auto
uint64_it2
=
uint64_s2
.
begin
();
while
(
uint64_it1
!=
uint64_s1
.
end
())
{
EXPECT_EQ
(
*
uint64_it1
,
*
uint64_it2
);
++
uint64_it1
;
++
uint64_it2
;
}
// check for float
const
std
::
set
<
float
>&
float_s1
=
s1
[
i
].
GetFloatSet
();
const
std
::
set
<
float
>&
float_s2
=
s2
[
i
].
GetFloatSet
();
EXPECT_EQ
(
float_s1
.
size
(),
float_s2
.
size
());
auto
float_it1
=
float_s1
.
begin
();
auto
float_it2
=
float_s2
.
begin
();
while
(
float_it1
!=
float_s1
.
end
())
{
EXPECT_EQ
(
*
float_it1
,
*
float_it2
);
++
float_it1
;
++
float_it2
;
}
}
}
void
GetElemSetFromFile
(
std
::
vector
<
MultiTypeSet
>*
file_elem_set
,
const
paddle
::
framework
::
DataFeedDesc
&
data_feed_desc
,
const
std
::
vector
<
std
::
string
>&
filelist
)
{
int
used_slot_num
=
0
;
for
(
auto
i
=
0
;
i
<
data_feed_desc
.
multi_slot_desc
().
slots_size
();
++
i
)
{
if
(
data_feed_desc
.
multi_slot_desc
().
slots
(
i
).
is_used
())
{
++
used_slot_num
;
}
}
file_elem_set
->
resize
(
used_slot_num
);
for
(
const
auto
&
file
:
filelist
)
{
std
::
ifstream
fin
(
file
.
c_str
());
PADDLE_ENFORCE
(
fin
.
good
(),
"Can not open %s."
,
file
.
c_str
());
while
(
1
)
{
bool
end_flag
=
false
;
int
index
=
0
;
for
(
auto
i
=
0
;
i
<
data_feed_desc
.
multi_slot_desc
().
slots_size
();
++
i
)
{
int
num
;
if
(
fin
>>
num
)
{
auto
slot
=
data_feed_desc
.
multi_slot_desc
().
slots
(
i
);
auto
type
=
slot
.
type
();
if
(
type
==
"uint64"
)
{
while
(
num
--
)
{
uint64_t
feasign
;
fin
>>
feasign
;
if
(
slot
.
is_used
())
{
(
*
file_elem_set
)[
index
].
AddValue
(
feasign
);
}
}
}
else
if
(
type
==
"float"
)
{
while
(
num
--
)
{
float
feasign
;
fin
>>
feasign
;
if
(
slot
.
is_used
())
{
(
*
file_elem_set
)[
index
].
AddValue
(
feasign
);
}
}
}
else
{
PADDLE_THROW
(
"Error type in proto file."
);
}
if
(
slot
.
is_used
())
{
++
index
;
}
}
else
{
end_flag
=
true
;
break
;
}
}
if
(
end_flag
)
{
break
;
}
}
fin
.
close
();
}
}
TEST
(
DataFeed
,
MultiSlotUnitTest
)
{
const
char
*
protofile
=
"data_feed_desc.prototxt"
;
const
char
*
filelist_name
=
"filelist.txt"
;
GenerateFileForTest
(
protofile
,
filelist_name
);
const
std
::
vector
<
std
::
string
>
filelist
=
load_filelist_from_file
(
filelist_name
);
paddle
::
framework
::
DataFeedDesc
data_feed_desc
=
load_datafeed_param_from_file
(
protofile
);
std
::
vector
<
MultiTypeSet
>
reader_elem_set
;
std
::
vector
<
MultiTypeSet
>
file_elem_set
;
GetElemSetFromReader
(
&
reader_elem_set
,
data_feed_desc
,
filelist
,
4
);
GetElemSetFromFile
(
&
file_elem_set
,
data_feed_desc
,
filelist
);
CheckIsUnorderedSame
(
reader_elem_set
,
file_elem_set
);
}
paddle/fluid/framework/details/multi_devices_graph_pass.cc
浏览文件 @
64ad051b
...
@@ -862,7 +862,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
...
@@ -862,7 +862,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
if
(
node
->
Op
()
->
Type
()
==
"fetch_barrier"
)
{
if
(
node
->
Op
()
->
Type
()
==
"fetch_barrier"
)
{
outvar_dev_id
=
outvar_dev_id
=
GetVarDeviceID
(
*
result
,
output
->
Name
(),
*
sharded_var_device
);
GetVarDeviceID
(
*
result
,
output
->
Name
(),
*
sharded_var_device
);
PADDLE_ENFORCE_NE
(
outvar_dev_id
,
-
1
);
PADDLE_ENFORCE_NE
(
outvar_dev_id
,
-
1
,
"output name %s"
,
output
->
Name
()
);
}
}
p
=
places_
[
outvar_dev_id
];
p
=
places_
[
outvar_dev_id
];
ir
::
Node
*
new_node
=
nullptr
;
ir
::
Node
*
new_node
=
nullptr
;
...
...
paddle/fluid/framework/details/op_registry.h
浏览文件 @
64ad051b
...
@@ -32,7 +32,9 @@ enum OpInfoFillType {
...
@@ -32,7 +32,9 @@ enum OpInfoFillType {
kOpProtoAndCheckerMaker
=
1
,
kOpProtoAndCheckerMaker
=
1
,
kGradOpDescMaker
=
2
,
kGradOpDescMaker
=
2
,
kVarTypeInference
=
3
,
kVarTypeInference
=
3
,
kShapeInference
=
4
kShapeInference
=
4
,
kEstimateFlops
=
5
,
kUnknown
=
-
1
};
};
template
<
typename
T
>
template
<
typename
T
>
...
@@ -48,8 +50,10 @@ struct OpInfoFillTypeID {
...
@@ -48,8 +50,10 @@ struct OpInfoFillTypeID {
?
kVarTypeInference
?
kVarTypeInference
:
(
std
::
is_base_of
<
InferShapeBase
,
T
>::
value
:
(
std
::
is_base_of
<
InferShapeBase
,
T
>::
value
?
kShapeInference
?
kShapeInference
:
static_cast
<
OpInfoFillType
>
(
:
(
std
::
is_base_of
<
EstimateFlopsBase
,
-
1
)))));
T
>::
value
?
kEstimateFlops
:
kUnknown
)))));
}
}
};
};
...
@@ -139,6 +143,16 @@ struct OpInfoFiller<T, kShapeInference> {
...
@@ -139,6 +143,16 @@ struct OpInfoFiller<T, kShapeInference> {
}
}
};
};
template
<
typename
T
>
struct
OpInfoFiller
<
T
,
kEstimateFlops
>
{
void
operator
()(
const
char
*
op_tpe
,
OpInfo
*
info
)
const
{
info
->
estimate_flops_
=
[](
InferShapeContext
*
ctx
)
{
T
estimate_flops
;
return
estimate_flops
(
ctx
);
};
}
};
}
// namespace details
}
// namespace details
}
// namespace framework
}
// namespace framework
...
...
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
浏览文件 @
64ad051b
...
@@ -16,7 +16,7 @@
...
@@ -16,7 +16,7 @@
#include <stdexcept>
#include <stdexcept>
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/
executo
r.h"
#include "paddle/fluid/framework/
variable_helpe
r.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
namespace
paddle
{
...
...
paddle/fluid/framework/executor.cc
浏览文件 @
64ad051b
...
@@ -21,6 +21,7 @@ limitations under the License. */
...
@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
...
@@ -156,36 +157,6 @@ void Executor::Close() {
...
@@ -156,36 +157,6 @@ void Executor::Close() {
#endif
#endif
}
}
void
InitializeVariable
(
Variable
*
var
,
proto
::
VarType
::
Type
var_type
)
{
if
(
var_type
==
proto
::
VarType
::
LOD_TENSOR
)
{
var
->
GetMutable
<
LoDTensor
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
SELECTED_ROWS
)
{
var
->
GetMutable
<
SelectedRows
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
FEED_MINIBATCH
)
{
var
->
GetMutable
<
FeedFetchList
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
FETCH_LIST
)
{
var
->
GetMutable
<
FeedFetchList
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
STEP_SCOPES
)
{
var
->
GetMutable
<
std
::
vector
<
framework
::
Scope
*>>
();
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_RANK_TABLE
)
{
var
->
GetMutable
<
LoDRankTable
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_TENSOR_ARRAY
)
{
var
->
GetMutable
<
LoDTensorArray
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
PLACE_LIST
)
{
var
->
GetMutable
<
platform
::
PlaceList
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
READER
)
{
var
->
GetMutable
<
ReaderHolder
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
RAW
)
{
// GetMutable will be called in operator
}
else
{
PADDLE_THROW
(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, RAW]"
,
var_type
);
}
}
void
Executor
::
CreateVariables
(
const
ProgramDesc
&
pdesc
,
Scope
*
scope
,
void
Executor
::
CreateVariables
(
const
ProgramDesc
&
pdesc
,
Scope
*
scope
,
int
block_id
)
{
int
block_id
)
{
auto
&
global_block
=
pdesc
.
Block
(
block_id
);
auto
&
global_block
=
pdesc
.
Block
(
block_id
);
...
...
paddle/fluid/framework/executor.h
浏览文件 @
64ad051b
...
@@ -26,7 +26,6 @@ limitations under the License. */
...
@@ -26,7 +26,6 @@ limitations under the License. */
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
extern
void
InitializeVariable
(
Variable
*
var
,
proto
::
VarType
::
Type
var_type
);
struct
ExecutorPrepareContext
{
struct
ExecutorPrepareContext
{
ExecutorPrepareContext
(
const
framework
::
ProgramDesc
&
prog
,
size_t
block_id
,
ExecutorPrepareContext
(
const
framework
::
ProgramDesc
&
prog
,
size_t
block_id
,
...
...
paddle/fluid/framework/executor_thread_worker.cc
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/executor_thread_worker.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
namespace
paddle
{
namespace
framework
{
void
ExecutorThreadWorker
::
CreateThreadOperators
(
const
ProgramDesc
&
program
)
{
auto
&
block
=
program
.
Block
(
0
);
op_names_
.
clear
();
for
(
auto
&
op_desc
:
block
.
AllOps
())
{
std
::
unique_ptr
<
OperatorBase
>
local_op
=
OpRegistry
::
CreateOp
(
*
op_desc
);
op_names_
.
push_back
(
op_desc
->
Type
());
OperatorBase
*
local_op_ptr
=
local_op
.
release
();
ops_
.
push_back
(
local_op_ptr
);
continue
;
}
}
void
ExecutorThreadWorker
::
CreateThreadResource
(
const
framework
::
ProgramDesc
&
program
,
const
paddle
::
platform
::
Place
&
place
)
{
CreateThreadScope
(
program
);
CreateThreadOperators
(
program
);
SetMainProgram
(
program
);
SetPlace
(
place
);
}
void
ExecutorThreadWorker
::
CreateThreadScope
(
const
ProgramDesc
&
program
)
{
auto
&
block
=
program
.
Block
(
0
);
PADDLE_ENFORCE_NOT_NULL
(
root_scope_
,
"root_scope should be set before creating thread scope"
);
thread_scope_
=
&
root_scope_
->
NewScope
();
for
(
auto
&
var
:
block
.
AllVars
())
{
if
(
var
->
Persistable
())
{
auto
*
ptr
=
root_scope_
->
Var
(
var
->
Name
());
InitializeVariable
(
ptr
,
var
->
GetType
());
}
else
{
auto
*
ptr
=
thread_scope_
->
Var
(
var
->
Name
());
InitializeVariable
(
ptr
,
var
->
GetType
());
}
}
}
void
ExecutorThreadWorker
::
SetDataFeed
(
const
std
::
shared_ptr
<
DataFeed
>&
datafeed
)
{
thread_reader_
=
datafeed
;
}
void
ExecutorThreadWorker
::
BindingDataFeedMemory
()
{
const
std
::
vector
<
std
::
string
>&
input_feed
=
thread_reader_
->
GetUseSlotAlias
();
for
(
auto
name
:
input_feed
)
{
thread_reader_
->
AddFeedVar
(
thread_scope_
->
Var
(
name
),
name
);
}
}
void
ExecutorThreadWorker
::
SetFetchVarNames
(
const
std
::
vector
<
std
::
string
>&
fetch_var_names
)
{
fetch_var_names_
.
clear
();
fetch_var_names_
.
insert
(
fetch_var_names_
.
end
(),
fetch_var_names
.
begin
(),
fetch_var_names
.
end
());
}
void
ExecutorThreadWorker
::
SetDevice
()
{
#if defined _WIN32 || defined __APPLE__
return
;
#else
static
unsigned
concurrency_cap
=
std
::
thread
::
hardware_concurrency
();
int
thread_id
=
this
->
thread_id_
;
if
(
thread_id
<
concurrency_cap
)
{
unsigned
proc
=
thread_id
;
cpu_set_t
mask
;
CPU_ZERO
(
&
mask
);
CPU_SET
(
proc
,
&
mask
);
if
(
-
1
==
sched_setaffinity
(
0
,
sizeof
(
mask
),
&
mask
))
{
VLOG
(
1
)
<<
"WARNING: Failed to set thread affinity for thread "
<<
thread_id
;
}
else
{
CPU_ZERO
(
&
mask
);
if
((
0
!=
sched_getaffinity
(
0
,
sizeof
(
mask
),
&
mask
))
||
(
CPU_ISSET
(
proc
,
&
mask
)
==
0
))
{
VLOG
(
3
)
<<
"WARNING: Failed to set thread affinity for thread "
<<
thread_id
;
}
}
}
else
{
VLOG
(
1
)
<<
"WARNING: Failed to set thread affinity for thread "
<<
thread_id
;
}
#endif
}
template
<
typename
T
>
void
print_lod_tensor
(
std
::
string
var_name
,
const
LoDTensor
&
lod_tensor
)
{
auto
inspect
=
lod_tensor
.
data
<
T
>
();
auto
element_num
=
lod_tensor
.
numel
();
std
::
ostringstream
sstream
;
sstream
<<
var_name
<<
" (element num "
<<
element_num
<<
"): ["
;
sstream
<<
inspect
[
0
];
for
(
int
j
=
1
;
j
<
element_num
;
++
j
)
{
sstream
<<
" "
<<
inspect
[
j
];
}
sstream
<<
"]"
;
std
::
cout
<<
sstream
.
str
()
<<
std
::
endl
;
}
void
print_fetch_var
(
Scope
*
scope
,
std
::
string
var_name
)
{
const
LoDTensor
&
tensor
=
scope
->
FindVar
(
var_name
)
->
Get
<
LoDTensor
>
();
if
(
std
::
type_index
(
tensor
.
type
())
==
std
::
type_index
(
typeid
(
platform
::
float16
)))
{
print_lod_tensor
<
platform
::
float16
>
(
var_name
,
tensor
);
}
else
if
(
std
::
type_index
(
tensor
.
type
())
==
std
::
type_index
(
typeid
(
float
)))
{
print_lod_tensor
<
float
>
(
var_name
,
tensor
);
}
else
if
(
std
::
type_index
(
tensor
.
type
())
==
std
::
type_index
(
typeid
(
double
)))
{
print_lod_tensor
<
double
>
(
var_name
,
tensor
);
}
else
if
(
std
::
type_index
(
tensor
.
type
())
==
std
::
type_index
(
typeid
(
int
)))
{
print_lod_tensor
<
int
>
(
var_name
,
tensor
);
}
else
if
(
std
::
type_index
(
tensor
.
type
())
==
std
::
type_index
(
typeid
(
int64_t
)))
{
print_lod_tensor
<
int64_t
>
(
var_name
,
tensor
);
}
else
if
(
std
::
type_index
(
tensor
.
type
())
==
std
::
type_index
(
typeid
(
bool
)))
{
print_lod_tensor
<
bool
>
(
var_name
,
tensor
);
}
else
if
(
std
::
type_index
(
tensor
.
type
())
==
std
::
type_index
(
typeid
(
uint8_t
)))
{
print_lod_tensor
<
uint8_t
>
(
var_name
,
tensor
);
}
else
if
(
std
::
type_index
(
tensor
.
type
())
==
std
::
type_index
(
typeid
(
int16_t
)))
{
print_lod_tensor
<
int16_t
>
(
var_name
,
tensor
);
}
else
if
(
std
::
type_index
(
tensor
.
type
())
==
std
::
type_index
(
typeid
(
int8_t
)))
{
print_lod_tensor
<
int8_t
>
(
var_name
,
tensor
);
}
else
{
VLOG
(
1
)
<<
"print_fetch_var: unrecognized data type:"
<<
tensor
.
type
().
name
();
}
return
;
}
void
ExecutorThreadWorker
::
TrainFiles
()
{
// todo: configurable
SetDevice
();
int
fetch_var_num
=
fetch_var_names_
.
size
();
fetch_values_
.
clear
();
fetch_values_
.
resize
(
fetch_var_num
);
thread_reader_
->
Start
();
int
cur_batch
;
int
batch_cnt
=
0
;
while
((
cur_batch
=
thread_reader_
->
Next
())
>
0
)
{
// executor run here
for
(
auto
&
op
:
ops_
)
{
op
->
Run
(
*
thread_scope_
,
place_
);
}
++
batch_cnt
;
thread_scope_
->
DropKids
();
if
(
debug_
==
false
||
thread_id_
!=
0
)
{
continue
;
}
for
(
int
i
=
0
;
i
<
fetch_var_num
;
++
i
)
{
print_fetch_var
(
thread_scope_
,
fetch_var_names_
[
i
]);
}
// end for (int i = 0...)
}
// end while ()
}
void
ExecutorThreadWorker
::
SetThreadId
(
int
tid
)
{
thread_id_
=
tid
;
}
void
ExecutorThreadWorker
::
SetPlace
(
const
platform
::
Place
&
place
)
{
place_
=
place
;
}
void
ExecutorThreadWorker
::
SetMainProgram
(
const
ProgramDesc
&
main_program_desc
)
{
main_program_
.
reset
(
new
ProgramDesc
(
main_program_desc
));
}
void
ExecutorThreadWorker
::
SetRootScope
(
Scope
*
g_scope
)
{
root_scope_
=
g_scope
;
}
}
// einit_modelnd namespace framework
}
// end namespace paddle
paddle/fluid/framework/executor_thread_worker.h
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
namespace
paddle
{
namespace
framework
{
void
CreateTensor
(
Variable
*
var
,
proto
::
VarType
::
Type
var_type
);
class
ExecutorThreadWorker
{
public:
ExecutorThreadWorker
()
:
thread_id_
(
-
1
),
root_scope_
(
NULL
),
thread_scope_
(
NULL
),
debug_
(
false
)
{}
~
ExecutorThreadWorker
()
{}
void
CreateThreadResource
(
const
framework
::
ProgramDesc
&
program
,
const
paddle
::
platform
::
Place
&
place
);
void
SetThreadId
(
int
tid
);
void
SetDebug
(
const
bool
debug
)
{
debug_
=
debug
;
}
void
SetRootScope
(
Scope
*
g_scope
);
// set cpu device in this function
// cpu binding is used by default
void
SetDevice
();
// since we read data into memory that can not be accessed by program
// we need to bind memory of data with corresponding variables in program
// this function should be called after data feed is set
void
BindingDataFeedMemory
();
// set data feed declared in executor
void
SetDataFeed
(
const
std
::
shared_ptr
<
DataFeed
>&
datafeed
);
// A multi-thread training function
void
TrainFiles
();
// set fetch variable names from python interface assigned by users
void
SetFetchVarNames
(
const
std
::
vector
<
std
::
string
>&
fetch_var_names
);
private:
void
CreateThreadScope
(
const
framework
::
ProgramDesc
&
program
);
void
CreateThreadOperators
(
const
framework
::
ProgramDesc
&
program
);
void
SetMainProgram
(
const
ProgramDesc
&
main_program_desc
);
void
SetPlace
(
const
paddle
::
platform
::
Place
&
place
);
protected:
// thread index
std
::
shared_ptr
<
DataFeed
>
thread_reader_
;
// shared queue, thread buffer
int
thread_id_
;
// operator name
std
::
vector
<
std
::
string
>
op_names_
;
// thread level, local operators for forward and backward
std
::
vector
<
OperatorBase
*>
ops_
;
// main program for training
std
::
unique_ptr
<
framework
::
ProgramDesc
>
main_program_
;
// execution place
platform
::
Place
place_
;
// root scope for model parameters
Scope
*
root_scope_
;
// a thread scope, father scope is global score which is shared
Scope
*
thread_scope_
;
private:
std
::
vector
<
std
::
string
>
fetch_var_names_
;
std
::
vector
<
std
::
vector
<
float
>>
fetch_values_
;
bool
debug_
;
};
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/is_test_pass.cc
浏览文件 @
64ad051b
...
@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
...
@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
for
(
const
Node
*
n
:
graph
->
Nodes
())
{
for
(
const
Node
*
n
:
graph
->
Nodes
())
{
if
(
n
->
IsOp
())
{
if
(
n
->
IsOp
())
{
auto
*
op
=
n
->
Op
();
auto
*
op
=
n
->
Op
();
if
(
op
->
HasAttr
(
"is_test"
))
{
if
(
n
->
Runtime
HasAttr
(
"is_test"
))
{
op
->
SetAttr
(
"is_test"
,
true
);
op
->
SetAttr
(
"is_test"
,
true
);
}
else
if
(
std
::
find
(
begin
(
op_list
),
end
(
op_list
),
op
->
Type
())
!=
}
else
if
(
std
::
find
(
begin
(
op_list
),
end
(
op_list
),
op
->
Type
())
!=
end
(
op_list
))
{
end
(
op_list
))
{
...
...
paddle/fluid/framework/ir/is_test_pass_tester.cc
浏览文件 @
64ad051b
...
@@ -104,9 +104,9 @@ TEST(IsTestPass, basic) {
...
@@ -104,9 +104,9 @@ TEST(IsTestPass, basic) {
auto
*
op
=
node
->
Op
();
auto
*
op
=
node
->
Op
();
auto
op_name
=
boost
::
get
<
std
::
string
>
(
op
->
GetAttr
(
"name"
));
auto
op_name
=
boost
::
get
<
std
::
string
>
(
op
->
GetAttr
(
"name"
));
if
(
op_name
==
"conv3"
)
{
if
(
op_name
==
"conv3"
)
{
ASSERT_FALSE
(
op
->
HasAttr
(
"is_test"
));
ASSERT_FALSE
(
node
->
Runtime
HasAttr
(
"is_test"
));
}
else
{
}
else
{
ASSERT_TRUE
(
op
->
HasAttr
(
"is_test"
));
ASSERT_TRUE
(
node
->
Runtime
HasAttr
(
"is_test"
));
EXPECT_TRUE
(
boost
::
get
<
bool
>
(
op
->
GetAttr
(
"is_test"
)));
EXPECT_TRUE
(
boost
::
get
<
bool
>
(
op
->
GetAttr
(
"is_test"
)));
}
}
}
}
...
...
paddle/fluid/framework/ir/mkldnn_placement_pass.cc
浏览文件 @
64ad051b
...
@@ -22,7 +22,7 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
...
@@ -22,7 +22,7 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
VLOG
(
3
)
<<
"Aplies MKL-DNN placement strategy."
;
VLOG
(
3
)
<<
"Aplies MKL-DNN placement strategy."
;
for
(
const
Node
*
n
:
graph
->
Nodes
())
{
for
(
const
Node
*
n
:
graph
->
Nodes
())
{
if
(
n
->
IsOp
()
&&
n
->
Op
()
->
HasAttr
(
"use_mkldnn"
))
{
if
(
n
->
IsOp
()
&&
n
->
Runtime
HasAttr
(
"use_mkldnn"
))
{
n
->
Op
()
->
SetAttr
(
"use_mkldnn"
,
true
);
n
->
Op
()
->
SetAttr
(
"use_mkldnn"
,
true
);
}
}
}
}
...
...
paddle/fluid/framework/ir/node.cc
浏览文件 @
64ad051b
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_info.h"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
...
@@ -24,10 +25,33 @@ constexpr char Node::kControlDepVarName[];
...
@@ -24,10 +25,33 @@ constexpr char Node::kControlDepVarName[];
const
char
Node
::
kControlDepVarName
[]
=
"__control_var"
;
const
char
Node
::
kControlDepVarName
[]
=
"__control_var"
;
#endif
#endif
std
::
unique_ptr
<
Node
>
CreateNodeForTest
(
const
std
::
string
&
name
,
std
::
unique_ptr
<
Node
>
CreateNodeForTest
(
const
std
::
string
&
name
,
Node
::
Type
type
)
{
Node
::
Type
type
)
{
return
std
::
unique_ptr
<
Node
>
(
new
Node
(
name
,
type
));
return
std
::
unique_ptr
<
Node
>
(
new
Node
(
name
,
type
));
}
}
bool
Node
::
RuntimeHasAttr
(
const
std
::
string
&
name
)
const
{
if
(
Op
()
->
HasAttr
(
name
))
{
return
true
;
}
else
{
auto
&
op_info
=
OpInfoMap
::
Instance
();
auto
op_type
=
Op
()
->
Type
();
if
(
op_info
.
Has
(
op_type
))
{
auto
op_info_ptr
=
op_info
.
Get
(
op_type
);
if
(
op_info_ptr
.
HasOpProtoAndChecker
())
{
const
proto
::
OpProto
&
proto
=
op_info_ptr
.
Proto
();
for
(
int
i
=
0
;
i
!=
proto
.
attrs_size
();
++
i
)
{
const
proto
::
OpProto
::
Attr
&
attr
=
proto
.
attrs
(
i
);
if
(
attr
.
name
()
==
name
)
{
return
true
;
}
}
}
}
}
return
false
;
}
}
// namespace ir
}
// namespace ir
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
paddle/fluid/framework/ir/node.h
浏览文件 @
64ad051b
...
@@ -108,6 +108,18 @@ class Node {
...
@@ -108,6 +108,18 @@ class Node {
Name
().
find
(
ir
::
Node
::
kControlDepVarName
)
!=
std
::
string
::
npos
;
Name
().
find
(
ir
::
Node
::
kControlDepVarName
)
!=
std
::
string
::
npos
;
}
}
// RuntimeHasAttr is different with HasAttr now.
// 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr,
// thus, if stored program_desc_ are old which don't have an attr, a new
// library which adds the attr already will fail on this function.
// Details:
// https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087
// 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above
// problem.
// TODO(luotao): Maybe we should enhance HasAttr later, instead of adding
// RuntimeHasAttr.
bool
RuntimeHasAttr
(
const
std
::
string
&
name
)
const
;
std
::
vector
<
Node
*>
inputs
;
std
::
vector
<
Node
*>
inputs
;
std
::
vector
<
Node
*>
outputs
;
std
::
vector
<
Node
*>
outputs
;
...
...
paddle/fluid/framework/naive_executor.cc
浏览文件 @
64ad051b
...
@@ -21,42 +21,11 @@
...
@@ -21,42 +21,11 @@
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/string/pretty_log.h"
#include "paddle/fluid/string/pretty_log.h"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
// These code can be shared with Executor.
static
void
InitializeVariable
(
Variable
*
var
,
proto
::
VarType
::
Type
var_type
)
{
if
(
var_type
==
proto
::
VarType
::
LOD_TENSOR
)
{
var
->
GetMutable
<
LoDTensor
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
SELECTED_ROWS
)
{
var
->
GetMutable
<
SelectedRows
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
FEED_MINIBATCH
)
{
var
->
GetMutable
<
FeedFetchList
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
FETCH_LIST
)
{
var
->
GetMutable
<
FeedFetchList
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
STEP_SCOPES
)
{
var
->
GetMutable
<
std
::
vector
<
framework
::
Scope
*>>
();
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_RANK_TABLE
)
{
var
->
GetMutable
<
LoDRankTable
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_TENSOR_ARRAY
)
{
var
->
GetMutable
<
LoDTensorArray
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
PLACE_LIST
)
{
var
->
GetMutable
<
platform
::
PlaceList
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
READER
)
{
var
->
GetMutable
<
ReaderHolder
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
RAW
)
{
// GetMutable will be called in operator
}
else
{
PADDLE_THROW
(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]"
,
var_type
);
}
}
void
NaiveExecutor
::
Prepare
(
Scope
*
scope
,
const
ProgramDesc
&
program_desc
,
void
NaiveExecutor
::
Prepare
(
Scope
*
scope
,
const
ProgramDesc
&
program_desc
,
int
block_id
,
bool
with_feed_fetch_ops
)
{
int
block_id
,
bool
with_feed_fetch_ops
)
{
if
(
!
scope
)
{
if
(
!
scope
)
{
...
...
paddle/fluid/framework/ngraph_bridge.cc
浏览文件 @
64ad051b
...
@@ -15,23 +15,105 @@ limitations under the License. */
...
@@ -15,23 +15,105 @@ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <algorithm>
#include <functional>
#include <functional>
#include <vector>
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "ngraph/ngraph.hpp"
#include "ngraph/ngraph.hpp"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
static
std
::
shared_ptr
<
ngraph
::
Node
>
GetNode
(
const
std
::
shared_ptr
<
OperatorBase
>&
op
,
const
std
::
string
prm
,
const
VariableNameMap
&
var_map
,
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
ngb_node_map
)
{
auto
&
var_names
=
var_map
.
at
(
prm
);
PADDLE_ENFORCE_EQ
(
var_names
.
size
(),
1
,
"op %s prm %s expects one associated var"
,
op
->
Type
(),
prm
);
if
(
ngb_node_map
->
find
(
var_names
[
0
])
!=
ngb_node_map
->
end
())
{
return
(
*
ngb_node_map
)[
var_names
[
0
]];
}
else
{
return
nullptr
;
}
}
static
std
::
shared_ptr
<
ngraph
::
Node
>
GetInputNode
(
const
std
::
shared_ptr
<
OperatorBase
>&
op
,
const
std
::
string
prm
,
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
ngb_node_map
)
{
return
GetNode
(
op
,
prm
,
op
->
Inputs
(),
ngb_node_map
);
}
static
std
::
shared_ptr
<
ngraph
::
Node
>
GetOutputNode
(
const
std
::
shared_ptr
<
OperatorBase
>&
op
,
const
std
::
string
prm
,
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
ngb_node_map
)
{
return
GetNode
(
op
,
prm
,
op
->
Outputs
(),
ngb_node_map
);
}
static
void
SetOutputNode
(
const
std
::
shared_ptr
<
OperatorBase
>&
op
,
const
std
::
string
prm
,
std
::
shared_ptr
<
ngraph
::
Node
>
node
,
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
ngb_node_map
)
{
auto
&
var_names
=
op
->
Outputs
().
at
(
prm
);
if
(
var_names
.
size
()
==
1
)
{
(
*
ngb_node_map
)[
var_names
[
0
]]
=
node
;
}
else
if
(
var_names
.
size
()
==
0
)
{
(
*
ngb_node_map
)[
""
]
=
node
;
}
else
{
PADDLE_THROW
(
"prm %s has more than 1 var_names."
,
prm
);
}
}
static
bool
HasOutput
(
const
std
::
shared_ptr
<
OperatorBase
>&
op
,
const
std
::
string
prm
)
{
auto
&
outputs
=
op
->
Outputs
();
if
(
outputs
.
find
(
prm
)
==
outputs
.
end
())
return
false
;
return
outputs
.
at
(
prm
).
size
()
>
0
;
}
template
<
typename
T
>
static
void
BuildBinaryNode
(
const
std
::
shared_ptr
<
OperatorBase
>&
op
,
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
ngb_node_map
)
{
auto
x
=
GetInputNode
(
op
,
"X"
,
ngb_node_map
);
auto
y
=
GetInputNode
(
op
,
"Y"
,
ngb_node_map
);
auto
out
=
std
::
make_shared
<
T
>
(
x
,
y
);
SetOutputNode
(
op
,
"Out"
,
out
,
ngb_node_map
);
}
template
<
typename
T
>
static
void
BuildUnaryNode
(
const
std
::
shared_ptr
<
OperatorBase
>&
op
,
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
ngb_node_map
)
{
auto
input
=
GetInputNode
(
op
,
"X"
,
ngb_node_map
);
auto
out
=
std
::
make_shared
<
T
>
(
input
);
SetOutputNode
(
op
,
"Out"
,
out
,
ngb_node_map
);
}
std
::
map
<
std
::
string
,
std
::
map
<
std
::
string
,
std
::
function
<
void
(
const
std
::
shared_ptr
<
OperatorBase
>&
,
std
::
function
<
void
(
const
std
::
shared_ptr
<
OperatorBase
>&
,
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
)
>>
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
)
>>
NgraphBridge
::
NG_NODE_MAP
=
{};
NgraphBridge
::
NG_NODE_MAP
=
{{
"relu"
,
BuildUnaryNode
<
ngraph
::
op
::
Relu
>
},
{
"tanh"
,
BuildUnaryNode
<
ngraph
::
op
::
Tanh
>
}};
void
NgraphBridge
::
build_graph
(
const
std
::
shared_ptr
<
OperatorBase
>&
op
)
{
void
NgraphBridge
::
BuildNgNode
(
const
std
::
shared_ptr
<
OperatorBase
>&
op
)
{
auto
&
op_type
=
op
->
Type
();
auto
&
op_type
=
op
->
Type
();
NG_NODE_MAP
[
op_type
](
op
,
ngb_node_map
);
NG_NODE_MAP
[
op_type
](
op
,
ngb_node_map
_
);
}
}
}
// namespace framework
}
// namespace framework
...
...
paddle/fluid/framework/ngraph_bridge.h
浏览文件 @
64ad051b
...
@@ -20,16 +20,14 @@ limitations under the License. */
...
@@ -20,16 +20,14 @@ limitations under the License. */
#include <map>
#include <map>
#include <string>
#include <string>
#include <unordered_map>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "ngraph/node.hpp"
#include "paddle/fluid/platform/enforce.h"
#include "ngraph/ngraph.hpp"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
class
OperatorBase
;
class
NgraphBridge
{
class
NgraphBridge
{
public:
public:
static
std
::
map
<
static
std
::
map
<
...
@@ -43,14 +41,14 @@ class NgraphBridge {
...
@@ -43,14 +41,14 @@ class NgraphBridge {
std
::
shared_ptr
<
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
var_node_map
)
var_node_map
)
:
ngb_node_map
(
var_node_map
)
{}
:
ngb_node_map
_
(
var_node_map
)
{}
void
build_graph
(
const
std
::
shared_ptr
<
OperatorBase
>&
op
);
void
BuildNgNode
(
const
std
::
shared_ptr
<
OperatorBase
>&
op
);
private:
private:
std
::
shared_ptr
<
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
ngb_node_map
;
ngb_node_map
_
;
};
};
}
// namespace framework
}
// namespace framework
...
...
paddle/fluid/framework/ngraph_operator.cc
浏览文件 @
64ad051b
...
@@ -19,14 +19,29 @@ limitations under the License. */
...
@@ -19,14 +19,29 @@ limitations under the License. */
#include <map>
#include <map>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/
shape_inference
.h"
#include "paddle/fluid/framework/
tensor
.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/var_type.h"
#include "ngraph/ngraph.hpp"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
static
ngraph
::
Shape
Ddim2Shape
(
const
DDim
&
dims
)
{
ngraph
::
Shape
sp
;
for
(
int
i
=
0
;
i
<
dims
.
size
();
++
i
)
{
int
k
=
dims
[
i
];
k
=
k
==
0
?
1
:
k
;
sp
.
push_back
(
k
);
}
return
sp
;
}
static
std
::
map
<
proto
::
VarType
::
Type
,
ngraph
::
element
::
Type
>
pd2ng_type_map
=
{
static
std
::
map
<
proto
::
VarType
::
Type
,
ngraph
::
element
::
Type
>
pd2ng_type_map
=
{
{
proto
::
VarType
::
FP32
,
ngraph
::
element
::
f32
},
{
proto
::
VarType
::
FP32
,
ngraph
::
element
::
f32
},
{
proto
::
VarType
::
FP64
,
ngraph
::
element
::
f64
},
{
proto
::
VarType
::
FP64
,
ngraph
::
element
::
f64
},
...
@@ -42,6 +57,7 @@ typedef enum { /* nGraph support state on ops */
...
@@ -42,6 +57,7 @@ typedef enum { /* nGraph support state on ops */
PARTIAL_TEST
/* Support partial list of ops for test */
PARTIAL_TEST
/* Support partial list of ops for test */
}
op_state
;
}
op_state
;
// perform graph build through bridge and execute computation
class
NgraphOperator
{
class
NgraphOperator
{
public:
public:
explicit
NgraphOperator
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
,
explicit
NgraphOperator
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
,
...
@@ -59,13 +75,23 @@ class NgraphOperator {
...
@@ -59,13 +75,23 @@ class NgraphOperator {
persistables_
(
persist
),
persistables_
(
persist
),
fetches_
(
fetches
),
fetches_
(
fetches
),
post_op_inputs_
(
post_op_inputs
),
post_op_inputs_
(
post_op_inputs
),
ng_op_state_
(
ng_op_state
)
{}
ng_op_state_
(
ng_op_state
)
{
var_in_node_map_
=
std
::
make_shared
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
();
var_node_map_
=
std
::
make_shared
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
();
BuildNgIO
();
GetNgFunction
();
}
void
Run
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
;
void
Run
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
;
private:
private:
static
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Function
>>
static
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Function
>>
func_cache
;
func_cache
_
;
const
Scope
&
scope_
;
const
Scope
&
scope_
;
const
platform
::
Place
&
place_
;
const
platform
::
Place
&
place_
;
std
::
vector
<
std
::
shared_ptr
<
OperatorBase
>>
fused_ops_
;
std
::
vector
<
std
::
shared_ptr
<
OperatorBase
>>
fused_ops_
;
...
@@ -74,6 +100,35 @@ class NgraphOperator {
...
@@ -74,6 +100,35 @@ class NgraphOperator {
std
::
unordered_set
<
std
::
string
>
fetches_
;
std
::
unordered_set
<
std
::
string
>
fetches_
;
std
::
unordered_set
<
std
::
string
>
post_op_inputs_
;
std
::
unordered_set
<
std
::
string
>
post_op_inputs_
;
op_state
ng_op_state_
;
op_state
ng_op_state_
;
// ngraph backend eg. CPU
static
std
::
shared_ptr
<
ngraph
::
runtime
::
Backend
>
backend_
;
// ngraph function to call and execute
std
::
shared_ptr
<
ngraph
::
Function
>
ngraph_function_
;
// var_name of inputs
std
::
vector
<
std
::
string
>
var_in_
;
// var_name of outputs from fetch in order
std
::
vector
<
std
::
string
>
var_out_
;
// map input vars to nodes
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
var_in_node_map_
;
// map each var name with a ngraph node
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
var_node_map_
;
// cache key to check if function is cached
std
::
shared_ptr
<
std
::
string
>
GetCacheKey
();
// get ngraph input and define ngraph input parameters
void
GetNgInputShape
(
std
::
shared_ptr
<
OperatorBase
>
op
);
// Call ngraph bridge to map ops
void
BuildNgNodes
();
// get the ngraph input and output var list
void
BuildNgIO
();
// build ngraph function call
void
BuildNgFunction
();
// Check cache for ngraph function or otherwise build the function
void
GetNgFunction
();
};
};
std
::
vector
<
std
::
vector
<
std
::
vector
<
std
::
unique_ptr
<
OperatorBase
>>::
iterator
>>
std
::
vector
<
std
::
vector
<
std
::
vector
<
std
::
unique_ptr
<
OperatorBase
>>::
iterator
>>
...
@@ -86,7 +141,7 @@ FusedOperator::FusedOpIntervals(
...
@@ -86,7 +141,7 @@ FusedOperator::FusedOpIntervals(
}
}
size_t
size
=
ops
->
size
();
size_t
size
=
ops
->
size
();
size_t
left
=
0
;
size_t
left
=
0
;
while
(
left
<
size
&&
ops
.
at
(
left
)
->
Type
()
!=
kFeedOpType
)
{
while
(
left
<
size
&&
ops
->
at
(
left
)
->
Type
()
!=
kFeedOpType
)
{
++
left
;
++
left
;
}
}
if
(
left
==
size
)
{
if
(
left
==
size
)
{
...
@@ -116,7 +171,7 @@ FusedOperator::FusedOpIntervals(
...
@@ -116,7 +171,7 @@ FusedOperator::FusedOpIntervals(
size_t
start
=
pivot
,
end
=
start
;
size_t
start
=
pivot
,
end
=
start
;
while
(
pivot
<
right
&&
while
(
pivot
<
right
&&
(
paddle
::
framework
::
NgraphBridge
::
NG_NODE_MAP
.
find
(
(
paddle
::
framework
::
NgraphBridge
::
NG_NODE_MAP
.
find
(
ops
.
at
(
pivot
)
->
Type
())
!=
ops
->
at
(
pivot
)
->
Type
())
!=
paddle
::
framework
::
NgraphBridge
::
NG_NODE_MAP
.
end
()))
{
paddle
::
framework
::
NgraphBridge
::
NG_NODE_MAP
.
end
()))
{
++
pivot
;
++
pivot
;
++
end
;
++
end
;
...
@@ -136,7 +191,9 @@ FusedOperator::FusedOperator(
...
@@ -136,7 +191,9 @@ FusedOperator::FusedOperator(
std
::
vector
<
std
::
unique_ptr
<
OperatorBase
>>::
iterator
end
,
std
::
vector
<
std
::
unique_ptr
<
OperatorBase
>>::
iterator
end
,
const
std
::
string
&
type
,
const
VariableNameMap
&
inputs
,
const
std
::
string
&
type
,
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
AttributeMap
&
attrs
)
const
VariableNameMap
&
outputs
,
const
AttributeMap
&
attrs
)
:
OperatorBase
(
type
,
inputs
,
outputs
,
attrs
),
pdesc
(
prog
),
block
(
block_id
)
{
:
OperatorBase
(
type
,
inputs
,
outputs
,
attrs
),
pdesc_
(
prog
),
block_
(
block_id
)
{
for
(
std
::
vector
<
std
::
unique_ptr
<
OperatorBase
>>::
iterator
it
=
start
;
for
(
std
::
vector
<
std
::
unique_ptr
<
OperatorBase
>>::
iterator
it
=
start
;
it
!=
end
;
++
it
)
{
it
!=
end
;
++
it
)
{
fused_ops_
.
push_back
(
std
::
move
(
*
it
));
fused_ops_
.
push_back
(
std
::
move
(
*
it
));
...
@@ -152,7 +209,7 @@ FusedOperator::FusedOperator(
...
@@ -152,7 +209,7 @@ FusedOperator::FusedOperator(
}
}
if
((
*
(
start
-
1
))
->
Type
()
==
kFeedOpType
&&
(
*
end
)
->
Type
()
==
kFetchOpType
)
{
if
((
*
(
start
-
1
))
->
Type
()
==
kFeedOpType
&&
(
*
end
)
->
Type
()
==
kFetchOpType
)
{
is_
complete
=
true
;
is_
full_
=
true
;
}
}
Process
();
Process
();
...
@@ -205,7 +262,7 @@ void FusedOperator::RunImpl(const Scope& scope,
...
@@ -205,7 +262,7 @@ void FusedOperator::RunImpl(const Scope& scope,
}
}
}
}
if
(
is_full
)
{
if
(
is_full
_
)
{
ng_op_state
=
ng_op_state
==
PARTIAL_TEST
?
FULL_TEST
:
FULL_TRAIN
;
ng_op_state
=
ng_op_state
==
PARTIAL_TEST
?
FULL_TEST
:
FULL_TRAIN
;
}
}
...
@@ -215,6 +272,280 @@ void FusedOperator::RunImpl(const Scope& scope,
...
@@ -215,6 +272,280 @@ void FusedOperator::RunImpl(const Scope& scope,
ngraph_op
.
Run
(
scope
,
place
);
ngraph_op
.
Run
(
scope
,
place
);
}
}
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Function
>>
NgraphOperator
::
func_cache_
=
{};
std
::
shared_ptr
<
ngraph
::
runtime
::
Backend
>
NgraphOperator
::
backend_
=
ngraph
::
runtime
::
Backend
::
create
(
"CPU"
);
void
NgraphOperator
::
GetNgInputShape
(
std
::
shared_ptr
<
OperatorBase
>
op
)
{
op
->
RuntimeInferShape
(
scope_
,
place_
);
for
(
auto
&
var_name_item
:
op
->
Inputs
())
{
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
auto
*
var
=
scope_
.
FindVar
(
var_name
);
if
(
var
&&
var
->
IsType
<
LoDTensor
>
())
{
auto
*
tensor_pd
=
GetLoDTensorOrSelectedRowsValueFromVar
(
*
var
);
auto
sp
=
Ddim2Shape
(
tensor_pd
->
dims
());
if
(
std
::
find
(
var_in_
.
begin
(),
var_in_
.
end
(),
var_name
)
!=
var_in_
.
end
())
{
if
(
var_node_map_
->
find
(
var_name
)
==
var_node_map_
->
end
())
{
auto
ng_type
=
var_type_map_
.
at
(
var_name
);
auto
prm
=
std
::
make_shared
<
ngraph
::
op
::
Parameter
>
(
ng_type
,
sp
,
true
);
(
*
var_node_map_
)[
var_name
]
=
prm
;
(
*
var_in_node_map_
)[
var_name
]
=
prm
;
}
}
}
}
}
}
void
NgraphOperator
::
BuildNgNodes
()
{
for
(
auto
&
var_name
:
var_out_
)
{
if
(
var_node_map_
->
find
(
var_name
)
==
var_node_map_
->
end
())
{
auto
*
var
=
scope_
.
FindVar
(
var_name
);
if
(
var
&&
var
->
IsType
<
LoDTensor
>
())
{
auto
*
tensor_pd
=
GetLoDTensorOrSelectedRowsValueFromVar
(
*
var
);
auto
&
ddim
=
tensor_pd
->
dims
();
auto
ng_shape
=
Ddim2Shape
(
ddim
);
auto
ng_type
=
var_type_map_
.
at
(
var_name
);
auto
prm
=
std
::
make_shared
<
ngraph
::
op
::
Parameter
>
(
ng_type
,
ng_shape
,
true
);
(
*
var_node_map_
)[
var_name
]
=
prm
;
}
}
}
paddle
::
framework
::
NgraphBridge
ngb
(
var_node_map_
);
for
(
auto
&
op
:
fused_ops_
)
{
ngb
.
BuildNgNode
(
op
);
}
}
void
NgraphOperator
::
BuildNgIO
()
{
std
::
unordered_set
<
std
::
string
>
inputs
;
std
::
unordered_set
<
std
::
string
>
outputs
;
for
(
auto
&
op
:
fused_ops_
)
{
for
(
auto
&
var_name_item
:
op
->
Inputs
())
{
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
inputs
.
insert
(
var_name
);
const
bool
is_output
=
outputs
.
find
(
var_name
)
!=
outputs
.
end
();
if
(
!
is_output
&&
std
::
find
(
var_in_
.
begin
(),
var_in_
.
end
(),
var_name
)
==
var_in_
.
end
())
{
// fill var_in here to keep lhs and rhs order
var_in_
.
push_back
(
var_name
);
}
}
}
if
(
op
->
Type
()
!=
"fill_constant"
)
{
GetNgInputShape
(
op
);
}
for
(
auto
&
var_name_item
:
op
->
Outputs
())
{
PADDLE_ENFORCE_LE
(
var_name_item
.
second
.
size
(),
1
,
"op %s has more than 1 output - Not handling yet"
,
op
->
Type
());
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
outputs
.
insert
(
var_name
);
}
}
}
// var_out.clear();
for
(
auto
&
op
:
fused_ops_
)
{
for
(
auto
&
var_name_item
:
op
->
Outputs
())
{
PADDLE_ENFORCE_LE
(
var_name_item
.
second
.
size
(),
1
,
"op %s has more than 1 output - Not handling yet"
,
op
->
Type
());
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
switch
(
ng_op_state_
)
{
case
PARTIAL_TEST
:
if
(
post_op_inputs_
.
find
(
var_name
)
!=
post_op_inputs_
.
end
()
||
fetches_
.
find
(
var_name
)
!=
fetches_
.
end
())
{
var_out_
.
push_back
(
var_name
);
}
break
;
case
FULL_TEST
:
if
(
fetches_
.
find
(
var_name
)
!=
fetches_
.
end
())
{
var_out_
.
push_back
(
var_name
);
}
break
;
case
PARTIAL_TRAIN
:
if
(
fetches_
.
find
(
var_name
)
!=
fetches_
.
end
()
||
post_op_inputs_
.
find
(
var_name
)
!=
post_op_inputs_
.
end
()
||
persistables_
.
find
(
var_name
)
!=
persistables_
.
end
())
{
var_out_
.
push_back
(
var_name
);
}
break
;
case
FULL_TRAIN
:
if
(
fetches_
.
find
(
var_name
)
!=
fetches_
.
end
()
||
persistables_
.
find
(
var_name
)
!=
persistables_
.
end
())
{
var_out_
.
push_back
(
var_name
);
}
break
;
default:
var_out_
.
push_back
(
var_name
);
}
}
}
}
}
void
NgraphOperator
::
BuildNgFunction
()
{
BuildNgNodes
();
ngraph_function_
=
nullptr
;
ngraph
::
NodeVector
func_outputs
;
ngraph
::
op
::
ParameterVector
func_inputs
;
for
(
auto
&
vo
:
var_out_
)
{
func_outputs
.
push_back
(
var_node_map_
->
at
(
vo
));
}
for
(
auto
&
vi
:
var_in_
)
{
std
::
shared_ptr
<
ngraph
::
op
::
Parameter
>
prm
=
std
::
dynamic_pointer_cast
<
ngraph
::
op
::
Parameter
>
(
var_in_node_map_
->
at
(
vi
));
func_inputs
.
push_back
(
prm
);
}
ngraph_function_
=
std
::
make_shared
<
ngraph
::
Function
>
(
func_outputs
,
func_inputs
);
}
std
::
shared_ptr
<
std
::
string
>
NgraphOperator
::
GetCacheKey
()
{
auto
cache_key
=
std
::
make_shared
<
std
::
string
>
(
""
);
*
cache_key
+=
std
::
to_string
(
fused_ops_
.
size
());
for
(
auto
&
op
:
fused_ops_
)
{
*
cache_key
+=
op
->
Type
();
}
for
(
auto
&
var_name
:
var_in_
)
{
auto
shape
=
var_node_map_
->
at
(
var_name
)
->
get_shape
();
*
cache_key
+=
var_name
;
*
cache_key
+=
var_type_map_
.
at
(
var_name
).
c_type_string
();
for
(
size_t
i
=
0
;
i
<
shape
.
size
();
++
i
)
{
*
cache_key
+=
std
::
to_string
(
shape
.
at
(
i
));
}
}
for
(
auto
&
var_name
:
var_out_
)
{
auto
*
var
=
scope_
.
FindVar
(
var_name
);
if
(
var
&&
var
->
IsType
<
LoDTensor
>
())
{
auto
*
tensor_pd
=
GetLoDTensorOrSelectedRowsValueFromVar
(
*
var
);
auto
&
ddim
=
tensor_pd
->
dims
();
for
(
int
i
=
0
;
i
<
ddim
.
size
();
++
i
)
{
*
cache_key
+=
std
::
to_string
(
ddim
[
i
]);
}
}
}
return
cache_key
;
}
void
NgraphOperator
::
GetNgFunction
()
{
bool
cache_on
=
true
;
if
(
cache_on
)
{
std
::
string
cache_key_val
=
*
GetCacheKey
();
if
(
func_cache_
.
find
(
cache_key_val
)
!=
func_cache_
.
end
())
{
ngraph_function_
=
func_cache_
.
at
(
cache_key_val
);
}
else
{
BuildNgFunction
();
func_cache_
[
cache_key_val
]
=
ngraph_function_
;
}
}
else
{
BuildNgFunction
();
}
}
void
NgraphOperator
::
Run
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
{
std
::
vector
<
std
::
shared_ptr
<
ngraph
::
runtime
::
Tensor
>>
t_in
;
std
::
vector
<
std
::
shared_ptr
<
ngraph
::
runtime
::
Tensor
>>
t_out
;
for
(
size_t
i
=
0
;
i
<
var_in_
.
size
();
++
i
)
{
auto
vi
=
var_in_
.
at
(
i
);
auto
sp
=
var_node_map_
->
at
(
vi
)
->
get_shape
();
std
::
shared_ptr
<
ngraph
::
runtime
::
Tensor
>
ti
;
auto
*
var
=
scope
.
FindVar
(
vi
);
if
(
var
&&
var
->
IsType
<
LoDTensor
>
())
{
auto
*
tensor_pd
=
GetLoDTensorOrSelectedRowsValueFromVar
(
*
var
);
PADDLE_ENFORCE
(
sp
==
Ddim2Shape
(
tensor_pd
->
dims
()),
"Ensure ngraph tensor layout align with paddle tensor"
);
if
(
tensor_pd
->
type
().
hash_code
()
==
typeid
(
float
).
hash_code
())
{
// NOLINT
const
float
*
arr
=
tensor_pd
->
data
<
float
>
();
ti
=
backend_
->
create_tensor
(
ngraph
::
element
::
f32
,
sp
,
const_cast
<
float
*>
(
arr
));
}
else
if
(
tensor_pd
->
type
().
hash_code
()
==
typeid
(
int
).
hash_code
())
{
// NOLINT
const
int
*
arr
=
tensor_pd
->
data
<
int
>
();
ti
=
backend_
->
create_tensor
(
ngraph
::
element
::
i32
,
sp
,
const_cast
<
int
*>
(
arr
));
}
else
if
(
tensor_pd
->
type
().
hash_code
()
==
typeid
(
int64_t
).
hash_code
())
{
const
int64_t
*
arr
=
tensor_pd
->
data
<
int64_t
>
();
ti
=
backend_
->
create_tensor
(
ngraph
::
element
::
i64
,
sp
,
const_cast
<
int64_t
*>
(
arr
));
}
else
if
(
tensor_pd
->
type
().
hash_code
()
==
typeid
(
double
).
hash_code
())
{
// NOLINT
const
double
*
arr
=
tensor_pd
->
data
<
double
>
();
ti
=
backend_
->
create_tensor
(
ngraph
::
element
::
f64
,
sp
,
const_cast
<
double
*>
(
arr
));
}
else
if
(
tensor_pd
->
type
().
hash_code
()
==
typeid
(
bool
).
hash_code
())
{
// NOLINT
const
bool
*
arr
=
tensor_pd
->
data
<
bool
>
();
ti
=
backend_
->
create_tensor
(
ngraph
::
element
::
boolean
,
sp
,
const_cast
<
bool
*>
(
arr
));
}
else
{
PADDLE_THROW
(
"Data type not handling for var %s"
,
vi
);
}
}
else
{
PADDLE_THROW
(
"Cannot find var or tensor with var name %s"
,
vi
);
}
bool
is_test
=
(
ng_op_state_
==
PARTIAL_TEST
||
ng_op_state_
==
FULL_TEST
)
?
true
:
false
;
bool
is_persistable
=
(
persistables_
.
find
(
vi
)
!=
persistables_
.
end
())
?
true
:
false
;
if
(
is_test
&&
is_persistable
)
{
ti
->
set_stale
(
false
);
}
t_in
.
push_back
(
ti
);
}
for
(
size_t
i
=
0
;
i
<
var_out_
.
size
();
++
i
)
{
auto
var_name
=
var_out_
[
i
];
auto
*
var
=
scope
.
FindVar
(
var_name
);
std
::
shared_ptr
<
ngraph
::
runtime
::
Tensor
>
to
;
if
(
var
&&
var
->
IsType
<
LoDTensor
>
())
{
auto
*
tensor_pd
=
GetMutableLoDTensorOrSelectedRowsValueFromVar
(
var
);
auto
dd
=
tensor_pd
->
dims
();
ngraph
::
Shape
sp
=
Ddim2Shape
(
dd
);
auto
ng_type
=
var_type_map_
.
at
(
var_name
);
if
(
ng_type
==
ngraph
::
element
::
f32
)
{
auto
pd_arr
=
tensor_pd
->
mutable_data
<
float
>
(
place
);
to
=
backend_
->
create_tensor
(
ngraph
::
element
::
f32
,
sp
,
pd_arr
);
}
else
if
(
ng_type
==
ngraph
::
element
::
i64
)
{
auto
pd_arr
=
tensor_pd
->
mutable_data
<
int64_t
>
(
place
);
to
=
backend_
->
create_tensor
(
ngraph
::
element
::
i64
,
sp
,
pd_arr
);
}
else
if
(
ng_type
==
ngraph
::
element
::
f64
)
{
auto
pd_arr
=
tensor_pd
->
mutable_data
<
double
>
(
place
);
to
=
backend_
->
create_tensor
(
ngraph
::
element
::
f64
,
sp
,
pd_arr
);
}
else
if
(
ng_type
==
ngraph
::
element
::
boolean
)
{
auto
pd_arr
=
tensor_pd
->
mutable_data
<
bool
>
(
place
);
to
=
backend_
->
create_tensor
(
ngraph
::
element
::
boolean
,
sp
,
pd_arr
);
}
else
{
PADDLE_THROW
(
"Data type not handled in for var %s"
,
var_name
);
}
t_out
.
push_back
(
to
);
}
else
{
PADDLE_THROW
(
"Cannot find var or tensor with var name %s"
,
var_name
);
}
}
backend_
->
call
(
ngraph_function_
,
t_out
,
t_in
);
}
// NgraphOperator::RunImpl
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
#endif
#endif
paddle/fluid/framework/ngraph_operator.h
浏览文件 @
64ad051b
...
@@ -17,24 +17,19 @@ limitations under the License. */
...
@@ -17,24 +17,19 @@ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <algorithm>
#include <atomic>
#include <string>
#include <string>
#include <unordered_map>
#include <unordered_map>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/variant.h"
#include "paddle/fluid/platform/variant.h"
#include "ngraph/
ngraph
.hpp"
#include "ngraph/
type/element_type
.hpp"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
...
...
paddle/fluid/framework/op_info.h
浏览文件 @
64ad051b
...
@@ -31,6 +31,12 @@ class InferShapeBase {
...
@@ -31,6 +31,12 @@ class InferShapeBase {
virtual
void
operator
()(
InferShapeContext
*
)
const
=
0
;
virtual
void
operator
()(
InferShapeContext
*
)
const
=
0
;
};
};
class
EstimateFlopsBase
{
public:
virtual
~
EstimateFlopsBase
()
=
default
;
virtual
size_t
operator
()(
InferShapeContext
*
)
const
=
0
;
};
struct
OpInfo
{
struct
OpInfo
{
OpCreator
creator_
;
OpCreator
creator_
;
GradOpMakerFN
grad_op_maker_
;
GradOpMakerFN
grad_op_maker_
;
...
@@ -38,6 +44,7 @@ struct OpInfo {
...
@@ -38,6 +44,7 @@ struct OpInfo {
OpAttrChecker
*
checker_
{
nullptr
};
OpAttrChecker
*
checker_
{
nullptr
};
InferVarTypeFN
infer_var_type_
;
InferVarTypeFN
infer_var_type_
;
InferShapeFN
infer_shape_
;
InferShapeFN
infer_shape_
;
EstimateFlopsFN
estimate_flops_
;
bool
HasOpProtoAndChecker
()
const
{
bool
HasOpProtoAndChecker
()
const
{
return
proto_
!=
nullptr
&&
checker_
!=
nullptr
;
return
proto_
!=
nullptr
&&
checker_
!=
nullptr
;
...
...
paddle/fluid/framework/operator.cc
浏览文件 @
64ad051b
...
@@ -695,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name,
...
@@ -695,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name,
"Tensor %s contains NAN"
,
name
);
"Tensor %s contains NAN"
,
name
);
}
}
void
OperatorWithKernel
::
RuntimeInferShape
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
{
RuntimeInferShapeContext
infer_shape_ctx
(
*
this
,
scope
);
this
->
InferShape
(
&
infer_shape_ctx
);
}
void
OperatorWithKernel
::
RunImpl
(
const
Scope
&
scope
,
void
OperatorWithKernel
::
RunImpl
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
{
const
platform
::
Place
&
place
)
const
{
RuntimeInferShapeContext
infer_shape_ctx
(
*
this
,
scope
);
RuntimeInferShapeContext
infer_shape_ctx
(
*
this
,
scope
);
...
...
paddle/fluid/framework/operator.h
浏览文件 @
64ad051b
...
@@ -128,6 +128,8 @@ class OperatorBase {
...
@@ -128,6 +128,8 @@ class OperatorBase {
virtual
std
::
vector
<
std
::
string
>
OutputVars
(
bool
has_intermediate
)
const
;
virtual
std
::
vector
<
std
::
string
>
OutputVars
(
bool
has_intermediate
)
const
;
void
SetIsCalledByExecutor
(
bool
x
)
{
run_by_executor_
=
x
;
}
void
SetIsCalledByExecutor
(
bool
x
)
{
run_by_executor_
=
x
;
}
virtual
void
RuntimeInferShape
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
{}
protected:
protected:
std
::
string
type_
;
std
::
string
type_
;
...
@@ -348,6 +350,9 @@ class OperatorWithKernel : public OperatorBase {
...
@@ -348,6 +350,9 @@ class OperatorWithKernel : public OperatorBase {
OpInfoMap
::
Instance
().
Get
(
Type
()).
infer_shape_
(
ctx
);
OpInfoMap
::
Instance
().
Get
(
Type
()).
infer_shape_
(
ctx
);
}
}
void
RuntimeInferShape
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
override
;
protected:
protected:
virtual
OpKernelType
GetExpectedKernelType
(
const
ExecutionContext
&
ctx
)
const
;
virtual
OpKernelType
GetExpectedKernelType
(
const
ExecutionContext
&
ctx
)
const
;
virtual
OpKernelType
GetKernelTypeForVar
(
virtual
OpKernelType
GetKernelTypeForVar
(
...
...
paddle/fluid/framework/type_defs.h
浏览文件 @
64ad051b
...
@@ -54,5 +54,7 @@ using InferVarTypeFN =
...
@@ -54,5 +54,7 @@ using InferVarTypeFN =
using
InferShapeFN
=
std
::
function
<
void
(
InferShapeContext
*
)
>
;
using
InferShapeFN
=
std
::
function
<
void
(
InferShapeContext
*
)
>
;
using
EstimateFlopsFN
=
std
::
function
<
void
(
InferShapeContext
*
)
>
;
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
paddle/fluid/framework/variable_helper.cc
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/variable_helper.h"
#include <vector>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/place.h"
namespace
paddle
{
namespace
framework
{
void
InitializeVariable
(
Variable
*
var
,
proto
::
VarType
::
Type
var_type
)
{
if
(
var_type
==
proto
::
VarType
::
LOD_TENSOR
)
{
var
->
GetMutable
<
LoDTensor
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
SELECTED_ROWS
)
{
var
->
GetMutable
<
SelectedRows
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
FEED_MINIBATCH
)
{
var
->
GetMutable
<
FeedFetchList
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
FETCH_LIST
)
{
var
->
GetMutable
<
FeedFetchList
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
STEP_SCOPES
)
{
var
->
GetMutable
<
std
::
vector
<
framework
::
Scope
*>>
();
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_RANK_TABLE
)
{
var
->
GetMutable
<
LoDRankTable
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_TENSOR_ARRAY
)
{
var
->
GetMutable
<
LoDTensorArray
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
PLACE_LIST
)
{
var
->
GetMutable
<
platform
::
PlaceList
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
READER
)
{
var
->
GetMutable
<
ReaderHolder
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
RAW
)
{
// GetMutable will be called in operator
}
else
{
PADDLE_THROW
(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, RAW]"
,
var_type
);
}
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/variable_helper.h
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/variable.h"
namespace
paddle
{
namespace
framework
{
void
InitializeVariable
(
Variable
*
var
,
proto
::
VarType
::
Type
var_type
);
}
}
paddle/fluid/inference/analysis/analysis_pass.h
浏览文件 @
64ad051b
...
@@ -46,8 +46,6 @@ class AnalysisPass {
...
@@ -46,8 +46,6 @@ class AnalysisPass {
protected:
protected:
// User should implement these.
// User should implement these.
virtual
void
RunImpl
(
Argument
*
argument
)
=
0
;
virtual
void
RunImpl
(
Argument
*
argument
)
=
0
;
Argument
*
argument_
{
nullptr
};
};
};
}
// namespace analysis
}
// namespace analysis
...
...
paddle/fluid/inference/analysis/passes/CMakeLists.txt
浏览文件 @
64ad051b
cc_library
(
ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager
)
cc_library
(
ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager
)
cc_library
(
ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager
)
cc_library
(
ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager
)
cc_library
(
analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass
)
cc_library
(
ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager
)
cc_library
(
analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass
)
set
(
analysis_deps
${
analysis_deps
}
set
(
analysis_deps
${
analysis_deps
}
ir_graph_build_pass
ir_graph_build_pass
...
...
paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
浏览文件 @
64ad051b
...
@@ -61,6 +61,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
...
@@ -61,6 +61,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
void
IrAnalysisComposePass
::
ApplyIrPasses
(
Argument
*
argument
)
{
void
IrAnalysisComposePass
::
ApplyIrPasses
(
Argument
*
argument
)
{
std
::
vector
<
std
::
string
>
passes
({
std
::
vector
<
std
::
string
>
passes
({
"ir_graph_build_pass"
,
"ir_analysis_pass"
,
"ir_graph_build_pass"
,
"ir_analysis_pass"
,
"ir_params_sync_among_devices_pass"
,
});
});
for
(
const
auto
&
pass
:
passes
)
{
for
(
const
auto
&
pass
:
passes
)
{
VLOG
(
2
)
<<
"Run pass "
<<
pass
;
VLOG
(
2
)
<<
"Run pass "
<<
pass
;
...
...
paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
浏览文件 @
64ad051b
...
@@ -36,12 +36,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
...
@@ -36,12 +36,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
// so that the parameters will on the same device, or they will keep copying
// so that the parameters will on the same device, or they will keep copying
// between difference devices.
// between difference devices.
platform
::
Place
place
;
platform
::
Place
place
;
if
(
argument
->
use_gpu
())
{
place
=
platform
::
CPUPlace
();
PADDLE_ENFORCE
(
argument
->
gpu_device_id_valid
());
place
=
platform
::
CUDAPlace
(
argument
->
gpu_device_id
());
}
else
{
place
=
platform
::
CPUPlace
();
}
if
(
argument
->
model_dir_valid
())
{
if
(
argument
->
model_dir_valid
())
{
auto
program
=
auto
program
=
...
...
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
0 → 100644
浏览文件 @
64ad051b
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
void
IrParamsSyncAmongDevicesPass
::
RunImpl
(
Argument
*
argument
)
{
PADDLE_ENFORCE
(
argument
->
scope_valid
());
PADDLE_ENFORCE
(
argument
->
use_gpu_valid
());
platform
::
Place
place
;
// The parameters are on the cpu, therefore, synchronization is not necessary.
if
(
!
argument
->
use_gpu
())
return
;
LOG
(
INFO
)
<<
"Sync params from CPU to GPU"
;
PADDLE_ENFORCE
(
argument
->
gpu_device_id_valid
());
place
=
platform
::
CUDAPlace
(
argument
->
gpu_device_id
());
auto
*
scope
=
argument
->
scope_ptr
();
std
::
vector
<
std
::
string
>
all_vars
=
scope
->
LocalVarNames
();
// We get all the vars from local_scope instead of the ProgramDesc.
// Because there exists the case that new parameter variables are not added to
// the program in the analysis pass.
for
(
auto
&
var_name
:
all_vars
)
{
auto
*
var
=
scope
->
FindLocalVar
(
var_name
);
PADDLE_ENFORCE
(
var
!=
nullptr
);
if
(
var
->
IsType
<
framework
::
LoDTensor
>
()
||
var
->
IsType
<
framework
::
Tensor
>
())
{
auto
*
t
=
var
->
GetMutable
<
framework
::
LoDTensor
>
();
platform
::
CPUPlace
cpu_place
;
framework
::
LoDTensor
temp_tensor
;
temp_tensor
.
Resize
(
t
->
dims
());
temp_tensor
.
mutable_data
<
float
>
(
cpu_place
);
// Copy the parameter data to a tmp tensor.
TensorCopySync
(
*
t
,
cpu_place
,
&
temp_tensor
);
// Reallocation the space on GPU
t
->
mutable_data
<
float
>
(
place
);
// Copy parameter data to newly allocated GPU space.
TensorCopySync
(
temp_tensor
,
place
,
t
);
}
}
}
std
::
string
IrParamsSyncAmongDevicesPass
::
repr
()
const
{
return
"ir-params-sync-among-devices-pass"
;
}
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
0 → 100644
浏览文件 @
64ad051b
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/place.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
/*
* Sync parameter from CPU to GPU.
*/
class
IrParamsSyncAmongDevicesPass
:
public
AnalysisPass
{
public:
void
RunImpl
(
Argument
*
argument
)
override
;
std
::
string
repr
()
const
override
;
};
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/analysis/passes/passes.cc
浏览文件 @
64ad051b
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
...
@@ -27,6 +28,9 @@ PassRegistry::PassRegistry() {
...
@@ -27,6 +28,9 @@ PassRegistry::PassRegistry() {
std
::
unique_ptr
<
AnalysisPass
>
(
new
IrGraphBuildPass
));
std
::
unique_ptr
<
AnalysisPass
>
(
new
IrGraphBuildPass
));
passes_
.
emplace
(
"ir_analysis_compose_pass"
,
passes_
.
emplace
(
"ir_analysis_compose_pass"
,
std
::
unique_ptr
<
AnalysisPass
>
(
new
IrAnalysisComposePass
));
std
::
unique_ptr
<
AnalysisPass
>
(
new
IrAnalysisComposePass
));
passes_
.
emplace
(
"ir_params_sync_among_devices_pass"
,
std
::
unique_ptr
<
AnalysisPass
>
(
new
IrParamsSyncAmongDevicesPass
));
}
}
}
// namespace analysis
}
// namespace analysis
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
64ad051b
...
@@ -190,9 +190,13 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
...
@@ -190,9 +190,13 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
}
}
VLOG
(
3
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
VLOG
(
3
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
// Fix TensorArray reuse not cleaned bug.
// All the containers in the scope will be hold in inference, but the
tensor_array_batch_cleaner_
.
CollectTensorArrays
(
scope_
.
get
());
// operators assume that the container will be reset after each batch.
tensor_array_batch_cleaner_
.
ResetTensorArray
();
// Here is a bugfix, collect all the container variables, and reset then to a
// bool; the next time, the operator will call MutableData and construct a new
// container again, so that the container will be empty for each batch.
tensor_array_batch_cleaner_
.
CollectNoTensorVars
(
sub_scope_
);
tensor_array_batch_cleaner_
.
ResetNoTensorVars
();
return
true
;
return
true
;
}
}
...
@@ -417,7 +421,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
...
@@ -417,7 +421,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
bool
AnalysisPredictor
::
ZeroCopyRun
()
{
bool
AnalysisPredictor
::
ZeroCopyRun
()
{
executor_
->
Run
();
executor_
->
Run
();
// Fix TensorArray reuse not cleaned bug.
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_
.
CollectTensorArrays
(
s
cope_
.
get
()
);
tensor_array_batch_cleaner_
.
CollectTensorArrays
(
s
ub_scope_
);
tensor_array_batch_cleaner_
.
ResetTensorArray
();
tensor_array_batch_cleaner_
.
ResetTensorArray
();
return
true
;
return
true
;
}
}
...
...
paddle/fluid/inference/api/api_impl.cc
浏览文件 @
64ad051b
...
@@ -154,9 +154,9 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
...
@@ -154,9 +154,9 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
}
}
VLOG
(
3
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
VLOG
(
3
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
// F
ix TensorArray reuse not cleaned bug
.
// F
or some other vector like containers not cleaned after each batch
.
tensor_array_batch_cleaner_
.
Collect
TensorArray
s
(
scope_
.
get
());
tensor_array_batch_cleaner_
.
Collect
NoTensorVar
s
(
scope_
.
get
());
tensor_array_batch_cleaner_
.
Reset
TensorArray
();
tensor_array_batch_cleaner_
.
Reset
NoTensorVars
();
return
true
;
return
true
;
}
}
...
...
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
浏览文件 @
64ad051b
...
@@ -79,6 +79,16 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
...
@@ -79,6 +79,16 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
link_directories
(
"
${
PADDLE_LIB
}
/third_party/install/xxhash/lib"
)
link_directories
(
"
${
PADDLE_LIB
}
/third_party/install/xxhash/lib"
)
link_directories
(
"
${
PADDLE_LIB
}
/paddle/lib"
)
link_directories
(
"
${
PADDLE_LIB
}
/paddle/lib"
)
if
(
NOT WIN32
)
set
(
NGRAPH_PATH
"
${
PADDLE_LIB
}
/third_party/install/ngraph"
)
if
(
EXISTS
${
NGRAPH_PATH
}
)
include
(
GNUInstallDirs
)
include_directories
(
"
${
NGRAPH_PATH
}
/include"
)
link_directories
(
"
${
NGRAPH_PATH
}
/
${
CMAKE_INSTALL_LIBDIR
}
"
)
set
(
NGRAPH_LIB
${
NGRAPH_PATH
}
/
${
CMAKE_INSTALL_LIBDIR
}
/libngraph
${
CMAKE_SHARED_LIBRARY_SUFFIX
}
)
endif
()
endif
()
add_executable
(
${
DEMO_NAME
}
${
DEMO_NAME
}
.cc
)
add_executable
(
${
DEMO_NAME
}
${
DEMO_NAME
}
.cc
)
if
(
WITH_MKL
)
if
(
WITH_MKL
)
...
@@ -106,7 +116,7 @@ endif()
...
@@ -106,7 +116,7 @@ endif()
if
(
NOT WIN32
)
if
(
NOT WIN32
)
set
(
EXTERNAL_LIB
"-lrt -ldl -lpthread"
)
set
(
EXTERNAL_LIB
"-lrt -ldl -lpthread"
)
set
(
DEPS
${
DEPS
}
set
(
DEPS
${
DEPS
}
${
MATH_LIB
}
${
MKLDNN_LIB
}
${
MATH_LIB
}
${
MKLDNN_LIB
}
${
NGRAPH_LIB
}
glog gflags protobuf snappystream snappy z xxhash
glog gflags protobuf snappystream snappy z xxhash
${
EXTERNAL_LIB
}
)
${
EXTERNAL_LIB
}
)
else
()
else
()
...
...
paddle/fluid/inference/api/details/reset_tensor_array.cc
浏览文件 @
64ad051b
...
@@ -46,5 +46,28 @@ void TensorArrayBatchCleaner::ResetTensorArray() {
...
@@ -46,5 +46,28 @@ void TensorArrayBatchCleaner::ResetTensorArray() {
}
}
}
}
void
TensorArrayBatchCleaner
::
CollectNoTensorVars
(
framework
::
Scope
*
scope
)
{
if
(
no_tensor_flag_
)
{
for
(
auto
&
var_name
:
scope
->
LocalVarNames
())
{
auto
*
var
=
scope
->
FindVar
(
var_name
);
if
(
!
var
->
IsInitialized
())
continue
;
if
(
!
valid_types_
.
count
(
var
->
Type
()))
{
no_tensor_vars_
.
insert
(
var
);
}
}
for
(
auto
*
kid
:
scope
->
kids
())
{
CollectTensorArrays
(
kid
);
}
no_tensor_flag_
=
false
;
// Only collect one time.
}
}
void
TensorArrayBatchCleaner
::
ResetNoTensorVars
()
{
for
(
auto
*
var
:
no_tensor_vars_
)
{
var
->
Clear
();
}
}
}
// namespace details
}
// namespace details
}
// namespace paddle
}
// namespace paddle
paddle/fluid/inference/api/details/reset_tensor_array.h
浏览文件 @
64ad051b
...
@@ -14,9 +14,11 @@
...
@@ -14,9 +14,11 @@
#pragma once
#pragma once
#include <unordered_set>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
namespace
paddle
{
namespace
paddle
{
namespace
details
{
namespace
details
{
...
@@ -24,13 +26,28 @@ namespace details {
...
@@ -24,13 +26,28 @@ namespace details {
// Clean the TensorArray each batch to make the behavior the same with the
// Clean the TensorArray each batch to make the behavior the same with the
// training phase.
// training phase.
struct
TensorArrayBatchCleaner
{
struct
TensorArrayBatchCleaner
{
TensorArrayBatchCleaner
()
{
valid_types_
.
insert
(
typeid
(
framework
::
Tensor
));
valid_types_
.
insert
(
typeid
(
framework
::
LoDTensor
));
}
// Collect the variables that are not Tensor or LoDTensor, and reset them to a
// bool(trick), because some of them are containers, and some operators just
// keep inserting new items without clearing the containers first; So the
// memory grow larger and larger in inference service deployed online.
void
CollectNoTensorVars
(
framework
::
Scope
*
scope
);
void
ResetNoTensorVars
();
// Fix the tensor array not clear in the inference scenarios.
// Fix the tensor array not clear in the inference scenarios.
void
CollectTensorArrays
(
framework
::
Scope
*
scope
);
void
CollectTensorArrays
(
framework
::
Scope
*
scope
);
void
ResetTensorArray
();
void
ResetTensorArray
();
private:
private:
bool
flag_
{
true
};
bool
flag_
{
true
};
bool
no_tensor_flag_
{
true
};
std
::
vector
<
framework
::
LoDTensorArray
*>
arrays_
;
std
::
vector
<
framework
::
LoDTensorArray
*>
arrays_
;
std
::
unordered_set
<
std
::
type_index
>
valid_types_
;
std
::
unordered_set
<
framework
::
Variable
*>
no_tensor_vars_
;
};
};
}
// namespace details
}
// namespace details
...
...
paddle/fluid/inference/api/paddle_pass_builder.h
浏览文件 @
64ad051b
...
@@ -116,12 +116,8 @@ class CpuPassStrategy : public PassStrategy {
...
@@ -116,12 +116,8 @@ class CpuPassStrategy : public PassStrategy {
class
GpuPassStrategy
:
public
PassStrategy
{
class
GpuPassStrategy
:
public
PassStrategy
{
public:
public:
GpuPassStrategy
()
:
PassStrategy
({})
{
GpuPassStrategy
()
:
PassStrategy
({})
{
// TODO(NHZlX) Problem with Data synchronization between GPU and CPU
// When running in GPU mode, the parameters are all on GPU. But the
// opearations of "conv_bn_fuse_pass" are on CPU.
passes_
.
assign
({
passes_
.
assign
({
"infer_clean_graph_pass"
,
"infer_clean_graph_pass"
,
"conv_bn_fuse_pass"
,
// "infer_clean_graph_pass", "conv_bn_fuse_pass",
});
});
}
}
...
...
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
64ad051b
...
@@ -46,11 +46,18 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
...
@@ -46,11 +46,18 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
download_model_and_data
(
${
RNN2_INSTALL_DIR
}
"rnn2_model.tar.gz"
"rnn2_data.txt.tar.gz"
)
download_model_and_data
(
${
RNN2_INSTALL_DIR
}
"rnn2_model.tar.gz"
"rnn2_data.txt.tar.gz"
)
inference_analysis_api_test
(
test_analyzer_rnn2
${
RNN2_INSTALL_DIR
}
analyzer_rnn2_tester.cc
)
inference_analysis_api_test
(
test_analyzer_rnn2
${
RNN2_INSTALL_DIR
}
analyzer_rnn2_tester.cc
)
# DAM
#
normal
DAM
set
(
DAM_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/dam"
)
set
(
DAM_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/dam"
)
download_model_and_data
(
${
DAM_INSTALL_DIR
}
"DAM_model.tar.gz"
"DAM_data.txt.tar.gz"
)
download_model_and_data
(
${
DAM_INSTALL_DIR
}
"DAM_model.tar.gz"
"DAM_data.txt.tar.gz"
)
inference_analysis_api_test
(
test_analyzer_dam
${
DAM_INSTALL_DIR
}
analyzer_dam_tester.cc
)
inference_analysis_api_test
(
test_analyzer_dam
${
DAM_INSTALL_DIR
}
analyzer_dam_tester.cc
)
# small DAM
set
(
DAM_SMALL_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/small_dam"
)
download_model_and_data
(
${
DAM_SMALL_INSTALL_DIR
}
"dam_small_model.tar.gz"
"dam_small_data.txt.tar.gz"
)
inference_analysis_test
(
test_analyzer_small_dam SRCS analyzer_dam_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
DAM_SMALL_INSTALL_DIR
}
/model --infer_data=
${
DAM_SMALL_INSTALL_DIR
}
/data.txt --max_turn_num=1
)
# chinese_ner
# chinese_ner
set
(
CHINESE_NER_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/chinese_ner"
)
set
(
CHINESE_NER_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/chinese_ner"
)
download_model_and_data
(
${
CHINESE_NER_INSTALL_DIR
}
"chinese_ner_model.tar.gz"
"chinese_ner-data.txt.tar.gz"
)
download_model_and_data
(
${
CHINESE_NER_INSTALL_DIR
}
"chinese_ner_model.tar.gz"
"chinese_ner-data.txt.tar.gz"
)
...
...
paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
浏览文件 @
64ad051b
...
@@ -14,38 +14,54 @@
...
@@ -14,38 +14,54 @@
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_int32
(
max_turn_num
,
9
,
"The max turn number: 1 for the small and 9 for the normal."
);
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
using
contrib
::
AnalysisConfig
;
using
contrib
::
AnalysisConfig
;
#define MAX_TURN_NUM 9
#define MAX_TURN_LEN 50
constexpr
int32_t
kMaxTurnLen
=
50
;
static
std
::
vector
<
float
>
result_data
;
static
std
::
vector
<
float
>
result_data
;
struct
DataRecord
{
struct
DataRecord
{
std
::
vector
<
std
::
vector
<
int64_t
>>
std
::
vector
<
std
::
vector
<
int64_t
>>
*
turns
;
turns
[
MAX_TURN_NUM
];
// turns data : MAX_TURN_NUM
std
::
vector
<
std
::
vector
<
float
>>
*
turns_mask
;
std
::
vector
<
std
::
vector
<
float
>>
std
::
vector
<
std
::
vector
<
int64_t
>>
response
;
// response data : 1
turns_mask
[
MAX_TURN_NUM
];
// turns mask data : MAX_TURN_NUM
std
::
vector
<
std
::
vector
<
int64_t
>>
response
;
// response data : 1
std
::
vector
<
std
::
vector
<
float
>>
response_mask
;
// response mask data : 1
std
::
vector
<
std
::
vector
<
float
>>
response_mask
;
// response mask data : 1
size_t
batch_iter
{
0
};
size_t
batch_iter
{
0
};
size_t
batch_size
{
1
};
size_t
batch_size
{
1
};
size_t
num_samples
;
// total number of samples
size_t
num_samples
;
// total number of samples
DataRecord
()
=
default
;
DataRecord
()
{
turns
=
new
std
::
vector
<
std
::
vector
<
int64_t
>>
[
FLAGS_max_turn_num
];
// turns data : FLAGS_max_turn_num
turns_mask
=
new
std
::
vector
<
std
::
vector
<
float
>>
[
FLAGS_max_turn_num
];
// turns mask data : FLAGS_max_turn_num
}
explicit
DataRecord
(
const
std
::
string
&
path
,
int
batch_size
=
1
)
explicit
DataRecord
(
const
std
::
string
&
path
,
int
batch_size
=
1
)
:
batch_size
(
batch_size
)
{
:
DataRecord
()
{
this
->
batch_size
=
batch_size
;
Load
(
path
);
Load
(
path
);
}
}
~
DataRecord
()
{
delete
[]
turns
;
delete
[]
turns_mask
;
}
DataRecord
NextBatch
()
{
DataRecord
NextBatch
()
{
DataRecord
data
;
DataRecord
data
;
size_t
batch_end
=
batch_iter
+
batch_size
;
size_t
batch_end
=
batch_iter
+
batch_size
;
// NOTE skip the final batch, if no enough data is provided.
// NOTE skip the final batch, if no enough data is provided.
if
(
batch_end
<=
response
.
size
())
{
if
(
batch_end
<=
response
.
size
())
{
for
(
int
i
=
0
;
i
<
MAX_TURN_NUM
;
++
i
)
{
for
(
int
i
=
0
;
i
<
FLAGS_max_turn_num
;
++
i
)
{
data
.
turns
[
i
].
assign
(
turns
[
i
].
begin
()
+
batch_iter
,
data
.
turns
[
i
].
assign
(
turns
[
i
].
begin
()
+
batch_iter
,
turns
[
i
].
begin
()
+
batch_end
);
turns
[
i
].
begin
()
+
batch_end
);
}
}
for
(
int
i
=
0
;
i
<
MAX_TURN_NUM
;
++
i
)
{
for
(
int
i
=
0
;
i
<
FLAGS_max_turn_num
;
++
i
)
{
data
.
turns_mask
[
i
].
assign
(
turns_mask
[
i
].
begin
()
+
batch_iter
,
data
.
turns_mask
[
i
].
assign
(
turns_mask
[
i
].
begin
()
+
batch_iter
,
turns_mask
[
i
].
begin
()
+
batch_end
);
turns_mask
[
i
].
begin
()
+
batch_end
);
}
}
...
@@ -60,6 +76,7 @@ struct DataRecord {
...
@@ -60,6 +76,7 @@ struct DataRecord {
batch_iter
+=
batch_size
;
batch_iter
+=
batch_size
;
return
data
;
return
data
;
}
}
void
Load
(
const
std
::
string
&
path
)
{
void
Load
(
const
std
::
string
&
path
)
{
std
::
ifstream
file
(
path
);
std
::
ifstream
file
(
path
);
std
::
string
line
;
std
::
string
line
;
...
@@ -69,30 +86,30 @@ struct DataRecord {
...
@@ -69,30 +86,30 @@ struct DataRecord {
num_lines
++
;
num_lines
++
;
std
::
vector
<
std
::
string
>
data
;
std
::
vector
<
std
::
string
>
data
;
split
(
line
,
','
,
&
data
);
split
(
line
,
','
,
&
data
);
CHECK_EQ
(
data
.
size
(),
(
size_t
)(
2
*
MAX_TURN_NUM
+
3
));
CHECK_EQ
(
data
.
size
(),
(
size_t
)(
2
*
FLAGS_max_turn_num
+
3
));
// load turn data
// load turn data
std
::
vector
<
int64_t
>
turns_tmp
[
MAX_TURN_NUM
];
std
::
vector
<
int64_t
>
turns_tmp
[
FLAGS_max_turn_num
];
for
(
int
i
=
0
;
i
<
MAX_TURN_NUM
;
++
i
)
{
for
(
int
i
=
0
;
i
<
FLAGS_max_turn_num
;
++
i
)
{
split_to_int64
(
data
[
i
],
' '
,
&
turns_tmp
[
i
]);
split_to_int64
(
data
[
i
],
' '
,
&
turns_tmp
[
i
]);
turns
[
i
].
push_back
(
std
::
move
(
turns_tmp
[
i
]));
turns
[
i
].
push_back
(
std
::
move
(
turns_tmp
[
i
]));
}
}
// load turn_mask data
// load turn_mask data
std
::
vector
<
float
>
turns_mask_tmp
[
MAX_TURN_NUM
];
std
::
vector
<
float
>
turns_mask_tmp
[
FLAGS_max_turn_num
];
for
(
int
i
=
0
;
i
<
MAX_TURN_NUM
;
++
i
)
{
for
(
int
i
=
0
;
i
<
FLAGS_max_turn_num
;
++
i
)
{
split_to_float
(
data
[
MAX_TURN_NUM
+
i
],
' '
,
&
turns_mask_tmp
[
i
]);
split_to_float
(
data
[
FLAGS_max_turn_num
+
i
],
' '
,
&
turns_mask_tmp
[
i
]);
turns_mask
[
i
].
push_back
(
std
::
move
(
turns_mask_tmp
[
i
]));
turns_mask
[
i
].
push_back
(
std
::
move
(
turns_mask_tmp
[
i
]));
}
}
// load response data
// load response data
std
::
vector
<
int64_t
>
response_tmp
;
std
::
vector
<
int64_t
>
response_tmp
;
split_to_int64
(
data
[
2
*
MAX_TURN_NUM
],
' '
,
&
response_tmp
);
split_to_int64
(
data
[
2
*
FLAGS_max_turn_num
],
' '
,
&
response_tmp
);
response
.
push_back
(
std
::
move
(
response_tmp
));
response
.
push_back
(
std
::
move
(
response_tmp
));
// load response_mask data
// load response_mask data
std
::
vector
<
float
>
response_mask_tmp
;
std
::
vector
<
float
>
response_mask_tmp
;
split_to_float
(
data
[
2
*
MAX_TURN_NUM
+
1
],
' '
,
&
response_mask_tmp
);
split_to_float
(
data
[
2
*
FLAGS_max_turn_num
+
1
],
' '
,
&
response_mask_tmp
);
response_mask
.
push_back
(
std
::
move
(
response_mask_tmp
));
response_mask
.
push_back
(
std
::
move
(
response_mask_tmp
));
// load result data
// load result data
float
result_tmp
;
float
result_tmp
;
result_tmp
=
std
::
stof
(
data
[
2
*
MAX_TURN_NUM
+
2
]);
result_tmp
=
std
::
stof
(
data
[
2
*
FLAGS_max_turn_num
+
2
]);
result_data
.
push_back
(
result_tmp
);
result_data
.
push_back
(
result_tmp
);
}
}
num_samples
=
num_lines
;
num_samples
=
num_lines
;
...
@@ -101,8 +118,8 @@ struct DataRecord {
...
@@ -101,8 +118,8 @@ struct DataRecord {
void
PrepareInputs
(
std
::
vector
<
PaddleTensor
>
*
input_slots
,
DataRecord
*
data
,
void
PrepareInputs
(
std
::
vector
<
PaddleTensor
>
*
input_slots
,
DataRecord
*
data
,
int
batch_size
)
{
int
batch_size
)
{
PaddleTensor
turns_tensor
[
MAX_TURN_NUM
];
PaddleTensor
turns_tensor
[
FLAGS_max_turn_num
];
PaddleTensor
turns_mask_tensor
[
MAX_TURN_NUM
];
PaddleTensor
turns_mask_tensor
[
FLAGS_max_turn_num
];
PaddleTensor
response_tensor
;
PaddleTensor
response_tensor
;
PaddleTensor
response_mask_tensor
;
PaddleTensor
response_mask_tensor
;
std
::
string
turn_pre
=
"turn_"
;
std
::
string
turn_pre
=
"turn_"
;
...
@@ -110,16 +127,16 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
...
@@ -110,16 +127,16 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
auto
one_batch
=
data
->
NextBatch
();
auto
one_batch
=
data
->
NextBatch
();
int
size
=
one_batch
.
response
[
0
].
size
();
int
size
=
one_batch
.
response
[
0
].
size
();
CHECK_EQ
(
size
,
MAX_TURN_LEN
);
CHECK_EQ
(
size
,
kMaxTurnLen
);
// turn tensor assignment
// turn tensor assignment
for
(
int
i
=
0
;
i
<
MAX_TURN_NUM
;
++
i
)
{
for
(
int
i
=
0
;
i
<
FLAGS_max_turn_num
;
++
i
)
{
turns_tensor
[
i
].
name
=
turn_pre
+
std
::
to_string
(
i
);
turns_tensor
[
i
].
name
=
turn_pre
+
std
::
to_string
(
i
);
turns_tensor
[
i
].
shape
.
assign
({
batch_size
,
size
,
1
});
turns_tensor
[
i
].
shape
.
assign
({
batch_size
,
size
,
1
});
turns_tensor
[
i
].
dtype
=
PaddleDType
::
INT64
;
turns_tensor
[
i
].
dtype
=
PaddleDType
::
INT64
;
TensorAssignData
<
int64_t
>
(
&
turns_tensor
[
i
],
one_batch
.
turns
[
i
]);
TensorAssignData
<
int64_t
>
(
&
turns_tensor
[
i
],
one_batch
.
turns
[
i
]);
}
}
// turn mask tensor assignment
// turn mask tensor assignment
for
(
int
i
=
0
;
i
<
MAX_TURN_NUM
;
++
i
)
{
for
(
int
i
=
0
;
i
<
FLAGS_max_turn_num
;
++
i
)
{
turns_mask_tensor
[
i
].
name
=
turn_mask_pre
+
std
::
to_string
(
i
);
turns_mask_tensor
[
i
].
name
=
turn_mask_pre
+
std
::
to_string
(
i
);
turns_mask_tensor
[
i
].
shape
.
assign
({
batch_size
,
size
,
1
});
turns_mask_tensor
[
i
].
shape
.
assign
({
batch_size
,
size
,
1
});
turns_mask_tensor
[
i
].
dtype
=
PaddleDType
::
FLOAT32
;
turns_mask_tensor
[
i
].
dtype
=
PaddleDType
::
FLOAT32
;
...
@@ -137,10 +154,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
...
@@ -137,10 +154,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
TensorAssignData
<
float
>
(
&
response_mask_tensor
,
one_batch
.
response_mask
);
TensorAssignData
<
float
>
(
&
response_mask_tensor
,
one_batch
.
response_mask
);
// Set inputs.
// Set inputs.
for
(
int
i
=
0
;
i
<
MAX_TURN_NUM
;
++
i
)
{
for
(
int
i
=
0
;
i
<
FLAGS_max_turn_num
;
++
i
)
{
input_slots
->
push_back
(
std
::
move
(
turns_tensor
[
i
]));
input_slots
->
push_back
(
std
::
move
(
turns_tensor
[
i
]));
}
}
for
(
int
i
=
0
;
i
<
MAX_TURN_NUM
;
++
i
)
{
for
(
int
i
=
0
;
i
<
FLAGS_max_turn_num
;
++
i
)
{
input_slots
->
push_back
(
std
::
move
(
turns_mask_tensor
[
i
]));
input_slots
->
push_back
(
std
::
move
(
turns_mask_tensor
[
i
]));
}
}
input_slots
->
push_back
(
std
::
move
(
response_tensor
));
input_slots
->
push_back
(
std
::
move
(
response_tensor
));
...
@@ -202,8 +219,6 @@ TEST(Analyzer_dam, fuse_statis) {
...
@@ -202,8 +219,6 @@ TEST(Analyzer_dam, fuse_statis) {
auto
fuse_statis
=
GetFuseStatis
(
auto
fuse_statis
=
GetFuseStatis
(
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
()),
&
num_ops
);
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
()),
&
num_ops
);
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
EXPECT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
317
);
EXPECT_EQ
(
num_ops
,
2020
);
}
}
// Compare result of NativeConfig and AnalysisConfig
// Compare result of NativeConfig and AnalysisConfig
...
...
paddle/fluid/inference/utils/benchmark.cc
浏览文件 @
64ad051b
...
@@ -33,7 +33,7 @@ std::string Benchmark::SerializeToString() const {
...
@@ -33,7 +33,7 @@ std::string Benchmark::SerializeToString() const {
ss
<<
batch_size_
<<
"
\t
"
;
ss
<<
batch_size_
<<
"
\t
"
;
ss
<<
num_threads_
<<
"
\t
"
;
ss
<<
num_threads_
<<
"
\t
"
;
ss
<<
latency_
<<
"
\t
"
;
ss
<<
latency_
<<
"
\t
"
;
ss
<<
1000
/
latency_
;
ss
<<
1000
.0
/
latency_
;
ss
<<
'\n'
;
ss
<<
'\n'
;
return
ss
.
str
();
return
ss
.
str
();
}
}
...
...
paddle/fluid/inference/utils/benchmark.h
浏览文件 @
64ad051b
...
@@ -11,9 +11,11 @@
...
@@ -11,9 +11,11 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#pragma once
#include <fstream>
#include <fstream>
#include <iostream>
#include <iostream>
#include <string>
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
...
@@ -31,8 +33,8 @@ struct Benchmark {
...
@@ -31,8 +33,8 @@ struct Benchmark {
bool
use_gpu
()
const
{
return
use_gpu_
;
}
bool
use_gpu
()
const
{
return
use_gpu_
;
}
void
SetUseGpu
()
{
use_gpu_
=
true
;
}
void
SetUseGpu
()
{
use_gpu_
=
true
;
}
in
t
latency
()
const
{
return
latency_
;
}
floa
t
latency
()
const
{
return
latency_
;
}
void
SetLatency
(
in
t
x
)
{
latency_
=
x
;
}
void
SetLatency
(
floa
t
x
)
{
latency_
=
x
;
}
const
std
::
string
&
name
()
const
{
return
name_
;
}
const
std
::
string
&
name
()
const
{
return
name_
;
}
void
SetName
(
const
std
::
string
&
name
)
{
name_
=
name
;
}
void
SetName
(
const
std
::
string
&
name
)
{
name_
=
name
;
}
...
@@ -43,7 +45,7 @@ struct Benchmark {
...
@@ -43,7 +45,7 @@ struct Benchmark {
private:
private:
bool
use_gpu_
{
false
};
bool
use_gpu_
{
false
};
int
batch_size_
{
0
};
int
batch_size_
{
0
};
in
t
latency_
;
floa
t
latency_
;
int
num_threads_
{
1
};
int
num_threads_
{
1
};
std
::
string
name_
;
std
::
string
name_
;
};
};
...
...
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
64ad051b
...
@@ -37,7 +37,13 @@ if (WITH_GPU)
...
@@ -37,7 +37,13 @@ if (WITH_GPU)
SET
(
OP_HEADER_DEPS
${
OP_HEADER_DEPS
}
cub
)
SET
(
OP_HEADER_DEPS
${
OP_HEADER_DEPS
}
cub
)
endif
()
endif
()
register_operators
(
EXCLUDES warpctc_op conv_fusion_op DEPS
${
OP_HEADER_DEPS
}
)
SET
(
OP_PREFETCH_DEPS
""
)
if
(
WITH_DISTRIBUTE
)
SET
(
OP_PREFETCH_DEPS
${
OP_PREFETCH_DEPS
}
parameter_prefetch
)
endif
()
register_operators
(
EXCLUDES warpctc_op conv_fusion_op DEPS
${
OP_HEADER_DEPS
}
${
OP_PREFETCH_DEPS
}
)
# warpctc_op needs cudnn 7 above
# warpctc_op needs cudnn 7 above
if
(
WITH_GPU AND NOT WIN32
)
if
(
WITH_GPU AND NOT WIN32
)
...
...
paddle/fluid/operators/batch_norm_mkldnn_op.cc
浏览文件 @
64ad051b
...
@@ -14,7 +14,7 @@ limitations under the License. */
...
@@ -14,7 +14,7 @@ limitations under the License. */
#include "mkldnn.hpp"
#include "mkldnn.hpp"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/platform/mkldnn_
helper
.h"
#include "paddle/fluid/platform/mkldnn_
reuse
.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -146,7 +146,9 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -146,7 +146,9 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const
float
epsilon
=
ctx
.
Attr
<
float
>
(
"epsilon"
);
const
float
epsilon
=
ctx
.
Attr
<
float
>
(
"epsilon"
);
const
float
momentum
=
ctx
.
Attr
<
float
>
(
"momentum"
);
const
float
momentum
=
ctx
.
Attr
<
float
>
(
"momentum"
);
const
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
const
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
const
bool
use_global_stats
=
ctx
.
Attr
<
bool
>
(
"use_global_stats"
);
const
bool
fuse_with_relu
=
ctx
.
Attr
<
bool
>
(
"fuse_with_relu"
);
const
bool
fuse_with_relu
=
ctx
.
Attr
<
bool
>
(
"fuse_with_relu"
);
bool
global_stats
=
is_test
||
use_global_stats
;
const
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
const
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
const
auto
*
mean
=
ctx
.
Input
<
Tensor
>
(
"Mean"
);
const
auto
*
mean
=
ctx
.
Input
<
Tensor
>
(
"Mean"
);
...
@@ -177,13 +179,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -177,13 +179,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
T
*
batch_mean_data
=
nullptr
;
T
*
batch_mean_data
=
nullptr
;
T
*
batch_variance_data
=
nullptr
;
T
*
batch_variance_data
=
nullptr
;
if
(
!
is_test
)
{
if
(
!
global_stats
)
{
batch_mean_data
=
batch_mean
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
batch_mean_data
=
batch_mean
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
batch_variance_data
=
batch_variance
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
batch_variance_data
=
batch_variance
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
}
}
auto
propagation
=
is_test
==
true
?
mkldnn
::
prop_kind
::
forward_scoring
auto
propagation
=
global_stats
==
true
:
mkldnn
::
prop_kind
::
forward_training
;
?
mkldnn
::
prop_kind
::
forward_scoring
:
mkldnn
::
prop_kind
::
forward_training
;
auto
src_tz
=
paddle
::
framework
::
vectorize2int
(
x
->
dims
());
auto
src_tz
=
paddle
::
framework
::
vectorize2int
(
x
->
dims
());
auto
scale_tz
=
paddle
::
framework
::
vectorize2int
(
scale
->
dims
());
auto
scale_tz
=
paddle
::
framework
::
vectorize2int
(
scale
->
dims
());
...
@@ -199,7 +202,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -199,7 +202,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
shift
->
data
<
T
>
()
+
ic
,
&
scaleshift_data
);
shift
->
data
<
T
>
()
+
ic
,
&
scaleshift_data
);
unsigned
flags
=
mkldnn
::
use_scale_shift
;
unsigned
flags
=
mkldnn
::
use_scale_shift
;
if
(
is_test
)
flags
|=
mkldnn
::
use_global_stats
;
if
(
global_stats
)
flags
|=
mkldnn
::
use_global_stats
;
if
(
fuse_with_relu
)
flags
|=
mkldnn
::
fuse_bn_relu
;
if
(
fuse_with_relu
)
flags
|=
mkldnn
::
fuse_bn_relu
;
// create mkldnn memory from input x tensor
// create mkldnn memory from input x tensor
...
@@ -208,7 +211,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -208,7 +211,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
// keys for backward pass
// keys for backward pass
const
std
::
string
key
=
BatchNormMKLDNNHandler
::
GetHash
(
const
std
::
string
key
=
BatchNormMKLDNNHandler
::
GetHash
(
src_tz
,
epsilon
,
flags
,
is_test
,
input_format
,
src_tz
,
epsilon
,
flags
,
global_stats
,
input_format
,
ctx
.
op
().
Output
(
"SavedMean"
));
ctx
.
op
().
Output
(
"SavedMean"
));
const
std
::
string
key_batch_norm_fwd_pd
=
key
+
"@bn_fwd_pd"
;
const
std
::
string
key_batch_norm_fwd_pd
=
key
+
"@bn_fwd_pd"
;
...
@@ -239,7 +242,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -239,7 +242,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
batch_norm_fwd_pd
->
dst_primitive_desc
().
desc
(),
y_data
);
batch_norm_fwd_pd
->
dst_primitive_desc
().
desc
(),
y_data
);
std
::
shared_ptr
<
batch_norm_fwd
>
batch_norm_p
;
std
::
shared_ptr
<
batch_norm_fwd
>
batch_norm_p
;
if
(
is_test
)
{
if
(
global_stats
)
{
// create mkldnn memory for stats (as input)
// create mkldnn memory for stats (as input)
std
::
shared_ptr
<
memory
>
mean_memory
=
std
::
shared_ptr
<
memory
>
mean_memory
=
handler
.
AcquireMeanMemoryFromPrimitive
(
to_void_cast
(
mean_data
));
handler
.
AcquireMeanMemoryFromPrimitive
(
to_void_cast
(
mean_data
));
...
@@ -269,7 +272,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -269,7 +272,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline
.
push_back
(
*
batch_norm_p
);
pipeline
.
push_back
(
*
batch_norm_p
);
mkldnn
::
stream
(
mkldnn
::
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
mkldnn
::
stream
(
mkldnn
::
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
if
(
!
is_test
)
{
if
(
!
global_stats
)
{
// mkldnn only compute stats for current batch
// mkldnn only compute stats for current batch
// so we need compute momentum stats via Eigen lib
// so we need compute momentum stats via Eigen lib
EigenVectorArrayMap
<
T
>
batch_mean_e
(
batch_mean_data
,
ic
);
EigenVectorArrayMap
<
T
>
batch_mean_e
(
batch_mean_data
,
ic
);
...
...
paddle/fluid/operators/batch_norm_op.cc
浏览文件 @
64ad051b
...
@@ -159,6 +159,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -159,6 +159,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr
<
bool
>
(
"fuse_with_relu"
,
AddAttr
<
bool
>
(
"fuse_with_relu"
,
"(bool, default false) Only used in mkldnn kernel"
)
"(bool, default false) Only used in mkldnn kernel"
)
.
SetDefault
(
false
);
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"use_global_stats"
,
"(bool, default false) Whether to use global mean and "
"variance. In inference or test mode, set use_global_stats "
"to true or is_test true. the behavior is equivalent. "
"In train mode, when setting use_global_stats True, the "
"global mean and variance are also used during train time, "
"the BN acts as scaling and shiffting."
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
Batch Normalization.
Batch Normalization.
...
@@ -190,6 +198,10 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
...
@@ -190,6 +198,10 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
const
float
epsilon
=
ctx
.
Attr
<
float
>
(
"epsilon"
);
const
float
epsilon
=
ctx
.
Attr
<
float
>
(
"epsilon"
);
const
float
momentum
=
ctx
.
Attr
<
float
>
(
"momentum"
);
const
float
momentum
=
ctx
.
Attr
<
float
>
(
"momentum"
);
const
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
const
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
const
bool
use_global_stats
=
ctx
.
Attr
<
bool
>
(
"use_global_stats"
);
bool
global_stats
=
is_test
||
use_global_stats
;
const
std
::
string
data_layout_str
=
ctx
.
Attr
<
std
::
string
>
(
"data_layout"
);
const
std
::
string
data_layout_str
=
ctx
.
Attr
<
std
::
string
>
(
"data_layout"
);
const
DataLayout
data_layout
=
const
DataLayout
data_layout
=
framework
::
StringToDataLayout
(
data_layout_str
);
framework
::
StringToDataLayout
(
data_layout_str
);
...
@@ -217,7 +229,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
...
@@ -217,7 +229,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
saved_mean
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
saved_mean
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
saved_variance
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
saved_variance
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
!
is_test
)
{
if
(
!
global_stats
)
{
// saved_xx is use just in this batch of data
// saved_xx is use just in this batch of data
EigenVectorArrayMap
<
T
>
saved_mean_e
(
EigenVectorArrayMap
<
T
>
saved_mean_e
(
saved_mean
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
C
);
saved_mean
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
C
);
...
@@ -234,7 +246,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
...
@@ -234,7 +246,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
if
((
N
*
sample_size
)
==
1
)
{
if
((
N
*
sample_size
)
==
1
)
{
LOG
(
WARNING
)
<<
"Only 1 element in normalization dimension, "
LOG
(
WARNING
)
<<
"Only 1 element in normalization dimension, "
<<
"we skip the batch norm calculation, let y = x."
;
<<
"we skip the batch norm calculation, let y = x."
;
framework
::
TensorCopy
Sync
(
*
x
,
ctx
.
GetPlace
(),
y
);
framework
::
TensorCopy
(
*
x
,
ctx
.
GetPlace
(),
y
);
return
;
return
;
}
}
...
@@ -277,7 +289,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
...
@@ -277,7 +289,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
// use SavedMean and SavedVariance to do normalize
// use SavedMean and SavedVariance to do normalize
Eigen
::
Array
<
T
,
Eigen
::
Dynamic
,
1
>
inv_std
(
C
);
Eigen
::
Array
<
T
,
Eigen
::
Dynamic
,
1
>
inv_std
(
C
);
if
(
is_test
)
{
if
(
global_stats
)
{
ConstEigenVectorArrayMap
<
T
>
var_arr
(
ConstEigenVectorArrayMap
<
T
>
var_arr
(
ctx
.
Input
<
Tensor
>
(
"Variance"
)
->
data
<
T
>
(),
C
);
ctx
.
Input
<
Tensor
>
(
"Variance"
)
->
data
<
T
>
(),
C
);
inv_std
=
(
var_arr
+
epsilon
).
sqrt
().
inverse
();
inv_std
=
(
var_arr
+
epsilon
).
sqrt
().
inverse
();
...
@@ -289,8 +301,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
...
@@ -289,8 +301,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
inv_std
=
saved_inv_std
;
inv_std
=
saved_inv_std
;
}
}
ConstEigenVectorArrayMap
<
T
>
mean_arr
(
ConstEigenVectorArrayMap
<
T
>
mean_arr
(
is_test
?
ctx
.
Input
<
Tensor
>
(
"Mean"
)
->
data
<
T
>
()
global_stats
?
ctx
.
Input
<
Tensor
>
(
"Mean"
)
->
data
<
T
>
()
:
ctx
.
Output
<
Tensor
>
(
"SavedMean"
)
->
data
<
T
>
(),
:
ctx
.
Output
<
Tensor
>
(
"SavedMean"
)
->
data
<
T
>
(),
C
);
C
);
// ((x - est_mean) * (inv_var) * scale + bias
// ((x - est_mean) * (inv_var) * scale + bias
...
@@ -336,15 +348,27 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
...
@@ -336,15 +348,27 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
// check input
// check input
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
));
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
));
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Scale"
),
""
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Scale"
),
"Input(scale) should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Y"
)),
""
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Y"
)),
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"SavedMean"
),
""
);
"Input(Y@GRAD) should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"SavedVariance"
),
""
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"SavedMean"
),
"Input(SavedMean) should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"SavedVariance"
),
"Input(SavedVariance) should not be null"
);
// check output
// check output
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"X"
)),
""
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"X"
)),
""
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Scale"
)),
""
);
if
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Scale"
)))
{
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Bias"
)),
""
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Bias"
)),
"Output(Scale@GRAD) and Output(Bias@GRAD) should not be "
"null at same time"
);
}
const
bool
use_global_stats
=
ctx
->
Attrs
().
Get
<
bool
>
(
"use_global_stats"
);
if
(
use_global_stats
)
{
PADDLE_ENFORCE
(
!
ctx
->
Attrs
().
Get
<
bool
>
(
"use_mkldnn"
),
"Using global stats during training is not supported "
"in gradient op kernel of batch_norm_mkldnn_op now."
);
}
const
auto
x_dims
=
ctx
->
GetInputDim
(
"X"
);
const
auto
x_dims
=
ctx
->
GetInputDim
(
"X"
);
const
DataLayout
data_layout
=
framework
::
StringToDataLayout
(
const
DataLayout
data_layout
=
framework
::
StringToDataLayout
(
...
@@ -354,8 +378,10 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
...
@@ -354,8 +378,10 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
:
x_dims
[
x_dims
.
size
()
-
1
]);
:
x_dims
[
x_dims
.
size
()
-
1
]);
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
x_dims
);
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
x_dims
);
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Scale"
),
{
C
});
if
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Scale"
)))
{
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Bias"
),
{
C
});
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Scale"
),
{
C
});
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Bias"
),
{
C
});
}
}
}
protected:
protected:
...
@@ -405,6 +431,8 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
...
@@ -405,6 +431,8 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
// SavedVariance have been reverted in forward operator
// SavedVariance have been reverted in forward operator
const
auto
*
saved_inv_variance
=
ctx
.
Input
<
Tensor
>
(
"SavedVariance"
);
const
auto
*
saved_inv_variance
=
ctx
.
Input
<
Tensor
>
(
"SavedVariance"
);
const
std
::
string
data_layout_str
=
ctx
.
Attr
<
std
::
string
>
(
"data_layout"
);
const
std
::
string
data_layout_str
=
ctx
.
Attr
<
std
::
string
>
(
"data_layout"
);
const
bool
use_global_stats
=
ctx
.
Attr
<
bool
>
(
"use_global_stats"
);
const
float
epsilon
=
ctx
.
Attr
<
float
>
(
"epsilon"
);
const
DataLayout
data_layout
=
const
DataLayout
data_layout
=
framework
::
StringToDataLayout
(
data_layout_str
);
framework
::
StringToDataLayout
(
data_layout_str
);
...
@@ -419,38 +447,60 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
...
@@ -419,38 +447,60 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
:
x_dims
[
x_dims
.
size
()
-
1
]);
:
x_dims
[
x_dims
.
size
()
-
1
]);
const
int
sample_size
=
x
->
numel
()
/
N
/
C
;
const
int
sample_size
=
x
->
numel
()
/
N
/
C
;
ConstEigenVectorArrayMap
<
T
>
scale_arr
(
scale
->
data
<
T
>
(),
C
);
ConstEigenVectorArrayMap
<
T
>
mean_arr
(
saved_mean
->
data
<
T
>
(),
C
);
ConstEigenVectorArrayMap
<
T
>
inv_var_arr
(
saved_inv_variance
->
data
<
T
>
(),
C
);
// init output
// init output
auto
*
d_x
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
d_x
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
d_scale
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Scale"
));
auto
*
d_scale
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Scale"
));
auto
*
d_bias
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Bias"
));
auto
*
d_bias
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Bias"
));
d_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
d_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
d_scale
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
d_bias
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
const
T
*
mean_data
=
saved_mean
->
data
<
T
>
();
const
T
*
inv_var_data
=
saved_inv_variance
->
data
<
T
>
();
Tensor
inv_var_tensor
;
if
(
use_global_stats
)
{
const
auto
*
running_mean
=
ctx
.
Input
<
Tensor
>
(
"Mean"
);
const
auto
*
running_variance
=
ctx
.
Input
<
Tensor
>
(
"Variance"
);
mean_data
=
running_mean
->
data
<
T
>
();
T
*
running_inv_var_data
=
inv_var_tensor
.
mutable_data
<
T
>
(
ctx
.
GetPlace
());
EigenVectorArrayMap
<
T
>
inv_var_tmp
(
running_inv_var_data
,
C
);
ConstEigenVectorArrayMap
<
T
>
var_arr
(
running_variance
->
data
<
T
>
(),
C
);
inv_var_tmp
=
(
var_arr
+
epsilon
).
sqrt
().
inverse
().
eval
();
inv_var_data
=
running_inv_var_data
;
}
ConstEigenVectorArrayMap
<
T
>
scale_arr
(
scale
->
data
<
T
>
(),
C
);
ConstEigenVectorArrayMap
<
T
>
mean_arr
(
mean_data
,
C
);
ConstEigenVectorArrayMap
<
T
>
inv_var_arr
(
inv_var_data
,
C
);
T
*
d_bias_data
=
nullptr
;
T
*
d_scale_data
=
nullptr
;
if
(
d_scale
&&
d_bias
)
{
d_scale
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
d_bias
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
d_bias_data
=
d_bias
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
d_scale_data
=
d_scale
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
}
// d_bias = np.sum(d_y, axis=0)
// d_bias = np.sum(d_y, axis=0)
// d_scale = np.sum((X - mean) / inv_std * dy, axis=0)
// d_scale = np.sum((X - mean) / inv_std * dy, axis=0)
// d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
// d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
// - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
// - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
EigenVectorArrayMap
<
T
>
d_bias_arr
(
d_bias_data
,
C
);
EigenVectorArrayMap
<
T
>
d_scale_arr
(
d_scale_data
,
C
);
EigenVectorArrayMap
<
T
>
d_bias_arr
(
d_bias
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
if
(
d_scale
&&
d_bias
)
{
C
);
d_bias_arr
.
setZero
();
EigenVectorArrayMap
<
T
>
d_scale_arr
(
d_scale
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
d_scale_arr
.
setZero
();
C
);
}
d_bias_arr
.
setZero
();
d_scale_arr
.
setZero
();
if
((
N
*
sample_size
)
==
1
)
{
if
((
N
*
sample_size
)
==
1
&&
!
use_global_stats
)
{
framework
::
TensorCopy
Sync
(
*
d_y
,
ctx
.
GetPlace
(),
d_x
);
framework
::
TensorCopy
(
*
d_y
,
ctx
.
GetPlace
(),
d_x
);
return
;
return
;
}
}
const
auto
scale_inv_var_nhw
=
scale_arr
*
inv_var_arr
/
(
N
*
sample_size
);
int
scale_coefff
=
use_global_stats
?
1
:
N
*
sample_size
;
const
auto
scale_inv_var_nhw
=
scale_arr
*
inv_var_arr
/
scale_coefff
;
switch
(
data_layout
)
{
switch
(
data_layout
)
{
case
DataLayout
::
kNCHW
:
{
case
DataLayout
::
kNCHW
:
{
...
@@ -460,19 +510,29 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
...
@@ -460,19 +510,29 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
sample_size
,
N
*
C
);
sample_size
,
N
*
C
);
d_x_arr
.
setZero
();
d_x_arr
.
setZero
();
for
(
int
nc
=
0
;
nc
<
N
*
C
;
++
nc
)
{
if
(
d_scale
&&
d_bias
)
{
int
c
=
nc
%
C
;
for
(
int
nc
=
0
;
nc
<
N
*
C
;
++
nc
)
{
d_bias_arr
(
c
)
+=
d_y_arr
.
col
(
nc
).
sum
();
int
c
=
nc
%
C
;
d_scale_arr
(
c
)
+=
d_bias_arr
(
c
)
+=
d_y_arr
.
col
(
nc
).
sum
();
((
x_arr
.
col
(
nc
)
-
mean_arr
(
c
))
*
inv_var_arr
(
c
)
*
d_y_arr
.
col
(
nc
))
d_scale_arr
(
c
)
+=
((
x_arr
.
col
(
nc
)
-
mean_arr
(
c
))
*
inv_var_arr
(
c
)
*
.
sum
();
d_y_arr
.
col
(
nc
))
.
sum
();
}
}
}
for
(
int
nc
=
0
;
nc
<
N
*
C
;
++
nc
)
{
if
(
!
use_global_stats
)
{
int
c
=
nc
%
C
;
for
(
int
nc
=
0
;
nc
<
N
*
C
;
++
nc
)
{
d_x_arr
.
col
(
nc
)
+=
int
c
=
nc
%
C
;
scale_inv_var_nhw
(
c
)
*
d_x_arr
.
col
(
nc
)
+=
(
d_y_arr
.
col
(
nc
)
*
N
*
sample_size
-
d_bias_arr
(
c
)
-
scale_inv_var_nhw
(
c
)
*
(
x_arr
.
col
(
nc
)
-
mean_arr
[
c
])
*
d_scale_arr
(
c
)
*
inv_var_arr
(
c
));
(
d_y_arr
.
col
(
nc
)
*
N
*
sample_size
-
d_bias_arr
(
c
)
-
(
x_arr
.
col
(
nc
)
-
mean_arr
[
c
])
*
d_scale_arr
(
c
)
*
inv_var_arr
(
c
));
}
}
else
{
for
(
int
nc
=
0
;
nc
<
N
*
C
;
++
nc
)
{
int
c
=
nc
%
C
;
d_x_arr
.
col
(
nc
)
+=
scale_inv_var_nhw
(
c
)
*
d_y_arr
.
col
(
nc
);
}
}
}
break
;
break
;
}
}
...
@@ -488,15 +548,27 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
...
@@ -488,15 +548,27 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
const
auto
d_y_mul_x_minus_mean_row_sum
=
const
auto
d_y_mul_x_minus_mean_row_sum
=
(
d_y_arr
*
x_minus_mean
).
rowwise
().
sum
();
(
d_y_arr
*
x_minus_mean
).
rowwise
().
sum
();
const
auto
inv_var_sqr
=
inv_var_arr
*
inv_var_arr
;
const
auto
inv_var_sqr
=
inv_var_arr
*
inv_var_arr
;
for
(
int
nhw
=
0
;
nhw
<
N
*
sample_size
;
++
nhw
)
{
d_bias_arr
+=
d_y_arr
.
col
(
nhw
);
if
(
d_scale
&&
d_bias
)
{
d_scale_arr
+=
for
(
int
nhw
=
0
;
nhw
<
N
*
sample_size
;
++
nhw
)
{
(
x_arr
.
col
(
nhw
)
-
mean_arr
)
*
inv_var_arr
*
d_y_arr
.
col
(
nhw
);
d_bias_arr
+=
d_y_arr
.
col
(
nhw
);
d_x_arr
.
col
(
nhw
)
+=
d_scale_arr
+=
scale_inv_var_nhw
*
(
x_arr
.
col
(
nhw
)
-
mean_arr
)
*
inv_var_arr
*
d_y_arr
.
col
(
nhw
);
(
d_y_arr
.
col
(
nhw
)
*
N
*
sample_size
-
d_y_row_sum
-
}
x_minus_mean
.
col
(
nhw
)
*
inv_var_sqr
*
}
d_y_mul_x_minus_mean_row_sum
);
if
(
!
use_global_stats
)
{
for
(
int
nhw
=
0
;
nhw
<
N
*
sample_size
;
++
nhw
)
{
d_x_arr
.
col
(
nhw
)
+=
scale_inv_var_nhw
*
(
d_y_arr
.
col
(
nhw
)
*
N
*
sample_size
-
d_y_row_sum
-
x_minus_mean
.
col
(
nhw
)
*
inv_var_sqr
*
d_y_mul_x_minus_mean_row_sum
);
}
}
else
{
for
(
int
nhw
=
0
;
nhw
<
N
*
sample_size
;
++
nhw
)
{
d_x_arr
.
col
(
nhw
)
+=
scale_inv_var_nhw
*
d_y_arr
.
col
(
nhw
);
}
}
}
break
;
break
;
}
}
...
@@ -522,6 +594,10 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
...
@@ -522,6 +594,10 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
op
->
SetInput
(
"SavedMean"
,
Output
(
"SavedMean"
));
op
->
SetInput
(
"SavedMean"
,
Output
(
"SavedMean"
));
op
->
SetInput
(
"SavedVariance"
,
Output
(
"SavedVariance"
));
op
->
SetInput
(
"SavedVariance"
,
Output
(
"SavedVariance"
));
// used when setting use_global_stats True during training
op
->
SetInput
(
"Mean"
,
Output
(
"MeanOut"
));
op
->
SetInput
(
"Variance"
,
Output
(
"VarianceOut"
));
op
->
SetAttrMap
(
Attrs
());
op
->
SetAttrMap
(
Attrs
());
op
->
SetOutput
(
framework
::
GradVarName
(
"X"
),
InputGrad
(
"X"
));
op
->
SetOutput
(
framework
::
GradVarName
(
"X"
),
InputGrad
(
"X"
));
...
...
paddle/fluid/operators/batch_norm_op.cu
.cc
→
paddle/fluid/operators/batch_norm_op.cu
浏览文件 @
64ad051b
...
@@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include
"paddle/fluid/operators/batch_norm_op.h"
#include
<algorithm>
#include <cfloat>
#include <cfloat>
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/float16.h"
...
@@ -59,6 +63,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
...
@@ -59,6 +63,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
double
epsilon
=
static_cast
<
double
>
(
ctx
.
Attr
<
float
>
(
"epsilon"
));
double
epsilon
=
static_cast
<
double
>
(
ctx
.
Attr
<
float
>
(
"epsilon"
));
const
float
momentum
=
ctx
.
Attr
<
float
>
(
"momentum"
);
const
float
momentum
=
ctx
.
Attr
<
float
>
(
"momentum"
);
const
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
const
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
const
bool
use_global_stats
=
ctx
.
Attr
<
bool
>
(
"use_global_stats"
);
const
std
::
string
data_layout_str
=
ctx
.
Attr
<
std
::
string
>
(
"data_layout"
);
const
std
::
string
data_layout_str
=
ctx
.
Attr
<
std
::
string
>
(
"data_layout"
);
const
DataLayout
data_layout
=
const
DataLayout
data_layout
=
framework
::
StringToDataLayout
(
data_layout_str
);
framework
::
StringToDataLayout
(
data_layout_str
);
...
@@ -121,7 +126,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
...
@@ -121,7 +126,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
auto
handle
=
dev_ctx
.
cudnn_handle
();
auto
handle
=
dev_ctx
.
cudnn_handle
();
// Now, depending on whether we are running test or not, we have two paths.
// Now, depending on whether we are running test or not, we have two paths.
if
(
is_test
)
{
if
(
is_test
||
use_global_stats
)
{
// only when test we use input to do computation.
// only when test we use input to do computation.
const
auto
*
est_mean
=
ctx
.
Input
<
Tensor
>
(
"Mean"
);
const
auto
*
est_mean
=
ctx
.
Input
<
Tensor
>
(
"Mean"
);
const
auto
*
est_var
=
ctx
.
Input
<
Tensor
>
(
"Variance"
);
const
auto
*
est_var
=
ctx
.
Input
<
Tensor
>
(
"Variance"
);
...
@@ -163,7 +168,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
...
@@ -163,7 +168,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
if
((
N
*
H
*
W
*
D
)
==
1
)
{
if
((
N
*
H
*
W
*
D
)
==
1
)
{
LOG
(
WARNING
)
<<
"Only 1 element in normalization dimension, "
LOG
(
WARNING
)
<<
"Only 1 element in normalization dimension, "
<<
"we skip the batch norm calculation, let y = x."
;
<<
"we skip the batch norm calculation, let y = x."
;
framework
::
TensorCopy
Sync
(
*
x
,
ctx
.
GetPlace
(),
y
);
framework
::
TensorCopy
(
*
x
,
ctx
.
GetPlace
(),
y
);
}
else
{
}
else
{
double
this_factor
=
1.
-
momentum
;
double
this_factor
=
1.
-
momentum
;
...
@@ -191,6 +196,58 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
...
@@ -191,6 +196,58 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
}
}
};
};
template
<
typename
T
,
framework
::
DataLayout
layout
>
static
__global__
void
KeBNBackwardData
(
const
T
*
dy
,
const
BatchNormParamType
<
T
>
*
scale
,
const
BatchNormParamType
<
T
>
*
variance
,
const
double
epsilon
,
const
int
C
,
const
int
HxW
,
const
int
num
,
T
*
dx
)
{
int
gid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
stride
=
blockDim
.
x
*
gridDim
.
x
;
for
(
int
i
=
gid
;
i
<
num
;
i
+=
stride
)
{
const
int
c
=
layout
==
framework
::
DataLayout
::
kNCHW
?
i
/
HxW
%
C
:
i
%
C
;
BatchNormParamType
<
T
>
inv_var
=
1.0
/
sqrt
(
variance
[
c
]
+
epsilon
);
dx
[
i
]
=
static_cast
<
T
>
(
static_cast
<
BatchNormParamType
<
T
>>
(
dy
[
i
])
*
scale
[
c
]
*
inv_var
);
}
}
template
<
typename
T
,
int
BlockDim
,
framework
::
DataLayout
layout
>
static
__global__
void
KeBNBackwardScaleBias
(
const
T
*
dy
,
const
T
*
x
,
const
BatchNormParamType
<
T
>
*
mean
,
const
BatchNormParamType
<
T
>
*
variance
,
const
double
epsilon
,
const
int
N
,
const
int
C
,
const
int
HxW
,
BatchNormParamType
<
T
>
*
dscale
,
BatchNormParamType
<
T
>
*
dbias
)
{
const
int
outer_size
=
C
;
const
int
inner_size
=
N
*
HxW
;
typedef
cub
::
BlockReduce
<
BatchNormParamType
<
T
>
,
BlockDim
>
BlockReduce
;
__shared__
typename
BlockReduce
::
TempStorage
ds_storage
;
__shared__
typename
BlockReduce
::
TempStorage
db_storage
;
for
(
int
i
=
blockIdx
.
x
;
i
<
outer_size
;
i
+=
gridDim
.
x
)
{
BatchNormParamType
<
T
>
ds_sum
=
static_cast
<
BatchNormParamType
<
T
>>
(
0
);
BatchNormParamType
<
T
>
db_sum
=
static_cast
<
BatchNormParamType
<
T
>>
(
0
);
BatchNormParamType
<
T
>
inv_var_i
=
1.0
/
sqrt
(
variance
[
i
]
+
epsilon
);
BatchNormParamType
<
T
>
mean_i
=
mean
[
i
];
for
(
int
j
=
threadIdx
.
x
;
j
<
inner_size
;
j
+=
blockDim
.
x
)
{
const
int
index
=
layout
==
framework
::
DataLayout
::
kNCHW
?
(
j
/
HxW
*
C
+
i
)
*
HxW
+
j
%
HxW
:
j
*
outer_size
+
i
;
ds_sum
+=
static_cast
<
BatchNormParamType
<
T
>>
(
dy
[
index
])
*
(
static_cast
<
BatchNormParamType
<
T
>>
(
x
[
index
])
-
mean_i
);
db_sum
+=
static_cast
<
BatchNormParamType
<
T
>>
(
dy
[
index
]);
}
ds_sum
=
BlockReduce
(
ds_storage
).
Reduce
(
ds_sum
,
cub
::
Sum
());
db_sum
=
BlockReduce
(
db_storage
).
Reduce
(
db_sum
,
cub
::
Sum
());
if
(
threadIdx
.
x
==
0
)
{
dscale
[
i
]
=
ds_sum
*
inv_var_i
;
dbias
[
i
]
=
db_sum
;
}
__syncthreads
();
}
}
template
<
typename
T
>
template
<
typename
T
>
class
BatchNormGradKernel
<
platform
::
CUDADeviceContext
,
T
>
class
BatchNormGradKernel
<
platform
::
CUDADeviceContext
,
T
>
:
public
framework
::
OpKernel
<
T
>
{
:
public
framework
::
OpKernel
<
T
>
{
...
@@ -200,6 +257,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
...
@@ -200,6 +257,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
"It must use CUDAPlace."
);
"It must use CUDAPlace."
);
double
epsilon
=
static_cast
<
double
>
(
ctx
.
Attr
<
float
>
(
"epsilon"
));
double
epsilon
=
static_cast
<
double
>
(
ctx
.
Attr
<
float
>
(
"epsilon"
));
const
std
::
string
data_layout_str
=
ctx
.
Attr
<
std
::
string
>
(
"data_layout"
);
const
std
::
string
data_layout_str
=
ctx
.
Attr
<
std
::
string
>
(
"data_layout"
);
const
bool
use_global_stats
=
ctx
.
Attr
<
bool
>
(
"use_global_stats"
);
const
DataLayout
data_layout
=
const
DataLayout
data_layout
=
framework
::
StringToDataLayout
(
data_layout_str
);
framework
::
StringToDataLayout
(
data_layout_str
);
const
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
const
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
...
@@ -219,42 +278,13 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
...
@@ -219,42 +278,13 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
auto
*
d_bias
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Bias"
));
auto
*
d_bias
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Bias"
));
d_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
d_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
d_scale
->
mutable_data
<
BatchNormParamType
<
T
>>
(
ctx
.
GetPlace
());
if
(
d_scale
&&
d_bias
)
{
d_bias
->
mutable_data
<
BatchNormParamType
<
T
>>
(
ctx
.
GetPlace
());
d_scale
->
mutable_data
<
BatchNormParamType
<
T
>>
(
ctx
.
GetPlace
());
d_bias
->
mutable_data
<
BatchNormParamType
<
T
>>
(
ctx
.
GetPlace
());
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
if
((
N
*
H
*
W
*
D
)
==
1
)
{
framework
::
TensorCopySync
(
*
d_y
,
ctx
.
GetPlace
(),
d_x
);
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
BatchNormParamType
<
T
>>
functor
;
functor
(
dev_ctx
,
d_scale
,
static_cast
<
BatchNormParamType
<
T
>>
(
0
));
functor
(
dev_ctx
,
d_bias
,
static_cast
<
BatchNormParamType
<
T
>>
(
0
));
return
;
}
}
PADDLE_ENFORCE_EQ
(
scale
->
dims
().
size
(),
1UL
);
PADDLE_ENFORCE_EQ
(
scale
->
dims
().
size
(),
1UL
);
PADDLE_ENFORCE_EQ
(
scale
->
dims
()[
0
],
C
);
PADDLE_ENFORCE_EQ
(
scale
->
dims
()[
0
],
C
);
// ------------------- cudnn descriptors ---------------------
cudnnTensorDescriptor_t
data_desc_
;
cudnnTensorDescriptor_t
bn_param_desc_
;
cudnnBatchNormMode_t
mode_
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
data_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
bn_param_desc_
));
if
(
epsilon
<=
CUDNN_BN_MIN_EPSILON
-
FLT_EPSILON
)
{
LOG
(
ERROR
)
<<
"Provided epsilon is smaller than "
<<
"CUDNN_BN_MIN_EPSILON. Setting it to "
<<
"CUDNN_BN_MIN_EPSILON instead."
;
}
epsilon
=
std
::
max
(
epsilon
,
CUDNN_BN_MIN_EPSILON
);
#if CUDNN_VERSION_MIN(7, 0, 0)
mode_
=
CUDNN_BATCHNORM_SPATIAL_PERSISTENT
;
#else
mode_
=
CUDNN_BATCHNORM_SPATIAL
;
#endif
std
::
vector
<
int
>
dims
;
std
::
vector
<
int
>
dims
;
std
::
vector
<
int
>
strides
;
std
::
vector
<
int
>
strides
;
if
(
data_layout
==
DataLayout
::
kNCHW
)
{
if
(
data_layout
==
DataLayout
::
kNCHW
)
{
...
@@ -264,34 +294,114 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
...
@@ -264,34 +294,114 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
dims
=
{
N
,
C
,
H
,
W
,
D
};
dims
=
{
N
,
C
,
H
,
W
,
D
};
strides
=
{
H
*
W
*
C
*
D
,
1
,
W
*
D
*
C
,
D
*
C
,
C
};
strides
=
{
H
*
W
*
C
*
D
,
1
,
W
*
D
*
C
,
D
*
C
,
C
};
}
}
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
data_desc_
,
CudnnDataType
<
T
>::
type
,
x_dims
.
size
()
>
3
?
x_dims
.
size
()
:
4
,
dims
.
data
(),
strides
.
data
()));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDeriveBNTensorDescriptor
(
bn_param_desc_
,
data_desc_
,
mode_
));
const
auto
*
saved_mean
=
ctx
.
Input
<
Tensor
>
(
"SavedMean"
);
const
auto
*
saved_var
=
ctx
.
Input
<
Tensor
>
(
"SavedVariance"
);
const
void
*
saved_mean_data
=
saved_mean
->
template
data
<
BatchNormParamType
<
T
>
>
();
const
void
*
saved_var_data
=
saved_var
->
template
data
<
BatchNormParamType
<
T
>
>
();
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnBatchNormalizationBackward
(
dev_ctx
.
cudnn_handle
(),
mode_
,
CudnnDataType
<
T
>::
kOne
(),
CudnnDataType
<
T
>::
kZero
(),
CudnnDataType
<
T
>::
kOne
(),
CudnnDataType
<
T
>::
kZero
(),
data_desc_
,
x
->
template
data
<
T
>(),
data_desc_
,
d_y
->
template
data
<
T
>(),
data_desc_
,
d_x
->
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
bn_param_desc_
,
scale
->
template
data
<
BatchNormParamType
<
T
>
>
(),
d_scale
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
d_bias
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
epsilon
,
saved_mean_data
,
saved_var_data
));
// clean when exit.
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
data_desc_
));
if
(
!
use_global_stats
)
{
CUDNN_ENFORCE
(
if
((
N
*
H
*
W
*
D
)
==
1
)
{
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
bn_param_desc_
));
framework
::
TensorCopy
(
*
d_y
,
ctx
.
GetPlace
(),
d_x
);
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
BatchNormParamType
<
T
>>
functor
;
functor
(
dev_ctx
,
d_scale
,
static_cast
<
BatchNormParamType
<
T
>>
(
0
));
functor
(
dev_ctx
,
d_bias
,
static_cast
<
BatchNormParamType
<
T
>>
(
0
));
return
;
}
// ------------------- cudnn descriptors ---------------------
cudnnTensorDescriptor_t
data_desc_
;
cudnnTensorDescriptor_t
bn_param_desc_
;
cudnnBatchNormMode_t
mode_
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
data_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
bn_param_desc_
));
if
(
epsilon
<=
CUDNN_BN_MIN_EPSILON
-
FLT_EPSILON
)
{
LOG
(
ERROR
)
<<
"Provided epsilon is smaller than "
<<
"CUDNN_BN_MIN_EPSILON. Setting it to "
<<
"CUDNN_BN_MIN_EPSILON instead."
;
}
epsilon
=
std
::
max
(
epsilon
,
CUDNN_BN_MIN_EPSILON
);
#if CUDNN_VERSION_MIN(7, 0, 0)
mode_
=
CUDNN_BATCHNORM_SPATIAL_PERSISTENT
;
#else
mode_
=
CUDNN_BATCHNORM_SPATIAL
;
#endif
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
data_desc_
,
CudnnDataType
<
T
>::
type
,
x_dims
.
size
()
>
3
?
x_dims
.
size
()
:
4
,
dims
.
data
(),
strides
.
data
()));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDeriveBNTensorDescriptor
(
bn_param_desc_
,
data_desc_
,
mode_
));
const
auto
*
saved_mean
=
ctx
.
Input
<
Tensor
>
(
"SavedMean"
);
const
auto
*
saved_var
=
ctx
.
Input
<
Tensor
>
(
"SavedVariance"
);
const
void
*
saved_mean_data
=
saved_mean
->
template
data
<
BatchNormParamType
<
T
>
>
();
const
void
*
saved_var_data
=
saved_var
->
template
data
<
BatchNormParamType
<
T
>
>
();
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnBatchNormalizationBackward
(
dev_ctx
.
cudnn_handle
(),
mode_
,
CudnnDataType
<
T
>::
kOne
(),
CudnnDataType
<
T
>::
kZero
(),
CudnnDataType
<
T
>::
kOne
(),
CudnnDataType
<
T
>::
kZero
(),
data_desc_
,
x
->
template
data
<
T
>(),
data_desc_
,
d_y
->
template
data
<
T
>(),
data_desc_
,
d_x
->
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
bn_param_desc_
,
scale
->
template
data
<
BatchNormParamType
<
T
>
>
(),
d_scale
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
d_bias
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
epsilon
,
saved_mean_data
,
saved_var_data
));
// clean when exit.
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
data_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
bn_param_desc_
));
}
else
{
const
auto
*
running_mean
=
ctx
.
Input
<
Tensor
>
(
"Mean"
);
const
auto
*
running_var
=
ctx
.
Input
<
Tensor
>
(
"Variance"
);
const
auto
*
running_mean_data
=
running_mean
->
template
data
<
BatchNormParamType
<
T
>
>
();
const
auto
*
running_var_data
=
running_var
->
template
data
<
BatchNormParamType
<
T
>
>
();
const
int
num
=
x
->
numel
();
const
int
block
=
512
;
int
max_threads
=
dev_ctx
.
GetMaxPhysicalThreadCount
();
const
int
max_blocks
=
std
::
max
(
max_threads
/
block
,
1
);
int
grid1
=
(
num
+
block
-
1
)
/
block
;
int
grid2
=
std
::
min
(
C
,
max_blocks
);
if
(
data_layout
==
framework
::
DataLayout
::
kNCHW
)
{
if
(
d_x
)
{
KeBNBackwardData
<
T
,
framework
::
DataLayout
::
kNCHW
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
d_y
->
data
<
T
>
(),
scale
->
data
<
BatchNormParamType
<
T
>>
(),
running_var_data
,
epsilon
,
C
,
H
*
W
,
num
,
d_x
->
data
<
T
>
());
}
if
(
d_scale
&&
d_bias
)
{
KeBNBackwardScaleBias
<
T
,
block
,
framework
::
DataLayout
::
kNCHW
><<<
grid2
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
d_y
->
data
<
T
>
(),
x
->
data
<
T
>
(),
running_mean_data
,
running_var_data
,
epsilon
,
C
,
H
*
W
,
num
,
d_scale
->
data
<
BatchNormParamType
<
T
>>
(),
d_bias
->
data
<
BatchNormParamType
<
T
>>
());
}
}
else
{
if
(
d_x
)
{
KeBNBackwardData
<
T
,
framework
::
DataLayout
::
kNHWC
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
d_y
->
data
<
T
>
(),
scale
->
data
<
BatchNormParamType
<
T
>>
(),
running_var_data
,
epsilon
,
C
,
H
*
W
,
num
,
d_x
->
data
<
T
>
());
}
if
(
d_scale
&&
d_bias
)
{
KeBNBackwardScaleBias
<
T
,
block
,
framework
::
DataLayout
::
kNCHW
><<<
grid2
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
d_y
->
data
<
T
>
(),
x
->
data
<
T
>
(),
running_mean_data
,
running_var_data
,
epsilon
,
C
,
H
*
W
,
num
,
d_scale
->
data
<
BatchNormParamType
<
T
>>
(),
d_bias
->
data
<
BatchNormParamType
<
T
>>
());
}
}
}
}
}
};
};
...
...
paddle/fluid/operators/conv_mkldnn_op.cc
浏览文件 @
64ad051b
...
@@ -15,7 +15,7 @@
...
@@ -15,7 +15,7 @@
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/mkldnn_
helper
.h"
#include "paddle/fluid/platform/mkldnn_
reuse
.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -28,259 +28,6 @@ using mkldnn::stream;
...
@@ -28,259 +28,6 @@ using mkldnn::stream;
using
platform
::
to_void_cast
;
using
platform
::
to_void_cast
;
using
platform
::
GetMKLDNNFormat
;
using
platform
::
GetMKLDNNFormat
;
class
ConvMKLDNNHandler
:
public
platform
::
MKLDNNHandler
{
public:
ConvMKLDNNHandler
(
std
::
shared_ptr
<
mkldnn
::
convolution_forward
::
primitive_desc
>
conv_pd
,
const
platform
::
MKLDNNDeviceContext
&
dev_ctx
,
mkldnn
::
engine
engine
,
const
std
::
string
&
base_key
)
:
platform
::
MKLDNNHandler
(
dev_ctx
,
engine
,
base_key
)
{
conv_pd_
=
conv_pd
;
}
ConvMKLDNNHandler
(
std
::
shared_ptr
<
mkldnn
::
convolution_forward
::
primitive_desc
>
conv_pd
,
std
::
shared_ptr
<
mkldnn
::
convolution_backward_data
::
primitive_desc
>
conv_bwd_data_pd
,
std
::
shared_ptr
<
mkldnn
::
convolution_backward_weights
::
primitive_desc
>
conv_bwd_weights_pd
,
const
platform
::
MKLDNNDeviceContext
&
dev_ctx
,
mkldnn
::
engine
engine
,
const
std
::
string
&
base_key
)
:
platform
::
MKLDNNHandler
(
dev_ctx
,
engine
,
base_key
),
conv_pd_
(
conv_pd
),
conv_bwd_weights_pd_
(
conv_bwd_weights_pd
),
conv_bwd_data_pd_
(
conv_bwd_data_pd
)
{
// If we are in Grad operatgor then update a key with BWD suffix to
// distinguish from FWD memory primitives
key_
+=
"-BWD"
;
}
size_t
GetDstMemorySize
()
const
{
return
conv_pd_
->
dst_primitive_desc
().
get_size
();
}
mkldnn
::
memory
::
format
GetDstFormat
()
const
{
return
static_cast
<
mkldnn
::
memory
::
format
>
(
conv_pd_
->
dst_primitive_desc
().
desc
().
data
.
format
);
}
size_t
GetDiffWeightsMemorySize
()
const
{
return
conv_bwd_weights_pd_
->
diff_weights_primitive_desc
().
get_size
();
}
size_t
GetDiffSourceMemorySize
()
const
{
return
conv_bwd_data_pd_
->
diff_src_primitive_desc
().
get_size
();
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireSrcMemoryFromWeightsPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
src_pd
=
conv_bwd_weights_pd_
->
src_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
src_pd
,
user_pd
,
user_memory_p
,
"@weights-src_mem_p"
,
pipeline
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffDstMemoryFromWeightsPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
diff_dst_pd
=
conv_bwd_weights_pd_
->
diff_dst_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
diff_dst_pd
,
user_pd
,
user_memory_p
,
"@weights-diff_dst_mem_p"
,
pipeline
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffWeightsMemoryFromWeightsPrimitive
(
void
*
ptr
)
{
return
this
->
AcquireMemoryFromPrimitive
(
conv_bwd_weights_pd_
->
diff_weights_primitive_desc
(),
ptr
,
"@diff_weights_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffDstMemoryFromDataPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
diff_dst_pd
=
conv_bwd_data_pd_
->
diff_dst_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
diff_dst_pd
,
user_pd
,
user_memory_p
,
"@data-diff_dst_mem_p"
,
pipeline
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemoryFromDataPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_weights_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
weights_pd
=
conv_bwd_data_pd_
->
weights_primitive_desc
();
auto
user_pd
=
user_weights_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
weights_pd
,
user_pd
,
user_weights_memory_p
,
"@data-weights_mem_p"
,
pipeline
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireResidualDataMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
)
{
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_residual_data_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDstMemoryFromResidualDataMemory
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>&
user_residual_memory_p
,
void
*
dst_ptr
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
return
this
->
AcquireMemory
(
user_residual_memory_p
,
this
->
AcquireDstMemoryFromPrimitive
(
dst_ptr
),
"@residual_data_mem_p"
,
pipeline
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffSrcMemoryFromDataPrimitive
(
void
*
ptr
)
{
return
this
->
AcquireMemoryFromPrimitive
(
conv_bwd_data_pd_
->
diff_src_primitive_desc
(),
ptr
,
"@diff_src_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDstMemoryFromPrimitive
(
void
*
ptr
)
{
return
this
->
AcquireMemoryFromPrimitive
(
conv_pd_
->
dst_primitive_desc
(),
ptr
,
"@dst_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireSrcMemoryFromPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
src_pd
=
conv_pd_
->
src_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
src_pd
,
user_pd
,
user_memory_p
,
"@src_mem_p"
,
pipeline
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemoryFromPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_weights_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
,
// NOLINT
bool
is_persistent
=
false
)
{
auto
user_weights_pd
=
user_weights_memory_p
->
get_primitive_desc
();
auto
weights_pd
=
conv_pd_
->
weights_primitive_desc
();
return
this
->
AcquireMemory
(
weights_pd
,
user_weights_pd
,
user_weights_memory_p
,
"@weights_mem_p"
,
pipeline
,
is_persistent
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireBiasMemoryFromPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_bias_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
user_bias_pd
=
user_bias_memory_p
->
get_primitive_desc
();
auto
bias_pd
=
conv_pd_
->
bias_primitive_desc
();
return
this
->
AcquireMemory
(
bias_pd
,
user_bias_pd
,
user_bias_memory_p
,
"@bias_mem_p"
,
pipeline
);
}
std
::
shared_ptr
<
mkldnn
::
convolution_forward
>
AcquireConvolution
(
std
::
shared_ptr
<
mkldnn
::
memory
>
src_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
weights_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
dst_memory_p
)
{
auto
prim_key
=
key_
+
"@conv_p"
;
auto
conv_p
=
std
::
static_pointer_cast
<
mkldnn
::
convolution_forward
>
(
dev_ctx_
.
GetBlob
(
prim_key
));
PADDLE_ENFORCE
((
conv_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find convolution primitive in device context"
);
if
(
conv_p
==
nullptr
)
{
conv_p
=
std
::
make_shared
<
mkldnn
::
convolution_forward
>
(
*
conv_pd_
,
*
(
src_memory_p
),
*
(
weights_memory_p
.
get
()),
*
(
dst_memory_p
.
get
()));
dev_ctx_
.
SetBlob
(
prim_key
,
conv_p
);
}
else
{
is_reusing_
=
true
;
}
return
conv_p
;
}
std
::
shared_ptr
<
mkldnn
::
convolution_forward
>
AcquireConvolution
(
std
::
shared_ptr
<
mkldnn
::
memory
>
src_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
weights_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
bias_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
dst_memory_p
)
{
auto
prim_key
=
key_
+
"@conv_p"
;
auto
conv_p
=
std
::
static_pointer_cast
<
mkldnn
::
convolution_forward
>
(
dev_ctx_
.
GetBlob
(
prim_key
));
PADDLE_ENFORCE
((
conv_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find convolution primitive in device context"
);
if
(
conv_p
==
nullptr
)
{
conv_p
=
std
::
make_shared
<
mkldnn
::
convolution_forward
>
(
*
conv_pd_
,
*
(
src_memory_p
),
*
(
weights_memory_p
.
get
()),
*
(
bias_memory_p
.
get
()),
*
(
dst_memory_p
.
get
()));
dev_ctx_
.
SetBlob
(
prim_key
,
conv_p
);
}
else
{
is_reusing_
=
true
;
}
return
conv_p
;
}
std
::
shared_ptr
<
mkldnn
::
convolution_backward_weights
>
AcquireConvolutionBackwardWeights
(
std
::
shared_ptr
<
mkldnn
::
memory
>
src_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
diff_dst_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
diff_weights_memory_p
)
{
auto
prim_key
=
key_
+
"@conv_bwd_weights_p"
;
auto
conv_bwd_weights_p
=
std
::
static_pointer_cast
<
mkldnn
::
convolution_backward_weights
>
(
dev_ctx_
.
GetBlob
(
prim_key
));
PADDLE_ENFORCE
(
(
conv_bwd_weights_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find convolution bwd weights primitive in device context"
);
if
(
conv_bwd_weights_p
==
nullptr
)
{
// create backward conv primitive for weights
conv_bwd_weights_p
=
std
::
make_shared
<
mkldnn
::
convolution_backward_weights
>
(
*
conv_bwd_weights_pd_
,
*
src_memory_p
,
*
diff_dst_memory_p
,
*
diff_weights_memory_p
);
dev_ctx_
.
SetBlob
(
prim_key
,
conv_bwd_weights_p
);
}
else
{
is_reusing_
=
true
;
}
return
conv_bwd_weights_p
;
}
std
::
shared_ptr
<
mkldnn
::
convolution_backward_data
>
AcquireConvolutionBackwardData
(
std
::
shared_ptr
<
mkldnn
::
memory
>
diff_dst_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
weights_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
diff_src_memory_p
)
{
auto
prim_key
=
key_
+
"@conv_bwd_data_p"
;
auto
conv_bwd_data_p
=
std
::
static_pointer_cast
<
mkldnn
::
convolution_backward_data
>
(
dev_ctx_
.
GetBlob
(
prim_key
));
PADDLE_ENFORCE
(
(
conv_bwd_data_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find convolution bwd data primitive in device context"
);
if
(
conv_bwd_data_p
==
nullptr
)
{
conv_bwd_data_p
=
std
::
make_shared
<
mkldnn
::
convolution_backward_data
>
(
*
conv_bwd_data_pd_
,
*
diff_dst_memory_p
,
*
weights_memory_p
,
*
diff_src_memory_p
);
dev_ctx_
.
SetBlob
(
prim_key
,
conv_bwd_data_p
);
}
else
{
is_reusing_
=
true
;
}
return
conv_bwd_data_p
;
}
// Generate keys for storing/retriving primitives for this operator
// TODO(jczaja): Make hashing function more optimial
static
std
::
string
GetHash
(
memory
::
dims
&
input_dims
,
// NOLINT
memory
::
dims
&
weights_dims
,
// NOLINT
std
::
vector
<
int
>&
strides
,
// NOLINT
std
::
vector
<
int
>&
paddings
,
// NOLINT
std
::
vector
<
int
>&
dilations
,
// NOLINT
int
groups
,
const
std
::
string
&
suffix
)
{
return
dims2str
(
input_dims
)
+
dims2str
(
weights_dims
)
+
dims2str
(
strides
)
+
dims2str
(
paddings
)
+
dims2str
(
dilations
)
+
std
::
to_string
(
groups
)
+
suffix
;
}
private:
std
::
shared_ptr
<
mkldnn
::
convolution_forward
::
primitive_desc
>
conv_pd_
;
std
::
shared_ptr
<
mkldnn
::
convolution_backward_weights
::
primitive_desc
>
conv_bwd_weights_pd_
;
std
::
shared_ptr
<
mkldnn
::
convolution_backward_data
::
primitive_desc
>
conv_bwd_data_pd_
;
};
template
<
typename
T
>
template
<
typename
T
>
class
ConvMKLDNNOpKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
class
ConvMKLDNNOpKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
public:
public:
...
@@ -351,7 +98,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -351,7 +98,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std
::
vector
<
int
>
dst_tz
=
paddle
::
framework
::
vectorize2int
(
output
->
dims
());
std
::
vector
<
int
>
dst_tz
=
paddle
::
framework
::
vectorize2int
(
output
->
dims
());
// Get unique name for storing MKLDNN primitives
// Get unique name for storing MKLDNN primitives
const
std
::
string
key
=
ConvMKLDNNHandler
::
GetHash
(
const
std
::
string
key
=
platform
::
ConvMKLDNNHandler
::
GetHash
(
src_tz
,
weights_tz
,
strides
,
paddings
,
dilations
,
groups
,
src_tz
,
weights_tz
,
strides
,
paddings
,
dilations
,
groups
,
ctx
.
op
().
Output
(
"Output"
));
ctx
.
op
().
Output
(
"Output"
));
const
std
::
string
key_conv_pd
=
key
+
"@conv_pd"
;
const
std
::
string
key_conv_pd
=
key
+
"@conv_pd"
;
...
@@ -400,7 +147,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -400,7 +147,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
// Save conv_pd/src_memory/weights_memory for backward pass
// Save conv_pd/src_memory/weights_memory for backward pass
if
(
!
is_test
)
dev_ctx
.
SetBlob
(
key_conv_pd
,
conv_pd
);
if
(
!
is_test
)
dev_ctx
.
SetBlob
(
key_conv_pd
,
conv_pd
);
ConvMKLDNNHandler
handler
(
conv_pd
,
dev_ctx
,
mkldnn_engine
,
key
);
platform
::
ConvMKLDNNHandler
handler
(
conv_pd
,
dev_ctx
,
mkldnn_engine
,
key
);
// create mkldnn memory from input tensors (data/weights)
// create mkldnn memory from input tensors (data/weights)
auto
user_src_memory_p
=
auto
user_src_memory_p
=
...
@@ -616,9 +363,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -616,9 +363,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// Get an unique name from "argument" name of "Output" variable
// Get an unique name from "argument" name of "Output" variable
// as well as attributes of primitive to be created
// as well as attributes of primitive to be created
// This name will be used as key when saving info into device context
// This name will be used as key when saving info into device context
const
std
::
string
key
=
const
std
::
string
key
=
platform
::
ConvMKLDNNHandler
::
GetHash
(
ConvMKLDNNHandler
::
GetHash
(
src_tz
,
weights_tz
,
strides
,
padding
s
,
src_tz
,
weights_tz
,
strides
,
paddings
,
dilations
,
group
s
,
dilations
,
groups
,
ctx
.
op
().
Input
(
"Output"
));
ctx
.
op
().
Input
(
"Output"
));
const
std
::
string
key_conv_pd
=
key
+
"@conv_pd"
;
const
std
::
string
key_conv_pd
=
key
+
"@conv_pd"
;
std
::
vector
<
primitive
>
pipeline
;
std
::
vector
<
primitive
>
pipeline
;
...
@@ -673,8 +420,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
...
@@ -673,8 +420,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
std
::
make_shared
<
mkldnn
::
convolution_backward_data
::
primitive_desc
>
(
std
::
make_shared
<
mkldnn
::
convolution_backward_data
::
primitive_desc
>
(
conv_bwd_data_desc
,
mkldnn_engine
,
*
conv_pd
);
conv_bwd_data_desc
,
mkldnn_engine
,
*
conv_pd
);
ConvMKLDNNHandler
handler
(
conv_pd
,
conv_bwd_data_pd
,
conv_bwd_weights_pd
,
platform
::
ConvMKLDNNHandler
handler
(
conv_pd
,
conv_bwd_data_pd
,
dev_ctx
,
mkldnn_engine
,
key
);
conv_bwd_weights_pd
,
dev_ctx
,
mkldnn_engine
,
key
);
// create mkldnn memory from input tensors (data/weights)
// create mkldnn memory from input tensors (data/weights)
auto
user_src_memory_p
=
auto
user_src_memory_p
=
...
...
paddle/fluid/operators/conv_transpose_mkldnn_op.cc
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
framework
::
DataLayout
;
template
<
typename
T
>
class
ConvTransposeMKLDNNOpKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
paddle
::
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_ENFORCE
(
paddle
::
platform
::
is_cpu_place
(
ctx
.
GetPlace
()),
"It must use CPUPlace."
);
const
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
PADDLE_ENFORCE
(
is_test
==
true
,
"ConvTransposeMKLDNN works only for inference!. Set is_test = True"
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
paddle
::
platform
::
MKLDNNDeviceContext
>();
const
auto
&
mkldnn_engine
=
dev_ctx
.
GetEngine
();
auto
*
input
=
ctx
.
Input
<
Tensor
>
(
"Input"
);
auto
*
filter
=
ctx
.
Input
<
Tensor
>
(
"Filter"
);
auto
*
bias
=
ctx
.
HasInput
(
"Bias"
)
?
ctx
.
Input
<
Tensor
>
(
"Bias"
)
:
nullptr
;
auto
*
output
=
ctx
.
Output
<
Tensor
>
(
"Output"
);
PADDLE_ENFORCE
(
input
->
layout
()
==
DataLayout
::
kMKLDNN
&&
input
->
format
()
!=
mkldnn
::
memory
::
format
::
format_undef
,
"Wrong layout/format set for Input tensor"
);
PADDLE_ENFORCE
(
filter
->
layout
()
==
DataLayout
::
kMKLDNN
&&
filter
->
format
()
!=
mkldnn
::
memory
::
format
::
format_undef
,
"Wrong layout/format set for Filter tensor"
);
PADDLE_ENFORCE
(
input
->
dims
().
size
()
==
4
,
"Input must be with 4 dimensions, i.e. NCHW"
);
PADDLE_ENFORCE
(
filter
->
dims
().
size
()
==
4
,
"Filter must be with 4 dimensions, i.e. OIHW"
);
if
(
bias
)
{
PADDLE_ENFORCE
(
bias
->
layout
()
==
DataLayout
::
kMKLDNN
&&
bias
->
format
()
!=
mkldnn
::
memory
::
format
::
format_undef
,
"Wrong layout/format set for Bias tensor"
);
PADDLE_ENFORCE
(
bias
->
dims
().
size
()
==
1
,
"Bias must only have 1 dimension, i.e. X"
);
}
std
::
vector
<
int
>
strides
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
std
::
vector
<
int
>
dilations
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"dilations"
);
int
groups
=
ctx
.
Attr
<
int
>
(
"groups"
);
// TODO(tpatejko): add support for dilation
PADDLE_ENFORCE
(
dilations
.
size
()
==
2
&&
dilations
[
0
]
==
1
&&
dilations
[
1
]
==
1
,
"dilation in convolution is not implemented yet"
);
const
T
*
input_data
=
input
->
data
<
T
>
();
const
T
*
filter_data
=
filter
->
data
<
T
>
();
std
::
vector
<
int
>
src_tz
=
paddle
::
framework
::
vectorize2int
(
input
->
dims
());
std
::
vector
<
int
>
iohw_weights_tz
=
paddle
::
framework
::
vectorize2int
(
filter
->
dims
());
std
::
vector
<
int
>
weights_tz
=
iohw_weights_tz
;
// IOHW -> OIHW
weights_tz
[
0
]
=
iohw_weights_tz
[
1
];
weights_tz
[
1
]
=
iohw_weights_tz
[
0
];
// Custom Reorder from IOHW to OIHW
auto
iohw2oihw_reorder
=
[
&
iohw_weights_tz
](
const
T
*
filter_data
)
->
std
::
shared_ptr
<
T
>
{
int
o
=
iohw_weights_tz
[
1
];
int
c
=
iohw_weights_tz
[
0
];
int
h
=
iohw_weights_tz
[
2
];
int
w
=
iohw_weights_tz
[
3
];
std
::
shared_ptr
<
T
>
reordered_filter_data
(
new
T
[
o
*
c
*
h
*
w
](),
std
::
default_delete
<
T
[]
>
());
for
(
int
i
=
0
;
i
<
c
;
++
i
)
{
for
(
int
j
=
0
;
j
<
o
;
++
j
)
{
int
in_offset
=
j
*
h
*
w
+
i
*
o
*
h
*
w
;
int
out_offset
=
j
*
c
*
h
*
w
+
i
*
h
*
w
;
std
::
memcpy
(
&
(
reordered_filter_data
.
get
())[
out_offset
],
&
filter_data
[
in_offset
],
h
*
w
*
sizeof
(
T
));
}
}
return
reordered_filter_data
;
};
int
g
=
std
::
max
(
groups
,
1
);
if
(
g
>
1
)
{
int
o
=
weights_tz
[
0
];
int
i
=
weights_tz
[
1
];
int
h
=
weights_tz
[
2
];
int
w
=
weights_tz
[
3
];
weights_tz
.
resize
(
5
);
weights_tz
[
0
]
=
g
;
weights_tz
[
1
]
=
o
/
g
;
weights_tz
[
2
]
=
i
;
weights_tz
[
3
]
=
h
;
weights_tz
[
4
]
=
w
;
}
std
::
vector
<
int
>
dst_tz
=
paddle
::
framework
::
vectorize2int
(
output
->
dims
());
// Get unique name for storing MKLDNN primitives
const
std
::
string
key
=
platform
::
ConvTransposeMKLDNNHandler
::
GetHash
(
src_tz
,
weights_tz
,
strides
,
paddings
,
dilations
,
groups
,
ctx
.
op
().
Output
(
"Output"
));
const
std
::
string
key_conv_transpose_pd
=
key
+
"@conv_transpose_pd"
;
std
::
vector
<
mkldnn
::
primitive
>
pipeline
;
auto
user_src_md
=
platform
::
MKLDNNMemDesc
(
{
src_tz
},
platform
::
MKLDNNGetDataType
<
T
>
(),
input
->
format
());
auto
user_weights_md
=
platform
::
MKLDNNMemDesc
({
weights_tz
},
platform
::
MKLDNNGetDataType
<
T
>
(),
(
g
==
1
)
?
mkldnn
::
memory
::
format
::
oihw
:
mkldnn
::
memory
::
format
::
goihw
);
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
* the memory format preferred for best performance
*/
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
auto
chosen_memory_format
=
platform
::
data_format_to_memory_format
(
data_format
);
bool
fuse_relu
=
ctx
.
Attr
<
bool
>
(
"fuse_relu"
);
auto
src_md
=
platform
::
MKLDNNMemDesc
(
src_tz
,
platform
::
MKLDNNGetDataType
<
T
>
(),
chosen_memory_format
);
auto
weights_md
=
platform
::
MKLDNNMemDesc
(
weights_tz
,
platform
::
MKLDNNGetDataType
<
T
>
(),
chosen_memory_format
);
std
::
vector
<
int
>
bias_tz
;
// TODO(mgallus): avoid empty vector creation.
// Currently used whenever bias is != nullptr.
auto
dst_md
=
platform
::
MKLDNNMemDesc
(
dst_tz
,
platform
::
MKLDNNGetDataType
<
T
>
(),
chosen_memory_format
);
// create a deconv(conv transpose) primitive descriptor and save it for
// usage in backward
std
::
shared_ptr
<
mkldnn
::
deconvolution_forward
::
primitive_desc
>
conv_transpose_pd
;
auto
fwd_prop_kind
=
is_test
?
mkldnn
::
prop_kind
::
forward_inference
:
mkldnn
::
prop_kind
::
forward_training
;
if
(
bias
)
{
bias_tz
=
paddle
::
framework
::
vectorize2int
(
bias
->
dims
());
auto
bias_md
=
platform
::
MKLDNNMemDesc
(
bias_tz
,
platform
::
MKLDNNGetDataType
<
T
>
(),
mkldnn
::
memory
::
format
::
x
);
conv_transpose_pd
=
ConvTransposeFwdPrimitiveDesc
(
src_md
,
weights_md
,
bias_md
,
dst_md
,
strides
,
paddings
,
mkldnn_engine
,
fuse_relu
,
fwd_prop_kind
);
}
else
{
conv_transpose_pd
=
ConvTransposeFwdPrimitiveDesc
(
src_md
,
weights_md
,
dst_md
,
strides
,
paddings
,
mkldnn_engine
,
fuse_relu
,
fwd_prop_kind
);
}
// Save conv_pd/src_memory/weights_memory for backward pass
if
(
!
is_test
)
dev_ctx
.
SetBlob
(
key_conv_transpose_pd
,
conv_transpose_pd
);
platform
::
ConvTransposeMKLDNNHandler
handler
(
conv_transpose_pd
,
dev_ctx
,
mkldnn_engine
,
key
);
// create mkldnn memory from input tensors (data/weights)
auto
user_src_memory_p
=
handler
.
AcquireSrcMemory
(
user_src_md
,
platform
::
to_void_cast
<
T
>
(
input_data
));
auto
user_weights_memory_p
=
handler
.
AcquireWeightsMemory
(
user_weights_md
,
platform
::
to_void_cast
<
T
>
(
filter_data
),
is_test
?
iohw2oihw_reorder
:
platform
::
user_function
());
// create reorder primitive if the input format is not the preferred one
auto
src_memory_p
=
handler
.
AcquireSrcMemoryFromPrimitive
(
user_src_memory_p
,
pipeline
);
auto
weights_memory_p
=
handler
.
AcquireWeightsMemoryFromPrimitive
(
user_weights_memory_p
,
pipeline
,
is_test
);
std
::
shared_ptr
<
mkldnn
::
memory
>
dst_memory_p
;
auto
output_data
=
output
->
mutable_data
<
T
>
(
ctx
.
GetPlace
(),
paddle
::
memory
::
Allocator
::
kDefault
,
handler
.
GetDstMemorySize
());
dst_memory_p
=
handler
.
AcquireDstMemoryFromPrimitive
(
platform
::
to_void_cast
<
T
>
(
output_data
));
// create convolution op primitive
std
::
shared_ptr
<
mkldnn
::
deconvolution_forward
>
conv_p
;
if
(
bias
)
{
const
T
*
bias_data
=
bias
->
data
<
T
>
();
auto
user_bias_md
=
platform
::
MKLDNNMemDesc
({
bias_tz
},
platform
::
MKLDNNGetDataType
<
T
>
(),
mkldnn
::
memory
::
format
::
x
);
auto
user_bias_memory_p
=
handler
.
AcquireBiasMemory
(
user_bias_md
,
platform
::
to_void_cast
<
T
>
(
bias_data
));
auto
bias_memory_p
=
handler
.
AcquireBiasMemoryFromPrimitive
(
user_bias_memory_p
,
pipeline
);
conv_p
=
handler
.
AcquireConvolution
(
src_memory_p
,
weights_memory_p
,
bias_memory_p
,
dst_memory_p
);
}
else
{
conv_p
=
handler
.
AcquireConvolution
(
src_memory_p
,
weights_memory_p
,
dst_memory_p
);
}
// push primitive to stream and wait until it's executed
pipeline
.
push_back
(
*
conv_p
);
mkldnn
::
stream
(
mkldnn
::
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
output
->
set_layout
(
DataLayout
::
kMKLDNN
);
output
->
set_format
(
platform
::
GetMKLDNNFormat
(
*
dst_memory_p
));
}
private:
mkldnn
::
primitive_attr
CreatePostOps
(
bool
fuse_relu
)
const
{
mkldnn
::
primitive_attr
conv_attr
;
mkldnn
::
post_ops
post_operations
;
// Fusion with ReLU layer is executed through the PostOps feature. Create a
// PostOps object and configure it to execute an eltwise relu operation.
if
(
fuse_relu
)
{
constexpr
float
scale
=
1.0
f
;
constexpr
float
negative_slope
=
0.0
f
;
constexpr
float
placeholder
=
0.0
f
;
post_operations
.
append_eltwise
(
scale
,
mkldnn
::
algorithm
::
eltwise_relu
,
negative_slope
,
placeholder
);
}
conv_attr
.
set_post_ops
(
post_operations
);
return
conv_attr
;
}
std
::
unique_ptr
<
mkldnn
::
deconvolution_forward
::
primitive_desc
>
ConvTransposeFwdPrimitiveDesc
(
const
mkldnn
::
memory
::
desc
&
src
,
const
mkldnn
::
memory
::
desc
&
weights
,
const
mkldnn
::
memory
::
desc
&
dst
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
mkldnn
::
engine
&
engine
,
const
bool
fuse_relu
,
mkldnn
::
prop_kind
fwd_prop_kind
)
const
{
mkldnn
::
memory
::
dims
stride_dims
=
{
strides
[
0
],
strides
[
1
]};
mkldnn
::
memory
::
dims
padding_dims
=
{
paddings
[
0
],
paddings
[
1
]};
auto
deconv_desc
=
mkldnn
::
deconvolution_forward
::
desc
(
fwd_prop_kind
,
mkldnn
::
deconvolution_direct
,
src
,
weights
,
dst
,
stride_dims
,
padding_dims
,
padding_dims
,
mkldnn
::
padding_kind
::
zero
);
mkldnn
::
primitive_attr
deconv_attr
=
CreatePostOps
(
fuse_relu
);
auto
p_conv_transpose_pd
=
new
mkldnn
::
deconvolution_forward
::
primitive_desc
(
deconv_desc
,
deconv_attr
,
engine
);
return
std
::
unique_ptr
<
mkldnn
::
deconvolution_forward
::
primitive_desc
>
(
p_conv_transpose_pd
);
}
std
::
unique_ptr
<
mkldnn
::
deconvolution_forward
::
primitive_desc
>
ConvTransposeFwdPrimitiveDesc
(
const
mkldnn
::
memory
::
desc
&
src
,
const
mkldnn
::
memory
::
desc
&
weights
,
const
mkldnn
::
memory
::
desc
&
bias
,
const
mkldnn
::
memory
::
desc
&
dst
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
mkldnn
::
engine
&
engine
,
const
bool
fuse_relu
,
mkldnn
::
prop_kind
fwd_prop_kind
)
const
{
mkldnn
::
memory
::
dims
stride_dims
=
{
strides
[
0
],
strides
[
1
]};
mkldnn
::
memory
::
dims
padding_dims
=
{
paddings
[
0
],
paddings
[
1
]};
auto
deconv_desc
=
mkldnn
::
deconvolution_forward
::
desc
(
fwd_prop_kind
,
mkldnn
::
deconvolution_direct
,
src
,
weights
,
bias
,
dst
,
stride_dims
,
padding_dims
,
padding_dims
,
mkldnn
::
padding_kind
::
zero
);
mkldnn
::
primitive_attr
deconv_attr
=
CreatePostOps
(
fuse_relu
);
auto
p_conv_transpose_pd
=
new
mkldnn
::
deconvolution_forward
::
primitive_desc
(
deconv_desc
,
deconv_attr
,
engine
);
return
std
::
unique_ptr
<
mkldnn
::
deconvolution_forward
::
primitive_desc
>
(
p_conv_transpose_pd
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_KERNEL
(
conv2d_transpose
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
ops
::
ConvTransposeMKLDNNOpKernel
<
float
>
);
paddle/fluid/operators/conv_transpose_op.cc
浏览文件 @
64ad051b
...
@@ -16,6 +16,10 @@ limitations under the License. */
...
@@ -16,6 +16,10 @@ limitations under the License. */
#include <string>
#include <string>
#include <vector>
#include <vector>
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -78,29 +82,38 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
...
@@ -78,29 +82,38 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
framework
::
OpKernelType
ConvTransposeOp
::
GetExpectedKernelType
(
framework
::
OpKernelType
ConvTransposeOp
::
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
const
framework
::
ExecutionContext
&
ctx
)
const
{
framework
::
LibraryType
library_
{
framework
::
LibraryType
::
kPlain
};
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
framework
::
DataLayout
layout_
=
framework
::
StringToDataLayout
(
data_format
);
bool
use_cudnn
=
ctx
.
Attr
<
bool
>
(
"use_cudnn"
);
bool
use_cudnn
=
ctx
.
Attr
<
bool
>
(
"use_cudnn"
);
use_cudnn
&=
platform
::
is_gpu_place
(
ctx
.
GetPlace
());
use_cudnn
&=
platform
::
is_gpu_place
(
ctx
.
GetPlace
());
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
use_cudnn
&=
dev_ctx
.
cudnn_handle
()
!=
nullptr
;
use_cudnn
&=
dev_ctx
.
cudnn_handle
()
!=
nullptr
;
if
(
use_cudnn
)
{
library_
=
framework
::
LibraryType
::
kCUDNN
;
}
}
}
#endif
#endif
framework
::
LibraryType
library_
;
#ifdef PADDLE_WITH_MKLDNN
if
(
use_cudnn
)
{
if
(
library_
==
framework
::
LibraryType
::
kPlain
&&
library_
=
framework
::
LibraryType
::
kCUDNN
;
platform
::
CanMKLDNNBeUsed
(
ctx
))
{
}
else
{
library_
=
framework
::
LibraryType
::
kMKLDNN
;
l
ibrary_
=
framework
::
LibraryType
::
kPlain
;
l
ayout_
=
framework
::
DataLayout
::
kMKLDNN
;
}
}
#endif
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
framework
::
DataLayout
layout_
=
framework
::
StringToDataLayout
(
data_format
);
return
framework
::
OpKernelType
(
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"Input"
)
->
type
()),
ctx
.
GetPlace
(),
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"Input"
)
->
type
()),
ctx
.
GetPlace
(),
layout_
,
library_
);
layout_
,
library_
);
}
}
void
Conv2DTransposeOpMaker
::
Make
()
{
void
Conv2DTransposeOpMaker
::
Make
()
{
AddAttr
<
bool
>
(
"is_test"
,
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true."
)
.
SetDefault
(
false
);
AddInput
(
AddInput
(
"Input"
,
"Input"
,
"(Tensor) The input tensor of convolution transpose operator. "
"(Tensor) The input tensor of convolution transpose operator. "
...
@@ -145,6 +158,11 @@ void Conv2DTransposeOpMaker::Make() {
...
@@ -145,6 +158,11 @@ void Conv2DTransposeOpMaker::Make() {
"use_cudnn"
,
"use_cudnn"
,
"(bool, default false) Only used in cudnn kernel, need install cudnn"
)
"(bool, default false) Only used in cudnn kernel, need install cudnn"
)
.
SetDefault
(
false
);
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"use_mkldnn"
,
"(bool, default false) Only used in mkldnn kernel"
)
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"fuse_relu"
,
"(bool, default false) Only used in mkldnn kernel"
)
.
SetDefault
(
false
);
AddAttr
<
std
::
string
>
(
AddAttr
<
std
::
string
>
(
"data_format"
,
"data_format"
,
"(string, default NCHW) Only used in "
"(string, default NCHW) Only used in "
...
@@ -238,6 +256,9 @@ void Conv3DTransposeOpMaker::Make() {
...
@@ -238,6 +256,9 @@ void Conv3DTransposeOpMaker::Make() {
"use_cudnn"
,
"use_cudnn"
,
"(bool, default false) Only used in cudnn kernel, need install cudnn"
)
"(bool, default false) Only used in cudnn kernel, need install cudnn"
)
.
SetDefault
(
false
);
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"use_mkldnn"
,
"(bool, default false) Only used in mkldnn kernel"
)
.
SetDefault
(
false
);
AddAttr
<
std
::
string
>
(
AddAttr
<
std
::
string
>
(
"data_format"
,
"data_format"
,
"(string, default NCHW) Only used in "
"(string, default NCHW) Only used in "
...
...
paddle/fluid/operators/cudnn_lstm_op.cc
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
class
CudnnLSTMOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Input"
),
"Input(Input) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"W"
),
"Input(Weight) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"InitH"
),
"Input(init_h) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"InitC"
),
"Input(init_c) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Cache"
),
"Input(Cache) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"last_h"
),
"Output(last_h) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"last_c"
),
"Output(last_c) of LSTM should not be null."
);
auto
in_dims
=
ctx
->
GetInputDim
(
"Input"
);
PADDLE_ENFORCE_EQ
(
in_dims
.
size
(),
3
,
"Input(X)'s rank must be 3."
);
ctx
->
SetOutputDim
(
"Out"
,
ctx
->
GetInputDim
(
"Input"
));
ctx
->
SetOutputDim
(
"last_h"
,
ctx
->
GetInputDim
(
"InitH"
));
ctx
->
SetOutputDim
(
"last_c"
,
ctx
->
GetInputDim
(
"InitC"
));
}
};
class
CudnnLSTMOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"Input"
,
"(Tensor) RNN input tensor, which support variable-time length input "
"sequence."
"The shape of the Tensor MUST be ( seq_len * batch_size * input_size)"
"seq_len is the total time step in this mini-batch (CAN be change in "
"different batch)"
"batch_size is the instance number of this batch"
"input_size is the hidden size of the input."
"input_hidden_size and the hidden_size in the next may not be same"
);
AddInput
(
"InitH"
,
"(Tensor) the initial hidden state of the LSTM"
"input. This is a tensor with shape (num_layers x batch_size x "
"hidden_size)"
"and When is_bidirec is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size)"
);
AddInput
(
"InitC"
,
"(Tensor) the initial cell state of the LSTm "
"input. This is a tensor with shape (num_layers x batch_size x "
"hidden_size)"
"and When is_bidirec is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size)"
);
AddInput
(
"W"
,
"(Tensor) the learnable hidden-hidden weights."
" The shape is (N), where N is total weight size of the LSTM. "
" cudnn concatenate all the weight to one Tensor"
);
AddInput
(
"Cache"
,
"The cache of dropout op, a RAW type variable including random "
"number generator states and some descriptors, which is used in "
"cudnn kernel."
)
.
AsDispensable
();
AddOutput
(
"Out"
,
"(Tensor) the hidden state of LSTM operator. "
"The shape is ( seq_len x batch_size x hidden_size) if "
"is_bidirec is False"
"and When is_bidirec is True, the shape will be ( seq_len x "
"batch_size x hidden_size * 2) "
);
AddOutput
(
"last_h"
,
"(Tensor) the hidden state of the last step. "
"The shape is ( num_layers x batch_size x hidden_size) if "
"is_bidirec is False"
"and When is_bidirec is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size)"
);
AddOutput
(
"last_c"
,
"(Tensor) the cell state of the last step"
"The shape is ( num_layers x batch_size x hidden_size) if "
"is_bidirec is False"
"and When is_bidirect is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size*2)"
);
AddAttr
<
int
>
(
"max_len"
,
"max length of the LSTM op"
"the first dim of the Input can NOT be greater than max_len"
)
.
SetDefault
(
20
);
AddAttr
<
float
>
(
"dropout_prob"
,
"dropout prob of the dropout op"
"the dropout ONLY work between lstm layers, not between time steps"
"There is no dropout work on the Out tensor"
)
.
SetDefault
(
0.0
);
AddAttr
<
bool
>
(
"is_bidirec"
,
"is_bidirec"
"if it is bidirection rnn"
"The will affect the shape of the Out, last_h, and last_c"
)
.
SetDefault
(
false
);
AddAttr
<
int
>
(
"input_size"
,
"input size ot the Input Tensor"
).
SetDefault
(
10
);
AddAttr
<
int
>
(
"hidden_size"
,
"hidden size of the LSTM"
).
SetDefault
(
100
);
AddAttr
<
int
>
(
"num_layers"
,
"the total layer number of the LSTM"
)
.
SetDefault
(
1
);
AddAttr
<
bool
>
(
"is_test"
,
"True if in test phase."
).
SetDefault
(
false
);
AddAttr
<
int
>
(
"seed"
,
"seed to used if fix_seed is True"
).
SetDefault
(
-
1
);
AddComment
(
R"DOC(
CUDNN LSTM implementation
A four-gate Long Short-Term Memory network with no peephole connections.
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
$$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
$$ f_t = sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
$$ o_t = sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
$$ h_t = o_t \\odot tanh(c_t) $$
- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
of weights from the input gate to the input)
- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
- sigmoid is the logistic sigmoid function.
- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$.
- The $\odot$ is the element-wise product of the vectors.
- `tanh` is the activation functions.
- $\tilde{c_t}$ is also called candidate hidden state,
which is computed based on the current input and the previous hidden state.
Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
X represensts a matrix multiplication
)DOC"
);
}
};
class
CudnnLSTMGradOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Input"
),
"Input(Input) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"W"
),
"Input(W) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"last_h"
),
"Input(last_h) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"last_c"
),
"Input(last_c) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Cache"
),
"Input(last_c) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"InitH"
),
"Input(init_h) of LSTM should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"InitC"
),
"Input(init_c) of LSTM should not be null."
);
auto
SetOutGradDim
=
[
&
ctx
](
const
std
::
string
&
name
)
{
auto
g_name
=
framework
::
GradVarName
(
name
);
if
(
ctx
->
HasOutput
(
g_name
))
{
ctx
->
SetOutputDim
(
g_name
,
ctx
->
GetInputDim
(
name
));
}
};
SetOutGradDim
(
"Input"
);
SetOutGradDim
(
"W"
);
SetOutGradDim
(
"InitH"
);
SetOutGradDim
(
"InitC"
);
}
};
template
<
typename
T
>
class
NotImpleKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_THROW
(
"CPU is not support for this kernel now. Will be add in the future"
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
cudnn_lstm
,
ops
::
CudnnLSTMOp
,
ops
::
CudnnLSTMOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
cudnn_lstm_grad
,
ops
::
CudnnLSTMGradOp
);
REGISTER_OP_CPU_KERNEL
(
cudnn_lstm
,
ops
::
NotImpleKernel
<
float
>
);
REGISTER_OP_CPU_KERNEL
(
cudnn_lstm_grad
,
ops
::
NotImpleKernel
<
float
>
);
paddle/fluid/operators/cudnn_lstm_op.cu.cc
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace
paddle
{
namespace
operators
{
using
LoDTensor
=
framework
::
LoDTensor
;
using
Tensor
=
framework
::
Tensor
;
struct
CudnnRNNCache
{
CudnnRNNCache
()
{
x_desc_
=
NULL
;
y_desc_
=
NULL
;
dx_desc_
=
NULL
;
dy_desc_
=
NULL
;
}
~
CudnnRNNCache
()
{
release
();
}
cudnnRNNDescriptor_t
rnn_desc_
;
cudnnTensorDescriptor_t
*
x_desc_
;
cudnnTensorDescriptor_t
*
y_desc_
;
cudnnTensorDescriptor_t
*
dx_desc_
;
cudnnTensorDescriptor_t
*
dy_desc_
;
cudnnTensorDescriptor_t
hx_desc_
;
cudnnTensorDescriptor_t
cx_desc_
;
cudnnTensorDescriptor_t
hy_desc_
;
cudnnTensorDescriptor_t
cy_desc_
;
cudnnTensorDescriptor_t
dhx_desc_
;
cudnnTensorDescriptor_t
dcx_desc_
;
cudnnTensorDescriptor_t
dhy_desc_
;
cudnnTensorDescriptor_t
dcy_desc_
;
cudnnTensorDescriptor_t
output_x_desc_
;
cudnnTensorDescriptor_t
output_y_desc_
;
cudnnDropoutDescriptor_t
dropout_desc_
;
size_t
weights_size_
;
cudnnFilterDescriptor_t
w_desc_
;
cudnnFilterDescriptor_t
dw_desc_
;
size_t
workspace_size_
;
size_t
reserve_size_
;
Tensor
reserve_data_
;
Tensor
workspace_data_
;
Tensor
dropout_state_
;
size_t
max_length_
;
float
dropout_prob_
;
bool
is_bidirec_
;
int
batch_size_
;
int
input_size_
;
int
hidden_size_
;
int
num_layers_
;
int
seed_
;
void
init
(
cudnnHandle_t
handle
,
const
framework
::
ExecutionContext
&
ctx
,
size_t
max_len
,
int
batch_size
,
int
input_size
,
int
hidden_size
,
int
num_layers
,
float
dropout_prob
,
bool
is_bidirec
,
int
seed
,
int
weight_numel
)
{
max_length_
=
max_len
;
batch_size_
=
batch_size
;
input_size_
=
input_size
;
hidden_size_
=
hidden_size
;
num_layers_
=
num_layers
;
dropout_prob_
=
dropout_prob
;
is_bidirec_
=
is_bidirec
;
seed_
=
seed
;
x_desc_
=
new
cudnnTensorDescriptor_t
[
max_length_
];
y_desc_
=
new
cudnnTensorDescriptor_t
[
max_length_
];
dx_desc_
=
new
cudnnTensorDescriptor_t
[
max_length_
];
dy_desc_
=
new
cudnnTensorDescriptor_t
[
max_length_
];
int
dim_a
[
3
];
int
stride_a
[
3
];
for
(
size_t
i
=
0
;
i
<
max_length_
;
++
i
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
x_desc_
[
i
]));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
y_desc_
[
i
]));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
dx_desc_
[
i
]));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
dy_desc_
[
i
]));
dim_a
[
0
]
=
batch_size_
;
dim_a
[
1
]
=
input_size_
;
dim_a
[
2
]
=
1
;
stride_a
[
0
]
=
dim_a
[
2
]
*
dim_a
[
1
];
stride_a
[
1
]
=
dim_a
[
2
];
stride_a
[
2
]
=
1
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
x_desc_
[
i
],
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
dx_desc_
[
i
],
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
dim_a
[
0
]
=
batch_size_
;
dim_a
[
1
]
=
is_bidirec_
?
hidden_size_
*
2
:
hidden_size_
;
dim_a
[
2
]
=
1
;
stride_a
[
0
]
=
dim_a
[
2
]
*
dim_a
[
1
];
stride_a
[
1
]
=
dim_a
[
2
];
stride_a
[
2
]
=
1
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
y_desc_
[
i
],
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
dy_desc_
[
i
],
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
}
dim_a
[
0
]
=
num_layers_
*
(
is_bidirec_
?
2
:
1
);
dim_a
[
1
]
=
batch_size_
;
dim_a
[
2
]
=
hidden_size_
;
stride_a
[
0
]
=
dim_a
[
2
]
*
dim_a
[
1
];
stride_a
[
1
]
=
dim_a
[
2
];
stride_a
[
2
]
=
1
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
hx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
cx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
hy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
cy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
dhx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
dcx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
dhy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateTensorDescriptor
(
&
dcy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
hx_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
cx_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
hy_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
cy_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
dhx_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
dcx_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
dhy_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
dcy_desc_
,
CUDNN_DATA_FLOAT
,
3
,
dim_a
,
stride_a
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateDropoutDescriptor
(
&
dropout_desc_
));
size_t
state_size
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDropoutGetStatesSize
(
handle
,
&
state_size
);
dropout_state_
.
Resize
({
static_cast
<
int64_t
>
(
state_size
)}));
auto
*
dropout_state_data
=
dropout_state_
.
mutable_data
<
uint8_t
>
(
ctx
.
GetPlace
());
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetDropoutDescriptor
(
dropout_desc_
,
handle
,
dropout_prob_
,
dropout_state_data
,
state_size
,
seed_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateRNNDescriptor
(
&
rnn_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetRNNDescriptor_v6
(
handle
,
rnn_desc_
,
hidden_size_
,
num_layers_
,
dropout_desc_
,
CUDNN_LINEAR_INPUT
,
is_bidirec_
?
CUDNN_BIDIRECTIONAL
:
CUDNN_UNIDIRECTIONAL
,
CUDNN_LSTM
,
CUDNN_RNN_ALGO_STANDARD
,
CUDNN_DATA_FLOAT
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateFilterDescriptor
(
&
w_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnCreateFilterDescriptor
(
&
dw_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetRNNParamsSize
(
handle
,
rnn_desc_
,
x_desc_
[
0
],
&
weights_size_
,
CUDNN_DATA_FLOAT
));
PADDLE_ENFORCE_EQ
(
weights_size_
,
sizeof
(
float
)
*
weight_numel
,
"cudnn lstm weight size should be SAME"
);
int
dim_w
[
3
];
dim_w
[
0
]
=
weights_size_
/
sizeof
(
float
);
dim_w
[
1
]
=
1
;
dim_w
[
2
]
=
1
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetFilterNdDescriptor
(
w_desc_
,
CUDNN_DATA_FLOAT
,
CUDNN_TENSOR_NCHW
,
3
,
dim_w
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetFilterNdDescriptor
(
dw_desc_
,
CUDNN_DATA_FLOAT
,
CUDNN_TENSOR_NCHW
,
3
,
dim_w
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetRNNWorkspaceSize
(
handle
,
rnn_desc_
,
max_length_
,
x_desc_
,
&
workspace_size_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetRNNTrainingReserveSize
(
handle
,
rnn_desc_
,
max_length_
,
x_desc_
,
&
reserve_size_
));
reserve_data_
.
Resize
({
static_cast
<
int64_t
>
(
reserve_size_
)});
reserve_data_
.
mutable_data
<
uint8_t
>
(
ctx
.
GetPlace
());
workspace_data_
.
Resize
({
static_cast
<
int64_t
>
(
workspace_size_
)});
workspace_data_
.
mutable_data
<
uint8_t
>
(
ctx
.
GetPlace
());
}
void
release
()
{
for
(
size_t
i
=
0
;
i
<
max_length_
;
++
i
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
x_desc_
[
i
]));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
y_desc_
[
i
]));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
dx_desc_
[
i
]));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
dy_desc_
[
i
]));
}
delete
[]
x_desc_
;
delete
[]
y_desc_
;
delete
[]
dx_desc_
;
delete
[]
dy_desc_
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
hx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
cx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
hy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
cy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
dhx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
dcx_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
dhy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
dcy_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyDropoutDescriptor
(
dropout_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyRNNDescriptor
(
rnn_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyFilterDescriptor
(
w_desc_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyFilterDescriptor
(
dw_desc_
));
}
};
template
<
typename
T
>
class
CudnnLSTMGPUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
Tensor
*
x
=
ctx
.
Input
<
Tensor
>
(
"Input"
);
const
Tensor
*
init_h
=
ctx
.
Input
<
Tensor
>
(
"InitH"
);
const
Tensor
*
init_c
=
ctx
.
Input
<
Tensor
>
(
"InitC"
);
auto
w
=
ctx
.
Input
<
Tensor
>
(
"W"
);
Tensor
*
out
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
Tensor
*
last_h
=
ctx
.
Output
<
Tensor
>
(
"last_h"
);
Tensor
*
last_c
=
ctx
.
Output
<
Tensor
>
(
"last_c"
);
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
init_h_data
=
init_h
->
data
<
T
>
();
const
T
*
init_c_data
=
init_c
->
data
<
T
>
();
const
T
*
w_data
=
w
->
data
<
T
>
();
T
*
out_data
=
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
last_h_data
=
last_h
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
last_c_data
=
last_c
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
size_t
max_len
=
ctx
.
Attr
<
int
>
(
"max_len"
);
float
dropout_prob
=
ctx
.
Attr
<
float
>
(
"dropout_prob"
);
bool
is_bidirec
=
ctx
.
Attr
<
bool
>
(
"is_bidirec"
);
int
input_size
=
ctx
.
Attr
<
int
>
(
"input_size"
);
int
hidden_size
=
ctx
.
Attr
<
int
>
(
"hidden_size"
);
int
num_layers
=
ctx
.
Attr
<
int
>
(
"num_layers"
);
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
auto
*
cache_var
=
ctx
.
InputVar
(
"Cache"
);
if
(
!
cache_var
)
{
// The RAW type cache variable wouldn't be created and broadcasted on
// multi-devices before the first running.
// use parent scope to make cache persistable
auto
*
scope
=
const_cast
<
framework
::
Scope
*>
(
ctx
.
scope
().
parent
());
auto
cache_var_name
=
ctx
.
Inputs
(
"Cache"
)[
0
];
cache_var
=
scope
->
Var
(
cache_var_name
);
}
CudnnRNNCache
*
cudnn_rnn_cache
=
nullptr
;
if
(
cache_var
->
IsInitialized
())
{
cudnn_rnn_cache
=
const_cast
<
framework
::
Variable
*>
(
cache_var
)
->
GetMutable
<
CudnnRNNCache
>
();
}
else
{
cudnn_rnn_cache
=
const_cast
<
framework
::
Variable
*>
(
cache_var
)
->
GetMutable
<
CudnnRNNCache
>
();
std
::
random_device
rnd
;
int
seed
=
ctx
.
Attr
<
int
>
(
"seed"
);
if
(
seed
==
-
1
)
{
seed
=
rnd
();
}
auto
input_w_numel
=
w
->
numel
();
auto
batch_size
=
x
->
dims
()[
1
];
cudnn_rnn_cache
->
init
(
handle
,
ctx
,
max_len
,
batch_size
,
input_size
,
hidden_size
,
num_layers
,
dropout_prob
,
is_bidirec
,
seed
,
input_w_numel
);
}
auto
run_seq_len
=
x
->
dims
()[
0
];
if
(
is_test
)
{
// for inference
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnRNNForwardInference
(
handle
,
cudnn_rnn_cache
->
rnn_desc_
,
run_seq_len
,
cudnn_rnn_cache
->
x_desc_
,
x_data
,
cudnn_rnn_cache
->
hx_desc_
,
init_h_data
,
cudnn_rnn_cache
->
cx_desc_
,
init_c_data
,
cudnn_rnn_cache
->
w_desc_
,
w_data
,
cudnn_rnn_cache
->
y_desc_
,
out_data
,
cudnn_rnn_cache
->
hy_desc_
,
last_h_data
,
cudnn_rnn_cache
->
cy_desc_
,
last_c_data
,
cudnn_rnn_cache
->
workspace_data_
.
data
<
uint8_t
>
(),
cudnn_rnn_cache
->
workspace_size_
));
}
else
{
// for train
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnRNNForwardTraining
(
handle
,
cudnn_rnn_cache
->
rnn_desc_
,
run_seq_len
,
cudnn_rnn_cache
->
x_desc_
,
x_data
,
cudnn_rnn_cache
->
hx_desc_
,
init_h_data
,
cudnn_rnn_cache
->
cx_desc_
,
init_c_data
,
cudnn_rnn_cache
->
w_desc_
,
w_data
,
cudnn_rnn_cache
->
y_desc_
,
out_data
,
cudnn_rnn_cache
->
hy_desc_
,
last_h_data
,
cudnn_rnn_cache
->
cy_desc_
,
last_c_data
,
cudnn_rnn_cache
->
workspace_data_
.
data
<
uint8_t
>
(),
cudnn_rnn_cache
->
workspace_size_
,
cudnn_rnn_cache
->
reserve_data_
.
data
<
uint8_t
>
(),
cudnn_rnn_cache
->
reserve_size_
));
}
}
};
template
<
typename
T
>
class
CudnnLSTMGPUGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
input
=
ctx
.
Input
<
Tensor
>
(
"Input"
);
auto
*
weight
=
ctx
.
Input
<
Tensor
>
(
"W"
);
auto
*
init_h
=
ctx
.
Input
<
Tensor
>
(
"InitH"
);
auto
*
init_c
=
ctx
.
Input
<
Tensor
>
(
"InitC"
);
// auto * last_h = ctx.Input<Tensor>("last_h");
// auto * last_c = ctx.Input<Tensor>("last_c");
auto
*
out
=
ctx
.
Input
<
Tensor
>
(
"Out"
);
auto
*
out_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
last_h_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"last_h"
));
auto
*
last_c_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"last_c"
));
// auto* init_h = ctx.Input<Tensor>("init_h");
// auto* init_c = ctx.Input<Tensor>("init_c");
auto
*
in_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Input"
));
auto
*
weight_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"W"
));
auto
*
init_h_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"InitH"
));
auto
*
init_c_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"InitC"
));
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
auto
*
cache_var
=
ctx
.
InputVar
(
"Cache"
);
PADDLE_ENFORCE
(
cache_var
->
IsInitialized
());
CudnnRNNCache
*
cudnn_rnn_cache
=
const_cast
<
framework
::
Variable
*>
(
cache_var
)
->
GetMutable
<
CudnnRNNCache
>
();
auto
input_dims
=
input
->
dims
();
auto
weight_dims
=
weight
->
dims
();
auto
init_h_dims
=
init_h
->
dims
();
auto
init_c_dims
=
init_c
->
dims
();
in_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
weight_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
math
::
SetConstant
<
paddle
::
platform
::
CUDADeviceContext
,
T
>
zero
;
zero
(
dev_ctx
,
in_grad
,
static_cast
<
T
>
(
0.0
));
zero
(
dev_ctx
,
weight_grad
,
static_cast
<
T
>
(
0.0
));
T
*
init_h_grad_data
=
NULL
;
if
(
init_h_grad
==
nullptr
)
{
Tensor
init_h_grad_temp
;
init_h_grad_temp
.
mutable_data
<
T
>
(
init_h_dims
,
ctx
.
GetPlace
());
zero
(
dev_ctx
,
&
init_h_grad_temp
,
static_cast
<
T
>
(
0.0
));
init_h_grad_data
=
init_h_grad_temp
.
data
<
T
>
();
}
else
{
init_h_grad
->
mutable_data
<
T
>
(
init_h_dims
,
ctx
.
GetPlace
());
zero
(
dev_ctx
,
init_h_grad
,
static_cast
<
T
>
(
0.0
));
init_h_grad_data
=
init_h_grad
->
data
<
T
>
();
}
T
*
init_c_grad_data
=
NULL
;
if
(
init_c_grad
==
nullptr
)
{
Tensor
init_c_grad_temp
;
init_c_grad_temp
.
mutable_data
<
T
>
(
init_c_dims
,
ctx
.
GetPlace
());
zero
(
dev_ctx
,
&
init_c_grad_temp
,
static_cast
<
T
>
(
0.0
));
init_c_grad_data
=
init_c_grad_temp
.
data
<
T
>
();
}
else
{
init_c_grad
->
mutable_data
<
T
>
(
init_c_dims
,
ctx
.
GetPlace
());
zero
(
dev_ctx
,
init_c_grad
,
static_cast
<
T
>
(
0.0
));
init_c_grad_data
=
init_c_grad
->
data
<
T
>
();
}
const
T
*
last_h_grad_data
=
NULL
;
if
(
last_h_grad
==
nullptr
)
{
Tensor
last_h_grad_temp
;
last_h_grad_temp
.
mutable_data
<
T
>
(
init_h_dims
,
ctx
.
GetPlace
());
zero
(
dev_ctx
,
&
last_h_grad_temp
,
static_cast
<
T
>
(
0.0
));
last_h_grad_data
=
(
const
T
*
)
last_h_grad_temp
.
data
<
T
>
();
}
else
{
last_h_grad_data
=
last_h_grad
->
data
<
T
>
();
}
const
T
*
last_c_grad_data
=
NULL
;
if
(
last_c_grad
==
nullptr
)
{
Tensor
last_c_grad_temp
;
last_c_grad_temp
.
mutable_data
<
T
>
(
init_c_dims
,
ctx
.
GetPlace
());
zero
(
dev_ctx
,
&
last_c_grad_temp
,
static_cast
<
T
>
(
0.0
));
last_c_grad_data
=
(
const
T
*
)
last_c_grad_temp
.
data
<
T
>
();
}
else
{
last_c_grad_data
=
last_c_grad
->
data
<
T
>
();
}
const
T
*
out_grad_data
=
NULL
;
if
(
out_grad
==
nullptr
)
{
Tensor
out_grad_temp
;
out_grad_temp
.
mutable_data
<
T
>
(
out
->
dims
(),
ctx
.
GetPlace
());
zero
(
dev_ctx
,
&
out_grad_temp
,
static_cast
<
T
>
(
0.0
));
out_grad_data
=
(
const
T
*
)
out_grad_temp
.
data
<
T
>
();
}
else
{
out_grad_data
=
out_grad
->
data
<
T
>
();
}
// zero( dev_ctx, last_h_grad, static_cast<T>(0.0));
// zero( dev_ctx, last_c_grad, static_cast<T>(0.0));
auto
out_data
=
out
->
data
<
T
>
();
// auto out_grad_data = out_grad->data<T>();
auto
weight_data
=
weight
->
data
<
T
>
();
auto
init_h_data
=
init_h
->
data
<
T
>
();
auto
init_c_data
=
init_c
->
data
<
T
>
();
auto
in_grad_data
=
in_grad
->
data
<
T
>
();
auto
work_data
=
cudnn_rnn_cache
->
workspace_data_
.
data
<
uint8_t
>
();
auto
reserve_data
=
cudnn_rnn_cache
->
reserve_data_
.
data
<
uint8_t
>
();
auto
run_seq_len
=
input_dims
[
0
];
PADDLE_ENFORCE_LE
((
size_t
)
run_seq_len
,
cudnn_rnn_cache
->
max_length_
,
"cudnn running seq_len CAN not greater max_lengh"
);
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnRNNBackwardData
(
handle
,
cudnn_rnn_cache
->
rnn_desc_
,
run_seq_len
,
cudnn_rnn_cache
->
y_desc_
,
out_data
,
cudnn_rnn_cache
->
dy_desc_
,
out_grad_data
,
cudnn_rnn_cache
->
dhy_desc_
,
last_h_grad_data
,
cudnn_rnn_cache
->
dcy_desc_
,
last_c_grad_data
,
cudnn_rnn_cache
->
w_desc_
,
weight_data
,
cudnn_rnn_cache
->
hx_desc_
,
init_h_data
,
cudnn_rnn_cache
->
cx_desc_
,
init_c_data
,
cudnn_rnn_cache
->
dx_desc_
,
in_grad_data
,
cudnn_rnn_cache
->
dhx_desc_
,
init_h_grad_data
,
cudnn_rnn_cache
->
dcx_desc_
,
init_c_grad_data
,
work_data
,
cudnn_rnn_cache
->
workspace_size_
,
reserve_data
,
cudnn_rnn_cache
->
reserve_size_
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnRNNBackwardWeights
(
handle
,
cudnn_rnn_cache
->
rnn_desc_
,
run_seq_len
,
cudnn_rnn_cache
->
x_desc_
,
input
->
data
<
T
>
(),
cudnn_rnn_cache
->
hx_desc_
,
init_h
->
data
<
T
>
(),
cudnn_rnn_cache
->
y_desc_
,
out
->
data
<
T
>
(),
cudnn_rnn_cache
->
workspace_data_
.
data
<
uint8_t
>
(),
cudnn_rnn_cache
->
workspace_size_
,
cudnn_rnn_cache
->
dw_desc_
,
weight_grad
->
data
<
T
>
(),
cudnn_rnn_cache
->
reserve_data_
.
data
<
uint8_t
>
(),
cudnn_rnn_cache
->
reserve_size_
));
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
cudnn_lstm
,
ops
::
CudnnLSTMGPUKernel
<
float
>
);
REGISTER_OP_CUDA_KERNEL
(
cudnn_lstm_grad
,
ops
::
CudnnLSTMGPUGradKernel
<
float
>
);
paddle/fluid/operators/detection/box_coder_op.h
浏览文件 @
64ad051b
...
@@ -43,6 +43,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
...
@@ -43,6 +43,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
const
T
*
prior_box_var_data
=
nullptr
;
const
T
*
prior_box_var_data
=
nullptr
;
if
(
prior_box_var
)
prior_box_var_data
=
prior_box_var
->
data
<
T
>
();
if
(
prior_box_var
)
prior_box_var_data
=
prior_box_var
->
data
<
T
>
();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for
(
int64_t
i
=
0
;
i
<
row
;
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
row
;
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
col
;
++
j
)
{
for
(
int64_t
j
=
0
;
j
<
col
;
++
j
)
{
T
prior_box_width
=
prior_box_data
[
j
*
len
+
2
]
-
T
prior_box_width
=
prior_box_data
[
j
*
len
+
2
]
-
...
@@ -96,6 +99,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
...
@@ -96,6 +99,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
const
T
*
prior_box_var_data
=
nullptr
;
const
T
*
prior_box_var_data
=
nullptr
;
if
(
prior_box_var
)
prior_box_var_data
=
prior_box_var
->
data
<
T
>
();
if
(
prior_box_var
)
prior_box_var_data
=
prior_box_var
->
data
<
T
>
();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for
(
int64_t
i
=
0
;
i
<
row
;
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
row
;
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
col
;
++
j
)
{
for
(
int64_t
j
=
0
;
j
<
col
;
++
j
)
{
size_t
offset
=
i
*
col
*
len
+
j
*
len
;
size_t
offset
=
i
*
col
*
len
+
j
*
len
;
...
...
paddle/fluid/operators/distributed/CMakeLists.txt
浏览文件 @
64ad051b
...
@@ -9,36 +9,37 @@ else()
...
@@ -9,36 +9,37 @@ else()
endif
()
endif
()
configure_file
(
send_recv.proto.in
${
CMAKE_CURRENT_SOURCE_DIR
}
/send_recv.proto @ONLY
)
configure_file
(
send_recv.proto.in
${
CMAKE_CURRENT_SOURCE_DIR
}
/send_recv.proto @ONLY
)
set
(
DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
if
(
WITH_GRPC
)
if
(
WITH_GRPC
)
grpc_library
(
sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
grpc_library
(
sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc
PROTO send_recv.proto
PROTO send_recv.proto
DEPS lod_tensor selected_rows memory
)
DEPS lod_tensor selected_rows memory
)
set
(
DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
set_source_files_properties
(
grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
set_source_files_properties
(
grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
cc_test
(
grpc_serde_test SRCS grpc_serde_test.cc
cc_test
(
grpc_serde_test SRCS grpc_serde_test.cc
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL
)
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL
)
cc_test
(
rpc_server_test SRCS rpc_server_test.cc
cc_test
(
rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL
)
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL
)
cc_test
(
varhandle_test SRCS varhandle_test.cc DEPS profiler
)
cc_test
(
varhandle_test SRCS varhandle_test.cc DEPS profiler
)
return
()
cc_library
(
parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory
)
endif
()
else
()
set_source_files_properties
(
brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
set
(
DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
set_source_files_properties
(
brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
brpc_library
(
sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
PROTO send_recv.proto
DEPS lod_tensor selected_rows memory
)
brpc_library
(
sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
cc_library
(
parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc memory
)
brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
PROTO send_recv.proto
DEPS lod_tensor selected_rows memory
)
set
(
brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy
)
set
(
brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy
)
cc_test
(
brpc_server_test SRCS rpc_server_test.cc
cc_test
(
brpc_server_test SRCS rpc_server_test.cc
DEPS
${
brpc_test_depends
}
SERIAL
)
DEPS
${
brpc_test_depends
}
SERIAL
)
cc_test
(
brpc_serde_test SRCS brpc_serde_test.cc
cc_test
(
brpc_serde_test SRCS brpc_serde_test.cc
DEPS
${
brpc_test_depends
}
SERIAL
)
DEPS
${
brpc_test_depends
}
SERIAL
)
endif
()
paddle/fluid/operators/distributed/grpc_client.cc
浏览文件 @
64ad051b
...
@@ -171,11 +171,13 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
...
@@ -171,11 +171,13 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
const
framework
::
Scope
&
scope
,
const
framework
::
Scope
&
scope
,
const
std
::
string
&
in_var_name
,
const
std
::
string
&
in_var_name
,
const
std
::
string
&
out_var_name
,
const
std
::
string
&
out_var_name
,
const
std
::
string
&
table_name
,
int64_t
time_out
)
{
int64_t
time_out
)
{
const
platform
::
DeviceContext
*
p_ctx
=
&
ctx
;
const
platform
::
DeviceContext
*
p_ctx
=
&
ctx
;
const
std
::
string
ep_val
=
ep
;
const
std
::
string
ep_val
=
ep
;
const
std
::
string
in_var_name_val
=
in_var_name
;
const
std
::
string
in_var_name_val
=
in_var_name
;
const
std
::
string
out_var_name_val
=
out_var_name
;
const
std
::
string
out_var_name_val
=
out_var_name
;
const
std
::
string
table_name_val
=
table_name
;
const
framework
::
Scope
*
p_scope
=
&
scope
;
const
framework
::
Scope
*
p_scope
=
&
scope
;
const
auto
ch
=
GetChannel
(
ep_val
);
const
auto
ch
=
GetChannel
(
ep_val
);
GetProcessor
*
s
=
new
GetProcessor
(
ch
);
GetProcessor
*
s
=
new
GetProcessor
(
ch
);
...
@@ -186,11 +188,12 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
...
@@ -186,11 +188,12 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
s
->
Prepare
(
h
,
time_out
);
s
->
Prepare
(
h
,
time_out
);
framework
::
AsyncIO
([
in_var_name_val
,
out_var_name_val
,
ep_val
,
p_scope
,
p_ctx
,
framework
::
AsyncIO
([
in_var_name_val
,
out_var_name_val
,
ep_val
,
p_scope
,
p_ctx
,
s
,
method
,
h
,
this
]
{
s
,
method
,
h
,
t
able_name_val
,
t
his
]
{
auto
*
var
=
p_scope
->
FindVar
(
in_var_name_val
);
auto
*
var
=
p_scope
->
FindVar
(
in_var_name_val
);
::
grpc
::
ByteBuffer
req
;
::
grpc
::
ByteBuffer
req
;
SerializeToByteBuffer
(
in_var_name_val
,
var
,
*
p_ctx
,
&
req
,
out_var_name_val
);
SerializeToByteBuffer
(
in_var_name_val
,
var
,
*
p_ctx
,
&
req
,
out_var_name_val
,
0
,
table_name_val
);
VLOG
(
3
)
<<
s
->
GetVarHandlePtr
()
->
String
()
<<
" begin"
;
VLOG
(
3
)
<<
s
->
GetVarHandlePtr
()
->
String
()
<<
" begin"
;
...
...
paddle/fluid/operators/distributed/grpc_client.h
浏览文件 @
64ad051b
...
@@ -194,6 +194,7 @@ class GRPCClient : public RPCClient {
...
@@ -194,6 +194,7 @@ class GRPCClient : public RPCClient {
const
framework
::
Scope
&
scope
,
const
framework
::
Scope
&
scope
,
const
std
::
string
&
in_var_name
,
const
std
::
string
&
in_var_name
,
const
std
::
string
&
out_var_name
,
const
std
::
string
&
out_var_name
,
const
std
::
string
&
table_name
=
""
,
int64_t
time_out
=
FLAGS_rpc_deadline
)
override
;
int64_t
time_out
=
FLAGS_rpc_deadline
)
override
;
VarHandlePtr
AsyncSendBatchBarrier
(
VarHandlePtr
AsyncSendBatchBarrier
(
...
...
paddle/fluid/operators/distributed/grpc_serde.cc
浏览文件 @
64ad051b
...
@@ -42,7 +42,8 @@ static void SerializeDestroyCallback(void* payload) {
...
@@ -42,7 +42,8 @@ static void SerializeDestroyCallback(void* payload) {
void
SerializeToByteBuffer
(
const
std
::
string
&
name
,
framework
::
Variable
*
var
,
void
SerializeToByteBuffer
(
const
std
::
string
&
name
,
framework
::
Variable
*
var
,
const
platform
::
DeviceContext
&
ctx
,
const
platform
::
DeviceContext
&
ctx
,
::
grpc
::
ByteBuffer
*
msg
,
const
std
::
string
&
out_name
,
::
grpc
::
ByteBuffer
*
msg
,
const
std
::
string
&
out_name
,
const
int
trainer_id
)
{
const
int
trainer_id
,
const
std
::
string
&
table_name
)
{
platform
::
RecordRPCEvent
record_event
(
"serial"
,
&
ctx
);
platform
::
RecordRPCEvent
record_event
(
"serial"
,
&
ctx
);
VarMsg
request
;
VarMsg
request
;
TensorPayload
*
payload
=
nullptr
;
TensorPayload
*
payload
=
nullptr
;
...
@@ -63,6 +64,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
...
@@ -63,6 +64,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
if
(
!
out_name
.
empty
())
{
if
(
!
out_name
.
empty
())
{
request
.
set_out_varname
(
out_name
);
request
.
set_out_varname
(
out_name
);
}
}
if
(
!
table_name
.
empty
())
{
request
.
set_table_name
(
table_name
);
}
if
(
var
->
IsType
<
framework
::
LoDTensor
>
())
{
if
(
var
->
IsType
<
framework
::
LoDTensor
>
())
{
request
.
set_type
(
::
sendrecv
::
LOD_TENSOR
);
request
.
set_type
(
::
sendrecv
::
LOD_TENSOR
);
payload
=
new
TensorPayload
(
GetTensorPayload
(
var
,
ctx
,
&
request
));
payload
=
new
TensorPayload
(
GetTensorPayload
(
var
,
ctx
,
&
request
));
...
...
paddle/fluid/operators/distributed/grpc_serde.h
浏览文件 @
64ad051b
...
@@ -40,7 +40,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
...
@@ -40,7 +40,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const
platform
::
DeviceContext
&
ctx
,
const
platform
::
DeviceContext
&
ctx
,
::
grpc
::
ByteBuffer
*
msg
,
::
grpc
::
ByteBuffer
*
msg
,
const
std
::
string
&
out_varname
=
std
::
string
(),
const
std
::
string
&
out_varname
=
std
::
string
(),
const
int
trainer_id
=
0
);
const
int
trainer_id
=
0
,
const
std
::
string
&
table_name
=
std
::
string
());
void
DeserializeFromByteBuffer
(
const
::
grpc
::
ByteBuffer
&
msg
,
void
DeserializeFromByteBuffer
(
const
::
grpc
::
ByteBuffer
&
msg
,
const
platform
::
DeviceContext
&
ctx
,
const
platform
::
DeviceContext
&
ctx
,
...
...
paddle/fluid/operators/distributed/grpc_serde_test.cc
浏览文件 @
64ad051b
...
@@ -130,7 +130,8 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
...
@@ -130,7 +130,8 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
math
::
set_constant
(
ctx
,
tensor
,
31.9
);
math
::
set_constant
(
ctx
,
tensor
,
31.9
);
::
grpc
::
ByteBuffer
msg
;
::
grpc
::
ByteBuffer
msg
;
operators
::
distributed
::
SerializeToByteBuffer
(
"myvar"
,
&
var
,
ctx
,
&
msg
);
operators
::
distributed
::
SerializeToByteBuffer
(
"myvar"
,
&
var
,
ctx
,
&
msg
,
"outvar"
,
0
,
"table_name"
);
EXPECT_GT
(
msg
.
Length
(),
static_cast
<
size_t
>
(
0
));
EXPECT_GT
(
msg
.
Length
(),
static_cast
<
size_t
>
(
0
));
// deserialize
// deserialize
...
...
paddle/fluid/operators/distributed/grpc_server.cc
浏览文件 @
64ad051b
...
@@ -183,6 +183,7 @@ class RequestPrefetch final : public RequestBase {
...
@@ -183,6 +183,7 @@ class RequestPrefetch final : public RequestBase {
// prefetch process...
// prefetch process...
std
::
string
in_var_name
=
request_
->
Varname
();
std
::
string
in_var_name
=
request_
->
Varname
();
std
::
string
out_var_name
=
request_
->
OutVarname
();
std
::
string
out_var_name
=
request_
->
OutVarname
();
std
::
string
table_name
=
request_
->
TableName
();
int
trainer_id
=
request_
->
GetTrainerId
();
int
trainer_id
=
request_
->
GetTrainerId
();
VLOG
(
4
)
<<
"RequestPrefetch, in_var_name: "
<<
in_var_name
VLOG
(
4
)
<<
"RequestPrefetch, in_var_name: "
<<
in_var_name
<<
" out_var_name: "
<<
out_var_name
;
<<
" out_var_name: "
<<
out_var_name
;
...
@@ -193,7 +194,7 @@ class RequestPrefetch final : public RequestBase {
...
@@ -193,7 +194,7 @@ class RequestPrefetch final : public RequestBase {
framework
::
Variable
*
outvar
=
scope
->
Var
(
out_var_name
);
framework
::
Variable
*
outvar
=
scope
->
Var
(
out_var_name
);
request_handler_
->
Handle
(
in_var_name
,
scope
,
invar
,
&
outvar
,
trainer_id
,
request_handler_
->
Handle
(
in_var_name
,
scope
,
invar
,
&
outvar
,
trainer_id
,
out_var_name
);
out_var_name
,
table_name
);
SerializeToByteBuffer
(
out_var_name
,
outvar
,
*
request_handler_
->
dev_ctx
(),
SerializeToByteBuffer
(
out_var_name
,
outvar
,
*
request_handler_
->
dev_ctx
(),
&
reply_
);
&
reply_
);
...
...
paddle/fluid/operators/distributed/grpc_variable_response.cc
浏览文件 @
64ad051b
...
@@ -301,6 +301,20 @@ int GRPCVariableResponse::Parse(Source* source) {
...
@@ -301,6 +301,20 @@ int GRPCVariableResponse::Parse(Source* source) {
meta_
.
set_trainer_id
(
trainer_id
);
meta_
.
set_trainer_id
(
trainer_id
);
break
;
break
;
}
}
case
sendrecv
::
VariableMessage
::
kTableNameFieldNumber
:
{
uint32_t
length
;
if
((
wt
!=
WIRETYPE_LENGTH_DELIMITED
)
||
!
input
.
ReadVarint32
(
&
length
))
{
return
tag
;
}
std
::
string
temp
;
if
(
!
input
.
ReadString
(
&
temp
,
length
))
{
return
tag
;
}
meta_
.
set_table_name
(
temp
);
break
;
}
default:
{
default:
{
// Unknown tag, return unknown error.
// Unknown tag, return unknown error.
return
-
1
;
return
-
1
;
...
...
paddle/fluid/operators/distributed/parameter_prefetch.cc
0 → 100644
浏览文件 @
64ad051b
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
namespace
paddle
{
namespace
operators
{
namespace
distributed
{
using
Tensor
=
framework
::
Tensor
;
using
LoDTensor
=
framework
::
LoDTensor
;
using
SelectedRows
=
framework
::
SelectedRows
;
using
DDim
=
framework
::
DDim
;
static
size_t
GetSectionIndex
(
int64_t
id
,
const
std
::
vector
<
int64_t
>&
abs_sections
)
{
for
(
size_t
i
=
1
;
i
<
abs_sections
.
size
();
++
i
)
{
if
(
id
<
abs_sections
[
i
])
{
return
i
-
1
;
}
}
return
abs_sections
.
size
()
-
1
;
}
static
std
::
vector
<
int64_t
>
ToAbsoluteSection
(
const
std
::
vector
<
int
>&
height_sections
)
{
std
::
vector
<
int64_t
>
abs_sections
;
abs_sections
.
resize
(
height_sections
.
size
());
abs_sections
[
0
]
=
0
;
for
(
size_t
i
=
1
;
i
<
height_sections
.
size
();
++
i
)
{
abs_sections
[
i
]
=
height_sections
[
i
-
1
]
+
abs_sections
[
i
-
1
];
}
return
abs_sections
;
}
static
std
::
vector
<
std
::
vector
<
int64_t
>>
SplitIds
(
const
std
::
vector
<
int64_t
>&
ids_vector
,
const
std
::
vector
<
int
>&
height_section
,
framework
::
Scope
*
scope
)
{
std
::
set
<
int64_t
>
all_ids
;
for
(
auto
id
:
ids_vector
)
{
all_ids
.
insert
(
id
);
}
auto
abs_sections
=
ToAbsoluteSection
(
height_section
);
std
::
vector
<
std
::
vector
<
int64_t
>>
splited_ids
;
splited_ids
.
resize
(
height_section
.
size
()
+
1
);
for
(
auto
&
id
:
all_ids
)
{
auto
section_index
=
GetSectionIndex
(
id
,
abs_sections
);
splited_ids
[
section_index
].
push_back
(
id
-
abs_sections
[
section_index
]);
}
return
splited_ids
;
}
static
void
SplitIdsIntoMultipleVarsBySection
(
const
std
::
vector
<
std
::
string
>&
in_var_names
,
const
std
::
vector
<
int
>&
height_section
,
const
std
::
vector
<
std
::
vector
<
int64_t
>>&
splited_ids
,
framework
::
Scope
*
scope
)
{
PADDLE_ENFORCE_EQ
(
in_var_names
.
size
(),
height_section
.
size
(),
""
);
auto
place
=
platform
::
CPUPlace
();
for
(
size_t
i
=
0
;
i
<
in_var_names
.
size
();
++
i
)
{
auto
*
id_tensor
=
scope
->
Var
(
in_var_names
[
i
])
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
&
ids
=
splited_ids
[
i
];
if
(
!
ids
.
empty
())
{
auto
*
id_tensor_data
=
id_tensor
->
mutable_data
<
int64_t
>
(
framework
::
make_ddim
({
static_cast
<
int64_t
>
(
ids
.
size
()),
1
}),
place
);
memcpy
(
id_tensor_data
,
ids
.
data
(),
sizeof
(
int64_t
)
*
ids
.
size
());
}
}
}
static
void
MergeMultipleVarsIntoOneBySection
(
const
std
::
string
&
id_name
,
const
std
::
vector
<
int64_t
>&
ids_vector
,
const
std
::
string
&
out_name
,
const
std
::
vector
<
std
::
string
>&
out_var_names
,
const
std
::
vector
<
int
>&
height_section
,
const
std
::
vector
<
std
::
vector
<
int64_t
>>&
splited_ids
,
const
framework
::
ExecutionContext
&
context
,
framework
::
Scope
*
scope
,
platform
::
DeviceContext
*
actual_ctx
)
{
PADDLE_ENFORCE_EQ
(
out_var_names
.
size
(),
height_section
.
size
(),
""
);
auto
cpu_place
=
platform
::
CPUPlace
();
auto
abs_sections
=
ToAbsoluteSection
(
height_section
);
std
::
unordered_map
<
int64_t
,
std
::
vector
<
size_t
>>
id_to_offset
;
for
(
size_t
i
=
0
;
i
<
ids_vector
.
size
();
++
i
)
{
id_to_offset
[
ids_vector
[
i
]].
push_back
(
i
);
}
auto
&
id_tensor
=
scope
->
FindVar
(
id_name
)
->
Get
<
framework
::
LoDTensor
>
();
auto
*
out_tensor
=
scope
->
FindVar
(
out_name
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
out_tensor_data
=
out_tensor
->
mutable_data
<
float
>
(
id_tensor
.
place
());
bool
is_on_cpu_place
=
true
;
if
(
!
platform
::
is_cpu_place
(
id_tensor
.
place
()))
{
is_on_cpu_place
=
false
;
}
for
(
size_t
section_idx
=
0
;
section_idx
<
out_var_names
.
size
();
++
section_idx
)
{
auto
&
ids_in_this_section
=
splited_ids
[
section_idx
];
if
(
!
ids_in_this_section
.
empty
())
{
auto
&
prefetch_out_var
=
scope
->
Var
(
out_var_names
[
section_idx
])
->
Get
<
framework
::
LoDTensor
>
();
const
auto
*
out_var_data
=
prefetch_out_var
.
data
<
float
>
();
auto
&
dims
=
prefetch_out_var
.
dims
();
PADDLE_ENFORCE_EQ
(
dims
.
size
(),
2
,
""
);
PADDLE_ENFORCE_EQ
(
ids_in_this_section
.
size
(),
dims
[
0
]);
auto
row_numel
=
dims
[
1
];
for
(
size_t
i
=
0
;
i
<
dims
[
0
];
++
i
)
{
auto
id
=
ids_in_this_section
[
i
];
auto
origin_id
=
id
+
abs_sections
[
section_idx
];
auto
&
offsets
=
id_to_offset
[
origin_id
];
for
(
auto
&
offset
:
offsets
)
{
// should support GPU tensor
if
(
is_on_cpu_place
)
{
memory
::
Copy
(
cpu_place
,
out_tensor_data
+
offset
*
row_numel
,
cpu_place
,
out_var_data
+
i
*
row_numel
,
sizeof
(
float
)
*
row_numel
);
}
else
{
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW
(
"paddle is not compiled with CUDA!"
);
#else
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
actual_ctx
)
->
stream
();
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
id_tensor
.
place
()),
out_tensor_data
+
offset
*
row_numel
,
cpu_place
,
out_var_data
+
i
*
row_numel
,
sizeof
(
float
)
*
row_numel
,
stream
);
#endif
}
}
}
}
else
{
VLOG
(
3
)
<<
"ids in this section is empty"
;
}
}
}
void
prefetch
(
const
std
::
string
&
id_name
,
const
std
::
string
&
out_name
,
const
std
::
vector
<
std
::
string
>&
table_names
,
const
std
::
vector
<
std
::
string
>&
epmap
,
const
std
::
vector
<
int
>&
height_sections
,
const
framework
::
ExecutionContext
&
context
)
{
auto
&
local_scope
=
context
.
scope
().
NewScope
();
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
cpu_ctx
=
*
pool
.
Get
(
platform
::
CPUPlace
());
auto
&
actual_ctx
=
*
pool
.
Get
(
context
.
GetPlace
());
distributed
::
RPCClient
*
rpc_client
=
distributed
::
RPCClient
::
GetInstance
<
RPCCLIENT_T
>
(
context
.
Attr
<
int
>
(
"trainer_id"
));
std
::
vector
<
std
::
string
>
in_var_names
;
std
::
vector
<
std
::
string
>
out_var_names
;
for
(
size_t
i
=
0
;
i
<
epmap
.
size
();
++
i
)
{
in_var_names
.
push_back
(
id_name
+
"@"
+
epmap
[
i
]);
out_var_names
.
push_back
(
out_name
+
"@"
+
epmap
[
i
]);
}
auto
&
id_tensor
=
local_scope
.
FindVar
(
id_name
)
->
Get
<
framework
::
LoDTensor
>
();
std
::
vector
<
int64_t
>
ids_vector
;
if
(
platform
::
is_cpu_place
(
id_tensor
.
place
()))
{
auto
*
id_data
=
id_tensor
.
data
<
int64_t
>
();
for
(
size_t
i
=
0
;
i
<
id_tensor
.
numel
();
++
i
)
{
ids_vector
.
push_back
(
id_data
[
i
]);
}
}
else
{
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW
(
"paddle is not compiled with CUDA!"
);
#else
auto
cpu_place
=
platform
::
CPUPlace
();
framework
::
Tensor
cpu_tensor
;
auto
*
cpu_tensor_data
=
cpu_tensor
.
mutable_data
<
int64_t
>
(
id_tensor
.
dims
(),
cpu_place
);
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
&
actual_ctx
)
->
stream
();
memory
::
Copy
(
cpu_place
,
cpu_tensor_data
,
boost
::
get
<
platform
::
CUDAPlace
>
(
id_tensor
.
place
()),
id_tensor
.
data
<
int64_t
>
(),
sizeof
(
int64_t
)
*
id_tensor
.
numel
(),
stream
);
for
(
size_t
i
=
0
;
i
<
cpu_tensor
.
numel
();
++
i
)
{
ids_vector
.
push_back
(
cpu_tensor_data
[
i
]);
}
#endif
}
auto
splited_ids
=
SplitIds
(
ids_vector
,
height_sections
,
&
local_scope
);
SplitIdsIntoMultipleVarsBySection
(
in_var_names
,
height_sections
,
splited_ids
,
&
local_scope
);
// create output var in local scope
for
(
auto
&
name
:
out_var_names
)
{
local_scope
.
Var
(
name
)
->
GetMutable
<
framework
::
LoDTensor
>
();
}
std
::
vector
<
distributed
::
VarHandlePtr
>
rets
;
for
(
size_t
i
=
0
;
i
<
in_var_names
.
size
();
i
++
)
{
if
(
NeedSend
(
local_scope
,
in_var_names
[
i
]))
{
VLOG
(
3
)
<<
"sending "
<<
in_var_names
[
i
]
<<
" to "
<<
epmap
[
i
]
<<
" to get "
<<
out_var_names
[
i
]
<<
" back"
;
rets
.
push_back
(
rpc_client
->
AsyncPrefetchVar
(
epmap
[
i
],
cpu_ctx
,
local_scope
,
in_var_names
[
i
],
out_var_names
[
i
],
table_names
[
i
]));
}
else
{
VLOG
(
3
)
<<
"don't send no-initialied variable: "
<<
out_var_names
[
i
];
}
}
for
(
size_t
i
=
0
;
i
<
rets
.
size
();
i
++
)
{
PADDLE_ENFORCE
(
rets
[
i
]
->
Wait
(),
"internal error in RPCClient"
);
}
MergeMultipleVarsIntoOneBySection
(
id_name
,
ids_vector
,
out_name
,
out_var_names
,
height_sections
,
splited_ids
,
context
,
&
local_scope
,
&
actual_ctx
);
context
.
scope
().
DeleteScope
(
&
local_scope
);
}
};
// namespace distributed
};
// namespace operators
};
// namespace paddle
paddle/fluid/operators/distributed/parameter_prefetch.h
0 → 100644
浏览文件 @
64ad051b
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h"
namespace
paddle
{
namespace
operators
{
namespace
distributed
{
void
prefetch
(
const
std
::
string
&
id_name
,
const
std
::
string
&
out_name
,
const
std
::
vector
<
std
::
string
>&
table_names
,
const
std
::
vector
<
std
::
string
>&
epmap
,
const
std
::
vector
<
int
>&
height_sections
,
const
framework
::
ExecutionContext
&
context
);
};
// namespace distributed
};
// namespace operators
};
// namespace paddle
paddle/fluid/operators/distributed/request_handler.h
浏览文件 @
64ad051b
...
@@ -191,7 +191,8 @@ class RequestHandler {
...
@@ -191,7 +191,8 @@ class RequestHandler {
virtual
bool
Handle
(
const
std
::
string
&
varname
,
framework
::
Scope
*
scope
,
virtual
bool
Handle
(
const
std
::
string
&
varname
,
framework
::
Scope
*
scope
,
framework
::
Variable
*
var
,
framework
::
Variable
**
outvar
,
framework
::
Variable
*
var
,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
=
""
)
=
0
;
const
std
::
string
&
out_var_name
=
""
,
const
std
::
string
&
table_name
=
""
)
=
0
;
protected:
protected:
const
bool
sync_mode_
;
const
bool
sync_mode_
;
...
...
paddle/fluid/operators/distributed/request_handler_impl.cc
浏览文件 @
64ad051b
...
@@ -12,6 +12,7 @@
...
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include <iostream>
#include <iostream>
#include <string>
#include <string>
#include <vector>
#include <vector>
...
@@ -20,7 +21,7 @@
...
@@ -20,7 +21,7 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/
operators/distributed/request_handler_impl
.h"
#include "paddle/fluid/
framework/variable_helper
.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/printf.h"
...
@@ -37,7 +38,8 @@ bool RequestSendHandler::Handle(const std::string& varname,
...
@@ -37,7 +38,8 @@ bool RequestSendHandler::Handle(const std::string& varname,
framework
::
Variable
*
invar
,
framework
::
Variable
*
invar
,
framework
::
Variable
**
outvar
,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
)
{
const
std
::
string
&
out_var_name
,
const
std
::
string
&
table_name
)
{
VLOG
(
4
)
<<
"RequestSendHandler:"
<<
varname
;
VLOG
(
4
)
<<
"RequestSendHandler:"
<<
varname
;
// Sync
// Sync
...
@@ -77,8 +79,10 @@ bool RequestGetHandler::Handle(const std::string& varname,
...
@@ -77,8 +79,10 @@ bool RequestGetHandler::Handle(const std::string& varname,
framework
::
Variable
*
invar
,
framework
::
Variable
*
invar
,
framework
::
Variable
**
outvar
,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
)
{
const
std
::
string
&
out_var_name
,
const
std
::
string
&
table_name
)
{
VLOG
(
4
)
<<
"RequestGetHandler:"
<<
varname
;
VLOG
(
4
)
<<
"RequestGetHandler:"
<<
varname
;
if
(
sync_mode_
)
{
if
(
sync_mode_
)
{
if
(
varname
==
FETCH_BARRIER_MESSAGE
)
{
if
(
varname
==
FETCH_BARRIER_MESSAGE
)
{
VLOG
(
3
)
<<
"sync: recv fetch barrier message"
;
VLOG
(
3
)
<<
"sync: recv fetch barrier message"
;
...
@@ -113,14 +117,22 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
...
@@ -113,14 +117,22 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
framework
::
Variable
*
invar
,
framework
::
Variable
*
invar
,
framework
::
Variable
**
outvar
,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
)
{
const
std
::
string
&
out_var_name
,
const
std
::
string
&
table_name
)
{
VLOG
(
4
)
<<
"RequestPrefetchHandler "
<<
varname
;
VLOG
(
4
)
<<
"RequestPrefetchHandler "
<<
varname
;
auto
var_desc
=
program_
->
Block
(
0
).
FindVar
(
out_var_name
);
if
(
table_name
.
empty
())
{
InitializeVariable
(
*
outvar
,
var_desc
->
GetType
());
auto
var_desc
=
program_
->
Block
(
0
).
FindVar
(
out_var_name
);
executor_
->
RunPreparedContext
(
InitializeVariable
(
*
outvar
,
var_desc
->
GetType
());
(
*
prefetch_var_name_to_prepared_ctx_
)[
varname
].
get
(),
scope
);
executor_
->
RunPreparedContext
(
(
*
prefetch_var_name_to_prepared_ctx_
)[
varname
].
get
(),
scope
);
}
else
{
(
*
outvar
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
lookup_table_op
=
BuildLookupTableOp
(
table_name
,
varname
,
out_var_name
);
paddle
::
platform
::
CPUPlace
cpu_place
;
lookup_table_op
->
Run
(
*
scope
,
cpu_place
);
}
return
true
;
return
true
;
}
}
...
@@ -129,7 +141,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
...
@@ -129,7 +141,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
framework
::
Variable
*
invar
,
framework
::
Variable
*
invar
,
framework
::
Variable
**
outvar
,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
)
{
const
std
::
string
&
out_var_name
,
const
std
::
string
&
table_name
)
{
PADDLE_ENFORCE
(
PADDLE_ENFORCE
(
checkpoint_notify_id
!=
-
1
,
checkpoint_notify_id
!=
-
1
,
"when checkpoint_notify_id = -1, there should be no RPC invoke."
);
"when checkpoint_notify_id = -1, there should be no RPC invoke."
);
...
...
paddle/fluid/operators/distributed/request_handler_impl.h
浏览文件 @
64ad051b
...
@@ -24,6 +24,7 @@
...
@@ -24,6 +24,7 @@
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/selected_rows.h"
...
@@ -43,8 +44,8 @@ class RequestSendHandler final : public RequestHandler {
...
@@ -43,8 +44,8 @@ class RequestSendHandler final : public RequestHandler {
virtual
~
RequestSendHandler
()
{}
virtual
~
RequestSendHandler
()
{}
bool
Handle
(
const
std
::
string
&
varname
,
framework
::
Scope
*
scope
,
bool
Handle
(
const
std
::
string
&
varname
,
framework
::
Scope
*
scope
,
framework
::
Variable
*
var
,
framework
::
Variable
**
outvar
,
framework
::
Variable
*
var
,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
=
""
,
const
std
::
string
&
out_var
_name
=
""
)
override
;
const
std
::
string
&
table
_name
=
""
)
override
;
private:
private:
bool
enable_dc_asgd_
;
bool
enable_dc_asgd_
;
...
@@ -59,21 +60,44 @@ class RequestGetHandler final : public RequestHandler {
...
@@ -59,21 +60,44 @@ class RequestGetHandler final : public RequestHandler {
virtual
~
RequestGetHandler
()
{}
virtual
~
RequestGetHandler
()
{}
bool
Handle
(
const
std
::
string
&
varname
,
framework
::
Scope
*
scope
,
bool
Handle
(
const
std
::
string
&
varname
,
framework
::
Scope
*
scope
,
framework
::
Variable
*
var
,
framework
::
Variable
**
outvar
,
framework
::
Variable
*
var
,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
=
""
,
const
std
::
string
&
out_var
_name
=
""
)
override
;
const
std
::
string
&
table
_name
=
""
)
override
;
private:
private:
bool
enable_dc_asgd_
;
bool
enable_dc_asgd_
;
};
};
static
inline
void
BuildVar
(
const
std
::
string
&
param_name
,
std
::
initializer_list
<
const
char
*>
arguments
,
paddle
::
framework
::
proto
::
OpDesc
::
Var
*
var
)
{
var
->
set_parameter
(
param_name
);
for
(
auto
&
arg_name
:
arguments
)
{
*
var
->
mutable_arguments
()
->
Add
()
=
arg_name
;
}
}
class
RequestPrefetchHandler
final
:
public
RequestHandler
{
class
RequestPrefetchHandler
final
:
public
RequestHandler
{
public:
public:
explicit
RequestPrefetchHandler
(
bool
sync_mode
)
:
RequestHandler
(
sync_mode
)
{}
explicit
RequestPrefetchHandler
(
bool
sync_mode
)
:
RequestHandler
(
sync_mode
)
{}
virtual
~
RequestPrefetchHandler
()
{}
virtual
~
RequestPrefetchHandler
()
{}
bool
Handle
(
const
std
::
string
&
varname
,
framework
::
Scope
*
scope
,
bool
Handle
(
const
std
::
string
&
varname
,
framework
::
Scope
*
scope
,
framework
::
Variable
*
var
,
framework
::
Variable
**
outvar
,
framework
::
Variable
*
var
,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
=
""
,
const
std
::
string
&
out_var_name
=
""
)
override
;
const
std
::
string
&
table_name
=
""
)
override
;
private:
std
::
unique_ptr
<
paddle
::
framework
::
OperatorBase
>
BuildLookupTableOp
(
const
std
::
string
&
table_name
,
const
std
::
string
&
id_name
,
const
std
::
string
&
out_name
)
{
paddle
::
framework
::
proto
::
OpDesc
op_desc
;
op_desc
.
set_type
(
"lookup_table"
);
BuildVar
(
"W"
,
{
table_name
.
data
()},
op_desc
.
add_inputs
());
BuildVar
(
"Ids"
,
{
id_name
.
data
()},
op_desc
.
add_inputs
());
BuildVar
(
"Out"
,
{
out_name
.
data
()},
op_desc
.
add_outputs
());
auto
op
=
paddle
::
framework
::
OpRegistry
::
CreateOp
(
op_desc
);
return
op
;
}
};
};
class
RequestCheckpointHandler
final
:
public
RequestHandler
{
class
RequestCheckpointHandler
final
:
public
RequestHandler
{
...
@@ -85,8 +109,8 @@ class RequestCheckpointHandler final : public RequestHandler {
...
@@ -85,8 +109,8 @@ class RequestCheckpointHandler final : public RequestHandler {
virtual
~
RequestCheckpointHandler
()
{}
virtual
~
RequestCheckpointHandler
()
{}
bool
Handle
(
const
std
::
string
&
varname
,
framework
::
Scope
*
scope
,
bool
Handle
(
const
std
::
string
&
varname
,
framework
::
Scope
*
scope
,
framework
::
Variable
*
var
,
framework
::
Variable
**
outvar
,
framework
::
Variable
*
var
,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
=
""
,
const
std
::
string
&
out_var
_name
=
""
)
override
;
const
std
::
string
&
table
_name
=
""
)
override
;
private:
private:
int
checkpoint_notify_id
;
int
checkpoint_notify_id
;
...
...
paddle/fluid/operators/distributed/rpc_client.h
浏览文件 @
64ad051b
...
@@ -48,7 +48,7 @@ class RPCClient {
...
@@ -48,7 +48,7 @@ class RPCClient {
virtual
VarHandlePtr
AsyncPrefetchVar
(
virtual
VarHandlePtr
AsyncPrefetchVar
(
const
std
::
string
&
ep
,
const
platform
::
DeviceContext
&
ctx
,
const
std
::
string
&
ep
,
const
platform
::
DeviceContext
&
ctx
,
const
framework
::
Scope
&
scope
,
const
std
::
string
&
in_var_name
,
const
framework
::
Scope
&
scope
,
const
std
::
string
&
in_var_name
,
const
std
::
string
&
out_var_name
,
const
std
::
string
&
out_var_name
,
const
std
::
string
&
table_name
=
""
,
int64_t
time_out
=
FLAGS_rpc_deadline
)
=
0
;
int64_t
time_out
=
FLAGS_rpc_deadline
)
=
0
;
virtual
VarHandlePtr
AsyncSendBatchBarrier
(
virtual
VarHandlePtr
AsyncSendBatchBarrier
(
...
...
paddle/fluid/operators/distributed/send_recv.proto.in
浏览文件 @
64ad051b
...
@@ -80,6 +80,7 @@ message VariableMessage {
...
@@ -80,6 +80,7 @@ message VariableMessage {
// when profile switches from 1 to 2.
// when profile switches from 1 to 2.
int64 profile = 11;
int64 profile = 11;
int64 trainer_id = 12;
int64 trainer_id = 12;
string table_name = 13;
}
}
message VoidMessage {}
message VoidMessage {}
paddle/fluid/operators/distributed/variable_response.h
浏览文件 @
64ad051b
...
@@ -85,6 +85,7 @@ class VariableResponse {
...
@@ -85,6 +85,7 @@ class VariableResponse {
inline
framework
::
Scope
*
GetMutableLocalScope
()
const
{
return
local_scope_
;
}
inline
framework
::
Scope
*
GetMutableLocalScope
()
const
{
return
local_scope_
;
}
inline
std
::
string
Varname
()
const
{
return
meta_
.
varname
();
}
inline
std
::
string
Varname
()
const
{
return
meta_
.
varname
();
}
inline
std
::
string
OutVarname
()
const
{
return
meta_
.
out_varname
();
}
inline
std
::
string
OutVarname
()
const
{
return
meta_
.
out_varname
();
}
inline
std
::
string
TableName
()
const
{
return
meta_
.
table_name
();
}
// should call parse first.
// should call parse first.
framework
::
Variable
*
GetVar
()
{
framework
::
Variable
*
GetVar
()
{
...
...
paddle/fluid/operators/lookup_table_op.cc
浏览文件 @
64ad051b
...
@@ -87,6 +87,25 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -87,6 +87,25 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
"(boolean, default false) "
"(boolean, default false) "
"If the grad op reuse the input's variable."
)
"If the grad op reuse the input's variable."
)
.
SetDefault
(
false
);
.
SetDefault
(
false
);
// for parameter prefetch
AddAttr
<
bool
>
(
"remote_prefetch"
,
""
).
SetDefault
(
false
);
AddAttr
<
int
>
(
"trainer_id"
,
"trainer id from 0 ~ worker_num."
).
SetDefault
(
0
);
AddAttr
<
std
::
vector
<
int
>>
(
"height_sections"
,
"Height for each output SelectedRows."
)
.
SetDefault
(
std
::
vector
<
int
>
({}));
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"epmap"
,
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input variables for mapping"
)
.
SetDefault
({});
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"table_names"
,
"(string vector, the splited table names that will be fetched from "
"parameter server)"
"in the order of input variables for mapping"
)
.
SetDefault
({});
AddComment
(
R"DOC(
AddComment
(
R"DOC(
Lookup Table Operator.
Lookup Table Operator.
...
...
paddle/fluid/operators/lookup_table_op.cu
浏览文件 @
64ad051b
...
@@ -31,8 +31,8 @@ __global__ void LookupTable(T *output, const T *table, const int64_t *ids,
...
@@ -31,8 +31,8 @@ __global__ void LookupTable(T *output, const T *table, const int64_t *ids,
while
(
idy
<
K
)
{
while
(
idy
<
K
)
{
int64_t
id
=
ids
[
idy
];
int64_t
id
=
ids
[
idy
];
PADDLE_ASSERT
(
id
>=
0
);
PADDLE_ASSERT
_MSG_CODE
(
id
>=
0
,
"received id:"
,
id
);
PADDLE_ASSERT
(
id
<
N
);
PADDLE_ASSERT
_MSG_CODE
(
id
<
N
,
"received id:"
,
id
);
T
*
out
=
output
+
idy
*
D
;
T
*
out
=
output
+
idy
*
D
;
const
T
*
tab
=
table
+
id
*
D
;
const
T
*
tab
=
table
+
id
*
D
;
for
(
int
i
=
idx
;
i
<
D
;
i
+=
BlockDimX
)
{
for
(
int
i
=
idx
;
i
<
D
;
i
+=
BlockDimX
)
{
...
@@ -57,9 +57,9 @@ __global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids,
...
@@ -57,9 +57,9 @@ __global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids,
int
idy
=
blockIdx
.
x
+
threadIdx
.
y
*
GridDimX
;
int
idy
=
blockIdx
.
x
+
threadIdx
.
y
*
GridDimX
;
while
(
idy
<
K
)
{
while
(
idy
<
K
)
{
int
id
=
ids
[
idy
];
int
64_t
id
=
ids
[
idy
];
PADDLE_ASSERT
(
id
>=
0
);
PADDLE_ASSERT
_MSG_CODE
(
id
>=
0
,
"received id:"
,
id
);
PADDLE_ASSERT
(
id
<
N
);
PADDLE_ASSERT
_MSG_CODE
(
id
<
N
,
"received id:"
,
id
);
const
T
*
out
=
output
+
idy
*
D
;
const
T
*
out
=
output
+
idy
*
D
;
T
*
tab
=
table
+
id
*
D
;
T
*
tab
=
table
+
id
*
D
;
for
(
int
i
=
idx
;
i
<
D
;
i
+=
BlockDimX
)
{
for
(
int
i
=
idx
;
i
<
D
;
i
+=
BlockDimX
)
{
...
@@ -78,27 +78,47 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
...
@@ -78,27 +78,47 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
auto
*
output_t
=
context
.
Output
<
LoDTensor
>
(
"Out"
);
auto
*
output_t
=
context
.
Output
<
LoDTensor
>
(
"Out"
);
int64_t
padding_idx
=
context
.
Attr
<
int64_t
>
(
"padding_idx"
);
int64_t
padding_idx
=
context
.
Attr
<
int64_t
>
(
"padding_idx"
);
size_t
N
=
table_t
->
dims
()[
0
];
auto
id_name
=
context
.
Inputs
(
"Ids"
).
front
();
size_t
D
=
table_t
->
dims
()[
1
];
auto
out_name
=
context
.
Outputs
(
"Out"
).
front
();
size_t
K
=
ids_t
->
numel
();
// for remote prefetch
auto
*
ids
=
ids_t
->
data
<
int64_t
>
();
auto
epmap
=
context
.
Attr
<
std
::
vector
<
std
::
string
>>
(
"epmap"
);
auto
*
table
=
table_t
->
data
<
T
>
();
auto
height_sections
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"height_sections"
);
auto
*
output
=
output_t
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
table_names
=
context
.
Attr
<
std
::
vector
<
std
::
string
>>
(
"table_names"
);
dim3
threads
(
128
,
8
);
if
(
!
epmap
.
empty
())
{
dim3
grids
(
8
,
1
);
// if epmap is not empty, then the parameter will be fetched from remote
// parameter
if
(
padding_idx
==
-
1
)
// server
LookupTable
<
#ifdef PADDLE_WITH_DISTRIBUTE
T
,
128
,
8
,
8
,
operators
::
distributed
::
prefetch
(
id_name
,
out_name
,
table_names
,
epmap
,
false
><<<
grids
,
threads
,
0
,
context
.
cuda_device_context
().
stream
()
>>>
(
height_sections
,
context
);
output
,
table
,
ids
,
N
,
K
,
D
,
padding_idx
);
#else
else
PADDLE_THROW
(
LookupTable
<
"paddle is not compiled with distribute support, can not do "
T
,
128
,
8
,
8
,
"parameter prefetch!"
);
true
><<<
grids
,
threads
,
0
,
context
.
cuda_device_context
().
stream
()
>>>
(
#endif
output
,
table
,
ids
,
N
,
K
,
D
,
padding_idx
);
}
else
{
size_t
N
=
table_t
->
dims
()[
0
];
size_t
D
=
table_t
->
dims
()[
1
];
size_t
K
=
ids_t
->
numel
();
auto
*
ids
=
ids_t
->
data
<
int64_t
>
();
auto
*
table
=
table_t
->
data
<
T
>
();
auto
*
output
=
output_t
->
mutable_data
<
T
>
(
context
.
GetPlace
());
dim3
threads
(
128
,
8
);
dim3
grids
(
8
,
1
);
if
(
padding_idx
==
-
1
)
LookupTable
<
T
,
128
,
8
,
8
,
false
><<<
grids
,
threads
,
0
,
context
.
cuda_device_context
().
stream
()
>>>
(
output
,
table
,
ids
,
N
,
K
,
D
,
padding_idx
);
else
LookupTable
<
T
,
128
,
8
,
8
,
true
><<<
grids
,
threads
,
0
,
context
.
cuda_device_context
().
stream
()
>>>
(
output
,
table
,
ids
,
N
,
K
,
D
,
padding_idx
);
}
}
}
};
};
...
@@ -109,6 +129,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
...
@@ -109,6 +129,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto
&
dev_ctx
=
auto
&
dev_ctx
=
context
.
template
device_context
<
platform
::
CUDADeviceContext
>();
context
.
template
device_context
<
platform
::
CUDADeviceContext
>();
bool
is_sparse
=
context
.
Attr
<
bool
>
(
"is_sparse"
);
bool
is_sparse
=
context
.
Attr
<
bool
>
(
"is_sparse"
);
// Since paddings are not trainable and fixed in forward, the gradient of
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
// paddings makes no sense and we don't deal with it in backward.
if
(
is_sparse
)
{
if
(
is_sparse
)
{
...
...
paddle/fluid/operators/lookup_table_op.h
浏览文件 @
64ad051b
...
@@ -23,6 +23,10 @@ limitations under the License. */
...
@@ -23,6 +23,10 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/blas.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -41,44 +45,66 @@ class LookupTableKernel : public framework::OpKernel<T> {
...
@@ -41,44 +45,66 @@ class LookupTableKernel : public framework::OpKernel<T> {
auto
*
output_t
=
context
.
Output
<
LoDTensor
>
(
"Out"
);
// float tensor
auto
*
output_t
=
context
.
Output
<
LoDTensor
>
(
"Out"
);
// float tensor
auto
*
table_var
=
context
.
InputVar
(
"W"
);
auto
*
table_var
=
context
.
InputVar
(
"W"
);
int64_t
padding_idx
=
context
.
Attr
<
int64_t
>
(
"padding_idx"
);
auto
id_name
=
context
.
Inputs
(
"Ids"
).
front
();
int64_t
*
ids
=
const_cast
<
int64_t
*>
(
ids_t
->
data
<
int64_t
>
());
auto
out_name
=
context
.
Outputs
(
"Out"
).
front
();
int64_t
ids_numel
=
ids_t
->
numel
();
// for remote prefetch
if
(
table_var
->
IsType
<
LoDTensor
>
())
{
auto
epmap
=
context
.
Attr
<
std
::
vector
<
std
::
string
>>
(
"epmap"
);
auto
*
table_t
=
context
.
Input
<
LoDTensor
>
(
"W"
);
auto
height_sections
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"height_sections"
);
int64_t
row_number
=
table_t
->
dims
()[
0
];
auto
table_names
=
context
.
Attr
<
std
::
vector
<
std
::
string
>>
(
"table_names"
);
int64_t
row_width
=
table_t
->
dims
()[
1
];
if
(
!
epmap
.
empty
())
{
auto
*
table
=
table_t
->
data
<
T
>
();
// if epmap is not empty, then the parameter will be fetched from remote
auto
*
output
=
output_t
->
mutable_data
<
T
>
(
context
.
GetPlace
());
// parameter
// server
for
(
int64_t
i
=
0
;
i
<
ids_numel
;
++
i
)
{
#ifdef PADDLE_WITH_DISTRIBUTE
if
(
padding_idx
!=
kNoPadding
&&
ids
[
i
]
==
padding_idx
)
{
operators
::
distributed
::
prefetch
(
id_name
,
out_name
,
table_names
,
epmap
,
memset
(
output
+
i
*
row_width
,
0
,
row_width
*
sizeof
(
T
));
height_sections
,
context
);
}
else
{
#else
PADDLE_ENFORCE_LT
(
ids
[
i
],
row_number
);
PADDLE_THROW
(
PADDLE_ENFORCE_GE
(
ids
[
i
],
0
,
"ids %d"
,
i
);
"paddle is not compiled with distribute support, can not do "
memcpy
(
output
+
i
*
row_width
,
table
+
ids
[
i
]
*
row_width
,
"parameter prefetch!"
);
row_width
*
sizeof
(
T
));
#endif
}
else
{
int64_t
padding_idx
=
context
.
Attr
<
int64_t
>
(
"padding_idx"
);
int64_t
*
ids
=
const_cast
<
int64_t
*>
(
ids_t
->
data
<
int64_t
>
());
int64_t
ids_numel
=
ids_t
->
numel
();
if
(
table_var
->
IsType
<
LoDTensor
>
())
{
auto
*
table_t
=
context
.
Input
<
LoDTensor
>
(
"W"
);
int64_t
row_number
=
table_t
->
dims
()[
0
];
int64_t
row_width
=
table_t
->
dims
()[
1
];
auto
*
table
=
table_t
->
data
<
T
>
();
auto
*
output
=
output_t
->
mutable_data
<
T
>
(
context
.
GetPlace
());
for
(
int64_t
i
=
0
;
i
<
ids_numel
;
++
i
)
{
if
(
padding_idx
!=
kNoPadding
&&
ids
[
i
]
==
padding_idx
)
{
memset
(
output
+
i
*
row_width
,
0
,
row_width
*
sizeof
(
T
));
}
else
{
PADDLE_ENFORCE_LT
(
ids
[
i
],
row_number
);
PADDLE_ENFORCE_GE
(
ids
[
i
],
0
,
"ids %d"
,
i
);
memcpy
(
output
+
i
*
row_width
,
table
+
ids
[
i
]
*
row_width
,
row_width
*
sizeof
(
T
));
}
}
}
}
}
else
if
(
table_var
->
IsType
<
SelectedRows
>
())
{
}
else
if
(
table_var
->
IsType
<
SelectedRows
>
())
{
const
auto
&
table_t
=
table_var
->
Get
<
SelectedRows
>
();
const
auto
&
table_t
=
table_var
->
Get
<
SelectedRows
>
()
;
int64_t
row_width
=
table_t
.
value
().
dims
()[
1
]
;
int64_t
row_width
=
table_t
.
value
().
dims
()[
1
]
;
const
auto
*
table
=
table_t
.
value
().
data
<
T
>
()
;
const
auto
*
table
=
table_t
.
value
().
data
<
T
>
(
);
auto
*
output
=
output_t
->
mutable_data
<
T
>
(
context
.
GetPlace
()
);
auto
*
output
=
output_t
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
context
);
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
context
);
for
(
int64_t
i
=
0
;
i
<
ids_numel
;
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
ids_numel
;
++
i
)
{
if
(
padding_idx
!=
kNoPadding
&&
ids
[
i
]
==
padding_idx
)
{
if
(
padding_idx
!=
kNoPadding
&&
ids
[
i
]
==
padding_idx
)
{
memset
(
output
+
i
*
row_width
,
0
,
row_width
*
sizeof
(
T
));
memset
(
output
+
i
*
row_width
,
0
,
row_width
*
sizeof
(
T
));
}
else
{
}
else
{
PADDLE_ENFORCE_GE
(
ids
[
i
],
0
);
PADDLE_ENFORCE_GE
(
ids
[
i
],
0
);
auto
id_index
=
table_t
.
Index
(
ids
[
i
]
);
auto
id_index
=
table_t
.
Index
(
ids
[
i
]
);
PADDLE_ENFORCE_GE
(
id_index
,
0
,
"the input key should be exists."
);
PADDLE_ENFORCE_GE
(
id_index
,
0
,
"the input key should be exists."
);
blas
.
VCOPY
(
row_width
,
table
+
id_index
*
row_width
,
blas
.
VCOPY
(
row_width
,
table
+
id_index
*
row_width
,
output
+
i
*
row_width
);
output
+
i
*
row_width
);
}
}
}
}
}
}
}
...
...
paddle/fluid/operators/metrics/auc_op.h
浏览文件 @
64ad051b
...
@@ -75,8 +75,13 @@ class AucKernel : public framework::OpKernel<T> {
...
@@ -75,8 +75,13 @@ class AucKernel : public framework::OpKernel<T> {
const
auto
*
label_data
=
label
->
data
<
int64_t
>
();
const
auto
*
label_data
=
label
->
data
<
int64_t
>
();
for
(
size_t
i
=
0
;
i
<
batch_size
;
i
++
)
{
for
(
size_t
i
=
0
;
i
<
batch_size
;
i
++
)
{
uint32_t
binIdx
=
static_cast
<
uint32_t
>
(
auto
predict_data
=
inference_data
[
i
*
inference_width
+
1
];
inference_data
[
i
*
inference_width
+
1
]
*
num_thresholds
);
PADDLE_ENFORCE_LE
(
predict_data
,
1
,
"The predict data must less or equal 1."
);
PADDLE_ENFORCE_GE
(
predict_data
,
0
,
"The predict data must gather or equal 0."
);
uint32_t
binIdx
=
static_cast
<
uint32_t
>
(
predict_data
*
num_thresholds
);
if
(
label_data
[
i
])
{
if
(
label_data
[
i
])
{
(
*
stat_pos
)[
binIdx
]
+=
1.0
;
(
*
stat_pos
)[
binIdx
]
+=
1.0
;
}
else
{
}
else
{
...
...
paddle/fluid/operators/pad2d_op.cc
浏览文件 @
64ad051b
...
@@ -319,20 +319,46 @@ void Pad2DGradEdgeNHWC(T* d_in_data, const int num, const int channels,
...
@@ -319,20 +319,46 @@ void Pad2DGradEdgeNHWC(T* d_in_data, const int num, const int channels,
}
}
}
}
static
inline
void
GetPaddings
(
int
*
paddings
,
const
framework
::
ExecutionContext
&
context
)
{
auto
*
paddings_t
=
context
.
Input
<
Tensor
>
(
"Paddings"
);
if
(
paddings_t
)
{
auto
paddings_data
=
paddings_t
->
data
<
int
>
();
paddings
[
0
]
=
paddings_data
[
0
];
paddings
[
1
]
=
paddings_data
[
1
];
paddings
[
2
]
=
paddings_data
[
2
];
paddings
[
3
]
=
paddings_data
[
3
];
}
else
{
auto
pads
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
std
::
copy
(
pads
.
begin
(),
pads
.
end
(),
paddings
);
}
}
template
<
typename
T
>
template
<
typename
T
>
class
Pad2dCPUKernel
:
public
framework
::
OpKernel
<
T
>
{
class
Pad2dCPUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
pads
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
int
pads
[
4
];
GetPaddings
(
pads
,
context
);
auto
mode
=
context
.
Attr
<
std
::
string
>
(
"mode"
);
auto
mode
=
context
.
Attr
<
std
::
string
>
(
"mode"
);
auto
data_format
=
context
.
Attr
<
std
::
string
>
(
"data_format"
);
auto
data_format
=
context
.
Attr
<
std
::
string
>
(
"data_format"
);
T
value
=
context
.
Attr
<
T
>
(
"pad_value"
);
T
value
=
context
.
Attr
<
T
>
(
"pad_value"
);
auto
*
x
=
context
.
Input
<
Tensor
>
(
"X"
);
auto
*
x
=
context
.
Input
<
Tensor
>
(
"X"
);
auto
*
out
=
context
.
Output
<
Tensor
>
(
"Out"
);
auto
in_dims
=
x
->
dims
();
auto
in_dims
=
x
->
dims
();
auto
out_dims
=
out
->
dims
();
const
T
*
in_data
=
x
->
data
<
T
>
();
const
T
*
in_data
=
x
->
data
<
T
>
();
auto
*
out
=
context
.
Output
<
Tensor
>
(
"Out"
);
if
(
data_format
==
"NCHW"
)
{
out
->
Resize
({
in_dims
[
0
],
in_dims
[
1
],
in_dims
[
2
]
+
pads
[
0
]
+
pads
[
1
],
in_dims
[
3
]
+
pads
[
2
]
+
pads
[
3
]});
}
else
{
out
->
Resize
({
in_dims
[
0
],
in_dims
[
1
]
+
pads
[
0
]
+
pads
[
1
],
in_dims
[
2
]
+
pads
[
2
]
+
pads
[
3
],
in_dims
[
3
]});
}
auto
out_dims
=
out
->
dims
();
T
*
out_data
=
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
T
*
out_data
=
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
const
int
pad_top
=
pads
[
0
];
const
int
pad_top
=
pads
[
0
];
const
int
pad_left
=
pads
[
2
];
const
int
pad_left
=
pads
[
2
];
const
int
num
=
in_dims
[
0
];
const
int
num
=
in_dims
[
0
];
...
@@ -376,7 +402,8 @@ template <typename T>
...
@@ -376,7 +402,8 @@ template <typename T>
class
Pad2dGradCPUKernel
:
public
framework
::
OpKernel
<
T
>
{
class
Pad2dGradCPUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
pads
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
int
pads
[
4
];
GetPaddings
(
pads
,
context
);
auto
mode
=
context
.
Attr
<
std
::
string
>
(
"mode"
);
auto
mode
=
context
.
Attr
<
std
::
string
>
(
"mode"
);
auto
data_format
=
context
.
Attr
<
std
::
string
>
(
"data_format"
);
auto
data_format
=
context
.
Attr
<
std
::
string
>
(
"data_format"
);
auto
*
d_out
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
d_out
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
...
@@ -442,21 +469,35 @@ class Pad2dOp : public framework::OperatorWithKernel {
...
@@ -442,21 +469,35 @@ class Pad2dOp : public framework::OperatorWithKernel {
"Output(Out) of Pad2dOp should not be null."
);
"Output(Out) of Pad2dOp should not be null."
);
auto
x_dim
=
ctx
->
GetInputDim
(
"X"
);
auto
x_dim
=
ctx
->
GetInputDim
(
"X"
);
auto
paddings
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"paddings"
);
PADDLE_ENFORCE_EQ
(
x_dim
.
size
(),
4
,
PADDLE_ENFORCE_EQ
(
x_dim
.
size
(),
4
,
"Size of paddings should be equal to 4."
);
"The size of input(X)'s dimension should be equal to 4."
);
std
::
vector
<
int64_t
>
out_dims
(
x_dim
.
size
());
std
::
vector
<
int64_t
>
out_dims
(
x_dim
.
size
());
auto
data_format
=
ctx
->
Attrs
().
Get
<
std
::
string
>
(
"data_format"
);
auto
data_format
=
ctx
->
Attrs
().
Get
<
std
::
string
>
(
"data_format"
);
out_dims
[
0
]
=
x_dim
[
0
];
out_dims
[
0
]
=
x_dim
[
0
];
if
(
data_format
==
"NCHW"
)
{
if
(
ctx
->
HasInput
(
"Paddings"
))
{
auto
paddings_dim
=
ctx
->
GetInputDim
(
"Paddings"
);
PADDLE_ENFORCE_EQ
(
paddings_dim
.
size
(),
1
,
"Size of Input(Paddings)'s dimension should be equal to 1."
);
PADDLE_ENFORCE_EQ
(
paddings_dim
[
0
],
4
,
"Shape of Input(Paddings) should be equal to [4]."
);
out_dims
[
1
]
=
x_dim
[
1
];
out_dims
[
1
]
=
x_dim
[
1
];
out_dims
[
2
]
=
x_dim
[
2
]
+
paddings
[
0
]
+
paddings
[
1
];
// height
out_dims
[
2
]
=
x_dim
[
2
];
out_dims
[
3
]
=
x_dim
[
3
]
+
paddings
[
2
]
+
paddings
[
3
];
// width
}
else
{
// NHWC
out_dims
[
3
]
=
x_dim
[
3
];
out_dims
[
3
]
=
x_dim
[
3
];
out_dims
[
1
]
=
x_dim
[
1
]
+
paddings
[
0
]
+
paddings
[
1
];
}
else
{
out_dims
[
2
]
=
x_dim
[
2
]
+
paddings
[
2
]
+
paddings
[
3
];
auto
paddings
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"paddings"
);
PADDLE_ENFORCE_EQ
(
paddings
.
size
(),
4
,
"Size of paddings should be equal to 4."
);
if
(
data_format
==
"NCHW"
)
{
out_dims
[
1
]
=
x_dim
[
1
];
out_dims
[
2
]
=
x_dim
[
2
]
+
paddings
[
0
]
+
paddings
[
1
];
// height
out_dims
[
3
]
=
x_dim
[
3
]
+
paddings
[
2
]
+
paddings
[
3
];
// width
}
else
{
// NHWC
out_dims
[
3
]
=
x_dim
[
3
];
out_dims
[
1
]
=
x_dim
[
1
]
+
paddings
[
0
]
+
paddings
[
1
];
out_dims
[
2
]
=
x_dim
[
2
]
+
paddings
[
2
]
+
paddings
[
3
];
}
}
}
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
out_dims
));
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
out_dims
));
...
@@ -466,6 +507,13 @@ class Pad2dOp : public framework::OperatorWithKernel {
...
@@ -466,6 +507,13 @@ class Pad2dOp : public framework::OperatorWithKernel {
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
}
}
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
()),
ctx
.
GetPlace
());
}
};
};
class
Pad2dOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
class
Pad2dOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
...
@@ -477,6 +525,12 @@ class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -477,6 +525,12 @@ class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput
(
"Out"
,
AddOutput
(
"Out"
,
"The output of pad2d op. "
"The output of pad2d op. "
"A tensor with the same shape as X."
);
"A tensor with the same shape as X."
);
AddInput
(
"Paddings"
,
"A 1-D tensor to describe the padding rules."
"paddings=[0, 1, 2, 3] means "
"padding 0 row to top, 1 row to bottom, 2 columns to left "
"and 3 columns to right. Size of paddings must be 4."
)
.
AsDispensable
();
AddAttr
<
std
::
vector
<
int
>>
(
AddAttr
<
std
::
vector
<
int
>>
(
"paddings"
,
"paddings"
,
"(vector<int>) "
"(vector<int>) "
...
@@ -554,6 +608,13 @@ class Pad2dOpGrad : public framework::OperatorWithKernel {
...
@@ -554,6 +608,13 @@ class Pad2dOpGrad : public framework::OperatorWithKernel {
ctx
->
SetOutputDim
(
x_grad_name
,
x_dims
);
ctx
->
SetOutputDim
(
x_grad_name
,
x_dims
);
}
}
}
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
()),
ctx
.
GetPlace
());
}
};
};
class
Pad2dOpGradMaker
:
public
framework
::
SingleGradOpDescMaker
{
class
Pad2dOpGradMaker
:
public
framework
::
SingleGradOpDescMaker
{
...
@@ -564,6 +625,7 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker {
...
@@ -564,6 +625,7 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker {
std
::
unique_ptr
<
framework
::
OpDesc
>
Apply
()
const
override
{
std
::
unique_ptr
<
framework
::
OpDesc
>
Apply
()
const
override
{
auto
*
bind
=
new
framework
::
OpDesc
();
auto
*
bind
=
new
framework
::
OpDesc
();
bind
->
SetInput
(
"X"
,
Input
(
"X"
));
bind
->
SetInput
(
"X"
,
Input
(
"X"
));
bind
->
SetInput
(
"Paddings"
,
Input
(
"Paddings"
));
bind
->
SetInput
(
framework
::
GradVarName
(
"Out"
),
OutputGrad
(
"Out"
));
bind
->
SetInput
(
framework
::
GradVarName
(
"Out"
),
OutputGrad
(
"Out"
));
bind
->
SetOutput
(
framework
::
GradVarName
(
"X"
),
InputGrad
(
"X"
));
bind
->
SetOutput
(
framework
::
GradVarName
(
"X"
),
InputGrad
(
"X"
));
bind
->
SetAttrMap
(
Attrs
());
bind
->
SetAttrMap
(
Attrs
());
...
...
paddle/fluid/operators/pad2d_op.cu
浏览文件 @
64ad051b
...
@@ -287,20 +287,50 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data,
...
@@ -287,20 +287,50 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data,
}
}
}
}
static
inline
void
GetPaddings
(
int
*
paddings
,
const
framework
::
ExecutionContext
&
context
)
{
auto
*
paddings_t
=
context
.
Input
<
Tensor
>
(
"Paddings"
);
if
(
paddings_t
)
{
Tensor
pads
;
framework
::
TensorCopySync
(
*
paddings_t
,
platform
::
CPUPlace
(),
&
pads
);
auto
pads_data
=
pads
.
data
<
int
>
();
paddings
[
0
]
=
pads_data
[
0
];
paddings
[
1
]
=
pads_data
[
1
];
paddings
[
2
]
=
pads_data
[
2
];
paddings
[
3
]
=
pads_data
[
3
];
}
else
{
auto
pads
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
std
::
copy
(
pads
.
begin
(),
pads
.
end
(),
paddings
);
}
}
template
<
typename
T
>
template
<
typename
T
>
class
Pad2dCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
class
Pad2dCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
pads
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
int
pads
[
4
];
GetPaddings
(
pads
,
context
);
auto
mode
=
context
.
Attr
<
std
::
string
>
(
"mode"
);
auto
mode
=
context
.
Attr
<
std
::
string
>
(
"mode"
);
auto
data_format
=
context
.
Attr
<
std
::
string
>
(
"data_format"
);
auto
data_format
=
context
.
Attr
<
std
::
string
>
(
"data_format"
);
T
value
=
context
.
Attr
<
T
>
(
"pad_value"
);
T
value
=
context
.
Attr
<
T
>
(
"pad_value"
);
auto
*
x
=
context
.
Input
<
Tensor
>
(
"X"
);
auto
*
x
=
context
.
Input
<
Tensor
>
(
"X"
);
auto
*
out
=
context
.
Output
<
Tensor
>
(
"Out"
);
auto
in_dims
=
x
->
dims
();
auto
in_dims
=
x
->
dims
();
auto
out_dims
=
out
->
dims
();
const
T
*
in_data
=
x
->
data
<
T
>
();
const
T
*
in_data
=
x
->
data
<
T
>
();
T
*
out_data
=
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
out
=
context
.
Output
<
Tensor
>
(
"Out"
);
auto
out_dims
=
out
->
dims
();
if
(
data_format
==
"NCHW"
)
{
out_dims
[
0
]
=
in_dims
[
0
];
out_dims
[
1
]
=
in_dims
[
1
];
out_dims
[
2
]
=
in_dims
[
2
]
+
pads
[
0
]
+
pads
[
1
];
out_dims
[
3
]
=
in_dims
[
3
]
+
pads
[
2
]
+
pads
[
3
];
}
else
{
out_dims
[
0
]
=
in_dims
[
0
];
out_dims
[
1
]
=
in_dims
[
1
]
+
pads
[
0
]
+
pads
[
1
];
out_dims
[
2
]
=
in_dims
[
2
]
+
pads
[
2
]
+
pads
[
3
];
out_dims
[
3
]
=
in_dims
[
3
];
}
T
*
out_data
=
out
->
mutable_data
<
T
>
(
out_dims
,
context
.
GetPlace
());
const
int
pad_top
=
pads
[
0
];
const
int
pad_top
=
pads
[
0
];
const
int
pad_left
=
pads
[
2
];
const
int
pad_left
=
pads
[
2
];
const
int
num
=
in_dims
[
0
];
const
int
num
=
in_dims
[
0
];
...
@@ -356,7 +386,8 @@ template <typename T>
...
@@ -356,7 +386,8 @@ template <typename T>
class
Pad2dGradCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
class
Pad2dGradCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
pads
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
int
pads
[
4
];
GetPaddings
(
pads
,
context
);
auto
mode
=
context
.
Attr
<
std
::
string
>
(
"mode"
);
auto
mode
=
context
.
Attr
<
std
::
string
>
(
"mode"
);
auto
data_format
=
context
.
Attr
<
std
::
string
>
(
"data_format"
);
auto
data_format
=
context
.
Attr
<
std
::
string
>
(
"data_format"
);
auto
*
d_out
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
d_out
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
...
...
paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
浏览文件 @
64ad051b
...
@@ -18,6 +18,7 @@ namespace paddle {
...
@@ -18,6 +18,7 @@ namespace paddle {
namespace
operators
{
namespace
operators
{
using
framework
::
Tensor
;
using
framework
::
Tensor
;
const
int
kIgnoreIndex
=
-
100
;
class
SigmoidCrossEntropyWithLogitsOp
:
public
framework
::
OperatorWithKernel
{
class
SigmoidCrossEntropyWithLogitsOp
:
public
framework
::
OperatorWithKernel
{
public:
public:
...
@@ -100,6 +101,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker
...
@@ -100,6 +101,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker
AddOutput
(
"Out"
,
AddOutput
(
"Out"
,
"(Tensor, default Tensor<float>), a 2-D tensor with shape N x D "
"(Tensor, default Tensor<float>), a 2-D tensor with shape N x D "
" of elementwise logistic losses."
);
" of elementwise logistic losses."
);
AddAttr
<
int
>
(
"ignore_index"
,
"(int, default kIgnoreIndex), Specifies a target value that "
"is ignored and"
"does not contribute to the input gradient."
)
.
SetDefault
(
kIgnoreIndex
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
SigmoidCrossEntropyWithLogits Operator.
SigmoidCrossEntropyWithLogits Operator.
...
...
paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
浏览文件 @
64ad051b
...
@@ -15,33 +15,72 @@ limitations under the License. */
...
@@ -15,33 +15,72 @@ limitations under the License. */
#pragma once
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/legacy/utils/Logging.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
template
<
typename
T
,
int
MajorType
=
Eigen
::
RowMajor
,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenVector
=
framework
::
EigenVector
<
T
,
MajorType
,
IndexType
>
;
template
<
typename
T
,
int
MajorType
=
Eigen
::
RowMajor
,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenMatrix
=
framework
::
EigenMatrix
<
T
,
MajorType
,
IndexType
>
;
template
<
typename
T
>
struct
SigmoidCrossEntropyWithLogitsForward
{
HOSTDEVICE
SigmoidCrossEntropyWithLogitsForward
(
const
int
&
ignore_index
)
:
ignore_index
(
ignore_index
)
{}
HOSTDEVICE
T
operator
()(
const
T
&
x
,
const
T
&
label
)
const
{
if
(
static_cast
<
int
>
(
label
)
==
ignore_index
)
{
return
static_cast
<
T
>
(
0.
);
}
T
term1
=
(
x
>
0
)
?
x
:
0
;
T
term2
=
x
*
label
;
T
term3
=
std
::
log
(
static_cast
<
T
>
(
1
)
+
std
::
exp
(
-
(
std
::
abs
(
x
))));
return
term1
-
term2
+
term3
;
}
int
ignore_index
;
};
template
<
typename
T
>
struct
SigmoidCrossEntropyWithLogitsBackward
{
HOSTDEVICE
SigmoidCrossEntropyWithLogitsBackward
(
const
int
&
ignore_index
)
:
ignore_index
(
ignore_index
)
{}
HOSTDEVICE
T
operator
()(
const
T
&
x
,
const
T
&
label
)
const
{
if
(
static_cast
<
int
>
(
label
)
==
ignore_index
)
{
return
static_cast
<
T
>
(
0.
);
}
T
simoid_x
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
std
::
exp
(
-
x
));
return
simoid_x
-
label
;
}
int
ignore_index
;
};
// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
SigmoidCrossEntropyWithLogitsKernel
:
public
framework
::
OpKernel
<
T
>
{
class
SigmoidCrossEntropyWithLogitsKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
const
framework
::
Tensor
*
X
=
context
.
Input
<
framework
::
Tensor
>
(
"X"
);
const
Tensor
*
X
=
context
.
Input
<
Tensor
>
(
"X"
);
const
framework
::
Tensor
*
Labels
=
context
.
Input
<
framework
::
Tensor
>
(
"Label"
);
const
Tensor
*
Labels
=
context
.
Input
<
Tensor
>
(
"Label"
);
framework
::
Tensor
*
Out
=
context
.
Output
<
framework
::
Tensor
>
(
"Out"
);
Tensor
*
Out
=
context
.
Output
<
Tensor
>
(
"Out"
);
Out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
Out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
int
ignore_index
=
context
.
Attr
<
int
>
(
"ignore_index"
);
auto
x
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
X
);
auto
x
=
EigenVector
<
T
>::
Flatten
(
*
X
);
auto
labels
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
Labels
);
auto
labels
=
EigenVector
<
T
>::
Flatten
(
*
Labels
);
auto
out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
Out
);
auto
out
=
EigenVector
<
T
>::
Flatten
(
*
Out
);
auto
&
place
=
*
context
.
device_context
<
DeviceContext
>
().
eigen_device
();
auto
&
place
=
*
context
.
device_context
<
DeviceContext
>
().
eigen_device
();
// term1 = max(x, 0)
out
.
device
(
place
)
=
x
.
binaryExpr
(
auto
term1
=
x
.
cwiseMax
(
static_cast
<
T
>
(
0
));
labels
,
SigmoidCrossEntropyWithLogitsForward
<
T
>
(
ignore_index
));
// term2 = x * labels
auto
term2
=
x
*
labels
;
// term3 = log(1 + exp(-abs(x)))
auto
term3
=
(
static_cast
<
T
>
(
1
)
+
(
-
(
x
.
abs
())).
exp
()).
log
();
out
.
device
(
place
)
=
term1
-
term2
+
term3
;
}
}
};
};
...
@@ -50,23 +89,23 @@ template <typename DeviceContext, typename T>
...
@@ -50,23 +89,23 @@ template <typename DeviceContext, typename T>
class
SigmoidCrossEntropyWithLogitsGradKernel
:
public
framework
::
OpKernel
<
T
>
{
class
SigmoidCrossEntropyWithLogitsGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
const
framework
::
Tensor
*
X
=
context
.
Input
<
framework
::
Tensor
>
(
"X"
);
const
Tensor
*
X
=
context
.
Input
<
Tensor
>
(
"X"
);
const
framework
::
Tensor
*
Labels
=
context
.
Input
<
framework
::
Tensor
>
(
"Label"
);
const
Tensor
*
Labels
=
context
.
Input
<
Tensor
>
(
"Label"
);
const
framework
::
Tensor
*
dOut
=
const
Tensor
*
dOut
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
context
.
Input
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
Tensor
*
dX
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
framework
::
Tensor
*
dX
=
context
.
Output
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"X"
));
dX
->
mutable_data
<
T
>
(
context
.
GetPlace
());
dX
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
x
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
X
);
auto
ignore_index
=
context
.
Attr
<
int
>
(
"ignore_index"
);
auto
labels
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
Labels
);
auto
x
=
EigenVector
<
T
>::
Flatten
(
*
X
);
auto
dout
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
dOut
);
auto
labels
=
EigenVector
<
T
>::
Flatten
(
*
Labels
);
auto
dx
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
dX
);
auto
dout
=
EigenVector
<
T
>::
Flatten
(
*
dOut
);
auto
dx
=
EigenVector
<
T
>::
Flatten
(
*
dX
);
auto
&
place
=
auto
&
place
=
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
();
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
();
auto
sigmoid_x
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
(
-
x
).
exp
());
auto
diff
=
x
.
binaryExpr
(
labels
,
SigmoidCrossEntropyWithLogitsBackward
<
T
>
(
dx
.
device
(
place
)
=
dout
*
(
sigmoid_x
-
labels
);
static_cast
<
int
>
(
ignore_index
)));
dx
.
device
(
place
)
=
dout
*
diff
;
}
}
};
};
...
...
paddle/fluid/operators/softmax_mkldnn_op.cc
浏览文件 @
64ad051b
...
@@ -15,7 +15,7 @@ limitations under the License. */
...
@@ -15,7 +15,7 @@ limitations under the License. */
#include <iostream>
#include <iostream>
#include "mkldnn.hpp"
#include "mkldnn.hpp"
#include "paddle/fluid/operators/softmax_op.h"
#include "paddle/fluid/operators/softmax_op.h"
#include "paddle/fluid/platform/mkldnn_
helper
.h"
#include "paddle/fluid/platform/mkldnn_
reuse
.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
...
paddle/fluid/platform/assert.h
浏览文件 @
64ad051b
...
@@ -36,6 +36,15 @@ limitations under the License. */
...
@@ -36,6 +36,15 @@ limitations under the License. */
asm("trap;"); \
asm("trap;"); \
} \
} \
} while (0)
} while (0)
#define PADDLE_ASSERT_MSG_CODE(e, m, c) \
do { \
if (!(e)) { \
printf("%s:%d Assertion `%s` failed (%s %d).\n", __FILE__, __LINE__, \
TOSTRING(e), m, c); \
asm("trap;"); \
} \
} while (0)
#else
#else
#include <assert.h>
#include <assert.h>
// For cuda, the assertions can affect performance and it is therefore
// For cuda, the assertions can affect performance and it is therefore
...
@@ -43,4 +52,5 @@ limitations under the License. */
...
@@ -43,4 +52,5 @@ limitations under the License. */
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
#define PADDLE_ASSERT(e) assert((e))
#define PADDLE_ASSERT(e) assert((e))
#define PADDLE_ASSERT_MSG(e, m) assert((e) && (m))
#define PADDLE_ASSERT_MSG(e, m) assert((e) && (m))
#define PADDLE_ASSERT_MSG_CODE(e, m, c) assert((e) && (m) && (c || 1))
#endif
#endif
paddle/fluid/platform/dynload/cudnn.h
浏览文件 @
64ad051b
...
@@ -111,7 +111,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
...
@@ -111,7 +111,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnFindConvolutionForwardAlgorithmEx); \
__macro(cudnnFindConvolutionForwardAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \
__macro(cudnnGetErrorString);
__macro(cudnnGetErrorString); \
__macro(cudnnCreateDropoutDescriptor); \
__macro(cudnnDropoutGetStatesSize); \
__macro(cudnnSetDropoutDescriptor); \
__macro(cudnnCreateRNNDescriptor); \
__macro(cudnnSetRNNDescriptor); \
__macro(cudnnGetRNNParamsSize); \
__macro(cudnnGetRNNWorkspaceSize); \
__macro(cudnnGetRNNTrainingReserveSize); \
__macro(cudnnRNNForwardTraining); \
__macro(cudnnRNNBackwardData); \
__macro(cudnnRNNBackwardWeights); \
__macro(cudnnRNNForwardInference); \
__macro(cudnnDestroyDropoutDescriptor); \
__macro(cudnnDestroyRNNDescriptor); \
__macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
CUDNN_DNN_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
...
...
paddle/fluid/platform/mkldnn_helper.h
浏览文件 @
64ad051b
...
@@ -107,170 +107,6 @@ inline mkldnn::memory::format GetMKLDNNFormat(
...
@@ -107,170 +107,6 @@ inline mkldnn::memory::format GetMKLDNNFormat(
memory
.
dst_primitive_desc
().
desc
().
data
.
format
);
memory
.
dst_primitive_desc
().
desc
().
data
.
format
);
}
}
class
MKLDNNHandler
{
public:
MKLDNNHandler
(
const
MKLDNNDeviceContext
&
dev_ctx
,
mkldnn
::
engine
engine
,
const
std
::
string
&
base_key
)
:
dev_ctx_
(
dev_ctx
),
engine_
(
engine
),
key_
(
base_key
),
is_reusing_
(
false
)
{}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireSrcMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
)
{
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_src_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
)
{
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_weights_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireBiasMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
)
{
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_bias_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDstMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
)
{
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_dst_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffDstMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
)
{
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_diff_dst_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffSrcMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
)
{
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_diff_src_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireMemoryFromPrimitive
(
mkldnn
::
memory
::
primitive_desc
mdp
,
void
*
ptr
,
const
std
::
string
&
suffix
)
{
auto
local_key
=
key_
+
suffix
;
auto
mem_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx_
.
GetBlob
(
local_key
));
PADDLE_ENFORCE
((
mem_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find mem primitive in device context"
);
if
(
mem_p
==
nullptr
)
{
mem_p
=
std
::
make_shared
<
mkldnn
::
memory
>
(
mdp
,
ptr
);
dev_ctx_
.
SetBlob
(
local_key
,
mem_p
);
}
else
{
mem_p
->
set_data_handle
(
ptr
);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_
=
true
;
}
return
mem_p
;
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
,
const
std
::
string
&
suffix
)
{
/*Generate key*/
auto
local_key
=
key_
+
suffix
;
auto
mem_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx_
.
GetBlob
(
local_key
));
PADDLE_ENFORCE
((
mem_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find mem primitive in device context"
);
if
(
mem_p
==
nullptr
)
{
mem_p
=
std
::
make_shared
<
mkldnn
::
memory
>
(
mkldnn
::
memory
::
primitive_desc
{
md
,
engine_
},
ptr
);
dev_ctx_
.
SetBlob
(
local_key
,
mem_p
);
}
else
{
mem_p
->
set_data_handle
(
ptr
);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_
=
true
;
}
return
mem_p
;
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireMemory
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>&
user_memory_p
,
const
std
::
shared_ptr
<
mkldnn
::
memory
>&
target_memory_p
,
const
std
::
string
&
suffix
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
local_key
=
key_
+
suffix
;
auto
key_reorder_p
=
key_
+
suffix
+
"reorder_p"
;
auto
stored_reorder_p
=
std
::
static_pointer_cast
<
mkldnn
::
reorder
>
(
dev_ctx_
.
GetBlob
(
key_reorder_p
));
if
(
stored_reorder_p
)
{
pipeline
.
push_back
(
*
stored_reorder_p
);
}
else
{
auto
reorder_p
=
std
::
make_shared
<
mkldnn
::
reorder
>
(
*
user_memory_p
,
*
target_memory_p
);
dev_ctx_
.
SetBlob
(
key_reorder_p
,
reorder_p
);
pipeline
.
push_back
(
*
reorder_p
);
}
return
target_memory_p
;
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireMemory
(
mkldnn
::
memory
::
primitive_desc
&
mpd
,
// NOLINT
mkldnn
::
memory
::
primitive_desc
&
user_mpd
,
// NOLINT
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
const
std
::
string
&
suffix
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
,
// NOLINT
bool
is_persistent
=
false
)
{
// create reorder primitive if the input format is not the preferred one
auto
local_key
=
key_
+
suffix
;
auto
key_reorder_p
=
key_
+
suffix
+
"reorder_p"
;
auto
target_memory_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx_
.
GetBlob
(
local_key
));
PADDLE_ENFORCE
((
target_memory_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find mem primitive in device context"
);
if
(
target_memory_p
==
nullptr
)
{
target_memory_p
=
user_memory_p
;
std
::
shared_ptr
<
mkldnn
::
primitive
>
reorder_p
;
if
(
mpd
!=
user_mpd
)
{
target_memory_p
=
std
::
make_shared
<
mkldnn
::
memory
>
(
mpd
);
auto
reorder_p
=
std
::
make_shared
<
mkldnn
::
reorder
>
(
*
user_memory_p
,
*
target_memory_p
);
dev_ctx_
.
SetBlob
(
key_reorder_p
,
reorder_p
);
pipeline
.
push_back
(
*
reorder_p
);
}
dev_ctx_
.
SetBlob
(
local_key
,
target_memory_p
);
}
else
if
(
!
is_persistent
)
{
// Make reorder if needed
auto
reorder_p
=
std
::
static_pointer_cast
<
mkldnn
::
reorder
>
(
dev_ctx_
.
GetBlob
(
key_reorder_p
));
if
(
reorder_p
!=
nullptr
)
{
pipeline
.
push_back
(
*
reorder_p
);
}
is_reusing_
=
true
;
}
return
target_memory_p
;
}
static
std
::
string
GetHash
(
mkldnn
::
memory
::
dims
&
operand_dims
,
// NOLINT
const
std
::
string
&
suffix
)
{
return
dims2str
(
operand_dims
)
+
suffix
;
}
protected:
static
std
::
string
dims2str
(
const
mkldnn
::
memory
::
dims
&
operand_dims
)
{
std
::
string
dstr
=
""
;
for
(
size_t
i
=
0
;
i
<
operand_dims
.
size
();
++
i
)
{
dstr
+=
std
::
to_string
(
operand_dims
[
i
])
+
"-"
;
}
return
dstr
;
}
protected:
const
MKLDNNDeviceContext
&
dev_ctx_
;
mkldnn
::
engine
engine_
;
std
::
string
key_
;
bool
is_reusing_
;
};
inline
mkldnn
::
memory
::
format
MKLDNNFormatForSize
(
inline
mkldnn
::
memory
::
format
MKLDNNFormatForSize
(
size_t
dims_size
,
mkldnn
::
memory
::
format
data_format
)
{
size_t
dims_size
,
mkldnn
::
memory
::
format
data_format
)
{
if
(
dims_size
==
1
)
{
if
(
dims_size
==
1
)
{
...
...
paddle/fluid/platform/mkldnn_reuse.h
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/place.h"
namespace
paddle
{
namespace
platform
{
using
user_function
=
std
::
function
<
std
::
shared_ptr
<
float
>
(
const
float
*
)
>
;
class
MKLDNNHandler
{
public:
MKLDNNHandler
(
const
MKLDNNDeviceContext
&
dev_ctx
,
mkldnn
::
engine
engine
,
const
std
::
string
&
base_key
)
:
dev_ctx_
(
dev_ctx
),
engine_
(
engine
),
key_
(
base_key
),
is_reusing_
(
false
)
{}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireSrcMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
)
{
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_src_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
,
user_function
custom_func
=
{})
{
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_weights_mem_p"
,
custom_func
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireBiasMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
)
{
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_bias_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDstMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
)
{
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_dst_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffDstMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
)
{
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_diff_dst_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffSrcMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
)
{
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_diff_src_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireMemoryFromPrimitive
(
mkldnn
::
memory
::
primitive_desc
mdp
,
void
*
ptr
,
const
std
::
string
&
suffix
)
{
auto
local_key
=
key_
+
suffix
;
auto
mem_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx_
.
GetBlob
(
local_key
));
PADDLE_ENFORCE
((
mem_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find mem primitive in device context"
);
if
(
mem_p
==
nullptr
)
{
mem_p
=
std
::
make_shared
<
mkldnn
::
memory
>
(
mdp
,
ptr
);
dev_ctx_
.
SetBlob
(
local_key
,
mem_p
);
}
else
{
mem_p
->
set_data_handle
(
ptr
);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_
=
true
;
}
return
mem_p
;
}
// This incarnation of AcquireMemory can call user function eg. custom reorder
// or preprocessing routine if needed
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
,
const
std
::
string
&
suffix
,
user_function
custom_func
=
{})
{
/*Generate key*/
auto
local_key
=
key_
+
suffix
;
auto
mem_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx_
.
GetBlob
(
local_key
));
PADDLE_ENFORCE
((
mem_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find mem primitive in device context"
);
if
(
mem_p
==
nullptr
)
{
// Call custom reorder/preprocessing func if available
if
(
custom_func
)
{
auto
reordered_data
=
custom_func
(
reinterpret_cast
<
const
float
*>
(
ptr
));
dev_ctx_
.
SetBlob
(
local_key
+
"-custom_reorder"
,
reordered_data
);
ptr
=
reinterpret_cast
<
void
*>
(
reordered_data
.
get
());
}
mem_p
=
std
::
make_shared
<
mkldnn
::
memory
>
(
mkldnn
::
memory
::
primitive_desc
{
md
,
engine_
},
ptr
);
dev_ctx_
.
SetBlob
(
local_key
,
mem_p
);
}
else
{
mem_p
->
set_data_handle
(
ptr
);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_
=
true
;
}
return
mem_p
;
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireMemory
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>&
user_memory_p
,
const
std
::
shared_ptr
<
mkldnn
::
memory
>&
target_memory_p
,
const
std
::
string
&
suffix
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
local_key
=
key_
+
suffix
;
auto
key_reorder_p
=
key_
+
suffix
+
"reorder_p"
;
auto
stored_reorder_p
=
std
::
static_pointer_cast
<
mkldnn
::
reorder
>
(
dev_ctx_
.
GetBlob
(
key_reorder_p
));
if
(
stored_reorder_p
)
{
pipeline
.
push_back
(
*
stored_reorder_p
);
}
else
{
auto
reorder_p
=
std
::
make_shared
<
mkldnn
::
reorder
>
(
*
user_memory_p
,
*
target_memory_p
);
dev_ctx_
.
SetBlob
(
key_reorder_p
,
reorder_p
);
pipeline
.
push_back
(
*
reorder_p
);
}
return
target_memory_p
;
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireMemory
(
mkldnn
::
memory
::
primitive_desc
&
mpd
,
// NOLINT
mkldnn
::
memory
::
primitive_desc
&
user_mpd
,
// NOLINT
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
const
std
::
string
&
suffix
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
,
// NOLINT
bool
is_persistent
=
false
)
{
// create reorder primitive if the input format is not the preferred one
auto
local_key
=
key_
+
suffix
;
auto
key_reorder_p
=
key_
+
suffix
+
"reorder_p"
;
auto
target_memory_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx_
.
GetBlob
(
local_key
));
PADDLE_ENFORCE
((
target_memory_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find mem primitive in device context"
);
if
(
target_memory_p
==
nullptr
)
{
target_memory_p
=
user_memory_p
;
std
::
shared_ptr
<
mkldnn
::
primitive
>
reorder_p
;
if
(
mpd
!=
user_mpd
)
{
target_memory_p
=
std
::
make_shared
<
mkldnn
::
memory
>
(
mpd
);
auto
reorder_p
=
std
::
make_shared
<
mkldnn
::
reorder
>
(
*
user_memory_p
,
*
target_memory_p
);
dev_ctx_
.
SetBlob
(
key_reorder_p
,
reorder_p
);
pipeline
.
push_back
(
*
reorder_p
);
}
dev_ctx_
.
SetBlob
(
local_key
,
target_memory_p
);
}
else
if
(
!
is_persistent
)
{
// Make reorder if needed
auto
reorder_p
=
std
::
static_pointer_cast
<
mkldnn
::
reorder
>
(
dev_ctx_
.
GetBlob
(
key_reorder_p
));
if
(
reorder_p
!=
nullptr
)
{
pipeline
.
push_back
(
*
reorder_p
);
}
is_reusing_
=
true
;
}
return
target_memory_p
;
}
static
std
::
string
GetHash
(
mkldnn
::
memory
::
dims
&
operand_dims
,
// NOLINT
const
std
::
string
&
suffix
)
{
return
dims2str
(
operand_dims
)
+
suffix
;
}
protected:
static
std
::
string
dims2str
(
const
mkldnn
::
memory
::
dims
&
operand_dims
)
{
std
::
string
dstr
=
""
;
for
(
size_t
i
=
0
;
i
<
operand_dims
.
size
();
++
i
)
{
dstr
+=
std
::
to_string
(
operand_dims
[
i
])
+
"-"
;
}
return
dstr
;
}
protected:
const
MKLDNNDeviceContext
&
dev_ctx_
;
mkldnn
::
engine
engine_
;
std
::
string
key_
;
bool
is_reusing_
;
};
template
<
class
forward_t
,
class
backward_data_t
,
class
backward_weights_t
>
class
ConvMKLDNNTemplateHandler
:
public
MKLDNNHandler
{
public:
ConvMKLDNNTemplateHandler
(
std
::
shared_ptr
<
typename
forward_t
::
primitive_desc
>
conv_pd
,
const
platform
::
MKLDNNDeviceContext
&
dev_ctx
,
mkldnn
::
engine
engine
,
const
std
::
string
&
base_key
)
:
platform
::
MKLDNNHandler
(
dev_ctx
,
engine
,
base_key
)
{
conv_pd_
=
conv_pd
;
}
ConvMKLDNNTemplateHandler
(
std
::
shared_ptr
<
typename
forward_t
::
primitive_desc
>
conv_pd
,
std
::
shared_ptr
<
typename
backward_data_t
::
primitive_desc
>
conv_bwd_data_pd
,
std
::
shared_ptr
<
typename
backward_weights_t
::
primitive_desc
>
conv_bwd_weights_pd
,
const
platform
::
MKLDNNDeviceContext
&
dev_ctx
,
mkldnn
::
engine
engine
,
const
std
::
string
&
base_key
)
:
platform
::
MKLDNNHandler
(
dev_ctx
,
engine
,
base_key
),
conv_pd_
(
conv_pd
),
conv_bwd_weights_pd_
(
conv_bwd_weights_pd
),
conv_bwd_data_pd_
(
conv_bwd_data_pd
)
{
// If we are in Grad operatgor then update a key with BWD suffix to
// distinguish from FWD memory primitives
key_
+=
"-BWD"
;
}
size_t
GetDstMemorySize
()
const
{
return
conv_pd_
->
dst_primitive_desc
().
get_size
();
}
mkldnn
::
memory
::
format
GetDstFormat
()
const
{
return
static_cast
<
mkldnn
::
memory
::
format
>
(
conv_pd_
->
dst_primitive_desc
().
desc
().
data
.
format
);
}
size_t
GetDiffWeightsMemorySize
()
const
{
return
conv_bwd_weights_pd_
->
diff_weights_primitive_desc
().
get_size
();
}
size_t
GetDiffSourceMemorySize
()
const
{
return
conv_bwd_data_pd_
->
diff_src_primitive_desc
().
get_size
();
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireSrcMemoryFromWeightsPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
src_pd
=
conv_bwd_weights_pd_
->
src_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
src_pd
,
user_pd
,
user_memory_p
,
"@weights-src_mem_p"
,
pipeline
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffDstMemoryFromWeightsPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
diff_dst_pd
=
conv_bwd_weights_pd_
->
diff_dst_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
diff_dst_pd
,
user_pd
,
user_memory_p
,
"@weights-diff_dst_mem_p"
,
pipeline
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffWeightsMemoryFromWeightsPrimitive
(
void
*
ptr
)
{
return
this
->
AcquireMemoryFromPrimitive
(
conv_bwd_weights_pd_
->
diff_weights_primitive_desc
(),
ptr
,
"@diff_weights_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffDstMemoryFromDataPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
diff_dst_pd
=
conv_bwd_data_pd_
->
diff_dst_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
diff_dst_pd
,
user_pd
,
user_memory_p
,
"@data-diff_dst_mem_p"
,
pipeline
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemoryFromDataPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_weights_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
weights_pd
=
conv_bwd_data_pd_
->
weights_primitive_desc
();
auto
user_pd
=
user_weights_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
weights_pd
,
user_pd
,
user_weights_memory_p
,
"@data-weights_mem_p"
,
pipeline
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireResidualDataMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
)
{
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_residual_data_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDstMemoryFromResidualDataMemory
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>&
user_residual_memory_p
,
void
*
dst_ptr
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
return
this
->
AcquireMemory
(
user_residual_memory_p
,
this
->
AcquireDstMemoryFromPrimitive
(
dst_ptr
),
"@residual_data_mem_p"
,
pipeline
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDiffSrcMemoryFromDataPrimitive
(
void
*
ptr
)
{
return
this
->
AcquireMemoryFromPrimitive
(
conv_bwd_data_pd_
->
diff_src_primitive_desc
(),
ptr
,
"@diff_src_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDstMemoryFromPrimitive
(
void
*
ptr
)
{
return
this
->
AcquireMemoryFromPrimitive
(
conv_pd_
->
dst_primitive_desc
(),
ptr
,
"@dst_mem_p"
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireSrcMemoryFromPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
src_pd
=
conv_pd_
->
src_primitive_desc
();
auto
user_pd
=
user_memory_p
->
get_primitive_desc
();
return
this
->
AcquireMemory
(
src_pd
,
user_pd
,
user_memory_p
,
"@src_mem_p"
,
pipeline
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemoryFromPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_weights_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
,
// NOLINT
bool
is_persistent
=
false
)
{
auto
user_weights_pd
=
user_weights_memory_p
->
get_primitive_desc
();
auto
weights_pd
=
conv_pd_
->
weights_primitive_desc
();
return
this
->
AcquireMemory
(
weights_pd
,
user_weights_pd
,
user_weights_memory_p
,
"@weights_mem_p"
,
pipeline
,
is_persistent
);
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireBiasMemoryFromPrimitive
(
const
std
::
shared_ptr
<
mkldnn
::
memory
>
user_bias_memory_p
,
std
::
vector
<
mkldnn
::
primitive
>&
pipeline
)
{
// NOLINT
auto
user_bias_pd
=
user_bias_memory_p
->
get_primitive_desc
();
auto
bias_pd
=
conv_pd_
->
bias_primitive_desc
();
return
this
->
AcquireMemory
(
bias_pd
,
user_bias_pd
,
user_bias_memory_p
,
"@bias_mem_p"
,
pipeline
);
}
std
::
shared_ptr
<
forward_t
>
AcquireConvolution
(
std
::
shared_ptr
<
mkldnn
::
memory
>
src_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
weights_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
dst_memory_p
)
{
auto
prim_key
=
key_
+
"@conv_p"
;
auto
conv_p
=
std
::
static_pointer_cast
<
forward_t
>
(
dev_ctx_
.
GetBlob
(
prim_key
));
PADDLE_ENFORCE
((
conv_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find convolution primitive in device context"
);
if
(
conv_p
==
nullptr
)
{
conv_p
=
std
::
make_shared
<
forward_t
>
(
*
conv_pd_
,
*
(
src_memory_p
),
*
(
weights_memory_p
.
get
()),
*
(
dst_memory_p
.
get
()));
dev_ctx_
.
SetBlob
(
prim_key
,
conv_p
);
}
else
{
is_reusing_
=
true
;
}
return
conv_p
;
}
std
::
shared_ptr
<
forward_t
>
AcquireConvolution
(
std
::
shared_ptr
<
mkldnn
::
memory
>
src_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
weights_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
bias_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
dst_memory_p
)
{
auto
prim_key
=
key_
+
"@conv_p"
;
auto
conv_p
=
std
::
static_pointer_cast
<
forward_t
>
(
dev_ctx_
.
GetBlob
(
prim_key
));
PADDLE_ENFORCE
((
conv_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find convolution primitive in device context"
);
if
(
conv_p
==
nullptr
)
{
conv_p
=
std
::
make_shared
<
forward_t
>
(
*
conv_pd_
,
*
(
src_memory_p
),
*
(
weights_memory_p
.
get
()),
*
(
bias_memory_p
.
get
()),
*
(
dst_memory_p
.
get
()));
dev_ctx_
.
SetBlob
(
prim_key
,
conv_p
);
}
else
{
is_reusing_
=
true
;
}
return
conv_p
;
}
std
::
shared_ptr
<
backward_weights_t
>
AcquireConvolutionBackwardWeights
(
std
::
shared_ptr
<
mkldnn
::
memory
>
src_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
diff_dst_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
diff_weights_memory_p
)
{
auto
prim_key
=
key_
+
"@conv_bwd_weights_p"
;
auto
conv_bwd_weights_p
=
std
::
static_pointer_cast
<
backward_weights_t
>
(
dev_ctx_
.
GetBlob
(
prim_key
));
PADDLE_ENFORCE
(
(
conv_bwd_weights_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find convolution bwd weights primitive in device context"
);
if
(
conv_bwd_weights_p
==
nullptr
)
{
// create backward conv primitive for weights
conv_bwd_weights_p
=
std
::
make_shared
<
backward_weights_t
>
(
*
conv_bwd_weights_pd_
,
*
src_memory_p
,
*
diff_dst_memory_p
,
*
diff_weights_memory_p
);
dev_ctx_
.
SetBlob
(
prim_key
,
conv_bwd_weights_p
);
}
else
{
is_reusing_
=
true
;
}
return
conv_bwd_weights_p
;
}
std
::
shared_ptr
<
backward_data_t
>
AcquireConvolutionBackwardData
(
std
::
shared_ptr
<
mkldnn
::
memory
>
diff_dst_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
weights_memory_p
,
std
::
shared_ptr
<
mkldnn
::
memory
>
diff_src_memory_p
)
{
auto
prim_key
=
key_
+
"@conv_bwd_data_p"
;
auto
conv_bwd_data_p
=
std
::
static_pointer_cast
<
backward_data_t
>
(
dev_ctx_
.
GetBlob
(
prim_key
));
PADDLE_ENFORCE
(
(
conv_bwd_data_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find convolution bwd data primitive in device context"
);
if
(
conv_bwd_data_p
==
nullptr
)
{
conv_bwd_data_p
=
std
::
make_shared
<
backward_data_t
>
(
*
conv_bwd_data_pd_
,
*
diff_dst_memory_p
,
*
weights_memory_p
,
*
diff_src_memory_p
);
dev_ctx_
.
SetBlob
(
prim_key
,
conv_bwd_data_p
);
}
else
{
is_reusing_
=
true
;
}
return
conv_bwd_data_p
;
}
// Generate keys for storing/retriving primitives for this operator
// TODO(jczaja): Make hashing function more optimial
static
std
::
string
GetHash
(
mkldnn
::
memory
::
dims
&
input_dims
,
// NOLINT
mkldnn
::
memory
::
dims
&
weights_dims
,
// NOLINT
std
::
vector
<
int
>&
strides
,
// NOLINT
std
::
vector
<
int
>&
paddings
,
// NOLINT
std
::
vector
<
int
>&
dilations
,
// NOLINT
int
groups
,
const
std
::
string
&
suffix
)
{
return
dims2str
(
input_dims
)
+
dims2str
(
weights_dims
)
+
dims2str
(
strides
)
+
dims2str
(
paddings
)
+
dims2str
(
dilations
)
+
std
::
to_string
(
groups
)
+
suffix
;
}
private:
std
::
shared_ptr
<
typename
forward_t
::
primitive_desc
>
conv_pd_
;
std
::
shared_ptr
<
typename
backward_weights_t
::
primitive_desc
>
conv_bwd_weights_pd_
;
std
::
shared_ptr
<
typename
backward_data_t
::
primitive_desc
>
conv_bwd_data_pd_
;
};
using
ConvMKLDNNHandler
=
ConvMKLDNNTemplateHandler
<
mkldnn
::
convolution_forward
,
mkldnn
::
convolution_backward_data
,
mkldnn
::
convolution_backward_weights
>
;
using
ConvTransposeMKLDNNHandler
=
ConvMKLDNNTemplateHandler
<
mkldnn
::
deconvolution_forward
,
mkldnn
::
deconvolution_backward_data
,
mkldnn
::
deconvolution_backward_weights
>
;
}
// namespace platform
}
// namespace paddle
paddle/fluid/pybind/CMakeLists.txt
浏览文件 @
64ad051b
set
(
PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler
)
set
(
PYBIND_DEPS pybind python proto_desc memory executor
async_executor
prune feed_fetch_method pass_builder parallel_executor profiler
)
set
(
PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
)
set
(
PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
async_executor_py.cc
)
if
(
WITH_PYTHON
)
if
(
WITH_PYTHON
)
if
(
WITH_AMD_GPU
)
if
(
WITH_AMD_GPU
)
hip_library
(
paddle_pybind SHARED
hip_library
(
paddle_pybind SHARED
...
...
paddle/fluid/pybind/async_executor_py.cc
0 → 100644
浏览文件 @
64ad051b
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fcntl.h>
// To avoid conflicting definition in gcc-4.8.2 headers and pyconfig.h (2.7.3)
#ifdef _POSIX_C_SOURCE
#undef _POSIX_C_SOURCE
#endif
#ifdef _XOPEN_SOURCE
#undef _XOPEN_SOURCE
#endif
#include <string>
#include <vector>
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/async_executor.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/data_feed.pb.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/variant.h"
#include "paddle/fluid/pybind/async_executor_py.h"
namespace
py
=
pybind11
;
namespace
pd
=
paddle
::
framework
;
namespace
paddle
{
namespace
pybind
{
using
set_name_func
=
void
(
pd
::
DataFeedDesc
::*
)(
const
std
::
string
&
);
void
BindAsyncExecutor
(
py
::
module
*
m
)
{
py
::
class_
<
framework
::
AsyncExecutor
>
(
*
m
,
"AsyncExecutor"
)
.
def
(
py
::
init
([](
framework
::
Scope
*
scope
,
const
platform
::
Place
&
place
)
{
return
std
::
unique_ptr
<
framework
::
AsyncExecutor
>
(
new
framework
::
AsyncExecutor
(
scope
,
place
));
}))
.
def
(
"run_from_files"
,
&
framework
::
AsyncExecutor
::
RunFromFile
);
}
// end BindAsyncExecutor
}
// end namespace pybind
}
// end namespace paddle
paddle/fluid/pybind/async_executor_py.h
0 → 100644
浏览文件 @
64ad051b
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace
py
=
pybind11
;
namespace
paddle
{
namespace
pybind
{
void
BindAsyncExecutor
(
py
::
module
*
m
);
}
// namespace pybind
}
// namespace paddle
paddle/fluid/pybind/pybind.cc
浏览文件 @
64ad051b
...
@@ -42,6 +42,7 @@ limitations under the License. */
...
@@ -42,6 +42,7 @@ limitations under the License. */
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/pybind/async_executor_py.h"
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/protobuf.h"
#include "paddle/fluid/pybind/protobuf.h"
...
@@ -932,6 +933,7 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -932,6 +933,7 @@ All parameter, weight, gradient are variables in Paddle.
});
});
BindRecordIOWriter
(
&
m
);
BindRecordIOWriter
(
&
m
);
BindAsyncExecutor
(
&
m
);
}
}
}
// namespace pybind
}
// namespace pybind
}
// namespace paddle
}
// namespace paddle
paddle/scripts/paddle_build.sh
浏览文件 @
64ad051b
...
@@ -440,11 +440,29 @@ EOF
...
@@ -440,11 +440,29 @@ EOF
ctest
--output-on-failure
-j
$1
ctest
--output-on-failure
-j
$1
# make install should also be test when unittest
# make install should also be test when unittest
make
install
-j
8
make
install
-j
8
pip
install
--user
${
INSTALL_PREFIX
:-
/paddle/build
}
/opt/paddle/share/wheels/
*
.whl
if
[
"
$1
"
==
"cp27-cp27m"
]
;
then
pip
install
--user
${
INSTALL_PREFIX
:-
/paddle/build
}
/opt/paddle/share/wheels/
*
.whl
elif
[
"
$1
"
==
"cp35-cp35m"
]
;
then
pip3.5
install
--user
${
INSTALL_PREFIX
:-
/paddle/build
}
/opt/paddle/share/wheels/
*
.whl
elif
[
"
$1
"
==
"cp36-cp36m"
]
;
then
pip3.6
install
--user
${
INSTALL_PREFIX
:-
/paddle/build
}
/opt/paddle/share/wheels/
*
.whl
elif
[
"
$1
"
==
"cp37-cp37m"
]
;
then
pip3.7
install
--user
${
INSTALL_PREFIX
:-
/paddle/build
}
/opt/paddle/share/wheels/
*
.whl
fi
if
[[
${
WITH_FLUID_ONLY
:-
OFF
}
==
"OFF"
]]
;
then
if
[[
${
WITH_FLUID_ONLY
:-
OFF
}
==
"OFF"
]]
;
then
paddle version
paddle version
fi
fi
pip uninstall
-y
paddlepaddle
if
[
"
$1
"
==
"cp27-cp27m"
]
;
then
pip uninstall
-y
paddlepaddle
elif
[
"
$1
"
==
"cp35-cp35m"
]
;
then
pip3.5 uninstall
-y
paddlepaddle
elif
[
"
$1
"
==
"cp36-cp36m"
]
;
then
pip3.6 uninstall
-y
paddlepaddle
elif
[
"
$1
"
==
"cp37-cp37m"
]
;
then
pip3.7 uninstall
-y
paddlepaddle
fi
fi
fi
}
}
...
...
python/paddle/fluid/__init__.py
浏览文件 @
64ad051b
...
@@ -20,6 +20,13 @@ from .framework import *
...
@@ -20,6 +20,13 @@ from .framework import *
# import all class inside executor into fluid module
# import all class inside executor into fluid module
from
.
import
executor
from
.
import
executor
from
.executor
import
*
from
.executor
import
*
from
.
import
data_feed_desc
from
.data_feed_desc
import
*
from
.
import
async_executor
from
.async_executor
import
*
from
.
import
trainer
from
.
import
trainer
from
.
import
inferencer
from
.
import
inferencer
...
@@ -54,7 +61,8 @@ Tensor = LoDTensor
...
@@ -54,7 +61,8 @@ Tensor = LoDTensor
__all__
=
framework
.
__all__
+
executor
.
__all__
+
\
__all__
=
framework
.
__all__
+
executor
.
__all__
+
\
trainer
.
__all__
+
inferencer
.
__all__
+
transpiler
.
__all__
+
\
trainer
.
__all__
+
inferencer
.
__all__
+
transpiler
.
__all__
+
\
parallel_executor
.
__all__
+
lod_tensor
.
__all__
+
[
parallel_executor
.
__all__
+
lod_tensor
.
__all__
+
\
data_feed_desc
.
__all__
+
async_executor
.
__all__
+
[
'io'
,
'io'
,
'initializer'
,
'initializer'
,
'layers'
,
'layers'
,
...
...
python/paddle/fluid/async_executor.py
0 → 100644
浏览文件 @
64ad051b
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
numpy
as
np
import
contextlib
import
six
from
.framework
import
Program
,
default_main_program
,
Variable
from
.
import
core
from
.executor
import
global_scope
,
Executor
from
paddle.fluid.proto
import
data_feed_pb2
from
google.protobuf
import
text_format
from
.
import
io
from
.data_feed_desc
import
DataFeedDesc
__all__
=
[
'AsyncExecutor'
]
class
AsyncExecutor
(
object
):
"""
An asynchronous Executor in Python. Through exploiting the power of
multi-core processor and data queueing, AsyncExecutor makes data reading
and cosuming decoupled, each run in multiple threads in parallel.
Instead of reading data in python side, AsyncExecutor accepts a training
file list, which will be retrieved in C++, then training inputs will be
read, parsed and fed to training network within C++ code.
AsyncExecutor is in active development and the API might change in the near
future.
Example:
>>> data_feed = fluid.DataFeedDesc('data.proto')
>>> startup_program = fluid.default_startup_program()
>>> main_program = fluid.default_main_program()
>>> filelist = ["train_data/part-%d" % i for i in range(100)]
>>> thread_num = len(filelist) / 4
>>>
>>> place = fluid.CPUPlace()
>>> async_executor = fluid.AsyncExecutor(place)
>>>
>>> async_executor.run_startup_program(startup_program)
>>>
>>> epoch = 10
>>> for i in range(epoch):
>>> async_executor.run(main_program,
>>> data_feed,
>>> filelist,
>>> thread_num,
>>> [acc],
>>> debug=False)
Args:
place(fluid.CPUPlace|None): indicate the executor run on which device.
Only CPUPlace supported
Note:
For debugging complicated network in parallel-GPUs, you can test it
on the executor. They has the exactly same arguments, and expected
the same results.
Note: Only running on CPUPlace supported.
"""
def
__init__
(
self
,
place
=
None
):
if
place
is
None
:
place
=
core
.
CPUPlace
()
if
not
isinstance
(
place
,
core
.
CPUPlace
):
raise
ValueError
(
"AsyncExecutor only supports CPU device"
)
p
=
core
.
Place
()
p
.
set_place
(
place
)
scope
=
global_scope
()
self
.
executor
=
core
.
AsyncExecutor
(
scope
,
p
)
def
run
(
self
,
program
,
data_feed
,
filelist
,
thread_num
,
fetch
,
debug
=
False
):
"""
Run program by this AsyncExecutor. Training dataset will be in filelist.
Users can also inspect certain variables by naming them in parameter
:code:`fetch`, like in fluid.Executor. Unlike fluid.Executor, however,
AsyncExecutor doesn't return fetched variables, instead, it will dump
the values of each fetched variable to stdandard output.
Running the dataset will be on multiple threads, within each a thread
local scope will be created, then all OPs also created in that scope.
Parameters are updated by all the OPs simultaneously.
Args:
program(Program): the program that need to run, if not provied,
then default_main_program will be used.
data_feed(DataFeedDesc): A DataFeedDesc object
filelist(str): a file containing the training dataset file list
thread_num(int): number of concurrent training threads. See
:code:`Note` for how to set this properly
fetch(str|list): the var name or a list of var names to inspect
debug(bool): When set to True, fetch vars will be printed to
standard output after each minibatch
Note:
the executor will run all operators in the program but not only
the operators dependent by the fetch_list.
Note:
Running AsyncExecutor will be on multiple threads, each bound to a
CPU core. To achieve best performance, it's suggested to set thread
num to be equal or slightly less than that of CPU cores.
"""
if
program
is
None
:
program
=
default_main_program
()
program_desc
=
program
.
desc
if
data_feed
is
None
:
raise
ValueError
(
'ValueError: data_feed should be provided'
)
if
filelist
is
None
:
raise
ValueError
(
'ValueError: filelist should be provided'
)
if
isinstance
(
filelist
,
str
):
filelist
=
[
filelist
]
if
not
isinstance
(
thread_num
,
int
):
raise
TypeError
(
'TypeError: thread_num should be a positive number'
)
if
fetch
is
not
None
:
if
isinstance
(
fetch
,
Variable
):
fetch
=
[
fetch
]
fetch_var_names
=
[
var
.
name
for
var
in
fetch
]
for
fetch_var
in
fetch
:
shape
=
fetch_var
.
shape
if
shape
[
len
(
shape
)
-
1
]
!=
1
:
raise
AssertionError
(
"%s: Fetch variable has wrong shape. Only varibles "
"with the last dimension size 1 supported."
%
(
fetch_var
.
name
))
self
.
executor
.
run_from_files
(
program_desc
,
data_feed
.
desc
(),
filelist
,
thread_num
,
fetch_var_names
,
debug
)
python/paddle/fluid/data_feed_desc.py
0 → 100644
浏览文件 @
64ad051b
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
paddle.fluid.proto
import
data_feed_pb2
from
google.protobuf
import
text_format
__all__
=
[
'DataFeedDesc'
]
class
DataFeedDesc
(
object
):
"""
Datafeed descriptor, describing input training data format. This class is
currently only used for AsyncExecutor (See comments for class AsyncExecutor
for a brief introduction)
DataFeedDesc shall be initialized from a valid protobuf message from disk:
>>> data_feed = fluid.DataFeedDesc('data.proto')
See :code:`paddle/fluid/framework/data_feed.proto` for message definition.
A typical message might look like:
>>> name: "MultiSlotDataFeed"
>>> batch_size: 2
>>> multi_slot_desc {
>>> slots {
>>> name: "words"
>>> type: "uint64"
>>> is_dense: false
>>> is_used: true
>>> }
>>> slots {
>>> name: "label"
>>> type: "uint64"
>>> is_dense: false
>>> is_used: true
>>> }
>>> }
However, users usually shouldn't care about the message format; instead,
they are encouragd to use :code:`Data Generator` as a tool to generate a
valid data description, in the process of converting their raw log files to
training files acceptable to AsyncExecutor.
DataFeedDesc can also be changed during runtime. Once you got familiar with
what each field mean, you can modify it to better suit your need. E.g.:
>>> data_feed.set_batch_size(128)
>>> data_feed.set_dense_slots('wd') # The slot named 'wd' will be dense
>>> data_feed.set_use_slots('wd') # The slot named 'wd' will be used
Finally, the content can be dumped out for debugging purpose:
>>> print(data_feed.desc())
Args:
proto_file(string): Disk file containing a data feed description.
"""
def
__init__
(
self
,
proto_file
):
self
.
proto_desc
=
data_feed_pb2
.
DataFeedDesc
()
with
open
(
proto_file
,
'r'
)
as
f
:
text_format
.
Parse
(
f
.
read
(),
self
.
proto_desc
)
if
self
.
proto_desc
.
name
==
"MultiSlotDataFeed"
:
self
.
__name_to_index
=
{
slot
.
name
:
i
for
i
,
slot
in
enumerate
(
self
.
proto_desc
.
multi_slot_desc
.
slots
)
}
def
set_batch_size
(
self
,
batch_size
):
"""
Set batch size. Will be effective during training
Example:
>>> data_feed = fluid.DataFeedDesc('data.proto')
>>> data_feed.set_batch_size(128)
Args:
batch_size: batch size
"""
self
.
proto_desc
.
batch_size
=
batch_size
def
set_dense_slots
(
self
,
dense_slots_name
):
"""
Set if a specific slot will be dense. Will be effective during training.
features for a dense slot will be fed into a Tensor, while those for a
sparse slot will be fed into a LoDTensor
Example:
>>> data_feed = fluid.DataFeedDesc('data.proto')
>>> data_feed.set_dense_slots(['words'])
Args:
dense_slots_name: a list of slot names which will be set dense
Note:
Default is sparse for all slots
"""
if
self
.
proto_desc
.
name
!=
"MultiSlotDataFeed"
:
raise
ValueError
(
"Only MultiSlotDataFeed need set_dense_slots, pls check your datafeed.proto"
)
for
name
in
dense_slots_name
:
self
.
proto_desc
.
multi_slot_desc
.
slots
[
self
.
__name_to_index
[
name
]].
is_dense
=
True
def
set_use_slots
(
self
,
use_slots_name
):
"""
Set if a specific slot will be used for training. A dataset shall
contain a lot of features, through this function one can select which
ones will be used for a specific model.
Example:
>>> data_feed = fluid.DataFeedDesc('data.proto')
>>> data_feed.set_use_slots(['words'])
Args:
use_slots_name: a list of slot names which will be used in training
Note:
Default is not used for all slots
"""
if
self
.
proto_desc
.
name
!=
"MultiSlotDataFeed"
:
raise
ValueError
(
"Only MultiSlotDataFeed need set_use_slots, pls check your datafeed.proto"
)
for
name
in
use_slots_name
:
self
.
proto_desc
.
multi_slot_desc
.
slots
[
self
.
__name_to_index
[
name
]].
is_used
=
True
def
desc
(
self
):
"""
Returns a protobuf message for this DataFeedDesc
Example:
>>> data_feed = fluid.DataFeedDesc('data.proto')
>>> print(data_feed.desc())
Returns:
A string message
"""
return
text_format
.
MessageToString
(
self
.
proto_desc
)
python/paddle/fluid/executor.py
浏览文件 @
64ad051b
...
@@ -278,6 +278,7 @@ class Executor(object):
...
@@ -278,6 +278,7 @@ class Executor(object):
p
=
core
.
Place
()
p
=
core
.
Place
()
p
.
set_place
(
place
)
p
.
set_place
(
place
)
self
.
executor
=
core
.
Executor
(
p
)
self
.
executor
=
core
.
Executor
(
p
)
self
.
program_caches
=
dict
()
self
.
program_caches
=
dict
()
self
.
_closed
=
False
self
.
_closed
=
False
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
64ad051b
...
@@ -169,8 +169,11 @@ __all__ = [
...
@@ -169,8 +169,11 @@ __all__ = [
'log_loss'
,
'log_loss'
,
'add_position_encoding'
,
'add_position_encoding'
,
'bilinear_tensor_product'
,
'bilinear_tensor_product'
,
'lstm'
,
]
]
kIgnoreIndex
=
-
100
def
fc
(
input
,
def
fc
(
input
,
size
,
size
,
...
@@ -326,6 +329,11 @@ def embedding(input,
...
@@ -326,6 +329,11 @@ def embedding(input,
"""
"""
helper
=
LayerHelper
(
'embedding'
,
**
locals
())
helper
=
LayerHelper
(
'embedding'
,
**
locals
())
remote_prefetch
=
False
if
os
.
environ
.
get
(
'PADDLE_ENABLE_REMOTE_PREFETCH'
):
remote_prefetch
=
True
if
remote_prefetch
:
assert
is_sparse
is
True
and
is_distributed
is
False
w
=
helper
.
create_parameter
(
w
=
helper
.
create_parameter
(
attr
=
helper
.
param_attr
,
shape
=
size
,
dtype
=
dtype
,
is_bias
=
False
)
attr
=
helper
.
param_attr
,
shape
=
size
,
dtype
=
dtype
,
is_bias
=
False
)
tmp
=
helper
.
create_variable_for_type_inference
(
dtype
)
tmp
=
helper
.
create_variable_for_type_inference
(
dtype
)
...
@@ -339,6 +347,7 @@ def embedding(input,
...
@@ -339,6 +347,7 @@ def embedding(input,
attrs
=
{
attrs
=
{
'is_sparse'
:
is_sparse
,
'is_sparse'
:
is_sparse
,
'is_distributed'
:
is_distributed
,
'is_distributed'
:
is_distributed
,
'remote_prefetch'
:
remote_prefetch
,
'padding_idx'
:
padding_idx
'padding_idx'
:
padding_idx
})
})
return
tmp
return
tmp
...
@@ -466,6 +475,168 @@ def dynamic_lstm(input,
...
@@ -466,6 +475,168 @@ def dynamic_lstm(input,
return
hidden
,
cell
return
hidden
,
cell
def
lstm
(
input
,
init_h
,
init_c
,
max_len
,
hidden_size
,
num_layers
,
dropout_prob
=
0.0
,
is_bidirec
=
False
,
is_test
=
False
,
name
=
None
,
default_initializer
=
None
,
seed
=-
1
):
"""
If Device is GPU, This op will use cudnn LSTM implementation
A four-gate Long Short-Term Memory network with no peephole connections.
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
$$ i_t =
\\
sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
$$ f_t =
\\
sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
$$ o_t =
\\
sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
$$
\\
tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
$$ c_t = f_t
\\
odot c_{t-1} + i_t
\\
odot
\\
tilde{c_t} $$
$$ h_t = o_t
\\
odot tanh(c_t) $$
- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
of weights from the input gate to the input)
- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
- sigmoid is the logistic sigmoid function.
- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$.
- The $\odot$ is the element-wise product of the vectors.
- `tanh` is the activation functions.
- $
\t
ilde{c_t}$ is also called candidate hidden state,
which is computed based on the current input and the previous hidden state.
Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
X represensts a matrix multiplication
Args:
input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
init_h(Variable): The initial hidden state of the LSTM
This is a tensor with shape ( num_layers x batch_size x hidden_size)
if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
init_c(Variable): The initial cell state of the LSTM.
This is a tensor with shape ( num_layers x batch_size x hidden_size )
if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
hidden_size (int): hidden size of the LSTM
num_layers (int): total layers number of the LSTM
dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
There is NO dropout work on rnn output of the last RNN layers
is_bidirec (bool): If it is bidirectional
is_test (bool): If it is in test phrase
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
default_initializer(Initialize|None): Where use initializer to initialize the Weight
If set None, defaule initializer will be used
seed(int): Seed for dropout in LSTM, If it's -1, dropout will use random seed
Returns:
rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size)
if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
last_h(Tensor): the hidden state of the last step of LSTM
shape is ( num_layers x batch_size x hidden_size )
if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
last_c(Tensor): the cell state of the last step of LSTM
shape is ( num_layers x batch_size x hidden_size )
if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
Examples:
.. code-block:: python
input = embedding
batch_size = 20
max_len = 100
dropout_prob = 0.2
input_size = 100
hidden_size = 150
num_layers = 1
init_hidden1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)
init_cell1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)
rnn_out, last_h, last_c = layers.lstm( input, init_h, init_c,
\
max_len, dropout_prob, input_size, hidden_size,
\
num_layers)
"""
helper
=
LayerHelper
(
'cudnn_lstm'
,
**
locals
())
dtype
=
input
.
dtype
input_shape
=
list
(
input
.
shape
)
input_size
=
input_shape
[
-
1
]
weight_size
=
0
for
i
in
range
(
num_layers
):
if
i
==
0
:
input_weight_size
=
(
input_size
*
hidden_size
)
*
4
else
:
if
is_bidirec
:
input_weight_size
=
(
hidden_size
*
2
*
hidden_size
)
*
4
else
:
input_weight_size
=
(
hidden_size
*
hidden_size
)
*
4
hidden_weight_size
=
(
hidden_size
*
hidden_size
)
*
4
if
is_bidirec
:
weight_size
+=
(
input_weight_size
+
hidden_weight_size
)
*
2
weight_size
+=
hidden_size
*
8
*
2
else
:
weight_size
+=
input_weight_size
+
hidden_weight_size
weight_size
+=
hidden_size
*
8
weight
=
helper
.
create_parameter
(
attr
=
helper
.
param_attr
,
shape
=
[
weight_size
],
dtype
=
dtype
,
default_initializer
=
default_initializer
)
out
=
helper
.
create_variable_for_type_inference
(
dtype
)
last_h
=
helper
.
create_variable_for_type_inference
(
dtype
)
last_c
=
helper
.
create_variable_for_type_inference
(
dtype
)
cache
=
helper
.
create_variable
(
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
,
stop_gradient
=
True
)
helper
.
append_op
(
type
=
'cudnn_lstm'
,
inputs
=
{
'Input'
:
input
,
'InitH'
:
init_h
,
'InitC'
:
init_c
,
'W'
:
weight
,
'Cache'
:
cache
,
},
outputs
=
{
'Out'
:
out
,
'last_h'
:
last_h
,
'last_c'
:
last_c
,
},
attrs
=
{
'max_len'
:
max_len
,
'is_bidirec'
:
is_bidirec
,
'input_size'
:
input_size
,
'hidden_size'
:
hidden_size
,
'num_layers'
:
num_layers
,
'is_test'
:
is_test
,
'dropout_prob'
:
dropout_prob
,
'seed'
:
seed
,
})
return
out
,
last_h
,
last_c
def
dynamic_lstmp
(
input
,
def
dynamic_lstmp
(
input
,
size
,
size
,
proj_size
,
proj_size
,
...
@@ -1098,7 +1269,7 @@ def dropout(x,
...
@@ -1098,7 +1269,7 @@ def dropout(x,
return
out
return
out
def
cross_entropy
(
input
,
label
,
soft_label
=
False
,
ignore_index
=
-
100
):
def
cross_entropy
(
input
,
label
,
soft_label
=
False
,
ignore_index
=
kIgnoreIndex
):
"""
"""
**Cross Entropy Layer**
**Cross Entropy Layer**
...
@@ -1145,7 +1316,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100):
...
@@ -1145,7 +1316,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100):
labels. Default: `False`.
labels. Default: `False`.
ignore_index (int): Specifies a target value that is ignored and does
ignore_index (int): Specifies a target value that is ignored and does
not contribute to the input gradient. Only valid
not contribute to the input gradient. Only valid
if soft_label is set to False. Default:
-100
if soft_label is set to False. Default:
kIgnoreIndex
Returns:
Returns:
A 2-D tensor with shape [N x 1], the cross entropy loss.
A 2-D tensor with shape [N x 1], the cross entropy loss.
...
@@ -2300,7 +2471,8 @@ def batch_norm(input,
...
@@ -2300,7 +2471,8 @@ def batch_norm(input,
moving_mean_name
=
None
,
moving_mean_name
=
None
,
moving_variance_name
=
None
,
moving_variance_name
=
None
,
do_model_average_for_mean_and_var
=
False
,
do_model_average_for_mean_and_var
=
False
,
fuse_with_relu
=
False
):
fuse_with_relu
=
False
,
use_global_stats
=
False
):
"""
"""
**Batch Normalization Layer**
**Batch Normalization Layer**
...
@@ -2327,6 +2499,19 @@ def batch_norm(input,
...
@@ -2327,6 +2499,19 @@ def batch_norm(input,
\\
sigma_{
\\
beta}^{2} +
\\
epsilon}}
\\
qquad &//\ normalize
\\\\
\\
sigma_{
\\
beta}^{2} +
\\
epsilon}}
\\
qquad &//\ normalize
\\\\
y_i &
\\
gets
\\
gamma
\\
hat{x_i} +
\\
beta
\\
qquad &//\ scale\ and\ shift
y_i &
\\
gets
\\
gamma
\\
hat{x_i} +
\\
beta
\\
qquad &//\ scale\ and\ shift
When use_global_stats = True, the :math:`
\\
mu_{
\\
beta}`
and :math:`
\\
sigma_{
\\
beta}^{2}` are not the statistics of one mini-batch.
They are global (or running) statistics. (It usually got from the
pre-trained model.)
The training and testing (or inference) have the same behavior:
.. math::
\\
hat{x_i} &
\\
gets
\\
frac{x_i -
\\
mu_
\\
beta} {
\\
sqrt{
\\
\\
sigma_{
\\
beta}^{2} +
\\
epsilon}}
\\\\
y_i &
\\
gets
\\
gamma
\\
hat{x_i} +
\\
beta
Args:
Args:
input(variable): The input variable which is a LoDTensor.
input(variable): The input variable which is a LoDTensor.
act(string, Default None): Activation type, linear|relu|prelu|...
act(string, Default None): Activation type, linear|relu|prelu|...
...
@@ -2349,6 +2534,11 @@ def batch_norm(input,
...
@@ -2349,6 +2534,11 @@ def batch_norm(input,
moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
fuse_with_relu (bool): if True, this OP performs relu after batch norm.
fuse_with_relu (bool): if True, this OP performs relu after batch norm.
use_global_stats(bool, Default False): Whether to use global mean and
variance. In inference or test mode, set use_global_stats to true
or is_test to true, and the behavior is equivalent.
In train mode, when setting use_global_stats True, the global mean
and variance are also used during train period.
Returns:
Returns:
Variable: A tensor variable which is the result after applying batch normalization on the input.
Variable: A tensor variable which is the result after applying batch normalization on the input.
...
@@ -2381,9 +2571,15 @@ def batch_norm(input,
...
@@ -2381,9 +2571,15 @@ def batch_norm(input,
shape
=
param_shape
,
shape
=
param_shape
,
dtype
=
dtype
,
dtype
=
dtype
,
default_initializer
=
Constant
(
1.0
))
default_initializer
=
Constant
(
1.0
))
# setting stop_gradient=True to reduce computation
if
use_global_stats
and
helper
.
param_attr
.
learning_rate
==
0.
:
scale
.
stop_gradient
=
True
bias
=
helper
.
create_parameter
(
bias
=
helper
.
create_parameter
(
attr
=
helper
.
bias_attr
,
shape
=
param_shape
,
dtype
=
dtype
,
is_bias
=
True
)
attr
=
helper
.
bias_attr
,
shape
=
param_shape
,
dtype
=
dtype
,
is_bias
=
True
)
# setting stop_gradient=True to reduce computation
if
use_global_stats
and
helper
.
bias_attr
.
learning_rate
==
0.
:
scale
.
stop_gradient
=
True
mean
=
helper
.
create_parameter
(
mean
=
helper
.
create_parameter
(
attr
=
ParamAttr
(
attr
=
ParamAttr
(
...
@@ -2439,7 +2635,8 @@ def batch_norm(input,
...
@@ -2439,7 +2635,8 @@ def batch_norm(input,
"epsilon"
:
epsilon
,
"epsilon"
:
epsilon
,
"is_test"
:
is_test
,
"is_test"
:
is_test
,
"use_mkldnn"
:
False
,
"use_mkldnn"
:
False
,
"fuse_with_relu"
:
fuse_with_relu
"fuse_with_relu"
:
fuse_with_relu
,
"use_global_stats"
:
use_global_stats
})
})
return
helper
.
append_activation
(
batch_norm_out
)
return
helper
.
append_activation
(
batch_norm_out
)
...
@@ -4218,8 +4415,15 @@ def ctc_greedy_decoder(input, blank, name=None):
...
@@ -4218,8 +4415,15 @@ def ctc_greedy_decoder(input, blank, name=None):
[0.5, 0.1, 0.3, 0.1]]
[0.5, 0.1, 0.3, 0.1]]
input.lod = [[4, 4]]
input.lod = [[4, 4]]
Computation:
Then:
step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
[[0], [2], [1], [0]]
step2: merge repeated tokens and remove blank which is 0. Then we get first output sequence:
[[2], [1]]
Finally:
output.data = [[2],
output.data = [[2],
[1],
[1],
...
@@ -4227,6 +4431,7 @@ def ctc_greedy_decoder(input, blank, name=None):
...
@@ -4227,6 +4431,7 @@ def ctc_greedy_decoder(input, blank, name=None):
output.lod = [[2, 1]]
output.lod = [[2, 1]]
Args:
Args:
input(Variable): (LoDTensor<float>), the probabilities of
input(Variable): (LoDTensor<float>), the probabilities of
...
@@ -4241,8 +4446,10 @@ def ctc_greedy_decoder(input, blank, name=None):
...
@@ -4241,8 +4446,10 @@ def ctc_greedy_decoder(input, blank, name=None):
name (str): The name of this layer. It is optional.
name (str): The name of this layer. It is optional.
Returns:
Returns:
Variable: CTC greedy decode result. If all the sequences in result were
Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1].
empty, the result LoDTensor will be [-1] with LoD [[]] and dims [1, 1].
'Lp' is the sum if all output sequences' length. If all the sequences
in result were empty, the result LoDTensor will be [-1] with
LoD [[]] and dims [1, 1].
Examples:
Examples:
.. code-block:: python
.. code-block:: python
...
@@ -4980,7 +5187,7 @@ def multiplex(inputs, index):
...
@@ -4980,7 +5187,7 @@ def multiplex(inputs, index):
def
softmax_with_cross_entropy
(
logits
,
def
softmax_with_cross_entropy
(
logits
,
label
,
label
,
soft_label
=
False
,
soft_label
=
False
,
ignore_index
=
-
100
,
ignore_index
=
kIgnoreIndex
,
numeric_stable_mode
=
False
,
numeric_stable_mode
=
False
,
return_softmax
=
False
):
return_softmax
=
False
):
"""
"""
...
@@ -5038,7 +5245,7 @@ def softmax_with_cross_entropy(logits,
...
@@ -5038,7 +5245,7 @@ def softmax_with_cross_entropy(logits,
labels as soft labels. By default, `soft_label` is set to False.
labels as soft labels. By default, `soft_label` is set to False.
ignore_index (int): Specifies a target value that is ignored and does
ignore_index (int): Specifies a target value that is ignored and does
not contribute to the input gradient. Only valid
not contribute to the input gradient. Only valid
if soft_label is set to False. Default:
-100
if soft_label is set to False. Default:
kIgnoreIndex
numeric_stable_mode (bool): A flag to indicate whether to use a more
numeric_stable_mode (bool): A flag to indicate whether to use a more
numerically stable algorithm. Only valid
numerically stable algorithm. Only valid
when soft_label is False and GPU is used.
when soft_label is False and GPU is used.
...
@@ -6892,7 +7099,7 @@ def pad2d(input,
...
@@ -6892,7 +7099,7 @@ def pad2d(input,
Args:
Args:
input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format.
input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format.
paddings (tuple|list): The padding size. If padding is a tuple, it must
paddings (tuple|list
|Variable
): The padding size. If padding is a tuple, it must
contain four integers, (padding_top, padding_bottom, padding_left, padding_right).
contain four integers, (padding_top, padding_bottom, padding_left, padding_right).
Default: padding = [0, 0, 0, 0].
Default: padding = [0, 0, 0, 0].
mode (str): Three modes: constant(default), reflect, edge. Default: constant
mode (str): Three modes: constant(default), reflect, edge. Default: constant
...
@@ -6917,16 +7124,17 @@ def pad2d(input,
...
@@ -6917,16 +7124,17 @@ def pad2d(input,
helper
=
LayerHelper
(
'pad2d'
,
**
locals
())
helper
=
LayerHelper
(
'pad2d'
,
**
locals
())
dtype
=
helper
.
input_dtype
(
input_param_name
=
'input'
)
dtype
=
helper
.
input_dtype
(
input_param_name
=
'input'
)
out
=
helper
.
create_variable_for_type_inference
(
dtype
)
out
=
helper
.
create_variable_for_type_inference
(
dtype
)
inputs
=
{
'X'
:
input
}
attrs
=
{
'mode'
:
mode
,
'pad_value'
:
pad_value
,
'data_format'
:
data_format
}
if
isinstance
(
paddings
,
Variable
):
inputs
[
'Paddings'
]
=
paddings
attrs
[
'paddings'
]
=
[]
else
:
attrs
[
'paddings'
]
=
paddings
helper
.
append_op
(
helper
.
append_op
(
type
=
'pad2d'
,
type
=
'pad2d'
,
inputs
=
inputs
,
outputs
=
{
"Out"
:
out
},
attrs
=
attrs
)
inputs
=
{
'X'
:
input
},
outputs
=
{
"Out"
:
out
},
attrs
=
{
'paddings'
:
paddings
,
'mode'
:
mode
,
'pad_value'
:
pad_value
,
'data_frmat'
:
data_format
})
return
out
return
out
...
@@ -7574,6 +7782,11 @@ def uniform_random_batch_size_like(input,
...
@@ -7574,6 +7782,11 @@ def uniform_random_batch_size_like(input,
Returns:
Returns:
out (Variable): ${out_comment}
out (Variable): ${out_comment}
Examples:
.. code-block:: python
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.uniform_random_batch_size_like(input, [-1, 11])
"""
"""
helper
=
LayerHelper
(
'uniform_random_batch_size_like'
,
**
locals
())
helper
=
LayerHelper
(
'uniform_random_batch_size_like'
,
**
locals
())
...
@@ -7611,6 +7824,10 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
...
@@ -7611,6 +7824,10 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
Returns:
Returns:
out (Variable): ${out_comment}
out (Variable): ${out_comment}
Examples:
.. code-block:: python
out = layers.gaussian_random(shape=[20, 30])
"""
"""
helper
=
LayerHelper
(
'gaussian_random'
,
**
locals
())
helper
=
LayerHelper
(
'gaussian_random'
,
**
locals
())
...
@@ -7646,6 +7863,16 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
...
@@ -7646,6 +7863,16 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
Returns:
Returns:
out (Variable): ${out_comment}
out (Variable): ${out_comment}
Examples:
.. code-block:: python
x = layers.data(
name="X",
shape=[13, 11],
dtype='float32',
append_batch_size=False)
out = layers.sampling_id(x)
"""
"""
helper
=
LayerHelper
(
'sampling_id'
,
**
locals
())
helper
=
LayerHelper
(
'sampling_id'
,
**
locals
())
...
@@ -7685,6 +7912,14 @@ def gaussian_random_batch_size_like(input,
...
@@ -7685,6 +7912,14 @@ def gaussian_random_batch_size_like(input,
Returns:
Returns:
out (Variable): ${out_comment}
out (Variable): ${out_comment}
Examples:
.. code-block:: python
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.gaussian_random_batch_size_like(
input, shape=[-1, 11], mean=1.0, std=2.0)
"""
"""
helper
=
LayerHelper
(
'gaussian_random_batch_size_like'
,
**
locals
())
helper
=
LayerHelper
(
'gaussian_random_batch_size_like'
,
**
locals
())
...
@@ -7717,6 +7952,12 @@ def sum(x):
...
@@ -7717,6 +7952,12 @@ def sum(x):
Returns:
Returns:
out (Variable): ${out_comment}
out (Variable): ${out_comment}
Examples:
.. code-block:: python
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.sum(input)
"""
"""
helper
=
LayerHelper
(
'sum'
,
**
locals
())
helper
=
LayerHelper
(
'sum'
,
**
locals
())
...
@@ -7745,6 +7986,17 @@ def slice(input, axes, starts, ends):
...
@@ -7745,6 +7986,17 @@ def slice(input, axes, starts, ends):
Returns:
Returns:
out (Variable): ${out_comment}
out (Variable): ${out_comment}
Examples:
.. code-block:: python
starts = [1, 0, 2]
ends = [3, 3, 4]
axes = [0, 1, 2]
input = layers.data(
name="input", shape=[3, 4, 5, 6], dtype='float32')
out = layers.slice(input, axes=axes, starts=starts, ends=ends)
"""
"""
helper
=
LayerHelper
(
'slice'
,
**
locals
())
helper
=
LayerHelper
(
'slice'
,
**
locals
())
...
@@ -7772,6 +8024,12 @@ def shape(input):
...
@@ -7772,6 +8024,12 @@ def shape(input):
Returns:
Returns:
out (Variable): ${out_comment}
out (Variable): ${out_comment}
Examples:
.. code-block:: python
input = layers.data(
name="input", shape=[3, 100, 100], dtype="float32")
out = layers.shape(input)
"""
"""
helper
=
LayerHelper
(
'shape'
,
**
locals
())
helper
=
LayerHelper
(
'shape'
,
**
locals
())
...
@@ -8159,13 +8417,17 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
...
@@ -8159,13 +8417,17 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
@
templatedoc
()
@
templatedoc
()
def
sigmoid_cross_entropy_with_logits
(
x
,
label
,
name
=
None
):
def
sigmoid_cross_entropy_with_logits
(
x
,
label
,
ignore_index
=
kIgnoreIndex
,
name
=
None
):
"""
"""
${comment}
${comment}
Args:
Args:
x(${x_type}): ${x_comment}
x(${x_type}): ${x_comment}
label(${label_type}): ${label_comment}
label(${label_type}): ${label_comment}
ignore_index(&{ignore_index}): ${ignore_index_comment}
name(basestring|None): Name of the output.
name(basestring|None): Name of the output.
Returns:
Returns:
...
@@ -8184,7 +8446,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None):
...
@@ -8184,7 +8446,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None):
type
=
"sigmoid_cross_entropy_with_logits"
,
type
=
"sigmoid_cross_entropy_with_logits"
,
inputs
=
{
"X"
:
x
,
inputs
=
{
"X"
:
x
,
"Label"
:
label
},
"Label"
:
label
},
attrs
=
{},
attrs
=
{
"ignore_index"
:
ignore_index
},
outputs
=
{
"Out"
:
out
})
outputs
=
{
"Out"
:
out
})
return
out
return
out
...
...
python/paddle/fluid/tests/demo/async_executor.py
0 → 100644
浏览文件 @
64ad051b
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
tarfile
import
paddle.fluid
as
fluid
import
paddle
from
paddle.fluid
import
core
URL
=
'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz'
MD5
=
'2a405a31508969b3ab823f42c0f522ca'
def
bow_net
(
data
,
label
,
dict_dim
=
89528
,
emb_dim
=
128
,
hid_dim
=
128
,
hid_dim2
=
96
,
class_dim
=
2
):
"""
BOW net
This model is from https://github.com/PaddlePaddle/models:
models/fluid/PaddleNLP/text_classification/nets.py
"""
# embedding
emb
=
fluid
.
layers
.
embedding
(
input
=
data
,
size
=
[
dict_dim
,
emb_dim
],
is_sparse
=
True
)
bow
=
fluid
.
layers
.
sequence_pool
(
input
=
emb
,
pool_type
=
'sum'
)
bowh
=
fluid
.
layers
.
tanh
(
bow
)
# fc layer after conv
fc_1
=
fluid
.
layers
.
fc
(
input
=
bowh
,
size
=
hid_dim
,
act
=
"tanh"
)
fc_2
=
fluid
.
layers
.
fc
(
input
=
fc_1
,
size
=
hid_dim2
,
act
=
"tanh"
)
# probability of each class
prediction
=
fluid
.
layers
.
fc
(
input
=
[
fc_2
],
size
=
class_dim
,
act
=
"softmax"
)
# cross entropy loss
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
label
)
# mean loss
avg_cost
=
fluid
.
layers
.
mean
(
x
=
cost
)
acc
=
fluid
.
layers
.
accuracy
(
input
=
prediction
,
label
=
label
)
return
avg_cost
,
acc
,
prediction
def
train
():
# Download data
with
tarfile
.
open
(
paddle
.
dataset
.
common
.
download
(
URL
,
"imdb"
,
MD5
))
as
tarf
:
tarf
.
extractall
(
path
=
'./'
)
tarf
.
close
()
# Initialize dataset description
dataset
=
fluid
.
DataFeedDesc
(
'train_data/data.prototxt'
)
dataset
.
set_batch_size
(
128
)
# See API doc for how to change other fields
print
dataset
.
desc
()
# Debug purpose: see what we get
# define network
# input text data
data
=
fluid
.
layers
.
data
(
name
=
"words"
,
shape
=
[
1
],
dtype
=
"int64"
,
lod_level
=
1
)
# label data
label
=
fluid
.
layers
.
data
(
name
=
"label"
,
shape
=
[
1
],
dtype
=
"int64"
)
avg_cost
,
acc
,
prediction
=
bow_net
(
data
,
label
)
sgd_optimizer
=
fluid
.
optimizer
.
Adagrad
(
learning_rate
=
0.002
)
opt_ops
,
weight_and_grad
=
sgd_optimizer
.
minimize
(
avg_cost
)
# Run startup program
startup_program
=
fluid
.
default_startup_program
()
place
=
fluid
.
CPUPlace
()
executor
=
fluid
.
Executor
(
place
)
executor
.
run
(
startup_program
)
async_executor
=
fluid
.
AsyncExecutor
(
place
)
main_program
=
fluid
.
default_main_program
()
epochs
=
10
filelist
=
[
"train_data/part-%d"
%
i
for
i
in
range
(
12
)]
for
i
in
range
(
epochs
):
thread_num
=
4
async_executor
.
run
(
main_program
,
# This can be changed during iteration
dataset
,
# This can be changed during iteration
filelist
,
# This can be changed during iteration
thread_num
,
# This can be changed during iteration
[
data
,
acc
],
# Multiple fetch targets can be specified
debug
=
False
)
fluid
.
io
.
save_inference_model
(
'imdb/epoch%d.model'
%
i
,
[
data
.
name
,
label
.
name
],
[
acc
],
executor
)
if
__name__
==
"__main__"
:
train
()
python/paddle/fluid/tests/unittests/dist_ctr.py
浏览文件 @
64ad051b
...
@@ -16,11 +16,13 @@ from __future__ import print_function
...
@@ -16,11 +16,13 @@ from __future__ import print_function
import
paddle
import
paddle
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
os
import
dist_ctr_reader
import
dist_ctr_reader
from
test_dist_base
import
TestDistRunnerBase
,
runtime_main
from
test_dist_base
import
TestDistRunnerBase
,
runtime_main
IS_SPARSE
=
True
IS_SPARSE
=
True
os
.
environ
[
'PADDLE_ENABLE_REMOTE_PREFETCH'
]
=
"1"
# Fix seed for test
# Fix seed for test
fluid
.
default_startup_program
().
random_seed
=
1
fluid
.
default_startup_program
().
random_seed
=
1
...
...
python/paddle/fluid/tests/unittests/op_test.py
浏览文件 @
64ad051b
...
@@ -216,6 +216,15 @@ class OpTest(unittest.TestCase):
...
@@ -216,6 +216,15 @@ class OpTest(unittest.TestCase):
self
.
dtype
)
self
.
dtype
)
outputs
=
append_input_output
(
block
,
op_proto
,
self
.
outputs
,
False
,
outputs
=
append_input_output
(
block
,
op_proto
,
self
.
outputs
,
False
,
self
.
dtype
)
self
.
dtype
)
if
hasattr
(
self
,
"cache_name_list"
):
for
name
in
self
.
cache_name_list
:
inputs
[
name
]
=
block
.
create_var
(
name
=
name
,
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
,
stop_gradient
=
True
)
op
=
block
.
append_op
(
op
=
block
.
append_op
(
type
=
self
.
op_type
,
type
=
self
.
op_type
,
inputs
=
inputs
,
inputs
=
inputs
,
...
@@ -428,8 +437,17 @@ class OpTest(unittest.TestCase):
...
@@ -428,8 +437,17 @@ class OpTest(unittest.TestCase):
op_inputs
=
self
.
inputs
if
hasattr
(
self
,
"inputs"
)
else
dict
()
op_inputs
=
self
.
inputs
if
hasattr
(
self
,
"inputs"
)
else
dict
()
op_outputs
=
self
.
outputs
if
hasattr
(
self
,
"outputs"
)
else
dict
()
op_outputs
=
self
.
outputs
if
hasattr
(
self
,
"outputs"
)
else
dict
()
op_attrs
=
self
.
attrs
if
hasattr
(
self
,
"attrs"
)
else
dict
()
op_attrs
=
self
.
attrs
if
hasattr
(
self
,
"attrs"
)
else
dict
()
self
.
op
=
create_op
(
self
.
scope
,
self
.
op_type
,
op_inputs
,
op_outputs
,
op_attrs
)
cache_list
=
None
if
hasattr
(
self
,
"cache_name_list"
):
cache_list
=
self
.
cache_name_list
self
.
op
=
create_op
(
self
.
scope
,
self
.
op_type
,
op_inputs
,
op_outputs
,
op_attrs
,
cache_list
=
cache_list
)
if
no_grad_set
is
None
:
if
no_grad_set
is
None
:
no_grad_set
=
set
()
no_grad_set
=
set
()
...
...
python/paddle/fluid/tests/unittests/test_async_executor.py
0 → 100644
浏览文件 @
64ad051b
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle.fluid
as
fluid
import
paddle
import
unittest
import
tarfile
import
os
import
shutil
proto_str
=
(
'name: "MultiSlotDataFeed"
\n
'
'batch_size: 2
\n
'
'multi_slot_desc {
\n
'
' slots {
\n
'
' name: "words"
\n
'
' type: "uint64"
\n
'
' is_dense: false
\n
'
' is_used: true
\n
'
' }
\n
'
' slots {
\n
'
' name: "label"
\n
'
' type: "uint64"
\n
'
' is_dense: false
\n
'
' is_used: true
\n
'
' }
\n
'
'}'
)
URL
=
'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz'
MD5
=
'2a405a31508969b3ab823f42c0f522ca'
def
bow_net
(
data
,
label
,
dict_dim
=
89528
,
emb_dim
=
128
,
hid_dim
=
128
,
hid_dim2
=
96
,
class_dim
=
2
):
"""
BOW net
This model is from https://github.com/PaddlePaddle/models:
models/fluid/PaddleNLP/text_classification/nets.py
"""
# embedding
emb
=
fluid
.
layers
.
embedding
(
input
=
data
,
size
=
[
dict_dim
,
emb_dim
],
is_sparse
=
True
)
bow
=
fluid
.
layers
.
sequence_pool
(
input
=
emb
,
pool_type
=
'sum'
)
bowh
=
fluid
.
layers
.
tanh
(
bow
)
# fc layer after conv
fc_1
=
fluid
.
layers
.
fc
(
input
=
bowh
,
size
=
hid_dim
,
act
=
"tanh"
)
fc_2
=
fluid
.
layers
.
fc
(
input
=
fc_1
,
size
=
hid_dim2
,
act
=
"tanh"
)
# probability of each class
prediction
=
fluid
.
layers
.
fc
(
input
=
[
fc_2
],
size
=
class_dim
,
act
=
"softmax"
)
# cross entropy loss
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
label
)
# mean loss
avg_cost
=
fluid
.
layers
.
mean
(
x
=
cost
)
acc
=
fluid
.
layers
.
accuracy
(
input
=
prediction
,
label
=
label
)
return
avg_cost
,
acc
,
prediction
class
TestAsyncExecutor
(
unittest
.
TestCase
):
def
setUp
(
self
):
with
open
(
'./data.prototxt'
,
'w+'
)
as
f
:
f
.
write
(
proto_str
)
f
.
close
()
with
tarfile
.
open
(
paddle
.
dataset
.
common
.
download
(
URL
,
"imdb"
,
MD5
))
as
tarf
:
tarf
.
extractall
(
path
=
'./'
)
tarf
.
close
()
def
test_data_feed_desc
(
self
):
data_feed
=
fluid
.
DataFeedDesc
(
'./data.prototxt'
)
# assertEqueal(data_feed.proto_desc.batch, 2)
# assertEqual(len(data_feed.proto_desc.multi_slot_desc), 2)
self
.
assertEqual
(
" "
.
join
(
data_feed
.
desc
().
split
()),
" "
.
join
(
proto_str
.
split
()))
def
test_run
(
self
):
# Initialize dataset description
data_feed
=
fluid
.
DataFeedDesc
(
'train_data/data.prototxt'
)
data_feed
.
set_batch_size
(
128
)
# See API doc for how to change other fields
# define network
# input text data
data
=
fluid
.
layers
.
data
(
name
=
"words"
,
shape
=
[
1
],
dtype
=
"int64"
,
lod_level
=
1
)
# label data
label
=
fluid
.
layers
.
data
(
name
=
"label"
,
shape
=
[
1
],
dtype
=
"int64"
)
avg_cost
,
acc
,
prediction
=
bow_net
(
data
,
label
)
sgd_optimizer
=
fluid
.
optimizer
.
Adagrad
(
learning_rate
=
0.002
)
opt_ops
,
weight_and_grad
=
sgd_optimizer
.
minimize
(
avg_cost
)
# Run startup program
startup_program
=
fluid
.
default_startup_program
()
place
=
fluid
.
CPUPlace
()
executor
=
fluid
.
Executor
(
place
)
executor
.
run
(
startup_program
)
main_program
=
fluid
.
default_main_program
()
async_executor
=
fluid
.
AsyncExecutor
(
place
)
self
.
assertRaises
(
TypeError
,
async_executor
.
run
)
self
.
assertRaises
(
TypeError
,
async_executor
.
run
,
main_program
)
self
.
assertRaises
(
TypeError
,
async_executor
.
run
,
main_program
,
data_feed
)
filelist
=
[
'train_data/part-%d'
%
i
for
i
in
range
(
10
)]
self
.
assertRaises
(
TypeError
,
async_executor
.
run
,
main_program
,
data_feed
,
filelist
)
thread_num
=
4
self
.
assertRaises
(
TypeError
,
async_executor
.
run
,
main_program
,
data_feed
,
filelist
,
thread_num
)
async_executor
.
run
(
main_program
,
data_feed
,
filelist
,
thread_num
,
[
acc
])
fluid
.
io
.
save_inference_model
(
"imdb.model"
,
[
data
.
name
,
label
.
name
],
[
acc
],
executor
)
statinfo
=
os
.
stat
(
'imdb.model/__model__'
)
self
.
assertGreater
(
statinfo
.
st_size
,
0
)
os
.
remove
(
'./data.prototxt'
)
shutil
.
rmtree
(
'./train_data'
)
shutil
.
rmtree
(
'./imdb.model'
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_batch_norm_op.py
浏览文件 @
64ad051b
...
@@ -54,6 +54,19 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
...
@@ -54,6 +54,19 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
return
y
return
y
def
_cal_mean_variance
(
x
,
epsilon
,
data_format
):
assert
data_format
in
[
'NCHW'
,
'NHWC'
]
x_square
=
x
*
x
axis
=
(
0
,
2
,
3
)
if
data_format
==
'NCHW'
else
(
0
,
1
,
2
)
C
=
x
.
shape
[
1
]
if
data_format
==
'NCHW'
else
x
.
shape
[
-
1
]
x_square_sum
=
np
.
sum
(
x_square
,
axis
)
x_sum
=
np
.
sum
(
x
,
axis
=
axis
)
element_count
=
np
.
size
(
x
)
/
C
mean
=
x_sum
/
element_count
var
=
x_square_sum
/
element_count
-
mean
*
mean
return
mean
,
var
def
_reference_training
(
x
,
scale
,
offset
,
epsilon
,
data_format
):
def
_reference_training
(
x
,
scale
,
offset
,
epsilon
,
data_format
):
x_shape
=
x
.
shape
x_shape
=
x
.
shape
...
@@ -294,7 +307,18 @@ class TestBatchNormOpTraining(unittest.TestCase):
...
@@ -294,7 +307,18 @@ class TestBatchNormOpTraining(unittest.TestCase):
self
.
use_mkldnn
=
False
self
.
use_mkldnn
=
False
self
.
fuse_with_relu
=
False
self
.
fuse_with_relu
=
False
self
.
data_formats
=
[
"NCHW"
,
"NHWC"
]
self
.
data_formats
=
[
"NCHW"
,
"NHWC"
]
self
.
momentum
=
0.9
self
.
epsilon
=
0.00001
self
.
init_kernel_type
()
self
.
init_kernel_type
()
self
.
init_test_case
()
def
init_test_case
(
self
):
self
.
use_global_stats
=
False
self
.
no_grad_set
=
set
()
self
.
fetch_list
=
[
'y'
,
'mean'
,
'variance'
,
'saved_mean'
,
'saved_variance'
,
'x@GRAD'
,
'scale@GRAD'
,
'bias@GRAD'
]
def
__assert_close
(
self
,
tensor
,
np_array
,
msg
,
atol
=
1e-4
):
def
__assert_close
(
self
,
tensor
,
np_array
,
msg
,
atol
=
1e-4
):
np
.
allclose
(
np
.
array
(
tensor
),
np_array
,
atol
=
atol
)
np
.
allclose
(
np
.
array
(
tensor
),
np_array
,
atol
=
atol
)
...
@@ -313,11 +337,22 @@ class TestBatchNormOpTraining(unittest.TestCase):
...
@@ -313,11 +337,22 @@ class TestBatchNormOpTraining(unittest.TestCase):
return
y
,
mean_out
,
variance_out
,
saved_mean
,
saved_variance
,
x_grad
,
scale_grad
,
bias_grad
return
y
,
mean_out
,
variance_out
,
saved_mean
,
saved_variance
,
x_grad
,
scale_grad
,
bias_grad
def
set_mean_variance
(
self
,
scale_shape
,
x
,
data_layout
):
mean
=
np
.
zeros
(
scale_shape
).
astype
(
np
.
float32
)
variance
=
np
.
ones
(
scale_shape
).
astype
(
np
.
float32
)
# computing global mean/variance for one step
if
self
.
use_global_stats
:
mom
=
self
.
momentum
x_mean
,
x_var
=
_cal_mean_variance
(
x
,
self
.
epsilon
,
data_layout
)
mean
=
x_mean
*
(
1.
-
mom
)
+
mom
*
mean
variance
=
x_var
*
(
1.
-
mom
)
+
mom
*
variance
return
mean
,
variance
def
test_forward_backward
(
self
):
def
test_forward_backward
(
self
):
def
test_with_place
(
place
,
data_layout
,
shape
):
def
test_with_place
(
place
,
data_layout
,
shape
):
# attr
# attr
epsilon
=
0.00001
epsilon
=
self
.
epsilon
momentum
=
0.9
momentum
=
self
.
momentum
if
data_layout
==
"NCHW"
:
if
data_layout
==
"NCHW"
:
n
,
c
,
h
,
w
=
shape
[
0
],
shape
[
1
],
shape
[
2
],
shape
[
3
]
n
,
c
,
h
,
w
=
shape
[
0
],
shape
[
1
],
shape
[
2
],
shape
[
3
]
else
:
else
:
...
@@ -328,9 +363,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
...
@@ -328,9 +363,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
x
=
np
.
random
.
random_sample
(
shape
).
astype
(
np
.
float32
)
x
=
np
.
random
.
random_sample
(
shape
).
astype
(
np
.
float32
)
scale
=
np
.
random
.
random_sample
(
scale_shape
).
astype
(
np
.
float32
)
scale
=
np
.
random
.
random_sample
(
scale_shape
).
astype
(
np
.
float32
)
bias
=
np
.
random
.
random_sample
(
scale_shape
).
astype
(
np
.
float32
)
bias
=
np
.
random
.
random_sample
(
scale_shape
).
astype
(
np
.
float32
)
mean
=
np
.
zeros
(
scale_shape
).
astype
(
np
.
float32
)
mean
,
variance
=
self
.
set_mean_variance
(
scale_shape
,
x
,
data_layout
)
variance
=
np
.
ones
(
scale_shape
).
astype
(
np
.
float32
)
y_grad
=
np
.
random
.
random_sample
(
shape
).
astype
(
np
.
float32
)
y_grad
=
np
.
random
.
random_sample
(
shape
).
astype
(
np
.
float32
)
y
,
mean_out
,
variance_out
,
saved_mean
,
saved_variance
,
x_grad
,
scale_grad
,
bias_grad
=
self
.
ref_forward_backward
(
y
,
mean_out
,
variance_out
,
saved_mean
,
saved_variance
,
x_grad
,
scale_grad
,
bias_grad
=
self
.
ref_forward_backward
(
...
@@ -339,6 +372,9 @@ class TestBatchNormOpTraining(unittest.TestCase):
...
@@ -339,6 +372,9 @@ class TestBatchNormOpTraining(unittest.TestCase):
var_dict
=
locals
()
var_dict
=
locals
()
var_dict
[
'y@GRAD'
]
=
y_grad
var_dict
[
'y@GRAD'
]
=
y_grad
var_dict
[
'x@GRAD'
]
=
x_grad
var_dict
[
'scale@GRAD'
]
=
scale_grad
var_dict
[
'bias@GRAD'
]
=
bias_grad
var_names
=
[
var_names
=
[
'x'
,
'scale'
,
'bias'
,
'mean'
,
'variance'
,
'y'
,
'saved_mean'
,
'x'
,
'scale'
,
'bias'
,
'mean'
,
'variance'
,
'y'
,
'saved_mean'
,
...
@@ -365,9 +401,8 @@ class TestBatchNormOpTraining(unittest.TestCase):
...
@@ -365,9 +401,8 @@ class TestBatchNormOpTraining(unittest.TestCase):
},
},
outputs
=
{
outputs
=
{
"Y"
:
block
.
var
(
'y'
),
"Y"
:
block
.
var
(
'y'
),
"MeanOut"
:
block
.
var
(
'mean'
),
# share the same memory
"MeanOut"
:
block
.
var
(
'mean'
),
# share memory
"VarianceOut"
:
"VarianceOut"
:
block
.
var
(
'variance'
),
# share memory
block
.
var
(
'variance'
),
# share the same memory
"SavedMean"
:
block
.
var
(
'saved_mean'
),
"SavedMean"
:
block
.
var
(
'saved_mean'
),
"SavedVariance"
:
block
.
var
(
'saved_variance'
)
"SavedVariance"
:
block
.
var
(
'saved_variance'
)
},
},
...
@@ -377,13 +412,14 @@ class TestBatchNormOpTraining(unittest.TestCase):
...
@@ -377,13 +412,14 @@ class TestBatchNormOpTraining(unittest.TestCase):
"is_test"
:
False
,
"is_test"
:
False
,
"data_layout"
:
data_layout
,
"data_layout"
:
data_layout
,
"use_mkldnn"
:
self
.
use_mkldnn
,
"use_mkldnn"
:
self
.
use_mkldnn
,
"fuse_with_relu"
:
self
.
fuse_with_relu
"fuse_with_relu"
:
self
.
fuse_with_relu
,
"use_global_stats"
:
self
.
use_global_stats
})
})
block
.
create_var
(
name
=
'y@GRAD'
,
dtype
=
'float32'
,
shape
=
y
.
shape
)
block
.
create_var
(
name
=
'y@GRAD'
,
dtype
=
'float32'
,
shape
=
y
.
shape
)
# generate backward op_desc
# generate backward op_desc
grad_op_desc_list
,
op_grad_to_var
=
core
.
get_grad_op_desc
(
grad_op_desc_list
,
op_grad_to_var
=
core
.
get_grad_op_desc
(
bn_op
.
desc
,
se
t
()
,
[])
bn_op
.
desc
,
se
lf
.
no_grad_set
,
[])
grad_op_desc
=
grad_op_desc_list
[
0
]
grad_op_desc
=
grad_op_desc_list
[
0
]
new_op_desc
=
block
.
desc
.
append_op
()
new_op_desc
=
block
.
desc
.
append_op
()
new_op_desc
.
copy_from
(
grad_op_desc
)
new_op_desc
.
copy_from
(
grad_op_desc
)
...
@@ -403,20 +439,10 @@ class TestBatchNormOpTraining(unittest.TestCase):
...
@@ -403,20 +439,10 @@ class TestBatchNormOpTraining(unittest.TestCase):
for
name
in
for
name
in
[
'x'
,
'scale'
,
'bias'
,
'mean'
,
'variance'
,
'y@GRAD'
]
[
'x'
,
'scale'
,
'bias'
,
'mean'
,
'variance'
,
'y@GRAD'
]
},
},
fetch_list
=
[
fetch_list
=
self
.
fetch_list
)
'y'
,
'mean'
,
'variance'
,
'saved_mean'
,
'saved_variance'
,
'x@GRAD'
,
'scale@GRAD'
,
'bias@GRAD'
])
self
.
__assert_close
(
y
,
out
[
0
],
"y"
)
self
.
__assert_close
(
mean_out
,
out
[
1
],
"mean"
)
self
.
__assert_close
(
variance_out
,
out
[
2
],
"variance"
,
1e-3
)
self
.
__assert_close
(
saved_mean
,
out
[
3
],
"saved_mean"
)
self
.
__assert_close
(
saved_variance
,
out
[
4
],
"saved_variance"
,
1e-3
)
self
.
__assert_close
(
x_grad
,
out
[
5
],
"x_grad"
)
self
.
__assert_close
(
scale_grad
,
out
[
6
],
"scale_grad"
)
self
.
__assert_close
(
bias_grad
,
out
[
7
],
"bias_grad"
)
for
id
,
name
in
enumerate
(
self
.
fetch_list
):
self
.
__assert_close
(
var_dict
[
name
],
out
[
id
],
name
)
print
(
"op test forward passed: "
,
str
(
place
),
data_layout
)
print
(
"op test forward passed: "
,
str
(
place
),
data_layout
)
places
=
[
core
.
CPUPlace
()]
places
=
[
core
.
CPUPlace
()]
...
@@ -432,5 +458,66 @@ class TestBatchNormOpTraining(unittest.TestCase):
...
@@ -432,5 +458,66 @@ class TestBatchNormOpTraining(unittest.TestCase):
pass
pass
class
TestBatchNormOpFreezeStatsTraining
(
TestBatchNormOpTraining
):
def
init_test_case
(
self
):
self
.
use_global_stats
=
True
self
.
no_grad_set
=
set
()
self
.
fetch_list
=
[
'y'
,
'mean'
,
'variance'
,
'x@GRAD'
,
'scale@GRAD'
,
'bias@GRAD'
]
def
reference_grad
(
self
,
x
,
y_grad
,
scale
,
mean
,
var
,
epsilon
,
data_format
):
if
data_format
==
"NCHW"
:
x
=
np
.
transpose
(
x
,
(
0
,
2
,
3
,
1
))
y_grad
=
np
.
transpose
(
y_grad
,
(
0
,
2
,
3
,
1
))
x_grad
=
scale
*
y_grad
/
np
.
sqrt
(
var
+
epsilon
)
grad_scale
=
np
.
sum
(
y_grad
*
(
x
-
mean
)
/
np
.
sqrt
(
var
+
epsilon
),
axis
=
(
0
,
1
,
2
))
grad_offset
=
np
.
sum
(
y_grad
,
axis
=
(
0
,
1
,
2
))
# transfer back to N, C, H, W
if
data_format
==
"NCHW"
:
x_grad
=
np
.
transpose
(
x_grad
,
(
0
,
3
,
1
,
2
))
x
=
np
.
transpose
(
x
,
(
0
,
3
,
1
,
2
))
y_grad
=
np
.
transpose
(
y_grad
,
(
0
,
3
,
1
,
2
))
return
x_grad
,
grad_scale
,
grad_offset
def
ref_forward_backward
(
self
,
x
,
y_grad
,
scale
,
bias
,
mean
,
variance
,
epsilon
,
momentum
,
shape
,
data_layout
):
if
data_layout
!=
"NCHW"
and
data_layout
!=
"NHWC"
:
raise
ValueError
(
"Unknown data order."
)
if
data_layout
==
"NCHW"
:
x
=
np
.
transpose
(
x
,
(
0
,
2
,
3
,
1
))
# run normalizaton
normalized
=
(
x
-
mean
)
/
np
.
sqrt
(
variance
+
epsilon
)
y
=
normalized
*
scale
+
bias
# transfer back to N, C, H, W
if
data_layout
==
"NCHW"
:
x
=
np
.
transpose
(
x
,
(
0
,
3
,
1
,
2
))
y
=
np
.
transpose
(
y
,
(
0
,
3
,
1
,
2
))
mean_out
=
mean
variance_out
=
variance
saved_variance
=
1.
/
np
.
sqrt
(
variance
+
epsilon
)
# run backward
x_grad
,
scale_grad
,
bias_grad
=
self
.
reference_grad
(
x
,
y_grad
,
scale
,
mean
,
variance
,
epsilon
,
data_layout
)
return
y
,
mean_out
,
variance_out
,
mean
,
saved_variance
,
x_grad
,
scale_grad
,
bias_grad
class
TestBatchNormOpFreezeStatsAndScaleBiasTraining
(
TestBatchNormOpFreezeStatsTraining
):
def
init_test_case
(
self
):
self
.
use_global_stats
=
True
self
.
no_grad_set
=
set
([
'scale@GRAD'
,
'bias@GRAD'
])
self
.
fetch_list
=
[
'y'
,
'mean'
,
'variance'
,
'x@GRAD'
]
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
0 → 100644
浏览文件 @
64ad051b
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
from
test_conv2d_transpose_op
import
TestConv2dTransposeOp
,
TestWithPad
,
TestWithStride
class
TestMKLDNN
(
TestConv2dTransposeOp
):
def
init_op_type
(
self
):
self
.
is_test
=
True
self
.
use_mkldnn
=
True
self
.
data_format
=
"NCHW"
self
.
op_type
=
"conv2d_transpose"
self
.
_cpu_only
=
True
def
test_check_grad
(
self
):
return
def
test_check_grad_no_input
(
self
):
return
def
test_check_grad_no_filter
(
self
):
return
class
TestMKLDNNWithPad
(
TestWithPad
):
def
init_op_type
(
self
):
self
.
is_test
=
True
self
.
use_mkldnn
=
True
self
.
data_format
=
"NCHW"
self
.
op_type
=
"conv2d_transpose"
self
.
_cpu_only
=
True
def
test_check_grad
(
self
):
return
def
test_check_grad_no_input
(
self
):
return
def
test_check_grad_no_filter
(
self
):
return
class
TestMKLDNNWithStride
(
TestWithStride
):
def
init_op_type
(
self
):
self
.
is_test
=
True
self
.
use_mkldnn
=
True
self
.
data_format
=
"NCHW"
self
.
op_type
=
"conv2d_transpose"
self
.
_cpu_only
=
True
def
test_check_grad
(
self
):
return
def
test_check_grad_no_input
(
self
):
return
def
test_check_grad_no_filter
(
self
):
return
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
浏览文件 @
64ad051b
...
@@ -68,8 +68,11 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
...
@@ -68,8 +68,11 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
class
TestConv2dTransposeOp
(
OpTest
):
class
TestConv2dTransposeOp
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
# init as conv transpose
# init as conv transpose
self
.
is_test
=
False
self
.
use_cudnn
=
False
self
.
use_cudnn
=
False
self
.
use_mkldnn
=
False
self
.
output_size
=
None
self
.
output_size
=
None
self
.
data_format
=
"AnyLayout"
self
.
init_op_type
()
self
.
init_op_type
()
self
.
init_test_case
()
self
.
init_test_case
()
...
@@ -83,7 +86,9 @@ class TestConv2dTransposeOp(OpTest):
...
@@ -83,7 +86,9 @@ class TestConv2dTransposeOp(OpTest):
'groups'
:
self
.
groups
,
'groups'
:
self
.
groups
,
'dilations'
:
self
.
dilations
,
'dilations'
:
self
.
dilations
,
'use_cudnn'
:
self
.
use_cudnn
,
'use_cudnn'
:
self
.
use_cudnn
,
'data_format'
:
'AnyLayout'
# TODO(dzhwinter) : should be fix latter
'is_test'
:
self
.
is_test
,
'use_mkldnn'
:
self
.
use_mkldnn
,
'data_format'
:
self
.
data_format
}
}
if
self
.
output_size
is
not
None
:
if
self
.
output_size
is
not
None
:
self
.
attrs
[
'output_size'
]
=
self
.
output_size
self
.
attrs
[
'output_size'
]
=
self
.
output_size
...
...
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
浏览文件 @
64ad051b
...
@@ -782,5 +782,46 @@ class TestNCCL2Transpile(TranspilerTest):
...
@@ -782,5 +782,46 @@ class TestNCCL2Transpile(TranspilerTest):
pass
pass
# test for remote prefetch
class
TestRemoteLookupTable
(
TestDistLookupTableBase
):
def
net_conf
(
self
):
import
os
os
.
environ
[
'PADDLE_ENABLE_REMOTE_PREFETCH'
]
=
"1"
self
.
network_with_table
(
is_sparse
=
True
,
is_distributed
=
False
)
def
transpiler_test_impl
(
self
):
pserver1
,
startup1
=
self
.
get_pserver
(
self
.
pserver1_ep
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
4
)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
1
].
ops
],
[
"sum"
,
"scale"
,
"adam"
,
"scale"
,
"scale"
])
# 2 optimize for table adam
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"sum"
,
"scale"
,
"adam"
,
"scale"
,
"scale"
])
# 3 optimize for table 2 adam
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
3
].
ops
],
[
"sum"
,
"scale"
,
"adam"
,
"scale"
,
"scale"
])
trainer
,
_
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'lookup_table'
,
'sequence_pool'
,
'lookup_table'
,
'sequence_pool'
,
'lookup_table'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'split_selected_rows'
,
'send'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_selected_rows'
,
'send'
,
'send_barrier'
,
'recv'
,
'recv'
,
'fetch_barrier'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
64ad051b
...
@@ -170,9 +170,10 @@ class TestBook(unittest.TestCase):
...
@@ -170,9 +170,10 @@ class TestBook(unittest.TestCase):
with
program_guard
(
program
):
with
program_guard
(
program
):
dat
=
layers
.
data
(
name
=
'data'
,
shape
=
[
10
],
dtype
=
'float32'
)
dat
=
layers
.
data
(
name
=
'data'
,
shape
=
[
10
],
dtype
=
'float32'
)
lbl
=
layers
.
data
(
name
=
'label'
,
shape
=
[
10
],
dtype
=
'float32'
)
lbl
=
layers
.
data
(
name
=
'label'
,
shape
=
[
10
],
dtype
=
'float32'
)
ignore_index
=
-
1
self
.
assertIsNotNone
(
self
.
assertIsNotNone
(
layers
.
sigmoid_cross_entropy_with_logits
(
layers
.
sigmoid_cross_entropy_with_logits
(
x
=
dat
,
label
=
lbl
))
x
=
dat
,
label
=
lbl
,
ignore_index
=
ignore_index
))
print
(
str
(
program
))
print
(
str
(
program
))
def
test_hsigmoid
(
self
):
def
test_hsigmoid
(
self
):
...
@@ -636,13 +637,21 @@ class TestBook(unittest.TestCase):
...
@@ -636,13 +637,21 @@ class TestBook(unittest.TestCase):
with
program_guard
(
program
):
with
program_guard
(
program
):
input
=
layers
.
data
(
input
=
layers
.
data
(
name
=
"input"
,
shape
=
[
3
,
100
,
100
],
dtype
=
"float32"
)
name
=
"input"
,
shape
=
[
3
,
100
,
100
],
dtype
=
"float32"
)
paddings
=
layers
.
fill_constant
(
shape
=
[
4
],
dtype
=
'int32'
,
value
=
1
)
out
=
layers
.
pad2d
(
out
=
layers
.
pad2d
(
input
,
input
,
paddings
=
[
1
,
2
,
3
,
4
],
paddings
=
[
1
,
2
,
3
,
4
],
mode
=
'reflect'
,
mode
=
'reflect'
,
data_format
=
'NCHW'
,
data_format
=
'NCHW'
,
name
=
"shape"
)
name
=
"shape"
)
out_1
=
layers
.
pad2d
(
input
,
paddings
=
paddings
,
mode
=
'reflect'
,
data_format
=
'NCHW'
,
name
=
"shape"
)
self
.
assertIsNotNone
(
out
)
self
.
assertIsNotNone
(
out
)
self
.
assertIsNotNone
(
out_1
)
print
(
str
(
program
))
print
(
str
(
program
))
def
test_prelu
(
self
):
def
test_prelu
(
self
):
...
@@ -955,6 +964,15 @@ class TestBook(unittest.TestCase):
...
@@ -955,6 +964,15 @@ class TestBook(unittest.TestCase):
print
(
str
(
program
))
print
(
str
(
program
))
def
test_batch_norm
(
self
):
program
=
Program
()
with
program_guard
(
program
):
data
=
layers
.
data
(
name
=
'data'
,
shape
=
[
32
,
128
,
128
],
dtype
=
"float32"
)
out
=
layers
.
batch_norm
(
data
)
print
(
str
(
program
))
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
0 → 100644
浏览文件 @
64ad051b
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
os
import
signal
import
time
import
unittest
from
multiprocessing
import
Process
import
numpy
as
np
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.op
import
Operator
from
paddle.fluid.framework
import
Program
,
program_guard
def
run_pserver
(
pserver_id
,
use_cuda
,
sync_mode
):
scope
=
fluid
.
core
.
Scope
()
program
=
Program
()
with
fluid
.
scope_guard
(
scope
):
with
program_guard
(
program
,
startup_program
=
Program
()):
# create table parameter in scope
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
# create and initialize Param Variable
param
=
scope
.
var
(
'table'
).
get_tensor
()
param_array
=
np
.
ones
((
10
,
8
)).
astype
(
"float32"
)
for
i
in
range
(
len
(
param_array
)):
param_array
[
i
]
*=
param_array
[
i
]
*
i
+
pserver_id
*
10
param
.
set
(
param_array
,
place
)
optimize_block
=
program
.
_create_block
(
program
.
global_block
().
idx
)
program
.
global_block
().
append_op
(
type
=
"listen_and_serv"
,
inputs
=
{
'X'
:
[]},
outputs
=
{},
attrs
=
{
"optimize_blocks"
:
[
optimize_block
],
"endpoint"
:
'127.0.0.1:0'
,
"Fanin"
:
1
,
"sync_mode"
:
True
,
"grad_to_block_id"
:
[]
})
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
program
)
class
TestListenAndServOp
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
ps_timeout
=
5
def
_start_pserver
(
self
,
pserver_id
,
use_cuda
,
sync_mode
,
pserver_func
):
p
=
Process
(
target
=
pserver_func
,
args
=
(
pserver_id
,
use_cuda
,
sync_mode
))
p
.
daemon
=
True
p
.
start
()
return
p
def
_wait_ps_ready
(
self
,
pid
):
start_left_time
=
self
.
ps_timeout
sleep_time
=
0.5
while
True
:
assert
start_left_time
>=
0
,
"wait ps ready failed"
time
.
sleep
(
sleep_time
)
try
:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os
.
stat
(
"/tmp/paddle.%d.port"
%
pid
)
return
except
os
.
error
:
start_left_time
-=
sleep_time
def
_get_pserver_port
(
self
,
pid
):
with
open
(
"/tmp/paddle.%d.port"
%
pid
,
'r'
)
as
f
:
port
=
int
(
f
.
read
().
strip
())
return
port
def
_run_lookup_table_op_one_pserver
(
self
,
place
,
port
):
scope
=
fluid
.
core
.
Scope
()
program
=
Program
()
with
fluid
.
scope_guard
(
scope
):
with
program_guard
(
program
,
startup_program
=
Program
()):
# create and initialize Param Variable
param
=
scope
.
var
(
'W'
).
get_tensor
()
param_array
=
np
.
full
((
10
,
8
),
1.0
).
astype
(
"float32"
)
param
.
set
(
param_array
,
place
)
ids
=
scope
.
var
(
'Ids'
).
get_tensor
()
ids_array
=
np
.
array
([[
1
],
[
2
],
[
5
]]).
astype
(
"int64"
)
ids
.
set
(
ids_array
,
place
)
ids_lod
=
[[
0
,
1
,
2
,
3
]]
ids
.
set_lod
(
ids_lod
)
out
=
scope
.
var
(
'Out'
).
get_tensor
()
emaps
=
[
'127.0.0.1:'
+
str
(
port
)]
table_names
=
[
'table'
]
height_sections
=
[
10
]
# create and run sgd operator
lookup_table_op
=
Operator
(
"lookup_table"
,
W
=
'W'
,
Ids
=
'Ids'
,
Out
=
'Out'
,
remote_prefetch
=
True
,
epmap
=
emaps
,
table_names
=
table_names
,
height_sections
=
height_sections
)
lookup_table_op
.
run
(
scope
,
place
)
# get and compare result
result_array
=
np
.
array
(
out
)
self
.
assertEqual
(
out
.
lod
(),
ids_lod
)
self
.
assertEqual
(
list
(
result_array
.
shape
),
[
len
(
ids_array
),
8
])
for
i
in
range
(
len
(
ids_array
)):
id
=
ids_array
[
i
][
0
]
self
.
assertTrue
((
result_array
[
i
]
==
id
).
all
())
def
_run_lookup_table_op_two_pserver
(
self
,
place
,
port0
,
port1
):
scope
=
fluid
.
core
.
Scope
()
program
=
Program
()
with
fluid
.
scope_guard
(
scope
):
with
program_guard
(
program
,
startup_program
=
Program
()):
# create and initialize Param Variable
param
=
scope
.
var
(
'W'
).
get_tensor
()
param_array
=
np
.
full
((
10
,
8
),
1.0
).
astype
(
"float32"
)
param
.
set
(
param_array
,
place
)
ids
=
scope
.
var
(
'Ids'
).
get_tensor
()
ids_array
=
np
.
array
([[
1
],
[
2
],
[
11
],
[
13
]]).
astype
(
"int64"
)
ids
.
set
(
ids_array
,
place
)
ids_lod
=
[[
0
,
2
,
3
,
4
]]
ids
.
set_lod
(
ids_lod
)
out
=
scope
.
var
(
'Out'
).
get_tensor
()
emaps
=
[
'127.0.0.1:'
+
str
(
port0
),
'127.0.0.1:'
+
str
(
port1
)]
table_names
=
[
'table'
,
'table'
]
height_sections
=
[
10
,
20
]
# create and run sgd operator
lookup_table_op
=
Operator
(
"lookup_table"
,
W
=
'W'
,
Ids
=
'Ids'
,
Out
=
'Out'
,
remote_prefetch
=
True
,
epmap
=
emaps
,
table_names
=
table_names
,
height_sections
=
height_sections
)
lookup_table_op
.
run
(
scope
,
place
)
# get and compare result
result_array
=
np
.
array
(
out
)
self
.
assertEqual
(
out
.
lod
(),
ids_lod
)
self
.
assertEqual
(
list
(
result_array
.
shape
),
[
len
(
ids_array
),
8
])
for
i
in
range
(
len
(
ids_array
)):
id
=
ids_array
[
i
][
0
]
self
.
assertTrue
((
result_array
[
i
]
==
id
).
all
())
def
test_lookup_remote_table
(
self
):
os
.
environ
[
'PADDLE_ENABLE_REMOTE_PREFETCH'
]
=
"1"
# run pserver on CPU in sync mode
p0
=
self
.
_start_pserver
(
0
,
False
,
True
,
run_pserver
)
self
.
_wait_ps_ready
(
p0
.
pid
)
port0
=
self
.
_get_pserver_port
(
p0
.
pid
)
p1
=
self
.
_start_pserver
(
1
,
False
,
True
,
run_pserver
)
self
.
_wait_ps_ready
(
p1
.
pid
)
port1
=
self
.
_get_pserver_port
(
p1
.
pid
)
places
=
[
core
.
CPUPlace
()]
if
core
.
is_compiled_with_cuda
():
places
.
append
(
core
.
CUDAPlace
(
0
))
for
place
in
places
:
self
.
_run_lookup_table_op_one_pserver
(
place
,
port0
)
self
.
_run_lookup_table_op_two_pserver
(
place
,
port0
,
port1
)
# raise SIGTERM to pserver
os
.
kill
(
p0
.
pid
,
signal
.
SIGINT
)
p0
.
join
()
os
.
kill
(
p1
.
pid
,
signal
.
SIGINT
)
p1
.
join
()
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
0 → 100644
浏览文件 @
64ad051b
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
import
paddle.fluid.core
as
core
from
op_test
import
OpTest
import
paddle.fluid
as
fluid
SIGMOID_THRESHOLD_MIN
=
-
40.0
SIGMOID_THRESHOLD_MAX
=
13.0
EXP_MAX_INPUT
=
40.0
def
lstm_naive
(
input
,
w
,
):
seq_len
,
batch_size
,
hidden_size
=
input
.
shape
offset
=
0
wi
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
wf
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
wc
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
wo
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
ri
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
rf
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
rc
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
ro
=
w
[
offset
:
offset
+
hidden_size
*
hidden_size
].
reshape
(
(
hidden_size
,
hidden_size
)).
transpose
()
offset
+=
hidden_size
*
hidden_size
bi_1
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bf_1
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bc_1
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bo_1
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bi_2
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bf_2
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bc_2
=
w
[
offset
:
offset
+
hidden_size
]
offset
+=
hidden_size
bo_2
=
w
[
offset
:
offset
+
hidden_size
]
def
sigmoid
(
x
):
y
=
np
.
copy
(
x
)
y
[
x
<
SIGMOID_THRESHOLD_MIN
]
=
SIGMOID_THRESHOLD_MIN
y
[
x
>
SIGMOID_THRESHOLD_MAX
]
=
SIGMOID_THRESHOLD_MAX
return
1.
/
(
1.
+
np
.
exp
(
-
y
))
def
tanh
(
x
):
y
=
-
2.
*
x
y
[
y
>
EXP_MAX_INPUT
]
=
EXP_MAX_INPUT
return
(
2.
/
(
1.
+
np
.
exp
(
y
)))
-
1.
output
=
[]
pre_h
=
np
.
zeros
((
batch_size
,
hidden_size
),
dtype
=
input
.
dtype
)
pre_c
=
np
.
zeros
((
batch_size
,
hidden_size
),
dtype
=
input
.
dtype
)
for
i
in
range
(
seq_len
):
emb_1
=
input
[
i
]
input_gate
=
sigmoid
(
np
.
matmul
(
emb_1
,
wi
)
+
np
.
matmul
(
pre_h
,
ri
)
+
bi_1
+
bi_2
)
forget_gate
=
sigmoid
(
np
.
matmul
(
emb_1
,
wf
)
+
np
.
matmul
(
pre_h
,
rf
)
+
bf_1
+
bf_2
)
output_gate
=
sigmoid
(
np
.
matmul
(
emb_1
,
wo
)
+
np
.
matmul
(
pre_h
,
ro
)
+
bo_1
+
bo_2
)
c_t_temp
=
tanh
(
np
.
matmul
(
emb_1
,
wc
)
+
np
.
matmul
(
pre_h
,
rc
)
+
bc_1
+
bc_2
)
new_c
=
input_gate
*
c_t_temp
+
forget_gate
*
pre_c
new_h
=
output_gate
*
tanh
(
new_c
)
pre_h
=
new_h
pre_c
=
new_c
output
.
append
(
new_h
)
output
=
np
.
concatenate
(
output
,
-
1
)
output
=
output
.
reshape
((
batch_size
,
-
1
,
hidden_size
))
output
=
output
.
transpose
((
1
,
0
,
2
))
return
output
,
pre_h
,
pre_c
class
TestCUDNNLstmOp
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"cudnn_lstm"
self
.
dtype
=
np
.
float32
num_steps
=
20
batch_size
=
5
hidden_size
=
20
input_weight_size
=
(
hidden_size
*
hidden_size
)
*
4
hidden_weight_size
=
(
hidden_size
*
hidden_size
)
*
4
weight_size
=
input_weight_size
+
hidden_weight_size
weight_size
+=
hidden_size
*
8
input
=
np
.
random
.
uniform
(
low
=-
0.1
,
high
=
0.1
,
size
=
(
num_steps
,
batch_size
,
hidden_size
)).
astype
(
self
.
dtype
)
flat_w
=
np
.
random
.
uniform
(
low
=-
0.1
,
high
=
0.1
,
size
=
(
weight_size
)).
astype
(
self
.
dtype
)
output
,
last_hidden
,
last_cell
=
lstm_naive
(
input
,
flat_w
)
init_h
=
np
.
zeros
((
batch_size
,
hidden_size
),
dtype
=
np
.
float32
)
init_c
=
np
.
zeros
((
batch_size
,
hidden_size
),
dtype
=
np
.
float32
)
scope
=
core
.
Scope
()
program
=
fluid
.
Program
()
block
=
program
.
global_block
()
cache_temp
=
block
.
create_var
(
name
=
"Cache"
,
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
,
stop_gradient
=
True
)
self
.
inputs
=
{
'Input'
:
OpTest
.
np_dtype_to_fluid_dtype
(
input
),
'W'
:
OpTest
.
np_dtype_to_fluid_dtype
(
flat_w
),
'InitH'
:
OpTest
.
np_dtype_to_fluid_dtype
(
init_h
),
'InitC'
:
OpTest
.
np_dtype_to_fluid_dtype
(
init_c
),
}
self
.
cache_name_list
=
[
'Cache'
]
self
.
attrs
=
{
'max_len'
:
num_steps
,
'dropout_prob'
:
0.0
,
'is_bidirec'
:
False
,
'input_size'
:
hidden_size
,
'hidden_size'
:
hidden_size
,
'num_layers'
:
1
,
}
self
.
outputs
=
{
'Out'
:
output
,
"last_h"
:
last_hidden
,
'last_c'
:
last_cell
}
def
test_output_with_place
(
self
):
if
self
.
testcuda
():
place
=
core
.
CUDAPlace
(
0
)
self
.
check_output_with_place
(
place
,
atol
=
1e-5
)
def
test_grad_with_place
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
self
.
check_grad_with_place
(
place
,
set
([
'Input'
,
'W'
,
'InitH'
,
'InitC'
]),
[
'Out'
,
'last_h'
,
'last_c'
],
max_relative_error
=
0.02
)
def
testcuda
(
self
):
return
core
.
is_compiled_with_cuda
()
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_pad2d_op.py
浏览文件 @
64ad051b
...
@@ -20,11 +20,17 @@ from op_test import OpTest
...
@@ -20,11 +20,17 @@ from op_test import OpTest
class
TestPad2dOp
(
OpTest
):
class
TestPad2dOp
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
pad_value
=
0.0
self
.
pad_value
=
0.0
self
.
variable_paddings
=
False
self
.
initTestCase
()
self
.
initTestCase
()
self
.
op_type
=
"pad2d"
self
.
op_type
=
"pad2d"
self
.
inputs
=
{
'X'
:
np
.
random
.
random
(
self
.
shape
).
astype
(
"float32"
),
}
self
.
inputs
=
{
'X'
:
np
.
random
.
random
(
self
.
shape
).
astype
(
"float32"
),
}
self
.
attrs
=
{}
self
.
attrs
=
{}
self
.
attrs
[
'paddings'
]
=
np
.
array
(
self
.
paddings
).
flatten
()
if
self
.
variable_paddings
:
self
.
attrs
[
'paddings'
]
=
[]
self
.
inputs
[
'Paddings'
]
=
np
.
array
(
self
.
paddings
).
flatten
().
astype
(
"int32"
)
else
:
self
.
attrs
[
'paddings'
]
=
np
.
array
(
self
.
paddings
).
flatten
()
self
.
attrs
[
'pad_value'
]
=
self
.
pad_value
self
.
attrs
[
'pad_value'
]
=
self
.
pad_value
self
.
attrs
[
'mode'
]
=
self
.
mode
self
.
attrs
[
'mode'
]
=
self
.
mode
self
.
attrs
[
'data_format'
]
=
self
.
data_format
self
.
attrs
[
'data_format'
]
=
self
.
data_format
...
@@ -98,5 +104,24 @@ class TestCase5(TestPad2dOp):
...
@@ -98,5 +104,24 @@ class TestCase5(TestPad2dOp):
self
.
data_format
=
"NHWC"
self
.
data_format
=
"NHWC"
class
TestCase6
(
TestPad2dOp
):
def
initTestCase
(
self
):
self
.
shape
=
(
2
,
4
,
4
,
2
)
self
.
paddings
=
[
0
,
1
,
2
,
3
]
self
.
mode
=
"constant"
self
.
pad_value
=
1.2
self
.
data_format
=
"NHWC"
self
.
variable_paddings
=
True
class
TestCase7
(
TestPad2dOp
):
def
initTestCase
(
self
):
self
.
shape
=
(
2
,
3
,
4
,
4
)
self
.
paddings
=
[
0
,
1
,
2
,
3
]
self
.
mode
=
"reflect"
self
.
data_format
=
"NCHW"
self
.
variable_paddings
=
True
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
浏览文件 @
64ad051b
...
@@ -56,6 +56,40 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
...
@@ -56,6 +56,40 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
"""Test sigmoid_cross_entropy_with_logit_op with probabalistic label
"""Test sigmoid_cross_entropy_with_logit_op with probabalistic label
"""
"""
def
setUp
(
self
):
self
.
op_type
=
"sigmoid_cross_entropy_with_logits"
batch_size
=
64
num_classes
=
20
ignore_index
=
-
1
self
.
inputs
=
{
'X'
:
logit
(
np
.
random
.
uniform
(
0
,
1
,
(
batch_size
,
num_classes
))
.
astype
(
"float32"
)),
'Label'
:
np
.
random
.
randint
(
-
1
,
2
,
(
batch_size
,
num_classes
))
.
astype
(
"float32"
)
}
self
.
attrs
=
{
'ignore_index'
:
ignore_index
,
}
# Fw Pass is implemented as elementwise sigmoid followed by
# elementwise logistic loss
# Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
sigmoid_X
=
expit
(
self
.
inputs
[
'X'
])
term1
=
self
.
inputs
[
'Label'
]
*
np
.
log
(
sigmoid_X
)
term2
=
(
1
-
self
.
inputs
[
'Label'
])
*
np
.
log
(
1
-
sigmoid_X
)
out
=
-
term1
-
term2
out
[
np
.
where
(
self
.
inputs
[
'Label'
]
==
ignore_index
)]
=
0
self
.
outputs
=
{
'Out'
:
out
}
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
self
.
check_grad
([
'X'
],
'Out'
)
class
TestSigmoidCrossEntropyWithLogitsOp3
(
OpTest
):
"""Test sigmoid_cross_entropy_with_logit_op with probabalistic label
"""
def
setUp
(
self
):
def
setUp
(
self
):
self
.
op_type
=
"sigmoid_cross_entropy_with_logits"
self
.
op_type
=
"sigmoid_cross_entropy_with_logits"
batch_size
=
64
batch_size
=
64
...
...
python/paddle/fluid/tests/unittests/testsuite.py
浏览文件 @
64ad051b
...
@@ -20,7 +20,7 @@ import paddle.fluid.core as core
...
@@ -20,7 +20,7 @@ import paddle.fluid.core as core
from
paddle.fluid.op
import
Operator
from
paddle.fluid.op
import
Operator
def
create_op
(
scope
,
op_type
,
inputs
,
outputs
,
attrs
):
def
create_op
(
scope
,
op_type
,
inputs
,
outputs
,
attrs
,
cache_list
=
None
):
kwargs
=
dict
()
kwargs
=
dict
()
op_maker
=
core
.
op_proto_and_checker_maker
op_maker
=
core
.
op_proto_and_checker_maker
...
@@ -43,6 +43,11 @@ def create_op(scope, op_type, inputs, outputs, attrs):
...
@@ -43,6 +43,11 @@ def create_op(scope, op_type, inputs, outputs, attrs):
__create_var__
(
in_name
,
sub_in_name
)
__create_var__
(
in_name
,
sub_in_name
)
else
:
else
:
__create_var__
(
in_name
,
in_name
)
__create_var__
(
in_name
,
in_name
)
if
cache_list
!=
None
and
isinstance
(
cache_list
,
list
):
for
name
in
cache_list
:
kwargs
[
name
]
=
[]
scope
.
var
(
name
)
kwargs
[
name
].
append
(
name
)
for
out_name
,
out_dup
in
Operator
.
get_op_outputs
(
op_type
):
for
out_name
,
out_dup
in
Operator
.
get_op_outputs
(
op_type
):
if
out_name
in
outputs
:
if
out_name
in
outputs
:
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
64ad051b
...
@@ -236,6 +236,31 @@ class DistributeTranspiler(object):
...
@@ -236,6 +236,31 @@ class DistributeTranspiler(object):
else
:
else
:
raise
ValueError
(
"must set trainer_id > 0"
)
raise
ValueError
(
"must set trainer_id > 0"
)
def
_get_all_remote_sparse_update_op
(
self
,
main_program
):
sparse_update_ops
=
[]
sparse_update_op_types
=
[
"lookup_table"
]
for
op
in
main_program
.
global_block
().
ops
:
if
op
.
type
in
sparse_update_op_types
and
op
.
attr
(
'remote_prefetch'
)
is
True
and
not
op
.
attr
(
'is_distributed'
):
sparse_update_ops
.
append
(
op
)
return
sparse_update_ops
def
_update_remote_sparse_update_op
(
self
,
param_varname
,
height_sections
,
endpint_map
,
table_names
):
for
op
in
self
.
sparse_update_ops
:
if
param_varname
in
op
.
input_arg_names
:
op
.
_set_attr
(
'epmap'
,
endpint_map
)
op
.
_set_attr
(
'table_names'
,
table_names
)
op
.
_set_attr
(
'height_sections'
,
height_sections
)
op
.
_set_attr
(
'trainer_id'
,
self
.
trainer_id
)
def
_is_input_of_remote_sparse_update_op
(
self
,
param_name
):
for
op
in
self
.
sparse_update_ops
:
if
param_name
in
op
.
input_arg_names
:
return
True
return
False
def
transpile
(
self
,
def
transpile
(
self
,
trainer_id
,
trainer_id
,
program
=
None
,
program
=
None
,
...
@@ -299,6 +324,12 @@ class DistributeTranspiler(object):
...
@@ -299,6 +324,12 @@ class DistributeTranspiler(object):
self
.
param_name_to_grad_name
[
param_var
.
name
]
=
grad_var
.
name
self
.
param_name_to_grad_name
[
param_var
.
name
]
=
grad_var
.
name
self
.
grad_name_to_param_name
[
grad_var
.
name
]
=
param_var
.
name
self
.
grad_name_to_param_name
[
grad_var
.
name
]
=
param_var
.
name
# get all sparse update ops
self
.
sparse_update_ops
=
self
.
_get_all_remote_sparse_update_op
(
self
.
origin_program
)
# use_sparse_update_param_name -> split_height_section
self
.
sparse_param_to_height_sections
=
dict
()
# add distributed attrs to program
# add distributed attrs to program
self
.
origin_program
.
_is_distributed
=
True
self
.
origin_program
.
_is_distributed
=
True
self
.
origin_program
.
_endpoints
=
self
.
pserver_endpoints
self
.
origin_program
.
_endpoints
=
self
.
pserver_endpoints
...
@@ -336,6 +367,13 @@ class DistributeTranspiler(object):
...
@@ -336,6 +367,13 @@ class DistributeTranspiler(object):
splited_grad_varname
=
splited_vars
[
0
].
name
splited_grad_varname
=
splited_vars
[
0
].
name
index
=
find_op_by_output_arg
(
index
=
find_op_by_output_arg
(
program
.
global_block
(),
splited_grad_varname
,
reverse
=
True
)
program
.
global_block
(),
splited_grad_varname
,
reverse
=
True
)
if
splited_vars
[
0
].
type
==
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
:
sparse_param_name
=
self
.
grad_name_to_param_name
[
grad_varname
]
if
self
.
_is_input_of_remote_sparse_update_op
(
sparse_param_name
):
self
.
sparse_param_to_height_sections
[
sparse_param_name
]
=
[
splited_vars
[
0
].
shape
[
0
]]
elif
len
(
splited_vars
)
>
1
:
elif
len
(
splited_vars
)
>
1
:
orig_var
=
program
.
global_block
().
vars
[
splited_grad_varname
]
orig_var
=
program
.
global_block
().
vars
[
splited_grad_varname
]
index
=
find_op_by_output_arg
(
index
=
find_op_by_output_arg
(
...
@@ -406,16 +444,18 @@ class DistributeTranspiler(object):
...
@@ -406,16 +444,18 @@ class DistributeTranspiler(object):
all_recv_outputs
=
[]
all_recv_outputs
=
[]
for
param_varname
,
splited_var
in
six
.
iteritems
(
self
.
param_var_mapping
):
for
param_varname
,
splited_var
in
six
.
iteritems
(
self
.
param_var_mapping
):
eps
=
[]
eps
=
[]
table_names
=
[]
for
var
in
splited_var
:
for
var
in
splited_var
:
index
=
[
v
.
name
for
v
in
recv_vars
].
index
(
var
.
name
)
index
=
[
v
.
name
for
v
in
recv_vars
].
index
(
var
.
name
)
eps
.
append
(
eplist
[
index
])
eps
.
append
(
eplist
[
index
])
table_names
.
append
(
var
.
name
)
if
self
.
sync_mode
:
if
self
.
sync_mode
:
recv_dep_in
=
send_barrier_out
recv_dep_in
=
send_barrier_out
else
:
else
:
# connect deps to send op in async mode
# connect deps to send op in async mode
recv_dep_in
=
self
.
grad_name_to_send_dummy_out
[
recv_dep_in
=
self
.
grad_name_to_send_dummy_out
[
self
.
param_name_to_grad_name
[
param_varname
]]
self
.
param_name_to_grad_name
[
param_varname
]]
all_recv_outputs
.
extend
(
splited_var
)
# get recv op_role_var, if not splited, the grad should have .trainer suffix
# get recv op_role_var, if not splited, the grad should have .trainer suffix
# if splited, grad should be the original grad var name. ParallelExecutor
# if splited, grad should be the original grad var name. ParallelExecutor
# will use op_role_var to get expected device place to run this op.
# will use op_role_var to get expected device place to run this op.
...
@@ -425,18 +465,25 @@ class DistributeTranspiler(object):
...
@@ -425,18 +465,25 @@ class DistributeTranspiler(object):
if
len
(
splited_trainer_grad
)
==
1
:
if
len
(
splited_trainer_grad
)
==
1
:
recv_op_role_var_name
=
splited_trainer_grad
[
0
].
name
recv_op_role_var_name
=
splited_trainer_grad
[
0
].
name
program
.
global_block
().
append_op
(
if
param_varname
in
self
.
sparse_param_to_height_sections
:
type
=
"recv"
,
height_sections
=
self
.
sparse_param_to_height_sections
[
inputs
=
{
"X"
:
[
recv_dep_in
]},
param_varname
]
outputs
=
{
"Out"
:
splited_var
},
self
.
_update_remote_sparse_update_op
(
attrs
=
{
param_varname
,
height_sections
,
eps
,
table_names
)
"epmap"
:
eps
,
else
:
"trainer_id"
:
self
.
trainer_id
,
all_recv_outputs
.
extend
(
splited_var
)
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
,
program
.
global_block
().
append_op
(
OP_ROLE_VAR_ATTR_NAME
:
type
=
"recv"
,
[
param_varname
,
recv_op_role_var_name
],
inputs
=
{
"X"
:
[
recv_dep_in
]},
"sync_mode"
:
not
self
.
sync_mode
outputs
=
{
"Out"
:
splited_var
},
})
attrs
=
{
"epmap"
:
eps
,
"trainer_id"
:
self
.
trainer_id
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
,
OP_ROLE_VAR_ATTR_NAME
:
[
param_varname
,
recv_op_role_var_name
],
"sync_mode"
:
not
self
.
sync_mode
})
if
self
.
sync_mode
:
if
self
.
sync_mode
:
# form a WAW dependency
# form a WAW dependency
...
@@ -454,14 +501,15 @@ class DistributeTranspiler(object):
...
@@ -454,14 +501,15 @@ class DistributeTranspiler(object):
if
len
(
splited_var
)
<=
1
:
if
len
(
splited_var
)
<=
1
:
continue
continue
orig_param
=
program
.
global_block
().
vars
[
param_varname
]
orig_param
=
program
.
global_block
().
vars
[
param_varname
]
program
.
global_block
().
append_op
(
if
param_varname
not
in
self
.
sparse_param_to_height_sections
:
type
=
"concat"
,
program
.
global_block
().
append_op
(
inputs
=
{
"X"
:
splited_var
},
type
=
"concat"
,
outputs
=
{
"Out"
:
[
orig_param
]},
inputs
=
{
"X"
:
splited_var
},
attrs
=
{
outputs
=
{
"Out"
:
[
orig_param
]},
"axis"
:
0
,
attrs
=
{
RPC_OP_ROLE_ATTR_NAME
:
DIST_OP_ROLE_ATTR_VALUE
"axis"
:
0
,
})
RPC_OP_ROLE_ATTR_NAME
:
DIST_OP_ROLE_ATTR_VALUE
})
self
.
_get_trainer_startup_program
(
recv_vars
=
recv_vars
,
eplist
=
eplist
)
self
.
_get_trainer_startup_program
(
recv_vars
=
recv_vars
,
eplist
=
eplist
)
...
@@ -1420,6 +1468,10 @@ to transpile() call.")
...
@@ -1420,6 +1468,10 @@ to transpile() call.")
height_sections
=
[]
height_sections
=
[]
for
v
in
splited_vars
:
for
v
in
splited_vars
:
height_sections
.
append
(
v
.
shape
[
0
])
height_sections
.
append
(
v
.
shape
[
0
])
sparse_param_name
=
self
.
grad_name_to_param_name
[
orig_var
.
name
]
if
self
.
_is_input_of_remote_sparse_update_op
(
sparse_param_name
):
self
.
sparse_param_to_height_sections
[
sparse_param_name
]
=
height_sections
program
.
global_block
().
_insert_op
(
program
.
global_block
().
_insert_op
(
index
=
index
+
1
,
index
=
index
+
1
,
type
=
"split_selected_rows"
,
type
=
"split_selected_rows"
,
...
...
python/paddle/reader/tests/decorator_test.py
浏览文件 @
64ad051b
...
@@ -62,10 +62,10 @@ class TestBuffered(unittest.TestCase):
...
@@ -62,10 +62,10 @@ class TestBuffered(unittest.TestCase):
for
idx
,
i
in
enumerate
(
b
()):
for
idx
,
i
in
enumerate
(
b
()):
elapsed_time
=
time
.
time
()
-
last_time
elapsed_time
=
time
.
time
()
-
last_time
if
i
==
0
:
if
i
==
0
:
time
.
sleep
(
0.3
)
time
.
sleep
(
1
)
else
:
else
:
# read time should be short, meaning already buffered.
# read time should be short, meaning already buffered.
self
.
assertLess
(
elapsed_time
,
0.0
5
)
self
.
assertLess
(
elapsed_time
,
0.0
8
)
last_time
=
time
.
time
()
last_time
=
time
.
time
()
...
...
python/setup.py.in
浏览文件 @
64ad051b
...
@@ -165,9 +165,9 @@ if '${WITH_MKL}' == 'ON':
...
@@ -165,9 +165,9 @@ if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_LIB}', libs_path)
shutil.copy('${MKLML_LIB}', libs_path)
shutil.copy('${MKLML_IOMP_LIB}', libs_path)
shutil.copy('${MKLML_IOMP_LIB}', libs_path)
package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name]
package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name]
if '${
CMAKE_BUILD_TYPE}' == 'Release
':
if '${
WITH_MKLDNN}' == 'ON
':
# only change rpath in Release mode.
if '${CMAKE_BUILD_TYPE}' == 'Release':
if '${WITH_MKLDNN}' == 'ON':
# only change rpath in Release mode.
# TODO(typhoonzero): use install_name_tool to patch mkl libs once
# TODO(typhoonzero): use install_name_tool to patch mkl libs once
# we can support mkl on mac.
# we can support mkl on mac.
#
#
...
@@ -177,14 +177,19 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
...
@@ -177,14 +177,19 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
if os.system(command) != 0:
if os.system(command) != 0:
raise Exception("patch libmkldnn.so failed, command: %s" % command)
raise Exception("patch libmkldnn.so failed, command: %s" % command)
package_data['paddle.libs']+=['libmkldnn.so.0']
package_data['paddle.libs']+=['libmkldnn.so.0']
shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
if '${WITH_NGRAPH}' == 'ON':
if '${WITH_NGRAPH}' == 'ON':
# only change rpath in Release mode,
# since in Debug mode, nGraph lib may be too large to be changed?
if '${CMAKE_BUILD_TYPE}' == 'Release':
if '${CMAKE_BUILD_TYPE}' == 'Release':
# only change rpath in Release mode.
if os.name != 'nt':
command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}"
if "@APPLE@" == "1":
if os.system(command) != 0:
command = "install_name_tool -id \"@loader_path/\" ${NGRAPH_SHARED_LIB}"
raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command)
else:
command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}"
if os.system(command) != 0:
raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command)
shutil.copy('${NGRAPH_SHARED_LIB}', libs_path)
shutil.copy('${NGRAPH_SHARED_LIB}', libs_path)
shutil.copy('${NGRAPH_CPU_LIB}', libs_path)
shutil.copy('${NGRAPH_CPU_LIB}', libs_path)
shutil.copy('${NGRAPH_TBB_LIB}', libs_path)
shutil.copy('${NGRAPH_TBB_LIB}', libs_path)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录