Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
e0a89503
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e0a89503
编写于
10月 29, 2018
作者:
J
JiabinYang
浏览文件
操作
浏览文件
下载
差异文件
test=develop
上级
0e303868
79da263b
变更
84
隐藏空白更改
内联
并排
Showing
84 changed file
with
2559 addition
and
355 deletion
+2559
-355
CMakeLists.txt
CMakeLists.txt
+7
-0
benchmark/fluid/args.py
benchmark/fluid/args.py
+5
-0
benchmark/fluid/fluid_benchmark.py
benchmark/fluid/fluid_benchmark.py
+1
-1
cmake/external/xxhash.cmake
cmake/external/xxhash.cmake
+46
-0
cmake/inference_lib.cmake
cmake/inference_lib.cmake
+12
-2
paddle/fluid/API.spec
paddle/fluid/API.spec
+4
-0
paddle/fluid/framework/details/CMakeLists.txt
paddle/fluid/framework/details/CMakeLists.txt
+4
-2
paddle/fluid/framework/details/broadcast_op_handle.cc
paddle/fluid/framework/details/broadcast_op_handle.cc
+14
-7
paddle/fluid/framework/details/broadcast_op_handle.h
paddle/fluid/framework/details/broadcast_op_handle.h
+4
-1
paddle/fluid/framework/details/build_strategy.cc
paddle/fluid/framework/details/build_strategy.cc
+1
-0
paddle/fluid/framework/details/build_strategy.h
paddle/fluid/framework/details/build_strategy.h
+2
-0
paddle/fluid/framework/details/fused_broadcast_op_handle.cc
paddle/fluid/framework/details/fused_broadcast_op_handle.cc
+55
-0
paddle/fluid/framework/details/fused_broadcast_op_handle.h
paddle/fluid/framework/details/fused_broadcast_op_handle.h
+57
-0
paddle/fluid/framework/details/multi_devices_graph_pass.cc
paddle/fluid/framework/details/multi_devices_graph_pass.cc
+52
-10
paddle/fluid/framework/details/multi_devices_graph_pass.h
paddle/fluid/framework/details/multi_devices_graph_pass.h
+6
-1
paddle/fluid/framework/ir/CMakeLists.txt
paddle/fluid/framework/ir/CMakeLists.txt
+1
-0
paddle/fluid/framework/ir/graph.cc
paddle/fluid/framework/ir/graph.cc
+12
-63
paddle/fluid/framework/ir/graph.h
paddle/fluid/framework/ir/graph.h
+6
-0
paddle/fluid/framework/ir/graph_helper.cc
paddle/fluid/framework/ir/graph_helper.cc
+12
-5
paddle/fluid/framework/ir/multi_batch_merge_pass.cc
paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+315
-0
paddle/fluid/framework/ir/multi_batch_merge_pass.h
paddle/fluid/framework/ir/multi_batch_merge_pass.h
+44
-0
paddle/fluid/framework/ir/node.h
paddle/fluid/framework/ir/node.h
+2
-0
paddle/fluid/framework/lod_tensor_array.h
paddle/fluid/framework/lod_tensor_array.h
+77
-1
paddle/fluid/framework/op_desc.h
paddle/fluid/framework/op_desc.h
+0
-4
paddle/fluid/framework/op_proto_maker.h
paddle/fluid/framework/op_proto_maker.h
+3
-3
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+19
-11
paddle/fluid/framework/scope.h
paddle/fluid/framework/scope.h
+2
-0
paddle/fluid/inference/CMakeLists.txt
paddle/fluid/inference/CMakeLists.txt
+2
-2
paddle/fluid/inference/analysis/analyzer.cc
paddle/fluid/inference/analysis/analyzer.cc
+3
-0
paddle/fluid/inference/analysis/analyzer.h
paddle/fluid/inference/analysis/analyzer.h
+0
-1
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+15
-6
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+8
-0
paddle/fluid/inference/api/analysis_predictor.h
paddle/fluid/inference/api/analysis_predictor.h
+2
-0
paddle/fluid/inference/api/api_impl.cc
paddle/fluid/inference/api/api_impl.cc
+5
-0
paddle/fluid/inference/api/api_impl.h
paddle/fluid/inference/api/api_impl.h
+3
-2
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+7
-5
paddle/fluid/inference/api/demo_ci/run.sh
paddle/fluid/inference/api/demo_ci/run.sh
+10
-7
paddle/fluid/inference/api/details/reset_tensor_array.cc
paddle/fluid/inference/api/details/reset_tensor_array.cc
+50
-0
paddle/fluid/inference/api/details/reset_tensor_array.h
paddle/fluid/inference/api/details/reset_tensor_array.h
+37
-0
paddle/fluid/inference/api/helper.h
paddle/fluid/inference/api/helper.h
+2
-1
paddle/fluid/inference/api/paddle_inference_api.h
paddle/fluid/inference/api/paddle_inference_api.h
+1
-7
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+1
-0
paddle/fluid/inference/tests/api/tester_helper.h
paddle/fluid/inference/tests/api/tester_helper.h
+3
-0
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+1
-0
paddle/fluid/operators/beam_search_decode_op.cc
paddle/fluid/operators/beam_search_decode_op.cc
+3
-0
paddle/fluid/operators/hash_op.cc
paddle/fluid/operators/hash_op.cc
+74
-0
paddle/fluid/operators/hash_op.h
paddle/fluid/operators/hash_op.h
+56
-0
paddle/fluid/operators/lars_momentum_op.cc
paddle/fluid/operators/lars_momentum_op.cc
+86
-0
paddle/fluid/operators/lars_momentum_op.cu
paddle/fluid/operators/lars_momentum_op.cu
+94
-0
paddle/fluid/operators/lars_momentum_op.h
paddle/fluid/operators/lars_momentum_op.h
+72
-0
paddle/fluid/operators/lookup_table_op.cc
paddle/fluid/operators/lookup_table_op.cc
+6
-0
paddle/fluid/operators/lookup_table_op.h
paddle/fluid/operators/lookup_table_op.h
+31
-19
paddle/fluid/operators/math/algorithm.h
paddle/fluid/operators/math/algorithm.h
+46
-0
paddle/fluid/operators/momentum_op.cc
paddle/fluid/operators/momentum_op.cc
+0
-48
paddle/fluid/operators/momentum_op.h
paddle/fluid/operators/momentum_op.h
+48
-0
paddle/fluid/operators/sequence_reverse_op.cc
paddle/fluid/operators/sequence_reverse_op.cc
+29
-0
paddle/fluid/operators/sequence_reverse_op.cu
paddle/fluid/operators/sequence_reverse_op.cu
+25
-0
paddle/fluid/operators/sequence_reverse_op.h
paddle/fluid/operators/sequence_reverse_op.h
+157
-0
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+50
-15
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+8
-2
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+7
-3
paddle/fluid/train/demo/CMakeLists.txt
paddle/fluid/train/demo/CMakeLists.txt
+3
-1
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+7
-7
python/paddle/dataset/wmt16.py
python/paddle/dataset/wmt16.py
+1
-1
python/paddle/fluid/evaluator.py
python/paddle/fluid/evaluator.py
+1
-1
python/paddle/fluid/layers/learning_rate_scheduler.py
python/paddle/fluid/layers/learning_rate_scheduler.py
+14
-12
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+79
-22
python/paddle/fluid/metrics.py
python/paddle/fluid/metrics.py
+183
-64
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+89
-2
python/paddle/fluid/tests/unittests/dist_mnist.py
python/paddle/fluid/tests/unittests/dist_mnist.py
+1
-1
python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
...on/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
+80
-0
python/paddle/fluid/tests/unittests/dist_mnist_lars.py
python/paddle/fluid/tests/unittests/dist_mnist_lars.py
+73
-0
python/paddle/fluid/tests/unittests/dist_transformer.py
python/paddle/fluid/tests/unittests/dist_transformer.py
+1
-0
python/paddle/fluid/tests/unittests/test_dist_base.py
python/paddle/fluid/tests/unittests/test_dist_base.py
+22
-5
python/paddle/fluid/tests/unittests/test_dist_ctr.py
python/paddle/fluid/tests/unittests/test_dist_ctr.py
+2
-3
python/paddle/fluid/tests/unittests/test_dist_mnist.py
python/paddle/fluid/tests/unittests/test_dist_mnist.py
+10
-2
python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
...ddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
+67
-0
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+1
-2
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+1
-2
python/paddle/fluid/tests/unittests/test_hash_op.py
python/paddle/fluid/tests/unittests/test_hash_op.py
+57
-0
python/paddle/fluid/tests/unittests/test_metrics.py
python/paddle/fluid/tests/unittests/test_metrics.py
+49
-0
python/paddle/fluid/tests/unittests/test_momentum_op.py
python/paddle/fluid/tests/unittests/test_momentum_op.py
+39
-0
python/paddle/fluid/tests/unittests/test_sequence_reverse.py
python/paddle/fluid/tests/unittests/test_sequence_reverse.py
+69
-0
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+5
-1
未找到文件。
CMakeLists.txt
浏览文件 @
e0a89503
...
...
@@ -69,6 +69,7 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF)
option
(
WITH_GRPC
"Use grpc as the default rpc framework"
${
WITH_DISTRIBUTE
}
)
option
(
WITH_BRPC_RDMA
"Use brpc rdma as the rpc protocal"
OFF
)
option
(
WITH_INFERENCE
"Compile fluid inference library"
ON
)
option
(
ON_INFER
"Turn on inference optimization."
OFF
)
option
(
WITH_INFERENCE_API_TEST
"Test fluid inference high-level api interface"
OFF
)
option
(
WITH_SYSTEM_BLAS
"Use system blas library"
OFF
)
option
(
PY_VERSION
"Compile PaddlePaddle with python3 support"
${
PY_VERSION
}
)
...
...
@@ -179,6 +180,7 @@ include(external/eigen) # download eigen3
include
(
external/pybind11
)
# download pybind11
include
(
external/cares
)
include
(
external/cub
)
include
(
external/xxhash
)
# download xxhash
if
(
NOT WIN32
)
# there is no official support of snappystream, warpctc, nccl, cupti in windows
...
...
@@ -301,3 +303,8 @@ if(WITH_DOC)
find_python_module
(
recommonmark REQUIRED
)
add_subdirectory
(
doc
)
endif
()
if
(
ON_INFER
)
message
(
WARNING
"On inference mode, will take place some specific optimization."
)
add_definitions
(
-DPADDLE_ON_INFERENCE
)
endif
()
benchmark/fluid/args.py
浏览文件 @
e0a89503
...
...
@@ -142,5 +142,10 @@ def parse_args():
choices
=
[
'reduce'
,
'all_reduce'
],
default
=
'all_reduce'
,
help
=
'Specify the reduce strategy, can be reduce, all_reduce'
)
parser
.
add_argument
(
'--fuse_broadcast_op'
,
action
=
'store_true'
,
help
=
'If set, would fuse multiple broadcast operators into one fused_broadcast operator.'
)
args
=
parser
.
parse_args
()
return
args
benchmark/fluid/fluid_benchmark.py
浏览文件 @
e0a89503
...
...
@@ -177,6 +177,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
else
:
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
(
).
ReduceStrategy
.
AllReduce
build_strategy
.
fuse_broadcast_op
=
args
.
fuse_broadcast_op
avg_loss
=
train_args
[
0
]
...
...
@@ -240,7 +241,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
if
args
.
use_fake_data
or
args
.
use_reader_op
:
try
:
fetch_ret
=
exe
.
run
(
fetch_list
)
except
fluid
.
core
.
EOFException
as
eof
:
break
...
...
cmake/external/xxhash.cmake
0 → 100644
浏览文件 @
e0a89503
INCLUDE
(
ExternalProject
)
set
(
XXHASH_SOURCE_DIR
${
THIRD_PARTY_PATH
}
/xxhash
)
set
(
XXHASH_INSTALL_DIR
${
THIRD_PARTY_PATH
}
/install/xxhash
)
set
(
XXHASH_INCLUDE_DIR
"
${
XXHASH_INSTALL_DIR
}
/include"
)
IF
(
WITH_STATIC_LIB
)
SET
(
BUILD_CMD make lib
)
ELSE
()
SET
(
BUILD_CMD sed -i
"s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g"
${
XXHASH_SOURCE_DIR
}
/src/extern_xxhash/Makefile && make lib
)
ENDIF
()
ExternalProject_Add
(
extern_xxhash
${
EXTERNAL_PROJECT_LOG_ARGS
}
GIT_REPOSITORY
"https://github.com/Cyan4973/xxHash"
GIT_TAG
"v0.6.5"
PREFIX
${
XXHASH_SOURCE_DIR
}
DOWNLOAD_NAME
"xxhash"
UPDATE_COMMAND
""
CONFIGURE_COMMAND
""
BUILD_IN_SOURCE 1
PATCH_COMMAND
BUILD_COMMAND
${
BUILD_CMD
}
INSTALL_COMMAND export PREFIX=
${
XXHASH_INSTALL_DIR
}
/ && make install
TEST_COMMAND
""
)
set
(
XXHASH_LIBRARIES
"
${
XXHASH_INSTALL_DIR
}
/lib/libxxhash.a"
)
INCLUDE_DIRECTORIES
(
${
XXHASH_INCLUDE_DIR
}
)
add_library
(
xxhash STATIC IMPORTED GLOBAL
)
set_property
(
TARGET xxhash PROPERTY IMPORTED_LOCATION
${
XXHASH_LIBRARIES
}
)
include_directories
(
${
XXHASH_INCLUDE_DIR
}
)
add_dependencies
(
xxhash extern_xxhash
)
LIST
(
APPEND external_project_dependencies xxhash
)
IF
(
WITH_C_API
)
INSTALL
(
DIRECTORY
${
XXHASH_INCLUDE_DIR
}
DESTINATION third_party/xxhash
)
IF
(
ANDROID
)
INSTALL
(
FILES
${
XXHASH_LIBRARIES
}
DESTINATION third_party/xxhash/lib/
${
ANDROID_ABI
}
)
ELSE
()
INSTALL
(
FILES
${
XXHASH_LIBRARIES
}
DESTINATION third_party/xxhash/lib
)
ENDIF
()
ENDIF
()
cmake/inference_lib.cmake
浏览文件 @
e0a89503
...
...
@@ -14,6 +14,9 @@
# make package for paddle fluid shared and static library
function
(
copy TARGET
)
if
(
NOT ON_INFER
)
message
(
WARNING
"Turn on the ON_INFER flag when building inference_lib only."
)
endif
()
set
(
options
""
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS DSTS DEPS
)
...
...
@@ -31,7 +34,7 @@ function(copy TARGET)
foreach
(
index RANGE
${
len
}
)
list
(
GET copy_lib_SRCS
${
index
}
src
)
list
(
GET copy_lib_DSTS
${
index
}
dst
)
add_custom_command
(
TARGET
${
TARGET
}
PRE_BUILD
add_custom_command
(
TARGET
${
TARGET
}
PRE_BUILD
COMMAND mkdir -p
"
${
dst
}
"
COMMAND cp -r
"
${
src
}
"
"
${
dst
}
"
COMMENT
"copying
${
src
}
->
${
dst
}
"
)
...
...
@@ -67,6 +70,13 @@ copy(boost_lib
DEPS boost
)
set
(
dst_dir
"
${
FLUID_INSTALL_DIR
}
/third_party/install/xxhash"
)
copy
(
xxhash_lib
SRCS
${
XXHASH_INCLUDE_DIR
}
${
XXHASH_LIBRARIES
}
DSTS
${
dst_dir
}
${
dst_dir
}
/lib
DEPS xxhash
)
if
(
NOT PROTOBUF_FOUND
)
set
(
dst_dir
"
${
FLUID_INSTALL_DIR
}
/third_party/install/protobuf"
)
copy
(
protobuf_lib
...
...
@@ -186,7 +196,7 @@ copy(cmake_cache
DSTS
${
FLUID_INSTALL_DIR
}
)
# This command generates a complete fluid library for both train and inference
add_custom_target
(
fluid_lib_dist DEPENDS
${
fluid_lib_dist_dep
}
)
add_custom_target
(
fluid_lib_dist DEPENDS
${
fluid_lib_dist_dep
}
)
# Following commands generate a inference-only fluid library
# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
...
...
paddle/fluid/API.spec
浏览文件 @
e0a89503
...
...
@@ -175,7 +175,9 @@ paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dim
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
...
...
@@ -354,6 +356,8 @@ paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_wind
paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None))
paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
paddle.fluid.regularizer.L2DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
...
...
paddle/fluid/framework/details/CMakeLists.txt
浏览文件 @
e0a89503
...
...
@@ -16,12 +16,14 @@ if(WITH_GPU)
dynload_cuda variable_visitor
)
nv_library
(
reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda
)
nv_library
(
broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda
)
nv_library
(
fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle
)
else
()
cc_library
(
all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
variable_visitor
)
cc_library
(
reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim
)
cc_library
(
broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor
)
cc_library
(
fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle
)
endif
()
cc_library
(
data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor
)
...
...
@@ -34,7 +36,7 @@ if(WITH_GPU)
endif
()
cc_library
(
multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle
)
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle
fused_broadcast_op_handle
)
if
(
WITH_GPU
)
cc_library
(
ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass
)
...
...
@@ -58,4 +60,4 @@ cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executo
cc_library
(
build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass
)
fuse_elewise_add_act_pass
multi_batch_merge_pass
)
paddle/fluid/framework/details/broadcast_op_handle.cc
浏览文件 @
e0a89503
...
...
@@ -48,16 +48,23 @@ void BroadcastOpHandle::RunImpl() {
var_scopes
.
emplace_back
(
s
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
());
}
BroadcastOneVar
(
*
in_var_handle
,
out_var_handles
,
var_scopes
);
}
void
BroadcastOpHandle
::
BroadcastOneVar
(
const
VarHandle
&
in_var_handle
,
const
std
::
vector
<
VarHandle
*>
&
out_var_handles
,
const
std
::
vector
<
const
Scope
*>
&
var_scopes
)
{
auto
*
in_var
=
var_scopes
.
at
(
in_var_handle
->
scope_idx_
)
->
FindVar
(
in_var_handle
->
name_
);
var_scopes
.
at
(
in_var_handle
.
scope_idx_
)
->
FindVar
(
in_var_handle
.
name_
);
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
Tensor
&
in_tensor
=
VariableVisitor
::
GetMutableTensor
(
in_var
);
InitOutputValue
(
*
in_var_handle
,
out_var_handles
);
InitOutputValue
(
in_var_handle
,
out_var_handles
);
if
(
platform
::
is_cpu_place
(
in_tensor
.
place
()))
{
for
(
auto
*
out_var_handle
:
out_var_handles
)
{
if
(
out_var_handle
->
IsTheSameVar
(
*
in_var_handle
))
{
if
(
out_var_handle
->
IsTheSameVar
(
in_var_handle
))
{
continue
;
}
auto
&
out_p
=
out_var_handle
->
place_
;
...
...
@@ -114,12 +121,12 @@ void BroadcastOpHandle::RunImpl() {
}
}
if
(
!
out_handle
->
IsTheSameVar
(
*
in_var_handle
))
{
auto
out_var
=
var_scopes
.
at
(
in_var_handle
->
scope_idx_
)
if
(
!
out_handle
->
IsTheSameVar
(
in_var_handle
))
{
auto
out_var
=
var_scopes
.
at
(
in_var_handle
.
scope_idx_
)
->
FindVar
(
out_var_handles
[
0
]
->
name_
);
paddle
::
framework
::
TensorCopy
(
in_tensor
,
in_var_handle
->
place_
,
*
(
dev_ctxes_
.
at
(
in_var_handle
->
place_
)),
in_tensor
,
in_var_handle
.
place_
,
*
(
dev_ctxes_
.
at
(
in_var_handle
.
place_
)),
&
VariableVisitor
::
GetMutableTensor
(
out_var
));
}
});
...
...
paddle/fluid/framework/details/broadcast_op_handle.h
浏览文件 @
e0a89503
...
...
@@ -61,7 +61,10 @@ struct BroadcastOpHandle : public OpHandleBase {
protected:
void
RunImpl
()
override
;
private:
void
BroadcastOneVar
(
const
VarHandle
&
in_var_handle
,
const
std
::
vector
<
VarHandle
*>
&
out_var_handles
,
const
std
::
vector
<
const
Scope
*>
&
var_scopes
);
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
#ifdef PADDLE_WITH_CUDA
...
...
paddle/fluid/framework/details/build_strategy.cc
浏览文件 @
e0a89503
...
...
@@ -121,6 +121,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
USE_PASS
(
fuse_elewise_add_act_pass
);
USE_PASS
(
graph_viz_pass
);
USE_PASS
(
multi_batch_merge_pass
);
USE_PASS
(
multi_devices_pass
);
USE_PASS
(
multi_devices_check_pass
);
USE_PASS
(
multi_devices_print_pass
);
paddle/fluid/framework/details/build_strategy.h
浏览文件 @
e0a89503
...
...
@@ -69,6 +69,8 @@ struct BuildStrategy {
bool
enable_data_balance_
{
false
};
bool
fuse_broadcast_op_
{
false
};
// User normally doesn't need to call this API.
// The PassBuilder allows for more customized insert, remove of passes
// from python side.
...
...
paddle/fluid/framework/details/fused_broadcast_op_handle.cc
0 → 100644
浏览文件 @
e0a89503
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
void
FusedBroadcastOpHandle
::
RunImpl
()
{
platform
::
RecordEvent
record_event
(
Name
(),
dev_ctxes_
.
begin
()
->
second
);
if
(
places_
.
size
()
==
1UL
)
return
;
auto
in_var_handles
=
DynamicCast
<
VarHandle
>
(
inputs_
);
auto
out_var_handles
=
DynamicCast
<
VarHandle
>
(
outputs_
);
WaitInputVarGenerated
();
std
::
vector
<
const
Scope
*>
var_scopes
;
for
(
auto
*
s
:
local_scopes_
)
{
var_scopes
.
emplace_back
(
s
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
());
}
size_t
place_num
=
places_
.
size
();
PADDLE_ENFORCE_EQ
(
in_var_handles
.
size
()
*
place_num
,
out_var_handles
.
size
());
for
(
size_t
i
=
0
;
i
<
in_var_handles
.
size
();
++
i
)
{
BroadcastOneVar
(
*
in_var_handles
[
i
],
std
::
vector
<
VarHandle
*>
(
out_var_handles
.
begin
()
+
i
*
place_num
,
out_var_handles
.
begin
()
+
(
i
+
1
)
*
place_num
),
var_scopes
);
}
}
std
::
string
FusedBroadcastOpHandle
::
Name
()
const
{
return
"fused_broadcast"
;
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/fused_broadcast_op_handle.h
0 → 100644
浏览文件 @
e0a89503
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace
paddle
{
namespace
framework
{
namespace
details
{
struct
FusedBroadcastOpHandle
:
public
BroadcastOpHandle
{
public:
#ifdef PADDLE_WITH_CUDA
FusedBroadcastOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
platform
::
NCCLContextMap
*
nccl_ctx
)
:
BroadcastOpHandle
(
node
,
local_scopes
,
places
,
nccl_ctx
)
{}
#else
FusedBroadcastOpHandle
(
ir
::
Node
*
node
,
const
std
::
vector
<
Scope
*>
local_scopes
,
const
std
::
vector
<
platform
::
Place
>&
places
)
:
BroadcastOpHandle
(
node
,
local_scopes
,
places
)
{}
#endif
std
::
string
Name
()
const
override
;
protected:
void
RunImpl
()
override
;
};
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/multi_devices_graph_pass.cc
浏览文件 @
e0a89503
...
...
@@ -21,6 +21,7 @@
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/data_balance_op_handle.h"
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h"
...
...
@@ -347,7 +348,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
BuildStrategy
::
GradientScaleStrategy
::
kCustomized
)
{
// TODO(paddle-dev): Why is there no input for this op_handle?
auto
loss_grad_name
=
node
->
Op
()
->
OutputArgumentNames
()[
0
];
CreateScaleLossGradOp
(
&
result
,
loss_grad_name
);
CreateScaleLossGradOp
(
&
result
,
loss_grad_name
,
node
->
outputs
[
0
]
);
}
// This assumes the backward generating code will ensure IsScaleLossOp
// is true only for the op that scale the final scalar loss.
...
...
@@ -436,10 +437,14 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
if
((
use_gpu
&&
strategy_
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kReduce
)
||
is_dist_train
)
{
for
(
size_t
dev_id
=
0
;
dev_id
<
bcast_var_name_set
.
size
();
++
dev_id
)
{
auto
&
to_bcast_set
=
bcast_var_name_set
[
dev_id
];
for
(
auto
&
bcast_name
:
to_bcast_set
)
{
CreateBroadcastOp
(
&
result
,
bcast_name
,
dev_id
);
if
(
strategy_
.
fuse_broadcast_op_
)
{
CreateFusedBroadcastOp
(
&
result
,
bcast_var_name_set
);
}
else
{
for
(
size_t
dev_id
=
0
;
dev_id
<
bcast_var_name_set
.
size
();
++
dev_id
)
{
auto
&
to_bcast_set
=
bcast_var_name_set
[
dev_id
];
for
(
auto
&
bcast_name
:
to_bcast_set
)
{
CreateBroadcastOp
(
&
result
,
bcast_name
,
dev_id
);
}
}
}
}
...
...
@@ -508,6 +513,44 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
}
}
void
MultiDevSSAGraphBuilder
::
CreateFusedBroadcastOp
(
ir
::
Graph
*
result
,
const
std
::
vector
<
std
::
unordered_set
<
std
::
string
>>
&
bcast_varnames
)
const
{
#ifdef PADDLE_WITH_CUDA
auto
*
op_handle
=
new
FusedBroadcastOpHandle
(
result
->
CreateEmptyNode
(
"fused_broadcast"
,
ir
::
Node
::
Type
::
kOperation
),
local_scopes_
,
places_
,
nccl_ctxs_
);
#else
auto
*
op_handle
=
new
FusedBroadcastOpHandle
(
result
->
CreateEmptyNode
(
"fused_broadcast"
,
ir
::
Node
::
Type
::
kOperation
),
local_scopes_
,
places_
);
#endif
result
->
Get
<
GraphOps
>
(
kGraphOps
).
emplace_back
(
op_handle
);
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
auto
&
p
=
places_
[
i
];
SetCommunicationContext
(
op_handle
,
p
);
}
for
(
size_t
dev_id
=
0
;
dev_id
<
bcast_varnames
.
size
();
++
dev_id
)
{
for
(
auto
&
p_name
:
bcast_varnames
[
dev_id
])
{
auto
*
in
=
result
->
Get
<
GraphVars
>
(
kGraphVars
).
at
(
dev_id
).
at
(
p_name
).
back
().
get
();
op_handle
->
AddInput
(
in
);
for
(
size_t
out_dev_id
=
0
;
out_dev_id
<
places_
.
size
();
++
out_dev_id
)
{
auto
&
p
=
places_
[
out_dev_id
];
auto
&
vars
=
result
->
Get
<
GraphVars
>
(
kGraphVars
).
at
(
out_dev_id
).
at
(
p_name
);
auto
*
out_var
=
new
VarHandle
(
result
->
CreateEmptyNode
(
p_name
,
ir
::
Node
::
Type
::
kVariable
),
vars
.
size
(),
out_dev_id
,
p_name
,
p
);
vars
.
emplace_back
(
out_var
);
op_handle
->
AddOutput
(
out_var
);
}
}
}
}
void
MultiDevSSAGraphBuilder
::
CreateComputationalOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
int
dev_id
)
const
{
...
...
@@ -602,7 +645,8 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
}
void
MultiDevSSAGraphBuilder
::
CreateScaleLossGradOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
loss_grad_name
)
const
{
ir
::
Graph
*
result
,
const
std
::
string
&
loss_grad_name
,
ir
::
Node
*
out_var_node
)
const
{
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
// Insert ScaleCost OpHandle
auto
*
dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
places_
[
i
]);
...
...
@@ -617,10 +661,8 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
// loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss);
CreateOpOutput
(
result
,
op_handle
,
result
->
CreateEmptyNode
(
loss_grad_name
,
ir
::
Node
::
Type
::
kVariable
),
places_
[
i
],
i
);
CreateOpOutput
(
result
,
op_handle
,
result
->
CreateVarNode
(
out_var_node
->
Var
()),
places_
[
i
],
i
);
}
}
...
...
paddle/fluid/framework/details/multi_devices_graph_pass.h
浏览文件 @
e0a89503
...
...
@@ -61,7 +61,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
size_t
num_places
)
const
;
void
CreateScaleLossGradOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
loss_grad_name
)
const
;
const
std
::
string
&
loss_grad_name
,
ir
::
Node
*
out_var_node
)
const
;
VarHandle
*
CreateReduceOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
og
,
int
dst_dev_id
)
const
;
...
...
@@ -78,6 +79,10 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
void
CreateBroadcastOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
p_name
,
size_t
src_dev_id
)
const
;
void
CreateFusedBroadcastOp
(
ir
::
Graph
*
result
,
const
std
::
vector
<
std
::
unordered_set
<
std
::
string
>>
&
bcast_varnames
)
const
;
bool
IsSparseGradient
(
const
std
::
string
&
og
)
const
;
size_t
GetAppropriateDeviceID
(
...
...
paddle/fluid/framework/ir/CMakeLists.txt
浏览文件 @
e0a89503
...
...
@@ -36,6 +36,7 @@ pass_library(fc_lstm_fuse_pass inference)
pass_library
(
embedding_fc_lstm_fuse_pass inference
)
pass_library
(
fc_gru_fuse_pass inference
)
pass_library
(
seq_concat_fc_fuse_pass inference
)
pass_library
(
multi_batch_merge_pass base
)
pass_library
(
conv_bn_fuse_pass inference
)
pass_library
(
seqconv_eltadd_relu_fuse_pass inference
)
if
(
WITH_MKLDNN
)
...
...
paddle/fluid/framework/ir/graph.cc
浏览文件 @
e0a89503
...
...
@@ -24,79 +24,23 @@ namespace paddle {
namespace
framework
{
namespace
ir
{
std
::
vector
<
std
::
string
>
FindDistTrainSendVars
(
const
std
::
vector
<
ir
::
Node
*>
&
nodes
)
{
std
::
vector
<
std
::
string
>
send_vars
;
// since parameters are all in block 0,
// it's enough to only scan send ops in block 0
for
(
auto
&
node
:
nodes
)
{
auto
op_vars
=
node
->
Op
()
->
InputArgumentNames
();
send_vars
.
reserve
(
send_vars
.
size
()
+
std
::
distance
(
op_vars
.
begin
(),
op_vars
.
end
()));
send_vars
.
insert
(
send_vars
.
end
(),
op_vars
.
begin
(),
op_vars
.
end
());
}
return
send_vars
;
}
std
::
vector
<
std
::
string
>
FindDistTrainRecvVars
(
const
std
::
vector
<
ir
::
Node
*>
&
nodes
)
{
std
::
vector
<
std
::
string
>
recv_vars
;
for
(
auto
&
node
:
nodes
)
{
auto
op_vars
=
node
->
Op
()
->
OutputArgumentNames
();
recv_vars
.
reserve
(
recv_vars
.
size
()
+
std
::
distance
(
op_vars
.
begin
(),
op_vars
.
end
()));
recv_vars
.
insert
(
recv_vars
.
end
(),
op_vars
.
begin
(),
op_vars
.
end
());
}
return
recv_vars
;
}
bool
IsDistTrainOp
(
ir
::
Node
*
node
,
const
std
::
vector
<
std
::
string
>
&
send_vars
,
const
std
::
vector
<
std
::
string
>
&
recv_vars
)
{
if
(
send_vars
.
size
()
==
0
||
recv_vars
.
size
()
==
0
)
{
return
false
;
}
/**
* Check any of opvars contains `.block` and in sendvars
*/
auto
checker
=
[](
const
std
::
vector
<
std
::
string
>
&
opvars
,
const
std
::
vector
<
std
::
string
>
&
rpc_vars
)
->
bool
{
for
(
auto
&
var
:
opvars
)
{
// a variable name with the suffix `.block` means it's a splited
// variable by (DistributeTranspiler)
// [python/paddle/fluid/transpiler/distribute_transpiler.py]
if
(
var
.
find
(
".block"
)
!=
std
::
string
::
npos
&&
std
::
find
(
rpc_vars
.
begin
(),
rpc_vars
.
end
(),
var
)
!=
rpc_vars
.
end
())
{
return
true
;
}
}
return
false
;
};
std
::
vector
<
std
::
string
>
input_var_names
;
std
::
vector
<
std
::
string
>
output_var_names
;
for
(
ir
::
Node
*
input
:
node
->
inputs
)
{
input_var_names
.
push_back
(
input
->
Name
());
}
for
(
ir
::
Node
*
output
:
node
->
outputs
)
{
output_var_names
.
push_back
(
output
->
Name
());
}
return
checker
(
output_var_names
,
send_vars
)
||
checker
(
input_var_names
,
recv_vars
);
}
Graph
::
Graph
(
const
ProgramDesc
&
program
)
:
program_
(
program
)
{
// Make the nodes id start from 0.
Node
::
ResetId
();
auto
var_nodes
=
InitFromProgram
(
program_
);
ResolveHazard
(
var_nodes
);
}
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
Graph
::
InitFromProgram
(
const
ProgramDesc
&
program
)
{
VLOG
(
3
)
<<
"block in program:"
<<
program_
.
Size
();
std
::
unordered_map
<
std
::
string
,
VarDesc
*>
all_vars
;
// var nodes for each var name, will have multiple versions in SSA
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
var_nodes
;
for
(
auto
*
var
:
program
.
Block
(
0
).
AllVars
())
{
all_vars
.
emplace
(
var
->
Name
(),
var
);
}
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
var_nodes
;
for
(
auto
*
op
:
program
.
Block
(
0
).
AllOps
())
{
ir
::
Node
*
node
=
CreateOpNode
(
op
);
// For input args, reuse the same var name if it was created before.
...
...
@@ -134,7 +78,11 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
var
->
inputs
.
push_back
(
node
);
}
}
return
std
::
move
(
var_nodes
);
}
void
Graph
::
ResolveHazard
(
const
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
&
var_nodes
)
{
/**
* We should handle write after read(WAR) and write after write(WAW) here.
* Because some of the operators of the program can be executed parallelly.
...
...
@@ -153,6 +101,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
auto
it_old
=
versions
.
rbegin
();
++
it_old
;
for
(;
it_old
!=
versions
.
rend
();
it_new
=
it_old
,
++
it_old
)
{
VLOG
(
3
)
<<
"deal with var: "
<<
(
*
it_new
)
->
Name
();
ir
::
Node
*
write_op
=
(
*
it_new
)
->
inputs
.
empty
()
?
nullptr
:
(
*
it_new
)
->
inputs
[
0
];
const
auto
&
read_ops
=
(
*
it_old
)
->
outputs
;
...
...
paddle/fluid/framework/ir/graph.h
浏览文件 @
e0a89503
...
...
@@ -160,6 +160,12 @@ class Graph {
return
nullptr
;
}
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
InitFromProgram
(
const
ProgramDesc
&
program
);
void
ResolveHazard
(
const
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
&
var_nodes
);
private:
// This method takes ownership of `node`.
ir
::
Node
*
AddNode
(
ir
::
Node
*
node
)
{
...
...
paddle/fluid/framework/ir/graph_helper.cc
浏览文件 @
e0a89503
...
...
@@ -120,19 +120,25 @@ size_t GraphNum(const Graph &graph) {
std
::
deque
<
ir
::
Node
*>
q_nodes
;
std
::
vector
<
std
::
unordered_set
<
ir
::
Node
*>>
graph_nodes
;
std
::
unordered_set
<
ir
::
Node
*>
g_nodes
;
// q_set used to record records in the queue.
std
::
unordered_set
<
ir
::
Node
*>
q_set
;
size_t
graph_count
=
0
;
auto
traverse_nodes
=
[
&
visited_nodes
,
&
q_nodes
](
const
std
::
vector
<
ir
::
Node
*>
&
nodes
)
{
std
::
copy_if
(
nodes
.
begin
(),
nodes
.
end
(),
std
::
back_inserter
(
q_nodes
),
[
&
visited_nodes
](
Node
*
node
)
{
return
!
visited_nodes
.
count
(
node
);
});
auto
traverse_nodes
=
[
&
visited_nodes
,
&
q_nodes
,
&
q_set
](
const
std
::
vector
<
ir
::
Node
*>
&
nodes
)
{
for
(
auto
n
:
nodes
)
{
if
(
visited_nodes
.
count
(
n
)
==
0
&&
q_set
.
count
(
n
)
==
0
)
{
q_nodes
.
push_back
(
n
);
q_set
.
insert
(
n
);
}
}
};
while
(
visited_nodes
.
size
()
!=
nodes
.
size
())
{
if
(
!
q_nodes
.
empty
())
{
auto
cur_node
=
q_nodes
.
front
();
q_nodes
.
pop_front
();
q_set
.
erase
(
cur_node
);
visited_nodes
.
insert
(
cur_node
);
g_nodes
.
insert
(
cur_node
);
traverse_nodes
(
cur_node
->
inputs
);
...
...
@@ -146,6 +152,7 @@ size_t GraphNum(const Graph &graph) {
for
(
auto
&
n
:
nodes
)
{
if
(
visited_nodes
.
count
(
n
)
==
0
)
{
q_nodes
.
push_back
(
n
);
q_set
.
insert
(
n
);
break
;
}
}
...
...
paddle/fluid/framework/ir/multi_batch_merge_pass.cc
0 → 100644
浏览文件 @
e0a89503
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/multi_batch_merge_pass.h"
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
static
const
char
kNumRepeats
[]
=
"num_repeats"
;
typedef
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
SSAVarList
;
ir
::
Node
*
SameNameVar
(
std
::
unordered_set
<
ir
::
Node
*>
all
,
ir
::
Node
*
target
)
{
for
(
auto
n
:
all
)
{
if
(
target
->
IsVar
()
&&
target
->
Name
()
==
n
->
Name
())
{
return
n
;
}
}
return
nullptr
;
}
VarDesc
CopyVarDesc
(
VarDesc
*
var_desc
)
{
VarDesc
repeated_var
(
var_desc
->
Name
());
// copy other variable attributes
if
(
var_desc
->
GetType
()
!=
proto
::
VarType
::
READER
)
{
repeated_var
.
SetType
(
var_desc
->
GetType
());
repeated_var
.
SetShape
(
var_desc
->
GetShape
());
repeated_var
.
SetDataType
(
var_desc
->
GetDataType
());
repeated_var
.
SetLoDLevel
(
var_desc
->
GetLoDLevel
());
repeated_var
.
SetPersistable
(
var_desc
->
Persistable
());
}
else
{
// TODO(typhoonzero): copy reader var
}
return
repeated_var
;
}
VarDesc
UpdateGradVarDesc
(
VarDesc
*
var_desc
,
int
repeat
,
const
std
::
unordered_set
<
std
::
string
>&
grad_names
,
const
std
::
unordered_set
<
std
::
string
>&
bn_vars_need_rename
)
{
if
(
grad_names
.
find
(
var_desc
->
Name
())
!=
grad_names
.
end
()
||
bn_vars_need_rename
.
find
(
var_desc
->
Name
())
!=
bn_vars_need_rename
.
end
())
{
std
::
string
new_gname
=
string
::
Sprintf
(
"%s.repeat.%d"
,
var_desc
->
Name
(),
repeat
);
VarDesc
repeated_var
=
CopyVarDesc
(
var_desc
);
repeated_var
.
SetName
(
new_gname
);
VLOG
(
3
)
<<
"update "
<<
var_desc
->
Name
()
<<
" to repeat "
<<
repeat
;
return
repeated_var
;
}
return
*
var_desc
;
}
std
::
unique_ptr
<
Graph
>
BatchMergePass
::
ApplyImpl
(
std
::
unique_ptr
<
Graph
>
graph
)
const
{
int
num_repeats
=
Get
<
const
int
>
(
kNumRepeats
);
std
::
vector
<
Node
*>
forward_backward_ops
;
std
::
vector
<
Node
*>
optimize_ops
;
std
::
vector
<
Node
*>
lr_ops
;
// ops other than forward/backward/optimize
std
::
unordered_set
<
std
::
string
>
grad_names
;
std
::
vector
<
ir
::
Node
*>
nodes
=
TopologySortOperations
(
*
graph
);
auto
origin_nodes
=
graph
->
ReleaseNodes
();
VLOG
(
3
)
<<
"origin nodes count: "
<<
origin_nodes
.
size
();
ir
::
Graph
&
result
=
*
graph
;
// 1. record op nodes of different roles
for
(
auto
node
:
nodes
)
{
if
(
node
->
IsVar
())
continue
;
int
op_role
=
boost
::
get
<
int
>
(
node
->
Op
()
->
GetAttr
(
framework
::
OpProtoAndCheckerMaker
::
OpRoleAttrName
()));
if
((
op_role
==
static_cast
<
int
>
(
framework
::
OpRole
::
kForward
))
||
(
op_role
&
static_cast
<
int
>
(
framework
::
OpRole
::
kBackward
))
||
(
op_role
&
static_cast
<
int
>
(
framework
::
OpRole
::
kLoss
)))
{
forward_backward_ops
.
push_back
(
node
);
}
else
if
((
op_role
&
static_cast
<
int
>
(
framework
::
OpRole
::
kOptimize
))
||
(
op_role
&
static_cast
<
int
>
(
framework
::
OpRole
::
kDist
))
||
(
op_role
&
static_cast
<
int
>
(
framework
::
OpRole
::
kRPC
)))
{
optimize_ops
.
push_back
(
node
);
auto
op_role_var
=
node
->
Op
()
->
GetNullableAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
());
auto
op_role_vars
=
boost
::
get
<
std
::
vector
<
std
::
string
>>
(
op_role_var
);
for
(
size_t
i
=
0
;
i
<
op_role_vars
.
size
();
i
+=
2
)
{
grad_names
.
insert
(
op_role_vars
[
i
+
1
]);
}
}
else
if
(
op_role
&
static_cast
<
int
>
(
framework
::
OpRole
::
kLRSched
))
{
lr_ops
.
push_back
(
node
);
}
else
{
// NOLINT
PADDLE_THROW
(
"Invalid op_role: %d"
,
static_cast
<
int
>
(
op_role
));
}
}
// 2. copy forward backward
ir
::
Node
*
prev_repeat_last_op_node
=
nullptr
;
// record origin_grad -> repeated grad list map.
std
::
map
<
ir
::
Node
*
,
std
::
vector
<
ir
::
Node
*>>
grad_repeated_map
;
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
created
;
std
::
unordered_set
<
std
::
string
>
bn_vars_need_rename
;
for
(
int
i
=
0
;
i
<
num_repeats
;
++
i
)
{
std
::
unordered_set
<
ir
::
Node
*>
copied
;
for
(
size_t
node_idx
=
0
;
node_idx
<
forward_backward_ops
.
size
();
++
node_idx
)
{
auto
node
=
forward_backward_ops
[
node_idx
];
OpDesc
repeated_op
(
*
(
node
->
Op
()),
node
->
Op
()
->
Block
());
// 3. rename grad outputs to current repeat.
for
(
auto
outname
:
repeated_op
.
OutputArgumentNames
())
{
if
(
grad_names
.
find
(
outname
)
!=
grad_names
.
end
())
{
std
::
string
new_gname
=
string
::
Sprintf
(
"%s.repeat.%d"
,
outname
,
i
);
repeated_op
.
RenameOutput
(
outname
,
new_gname
);
}
}
// 3.5 let batch_norm ops use independent vars, note batch_norm_grad do
// not need this update
if
(
node
->
Name
()
==
"batch_norm"
)
{
// NOTE: assume bn op created by layers use save var as output mean and
// variance
std
::
string
new_mean_name
=
string
::
Sprintf
(
"%s.repeat.%d"
,
repeated_op
.
Input
(
"Mean"
)[
0
],
i
);
std
::
string
new_var_name
=
string
::
Sprintf
(
"%s.repeat.%d"
,
repeated_op
.
Input
(
"Variance"
)[
0
],
i
);
bn_vars_need_rename
.
insert
(
repeated_op
.
Input
(
"Mean"
)[
0
]);
bn_vars_need_rename
.
insert
(
repeated_op
.
Input
(
"Variance"
)[
0
]);
VLOG
(
3
)
<<
"renaming "
<<
repeated_op
.
Input
(
"Mean"
)[
0
]
<<
" to "
<<
new_mean_name
;
repeated_op
.
RenameInput
(
repeated_op
.
Input
(
"Mean"
)[
0
],
new_mean_name
);
repeated_op
.
RenameInput
(
repeated_op
.
Input
(
"Variance"
)[
0
],
new_var_name
);
repeated_op
.
RenameOutput
(
repeated_op
.
Output
(
"MeanOut"
)[
0
],
new_mean_name
);
repeated_op
.
RenameOutput
(
repeated_op
.
Output
(
"VarianceOut"
)[
0
],
new_var_name
);
}
// 3.9 do copy
auto
repeated_node
=
result
.
CreateOpNode
(
&
repeated_op
);
copied
.
insert
(
node
);
// 4. add deps between repeats
if
(
node_idx
==
forward_backward_ops
.
size
()
-
1
)
{
prev_repeat_last_op_node
=
repeated_node
;
}
if
(
node_idx
==
0
&&
prev_repeat_last_op_node
)
{
auto
*
depvar
=
result
.
CreateControlDepVar
();
prev_repeat_last_op_node
->
outputs
.
push_back
(
depvar
);
depvar
->
inputs
.
push_back
(
prev_repeat_last_op_node
);
repeated_node
->
inputs
.
push_back
(
depvar
);
depvar
->
outputs
.
push_back
(
repeated_node
);
}
for
(
auto
in_node
:
node
->
inputs
)
{
if
(
in_node
->
IsCtrlVar
())
{
continue
;
}
ir
::
Node
*
var
=
nullptr
;
auto
updated_var
=
UpdateGradVarDesc
(
in_node
->
Var
(),
i
,
grad_names
,
bn_vars_need_rename
);
// should be initialized by startup, how to initilize tensor in the
// scope?
if
(
node
->
Name
()
==
"batch_norm"
&&
bn_vars_need_rename
.
find
(
in_node
->
Name
())
!=
bn_vars_need_rename
.
end
())
{
// Create bn mean/variance for each repeat
var
=
result
.
CreateVarNode
(
&
updated_var
);
created
[
updated_var
.
Name
()].
push_back
(
var
);
copied
.
insert
(
in_node
);
repeated_node
->
inputs
.
push_back
(
var
);
var
->
outputs
.
push_back
(
repeated_node
);
continue
;
}
// for other ops
if
(
in_node
->
inputs
.
empty
()
&&
i
>
0
)
{
// do not copy head vars (inputs, params) in repeats > 0
var
=
created
.
at
(
in_node
->
Name
()).
back
();
}
else
{
if
(
copied
.
find
(
in_node
)
==
copied
.
end
())
{
var
=
result
.
CreateVarNode
(
&
updated_var
);
if
(
grad_names
.
find
(
in_node
->
Var
()
->
Name
())
!=
grad_names
.
end
())
{
grad_repeated_map
[
in_node
].
push_back
(
var
);
}
copied
.
insert
(
in_node
);
created
[
updated_var
.
Name
()].
push_back
(
var
);
}
else
{
var
=
created
.
at
(
updated_var
.
Name
()).
back
();
}
}
repeated_node
->
inputs
.
push_back
(
var
);
var
->
outputs
.
push_back
(
repeated_node
);
}
for
(
auto
out_node
:
node
->
outputs
)
{
if
(
out_node
->
IsCtrlVar
())
{
continue
;
}
ir
::
Node
*
var
=
nullptr
;
auto
updated_var
=
UpdateGradVarDesc
(
out_node
->
Var
(),
i
,
grad_names
,
bn_vars_need_rename
);
if
(
copied
.
find
(
out_node
)
==
copied
.
end
())
{
var
=
result
.
CreateVarNode
(
&
updated_var
);
if
(
grad_names
.
find
(
out_node
->
Var
()
->
Name
())
!=
grad_names
.
end
())
{
grad_repeated_map
[
out_node
].
push_back
(
var
);
}
copied
.
insert
(
out_node
);
created
[
updated_var
.
Name
()].
push_back
(
var
);
}
else
{
var
=
created
.
at
(
updated_var
.
Name
()).
back
();
}
repeated_node
->
outputs
.
push_back
(
var
);
var
->
inputs
.
push_back
(
repeated_node
);
}
}
}
// 5. create GRAD merge op node
for
(
auto
kv
:
grad_repeated_map
)
{
OpDesc
sum_op
;
sum_op
.
SetType
(
"sum"
);
std
::
vector
<
std
::
string
>
repeated_grad_names
;
for
(
auto
r
:
kv
.
second
)
{
repeated_grad_names
.
push_back
(
r
->
Var
()
->
Name
());
}
sum_op
.
SetInput
(
"X"
,
repeated_grad_names
);
sum_op
.
SetOutput
(
"Out"
,
{
kv
.
first
->
Var
()
->
Name
()});
sum_op
.
SetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
(),
static_cast
<
int
>
(
OpRole
::
kBackward
));
auto
sum_op_node
=
result
.
CreateOpNode
(
&
sum_op
);
for
(
auto
r
:
kv
.
second
)
{
sum_op_node
->
inputs
.
push_back
(
r
);
r
->
outputs
.
push_back
(
sum_op_node
);
}
auto
sum_out_var_node
=
result
.
CreateVarNode
(
kv
.
first
->
Var
());
sum_op_node
->
outputs
.
push_back
(
sum_out_var_node
);
sum_out_var_node
->
inputs
.
push_back
(
sum_op_node
);
created
[
sum_out_var_node
->
Name
()].
push_back
(
sum_out_var_node
);
OpDesc
scale_op
;
scale_op
.
SetType
(
"scale"
);
scale_op
.
SetInput
(
"X"
,
{
sum_out_var_node
->
Var
()
->
Name
()});
// NOTE: inplace scale.
scale_op
.
SetOutput
(
"Out"
,
{
sum_out_var_node
->
Var
()
->
Name
()});
scale_op
.
SetAttr
(
"scale"
,
static_cast
<
float
>
(
1.0
f
/
num_repeats
));
scale_op
.
SetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
(),
static_cast
<
int
>
(
OpRole
::
kBackward
));
auto
scale_op_node
=
result
.
CreateOpNode
(
&
scale_op
);
scale_op_node
->
inputs
.
push_back
(
sum_out_var_node
);
sum_out_var_node
->
outputs
.
push_back
(
scale_op_node
);
auto
scale_out_var_node
=
result
.
CreateVarNode
(
sum_out_var_node
->
Var
());
scale_op_node
->
outputs
.
push_back
(
scale_out_var_node
);
scale_out_var_node
->
inputs
.
push_back
(
scale_op_node
);
created
[
scale_out_var_node
->
Name
()].
push_back
(
scale_out_var_node
);
}
// 6. add optimize ops
{
auto
copy_node
=
[
&
result
,
&
created
](
ir
::
Node
*
node
)
{
auto
op_node
=
result
.
CreateOpNode
(
node
->
Op
());
// copy op ins/outs
// NOTE: for send/recv ops, the OpDesc uses ctrldepvar to describe
// dependencies, so create those depvars if OpDesc have in/outs.
for
(
auto
in_node
:
node
->
inputs
)
{
if
(
in_node
->
IsCtrlVar
()
&&
!
in_node
->
Var
())
{
continue
;
}
ir
::
Node
*
var
=
nullptr
;
if
(
created
.
find
(
in_node
->
Name
())
==
created
.
end
())
{
var
=
result
.
CreateVarNode
(
in_node
->
Var
());
created
[
in_node
->
Name
()].
push_back
(
var
);
}
else
{
var
=
created
.
at
(
in_node
->
Name
()).
back
();
}
op_node
->
inputs
.
push_back
(
var
);
var
->
outputs
.
push_back
(
op_node
);
}
for
(
auto
out_node
:
node
->
outputs
)
{
if
(
out_node
->
IsCtrlVar
()
&&
!
out_node
->
Var
())
{
continue
;
}
auto
var
=
result
.
CreateVarNode
(
out_node
->
Var
());
created
[
out_node
->
Name
()].
push_back
(
var
);
op_node
->
outputs
.
push_back
(
var
);
var
->
inputs
.
push_back
(
op_node
);
}
};
for
(
auto
node
:
lr_ops
)
{
copy_node
(
node
);
}
for
(
auto
node
:
optimize_ops
)
{
copy_node
(
node
);
}
}
result
.
ResolveHazard
(
created
);
return
graph
;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
REGISTER_PASS
(
multi_batch_merge_pass
,
paddle
::
framework
::
ir
::
BatchMergePass
)
.
RequirePassAttr
(
paddle
::
framework
::
ir
::
kNumRepeats
);
paddle/fluid/framework/ir/multi_batch_merge_pass.h
0 → 100644
浏览文件 @
e0a89503
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// BatchMergePass is used to copy forward and backward ops for several
// times to run several batches to simulate large batch size training
// as if we have more than 1 GPUs.
// User can define how many batches to run, gradients will be merged
// through those repeats, and then do optimization using merged gradients.
// This pass is extremely useful when doing large batch-size distributed
// sync training, we can simulate even large batch size as if we have more
// GPUs.
class
BatchMergePass
:
public
Pass
{
public:
virtual
~
BatchMergePass
()
{}
protected:
std
::
unique_ptr
<
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
Graph
>
graph
)
const
override
;
};
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/node.h
浏览文件 @
e0a89503
...
...
@@ -44,6 +44,7 @@ class Node {
return
op_desc_
.
get
();
}
// Please don't use this API!
int
id
()
const
{
return
id_
;
}
bool
IsOp
()
const
{
return
type_
==
Type
::
kOperation
;
}
...
...
@@ -92,6 +93,7 @@ class Node {
Node
()
=
delete
;
static
int
count_
;
// Please don't use this API or make this public.
static
void
ResetId
()
{
count_
=
0
;
}
DISABLE_COPY_AND_ASSIGN
(
Node
);
};
...
...
paddle/fluid/framework/lod_tensor_array.h
浏览文件 @
e0a89503
...
...
@@ -18,6 +18,82 @@ limitations under the License. */
namespace
paddle
{
namespace
framework
{
// NOTE The vector<LoDTensor> can't be replaced with the class LoDTensorArray
// directly, because there are many vector<LoDTensor> used accross the project,
// and some of them are treated as LoDTensorArray.
#if !defined(PADDLE_ON_INFERENCE)
using
LoDTensorArray
=
std
::
vector
<
LoDTensor
>
;
}
#else // !PADDLE_ON_INFERENCE
#pragma message "LoDTensorArray is replaced with the inference one."
/*
* A LoDTensorArray which will not deallocate buffer when resized, fix the data
* diff in inference, and more performance friendly in the concurrency
* scenerios.
*/
class
LoDTensorArray
{
public:
LoDTensorArray
()
=
default
;
using
iterator
=
std
::
vector
<
LoDTensor
>::
iterator
;
using
const_iterator
=
std
::
vector
<
LoDTensor
>::
const_iterator
;
const_iterator
begin
()
const
{
return
array_
.
begin
();
}
const_iterator
end
()
const
{
return
array_
.
begin
()
+
size_
;
}
iterator
begin
()
{
return
array_
.
begin
();
}
iterator
end
()
{
return
array_
.
begin
()
+
size_
;
}
void
push_back
(
const
LoDTensor
&
x
)
{
if
(
size_
<
array_
.
size
())
{
array_
[
size_
++
]
=
x
;
}
else
{
array_
.
push_back
(
x
);
++
size_
;
}
}
void
resize
(
size_t
size
)
{
if
(
array_
.
size
()
<
size
)
{
array_
.
resize
(
size
);
}
size_
=
size
;
}
void
emplace_back
()
{
array_
.
emplace_back
();
}
void
emplace_back
(
LoDTensor
&&
x
)
{
array_
.
emplace_back
(
std
::
move
(
x
));
}
LoDTensor
&
back
()
{
return
array_
.
back
();
}
size_t
space
()
const
{
return
array_
.
size
();
}
void
reserve
(
size_t
size
)
{
// Naive warning to tell user this array might be to large. The memory and
// buffer used by this TensorArray will not be deleted during the training
// and inference phase, so attention not to make it expand too long.
if
(
size
>
800UL
)
{
LOG
(
WARNING
)
<<
"TensorArray has more than 800 items"
;
}
array_
.
reserve
(
size
);
}
bool
empty
()
const
{
return
size_
==
0UL
;
}
void
clear
()
{
size_
=
0UL
;
}
LoDTensor
&
operator
[](
size_t
id
)
{
return
array_
[
id
];
}
const
LoDTensor
&
operator
[](
size_t
id
)
const
{
return
array_
[
id
];
}
LoDTensor
&
at
(
size_t
id
)
{
return
array_
.
at
(
id
);
}
const
LoDTensor
&
at
(
size_t
id
)
const
{
return
array_
.
at
(
id
);
}
size_t
size
()
const
{
return
size_
;
}
private:
size_t
size_
{
0
};
std
::
vector
<
LoDTensor
>
array_
;
};
#endif // !PADDLE_ON_INFERENCE
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/op_desc.h
浏览文件 @
e0a89503
...
...
@@ -121,10 +121,6 @@ class OpDesc {
BlockDesc
*
Block
()
{
return
this
->
block_
;
}
const
BlockDesc
&
BlockRef
()
const
{
return
*
this
->
block_
;
}
void
SetBlock
(
BlockDesc
*
block
)
{
this
->
block_
=
block
;
}
private:
template
<
typename
MapType
>
static
std
::
vector
<
typename
MapType
::
key_type
>
MapKeys
(
const
MapType
&
map
)
{
...
...
paddle/fluid/framework/op_proto_maker.h
浏览文件 @
e0a89503
...
...
@@ -28,12 +28,12 @@ enum class OpRole {
kBackward
=
0x0001
,
kOptimize
=
0x0002
,
// RPC role is for send/recv releated op
kRPC
=
0x000
3
,
kRPC
=
0x000
4
,
// Dist role is for split_byref/split_selected_rows/concat
// used for distributed training.
kDist
=
0x000
4
,
kDist
=
0x000
8
,
// Tag all learning rate scheduler operators.
kLRSched
=
0x00
05
,
kLRSched
=
0x00
16
,
kLoss
=
0x0100
,
// The default value of op's role. This should be only used for unittests and
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
e0a89503
...
...
@@ -109,18 +109,9 @@ ParallelExecutor::ParallelExecutor(
if
(
member_
->
local_scopes_
.
size
()
!=
1
&&
local_scopes
.
empty
())
{
BCastParamsToDevices
(
bcast_vars
);
}
// Startup Program has been run. All local scopes has correct parameters.
// Startup Program has been run. All local scopes has correct parameters.
// Step 2. Create vars in each scope;
std
::
vector
<
details
::
VariableInfo
>
var_infos
;
for
(
auto
*
var
:
main_program
.
Block
(
0
).
AllVars
())
{
var_infos
.
emplace_back
();
var_infos
.
back
().
name_
=
var
->
Name
();
var_infos
.
back
().
type_
=
var
->
GetType
();
var_infos
.
back
().
persistable_
=
var
->
Persistable
();
}
// Step 3. Convert main_program to SSA form and dependency graph. Also, insert
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
#ifdef PADDLE_WITH_CUDA
std
::
unique_ptr
<
ir
::
Graph
>
graph
=
build_strategy
.
Apply
(
...
...
@@ -156,6 +147,23 @@ ParallelExecutor::ParallelExecutor(
params
,
member_
->
local_scopes_
,
member_
->
use_cuda_
);
#endif
// Step 3. Create vars in each scope. Passes may also create new vars.
// skip control vars and empty vars
std
::
vector
<
details
::
VariableInfo
>
var_infos
;
for
(
auto
&
node
:
graph
->
Nodes
())
{
if
(
node
->
IsVar
()
&&
!
node
->
IsCtrlVar
()
&&
node
->
Var
())
{
var_infos
.
emplace_back
();
var_infos
.
back
().
name_
=
node
->
Var
()
->
Name
();
var_infos
.
back
().
type_
=
node
->
Var
()
->
GetType
();
var_infos
.
back
().
persistable_
=
node
->
Var
()
->
Persistable
();
}
}
// If the loss_var_name is given, the number of graph should be only one.
if
(
loss_var_name
.
size
())
{
PADDLE_ENFORCE_EQ
(
ir
::
GraphNum
(
*
graph
),
1
,
"The number of graph should be only one"
);
}
if
(
exec_strategy
.
type_
==
ExecutionStrategy
::
kDefault
)
{
member_
->
executor_
.
reset
(
new
details
::
ThreadedSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
places
,
std
::
move
(
graph
)));
...
...
paddle/fluid/framework/scope.h
浏览文件 @
e0a89503
...
...
@@ -78,6 +78,8 @@ class Scope {
/// Drop all kids scopes belonged to this scope.
void
DropKids
();
std
::
list
<
Scope
*>&
kids
()
const
{
return
kids_
;
}
/// Find if a scope exists in the kid scopes
bool
HasKid
(
const
Scope
*
scope
)
const
;
...
...
paddle/fluid/inference/CMakeLists.txt
浏览文件 @
e0a89503
...
...
@@ -30,7 +30,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
endif
()
# Create static library
cc_library
(
paddle_fluid DEPS
${
fluid_modules
}
${
STATIC_INFERENCE_APIS
}
zero_copy_tensor
)
cc_library
(
paddle_fluid DEPS
${
fluid_modules
}
${
STATIC_INFERENCE_APIS
}
zero_copy_tensor
reset_tensor_array
)
if
(
NOT APPLE
)
# TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
...
...
@@ -40,7 +40,7 @@ endif()
# Create shared library
cc_library
(
paddle_fluid_shared SHARED SRCS
${
SHARED_INFERENCE_SRCS
}
DEPS
${
fluid_modules
}
paddle_fluid_api
)
DEPS
${
fluid_modules
}
paddle_fluid_api
reset_tensor_array
)
set_target_properties
(
paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid
)
if
(
NOT APPLE
)
...
...
paddle/fluid/inference/analysis/analyzer.cc
浏览文件 @
e0a89503
...
...
@@ -107,6 +107,9 @@ void Analyzer::Run(Argument* argument) {
passes
.
push_back
(
"mkldnn_placement_pass"
);
}
#endif
// infer_clean_graph_pass should be the first default pass
// after mkldnn_placement_pass.
passes
.
push_back
(
"infer_clean_graph_pass"
);
for
(
auto
&
pass
:
ir_passes_
)
{
if
(
!
disabled_ir_passes_
.
count
(
pass
))
{
passes
.
push_back
(
pass
);
...
...
paddle/fluid/inference/analysis/analyzer.h
浏览文件 @
e0a89503
...
...
@@ -67,7 +67,6 @@ class Analyzer : public OrderedRegistry<PassManager> {
// larger fusion.
const
std
::
vector
<
std
::
string
>
all_ir_passes_
{{
// Manual update the passes here.
"infer_clean_graph_pass"
,
//
"attention_lstm_fuse_pass"
,
//
"seqconv_eltadd_relu_fuse_pass"
,
//
"embedding_fc_lstm_fuse_pass"
,
//
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
e0a89503
...
...
@@ -18,7 +18,8 @@ if(APPLE)
endif
(
APPLE
)
set
(
inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor
${
GLOB_PASS_LIB
}
)
set
(
inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor
${
GLOB_PASS_LIB
}
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
set
(
inference_deps
${
inference_deps
}
paddle_inference_tensorrt_subgraph_engine analysis_predictor
)
...
...
@@ -31,10 +32,17 @@ function(inference_api_test TARGET_NAME)
set
(
multiValueArgs ARGS
)
cmake_parse_arguments
(
inference_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
cc_test
(
${
TARGET_NAME
}
SRCS
${
inference_test_SRC
}
DEPS
"
${
inference_deps
}
"
ARGS --dirname=
${
PYTHON_TESTS_DIR
}
/book/
)
if
(
WITH_GPU
)
cc_test
(
${
TARGET_NAME
}
SRCS
${
inference_test_SRC
}
DEPS
"
${
inference_deps
}
"
ARGS --dirname=
${
PYTHON_TESTS_DIR
}
/book/ --fraction_of_gpu_memory_to_use=0.15
)
else
()
cc_test
(
${
TARGET_NAME
}
SRCS
${
inference_test_SRC
}
DEPS
"
${
inference_deps
}
"
ARGS --dirname=
${
PYTHON_TESTS_DIR
}
/book/
)
endif
()
if
(
inference_test_ARGS
)
set_tests_properties
(
${
TARGET_NAME
}
PROPERTIES DEPENDS
"
${
inference_test_ARGS
}
"
)
...
...
@@ -42,7 +50,8 @@ function(inference_api_test TARGET_NAME)
endif
(
WITH_TESTING
)
endfunction
(
inference_api_test
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope
)
cc_library
(
reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor
)
cc_library
(
zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api
)
cc_library
(
zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api
)
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
e0a89503
...
...
@@ -82,6 +82,7 @@ bool AnalysisPredictor::Init(
// Get the feed_target_names and fetch_target_names
PrepareFeedFetch
();
return
true
;
}
...
...
@@ -109,6 +110,10 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
return
false
;
}
VLOG
(
3
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_
.
CollectTensorArrays
(
scope_
.
get
());
tensor_array_batch_cleaner_
.
ResetTensorArray
();
return
true
;
}
...
...
@@ -322,6 +327,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
bool
AnalysisPredictor
::
ZeroCopyRun
()
{
executor_
->
Run
();
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_
.
CollectTensorArrays
(
scope_
.
get
());
tensor_array_batch_cleaner_
.
ResetTensorArray
();
return
true
;
}
...
...
paddle/fluid/inference/api/analysis_predictor.h
浏览文件 @
e0a89503
...
...
@@ -18,6 +18,7 @@
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"
...
...
@@ -88,6 +89,7 @@ class AnalysisPredictor : public PaddlePredictor {
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, so cache them.
std
::
vector
<
framework
::
LoDTensor
>
feed_tensors_
;
details
::
TensorArrayBatchCleaner
tensor_array_batch_cleaner_
;
};
}
// namespace paddle
paddle/fluid/inference/api/api_impl.cc
浏览文件 @
e0a89503
...
...
@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
...
...
@@ -157,6 +158,10 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
return
false
;
}
VLOG
(
3
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_
.
CollectTensorArrays
(
scope_
.
get
());
tensor_array_batch_cleaner_
.
ResetTensorArray
();
return
true
;
}
...
...
paddle/fluid/inference/api/api_impl.h
浏览文件 @
e0a89503
...
...
@@ -26,11 +26,11 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/init.h"
...
...
@@ -77,6 +77,7 @@ class NativePaddlePredictor : public PaddlePredictor {
std
::
vector
<
framework
::
OpDesc
*>
fetchs_
;
// Do not use unique_ptr, use parent scope to delete
framework
::
Scope
*
sub_scope_
{
nullptr
};
details
::
TensorArrayBatchCleaner
tensor_array_batch_cleaner_
;
};
}
// namespace paddle
paddle/fluid/inference/api/demo_ci/CMakeLists.txt
浏览文件 @
e0a89503
...
...
@@ -52,6 +52,7 @@ include_directories("${PADDLE_LIB}")
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/protobuf/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/glog/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/gflags/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/xxhash/include"
)
if
(
NOT WIN32
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/snappy/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/snappystream/include"
)
...
...
@@ -61,8 +62,8 @@ endif(NOT WIN32)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/boost"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/eigen3"
)
if
(
NOT WIN32
)
if
(
USE_TENSORRT AND WITH_GPU
)
if
(
NOT WIN32
)
if
(
USE_TENSORRT AND WITH_GPU
)
include_directories
(
"
${
TENSORRT_INCLUDE_DIR
}
"
)
link_directories
(
"
${
TENSORRT_LIB_DIR
}
"
)
endif
()
...
...
@@ -77,13 +78,14 @@ endif(NOT WIN32)
link_directories
(
"
${
PADDLE_LIB
}
/third_party/install/protobuf/lib"
)
link_directories
(
"
${
PADDLE_LIB
}
/third_party/install/glog/lib"
)
link_directories
(
"
${
PADDLE_LIB
}
/third_party/install/gflags/lib"
)
link_directories
(
"
${
PADDLE_LIB
}
/third_party/install/xxhash/lib"
)
link_directories
(
"
${
PADDLE_LIB
}
/paddle/lib"
)
add_executable
(
${
DEMO_NAME
}
${
DEMO_NAME
}
.cc
)
if
(
WITH_MKL
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/mklml/include"
)
set
(
MATH_LIB
${
PADDLE_LIB
}
/third_party/install/mklml/lib/libmklml_intel
${
CMAKE_SHARED_LIBRARY_SUFFIX
}
set
(
MATH_LIB
${
PADDLE_LIB
}
/third_party/install/mklml/lib/libmklml_intel
${
CMAKE_SHARED_LIBRARY_SUFFIX
}
${
PADDLE_LIB
}
/third_party/install/mklml/lib/libiomp5
${
CMAKE_SHARED_LIBRARY_SUFFIX
}
)
set
(
MKLDNN_PATH
"
${
PADDLE_LIB
}
/third_party/install/mkldnn"
)
if
(
EXISTS
${
MKLDNN_PATH
}
)
...
...
@@ -107,7 +109,7 @@ if (NOT WIN32)
set
(
EXTERNAL_LIB
"-lrt -ldl -lpthread"
)
set
(
DEPS
${
DEPS
}
${
MATH_LIB
}
${
MKLDNN_LIB
}
glog gflags protobuf snappystream snappy z
glog gflags protobuf snappystream snappy z
xxhash
${
EXTERNAL_LIB
}
)
else
()
set
(
DEPS
${
DEPS
}
...
...
@@ -120,7 +122,7 @@ endif(NOT WIN32)
if
(
WITH_GPU
)
if
(
NOT WIN32
)
if
(
USE_TENSORRT
)
if
(
USE_TENSORRT
)
set
(
DEPS
${
DEPS
}
${
TENSORRT_LIB_DIR
}
/libnvinfer
${
CMAKE_STATIC_LIBRARY_SUFFIX
}
)
set
(
DEPS
${
DEPS
}
${
TENSORRT_LIB_DIR
}
/libnvinfer_plugin
${
CMAKE_STATIC_LIBRARY_SUFFIX
}
)
endif
()
...
...
paddle/fluid/inference/api/demo_ci/run.sh
浏览文件 @
e0a89503
...
...
@@ -16,7 +16,7 @@ if [ $2 == ON ]; then
fi
if
[
$3
==
ON
]
;
then
use_gpu_list
=
'true false'
else
else
use_gpu_list
=
'false'
fi
...
...
@@ -60,7 +60,8 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_MKL
=
$TURN_ON_MKL
\
-DDEMO_NAME
=
simple_on_word2vec
\
-DWITH_GPU
=
$TEST_GPU_CPU
\
-DWITH_STATIC_LIB
=
$WITH_STATIC_LIB
-DWITH_STATIC_LIB
=
$WITH_STATIC_LIB
\
-DON_INFER
=
ON
make
-j
word2vec_model
=
${
PADDLE_ROOT
}
'/build/python/paddle/fluid/tests/book/word2vec.inference.model'
if
[
-d
$word2vec_model
]
;
then
...
...
@@ -80,10 +81,11 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_MKL
=
$TURN_ON_MKL
\
-DDEMO_NAME
=
vis_demo
\
-DWITH_GPU
=
$TEST_GPU_CPU
\
-DWITH_STATIC_LIB
=
$WITH_STATIC_LIB
-DWITH_STATIC_LIB
=
$WITH_STATIC_LIB
\
-DON_INFER
=
ON
make
-j
for
use_gpu
in
$use_gpu_list
;
do
for
vis_demo_name
in
$vis_demo_list
;
do
for
vis_demo_name
in
$vis_demo_list
;
do
./vis_demo
\
--modeldir
=
$DATA_DIR
/
$vis_demo_name
/model
\
--data
=
$DATA_DIR
/
$vis_demo_name
/data.txt
\
...
...
@@ -95,7 +97,7 @@ for WITH_STATIC_LIB in ON OFF; do
fi
done
done
# --------tensorrt mobilenet------
if
[
$USE_TENSORRT
==
ON
-a
$TEST_GPU_CPU
==
ON
]
;
then
rm
-rf
*
...
...
@@ -106,8 +108,9 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_STATIC_LIB
=
$WITH_STATIC_LIB
\
-DUSE_TENSORRT
=
$USE_TENSORRT
\
-DTENSORRT_INCLUDE_DIR
=
$TENSORRT_INCLUDE_DIR
\
-DTENSORRT_LIB_DIR
=
$TENSORRT_LIB_DIR
make
-j
-DTENSORRT_LIB_DIR
=
$TENSORRT_LIB_DIR
\
-DON_INFER
=
ON
make
-j
./trt_mobilenet_demo
\
--modeldir
=
$DATA_DIR
/mobilenet/model
\
--data
=
$DATA_DIR
/mobilenet/data.txt
\
...
...
paddle/fluid/inference/api/details/reset_tensor_array.cc
0 → 100644
浏览文件 @
e0a89503
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
namespace
paddle
{
namespace
details
{
// Should be called after the parameters are loaded.
void
TensorArrayBatchCleaner
::
CollectTensorArrays
(
framework
::
Scope
*
scope
)
{
if
(
flag_
)
{
for
(
auto
&
var_name
:
scope
->
LocalVarNames
())
{
auto
*
var
=
scope
->
FindVar
(
var_name
);
// TODO(Superjomn) should avoid the case when a TensorArray is a
// parameter.
if
(
var_name
==
"feed"
||
var_name
==
"fetch"
)
continue
;
if
(
var
->
Type
()
==
typeid
(
framework
::
LoDTensorArray
))
{
VLOG
(
4
)
<<
"collect "
<<
var_name
;
arrays_
.
push_back
(
var
->
GetMutable
<
framework
::
LoDTensorArray
>
());
}
}
for
(
auto
*
kid
:
scope
->
kids
())
{
CollectTensorArrays
(
kid
);
}
VLOG
(
3
)
<<
"Collect "
<<
arrays_
.
size
()
<<
" arrays"
;
flag_
=
false
;
}
}
// Should be called when `Run` finished.
void
TensorArrayBatchCleaner
::
ResetTensorArray
()
{
for
(
auto
*
arr
:
arrays_
)
{
arr
->
clear
();
}
}
}
// namespace details
}
// namespace paddle
paddle/fluid/inference/api/details/reset_tensor_array.h
0 → 100644
浏览文件 @
e0a89503
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
namespace
paddle
{
namespace
details
{
// Clean the TensorArray each batch to make the behavior the same with the
// training phase.
struct
TensorArrayBatchCleaner
{
// Fix the tensor array not clear in the inference scenarios.
void
CollectTensorArrays
(
framework
::
Scope
*
scope
);
void
ResetTensorArray
();
private:
bool
flag_
{
true
};
std
::
vector
<
framework
::
LoDTensorArray
*>
arrays_
;
};
}
// namespace details
}
// namespace paddle
paddle/fluid/inference/api/helper.h
浏览文件 @
e0a89503
...
...
@@ -160,7 +160,8 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double
latency
,
int
epoch
=
1
)
{
LOG
(
INFO
)
<<
"====== batch_size: "
<<
batch_size
<<
", repeat: "
<<
repeat
<<
", threads: "
<<
num_threads
<<
", thread id: "
<<
tid
<<
", latency: "
<<
latency
<<
"ms ======"
;
<<
", latency: "
<<
latency
<<
"ms, fps: "
<<
1
/
(
latency
/
1000.
f
)
<<
" ======"
;
if
(
epoch
>
1
)
{
int
samples
=
batch_size
*
epoch
;
LOG
(
INFO
)
<<
"====== sample number: "
<<
samples
...
...
paddle/fluid/inference/api/paddle_inference_api.h
浏览文件 @
e0a89503
...
...
@@ -124,7 +124,7 @@ class ZeroCopyTensor {
std
::
vector
<
std
::
vector
<
size_t
>>
lod
()
const
;
protected:
ZeroCopyTensor
(
void
*
scope
)
:
scope_
{
scope
}
{}
explicit
ZeroCopyTensor
(
void
*
scope
)
:
scope_
{
scope
}
{}
void
SetName
(
const
std
::
string
&
name
)
{
name_
=
name
;
}
void
*
FindTensor
()
const
;
...
...
@@ -259,12 +259,6 @@ struct AnalysisConfig : public NativeConfig {
kExclude
// Specify the disabled passes in `ir_passes`.
};
void
SetIncludeMode
()
{
ir_mode
=
IrPassMode
::
kInclude
;
// this pass has to be run at the beginning of all fuse passes
ir_passes
=
{
"infer_clean_graph_pass"
};
}
// Determine whether to perform graph optimization.
bool
enable_ir_optim
=
true
;
// Manually determine the IR passes to run.
...
...
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
浏览文件 @
e0a89503
...
...
@@ -228,6 +228,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
TEST
(
Analyzer_rnn1
,
profile
)
{
contrib
::
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
cfg
.
use_gpu
=
false
;
std
::
vector
<
PaddleTensor
>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
...
...
paddle/fluid/inference/tests/api/tester_helper.h
浏览文件 @
e0a89503
...
...
@@ -139,6 +139,9 @@ void TestMultiThreadPrediction(
}
for
(
int
tid
=
0
;
tid
<
num_threads
;
++
tid
)
{
threads
.
emplace_back
([
&
,
tid
]()
{
#ifdef PADDLE_WITH_MKLDNN
platform
::
set_cur_thread_id
(
static_cast
<
int
>
(
tid
)
+
1
);
#endif
// Each thread should have local inputs and outputs.
// The inputs of each thread are all the same.
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
inputs_tid
=
inputs
;
...
...
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
e0a89503
...
...
@@ -268,6 +268,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
else
()
set
(
DEPS_OPS
${
DEPS_OPS
}
tensorrt_engine_op
)
endif
()
op_library
(
hash_op DEPS xxhash
)
op_library
(
clip_by_norm_op DEPS selected_rows_functor selected_rows
)
op_library
(
sum_op DEPS selected_rows_functor
)
op_library
(
sgd_op DEPS selected_rows_functor
)
...
...
paddle/fluid/operators/beam_search_decode_op.cc
浏览文件 @
e0a89503
...
...
@@ -79,6 +79,9 @@ struct BeamSearchDecodeFunctor {
bool
tensor_on_gpu_
;
size_t
beam_size_
;
int
end_id_
;
// TODO(Superjomn) Here might result serious performance issue in the
// concurrency
// scenarios.
const
LoDTensorArray
&
step_ids_origin_
;
const
LoDTensorArray
&
step_scores_origin_
;
LoDTensorArray
step_ids_
=
LoDTensorArray
();
...
...
paddle/fluid/operators/hash_op.cc
0 → 100644
浏览文件 @
e0a89503
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/hash_op.h"
#include <string>
#include <vector>
namespace
paddle
{
namespace
operators
{
class
HashOp
:
public
framework
::
OperatorWithKernel
{
public:
HashOp
(
const
std
::
string
&
type
,
const
framework
::
VariableNameMap
&
inputs
,
const
framework
::
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
)
:
OperatorWithKernel
(
type
,
inputs
,
outputs
,
attrs
)
{}
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of HashOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of HashOp should not be null."
);
auto
dims
=
ctx
->
GetInputDim
(
"X"
);
PADDLE_ENFORCE_EQ
(
dims
.
size
(),
2UL
,
"The input of hash_op's dimensions must be 2"
);
std
::
vector
<
int64_t
>
out_dims
;
out_dims
.
reserve
(
dims
.
size
()
+
1
);
// copy all dims except the last one
for
(
size_t
i
=
0u
;
i
!=
dims
.
size
()
-
1
;
++
i
)
{
out_dims
.
emplace_back
(
dims
[
i
]);
}
int
num_hash
=
ctx
->
Attrs
().
Get
<
int
>
(
"num_hash"
);
out_dims
.
emplace_back
(
num_hash
);
// keep the last dim to 1
out_dims
.
emplace_back
(
1
);
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
out_dims
));
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
};
class
HashOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"(Tensor) Input tensor of scale operator."
);
AddOutput
(
"Out"
,
"(Tensor) Output tensor of scale operator."
);
AddComment
(
R"DOC(
**Hash Operator**
$$Out = scale * X$$
)DOC"
);
AddAttr
<
int
>
(
"num_hash"
,
""
).
SetDefault
(
1
);
AddAttr
<
int
>
(
"mod_by"
,
""
).
SetDefault
(
100000
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_WITHOUT_GRADIENT
(
hash
,
ops
::
HashOp
,
ops
::
HashOpMaker
);
REGISTER_OP_CPU_KERNEL
(
hash
,
ops
::
HashKerel
<
int
>
,
ops
::
HashKerel
<
int64_t
>
);
paddle/fluid/operators/hash_op.h
0 → 100644
浏览文件 @
e0a89503
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
extern
"C"
{
#include <xxhash.h>
}
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
// template <typename DeviceContext, typename T>
template
<
typename
T
>
class
HashKerel
:
public
framework
::
OpKernel
<
T
>
{
public:
virtual
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
{
auto
*
out_t
=
context
.
Output
<
framework
::
LoDTensor
>
(
"Out"
);
auto
*
in_t
=
context
.
Input
<
framework
::
LoDTensor
>
(
"X"
);
int
mod_by
=
context
.
Attr
<
int
>
(
"mod_by"
);
int
num_hash
=
context
.
Attr
<
int
>
(
"num_hash"
);
auto
*
output
=
out_t
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
in_dims
=
in_t
->
dims
();
auto
in_lod
=
in_t
->
lod
();
PADDLE_ENFORCE_EQ
(
static_cast
<
uint64_t
>
(
in_dims
[
0
]),
in_lod
[
0
].
back
(),
"The actual input data's size mismatched with LoD information."
);
auto
seq_length
=
in_dims
[
0
];
auto
last_dim
=
in_dims
[
in_dims
.
size
()
-
1
];
auto
*
input
=
in_t
->
data
<
T
>
();
for
(
int
idx
=
0
;
idx
<
seq_length
;
++
idx
)
{
for
(
int
ihash
=
0
;
ihash
!=
num_hash
;
++
ihash
)
{
output
[
idx
*
num_hash
+
ihash
]
=
XXH64
(
input
,
sizeof
(
int
)
*
last_dim
,
ihash
)
%
mod_by
;
}
input
+=
last_dim
;
}
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/lars_momentum_op.cc
0 → 100644
浏览文件 @
e0a89503
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/lars_momentum_op.h"
#include "paddle/fluid/operators/momentum_op.h"
namespace
paddle
{
namespace
operators
{
class
LarsMomentumOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"Param"
,
"(LoDTensor, default LoDTensor<float>) "
"Input parameter that has to be updated"
);
AddInput
(
"Grad"
,
"(LoDTensor, default LoDTensor<float>) "
"Input gradient of the parameter"
);
AddInput
(
"Velocity"
,
"(LoDTensor, default LoDTensor<float>) "
"Input velocity (corresponding to the parameter) "
"that has to be updated"
);
AddInput
(
"LearningRate"
,
"(LoDTensor, default LoDTensor<float>) "
"Input learning rate"
);
AddOutput
(
"ParamOut"
,
"(LoDTensor) This output is updated parameter. "
"It shared memory with Input(Param)."
);
AddOutput
(
"VelocityOut"
,
"(LoDTensor) This output is updated velocity. "
"It shared memory with Input(Velocity)."
);
AddAttr
<
float
>
(
"mu"
,
"(float) Momentum coefficient"
);
AddAttr
<
float
>
(
"lars_coeff"
,
"(float, default 0.001) LARS coefficient."
)
.
SetDefault
(
0.001
);
AddAttr
<
float
>
(
"lars_weight_decay"
,
"(float, default 0.0005) LARS weight decay"
)
.
SetDefault
(
0.0005
);
AddComment
(
R"DOC(
Lars Momentum Optimizer.
This optimizer use LARS (https://arxiv.org/abs/1708.03888) to optimize each
weight using a local learning rate:
$$
local\_lr = \eta *
\frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\
velocity = mu * velocity +
local\_lr * (grad + \beta * param) \\
param = param - velocity. \\
$$
Note that we use lars_weight_decay here to decay weights, you may need not to
use L2 regularizers in case of using LARS.
)DOC"
);
}
};
class
LarsMomentumOpVarTypeInference
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
lars_momentum
,
ops
::
MomentumOp
,
ops
::
LarsMomentumOpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
,
ops
::
LarsMomentumOpVarTypeInference
);
REGISTER_OP_CPU_KERNEL
(
lars_momentum
,
ops
::
LarsMomentumOpKernel
<
float
>
,
ops
::
LarsMomentumOpKernel
<
double
>
);
paddle/fluid/operators/lars_momentum_op.cu
0 → 100644
浏览文件 @
e0a89503
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/lars_momentum_op.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
__global__
void
MomentumLarsKernel
(
const
T
*
p
,
const
T
*
g
,
const
T
*
v
,
const
T
*
learning_rate
,
const
T
mu
,
const
int64_t
num
,
const
T
lars_coeff
,
const
T
lars_weight_decay
,
const
T
*
p_norm
,
const
T
*
g_norm
,
T
*
p_out
,
T
*
v_out
)
{
T
lr
=
learning_rate
[
0
];
T
local_lr
=
learning_rate
[
0
];
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
num
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
if
(
p_norm
[
0
]
>
0
&&
g_norm
[
0
]
>
0
)
{
local_lr
=
lr
*
lars_coeff
*
p_norm
[
0
]
/
(
g_norm
[
0
]
+
lars_weight_decay
*
p_norm
[
0
]);
}
T
v_new
=
v
[
i
]
*
mu
+
local_lr
*
(
g
[
i
]
+
lars_weight_decay
*
p
[
i
]);
v_out
[
i
]
=
v_new
;
p_out
[
i
]
=
p
[
i
]
-
v_new
;
}
}
template
<
typename
DeviceContext
,
typename
T
>
class
LarsMomentumOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
param_out
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
"ParamOut"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
"VelocityOut"
);
auto
param
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Param"
);
auto
velocity
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Velocity"
);
auto
grad
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Grad"
);
auto
learning_rate
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"LearningRate"
);
T
*
p_out
=
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
v_out
=
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
mu
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"mu"
));
T
lars_coeff
=
ctx
.
Attr
<
float
>
(
"lars_coeff"
);
T
lars_weight_decay
=
ctx
.
Attr
<
float
>
(
"lars_weight_decay"
);
auto
*
p
=
param
->
data
<
T
>
();
auto
*
v
=
velocity
->
data
<
T
>
();
auto
*
g
=
grad
->
data
<
T
>
();
auto
*
lr
=
learning_rate
->
data
<
T
>
();
int
block
=
512
;
int
grid
=
(
param
->
numel
()
+
block
-
1
)
/
block
;
auto
eigen_p
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
param
);
auto
eigen_g
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
grad
);
// calculate norms using eigein and launch the kernel.
framework
::
Tensor
p_norm_t
,
g_norm_t
;
p_norm_t
.
Resize
({
1
});
g_norm_t
.
Resize
({
1
});
auto
*
p_norm_data
=
p_norm_t
.
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
g_norm_data
=
g_norm_t
.
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
ep_norm
=
framework
::
EigenScalar
<
T
>::
From
(
p_norm_t
);
auto
eg_norm
=
framework
::
EigenScalar
<
T
>::
From
(
g_norm_t
);
auto
*
place
=
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
ep_norm
.
device
(
*
place
)
=
eigen_p
.
square
().
sum
().
sqrt
();
eg_norm
.
device
(
*
place
)
=
eigen_g
.
square
().
sum
().
sqrt
();
MomentumLarsKernel
<<<
grid
,
block
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
p
,
g
,
v
,
lr
,
mu
,
param
->
numel
(),
lars_coeff
,
lars_weight_decay
,
p_norm_data
,
g_norm_data
,
p_out
,
v_out
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
lars_momentum
,
ops
::
LarsMomentumOpCUDAKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
LarsMomentumOpCUDAKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
paddle/fluid/operators/lars_momentum_op.h
0 → 100644
浏览文件 @
e0a89503
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
class
LarsMomentumOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
param_out
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
"ParamOut"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
"VelocityOut"
);
auto
param
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Param"
);
auto
velocity
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Velocity"
);
auto
learning_rate
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"LearningRate"
);
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
// only support dense for now.
PADDLE_ENFORCE
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
());
auto
grad
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Grad"
);
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
velocity_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
mu
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"mu"
));
T
lars_coeff
=
ctx
.
Attr
<
float
>
(
"lars_coeff"
);
T
lars_weight_decay
=
ctx
.
Attr
<
float
>
(
"lars_weight_decay"
);
auto
p_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
param_out
);
auto
v_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
velocity_out
);
auto
p
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
param
);
auto
v
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
velocity
);
auto
g
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
grad
);
auto
*
lr
=
learning_rate
->
data
<
T
>
();
framework
::
Tensor
p_norm_t
,
g_norm_t
;
p_norm_t
.
Resize
({
1
});
g_norm_t
.
Resize
({
1
});
p_norm_t
.
mutable_data
<
T
>
(
ctx
.
GetPlace
());
g_norm_t
.
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
ep_norm
=
framework
::
EigenScalar
<
T
>::
From
(
p_norm_t
);
auto
eg_norm
=
framework
::
EigenScalar
<
T
>::
From
(
g_norm_t
);
ep_norm
=
p
.
square
().
sum
().
sqrt
();
eg_norm
=
g
.
square
().
sum
().
sqrt
();
T
local_lr
=
lr
[
0
];
if
(
ep_norm
(
0
)
>
0
&&
eg_norm
(
0
)
>
0
)
{
local_lr
=
lr
[
0
]
*
lars_coeff
*
ep_norm
(
0
)
/
(
eg_norm
(
0
)
+
lars_weight_decay
*
ep_norm
(
0
));
}
v_out
=
v
*
mu
+
local_lr
*
(
g
+
lars_weight_decay
*
p
);
p_out
=
p
-
v_out
;
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/lookup_table_op.cc
浏览文件 @
e0a89503
...
...
@@ -81,6 +81,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
"Otherwise the given value indicates padding the output "
"with zeros whenever lookup encounters it in Ids."
)
.
SetDefault
(
kNoPadding
);
// NOTE(minqiyang): grad_inplace is an temporal attribute,
// please do NOT set this attribute in python layer.
AddAttr
<
bool
>
(
"grad_inplace"
,
"(boolean, default false) "
"If the grad op reuse the input's variable."
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
Lookup Table Operator.
...
...
paddle/fluid/operators/lookup_table_op.h
浏览文件 @
e0a89503
...
...
@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -68,6 +69,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
const
auto
*
table
=
table_t
.
value
().
data
<
T
>
();
auto
*
output
=
output_t
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
context
);
for
(
int64_t
i
=
0
;
i
<
ids_numel
;
++
i
)
{
if
(
padding_idx
!=
kNoPadding
&&
ids
[
i
]
==
padding_idx
)
{
memset
(
output
+
i
*
row_width
,
0
,
row_width
*
sizeof
(
T
));
...
...
@@ -75,8 +77,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_GE
(
ids
[
i
],
0
);
auto
id_index
=
table_t
.
Index
(
ids
[
i
]);
PADDLE_ENFORCE_GE
(
id_index
,
0
,
"the input key should be exists."
);
memcpy
(
output
+
i
*
row_width
,
table
+
id_index
*
row_width
,
row_width
*
sizeof
(
T
)
);
blas
.
VCOPY
(
row_width
,
table
+
id_index
*
row_width
,
output
+
i
*
row_width
);
}
}
}
...
...
@@ -111,27 +113,37 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto
*
ids_data
=
ids
->
data
<
int64_t
>
();
int64_t
ids_num
=
ids
->
numel
();
framework
::
Vector
<
int64_t
>
new_rows
;
new_rows
.
reserve
(
ids_num
);
for
(
int64_t
i
=
0
;
i
<
ids_num
;
i
++
)
{
new_rows
.
push_back
(
ids_data
[
i
]);
}
std
::
vector
<
int64_t
>
new_rows
;
new_rows
.
resize
(
ids_num
);
std
::
memcpy
(
&
new_rows
[
0
],
ids_data
,
ids_num
*
sizeof
(
int64_t
));
d_table
->
set_rows
(
new_rows
);
auto
*
d_table_value
=
d_table
->
mutable_value
();
d_table_value
->
Resize
({
ids_num
,
table_dim
[
1
]});
d_table_value
->
mutable_data
<
T
>
(
context
.
GetPlace
());
d_table
->
set_height
(
table_dim
[
0
]);
auto
*
d_output_data
=
d_output
->
data
<
T
>
();
auto
*
d_table_data
=
d_table_value
->
data
<
T
>
();
auto
d_output_dims
=
d_output
->
dims
();
PADDLE_ENFORCE_EQ
(
d_table_value
->
dims
(),
framework
::
flatten_to_2d
(
d_output_dims
,
d_output_dims
.
size
()
-
1
));
memcpy
(
d_table_data
,
d_output_data
,
sizeof
(
T
)
*
d_output
->
numel
());
// FIXME(minqiyang):
// memory optimization will NOT reuse Tensor with SelectedRows
// so we could just share the tensor here directly.
// However, the InferVarType method will infer the output SelectedRows
// to Tensor sometimes, which is a bug, so we will add an attribute
// here to indicate the inplace and remove this attribute after
// the InferVarType's bug was fixed
bool
grad_inplace
=
context
.
Attr
<
bool
>
(
"grad_inplace"
);
if
(
grad_inplace
)
{
d_table_value
->
ShareDataWith
(
*
d_output
);
}
else
{
d_table_value
->
mutable_data
<
T
>
(
context
.
GetPlace
());
d_table
->
set_height
(
table_dim
[
0
]);
auto
*
d_output_data
=
d_output
->
data
<
T
>
();
auto
*
d_table_data
=
d_table_value
->
data
<
T
>
();
auto
d_output_dims
=
d_output
->
dims
();
PADDLE_ENFORCE_EQ
(
d_table_value
->
dims
(),
framework
::
flatten_to_2d
(
d_output_dims
,
d_output_dims
.
size
()
-
1
));
memcpy
(
d_table_data
,
d_output_data
,
sizeof
(
T
)
*
d_output
->
numel
());
}
}
else
{
auto
*
ids
=
context
.
Input
<
LoDTensor
>
(
"Ids"
);
auto
*
d_output
=
context
.
Input
<
LoDTensor
>
(
framework
::
GradVarName
(
"Out"
));
...
...
paddle/fluid/operators/math/algorithm.h
浏览文件 @
e0a89503
...
...
@@ -39,6 +39,52 @@ HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) {
return
-
1
;
}
template
<
typename
T
>
HOSTDEVICE
inline
size_t
LowerBound
(
const
T
*
x
,
size_t
num
,
const
T
&
val
)
{
#ifdef __CUDA_ARCH__
// The following code is from
// https://en.cppreference.com/w/cpp/algorithm/lower_bound
auto
*
first
=
x
;
int64_t
count
=
static_cast
<
int64_t
>
(
num
);
while
(
count
>
0
)
{
int64_t
step
=
(
count
>>
1
);
auto
*
it
=
first
+
step
;
if
(
*
it
<
val
)
{
first
=
++
it
;
count
-=
(
step
+
1
);
}
else
{
count
=
step
;
}
}
return
static_cast
<
size_t
>
(
first
-
x
);
#else
return
static_cast
<
size_t
>
(
std
::
lower_bound
(
x
,
x
+
num
,
val
)
-
x
);
#endif
}
template
<
typename
T
>
HOSTDEVICE
inline
size_t
UpperBound
(
const
T
*
x
,
size_t
num
,
const
T
&
val
)
{
#ifdef __CUDA_ARCH__
// The following code is from
// https://en.cppreference.com/w/cpp/algorithm/upper_bound
auto
*
first
=
x
;
int64_t
count
=
static_cast
<
int64_t
>
(
num
);
while
(
count
>
0
)
{
auto
step
=
(
count
>>
1
);
auto
*
it
=
first
+
step
;
if
(
val
<
*
it
)
{
count
=
step
;
}
else
{
first
=
++
it
;
count
-=
(
step
+
1
);
}
}
return
static_cast
<
size_t
>
(
first
-
x
);
#else
return
static_cast
<
size_t
>
(
std
::
upper_bound
(
x
,
x
+
num
,
val
)
-
x
);
#endif
}
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/momentum_op.cc
浏览文件 @
e0a89503
...
...
@@ -19,54 +19,6 @@ namespace operators {
using
Tensor
=
framework
::
Tensor
;
class
MomentumOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
protected:
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Param"
),
"Input(param) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Grad"
),
"Input(grad) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Velocity"
),
"Input(velocity) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"LearningRate"
),
"Input(LearningRate) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Param"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Param"
).
front
(),
ctx
->
GetInputsVarType
(
"Param"
).
front
());
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
"Output(ParamOut) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"VelocityOut"
),
"Output(VelocityOut) of Momentum should not be null."
);
auto
param_dim
=
ctx
->
GetInputDim
(
"Param"
);
if
(
ctx
->
GetInputsVarType
(
"Grad"
)[
0
]
==
framework
::
proto
::
VarType
::
LOD_TENSOR
)
{
PADDLE_ENFORCE_EQ
(
param_dim
,
ctx
->
GetInputDim
(
"Grad"
),
"Param and Grad input of MomentumOp should have the same dimension."
);
PADDLE_ENFORCE_EQ
(
param_dim
,
ctx
->
GetInputDim
(
"Velocity"
),
"Param and Velocity of MomentumOp should have the same dimension."
);
}
PADDLE_ENFORCE_EQ
(
framework
::
product
(
ctx
->
GetInputDim
(
"LearningRate"
)),
1
,
"Learning_rate should be a scalar"
);
ctx
->
SetOutputDim
(
"ParamOut"
,
param_dim
);
ctx
->
SetOutputDim
(
"VelocityOut"
,
param_dim
);
}
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
input_data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
"Param"
));
return
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
());
}
};
class
MomentumOpInferVarType
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
...
...
paddle/fluid/operators/momentum_op.h
浏览文件 @
e0a89503
...
...
@@ -28,6 +28,54 @@ using framework::SelectedRows;
struct
NoNesterov
;
struct
UseNesterov
;
class
MomentumOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
protected:
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Param"
),
"Input(param) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Grad"
),
"Input(grad) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Velocity"
),
"Input(velocity) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"LearningRate"
),
"Input(LearningRate) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Param"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Param"
).
front
(),
ctx
->
GetInputsVarType
(
"Param"
).
front
());
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
"Output(ParamOut) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"VelocityOut"
),
"Output(VelocityOut) of Momentum should not be null."
);
auto
param_dim
=
ctx
->
GetInputDim
(
"Param"
);
if
(
ctx
->
GetInputsVarType
(
"Grad"
)[
0
]
==
framework
::
proto
::
VarType
::
LOD_TENSOR
)
{
PADDLE_ENFORCE_EQ
(
param_dim
,
ctx
->
GetInputDim
(
"Grad"
),
"Param and Grad input of MomentumOp should have the same dimension."
);
PADDLE_ENFORCE_EQ
(
param_dim
,
ctx
->
GetInputDim
(
"Velocity"
),
"Param and Velocity of MomentumOp should have the same dimension."
);
}
PADDLE_ENFORCE_EQ
(
framework
::
product
(
ctx
->
GetInputDim
(
"LearningRate"
)),
1
,
"Learning_rate should be a scalar"
);
ctx
->
SetOutputDim
(
"ParamOut"
,
param_dim
);
ctx
->
SetOutputDim
(
"VelocityOut"
,
param_dim
);
}
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
input_data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
"Param"
));
return
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
());
}
};
template
<
typename
T
>
class
CPUDenseMomentumFunctor
{
private:
...
...
paddle/fluid/operators/sequence_reverse_op.cc
0 → 100644
浏览文件 @
e0a89503
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/sequence_reverse_op.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
sequence_reverse
,
ops
::
SequenceReverseOp
,
ops
::
SequenceReverseOpMaker
,
ops
::
SequenceReverseGradOpDescMaker
);
REGISTER_OP_CPU_KERNEL
(
sequence_reverse
,
ops
::
SequenceReverseOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
uint8_t
>
,
ops
::
SequenceReverseOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
,
ops
::
SequenceReverseOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int64_t
>
,
ops
::
SequenceReverseOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
SequenceReverseOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
paddle/fluid/operators/sequence_reverse_op.cu
0 → 100644
浏览文件 @
e0a89503
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/sequence_reverse_op.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
sequence_reverse
,
ops
::
SequenceReverseOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
uint8_t
>
,
ops
::
SequenceReverseOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
SequenceReverseOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
SequenceReverseOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
SequenceReverseOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
paddle/fluid/operators/sequence_reverse_op.h
0 → 100644
浏览文件 @
e0a89503
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/platform/for_range.h"
namespace
paddle
{
namespace
operators
{
class
SequenceReverseOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) must exist"
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Y"
),
"Output(Y) must exist"
);
auto
x_dim
=
ctx
->
GetInputDim
(
"X"
);
PADDLE_ENFORCE_GE
(
x_dim
.
size
(),
2
,
"Rank of Input(X) must be not less than 2."
);
ctx
->
SetOutputDim
(
"Y"
,
x_dim
);
ctx
->
ShareLoD
(
"X"
,
"Y"
);
}
};
class
SequenceReverseOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"The input LoDTensor of sequence_reverse op."
);
AddOutput
(
"Y"
,
"The output LoDTensor of sequence_reverse op."
);
AddComment
(
R"DOC(
SequenceReverse Operator.
Reverse each sequence in input X along dim 0.
Assuming X is a LoDTensor with dims [5, 4] and lod [[0, 2, 5]], where:
X.data() = [
[1, 2, 3, 4],
[5, 6, 7, 8], # the 0-th sequence with length 2
[9, 10, 11, 12],
[13, 14, 15, 16],
[17, 18, 19, 20] # the 1-st sequence with length 3
]
The output Y would be a LoDTensor sharing the same dims and lod with input X,
and:
Y.data() = [
[5, 6, 7, 8],
[1, 2, 3, 4], # the reversed 0-th sequence with length 2
[17, 18, 19, 20],
[13, 14, 15, 16],
[9, 10, 11, 12] # the reversed 1-st sequence with length 3
]
This Operator is useful to build a reverse dynamic RNN network.
This Operator only supports one-level lod currently.
)DOC"
);
}
};
template
<
typename
T
>
struct
SequenceReverseFunctor
{
SequenceReverseFunctor
(
const
T
*
x
,
T
*
y
,
const
size_t
*
lod
,
size_t
lod_count
,
size_t
row_numel
)
:
x_
(
x
),
y_
(
y
),
lod_
(
lod
),
lod_count_
(
lod_count
),
row_numel_
(
row_numel
)
{}
HOSTDEVICE
void
operator
()(
size_t
idx_x
)
const
{
auto
row_idx_x
=
idx_x
/
row_numel_
;
auto
lod_idx
=
math
::
UpperBound
(
lod_
,
lod_count_
,
row_idx_x
);
auto
row_idx_y
=
lod_
[
lod_idx
-
1
]
+
(
lod_
[
lod_idx
]
-
1
-
row_idx_x
);
auto
idx_y
=
row_idx_y
*
row_numel_
+
idx_x
%
row_numel_
;
y_
[
idx_y
]
=
x_
[
idx_x
];
}
const
T
*
x_
;
T
*
y_
;
const
size_t
*
lod_
;
size_t
lod_count_
;
size_t
row_numel_
;
};
template
<
typename
DeviceContext
,
typename
T
>
class
SequenceReverseOpKernel
:
public
framework
::
OpKernel
<
T
>
{
using
LoDTensor
=
framework
::
LoDTensor
;
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
&
x
=
*
ctx
.
Input
<
LoDTensor
>
(
"X"
);
auto
*
y
=
ctx
.
Output
<
LoDTensor
>
(
"Y"
);
PADDLE_ENFORCE_EQ
(
x
.
lod
().
size
(),
1
,
"SequenceReverse Op only support one level lod."
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
const
size_t
*
lod
;
size_t
lod_count
=
x
.
lod
()[
0
].
size
();
#ifdef PADDLE_WITH_CUDA
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
lod
=
x
.
lod
()[
0
].
CUDAData
(
ctx
.
GetPlace
());
}
else
{
#endif
lod
=
x
.
lod
()[
0
].
data
();
#ifdef PADDLE_WITH_CUDA
}
#endif
size_t
limit
=
static_cast
<
size_t
>
(
x
.
numel
());
size_t
row_numel
=
static_cast
<
size_t
>
(
limit
/
x
.
dims
()[
0
]);
auto
*
x_data
=
x
.
data
<
T
>
();
auto
*
y_data
=
y
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
PADDLE_ENFORCE_NE
(
x_data
,
y_data
,
"SequenceReverse Op does not support in-place operation"
);
SequenceReverseFunctor
<
T
>
functor
(
x_data
,
y_data
,
lod
,
lod_count
,
row_numel
);
platform
::
ForRange
<
DeviceContext
>
for_range
(
dev_ctx
,
limit
);
for_range
(
functor
);
}
};
class
SequenceReverseGradOpDescMaker
:
public
framework
::
SingleGradOpDescMaker
{
public:
using
framework
::
SingleGradOpDescMaker
::
SingleGradOpDescMaker
;
protected:
std
::
unique_ptr
<
framework
::
OpDesc
>
Apply
()
const
override
{
std
::
unique_ptr
<
framework
::
OpDesc
>
op
(
new
framework
::
OpDesc
());
op
->
SetType
(
"sequence_reverse"
);
op
->
SetInput
(
"X"
,
OutputGrad
(
"Y"
));
op
->
SetOutput
(
"Y"
,
InputGrad
(
"X"
));
op
->
SetAttrMap
(
Attrs
());
return
op
;
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/platform/device_context.cc
浏览文件 @
e0a89503
...
...
@@ -296,38 +296,73 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
#ifdef PADDLE_WITH_MKLDNN
MKLDNNDeviceContext
::
MKLDNNDeviceContext
(
CPUPlace
place
)
:
CPUDeviceContext
(
place
),
engine_
(
mkldnn
::
engine
::
cpu
,
0
),
p_blobs_
()
{
p_blobs_
.
reset
(
new
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
void
>>
());
:
CPUDeviceContext
(
place
),
engine_
(
mkldnn
::
engine
::
cpu
,
0
),
p_blobmap_
()
{
p_blobmap_
.
reset
(
new
BlobMap
());
p_mutex_
.
reset
(
new
std
::
mutex
());
}
namespace
{
// Current thread's id.
thread_local
int
cur_thread_id
=
0
;
}
void
set_cur_thread_id
(
int
tid
)
{
cur_thread_id
=
tid
;
}
int
get_cur_thread_id
(
void
)
{
return
cur_thread_id
;
}
void
MKLDNNDeviceContext
::
SetBlob
(
const
std
::
string
&
name
,
std
::
shared_ptr
<
void
>
data
)
const
{
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
void
>>*
p
;
p
=
p_blobs_
.
get
();
BlobMap
*
pMap
=
p_blobmap_
.
get
();
std
::
shared_ptr
<
KeyBlob
>
pBlob
=
nullptr
;
int
tid
=
platform
::
get_cur_thread_id
();
auto
it
=
p
->
find
(
name
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
p_mutex_
.
get
()
);
if
(
it
==
p
->
end
())
{
(
*
p
)[
name
]
=
data
;
// create new blob
// Find KeyBlob for current thread
auto
map_it
=
pMap
->
find
(
tid
);
if
(
map_it
==
pMap
->
end
())
{
// 1st time to set blob in current thread
pBlob
=
std
::
shared_ptr
<
KeyBlob
>
(
new
KeyBlob
());
(
*
pMap
)[
tid
]
=
pBlob
;
}
else
{
it
->
second
=
data
;
// set data to existing blob
pBlob
=
map_it
->
second
;
}
// Find Key in found (or newly created) KeyBlob
auto
key_it
=
pBlob
->
find
(
name
);
if
(
key_it
==
pBlob
->
end
())
{
(
*
pBlob
)[
name
]
=
data
;
// create new blob
}
else
{
key_it
->
second
=
data
;
// set data to existing blob
}
// lock will be automatically released when out of scope
return
;
}
std
::
shared_ptr
<
void
>
MKLDNNDeviceContext
::
GetBlob
(
const
std
::
string
&
name
)
const
{
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
void
>>*
p
;
p
=
p_blobs_
.
get
()
;
BlobMap
*
pMap
=
p_blobmap_
.
get
()
;
std
::
shared_ptr
<
KeyBlob
>
pBlob
=
nullptr
;
auto
it
=
p
->
find
(
name
);
int
tid
=
platform
::
get_cur_thread_id
(
);
if
(
it
!=
p
->
end
())
{
return
it
->
second
;
}
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
p_mutex_
.
get
());
// Find KeyBlob for current thread firstly
auto
map_it
=
pMap
->
find
(
tid
);
if
(
map_it
==
pMap
->
end
())
return
nullptr
;
pBlob
=
map_it
->
second
;
// Find Blob via name
auto
key_it
=
pBlob
->
find
(
name
);
if
(
key_it
==
pBlob
->
end
())
return
nullptr
;
return
nullptr
;
// lock will be automatically released when out of scope
return
key_it
->
second
;
}
#endif
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
e0a89503
...
...
@@ -176,6 +176,12 @@ struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
#endif
#ifdef PADDLE_WITH_MKLDNN
using
KeyBlob
=
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
void
>>
;
using
BlobMap
=
std
::
unordered_map
<
int
,
std
::
shared_ptr
<
KeyBlob
>>
;
void
set_cur_thread_id
(
int
);
int
get_cur_thread_id
(
void
);
class
MKLDNNDeviceContext
:
public
CPUDeviceContext
{
public:
explicit
MKLDNNDeviceContext
(
CPUPlace
place
);
...
...
@@ -191,8 +197,8 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
private:
mkldnn
::
engine
engine_
;
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
void
>>>
p_blobs
_
;
std
::
shared_ptr
<
BlobMap
>
p_blobmap_
;
std
::
shared_ptr
<
std
::
mutex
>
p_mutex
_
;
};
#endif
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
e0a89503
...
...
@@ -645,9 +645,13 @@ All parameter, weight, gradient are variables in Paddle.
py
::
class_
<
ir
::
Pass
,
std
::
shared_ptr
<
ir
::
Pass
>>
pass
(
m
,
"Pass"
);
pass
.
def
(
py
::
init
())
.
def
(
"set_str"
,
[](
ir
::
Pass
&
self
,
const
std
::
string
&
name
,
const
std
::
string
&
attr
)
{
self
.
Set
<
std
::
string
>
(
name
,
new
std
::
string
(
attr
));
.
def
(
"set_str"
,
[](
ir
::
Pass
&
self
,
const
std
::
string
&
name
,
const
std
::
string
&
attr
)
{
self
.
Set
<
std
::
string
>
(
name
,
new
std
::
string
(
attr
));
})
.
def
(
"set_int"
,
[](
ir
::
Pass
&
self
,
const
std
::
string
&
name
,
int
val
)
{
self
.
Set
<
const
int
>
(
name
,
new
int
(
val
));
});
py
::
class_
<
ir
::
PassBuilder
,
std
::
shared_ptr
<
ir
::
PassBuilder
>>
pb
(
...
...
paddle/fluid/train/demo/CMakeLists.txt
浏览文件 @
e0a89503
...
...
@@ -15,6 +15,7 @@ include_directories("${PADDLE_LIB}")
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/protobuf/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/glog/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/gflags/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/xxhash/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/snappy/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/snappystream/include"
)
include_directories
(
"
${
PADDLE_LIB
}
/third_party/install/zlib/include"
)
...
...
@@ -27,6 +28,7 @@ link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
link_directories
(
"
${
PADDLE_LIB
}
/third_party/install/protobuf/lib"
)
link_directories
(
"
${
PADDLE_LIB
}
/third_party/install/glog/lib"
)
link_directories
(
"
${
PADDLE_LIB
}
/third_party/install/gflags/lib"
)
link_directories
(
"
${
PADDLE_LIB
}
/third_party/install/xxhash/lib"
)
link_directories
(
"
${
PADDLE_LIB
}
/third_party/install/zlib/lib"
)
add_executable
(
demo_trainer demo_trainer.cc
)
...
...
@@ -62,5 +64,5 @@ target_link_libraries(demo_trainer
${
ARCHIVE_END
}
${
MATH_LIB
}
${
MKLDNN_LIB
}
glog gflags protobuf snappystream snappy z
glog gflags protobuf snappystream snappy z
xxhash
${
EXTERNAL_LIB
}
)
paddle/scripts/paddle_build.sh
浏览文件 @
e0a89503
...
...
@@ -95,9 +95,9 @@ function cmake_gen() {
exit
1
fi
fi
else
else
if
[
"
$1
"
!=
""
]
;
then
echo
"using python abi:
$1
"
echo
"using python abi:
$1
"
if
[
"
$1
"
==
"cp27-cp27m"
]
;
then
export
LD_LIBRARY_PATH
=
/opt/_internal/cpython-2.7.11-ucs2/lib:
${
LD_LIBRARY_PATH
#/opt/_internal/cpython-2.7.11-ucs4/lib
:
}
export
PATH
=
/opt/python/cp27-cp27m/bin/:
${
PATH
}
...
...
@@ -119,7 +119,7 @@ function cmake_gen() {
fi
fi
fi
if
[
"
$SYSTEM
"
==
"Darwin"
]
;
then
WITH_DISTRIBUTE
=
${
WITH_DISTRIBUTE
:-
ON
}
WITH_AVX
=
${
WITH_AVX
:-
ON
}
...
...
@@ -127,7 +127,7 @@ function cmake_gen() {
else
INFERENCE_DEMO_INSTALL_DIR
=
${
INFERENCE_DEMO_INSTALL_DIR
:-
/root/.cache/inference_demo
}
fi
cat
<<
EOF
========================================
Configuring cmake in /paddle/build ...
...
...
@@ -394,8 +394,8 @@ EOF
export
http_proxy
=
export
https_proxy
=
# TODO: jiabin need to refine this part when these tests fixed on mac
ctest
--output-on-failure
-j
$1
# make install should also be test when unittest
ctest
--output-on-failure
-j
$1
# make install should also be test when unittest
make
install
-j
8
pip
install
--user
${
INSTALL_PREFIX
:-
/paddle/build
}
/opt/paddle/share/wheels/
*
.whl
if
[[
${
WITH_FLUID_ONLY
:-
OFF
}
==
"OFF"
]]
;
then
...
...
@@ -659,7 +659,7 @@ function gen_fluid_lib() {
Generating fluid library for train and inference ...
========================================
EOF
cmake ..
-DWITH_DISTRIBUTE
=
OFF
cmake ..
-DWITH_DISTRIBUTE
=
OFF
-DON_INFER
=
ON
make
-j
`
nproc
`
fluid_lib_dist
make
-j
`
nproc
`
inference_lib_dist
fi
...
...
python/paddle/dataset/wmt16.py
浏览文件 @
e0a89503
...
...
@@ -78,7 +78,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
six
.
iteritems
(
word_dict
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)):
if
idx
+
3
==
dict_size
:
break
fout
.
write
(
"%s
\n
"
%
(
word
[
0
]
))
fout
.
write
(
"%s
\n
"
%
(
cpt
.
to_bytes
(
word
[
0
])
))
def
__load_dict
(
tar_file
,
dict_size
,
lang
,
reverse
=
False
):
...
...
python/paddle/fluid/evaluator.py
浏览文件 @
e0a89503
...
...
@@ -316,7 +316,7 @@ class DetectionMAP(Evaluator):
gt_label (Variable): The ground truth label index, which is a LoDTensor
with shape [N, 1].
gt_box (Variable): The ground truth bounding box (bbox), which is a
LoDTensor with shape [N,
6
]. The layout is [xmin, ymin, xmax, ymax].
LoDTensor with shape [N,
4
]. The layout is [xmin, ymin, xmax, ymax].
gt_difficult (Variable|None): Whether this ground truth is a difficult
bounding bbox, which can be a LoDTensor [N, 1] or not set. If None,
it means all the ground truth labels are not difficult bbox.
...
...
python/paddle/fluid/layers/learning_rate_scheduler.py
浏览文件 @
e0a89503
...
...
@@ -27,7 +27,7 @@ from . import nn
from
.
import
ops
from
.
import
tensor
from
..initializer
import
init_on_cpu
from
..framework
import
default_main_program
,
Parameter
,
unique_name
from
..framework
import
default_main_program
,
Parameter
,
unique_name
,
name_scope
__all__
=
[
'exponential_decay'
,
'natural_exp_decay'
,
'inverse_time_decay'
,
...
...
@@ -332,14 +332,16 @@ def append_LARS(params_grads, learning_rate, weight_decay):
return
grad_norm
+
weight_decay
*
param_norm
for
param
,
grad
in
params_grads
:
param_lr
=
param
.
optimize_attr
[
'learning_rate'
]
param_norm
=
ops
.
sqrt
(
nn
.
reduce_sum
(
input
=
ops
.
square
(
param
)))
grad_norm
=
ops
.
sqrt
(
nn
.
reduce_sum
(
input
=
ops
.
square
(
grad
)))
if
type
(
param_lr
)
==
float
and
param_lr
==
1.0
:
decayed_lr
=
learning_rate
*
param_norm
\
/
_balanced_weight
(
param_norm
,
grad_norm
)
else
:
decayed_lr
=
learning_rate
*
param_lr
*
param_norm
\
/
_balanced_weight
(
param_norm
,
grad_norm
)
# set back param local learning rate
param
.
optimize_attr
[
'learning_rate'
]
=
decayed_lr
with
param
.
block
.
program
.
optimized_guard
(
[
param
,
grad
]),
name_scope
(
"optimizer"
):
param_lr
=
param
.
optimize_attr
[
'learning_rate'
]
param_norm
=
ops
.
sqrt
(
nn
.
reduce_sum
(
input
=
ops
.
square
(
param
)))
grad_norm
=
ops
.
sqrt
(
nn
.
reduce_sum
(
input
=
ops
.
square
(
grad
)))
if
type
(
param_lr
)
==
float
and
param_lr
==
1.0
:
decayed_lr
=
learning_rate
*
param_norm
\
/
_balanced_weight
(
param_norm
,
grad_norm
)
else
:
decayed_lr
=
learning_rate
*
param_lr
*
param_norm
\
/
_balanced_weight
(
param_norm
,
grad_norm
)
# set back param local learning rate
param
.
optimize_attr
[
'learning_rate'
]
=
decayed_lr
python/paddle/fluid/layers/nn.py
浏览文件 @
e0a89503
...
...
@@ -155,7 +155,9 @@ __all__ = [
'sigmoid_cross_entropy_with_logits'
,
'maxout'
,
'space_to_depth'
,
'sequence_reverse'
,
'affine_channel'
,
'hash'
,
]
...
...
@@ -1991,17 +1993,17 @@ def sequence_slice(input, offset, length, name=None):
"""
**Sequence Slice Layer**
The layer crops a subsequence from given sequence with given start
The layer crops a subsequence from given sequence with given start
offset and subsequence length.
It only supports sequence data (LoDTensor with lod_level equal to 1).
.. code-block:: text
- Case:
Given the input Variable **input**:
input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]],
input.lod = [[3, 2]],
input.dims = (5, 2),
...
...
@@ -2009,16 +2011,16 @@ def sequence_slice(input, offset, length, name=None):
with offset.data = [[0], [1]] and length.data = [[2], [1]],
the output Variable will be
out.data = [[a1, a2], [b1, b2], [e1, e2]],
out.lod = [[2, 1]],
out.dims = (3, 2).
NOTE: The first dimension size of **input**, **offset** and **length**
NOTE: The first dimension size of **input**, **offset** and **length**
should be equal. The **offset** should start from 0.
Args:
input(Variable): The input Variable which consists of the complete
input(Variable): The input Variable which consists of the complete
sequences.
offset(Variable): The offset to slice each sequence.
length(Variable): The length of each subsequence.
...
...
@@ -2037,7 +2039,7 @@ def sequence_slice(input, offset, length, name=None):
dtype='float32', lod_level=1)
offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32"))
length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32"))
subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
length=length)
"""
helper
=
LayerHelper
(
"sequence_slice"
,
**
locals
())
...
...
@@ -2420,12 +2422,12 @@ def layer_norm(input,
param_attr(ParamAttr|None): The parameter attribute for the learnable
gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
a default :code:`ParamAttr` would be added as scale. The
:attr:`param_attr` is initialized as 1 if it is added. Default None.
a default :code:`ParamAttr` would be added as scale. The
:attr:`param_attr` is initialized as 1 if it is added. Default None.
bias_attr(ParamAttr|None): The parameter attribute for the learnable
bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
a default :code:`ParamAttr` would be added as bias. The
a default :code:`ParamAttr` would be added as bias. The
:attr:`bias_attr` is initialized as 0 if it is added. Default None.
act(str): Activation to be applied to the output of layer normalizaiton.
Default None.
...
...
@@ -3043,8 +3045,8 @@ def sequence_unpad(x, length, name=None):
"""
**Sequence Unpad Layer**
This layer removes the padding data in the input sequences and convert
them into sequences with actual length as output, identitied by lod
This layer removes the padding data in the input sequences and convert
them into sequences with actual length as output, identitied by lod
information.
.. code-block:: text
...
...
@@ -3054,9 +3056,9 @@ def sequence_unpad(x, length, name=None):
Given input Variable **x**:
x.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0],
[ 6.0, 7.0, 8.0, 9.0, 10.0],
[11.0, 12.0, 13.0, 14.0, 15.0]],
in which there are 3 sequences padded to length 5, and the acutal length
[11.0, 12.0, 13.0, 14.0, 15.0]],
in which there are 3 sequences padded to length 5, and the acutal length
specified by input Variable **length**:
length.data = [[2], [3], [4]],
...
...
@@ -3064,7 +3066,7 @@ def sequence_unpad(x, length, name=None):
after unpadding, the output Variable will be:
out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
out.lod = [[2, 3, 4]]
out.lod = [[2, 3, 4]]
Args:
x(Variable): Input Variable which contains the padded sequences with
...
...
@@ -5499,9 +5501,9 @@ def roi_align(input,
Examples:
.. code-block:: python
align_out = fluid.layers.roi_align(input=x,
rois=rois,
pooled_height=7,
align_out = fluid.layers.roi_align(input=x,
rois=rois,
pooled_height=7,
pooled_width=7,
spatial_scale=0.5,
sampling_ratio=-1)
...
...
@@ -7538,13 +7540,40 @@ def space_to_depth(x, blocksize, name=None):
return
out
@
templatedoc
()
def
sequence_reverse
(
x
,
name
=
None
):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
name(basestring|None): Name of the output.
Returns:
out(${y_type}): ${y_comment}
"""
helper
=
LayerHelper
(
"sequence_reverse"
,
**
locals
())
if
name
is
None
:
out
=
helper
.
create_variable_for_type_inference
(
dtype
=
x
.
dtype
)
else
:
out
=
helper
.
create_variable
(
name
=
name
,
dtype
=
x
.
dtype
,
persistable
=
False
)
helper
.
append_op
(
type
=
"sequence_reverse"
,
inputs
=
{
"X"
:
x
},
outputs
=
{
"Y"
:
out
},
attrs
=
dict
())
return
out
def
affine_channel
(
x
,
scale
=
None
,
bias
=
None
,
data_layout
=
'NCHW'
,
name
=
None
):
"""
Applies a separate affine transformation to each channel of the input.
Useful for replacing spatial batch norm with its equivalent fixed
transformation. The input also can be 2D tensor and applies a affine
transformation in second dimension.
Args:
x (Variable): Feature map input can be a 4D tensor with order NCHW
or NHWC. It also can be a 2D tensor and the affine transformation
...
...
@@ -7577,3 +7606,31 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
attrs
=
{
"data_layout"
:
data_layout
},
outputs
=
{
"Out"
:
out
})
return
out
def
hash
(
input
,
hash_size
,
num_hash
=
1
,
name
=
None
):
"""
hash the input
Args:
input (Variable): The input variable which is a one-hot word.
hash_size (int): The space size for hash algorithm.
num_hash (int): The times of hash, default 1.
name (str, default None): The name of this layer.
Returns:
Variable: The hash result variable which is a LoDTensor.
Examples:
.. code-block:: python
word_dict = paddle.dataset.imdb.word_dict()
x = fluid.layers.data(shape[1], dtype='int32', lod_level=1)
out = fluid.layers.hash(input=x, len(word_dict))
"""
helper
=
LayerHelper
(
'hash'
,
**
locals
())
out
=
helper
.
create_variable_for_type_inference
(
helper
.
input_dtype
(),
stop_gradient
=
True
)
helper
.
append_op
(
type
=
'hash'
,
inputs
=
{
'X'
:
input
},
outputs
=
{
'Out'
:
out
},
attrs
=
{
'num_hash'
:
num_hash
,
'mod_by'
:
hash_size
})
return
out
python/paddle/fluid/metrics.py
浏览文件 @
e0a89503
...
...
@@ -13,8 +13,6 @@
# limitations under the License.
"""
Fluid Metrics
The metrics are accomplished via Python natively.
"""
from
__future__
import
print_function
...
...
@@ -24,6 +22,12 @@ import copy
import
warnings
import
six
from
.layer_helper
import
LayerHelper
from
.initializer
import
Constant
from
.
import
unique_name
from
.framework
import
Program
,
Variable
,
program_guard
from
.
import
layers
__all__
=
[
'MetricBase'
,
'CompositeMetric'
,
...
...
@@ -474,71 +478,10 @@ class EditDistance(MetricBase):
"There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
)
avg_distance
=
self
.
total_distance
/
self
.
seq_num
avg_instance_error
=
self
.
instance_error
/
self
.
seq_num
avg_instance_error
=
self
.
instance_error
/
float
(
self
.
seq_num
)
return
avg_distance
,
avg_instance_error
class
DetectionMAP
(
MetricBase
):
"""
Calculate the detection mean average precision (mAP).
mAP is the metric to measure the accuracy of object detectors
like Faster R-CNN, SSD, etc.
It is the average of the maximum precisions at different recall values.
Please get more information from the following articles:
https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325
The general steps are as follows:
1. calculate the true positive and false positive according to the input
of detection and labels.
2. calculate mAP value, support two versions: '11 point' and 'integral'.
Examples:
.. code-block:: python
pred = fluid.layers.fc(input=data, size=1000, act="tanh")
batch_map = layers.detection_map(
input,
label,
class_num,
background_label,
overlap_threshold=overlap_threshold,
evaluate_difficult=evaluate_difficult,
ap_version=ap_version)
metric = fluid.metrics.DetectionMAP()
for data in train_reader():
loss, preds, labels = exe.run(fetch_list=[cost, batch_map])
batch_size = data[0]
metric.update(value=batch_map, weight=batch_size)
numpy_map = metric.eval()
"""
def
__init__
(
self
,
name
=
None
):
super
(
DetectionMAP
,
self
).
__init__
(
name
)
# the current map value
self
.
value
=
.
0
self
.
weight
=
.
0
def
update
(
self
,
value
,
weight
):
if
not
_is_number_or_matrix_
(
value
):
raise
ValueError
(
"The 'value' must be a number(int, float) or a numpy ndarray."
)
if
not
_is_number_
(
weight
):
raise
ValueError
(
"The 'weight' must be a number(int, float)."
)
self
.
value
+=
value
self
.
weight
+=
weight
def
eval
(
self
):
if
self
.
weight
==
0
:
raise
ValueError
(
"There is no data in DetectionMAP Metrics. "
"Please check layers.detection_map output has added to DetectionMAP."
)
return
self
.
value
/
self
.
weight
class
Auc
(
MetricBase
):
"""
Auc metric adapts to the binary classification.
...
...
@@ -616,3 +559,179 @@ class Auc(MetricBase):
idx
-=
1
return
auc
/
tot_pos
/
tot_neg
if
tot_pos
>
0.0
and
tot_neg
>
0.0
else
0.0
class
DetectionMAP
(
object
):
"""
Calculate the detection mean average precision (mAP).
The general steps are as follows:
1. calculate the true positive and false positive according to the input
of detection and labels.
2. calculate mAP value, support two versions: '11 point' and 'integral'.
Please get more information from the following articles:
https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325
Args:
input (Variable): The detection results, which is a LoDTensor with shape
[M, 6]. The layout is [label, confidence, xmin, ymin, xmax, ymax].
gt_label (Variable): The ground truth label index, which is a LoDTensor
with shape [N, 1].
gt_box (Variable): The ground truth bounding box (bbox), which is a
LoDTensor with shape [N, 4]. The layout is [xmin, ymin, xmax, ymax].
gt_difficult (Variable|None): Whether this ground truth is a difficult
bounding bbox, which can be a LoDTensor [N, 1] or not set. If None,
it means all the ground truth labels are not difficult bbox.
class_num (int): The class number.
background_label (int): The index of background label, the background
label will be ignored. If set to -1, then all categories will be
considered, 0 by defalut.
overlap_threshold (float): The threshold for deciding true/false
positive, 0.5 by defalut.
evaluate_difficult (bool): Whether to consider difficult ground truth
for evaluation, True by defalut. This argument does not work when
gt_difficult is None.
ap_version (string): The average precision calculation ways, it must be
'integral' or '11point'. Please check
https://sanchom.wordpress.com/tag/average-precision/ for details.
- 11point: the 11-point interpolated average precision.
- integral: the natural integral of the precision-recall curve.
Examples:
.. code-block:: python
exe = fluid.Executor(place)
map_evaluator = fluid.Evaluator.DetectionMAP(input,
gt_label, gt_box, gt_difficult)
cur_map, accum_map = map_evaluator.get_map_var()
fetch = [cost, cur_map, accum_map]
for epoch in PASS_NUM:
map_evaluator.reset(exe)
for data in batches:
loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)
In the above example:
'cur_map_v' is the mAP of current mini-batch.
'accum_map_v' is the accumulative mAP of one pass.
"""
def
__init__
(
self
,
input
,
gt_label
,
gt_box
,
gt_difficult
=
None
,
class_num
=
None
,
background_label
=
0
,
overlap_threshold
=
0.5
,
evaluate_difficult
=
True
,
ap_version
=
'integral'
):
self
.
helper
=
LayerHelper
(
'map_eval'
)
gt_label
=
layers
.
cast
(
x
=
gt_label
,
dtype
=
gt_box
.
dtype
)
if
gt_difficult
:
gt_difficult
=
layers
.
cast
(
x
=
gt_difficult
,
dtype
=
gt_box
.
dtype
)
label
=
layers
.
concat
([
gt_label
,
gt_difficult
,
gt_box
],
axis
=
1
)
else
:
label
=
layers
.
concat
([
gt_label
,
gt_box
],
axis
=
1
)
# calculate mean average precision (mAP) of current mini-batch
map
=
layers
.
detection_map
(
input
,
label
,
class_num
,
background_label
,
overlap_threshold
=
overlap_threshold
,
evaluate_difficult
=
evaluate_difficult
,
ap_version
=
ap_version
)
states
=
[]
states
.
append
(
self
.
_create_state
(
dtype
=
'int32'
,
shape
=
None
,
suffix
=
'accum_pos_count'
))
states
.
append
(
self
.
_create_state
(
dtype
=
'float32'
,
shape
=
None
,
suffix
=
'accum_true_pos'
))
states
.
append
(
self
.
_create_state
(
dtype
=
'float32'
,
shape
=
None
,
suffix
=
'accum_false_pos'
))
var
=
self
.
_create_state
(
dtype
=
'int32'
,
shape
=
[
1
],
suffix
=
'has_state'
)
self
.
helper
.
set_variable_initializer
(
var
,
initializer
=
Constant
(
value
=
int
(
0
)))
self
.
has_state
=
var
# calculate accumulative mAP
accum_map
=
layers
.
detection_map
(
input
,
label
,
class_num
,
background_label
,
overlap_threshold
=
overlap_threshold
,
evaluate_difficult
=
evaluate_difficult
,
has_state
=
self
.
has_state
,
input_states
=
states
,
out_states
=
states
,
ap_version
=
ap_version
)
layers
.
fill_constant
(
shape
=
self
.
has_state
.
shape
,
value
=
1
,
dtype
=
self
.
has_state
.
dtype
,
out
=
self
.
has_state
)
self
.
cur_map
=
map
self
.
accum_map
=
accum_map
def
_create_state
(
self
,
suffix
,
dtype
,
shape
):
"""
Create state variable.
Args:
suffix(str): the state suffix.
dtype(str|core.VarDesc.VarType): the state data type
shape(tuple|list): the shape of state
Returns: State variable
"""
state
=
self
.
helper
.
create_variable
(
name
=
"_"
.
join
([
unique_name
.
generate
(
self
.
helper
.
name
),
suffix
]),
persistable
=
True
,
dtype
=
dtype
,
shape
=
shape
)
return
state
def
get_map_var
(
self
):
"""
Returns: mAP variable of current mini-batch and
accumulative mAP variable cross mini-batches.
"""
return
self
.
cur_map
,
self
.
accum_map
def
reset
(
self
,
executor
,
reset_program
=
None
):
"""
Reset metric states at the begin of each pass/user specified batch.
Args:
executor(Executor): a executor for executing
the reset_program.
reset_program(Program|None): a single Program for reset process.
If None, will create a Program.
"""
def
_clone_var_
(
block
,
var
):
assert
isinstance
(
var
,
Variable
)
return
block
.
create_var
(
name
=
var
.
name
,
shape
=
var
.
shape
,
dtype
=
var
.
dtype
,
type
=
var
.
type
,
lod_level
=
var
.
lod_level
,
persistable
=
var
.
persistable
)
if
reset_program
is
None
:
reset_program
=
Program
()
with
program_guard
(
main_program
=
reset_program
):
var
=
_clone_var_
(
reset_program
.
current_block
(),
self
.
has_state
)
layers
.
fill_constant
(
shape
=
var
.
shape
,
value
=
0
,
dtype
=
var
.
dtype
,
out
=
var
)
executor
.
run
(
reset_program
)
python/paddle/fluid/optimizer.py
浏览文件 @
e0a89503
...
...
@@ -14,6 +14,7 @@
from
__future__
import
print_function
import
re
import
sys
from
collections
import
defaultdict
from
paddle.fluid.framework
import
Program
,
Variable
,
name_scope
,
default_main_program
from
.
import
framework
...
...
@@ -32,7 +33,8 @@ __all__ = [
'SGD'
,
'Momentum'
,
'Adagrad'
,
'Adam'
,
'Adamax'
,
'DecayedAdagrad'
,
'Ftrl'
,
'SGDOptimizer'
,
'MomentumOptimizer'
,
'AdagradOptimizer'
,
'AdamOptimizer'
,
'AdamaxOptimizer'
,
'DecayedAdagradOptimizer'
,
'RMSPropOptimizer'
,
'FtrlOptimizer'
,
'Adadelta'
,
'ModelAverage'
,
'RMSPropOptimizer'
'FtrlOptimizer'
,
'Adadelta'
,
'ModelAverage'
,
'LarsMomentum'
,
'LarsMomentumOptimizer'
]
...
...
@@ -105,7 +107,6 @@ class Optimizer(object):
param
=
param_and_grad
[
0
]
param_lr
=
param
.
optimize_attr
[
'learning_rate'
]
if
type
(
param_lr
)
==
Variable
:
print
(
"returns updated param lr "
,
param_lr
)
return
param_lr
else
:
if
param_lr
==
1.0
:
...
...
@@ -400,6 +401,91 @@ class MomentumOptimizer(Optimizer):
return
momentum_op
class
LarsMomentumOptimizer
(
Optimizer
):
"""
Momentum optimizer with LARS support
The update equations are as follows:
.. math::
& local\_learning\_rate = learning\_rate * lars\_coeff *
\\
\\
frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||}
& velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param)
& param = param - velocity
Args:
learning_rate (float|Variable): the learning rate used to update parameters.
\
Can be a float value or a Variable with one float value as data element.
momentum (float): momentum factor
lars_coeff (float): defines how much we trust the layer to change its weights.
lars_weight_decay (float): weight decay coefficient for decaying using LARS.
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: A optional name prefix.
Examples:
.. code-block:: python
optimizer = fluid.optimizer.LarsMomentum(learning_rate=0.2, momentum=0.1, lars_weight_decay=0.001)
optimizer.minimize(cost)
"""
_velocity_acc_str
=
"velocity"
def
__init__
(
self
,
learning_rate
,
momentum
,
lars_coeff
=
0.001
,
lars_weight_decay
=
0.0005
,
regularization
=
None
,
name
=
None
):
assert
learning_rate
is
not
None
assert
momentum
is
not
None
super
(
LarsMomentumOptimizer
,
self
).
__init__
(
learning_rate
=
learning_rate
,
regularization
=
regularization
,
name
=
name
)
self
.
type
=
"lars_momentum"
self
.
_momentum
=
momentum
self
.
_lars_coeff
=
float
(
lars_coeff
)
self
.
_lars_weight_decay
=
float
(
lars_weight_decay
)
def
_create_accumulators
(
self
,
block
,
parameters
):
assert
isinstance
(
block
,
framework
.
Block
)
for
p
in
parameters
:
self
.
_add_accumulator
(
self
.
_velocity_acc_str
,
p
)
def
_append_optimize_op
(
self
,
block
,
param_and_grad
):
assert
isinstance
(
block
,
framework
.
Block
)
velocity_acc
=
self
.
_get_accumulator
(
self
.
_velocity_acc_str
,
param_and_grad
[
0
])
# create the momentum optimize op
momentum_op
=
block
.
append_op
(
type
=
self
.
type
,
inputs
=
{
"Param"
:
param_and_grad
[
0
],
"Grad"
:
param_and_grad
[
1
],
"Velocity"
:
velocity_acc
,
"LearningRate"
:
self
.
_create_param_lr
(
param_and_grad
)
},
outputs
=
{
"ParamOut"
:
param_and_grad
[
0
],
"VelocityOut"
:
velocity_acc
},
attrs
=
{
"mu"
:
self
.
_momentum
,
"lars_coeff"
:
self
.
_lars_coeff
,
"lars_weight_decay"
:
self
.
_lars_weight_decay
})
return
momentum_op
class
AdagradOptimizer
(
Optimizer
):
"""
**Adaptive Gradient Algorithm (Adagrad)**
...
...
@@ -1221,6 +1307,7 @@ DecayedAdagrad = DecayedAdagradOptimizer
Adadelta
=
AdadeltaOptimizer
RMSProp
=
RMSPropOptimizer
Ftrl
=
FtrlOptimizer
LarsMomentum
=
LarsMomentumOptimizer
class
ModelAverage
(
Optimizer
):
...
...
python/paddle/fluid/tests/unittests/dist_mnist.py
浏览文件 @
e0a89503
...
...
@@ -95,7 +95,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
# Reader
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
mnist
.
t
rain
(),
batch_size
=
batch_size
)
paddle
.
dataset
.
mnist
.
t
est
(),
batch_size
=
batch_size
)
test_reader
=
paddle
.
batch
(
paddle
.
dataset
.
mnist
.
test
(),
batch_size
=
batch_size
)
opt
.
minimize
(
avg_cost
)
...
...
python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
0 → 100644
浏览文件 @
e0a89503
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
numpy
as
np
import
argparse
import
time
import
math
import
paddle
import
paddle.fluid
as
fluid
import
paddle.fluid.profiler
as
profiler
from
paddle.fluid
import
core
import
unittest
from
multiprocessing
import
Process
import
os
import
signal
from
functools
import
reduce
from
test_dist_base
import
TestDistRunnerBase
,
runtime_main
from
dist_mnist
import
cnn_model
DTYPE
=
"float32"
def
test_merge_reader
(
repeat_batch_size
=
8
):
orig_reader
=
paddle
.
dataset
.
mnist
.
test
()
record_batch
=
[]
b
=
0
for
d
in
orig_reader
():
if
b
>=
repeat_batch_size
:
break
record_batch
.
append
(
d
)
b
+=
1
while
True
:
for
d
in
record_batch
:
yield
d
class
TestDistMnist2x2
(
TestDistRunnerBase
):
def
get_model
(
self
,
batch_size
=
2
):
# Input data
images
=
fluid
.
layers
.
data
(
name
=
'pixel'
,
shape
=
[
1
,
28
,
28
],
dtype
=
DTYPE
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
# Train program
predict
=
cnn_model
(
images
)
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
predict
,
label
=
label
)
avg_cost
=
fluid
.
layers
.
mean
(
x
=
cost
)
# Evaluator
batch_size_tensor
=
fluid
.
layers
.
create_tensor
(
dtype
=
'int64'
)
batch_acc
=
fluid
.
layers
.
accuracy
(
input
=
predict
,
label
=
label
,
total
=
batch_size_tensor
)
inference_program
=
fluid
.
default_main_program
().
clone
()
# Optimization
opt
=
fluid
.
optimizer
.
Momentum
(
learning_rate
=
0.001
,
momentum
=
0.9
)
# Reader
train_reader
=
paddle
.
batch
(
test_merge_reader
,
batch_size
=
batch_size
)
test_reader
=
paddle
.
batch
(
paddle
.
dataset
.
mnist
.
test
(),
batch_size
=
batch_size
)
opt
.
minimize
(
avg_cost
)
return
inference_program
,
avg_cost
,
train_reader
,
test_reader
,
batch_acc
,
predict
if
__name__
==
"__main__"
:
runtime_main
(
TestDistMnist2x2
)
python/paddle/fluid/tests/unittests/dist_mnist_lars.py
0 → 100644
浏览文件 @
e0a89503
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
numpy
as
np
import
argparse
import
time
import
math
import
paddle
import
paddle.fluid
as
fluid
import
paddle.fluid.profiler
as
profiler
from
paddle.fluid
import
core
import
unittest
from
multiprocessing
import
Process
import
os
import
signal
from
functools
import
reduce
from
test_dist_base
import
TestDistRunnerBase
,
runtime_main
from
dist_mnist
import
cnn_model
DTYPE
=
"float32"
paddle
.
dataset
.
mnist
.
fetch
()
# Fix seed for test
fluid
.
default_startup_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
class
TestDistMnist2x2
(
TestDistRunnerBase
):
def
get_model
(
self
,
batch_size
=
2
):
# Input data
images
=
fluid
.
layers
.
data
(
name
=
'pixel'
,
shape
=
[
1
,
28
,
28
],
dtype
=
DTYPE
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
# Train program
predict
=
cnn_model
(
images
)
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
predict
,
label
=
label
)
avg_cost
=
fluid
.
layers
.
mean
(
x
=
cost
)
# Evaluator
batch_size_tensor
=
fluid
.
layers
.
create_tensor
(
dtype
=
'int64'
)
batch_acc
=
fluid
.
layers
.
accuracy
(
input
=
predict
,
label
=
label
,
total
=
batch_size_tensor
)
inference_program
=
fluid
.
default_main_program
().
clone
()
# Optimization
opt
=
fluid
.
optimizer
.
LarsMomentumOptimizer
(
learning_rate
=
0.001
,
momentum
=
0.9
)
# Reader
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
mnist
.
test
(),
batch_size
=
batch_size
)
test_reader
=
paddle
.
batch
(
paddle
.
dataset
.
mnist
.
test
(),
batch_size
=
batch_size
)
opt
.
minimize
(
avg_cost
)
return
inference_program
,
avg_cost
,
train_reader
,
test_reader
,
batch_acc
,
predict
if
__name__
==
"__main__"
:
runtime_main
(
TestDistMnist2x2
)
python/paddle/fluid/tests/unittests/dist_transformer.py
浏览文件 @
e0a89503
...
...
@@ -1159,6 +1159,7 @@ def prepare_encoder(src_word,
name
=
pos_enc_param_name
,
trainable
=
False
,
initializer
=
fluid
.
initializer
.
ConstantInitializer
(
0.001
)))
src_pos_enc
.
stop_gradient
=
True
enc_input
=
src_word_emb
+
src_pos_enc
return
layers
.
dropout
(
enc_input
,
...
...
python/paddle/fluid/tests/unittests/test_dist_base.py
浏览文件 @
e0a89503
...
...
@@ -26,10 +26,11 @@ import argparse
import
paddle.fluid
as
fluid
RUN_STEP
=
10
DEFAULT_BATCH_SIZE
=
2
class
TestDistRunnerBase
(
object
):
def
get_model
(
self
,
batch_size
=
2
):
def
get_model
(
self
,
batch_size
=
DEFAULT_BATCH_SIZE
):
raise
NotImplementedError
(
"get_model should be implemented by child classes."
)
...
...
@@ -48,8 +49,7 @@ class TestDistRunnerBase(object):
return
t
def
run_pserver
(
self
,
args
):
self
.
get_model
(
batch_size
=
2
)
self
.
get_model
(
batch_size
=
args
.
batch_size
)
# NOTE: pserver should not call memory optimize
t
=
self
.
get_transpiler
(
args
.
trainer_id
,
fluid
.
default_main_program
(),
args
.
endpoints
,
...
...
@@ -65,7 +65,7 @@ class TestDistRunnerBase(object):
def
run_trainer
(
self
,
args
):
test_program
,
avg_cost
,
train_reader
,
test_reader
,
batch_acc
,
predict
=
\
self
.
get_model
(
batch_size
=
2
)
self
.
get_model
(
batch_size
=
args
.
batch_size
)
if
args
.
mem_opt
:
fluid
.
memory_optimize
(
fluid
.
default_main_program
(),
skip_grads
=
True
)
...
...
@@ -92,6 +92,11 @@ class TestDistRunnerBase(object):
strategy
.
allow_op_delay
=
False
build_stra
=
fluid
.
BuildStrategy
()
if
args
.
batch_merge_repeat
>
1
:
pass_builder
=
build_stra
.
_create_passes_from_strategy
()
mypass
=
pass_builder
.
insert_pass
(
len
(
pass_builder
.
all_passes
())
-
2
,
"multi_batch_merge_pass"
)
mypass
.
set_int
(
"num_repeats"
,
args
.
batch_merge_repeat
)
if
args
.
use_reduce
:
build_stra
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
...
...
@@ -145,6 +150,9 @@ def runtime_main(test_class):
parser
.
add_argument
(
'--use_reduce'
,
action
=
'store_true'
)
parser
.
add_argument
(
'--use_reader_alloc'
,
action
=
'store_true'
,
required
=
False
,
default
=
True
)
parser
.
add_argument
(
'--batch_size'
,
required
=
False
,
type
=
int
,
default
=
2
)
parser
.
add_argument
(
'--batch_merge_repeat'
,
required
=
False
,
type
=
int
,
default
=
1
)
args
=
parser
.
parse_args
()
...
...
@@ -244,9 +252,18 @@ class TestDistBase(unittest.TestCase):
(
e
,
retry_times
))
retry_times
-=
1
def
_run_local
(
self
,
model
,
envs
,
check_error_log
):
def
_run_local
(
self
,
model
,
envs
,
check_error_log
=
False
,
batch_size
=
DEFAULT_BATCH_SIZE
,
batch_merge_repeat
=
1
):
cmd
=
"%s %s --role trainer"
%
(
self
.
_python_interp
,
model
)
if
batch_size
!=
DEFAULT_BATCH_SIZE
:
cmd
+=
" --batch_size %d"
%
batch_size
if
batch_merge_repeat
>
1
:
cmd
+=
" --batch_merge_repeat %d"
%
batch_merge_repeat
if
self
.
__use_cuda
:
cmd
+=
" --use_cuda"
...
...
python/paddle/fluid/tests/unittests/test_dist_ctr.py
浏览文件 @
e0a89503
...
...
@@ -23,9 +23,8 @@ class TestDistCTR2x2(TestDistBase):
self
.
_sync_mode
=
True
self
.
_enforce_place
=
"CPU"
def
test_dist_ctr
(
self
):
self
.
check_with_place
(
"dist_ctr.py"
,
delta
=
1e-7
,
check_error_log
=
False
)
def
test_dist_ctr
(
self
):
self
.
check_with_place
(
"dist_ctr.py"
,
delta
=
1e-7
,
check_error_log
=
False
)
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_dist_mnist.py
浏览文件 @
e0a89503
...
...
@@ -26,6 +26,15 @@ class TestDistMnist2x2(TestDistBase):
self
.
check_with_place
(
"dist_mnist.py"
,
delta
=
1e-5
)
class
TestDistMnist2x2Lars
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
self
.
_use_reduce
=
False
def
test_se_resnext
(
self
):
self
.
check_with_place
(
"dist_mnist_lars.py"
,
delta
=
1e-5
)
class
TestDistMnist2x2WithMemopt
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
...
...
@@ -40,8 +49,7 @@ class TestDistMnistAsync(TestDistBase):
self
.
_sync_mode
=
False
self
.
_use_reduce
=
False
# FIXME(typhoonzero): fix async mode test later
def
no_test_dist_train
(
self
):
def
test_dist_train
(
self
):
self
.
check_with_place
(
"dist_mnist.py"
,
delta
=
200
)
...
...
python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
0 → 100644
浏览文件 @
e0a89503
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
from
test_dist_base
import
TestDistBase
import
os
class
TestDistMnist2x2
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
self
.
_use_reduce
=
False
def
test_dist_train
(
self
):
self
.
check_with_place
(
"dist_mnist_batch_merge.py"
,
delta
=
1e-5
)
def
check_with_place
(
self
,
model_file
,
delta
=
1e-3
,
check_error_log
=
False
,
need_envs
=
{}):
# TODO(typhoonzero): should auto adapt GPU count on the machine.
required_envs
=
{
"PATH"
:
os
.
getenv
(
"PATH"
,
""
),
"PYTHONPATH"
:
os
.
getenv
(
"PYTHONPATH"
,
""
),
"LD_LIBRARY_PATH"
:
os
.
getenv
(
"LD_LIBRARY_PATH"
,
""
),
"FLAGS_fraction_of_gpu_memory_to_use"
:
"0.15"
,
"FLAGS_cudnn_deterministic"
:
"1"
,
}
required_envs
.
update
(
need_envs
)
if
check_error_log
:
required_envs
[
"GLOG_v"
]
=
"7"
required_envs
[
"GLOG_logtostderr"
]
=
"1"
no_merge_losses
=
self
.
_run_local
(
model_file
,
required_envs
,
check_error_log
=
check_error_log
,
batch_size
=
4
)
batch_merge_losses
=
self
.
_run_local
(
model_file
,
required_envs
,
check_error_log
=
check_error_log
,
batch_size
=
2
,
batch_merge_repeat
=
2
)
# Ensure both result have values.
self
.
assertGreater
(
len
(
no_merge_losses
),
1
)
self
.
assertEqual
(
len
(
no_merge_losses
),
len
(
batch_merge_losses
))
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
浏览文件 @
e0a89503
...
...
@@ -40,8 +40,7 @@ class TestDistSeResneXt2x2Async(TestDistBase):
self
.
_sync_mode
=
False
self
.
_use_reader_alloc
=
False
#FIXME(typhoonzero): fix async mode later
def
no_test_dist_train
(
self
):
def
test_dist_train
(
self
):
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
100
)
...
...
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
浏览文件 @
e0a89503
...
...
@@ -79,8 +79,7 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
self
.
_sync_mode
=
False
self
.
_enforce_place
=
"CPU"
#FIXME(typhoonzero): fix async tests later
def
no_test_simnet_bow
(
self
):
def
test_simnet_bow
(
self
):
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'1'
,
...
...
python/paddle/fluid/tests/unittests/test_hash_op.py
0 → 100644
浏览文件 @
e0a89503
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
class
TestScaleOp
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"hash"
self
.
init_test_case
()
self
.
inputs
=
{
'X'
:
(
self
.
in_seq
,
self
.
lod
)}
self
.
attrs
=
{
'num_hash'
:
4
,
'mod_by'
:
10000
}
self
.
outputs
=
{
'Out'
:
(
self
.
out_seq
,
self
.
lod
)}
def
init_test_case
(
self
):
np
.
random
.
seed
=
1
self
.
in_seq
=
np
.
random
.
randint
(
0
,
10
,
(
30
,
1
)).
astype
(
"int32"
)
self
.
lod
=
[[
9
,
4
,
11
,
6
]]
# self.out_seq = np.ones([30, 4, 1], dtype=np.int32)
self
.
out_seq
=
[
[[
9662
],
[
9217
],
[
1129
],
[
8487
]],
[[
9662
],
[
9217
],
[
1129
],
[
8487
]],
[[
8310
],
[
1327
],
[
1654
],
[
4567
]],
[[
6897
],
[
3218
],
[
2013
],
[
1241
]],
[[
9407
],
[
6715
],
[
6949
],
[
8094
]],
[[
8473
],
[
694
],
[
5142
],
[
2479
]],
[[
8310
],
[
1327
],
[
1654
],
[
4567
]],
[[
6897
],
[
3218
],
[
2013
],
[
1241
]],
[[
4372
],
[
9456
],
[
8204
],
[
6695
]],
[[
6897
],
[
3218
],
[
2013
],
[
1241
]],
[[
8473
],
[
694
],
[
5142
],
[
2479
]],
[[
4372
],
[
9456
],
[
8204
],
[
6695
]],
[[
4372
],
[
9456
],
[
8204
],
[
6695
]],
[[
8473
],
[
694
],
[
5142
],
[
2479
]],
[[
9407
],
[
6715
],
[
6949
],
[
8094
]],
[[
9369
],
[
4525
],
[
8935
],
[
9210
]],
[[
4372
],
[
9456
],
[
8204
],
[
6695
]],
[[
4372
],
[
9456
],
[
8204
],
[
6695
]],
[[
9369
],
[
4525
],
[
8935
],
[
9210
]],
[[
6897
],
[
3218
],
[
2013
],
[
1241
]],
[[
9038
],
[
7951
],
[
5953
],
[
8657
]],
[[
9407
],
[
6715
],
[
6949
],
[
8094
]],
[[
9662
],
[
9217
],
[
1129
],
[
8487
]],
[[
9369
],
[
4525
],
[
8935
],
[
9210
]],
[[
9038
],
[
7951
],
[
5953
],
[
8657
]],
[[
9662
],
[
9217
],
[
1129
],
[
8487
]],
[[
9369
],
[
4525
],
[
8935
],
[
9210
]],
[[
1719
],
[
5986
],
[
9919
],
[
3421
]],
[[
4372
],
[
9456
],
[
8204
],
[
6695
]],
[[
9038
],
[
7951
],
[
5953
],
[
8657
]]
]
self
.
out_seq
=
np
.
array
(
self
.
out_seq
)
def
test_check_output
(
self
):
self
.
check_output
()
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_metrics.py
0 → 100644
浏览文件 @
e0a89503
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
paddle.fluid
as
fluid
from
paddle.fluid.framework
import
Program
,
program_guard
class
TestMetricsDetectionMap
(
unittest
.
TestCase
):
def
test_detection_map
(
self
):
program
=
fluid
.
Program
()
with
program_guard
(
program
):
detect_res
=
fluid
.
layers
.
data
(
name
=
'detect_res'
,
shape
=
[
10
,
6
],
append_batch_size
=
False
,
dtype
=
'float32'
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
10
,
1
],
append_batch_size
=
False
,
dtype
=
'float32'
)
box
=
fluid
.
layers
.
data
(
name
=
'bbox'
,
shape
=
[
10
,
4
],
append_batch_size
=
False
,
dtype
=
'float32'
)
map_eval
=
fluid
.
metrics
.
DetectionMAP
(
detect_res
,
label
,
box
,
class_num
=
21
)
cur_map
,
accm_map
=
map_eval
.
get_map_var
()
self
.
assertIsNotNone
(
cur_map
)
self
.
assertIsNotNone
(
accm_map
)
print
(
str
(
program
))
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_momentum_op.py
浏览文件 @
e0a89503
...
...
@@ -90,6 +90,45 @@ class TestMomentumOp2(OpTest):
self
.
check_output
()
class
TestLarsMomentumOp
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"lars_momentum"
param
=
np
.
random
.
random
((
123
,
321
)).
astype
(
"float32"
)
grad
=
np
.
random
.
random
((
123
,
321
)).
astype
(
"float32"
)
velocity
=
np
.
zeros
((
123
,
321
)).
astype
(
"float32"
)
learning_rate
=
np
.
array
([
0.001
]).
astype
(
"float32"
)
mu
=
0.0001
lars_coeff
=
0.001
lars_weight_decay
=
0.0005
self
.
inputs
=
{
'Param'
:
param
,
'Grad'
:
grad
,
'Velocity'
:
velocity
,
'LearningRate'
:
learning_rate
}
self
.
attrs
=
{
'mu'
:
mu
,
'lars_coeff'
:
lars_coeff
,
'lars_weight_decay'
:
lars_weight_decay
}
pnorm
=
np
.
sqrt
(
np
.
square
(
param
).
sum
())
gnorm
=
np
.
sqrt
(
np
.
square
(
grad
).
sum
())
local_lr
=
learning_rate
*
lars_coeff
*
pnorm
/
(
gnorm
+
lars_weight_decay
*
param
)
velocity_out
=
mu
*
velocity
+
local_lr
*
(
grad
+
lars_weight_decay
*
param
)
param_out
=
param
-
velocity_out
self
.
outputs
=
{
'ParamOut'
:
param_out
,
'VelocityOut'
:
velocity_out
}
def
test_check_output
(
self
):
self
.
check_output
()
class
TestSparseMomentumOp
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
use_nesterov
=
False
...
...
python/paddle/fluid/tests/unittests/test_sequence_reverse.py
0 → 100644
浏览文件 @
e0a89503
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
op_test
import
OpTest
import
numpy
as
np
class
TestSequenceReverseBase
(
OpTest
):
def
initParameters
(
self
):
pass
def
setUp
(
self
):
self
.
size
=
(
10
,
3
,
4
)
self
.
lod
=
[
2
,
3
,
5
]
self
.
dtype
=
'float32'
self
.
initParameters
()
self
.
op_type
=
'sequence_reverse'
self
.
x
=
np
.
random
.
random
(
self
.
size
).
astype
(
self
.
dtype
)
self
.
y
=
self
.
get_output
()
self
.
inputs
=
{
'X'
:
(
self
.
x
,
[
self
.
lod
,
]),
}
self
.
outputs
=
{
'Y'
:
(
self
.
y
,
[
self
.
lod
,
]),
}
def
get_output
(
self
):
tmp_x
=
np
.
reshape
(
self
.
x
,
newshape
=
[
self
.
x
.
shape
[
0
],
-
1
])
tmp_y
=
np
.
ndarray
(
tmp_x
.
shape
).
astype
(
self
.
dtype
)
prev_idx
=
0
for
cur_len
in
self
.
lod
:
idx_range
=
range
(
prev_idx
,
prev_idx
+
cur_len
)
tmp_y
[
idx_range
,
:]
=
np
.
flip
(
tmp_x
[
idx_range
,
:],
0
)
prev_idx
+=
cur_len
return
np
.
reshape
(
tmp_y
,
newshape
=
self
.
x
.
shape
).
astype
(
self
.
dtype
)
def
test_output
(
self
):
self
.
check_output
(
0
)
def
test_grad
(
self
):
self
.
check_grad
([
'X'
],
'Y'
)
class
TestSequenceReserve1
(
TestSequenceReverseBase
):
def
initParameters
(
self
):
self
.
size
=
(
12
,
10
)
self
.
lod
=
[
4
,
5
,
3
]
class
TestSequenceReverse2
(
TestSequenceReverseBase
):
def
initParameters
(
self
):
self
.
size
=
(
12
,
10
)
self
.
lod
=
[
12
]
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
e0a89503
...
...
@@ -1431,7 +1431,7 @@ to transpile() call.")
elif
op_type
==
"adamax"
:
if
varkey
in
[
"Moment"
,
"InfNorm"
]:
return
param_shape
elif
op_type
==
"momentum"
:
elif
op_type
in
[
"momentum"
,
"lars_momentum"
]
:
if
varkey
==
"Velocity"
:
return
param_shape
elif
op_type
==
"rmsprop"
:
...
...
@@ -1442,6 +1442,10 @@ to transpile() call.")
return
param_shape
elif
op_type
==
"sgd"
:
pass
else
:
raise
ValueError
(
"Not supported optimizer for distributed training: %s"
%
op_type
)
return
orig_shape
def
_get_varname_parts
(
self
,
varname
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录