Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Commit 54f9d44e

Authored Jan 20, 2019 by liuwei1031

Merge remote-tracking branch 'upstream/develop' into develop

Parents: a4dc3d2b, 62d36ce0
Showing 56 changed files with 1,593 additions and 214 deletions (+1593 −214)
cmake/generic.cmake +2 −2
paddle/fluid/API.spec +3 −2
paddle/fluid/framework/details/all_reduce_deps_pass.cc +3 −3
paddle/fluid/framework/details/all_reduce_op_handle.cc +4 −4
paddle/fluid/framework/details/broadcast_op_handle.cc +17 −17
paddle/fluid/framework/details/data_balance_op_handle.cc +2 −2
paddle/fluid/framework/details/fetch_op_handle.cc +3 −3
paddle/fluid/framework/details/fuse_vars_op_handle.cc +2 −2
paddle/fluid/framework/details/gather_op_handle.cc +8 −8
paddle/fluid/framework/details/memory_early_delete_pass.cc +2 −2
paddle/fluid/framework/details/multi_devices_graph_print_pass.cc +4 −4
paddle/fluid/framework/details/reduce_op_handle.cc +17 −17
paddle/fluid/framework/details/rpc_op_handle.cc +1 −1
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +1 −1
paddle/fluid/framework/details/var_handle.h +7 −0
paddle/fluid/inference/api/analysis_config.cc +3 −2
paddle/fluid/inference/api/analysis_predictor.cc +1 −0
paddle/fluid/inference/api/analysis_predictor.h +2 −0
paddle/fluid/inference/api/analysis_predictor_tester.cc +2 −1
paddle/fluid/inference/api/api_impl.cc +5 −2
paddle/fluid/inference/api/api_impl.h +2 −0
paddle/fluid/inference/api/details/zero_copy_tensor.cc +12 −6
paddle/fluid/inference/api/paddle_api.h +28 −3
paddle/fluid/inference/tests/api/CMakeLists.txt +6 −1
paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +182 −0
paddle/fluid/operators/CMakeLists.txt +1 −1
paddle/fluid/operators/controlflow/while_op.cc +6 −16
paddle/fluid/operators/distributed/CMakeLists.txt +3 −3
paddle/fluid/operators/gru_op.cc +7 −2
paddle/fluid/operators/gru_op.cu.cc +2 −1
paddle/fluid/operators/gru_op.h +2 −1
paddle/fluid/operators/gru_unit_op.cc +7 −0
paddle/fluid/operators/gru_unit_op.h +23 −7
paddle/fluid/operators/math/CMakeLists.txt +1 −0
paddle/fluid/operators/math/detail/gru_cpu_kernel.h +26 −20
paddle/fluid/operators/math/detail/gru_gpu_kernel.h +6 −4
paddle/fluid/operators/math/detail/gru_kernel.h +60 −25
paddle/fluid/operators/math/gru_compute.cc +8 −4
paddle/fluid/operators/math/gru_compute.cu +8 −6
paddle/fluid/operators/math/gru_compute.h +4 −2
paddle/fluid/operators/math/tree2col.cc +197 −0
paddle/fluid/operators/math/tree2col.cu +208 −0
paddle/fluid/operators/math/tree2col.h +90 −0
paddle/fluid/operators/tree_conv_op.cc +129 −0
paddle/fluid/operators/tree_conv_op.cu +24 −0
paddle/fluid/operators/tree_conv_op.h +146 −0
python/paddle/fluid/async_executor.py +0 −1
python/paddle/fluid/layers/nn.py +115 −7
python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py +11 −8
python/paddle/fluid/tests/book/test_recommender_system.py +12 −8
python/paddle/fluid/tests/unittests/test_auc_op.py +1 −1
python/paddle/fluid/tests/unittests/test_gru_op.py +29 −4
python/paddle/fluid/tests/unittests/test_gru_unit_op.py +23 −6
python/paddle/fluid/tests/unittests/test_nce.py +2 −1
python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +3 −3
python/paddle/fluid/tests/unittests/test_tree_conv_op.py +120 −0
cmake/generic.cmake

@@ -748,7 +748,7 @@ function(grpc_library TARGET_NAME)
   #FIXME(putcn): the follwoing line is supposed to generate *.pb.h and cc, but
   # somehow it didn't. line 602 to 604 is to patching this. Leaving this here
   # for now to enable dist CI.
-  protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
+  paddle_protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
   set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
   set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
   cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")

@@ -791,7 +791,7 @@ function(brpc_library TARGET_NAME)
   get_filename_component(PROTO_WE ${brpc_library_PROTO} NAME_WE)
   get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
-  protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}")
+  paddle_protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}")
   cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}")
   cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}")
 endfunction()
paddle/fluid/API.spec

@@ -70,8 +70,8 @@ paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param
 paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
 paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
 paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
-paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None))
+paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False))
-paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid'))
+paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False))
 paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)

@@ -215,6 +215,7 @@ paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', '
 paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0))
 paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.tree_conv ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
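Note on the new origin_mode flag on dynamic_gru and gru_unit: it selects between the two common GRU hidden-state update conventions. The exact mapping is implemented in the gru_op/gru_kernel changes further down this diff (not reproduced on this page); the sketch below follows the standard formulations and is an assumption, not quoted from the diff. Here u_t is the update gate, \tilde{h}_t the candidate state, and \odot the elementwise product:

% origin_mode = True: GRU as in the original encoder-decoder paper
h_t = u_t \odot h_{t-1} + (1 - u_t) \odot \tilde{h}_t
% origin_mode = False (the default, matching previous behavior): the popular variant
h_t = (1 - u_t) \odot h_{t-1} + u_t \odot \tilde{h}_t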
paddle/fluid/framework/details/all_reduce_deps_pass.cc

@@ -82,13 +82,13 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
       PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
                      op1->DebugString(), op2->DebugString());
-      auto l_it = vars.find(i0->name_);
-      auto r_it = vars.find(i1->name_);
+      auto l_it = vars.find(i0->name());
+      auto r_it = vars.find(i1->name());
       if (l_it->second < r_it->second) return true;
       if (l_it->second == r_it->second) {
-        return i0->name_ < i1->name_;
+        return i0->name() < i1->name();
       }
       return false;
paddle/fluid/framework/details/all_reduce_op_handle.cc

@@ -70,9 +70,9 @@ void AllReduceOpHandle::RunImpl() {
       auto *s = local_scopes_[i];
       auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
       auto &lod_tensor =
-          local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
+          local_scope.FindVar(in_var_handles[i]->name())->Get<LoDTensor>();
       lod_tensors.emplace_back(&lod_tensor);
-      PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+      PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
                         "The name of input and output should be equal.");
     }

@@ -134,7 +134,7 @@ void AllReduceOpHandle::RunImpl() {
       auto &trg = *this->local_scopes_[0]
                        ->FindVar(kLocalExecScopeName)
                        ->Get<Scope *>()
-                       ->FindVar(out_var_handles[0]->name_)
+                       ->FindVar(out_var_handles[0]->name())
                        ->GetMutable<framework::LoDTensor>();

       // Reduce All Tensor to trg in CPU

@@ -145,7 +145,7 @@ void AllReduceOpHandle::RunImpl() {
         auto &scope =
             *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
         auto &p = places_[i];
-        auto *var = scope.FindVar(out_var_handles[i]->name_);
+        auto *var = scope.FindVar(out_var_handles[i]->name());
         auto *dev_ctx = dev_ctxes_.at(p);

         RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
paddle/fluid/framework/details/broadcast_op_handle.cc

@@ -56,11 +56,11 @@ void BroadcastOpHandle::BroadcastOneVar(
     const std::vector<VarHandle *> &out_var_handles,
     const std::vector<const Scope *> &var_scopes) {
   auto *in_var =
-      var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_);
+      var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name());
   PADDLE_ENFORCE_NOT_NULL(in_var);
   Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
   if (UNLIKELY(!in_tensor.IsInitialized())) {
-    VLOG(3) << "in var " << in_var_handle.name_ << "not inited, return!";
+    VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!";
     return;
   }

@@ -71,9 +71,9 @@ void BroadcastOpHandle::BroadcastOneVar(
       if (out_var_handle->IsTheSameVar(in_var_handle)) {
         continue;
       }
-      auto &out_p = out_var_handle->place_;
-      auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
-                          ->FindVar(out_var_handle->name_);
+      auto &out_p = out_var_handle->place();
+      auto *out_var = var_scopes.at(out_var_handle->scope_idx())
+                          ->FindVar(out_var_handle->name());

       RunAndRecordEvent(out_p, [in_tensor, out_var] {
         paddle::framework::TensorCopy(

@@ -91,11 +91,11 @@ void BroadcastOpHandle::BroadcastOneVar(
     size_t numel = static_cast<size_t>(in_tensor.numel());

     for (auto out_var_handle : out_var_handles) {
-      Variable *out_var = var_scopes.at(out_var_handle->scope_idx_)
-                              ->FindVar(out_var_handle->name_);
+      Variable *out_var = var_scopes.at(out_var_handle->scope_idx())
+                              ->FindVar(out_var_handle->name());

       int dst_id =
-          boost::get<platform::CUDAPlace>(out_var_handle->place_).device;
+          boost::get<platform::CUDAPlace>(out_var_handle->place()).device;

       auto &nccl_ctx = nccl_ctxs_->at(dst_id);

@@ -106,7 +106,7 @@ void BroadcastOpHandle::BroadcastOneVar(
       } else {
         send_recv_buffer = VariableVisitor::GetMutableTensor(out_var)
                                .Resize(in_tensor.dims())
-                               .mutable_data(out_var_handle->place_);
+                               .mutable_data(out_var_handle->place());
       }

       broadcast_calls.emplace_back(

@@ -126,11 +126,11 @@ void BroadcastOpHandle::BroadcastOneVar(
       }

       if (!out_handle->IsTheSameVar(in_var_handle)) {
-        auto out_var = var_scopes.at(in_var_handle.scope_idx_)
-                           ->FindVar(out_var_handles[0]->name_);
+        auto out_var = var_scopes.at(in_var_handle.scope_idx())
+                           ->FindVar(out_var_handles[0]->name());
         paddle::framework::TensorCopy(
-            in_tensor, in_var_handle.place_,
-            *(dev_ctxes_.at(in_var_handle.place_)),
+            in_tensor, in_var_handle.place(),
+            *(dev_ctxes_.at(in_var_handle.place())),
             &VariableVisitor::GetMutableTensor(out_var));
       }
     });

@@ -148,7 +148,7 @@ void BroadcastOpHandle::InitOutputValue(
     var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
   }
   auto *in_var =
-      var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_);
+      var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name());

   Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);

@@ -158,9 +158,9 @@ void BroadcastOpHandle::InitOutputValue(
     if (out_var_handle->IsTheSameVar(in_var_handle)) {
       continue;
     }
-    auto t_out_p = out_var_handle->place_;
-    auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
-                        ->FindVar(out_var_handle->name_);
+    auto t_out_p = out_var_handle->place();
+    auto *out_var = var_scopes.at(out_var_handle->scope_idx())
+                        ->FindVar(out_var_handle->name());
     PADDLE_ENFORCE_NOT_NULL(out_var);
     if (is_gpu_place(in_tensor.place())) {
       PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
paddle/fluid/framework/details/data_balance_op_handle.cc

@@ -100,13 +100,13 @@ void DataBalanceOpHandle::RunImpl() {
   std::vector<std::vector<LoDTensor *>> lod_tensors(data_num);
   std::vector<int> device_sizes;
   for (int i = 0; i < static_cast<int>(in_var_handles.size()); ++i) {
-    PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+    PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
                       "The name of input and output should be equal.");
     int place_idx = i / data_num;
     int data_idx = i % data_num;
     auto *local_scope =
         local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get<Scope *>();
-    auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name_);
+    auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name());
     PADDLE_ENFORCE(tensor_var->IsType<LoDTensor>());
     auto *tensor = tensor_var->GetMutable<LoDTensor>();
     lod_tensors[data_idx].push_back(tensor);
paddle/fluid/framework/details/fetch_op_handle.cc

@@ -52,12 +52,12 @@ void FetchOpHandle::RunImpl() {
   for (size_t i = 0; i < inputs_.size(); ++i) {
     auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
-    auto &scope = scopes.at(var_handle->scope_idx_);
+    auto &scope = scopes.at(var_handle->scope_idx());
     auto *var = scope->FindVar(kLocalExecScopeName)
                     ->Get<Scope *>()
-                    ->FindVar(var_handle->name_);
+                    ->FindVar(var_handle->name());
     PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
-                            var_handle->name_);
+                            var_handle->name());

     auto &t = var->Get<framework::LoDTensor>();
     if (platform::is_gpu_place(t.place())) {
paddle/fluid/framework/details/fuse_vars_op_handle.cc

@@ -29,14 +29,14 @@ void FuseVarsOpHandle::RunImpl() {
   auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();

   auto out_var_handle = out_var_handles[0];
-  auto out_var = scope->Var(out_var_handle->name_);
+  auto out_var = scope->Var(out_var_handle->name());

   auto out_tensor = out_var->GetMutable<LoDTensor>();
   out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);

   int64_t s = 0;
   for (size_t i = 1; i < out_var_handles.size(); ++i) {
-    auto out_name = out_var_handles[i]->name_;
+    auto out_name = out_var_handles[i]->name();
     auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
     auto numel = this->inputs_numel_.at(out_name);
     out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
paddle/fluid/framework/details/gather_op_handle.cc

@@ -49,7 +49,7 @@ void GatherOpHandle::RunImpl() {
   auto in_0_handle = in_var_handles[0];
   auto pre_in_var =
-      var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
+      var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
   PADDLE_ENFORCE_NOT_NULL(pre_in_var);

   PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),

@@ -65,7 +65,7 @@ void GatherOpHandle::RunImpl() {
   // Gather the inputs
   for (auto *in_handle : in_var_handles) {
     auto *in_var =
-        var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
+        var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
     PADDLE_ENFORCE_NOT_NULL(in_var);
     VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);

@@ -77,7 +77,7 @@ void GatherOpHandle::RunImpl() {
   }

   // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
-  platform::Place t_out_p = out_var_handle->place_;
+  platform::Place t_out_p = out_var_handle->place();
   if (platform::is_gpu_place(pre_in_value.place())) {
     PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
                    "Places of input and output must be all on GPU.");

@@ -85,8 +85,8 @@ void GatherOpHandle::RunImpl() {
     t_out_p = platform::CPUPlace();
   }

   auto out_var =
-      var_scopes.at(out_var_handle->scope_idx_)
-          ->FindVar(out_var_handle->name_);
+      var_scopes.at(out_var_handle->scope_idx())
+          ->FindVar(out_var_handle->name());
   PADDLE_ENFORCE_NOT_NULL(out_var);
   auto out_value = out_var->GetMutable<framework::SelectedRows>();
   out_value->set_height(pre_in_value.height());

@@ -99,9 +99,9 @@ void GatherOpHandle::RunImpl() {
   Tensor *out_tensor = out_value->mutable_value();

   // copy
-  auto dev_ctx = dev_ctxes_.at(out_var_handle->place_);
-  RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx,
+  auto dev_ctx = dev_ctxes_.at(out_var_handle->place());
+  RunAndRecordEvent(out_var_handle->place(), [in_tensors, out_tensor, &dev_ctx,
                                              t_out_p] {
     int s = 0, e = 0;
     for (size_t j = 0; j < in_tensors.size(); ++j) {
       e += in_tensors[j].dims()[0];
paddle/fluid/framework/details/memory_early_delete_pass.cc

@@ -33,7 +33,7 @@ static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) {
     queue.pop();
     for (auto *op : var->PendingOps()) {
       auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
-      if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) {
+      if (compute_op != nullptr && compute_op->GetPlace() == var_in->place()) {
        return compute_op;
       }
       for (auto *out_var : op->Outputs()) {

@@ -64,7 +64,7 @@ std::unique_ptr<ir::Graph> MemoryEarlyDeletePass::ApplyImpl(
     for (auto &var : vars) {
       auto *var_handle = dynamic_cast<VarHandle *>(var);
       auto var_name = var->Node()->Name();
-      auto &var_place = var_handle->place_;
+      auto &var_place = var_handle->place();
       if (unlived_vars.count(var_name) == 0) continue;
       if (!unlived_vars[var_name].empty()) {
         if (compute_op != nullptr &&
paddle/fluid/framework/details/multi_devices_graph_print_pass.cc

@@ -52,11 +52,11 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
         vars[var_ptr] = cur_var_id;

         if (var_handle_ptr) {
-          sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
+          sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name()
                << "\\n"
-               << var_handle_ptr->place_ << "\\n"
-               << "scope: " << var_handle_ptr->scope_idx_ << "\\n"
-               << "v" << var_handle_ptr->version_ << "\"]" << std::endl;
+               << var_handle_ptr->place() << "\\n"
+               << "scope: " << var_handle_ptr->scope_idx() << "\\n"
+               << "v" << var_handle_ptr->version() << "\"]" << std::endl;
         } else if (dummy_ptr) {
           sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
         }
paddle/fluid/framework/details/reduce_op_handle.cc

@@ -60,8 +60,8 @@ void ReduceOpHandle::GatherSelectedRows(
       *CollectiveContext::GetInstance();

   // 1. gather local selected rows, merge them
-  std::string gathered_var_name = out_var_handle->name_ + "_gathered_tmp";
-  auto scope = local_scopes_.at(out_var_handle->scope_idx_);
+  std::string gathered_var_name = out_var_handle->name() + "_gathered_tmp";
+  auto scope = local_scopes_.at(out_var_handle->scope_idx());
   auto gathered_var_mid = scope->Var(gathered_var_name);
   auto gathered_select_rows =
       gathered_var_mid->GetMutable<framework::SelectedRows>();

@@ -73,7 +73,7 @@ void ReduceOpHandle::GatherSelectedRows(
   // merge them
   auto merged_dev_ctx = dynamic_cast<DevCtx *>(dev_ctxes.at(out_place));
   std::string merged_var_name =
-      GetRemoteVarName(out_var_handle->name_, collective_context.trainer_id_);
+      GetRemoteVarName(out_var_handle->name(), collective_context.trainer_id_);
   auto merged_select_rows =
       scope->Var(merged_var_name)->GetMutable<SelectedRows>();
   operators::math::scatter::MergeAdd<DevCtx, DataType> merge_func;

@@ -101,7 +101,7 @@ void ReduceOpHandle::GatherSelectedRows(
     operators::distributed::RemoteVar var;
     var.trainer_id_ = i;
-    var.var_name_ = GetRemoteVarName(out_var_handle->name_, i);
+    var.var_name_ = GetRemoteVarName(out_var_handle->name(), i);
     var.ep_ = collective_context.endpoints_[i];

     vars.push_back(var);

@@ -166,7 +166,7 @@ void ReduceOpHandle::RunImpl() {
   }

   auto pre_in_var =
-      var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
+      var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
   PADDLE_ENFORCE_NOT_NULL(pre_in_var);

   // Wait input done, this Wait is asynchronous operation

@@ -175,15 +175,15 @@ void ReduceOpHandle::RunImpl() {
   // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
   std::vector<platform::Place> in_places;  // used to get dev_ctx
   for (auto *in_handle : in_var_handles) {
-    in_places.emplace_back(in_handle->place_);
+    in_places.emplace_back(in_handle->place());
     auto in_var =
-        var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
+        var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
     PADDLE_ENFORCE_NOT_NULL(in_var);
     VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var);
   }

   auto out_var =
-      var_scopes.at(out_var_handle->scope_idx_)
-          ->FindVar(out_var_handle->name_);
+      var_scopes.at(out_var_handle->scope_idx())
+          ->FindVar(out_var_handle->name());
   PADDLE_ENFORCE_NOT_NULL(out_var);

   // NOTE: The tensors' Place of input and output must be all on GPU or all on

@@ -191,9 +191,9 @@ void ReduceOpHandle::RunImpl() {
   auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place();
   platform::Place t_out_p;
   if (platform::is_gpu_place(in_p)) {
-    PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place_),
+    PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place()),
                    "Places of input and output must be all on GPU.");
-    t_out_p = out_var_handle->place_;
+    t_out_p = out_var_handle->place();
   } else {
     t_out_p = platform::CPUPlace();
   }

@@ -253,7 +253,7 @@ void ReduceOpHandle::RunImpl() {
       auto &reduce_sum_trg = *this->local_scopes_[0]
                                   ->FindVar(kLocalExecScopeName)
                                   ->Get<Scope *>()
-                                  ->FindVar(out_var_handle->name_)
+                                  ->FindVar(out_var_handle->name())
                                   ->GetMutable<framework::LoDTensor>();
       ReduceLoDTensor func(lod_tensors, &reduce_sum_trg);
       VisitDataType(lod_tensors[0]->type(), func);

@@ -269,9 +269,9 @@ void ReduceOpHandle::RunImpl() {
     auto pre_in = pre_in_var->Get<framework::LoDTensor>();
     VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
     VariableVisitor::GetMutableTensor(out_var).mutable_data(
-        out_var_handle->place_, pre_in.type());
+        out_var_handle->place(), pre_in.type());

-    auto out_p = out_var_handle->place_;
+    auto out_p = out_var_handle->place();
     int root_id = boost::get<platform::CUDAPlace>(out_p).device;
     std::vector<std::function<void()>> all_reduce_calls;
     for (size_t i = 0; i < var_scopes.size(); ++i) {

@@ -286,7 +286,7 @@ void ReduceOpHandle::RunImpl() {
       if (root_id == dev_id) {
         recvbuffer =
             out_var->GetMutable<framework::LoDTensor>()->mutable_data(
-                out_var_handle->place_);
+                out_var_handle->place());
       }

       int type = platform::ToNCCLDataType(lod_tensor.type());

@@ -320,8 +320,8 @@ std::vector<const T *> ReduceOpHandle::GetInputValues(
     const std::vector<const Scope *> &var_scopes) const {
   std::vector<const T *> in_selected_rows;
   for (auto *in_handle : in_var_handles) {
-    auto &in_sr = var_scopes.at(in_handle->scope_idx_)
-                      ->FindVar(in_handle->name_)
+    auto &in_sr = var_scopes.at(in_handle->scope_idx())
+                      ->FindVar(in_handle->name())
                       ->Get<T>();
     in_selected_rows.emplace_back(&in_sr);
   }
paddle/fluid/framework/details/rpc_op_handle.cc

@@ -30,7 +30,7 @@ RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc,
 void RPCOpHandle::RunImpl() {
   for (auto *in : inputs_) {
-    auto &p = static_cast<VarHandle *>(in)->place_;
+    auto &p = static_cast<VarHandle *>(in)->place();
     if (ir::IsControlDepVar(*in->Node())) {
       continue;
     }
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc

@@ -68,7 +68,7 @@ struct ScaleLossGradFunctor {
 void ScaleLossGradOpHandle::RunImpl() {
   // Doesn't wait any event
-  std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
+  std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name();
   auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();

   auto *tensor = local_scope.FindVar(var_name)->GetMutable<LoDTensor>();
paddle/fluid/framework/details/var_handle.h

@@ -111,15 +111,22 @@ struct VarHandle : public VarHandleBase {
   // version field currently is not used, however, just store the version to
   // debug easily.
+ private:
   size_t version_;
   size_t scope_idx_;
   std::string name_;
   platform::Place place_;

+ public:
   bool IsTheSameVar(const VarHandle &o) const {
     return o.generated_op_ == generated_op_ && o.name_ == name_ &&
            o.scope_idx_ == scope_idx_;
   }

+  size_t version() const { return version_; }
+  size_t scope_idx() const { return scope_idx_; }
+  const std::string &name() const { return name_; }
+  const platform::Place &place() const { return place_; }
 };

 // Dummy Variable. It is used to represent dependencies between operators
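This var_handle.h hunk is the pivot of most of the .cc changes in this commit: the raw version_/scope_idx_/name_/place_ fields become private, and every call site switches from handle->name_ to handle->name(). A minimal self-contained sketch of the same encapsulation pattern follows; the member names mirror the diff, while Place and main() are stand-ins added here for illustration, not Paddle code:

#include <cstddef>
#include <iostream>
#include <string>
#include <utility>

using Place = int;  // stand-in for platform::Place, just for this sketch

struct VarHandle {
 private:
  // Fields are now private, so callers cannot silently mutate a handle's
  // identity after it is constructed.
  size_t version_{0};
  size_t scope_idx_{0};
  std::string name_;
  Place place_{0};

 public:
  VarHandle(size_t version, size_t scope_idx, std::string name, Place place)
      : version_(version),
        scope_idx_(scope_idx),
        name_(std::move(name)),
        place_(place) {}

  // Read-only accessors, as introduced by the diff.
  size_t version() const { return version_; }
  size_t scope_idx() const { return scope_idx_; }
  const std::string &name() const { return name_; }
  const Place &place() const { return place_; }
};

int main() {
  VarHandle h(1, 0, "fc_0.w_0", 0);
  // Call sites go through the accessor instead of touching name_ directly.
  std::cout << h.name() << " in scope " << h.scope_idx() << "\n";
}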
paddle/fluid/inference/api/analysis_config.cc

@@ -127,6 +127,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
   use_tensorrt_ = true;
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
+  tensorrt_min_subgraph_size_ = min_subgraph_size;

   Update();
 }

@@ -145,8 +146,8 @@ void contrib::AnalysisConfig::Update() {
       LOG(ERROR)
           << "TensorRT engine is not available when EnableGpu() not actived.";
     } else {
-      // Append after the infer_clean pass.
-      pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
+      // Append after the Affine_channel_conv_fuse pass.
+      pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
     }
   }
paddle/fluid/inference/api/analysis_predictor.cc

@@ -561,6 +561,7 @@ AnalysisPredictor::~AnalysisPredictor() {
 }

 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
+  std::lock_guard<std::mutex> lk(clone_mutex_);
   auto *x = new AnalysisPredictor(config_);
   x->Init(scope_, inference_program_);
   return std::unique_ptr<PaddlePredictor>(x);
paddle/fluid/inference/api/analysis_predictor.h

@@ -115,6 +115,8 @@ class AnalysisPredictor : public PaddlePredictor {
   // concurrency problems, wrong results and memory leak, so cache them.
   std::vector<framework::LoDTensor> feed_tensors_;
   details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
+  // A mutex help to make Clone thread safe.
+  std::mutex clone_mutex_;

  private:
   // Some status here that help to determine the status inside the predictor.
paddle/fluid/inference/api/analysis_predictor_tester.cc

@@ -179,8 +179,9 @@ TEST(AnalysisPredictor, Clone) {
     threads.emplace_back([&predictors, &inputs, i] {
       LOG(INFO) << "thread #" << i << " running";
       std::vector<PaddleTensor> outputs;
+      auto predictor = predictors.front()->Clone();
       for (int j = 0; j < 10; j++) {
-        ASSERT_TRUE(predictors[i]->Run(inputs, &outputs));
+        ASSERT_TRUE(predictor->Run(inputs, &outputs));
       }
     });
   }
paddle/fluid/inference/api/api_impl.cc

@@ -161,13 +161,16 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
 }

 std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
+  std::lock_guard<std::mutex> lk(clone_mutex_);
   VLOG(3) << "Predictor::clone";
   std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
-  // Hot fix the bug that result diff in multi-thread.
-  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(scope_)) {
+  // TODO(Superjomn) re-implement a real clone here.
+  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
     LOG(ERROR) << "fail to call Init";
     return nullptr;
   }

 #ifdef __clang__
   // fix clang compile error
   return cls;
paddle/fluid/inference/api/api_impl.h

@@ -74,6 +74,8 @@ class NativePaddlePredictor : public PaddlePredictor {
   // Do not use unique_ptr, use parent scope to delete
   framework::Scope *sub_scope_{nullptr};
   details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
+  // A mutex to make Clone thread safe.
+  std::mutex clone_mutex_;
 };

 }  // namespace paddle
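Both predictors now take clone_mutex_ before building the copy, which serializes concurrent Clone() calls; combined with the tester change above (one Clone() per thread), this is what makes cloning usable from multiple threads. A reduced sketch of the pattern, where Predictor is a hypothetical stand-in rather than either Paddle class:

#include <memory>
#include <mutex>

class Predictor {
 public:
  // Serialize Clone() so two threads cannot initialize a copy from this
  // predictor's shared state at the same time.
  std::unique_ptr<Predictor> Clone() {
    std::lock_guard<std::mutex> lk(clone_mutex_);
    auto copy = std::unique_ptr<Predictor>(new Predictor);
    // ... initialize the copy from shared state, under the lock ...
    return copy;
  }

 private:
  std::mutex clone_mutex_;
};

Each thread then works against its own clone (as in the tester), so only the brief Clone() call itself contends on the mutex.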
paddle/fluid/inference/api/details/zero_copy_tensor.cc

@@ -33,9 +33,15 @@ void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
   tensor->Resize(framework::make_ddim(shape));
 }

+#define EAGER_GET_TENSOR    \
+  if (!tensor_) {           \
+    tensor_ = FindTensor(); \
+  }                         \
+  auto *tensor = static_cast<framework::LoDTensor *>(tensor_);
+
 template <typename T>
 T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
-  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  EAGER_GET_TENSOR;
   switch (static_cast<int>(place)) {
     case static_cast<int>(PaddlePlace::kCPU): {
       return tensor->mutable_data<T>(platform::CPUPlace());

@@ -52,7 +58,7 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
 template <typename T>
 T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
-  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  EAGER_GET_TENSOR;
   auto *res = tensor->data<T>();

   if (platform::is_cpu_place(tensor->place())) {

@@ -87,13 +93,13 @@ void *ZeroCopyTensor::FindTensor() const {
 }

 std::vector<int64_t> ZeroCopyTensor::shape() const {
-  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
-  PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_);
+  EAGER_GET_TENSOR;
+  PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
   return framework::vectorize(tensor->dims());
 }

 void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
-  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  EAGER_GET_TENSOR;
   framework::LoD lod;
   for (auto &level : x) {
     lod.emplace_back(level);

@@ -102,8 +108,8 @@ void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
 }

 std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
+  EAGER_GET_TENSOR;
   std::vector<std::vector<size_t>> res;
-  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
   for (auto &level : tensor->lod()) {
     res.emplace_back(level);
   }
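EAGER_GET_TENSOR replaces the repeated FindTensor() scope lookup with a one-time lookup whose result is cached in the mutable tensor_ pointer added to paddle_api.h below. The same idiom without the macro, in a reduced sketch (LoDTensor and FindTensor here are stand-ins for the Paddle types, not the real implementations):

struct LoDTensor {};  // stand-in for framework::LoDTensor

class ZeroCopyTensorSketch {
 public:
  LoDTensor *tensor() const {
    if (!tensor_) {            // first call: do the (expensive) lookup once
      tensor_ = FindTensor();
    }
    return static_cast<LoDTensor *>(tensor_);  // later calls: cached pointer
  }

 private:
  void *FindTensor() const {
    static LoDTensor t;  // placeholder for the real scope lookup
    return &t;
  }
  // mutable so const methods (data(), shape(), lod()) can fill the cache,
  // mirroring the `mutable void *tensor_{nullptr}` member in the next hunk.
  mutable void *tensor_{nullptr};
};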
paddle/fluid/inference/api/paddle_api.h

@@ -146,6 +146,9 @@ class ZeroCopyTensor {
   bool input_or_output_;
   friend class AnalysisPredictor;
   void *scope_{nullptr};
+  // The corresponding tensor pointer inside Paddle workspace is cached for
+  // performance.
+  mutable void *tensor_{nullptr};
 };

 /** A simple Inference API for Paddle.

@@ -167,18 +170,40 @@ class PaddlePredictor {
                    std::vector<PaddleTensor>* output_data,
                    int batch_size = -1) = 0;

-  /** Zero copy input and output optimization.
-   * Get the input or output tensors, and operate on their memory directly,
-   * without copy.
+  /** \brief Get a mutable tensor directly.
+   *
+   * NOTE Only works in AnalysisPredictor.
+   *
+   * One can also use this to modify any temporary variable related tensors in
+   * the predictor.
+   *
   */
   virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
       const std::string& name) {
     return nullptr;
   }
+
+  /**
+   * \brief Get an immutable tensor without copy.
+   *
+   * NOTE Only works in AnalysisPredictor.
+   * One can use this API to get any temporary tensors in the predictor and
+   * read it.
+   */
   virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
       const std::string& name) {
     return nullptr;
   }
+
+  /**
+   * \brief Run the predictor with zero-copied inputs and outputs.
+   *
+   * NOTE Only works in AnalysisPredictor.
+   *
+   * This will save the IO copy for transfering inputs and outputs to predictor
+   * workspace and get some performance improvement.
+   * To use it, one should call the `AnalysisConfig.SwitchUseFeedFetchOp(true)`
+   * and then use the `GetInputTensor` and `GetOutputTensor` to directly write
+   * or read the input/output tensors.
+   */
   virtual bool ZeroCopyRun() { return false; }

   /** Clone a predictor that share the model weights, the Cloned predictor
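Following the new doc comments, a zero-copy inference pass would look roughly like the sketch below. It uses only signatures visible in this diff (GetInputTensor, GetOutputTensor, Reshape, mutable_data, data, ZeroCopyRun); the tensor names "input" and "output", the include path, and the predictor/config setup (elided) are assumptions for illustration:

#include <algorithm>
#include <vector>
#include "paddle/fluid/inference/api/paddle_api.h"  // assumed include path

void RunZeroCopy(paddle::PaddlePredictor *predictor,
                 const std::vector<float> &data) {
  // Write the input directly into the predictor workspace: no feed-op copy.
  auto input = predictor->GetInputTensor("input");
  input->Reshape({1, static_cast<int>(data.size())});
  float *in_ptr = input->mutable_data<float>(paddle::PaddlePlace::kCPU);
  std::copy(data.begin(), data.end(), in_ptr);

  predictor->ZeroCopyRun();

  // Read the output directly from the predictor workspace: no fetch-op copy.
  auto output = predictor->GetOutputTensor("output");
  paddle::PaddlePlace place;
  int size = 0;
  float *out_ptr = output->data<float>(&place, &size);
  (void)out_ptr;  // consume the `size` results here
}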
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
54f9d44e
...
@@ -84,7 +84,12 @@ inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_te
...
@@ -84,7 +84,12 @@ inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_te
 # MM DNN
 set(MM_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mm_dnn")
 download_model_and_data(${MM_DNN_INSTALL_DIR} "MM_DNN_model.tar.gz" "MM_DNN_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc)
+inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc SERIAL)
+
+# Pyramid DNN
+set(PYRAMID_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/pyramid_dnn")
+download_model_and_data(${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" "PyramidDNN_data.txt.tar.gz")
+inference_analysis_api_test(test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} analyzer_pyramid_dnn_tester.cc)

 # text_classification
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
...

paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc  0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {

using contrib::AnalysisConfig;

struct DataRecord {
  std::vector<std::vector<int64_t>> query_basic, query_phrase, title_basic,
      title_phrase;
  std::vector<size_t> lod1, lod2, lod3, lod4;
  size_t batch_iter{0}, batch_size{1}, num_samples;  // total number of samples
  DataRecord() = default;
  explicit DataRecord(const std::string &path, int batch_size = 1)
      : batch_size(batch_size) {
    Load(path);
  }
  DataRecord NextBatch() {
    DataRecord data;
    size_t batch_end = batch_iter + batch_size;
    // NOTE skip the final batch, if not enough data is provided.
    if (batch_end <= query_basic.size()) {
      GetInputPerBatch(query_basic, &data.query_basic, &data.lod1, batch_iter,
                       batch_end);
      GetInputPerBatch(query_phrase, &data.query_phrase, &data.lod2, batch_iter,
                       batch_end);
      GetInputPerBatch(title_basic, &data.title_basic, &data.lod3, batch_iter,
                       batch_end);
      GetInputPerBatch(title_phrase, &data.title_phrase, &data.lod4, batch_iter,
                       batch_end);
    }
    batch_iter += batch_size;
    return data;
  }
  void Load(const std::string &path) {
    std::ifstream file(path);
    std::string line;
    int num_lines = 0;
    while (std::getline(file, line)) {
      std::vector<std::string> data;
      split(line, ';', &data);
      // load query data
      std::vector<int64_t> query_basic_data;
      split_to_int64(data[1], ' ', &query_basic_data);
      std::vector<int64_t> query_phrase_data;
      split_to_int64(data[2], ' ', &query_phrase_data);
      // load title data
      std::vector<int64_t> title_basic_data;
      split_to_int64(data[3], ' ', &title_basic_data);
      std::vector<int64_t> title_phrase_data;
      split_to_int64(data[4], ' ', &title_phrase_data);
      // filter the empty data
      bool flag =
          data[1].size() && data[2].size() && data[3].size() && data[4].size();
      if (flag) {
        query_basic.push_back(std::move(query_basic_data));
        query_phrase.push_back(std::move(query_phrase_data));
        title_basic.push_back(std::move(title_basic_data));
        title_phrase.push_back(std::move(title_phrase_data));
        num_lines++;
      }
    }
    num_samples = num_lines;
  }
};
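Judging from the Load() parser above, each line of the data file carries at least five ';'-separated fields: field 0 is ignored here, and fields 1-4 hold space-separated int64 token ids for query_basic, query_phrase, title_basic and title_phrase respectively; rows with any empty field are skipped. A purely hypothetical example line:

0;12 7 90;3 44;18 6 21 9;5 33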
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
                   int batch_size) {
  PaddleTensor query_basic_tensor, query_phrase_tensor, title_basic_tensor,
      title_phrase_tensor;
  query_basic_tensor.name = "query_basic";
  query_phrase_tensor.name = "query_phrase";
  title_basic_tensor.name = "pos_title_basic";
  title_phrase_tensor.name = "pos_title_phrase";
  auto one_batch = data->NextBatch();
  // assign data
  TensorAssignData<int64_t>(&query_basic_tensor, one_batch.query_basic,
                            one_batch.lod1);
  TensorAssignData<int64_t>(&query_phrase_tensor, one_batch.query_phrase,
                            one_batch.lod2);
  TensorAssignData<int64_t>(&title_basic_tensor, one_batch.title_basic,
                            one_batch.lod3);
  TensorAssignData<int64_t>(&title_phrase_tensor, one_batch.title_phrase,
                            one_batch.lod4);
  // Set inputs.
  input_slots->assign({query_basic_tensor, query_phrase_tensor,
                       title_basic_tensor, title_phrase_tensor});
  for (auto &tensor : *input_slots) {
    tensor.dtype = PaddleDType::INT64;
  }
}

void SetConfig(contrib::AnalysisConfig *cfg) {
  cfg->SetModel(FLAGS_infer_model);
  cfg->DisableGpu();
  cfg->SwitchSpecifyInputNames();
  cfg->SwitchIrOptim();
}

void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
  std::vector<PaddleTensor> input_slots;
  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
  for (int bid = 0; bid < epoch; ++bid) {
    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
    (*inputs).emplace_back(input_slots);
  }
}
// Easy for profiling independently.
TEST(Analyzer_Pyramid_DNN, profile) {
  contrib::AnalysisConfig cfg;
  SetConfig(&cfg);
  std::vector<PaddleTensor> outputs;

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                 input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
    size_t size = GetSize(outputs[0]);
    PADDLE_ENFORCE_GT(size, 0);
    float *result = static_cast<float *>(outputs[0].data.data());
    // output is probability, which is in (0, 1).
    for (size_t i = 0; i < size; i++) {
      EXPECT_GT(result[i], 0);
      EXPECT_LT(result[i], 1);
    }
  }
}

// Check the fuse status
TEST(Analyzer_Pyramid_DNN, fuse_statis) {
  contrib::AnalysisConfig cfg;
  SetConfig(&cfg);

  int num_ops;
  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
  auto fuse_statis = GetFuseStatis(
      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
}

// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_Pyramid_DNN, compare) {
  contrib::AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareNativeAndAnalysis(
      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}

// Compare Deterministic result
TEST(Analyzer_Pyramid_DNN, compare_determine) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                       input_slots_all);
}

}  // namespace inference
}  // namespace paddle
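These tests are driven by the shared gflags from tester_helper.h that appear above (FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, FLAGS_test_all_data, FLAGS_num_threads), so the test binary is pointed at the downloaded model and data at run time, e.g. `--infer_model=.../pyramid_dnn/model --infer_data=.../pyramid_dnn/data.txt` (paths hypothetical).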
paddle/fluid/operators/CMakeLists.txt

...

@@ -65,7 +65,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
...

paddle/fluid/operators/controlflow/while_op.cc

...

@@ -58,7 +58,6 @@ class WhileOp : public framework::OperatorBase {
  void RunImpl(const framework::Scope &scope,
               const platform::Place &dev_place) const override {
    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
    auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
    PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));

...

@@ -73,27 +72,18 @@ class WhileOp : public framework::OperatorBase {
    PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
                   "Condition of while op must in CPU memory.");

-   bool is_test = Attr<bool>("is_test");
    auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
    VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);

+   bool is_test = Attr<bool>("is_test");
    auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
-   while (cond.data<bool>()[0]) {
-     auto &current_scope = scope.NewScope();
-     step_scopes->push_back(&current_scope);
-     executor.RunPreparedContext(ctx.get(), &current_scope, false, true,
-                                 true);
-     if (is_test) {
-       scope.DeleteScope(&current_scope);
-     }
-   }
+   if (!is_test) {
+     while (cond.data<bool>()[0]) {
+       auto &current_scope = scope.NewScope();
+       step_scopes->push_back(&current_scope);
+       executor.RunPreparedContext(ctx.get(), &current_scope, false, true,
+                                   true);
+     }
+   } else {
+     auto &current_scope = scope.NewScope();
+     executor.CreateVariables(*program, &current_scope, block->ID());
+     while (cond.data<bool>()[0]) {
+       executor.RunPreparedContext(ctx.get(), &current_scope, false, false,
+                                   false);
+     }
+     scope.DeleteScope(&current_scope);
+   }
  }
};
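In the restructured flow above, the training path is unchanged (one fresh scope per iteration, pushed onto step_scopes for the backward pass), while the is_test path now creates a single scope once, materializes the block's variables up front via CreateVariables, and runs every iteration with the executor's remaining boolean arguments set to false, so no per-iteration scope or variable creation happens inside the inference loop; the scope is deleted once the condition turns false.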
...

paddle/fluid/operators/distributed/CMakeLists.txt

...

@@ -7,7 +7,7 @@ if(WITH_GRPC)
 else()
   set(cc_generic_services "true")
 endif()
-configure_file(send_recv.proto.in ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto @ONLY)
+configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)

 # FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")

...

@@ -19,8 +19,8 @@ if(WITH_GRPC)
     variable_response.cc
     collective_client.cc collective_server.cc
     ${GRPC_SRCS}
-    PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto
-    DEPS lod_tensor selected_rows_functor memory ${GRPC_DEPS})
+    PROTO send_recv.proto
+    DEPS lod_tensor selected_rows_functor memory)
 set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS})
...

paddle/fluid/operators/gru_op.cc

...

@@ -137,6 +137,10 @@ class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
"(bool, defalut: False) "
"(bool, defalut: False) "
"whether to compute reversed GRU."
)
"whether to compute reversed GRU."
)
.
SetDefault
(
false
);
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"origin_mode"
,
"bool"
"use origin mode in article https://arxiv.org/abs/1412.3555"
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
GRU Operator implements part calculations of the complete GRU as following:
GRU Operator implements part calculations of the complete GRU as following:
...
@@ -221,6 +225,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
...
@@ -221,6 +225,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
 public:
  void BatchCompute(const framework::ExecutionContext& context) const {
    using DeviceContext = paddle::platform::CPUDeviceContext;
+   bool origin_mode = context.Attr<bool>("origin_mode");
    auto* input = context.Input<LoDTensor>("Input");
    auto* h0 = context.Input<Tensor>("H0");
    auto* weight = context.Input<Tensor>("Weight");
@@ -327,7 +332,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
...
@@ -327,7 +332,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
math
::
detail
::
forward_final_output
(
math
::
detail
::
forward_final_output
(
math
::
detail
::
forward
::
gru_finalOutput
<
T
>
(),
gru_value
,
frame_size
,
math
::
detail
::
forward
::
gru_finalOutput
<
T
>
(),
gru_value
,
frame_size
,
cur_batch_size
,
active_node
);
cur_batch_size
,
active_node
,
origin_mode
);
gru_value
.
prev_out_value
=
gru_value
.
output_value
;
gru_value
.
prev_out_value
=
gru_value
.
output_value
;
}
}
...

@@ -351,7 +356,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
        math::GRUUnitFunctor<DeviceContext, T>::compute(
            dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-           active_gate);
+           active_gate, origin_mode);
        gru_value.prev_out_value = gru_value.output_value;
      }
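For reference (a summary added here, not part of the diff): writing the update gate as u_t, the candidate state as \tilde{h}_t and the previous state as h_{t-1}, the two output conventions toggled by origin_mode are

h_t = u_t \odot \tilde{h}_t + (1 - u_t) \odot h_{t-1}   % origin_mode = False, Paddle's existing convention
h_t = u_t \odot h_{t-1} + (1 - u_t) \odot \tilde{h}_t   % origin_mode = True, as in Cho et al., arXiv:1406.1078

which is exactly what the gru_finalOutput kernels further below compute.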
...

paddle/fluid/operators/gru_op.cu.cc

...

@@ -21,6 +21,7 @@ template <typename DeviceContext, typename T>
class GRUKernel : public framework::OpKernel<T> {
 public:
  void BatchCompute(const framework::ExecutionContext& context) const {
+   bool origin_mode = context.Attr<bool>("origin_mode");
    auto* input = context.Input<LoDTensor>("Input");
    auto* h0 = context.Input<Tensor>("H0");
    auto* weight = context.Input<Tensor>("Weight");
...

@@ -87,7 +88,7 @@ class GRUKernel : public framework::OpKernel<T> {
      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
      math::GRUUnitFunctor<DeviceContext, T>::compute(
          dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-         active_gate);
+         active_gate, origin_mode);
      gru_value.prev_out_value = gru_value.output_value;
    }
...

paddle/fluid/operators/gru_op.h

...

@@ -41,6 +41,7 @@ template <typename DeviceContext, typename T>
class GRUGradKernel : public framework::OpKernel<T> {
 public:
  void BatchCompute(const framework::ExecutionContext& context) const {
+   bool origin_mode = context.Attr<bool>("origin_mode");
    auto* h0 = context.Input<Tensor>("H0");
    auto* weight = context.Input<Tensor>("Weight");
    const T* weight_data = weight->data<T>();
...

@@ -146,7 +147,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
        math::GRUUnitGradFunctor<DeviceContext, T>::compute(
            dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size,
-           active_node, active_gate);
+           active_node, active_gate, origin_mode);
      }
      if (input_grad) {
        input_grad->mutable_data<T>(context.GetPlace());
...

paddle/fluid/operators/gru_unit_op.cc

...

@@ -111,6 +111,13 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
"The activation type used in update gate and reset gate."
)
"The activation type used in update gate and reset gate."
)
.
SetDefault
(
sigmoid
)
.
SetDefault
(
sigmoid
)
.
InEnum
({
identity
,
sigmoid
,
tanh
,
relu
});
.
InEnum
({
identity
,
sigmoid
,
tanh
,
relu
});
AddAttr
<
bool
>
(
"origin_mode"
,
"bool"
"use origin mode in article <Learning Phrase Representations "
"using RNN Encoder–Decoder
\n
"
"for Statistical Machine "
"Translation>(https://arxiv.org/pdf/1406.1078.pdf)"
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
GRUUnit Operator implements partial calculations of the GRU unit as following:
GRUUnit Operator implements partial calculations of the GRU unit as following:
...

paddle/fluid/operators/gru_unit_op.h

...

@@ -113,7 +113,11 @@ class GRUUnitKernel : public framework::OpKernel<T> {
    auto c = g.slice(c_offsets, extents);  // output candidate
    // calculate final output
-   h.device(place) = u * (c - h_p) + h_p;
+   if (context.Attr<bool>("origin_mode")) {
+     h.device(place) = c + u * (h_p - c);  // (1 - u) * c + u * h_p
+   } else {
+     h.device(place) = u * (c - h_p) + h_p;  // u * c + (1 - u) * h_p
+   }
  }
};
...

@@ -180,11 +184,19 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
    auto c = g.slice(c_offsets, extents);  // output candidate
    // backward for unactivated update gate
-   ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
-                  d_g.slice(u_offsets, extents), d_h * (c - h_p));
-   // backward for unactivated output candidate
-   ActGradCompute(context.Attr<int>("activation"), place, c, c,
-                  d_g.slice(c_offsets, extents), d_h * u);
+   if (context.Attr<bool>("origin_mode")) {
+     ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
+                    d_g.slice(u_offsets, extents), d_h * (h_p - c));
+     // backward for unactivated output candidate
+     ActGradCompute(context.Attr<int>("activation"), place, c, c,
+                    d_g.slice(c_offsets, extents), d_h * (1 - u));
+   } else {
+     ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
+                    d_g.slice(u_offsets, extents), d_h * (c - h_p));
+     // backward for unactivated output candidate
+     ActGradCompute(context.Attr<int>("activation"), place, c, c,
+                    d_g.slice(c_offsets, extents), d_h * u);
+   }
    // backward for reset_hidden_prev
    auto blas = math::GetBlas<DeviceContext, T>(context);
    blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
...

@@ -213,7 +225,11 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
      T* hidden_prev_grad_data =
          hidden_prev_grad->mutable_data<T>(context.GetPlace());
      auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
-     d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
+     if (context.Attr<bool>("origin_mode")) {
+       d_h_p.device(place) = d_r_h_p * r + d_h * u;
+     } else {
+       d_h_p.device(place) = d_r_h_p * r + d_h * (1 - u);
+     }
      blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
                gate_grad_data, frame_size * 3, weight_data, frame_size * 2, 1,
                hidden_prev_grad_data, frame_size);
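The origin_mode branches above follow from differentiating the two conventions (a derivation added here for clarity): with \delta_h = \partial L / \partial h,

% default mode, h = u \odot \tilde{h} + (1 - u) \odot h_p:
\partial L/\partial u = \delta_h \odot (\tilde{h} - h_p), \quad
\partial L/\partial \tilde{h} = \delta_h \odot u, \quad
\partial L/\partial h_p \mathrel{+}= \delta_h \odot (1 - u)

% origin mode, h = u \odot h_p + (1 - u) \odot \tilde{h}:
\partial L/\partial u = \delta_h \odot (h_p - \tilde{h}), \quad
\partial L/\partial \tilde{h} = \delta_h \odot (1 - u), \quad
\partial L/\partial h_p \mathrel{+}= \delta_h \odot u

matching the d_h * (c - h_p) / d_h * (h_p - c) and d_h * u / d_h * (1 - u) terms fed into ActGradCompute.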
...

paddle/fluid/operators/math/CMakeLists.txt

...

@@ -60,6 +60,7 @@ math_library(matrix_bit_code)
 math_library(unpooling)
 math_library(vol2col)
 math_library(prelu)
+math_library(tree2col DEPS math_function)
 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
 cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
...

paddle/fluid/operators/math/detail/gru_cpu_kernel.h

...

@@ -56,7 +56,8 @@ template <class OpFinalOutput, typename T>
void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
                                       T *gate_value, T *prev_output_value,
                                       T *output_value, int frame_size,
-                                      ActivationType active_node) {
+                                      ActivationType active_node,
+                                      bool origin_mode) {
  T r_value_update_gate;
  T r_value_frame_state;
  T r_prev_out = 0;

...

@@ -72,7 +73,7 @@ void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
  }
  op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
-                 &r_output, active_node);
+                 &r_output, active_node, origin_mode);
  frame_state[i] = r_value_frame_state;
  output_value[i] = r_output;

...

@@ -146,7 +147,8 @@ template <class OpFinalOutput, typename T>
void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
                                     T *gate_value, T *prev_output_value,
                                     T *output_value, int frame_size,
-                                    ActivationType active_node) {
+                                    ActivationType active_node,
+                                    bool origin_mode) {
#ifdef __AVX__
  __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f);
  __m256 r_value_frame_state, r_value_frame_state_last = _mm256_set1_ps(0.0f);

...

@@ -180,7 +182,7 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
  }
  op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
-                 &r_output, active_node);
+                 &r_output, active_node, origin_mode);
  _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
                   r_value_frame_state);

...

@@ -190,7 +192,7 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
  if (rest > 0) {
    i = n - block;
    op_final_output(&r_value_update_gate_last, &r_value_frame_state_last,
-                   &r_prev_out_last, &r_output, active_node);
+                   &r_prev_out_last, &r_output, active_node, origin_mode);
    _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
                     r_value_frame_state_last);

...

@@ -227,17 +229,18 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
template <class OpFinalOutput, typename T>
inline void forward_final_output(OpFinalOutput op_final_output,
                                 GRUMetaValue<T> value, int frame_size,
-                                int batch_size, ActivationType active_node) {
+                                int batch_size, ActivationType active_node,
+                                bool origin_mode) {
  for (int b = 0; b < batch_size; b++) {
    if (OpFinalOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
        (sizeof(T) == 4)) {
      hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
                                      value.prev_out_value, value.output_value,
-                                     frame_size, active_node);
+                                     frame_size, active_node, origin_mode);
    } else {
      hl_naive_gru_forward_final_output(
          op_final_output, value.gate_value, value.prev_out_value,
-         value.output_value, frame_size, active_node);
+         value.output_value, frame_size, active_node, origin_mode);
    }
    value.gate_value += frame_size * 3;

...

@@ -253,7 +256,8 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
                                      T *gate_grad, T *prev_out_value,
                                      T *prev_out_grad, T *output_grad,
                                      int frame_size,
-                                     ActivationType active_node) {
+                                     ActivationType active_node,
+                                     bool origin_mode) {
  T r_update_gate_value;
  T r_update_gate_grad;
  T r_frame_state_value;

...

@@ -279,7 +283,7 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
    op_state_grad(&r_update_gate_value, &r_update_gate_grad,
                  &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
-                 &r_prev_out_grad, &r_out_grad, active_node);
+                 &r_prev_out_grad, &r_out_grad, active_node, origin_mode);
    update_gate_grad[i] = r_update_gate_grad;
    frame_state_grad[i] = r_frame_state_grad;

...

@@ -338,8 +342,8 @@ template <class OpStateGrad, typename T>
void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
                                    T *gate_grad, T *prev_out_value,
                                    T *prev_out_grad, T *output_grad,
-                                   int frame_size, ActivationType active_node) {
+                                   int frame_size, ActivationType active_node,
+                                   bool origin_mode) {
#ifdef __AVX__
  __m256 r_update_gate_value;
  __m256 r_update_gate_grad;

...

@@ -368,7 +372,7 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
    op_state_grad(&r_update_gate_value, &r_update_gate_grad,
                  &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
-                 &r_prev_out_grad, &r_out_grad, active_node);
+                 &r_prev_out_grad, &r_out_grad, active_node, origin_mode);
    update_gate_grad[i] = r_update_gate_grad;
    frame_state_grad[i] = r_frame_state_grad;

...

@@ -431,16 +435,18 @@ template <class OpStateGrad, typename T>
inline void backward_state_grad(OpStateGrad op_state_grad,
                                GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                                int frame_size, int batch_size,
-                               ActivationType active_node) {
+                               ActivationType active_node, bool origin_mode) {
  for (int b = 0; b < batch_size; b++) {
    if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
      hl_avx_gru_backward_state_grad(
          op_state_grad, value.gate_value, grad.gate_grad,
          value.prev_out_value, grad.prev_out_grad, grad.output_grad,
-         frame_size, active_node);
+         frame_size, active_node, origin_mode);
    } else {
      hl_naive_gru_backward_state_grad(
          op_state_grad, value.gate_value, grad.gate_grad,
          value.prev_out_value, grad.prev_out_grad, grad.output_grad,
-         frame_size, active_node);
+         frame_size, active_node, origin_mode);
    }
    value.gate_value += frame_size * 3;
...

paddle/fluid/operators/math/detail/gru_gpu_kernel.h

...

@@ -72,7 +72,8 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
                                        T *gate_value, T *prev_output_value,
                                        T *output_value, int frame_size,
                                        int batch_size,
-                                       ActivationType active_node) {
+                                       ActivationType active_node,
+                                       bool origin_mode) {
  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frame_idx >= frame_size) return;
  int batch_idx = 0;

...

@@ -94,7 +95,7 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
  }
  op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
-                 &r_output, active_node);
+                 &r_output, active_node, origin_mode);
  gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
  output_value[frame_idx] = r_output;

...

@@ -109,7 +110,8 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
                                       T *gate_grad, T *prev_out_value,
                                       T *prev_out_grad, T *output_grad,
                                       int frame_size, int batch_size,
-                                      ActivationType active_node) {
+                                      ActivationType active_node,
+                                      bool origin_mode) {
  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frame_idx >= frame_size) return;
  int batch_idx = 0;

...

@@ -139,7 +141,7 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
  op_state_grad(&r_update_gate_value, &r_update_gate_grad, &r_frame_state_value,
                &r_frame_state_grad, &r_prev_out_value, &r_prev_out_grad,
-               &r_out_grad, active_node);
+               &r_out_grad, active_node, origin_mode);
  gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
  gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad;
...

paddle/fluid/operators/math/detail/gru_kernel.h

...

@@ -57,10 +57,16 @@ class gru_finalOutput {
 public:
  HOSTDEVICE void operator()(T *value_update_gate, T *value_frame_state,
                             T *prev_out, T *value_output,
-                            ActivationType act_input) {
+                            ActivationType act_input, bool origin_mode) {
    *value_frame_state = activation(*value_frame_state, act_input);
-   *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
-                   ((*value_update_gate) * (*value_frame_state));
+   if (origin_mode) {
+     *value_output = ((*value_update_gate) * (*prev_out)) +
+                     *value_frame_state -
+                     ((*value_update_gate) * (*value_frame_state));
+   } else {
+     *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
+                     ((*value_update_gate) * (*value_frame_state));
+   }
  }
#ifndef __NVCC__
#ifndef __AVX__

...

@@ -69,11 +75,20 @@ class gru_finalOutput {
  static const bool avx = true;
  HOSTDEVICE void operator()(__m256 *value_update_gate,
                             __m256 *value_frame_state, __m256 *prev_out,
-                            __m256 *value_output, ActivationType act_input) {
+                            __m256 *value_output, ActivationType act_input,
+                            bool origin_mode) {
    *value_frame_state = activation(*value_frame_state, act_input);
-   *value_output = _mm256_add_ps(
-       _mm256_sub_ps(*prev_out, _mm256_mul_ps(*value_update_gate, *prev_out)),
-       _mm256_mul_ps(*value_update_gate, *value_frame_state));
+   if (origin_mode) {
+     *value_output = _mm256_sub_ps(
+         _mm256_add_ps(_mm256_mul_ps(*value_update_gate, *prev_out),
+                       *value_frame_state),
+         _mm256_mul_ps(*value_update_gate, *value_frame_state));
+   } else {
+     *value_output = _mm256_add_ps(
+         _mm256_sub_ps(*prev_out,
+                       _mm256_mul_ps(*value_update_gate, *prev_out)),
+         _mm256_mul_ps(*value_update_gate, *value_frame_state));
+   }
  }
#endif
#endif

...

@@ -88,13 +103,23 @@ class gru_stateGrad {
  HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate,
                             T *value_frame_state, T *grad_frame_state,
                             T *value_prev_out, T *grad_prev_out,
-                            T *grad_output, ActivationType act_input) {
-   *grad_update_gate = (*grad_output * (*value_frame_state));
-   *grad_update_gate -= (*grad_output * (*value_prev_out));
-   *grad_prev_out -= (*grad_output * (*value_update_gate));
-   *grad_prev_out += *grad_output;
-   *grad_frame_state = activation(*grad_output * (*value_update_gate),
-                                  *value_frame_state, act_input);
+                            T *grad_output, ActivationType act_input,
+                            bool origin_mode) {
+   if (origin_mode) {
+     *grad_update_gate =
+         (*grad_output) * ((*value_prev_out) - (*value_frame_state));
+     *grad_prev_out += (*grad_output * (*value_update_gate));
+     *grad_frame_state = activation(
+         *grad_output * (static_cast<T>(1.0) - (*value_update_gate)),
+         *value_frame_state, act_input);
+   } else {
+     *grad_update_gate =
+         (*grad_output) * ((*value_frame_state) - (*value_prev_out));
+     *grad_prev_out +=
+         (*grad_output * (static_cast<T>(1.0) - *value_update_gate));
+     *grad_frame_state = activation(*grad_output * (*value_update_gate),
+                                    *value_frame_state, act_input);
+   }
  }
#ifndef __NVCC__
#ifndef __AVX__

...

@@ -106,17 +131,27 @@ class gru_stateGrad {
                             __m256 *value_frame_state,
                             __m256 *grad_frame_state, __m256 *value_prev_out,
                             __m256 *grad_prev_out, __m256 *grad_output,
-                            ActivationType act_input) {
-   *grad_update_gate = _mm256_mul_ps(*grad_output, *value_frame_state);
-   *grad_update_gate = _mm256_sub_ps(
-       *grad_update_gate, _mm256_mul_ps(*grad_output, *value_prev_out));
-   *grad_prev_out = _mm256_add_ps(
-       _mm256_sub_ps(*grad_prev_out,
-                     _mm256_mul_ps(*grad_output, *value_update_gate)),
-       *grad_output);
-   *grad_frame_state =
-       activation(_mm256_mul_ps(*grad_output, *value_update_gate),
-                  *value_frame_state, act_input);
+                            ActivationType act_input, bool origin_mode) {
+   if (origin_mode) {
+     *grad_update_gate = _mm256_mul_ps(
+         *grad_output, _mm256_sub_ps(*value_prev_out, *value_frame_state));
+     *grad_prev_out = _mm256_add_ps(
+         *grad_prev_out, _mm256_mul_ps(*grad_output, *value_update_gate));
+     *grad_frame_state =
+         activation(_mm256_mul_ps(*grad_output,
+                                  _mm256_sub_ps(_mm256_set1_ps(1.0f),
+                                                *value_update_gate)),
+                    *value_frame_state, act_input);
+   } else {
+     *grad_update_gate = _mm256_mul_ps(
+         *grad_output, _mm256_sub_ps(*value_frame_state, *value_prev_out));
+     *grad_prev_out = _mm256_add_ps(
+         *grad_prev_out,
+         _mm256_mul_ps(*grad_output, _mm256_sub_ps(_mm256_set1_ps(1.0f),
+                                                   *value_update_gate)));
+     *grad_frame_state =
+         activation(_mm256_mul_ps(*grad_output, *value_update_gate),
+                    *value_frame_state, act_input);
+   }
  }
#endif
#endif
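A quick consistency check of the expressions above against the conventions stated earlier (arithmetic added here): the default branch computes h_p - u \odot h_p + u \odot \tilde{h} = (1 - u) \odot h_p + u \odot \tilde{h}, and the origin_mode branch computes u \odot h_p + \tilde{h} - u \odot \tilde{h} = u \odot h_p + (1 - u) \odot \tilde{h}, so the scalar and AVX paths implement the same two formulas.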
...

paddle/fluid/operators/math/gru_compute.cc

...

@@ -23,7 +23,8 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
  static void compute(const platform::CPUDeviceContext &context,
                      GRUMetaValue<T> value, int frame_size, int batch_size,
                      const detail::ActivationType active_node,
-                     const detail::ActivationType active_gate) {
+                     const detail::ActivationType active_gate,
+                     bool origin_mode) {
#ifndef __NVCC__
    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
    if (value.prev_out_value) {

...

@@ -43,7 +44,8 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
    }
    detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
-                                frame_size, batch_size, active_node);
+                                frame_size, batch_size, active_node,
+                                origin_mode);
#endif
  }
};

...

@@ -54,10 +56,12 @@ struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
                      GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                      int frame_size, int batch_size,
                      const detail::ActivationType active_node,
-                     const detail::ActivationType active_gate) {
+                     const detail::ActivationType active_gate,
+                     bool origin_mode) {
#ifndef __NVCC__
    detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
-                               grad, frame_size, batch_size, active_node);
+                               grad, frame_size, batch_size, active_node,
+                               origin_mode);
    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
    if (value.prev_out_value && grad.prev_out_grad) {
      blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
...

paddle/fluid/operators/math/gru_compute.cu

...

@@ -24,7 +24,8 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
  static void compute(const platform::CUDADeviceContext &context,
                      GRUMetaValue<T> value, int frame_size, int batch_size,
                      const detail::ActivationType active_node,
-                     const detail::ActivationType active_gate) {
+                     const detail::ActivationType active_gate,
+                     bool origin_mode) {
    auto stream = context.stream();
    dim3 threads;
    dim3 grid;

...

@@ -73,14 +74,14 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
          T><<<grid, threads, 0, stream>>>(
          detail::forward::gru_finalOutput<T>(), value.gate_value,
          value.prev_out_value, value.output_value, frame_size, batch_size,
-         active_node);
+         active_node, origin_mode);
    } else {
      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
                                      /* is_batch= */ true,
                                      T><<<grid, threads, 0, stream>>>(
          detail::forward::gru_finalOutput<T>(), value.gate_value,
          value.prev_out_value, value.output_value, frame_size, batch_size,
-         active_node);
+         active_node, origin_mode);
    }
  }
};

...

@@ -91,7 +92,8 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
                      GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                      int frame_size, int batch_size,
                      const detail::ActivationType active_node,
-                     const detail::ActivationType active_gate) {
+                     const detail::ActivationType active_gate,
+                     bool origin_mode) {
    auto stream = context.stream();
    dim3 threads;
    dim3 grid;

...

@@ -111,14 +113,14 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
                                     /* is_batch= */ false><<<grid, threads, 0, stream>>>(
          detail::backward::gru_stateGrad<T>(), value.gate_value,
          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-         grad.output_grad, frame_size, batch_size, active_node);
+         grad.output_grad, frame_size, batch_size, active_node, origin_mode);
    } else {
      detail::KeGruBackwardStateGrad<
          detail::backward::gru_stateGrad<T>,
          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
          detail::backward::gru_stateGrad<T>(), value.gate_value,
          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-         grad.output_grad, frame_size, batch_size, active_node);
+         grad.output_grad, frame_size, batch_size, active_node, origin_mode);
    }
    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
...

paddle/fluid/operators/math/gru_compute.h

...

@@ -44,7 +44,8 @@ struct GRUUnitFunctor {
  static void compute(const DeviceContext &context, GRUMetaValue<T> value,
                      int frame_size, int batch_size,
                      const detail::ActivationType active_node,
-                     const detail::ActivationType active_gate);
+                     const detail::ActivationType active_gate,
+                     bool origin_mode);
};

template <typename DeviceContext, typename T>

...

@@ -52,7 +53,8 @@ struct GRUUnitGradFunctor {
  static void compute(const DeviceContext &context, GRUMetaValue<T> value,
                      GRUMetaGrad<T> grad, int frame_size, int batch_size,
                      const detail::ActivationType active_node,
-                     const detail::ActivationType active_gate);
+                     const detail::ActivationType active_gate,
+                     bool origin_mode);
};

}  // namespace math
...

paddle/fluid/operators/math/tree2col.cc  0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/math/tree2col.h"
#include <deque>
#include <stack>
namespace
paddle
{
namespace
operators
{
namespace
math
{
using
Tensor
=
framework
::
Tensor
;
std
::
vector
<
TreeNode
>
Tree2ColUtil
::
construct_patch
(
size_t
root
,
int
max_depth
,
const
std
::
vector
<
std
::
vector
<
int
>>
&
tr
)
{
std
::
stack
<
TreeNode
,
std
::
deque
<
TreeNode
>>
stack
;
std
::
unordered_map
<
int
,
bool
>
visited
;
std
::
vector
<
TreeNode
>
patch
;
stack
.
push
(
TreeNode
(
root
,
1
,
1
,
0
));
patch
.
emplace_back
(
TreeNode
(
root
,
1
,
1
,
0
));
visited
[
root
]
=
true
;
while
(
!
stack
.
empty
())
{
TreeNode
&
u
=
stack
.
top
();
bool
end
=
true
;
size_t
node
=
u
.
get_node
(),
sz
=
tr
[
node
].
size
();
visited
[
node
]
=
true
;
for
(
size_t
i
=
0
;
i
<
sz
;
i
++
)
{
size_t
v
=
tr
[
node
][
i
];
if
(
!
visited
[
v
]
&&
static_cast
<
int
>
(
u
.
get_depth
())
+
1
<
max_depth
)
{
visited
[
v
]
=
true
;
stack
.
push
(
TreeNode
(
v
,
i
,
sz
,
u
.
get_depth
()
+
1
));
patch
.
push_back
(
TreeNode
(
v
,
i
+
1
,
sz
,
u
.
get_depth
()
+
1
));
end
=
false
;
}
}
if
(
end
)
{
stack
.
pop
();
}
}
return
patch
;
}
void
Tree2ColUtil
::
construct_tree
(
const
paddle
::
Tensor
&
EdgeSet
,
std
::
vector
<
std
::
vector
<
int
>>
*
tr
,
size_t
*
node_count
)
{
auto
edge_set_dims
=
EdgeSet
.
dims
();
PADDLE_ENFORCE_EQ
(
edge_set_dims
[
1
],
2
);
int64_t
edge_count
=
EdgeSet
.
numel
();
const
int
*
edge_data
=
EdgeSet
.
data
<
int
>
();
for
(
int64_t
i
=
0
;
i
<
edge_count
;
i
+=
2
)
{
int
u
=
edge_data
[
i
],
v
=
edge_data
[
i
+
1
];
if
(
u
!=
0
&&
v
!=
0
)
(
*
node_count
)
++
;
}
(
*
node_count
)
++
;
tr
->
resize
(
static_cast
<
size_t
>
(
*
node_count
+
1
));
for
(
int64_t
i
=
0
;
i
<
edge_count
;
i
+=
2
)
{
int
u
=
edge_data
[
i
],
v
=
edge_data
[
i
+
1
];
if
(
u
!=
0
&&
v
!=
0
)
{
tr
->
at
(
u
).
push_back
(
v
);
}
else
{
break
;
}
}
}
template
<
typename
T
>
class
Tree2ColFunctor
<
platform
::
CPUDeviceContext
,
T
>
{
public:
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
Tensor
&
EdgeSet
,
const
framework
::
Tensor
&
node_features
,
framework
::
Tensor
*
patch
,
int
max_depth
)
{
std
::
vector
<
std
::
vector
<
int
>>
tr
;
auto
feature_dims
=
node_features
.
dims
();
auto
cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
context
.
GetPlace
());
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
T
>
constant
;
int64_t
feature_size
=
feature_dims
[
1
];
size_t
patch_elem_size
=
3
*
static_cast
<
size_t
>
(
feature_size
);
size_t
node_count
=
0
,
patch_count = 0, patch_size;
    Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count);
    std::vector<std::vector<TreeNode>> processing_list;
    for (size_t u = 1; u <= node_count; u++) {
      std::vector<TreeNode> temp_patch =
          Tree2ColUtil::construct_patch(u, max_depth, tr);
      if (!temp_patch.empty()) {
        processing_list.emplace_back(temp_patch);
      }
    }
    patch_size = processing_list.size();

    T* patch_data =
        patch->mutable_data<T>({static_cast<int64_t>(patch_size),
                                static_cast<int64_t>(patch_elem_size)},
                               cpu_place);
    constant(context, patch, 0);
    const T* features = node_features.data<T>();

    for (auto& patch_item : processing_list) {
      size_t pointer_base = patch_count * patch_elem_size;
      for (auto& v : patch_item) {
        T eta_l = v.eta_l<T>(max_depth), eta_r = v.eta_r<T>(max_depth),
          eta_t = v.eta_t<T>(max_depth);
        size_t id = v.get_node() - 1;
        for (int i = 0; i < feature_size; i++) {
          patch_data[pointer_base + i * 3] +=
              eta_l * features[id * feature_size + i];
          patch_data[pointer_base + i * 3 + 1] +=
              eta_r * features[id * feature_size + i];
          patch_data[pointer_base + i * 3 + 2] +=
              eta_t * features[id * feature_size + i];
        }
      }
      patch_count++;
    }
    patch->Resize({static_cast<int64_t>(patch_count),
                   static_cast<int64_t>(patch_elem_size)});
  }
};

template <typename T>
class Col2TreeFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& EdgeSet,
                  const framework::Tensor& out_grad,
                  framework::Tensor* in_grad, int max_depth) {
    std::vector<std::vector<int>> tr;
    auto output_dims = out_grad.dims();
    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
    math::SetConstant<platform::CPUDeviceContext, T> constant;
    int64_t output_size = output_dims[1];
    size_t grad_elem_size = 3 * static_cast<size_t>(output_size);
    size_t node_count = 0, grad_count = 0;
    Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count);

    std::vector<std::vector<TreeNode>> processing_list;
    std::vector<std::vector<TreeNode>> grad_list;
    grad_list.resize(node_count);
    for (size_t u = 1; u <= node_count; u++) {
      std::vector<TreeNode> tmp =
          Tree2ColUtil::construct_patch(u, max_depth, tr);
      if (!tmp.empty()) {
        processing_list.push_back(tmp);
      }
    }
    for (size_t patch_id = 0; patch_id < processing_list.size(); patch_id++) {
      for (auto v : processing_list[patch_id]) {
        grad_list[v.get_node() - 1].push_back(v.change_node(patch_id + 1));
      }
    }

    T* grad_data =
        in_grad->mutable_data<T>({static_cast<int64_t>(node_count),
                                  static_cast<int64_t>(grad_elem_size)},
                                 cpu_place);
    constant(context, in_grad, 0);
    const T* out_g = out_grad.data<T>();

    for (auto& patch_item : grad_list) {
      size_t pointer_base = grad_count * grad_elem_size;
      for (auto& v : patch_item) {
        T eta_l = v.eta_l<T>(max_depth), eta_r = v.eta_r<T>(max_depth),
          eta_t = v.eta_t<T>(max_depth);
        size_t id = v.get_node() - 1;
        for (int i = 0; i < output_size; i++) {
          grad_data[pointer_base + i * 3] +=
              eta_l * out_g[id * output_size + i];
          grad_data[pointer_base + i * 3 + 1] +=
              eta_r * out_g[id * output_size + i];
          grad_data[pointer_base + i * 3 + 2] +=
              eta_t * out_g[id * output_size + i];
        }
      }
      grad_count++;
    }
  }
};

template class Tree2ColFunctor<platform::CPUDeviceContext, float>;
template class Tree2ColFunctor<platform::CPUDeviceContext, double>;
template class Col2TreeFunctor<platform::CPUDeviceContext, float>;
template class Col2TreeFunctor<platform::CPUDeviceContext, double>;
}  // namespace math
}  // namespace operators
}  // namespace paddle
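To make the patch layout concrete, the following is a minimal NumPy sketch of the accumulation the CPU Tree2ColFunctor performs; `patches` (a list of lists of `(node_id, eta_l, eta_r, eta_t)` tuples) and `features` are hypothetical stand-ins for `processing_list` and `node_features`, not names from the commit.

import numpy as np

def tree2col_cpu_sketch(patches, features):
    # Accumulate eta-weighted node features into [num_patches, 3 * feature_size],
    # mirroring patch_data[pointer_base + i * 3 + {0, 1, 2}] above.
    feature_size = features.shape[1]
    patch = np.zeros((len(patches), 3 * feature_size), dtype=features.dtype)
    for row, items in enumerate(patches):
        for node_id, eta_l, eta_r, eta_t in items:
            f = features[node_id]          # feature vector of one tree node
            patch[row, 0::3] += eta_l * f  # left-weighted channel
            patch[row, 1::3] += eta_r * f  # right-weighted channel
            patch[row, 2::3] += eta_t * f  # top-weighted channel
    return patch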
paddle/fluid/operators/math/tree2col.cu
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stack>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/tree2col.h"
namespace paddle {
namespace operators {
namespace math {
using Tensor = framework::Tensor;
using Node = paddle::operators::math::TreeNode;

template <typename T>
__global__ void tree2col(const T* eta, const int* node, const int* index,
                         const T* vectors, T* result, int feature_size,
                         int n) {
  const int thread_id =
      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  const int patch_id = thread_id / feature_size;
  const int j = thread_id % feature_size;
  if (patch_id < n) {
    const int begin_o = patch_id * 3 * feature_size;
    const int begin = index[patch_id * 2], end = index[patch_id * 2 + 1];
    T res_l = 0, res_r = 0, res_t = 0;
    for (int i = begin; i < end; i++) {
      const int id = node[i];
      const T vec = vectors[id * feature_size + j];
      res_l += eta[i * 3] * vec;
      res_r += eta[i * 3 + 1] * vec;
      res_t += eta[i * 3 + 2] * vec;
    }
    result[begin_o + j * 3] = res_l;
    result[begin_o + j * 3 + 1] = res_r;
    result[begin_o + j * 3 + 2] = res_t;
  }
}

template <typename T>
class Tree2ColFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const paddle::platform::CUDADeviceContext& context,
                  const framework::Tensor& EdgeSet,
                  const framework::Tensor& node_features,
                  framework::Tensor* patch, int max_depth) {
    std::vector<std::vector<int>> tr;
    auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
    auto cpu_place = platform::CPUPlace();
    auto stream = context.stream();
    auto feature_dims = node_features.dims();
    math::SetConstant<platform::CUDADeviceContext, T> constant;

    Tensor EdgeSet_cpu;
    framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu);
    int64_t feature_size = feature_dims[1];
    size_t patch_elem_size = 3 * static_cast<size_t>(feature_size);
    size_t node_count = 0, patch_count = 0, total_size = 0;
    size_t max_size = feature_dims[0];
    Tree2ColUtil::construct_tree(EdgeSet_cpu, &tr, &node_count);

    std::vector<std::vector<Node>> processing_list;
    for (size_t u = 1; u <= node_count; u++) {
      std::vector<Node> tmp = Tree2ColUtil::construct_patch(u, max_depth, tr);
      if (!tmp.empty()) {
        processing_list.push_back(tmp);
        total_size += tmp.size();
      }
    }
    size_t patch_size = processing_list.size();

    Tensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu;
    int* node = node_cpu.mutable_data<int>({static_cast<int64_t>(total_size)},
                                           cpu_place);
    T* eta = eta_cpu.mutable_data<T>({static_cast<int64_t>(total_size * 3)},
                                     cpu_place);
    int* index = index_cpu.mutable_data<int>(
        {static_cast<int64_t>(patch_size * 2)}, cpu_place);
    int idx = 0, index_idx = 0;
    for (auto& tmp : processing_list) {
      index[index_idx++] = idx;
      for (auto& v : tmp) {
        node[idx] = static_cast<int>(v.node - 1);
        eta[idx * 3] = v.eta_l<T>(max_depth);
        eta[idx * 3 + 1] = v.eta_r<T>(max_depth);
        eta[idx * 3 + 2] = v.eta_t<T>(max_depth);
        idx++;
      }
      index[index_idx++] = idx;
    }
    framework::TensorCopy(node_cpu, gpu_place, context, &node_gpu);
    framework::TensorCopy(eta_cpu, gpu_place, context, &eta_gpu);
    framework::TensorCopy(index_cpu, gpu_place, context, &index_gpu);

    int elem_size = patch_size * feature_size;
    int blocks = (elem_size + 1024 - 1) / 1024;
    int block_x = 512;
    int block_y = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(block_x, block_y);

    patch->mutable_data<T>({static_cast<int64_t>(max_size),
                            static_cast<int64_t>(patch_elem_size)},
                           gpu_place);
    constant(context, patch, 0);
    tree2col<T><<<grid, threads, 0, stream>>>(
        eta_gpu.data<T>(), node_gpu.data<int>(), index_gpu.data<int>(),
        node_features.data<T>(), patch->data<T>(), feature_size, patch_size);
  }
};

template <typename T>
class Col2TreeFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& EdgeSet,
                  const framework::Tensor& patch_grad,
                  framework::Tensor* embedding_grad, int max_depth) {
    std::vector<std::vector<int>> tr;
    auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
    auto cpu_place = platform::CPUPlace();
    auto stream = context.stream();
    auto output_dims = patch_grad.dims();
    math::SetConstant<platform::CUDADeviceContext, T> constant;

    Tensor EdgeSet_cpu;
    framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu);
    int64_t output_size = output_dims[1];
    size_t patch_elem_size = 3 * static_cast<size_t>(output_size);
    size_t node_count = 0, patch_count = 0;
    size_t max_size = output_dims[0];
    Tree2ColUtil::construct_tree(EdgeSet_cpu, &tr, &node_count);

    std::vector<std::vector<Node>> processing_list;
    std::vector<std::vector<Node>> grad_list;
    grad_list.resize(node_count);
    size_t total_size = 0, grad_size = node_count;
    for (size_t u = 1; u <= node_count; u++) {
      std::vector<Node> tmp = Tree2ColUtil::construct_patch(u, max_depth, tr);
      if (!tmp.empty()) {
        processing_list.push_back(tmp);
      }
    }
    for (size_t patch_id = 0; patch_id < processing_list.size(); patch_id++) {
      for (auto v : processing_list[patch_id]) {
        grad_list[v.get_node() - 1].push_back(v.change_node(patch_id + 1));
      }
    }
    for (auto& tmp : grad_list) {
      total_size += tmp.size();
    }

    Tensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu;
    int* node = node_cpu.mutable_data<int>({static_cast<int64_t>(total_size)},
                                           cpu_place);
    T* eta = eta_cpu.mutable_data<T>({static_cast<int64_t>(total_size * 3)},
                                     cpu_place);
    int* index = index_cpu.mutable_data<int>(
        {static_cast<int64_t>(grad_size * 2)}, cpu_place);
    size_t idx = 0, index_idx = 0;
    for (auto& tmp : grad_list) {
      index[index_idx++] = idx;
      for (auto& v : tmp) {
        node[idx] = static_cast<int>(v.node - 1);
        eta[idx * 3] = v.eta_l<T>(max_depth);
        eta[idx * 3 + 1] = v.eta_r<T>(max_depth);
        eta[idx * 3 + 2] = v.eta_t<T>(max_depth);
        idx++;
      }
      index[index_idx++] = idx;
    }
    framework::TensorCopy(node_cpu, gpu_place, &node_gpu);
    framework::TensorCopy(eta_cpu, gpu_place, &eta_gpu);
    framework::TensorCopy(index_cpu, gpu_place, &index_gpu);

    int elem_size = output_size * grad_size;
    int blocks = (elem_size + 1024 - 1) / 1024;
    int block_x = 512;
    int block_y = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(block_x, block_y);

    embedding_grad->mutable_data<T>({static_cast<int64_t>(max_size),
                                     static_cast<int64_t>(patch_elem_size)},
                                    gpu_place);
    constant(context, embedding_grad, 0);
    tree2col<T><<<grid, threads, 0, stream>>>(
        eta_gpu.data<T>(), node_gpu.data<int>(), index_gpu.data<int>(),
        patch_grad.data<T>(), embedding_grad->data<T>(), output_size,
        grad_size);
  }
};

template class Tree2ColFunctor<platform::CUDADeviceContext, float>;
template class Tree2ColFunctor<platform::CUDADeviceContext, double>;
template class Col2TreeFunctor<platform::CUDADeviceContext, float>;
template class Col2TreeFunctor<platform::CUDADeviceContext, double>;
}  // namespace math
}  // namespace operators
}  // namespace paddle
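Before launching the kernel, the CUDA functors above flatten every patch into three host arrays: node ids, one (eta_l, eta_r, eta_t) triple per entry, and per-patch [begin, end) offsets, so that thread (patch_id, j) can scan its slice between index[patch_id * 2] and index[patch_id * 2 + 1]. A minimal sketch of that packing, with hypothetical names:

import numpy as np

def flatten_patches(patches):
    # Pack patches into the CSR-like (node, eta, index) layout consumed by
    # the tree2col kernel: index[2p] and index[2p + 1] bound patch p.
    node, eta, index = [], [], []
    for items in patches:
        index.append(len(node))                # begin offset of this patch
        for node_id, eta_l, eta_r, eta_t in items:
            node.append(node_id)
            eta.extend([eta_l, eta_r, eta_t])  # three weights per entry
        index.append(len(node))                # end offset of this patch
    return (np.asarray(node, dtype=np.int32),
            np.asarray(eta, dtype=np.float32),
            np.asarray(index, dtype=np.int32))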
paddle/fluid/operators/math/tree2col.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <array>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
namespace operators {
namespace math {
class TreeNode {
 public:
  size_t node;
  explicit TreeNode(size_t node = 0, size_t index = 0, size_t pclen = 0,
                    size_t depth = 0)
      : node(node), index(index), pclen(pclen), depth(depth) {}
  template <typename T>
  T eta_t(T filter_depth) {
    return ((filter_depth - this->depth) / filter_depth);
  }
  template <typename T>
  T eta_l(T filter_depth) {
    T temp;
    if (this->pclen == 1) {
      temp = 0.5;
    } else {
      temp = (this->index - 1.0) / (this->pclen - 1.0);
    }
    return (1.0 - this->eta_t<T>(filter_depth)) * temp;
  }
  template <typename T>
  T eta_r(T filter_depth) {
    return (1.0 - this->eta_t<T>(filter_depth)) *
           (1.0 - this->eta_l<T>(filter_depth));
  }
  TreeNode change_node(size_t v) {
    return TreeNode(v, this->index, this->pclen, this->depth);
  }
  size_t get_node() { return this->node; }
  size_t get_depth() { return this->depth; }

 private:
  size_t index, pclen, depth;
};

class Tree2ColUtil {
 public:
  static std::vector<TreeNode> construct_patch(
      size_t root, int max_depth, const std::vector<std::vector<int>>& tr);

  static void construct_tree(const Tensor& EdgeSet,
                             std::vector<std::vector<int>>* tr,
                             size_t* node_count);
};

template <typename DeviceContext, typename T>
class Tree2ColFunctor {
 public:
  void operator()(const DeviceContext& context,
                  const framework::Tensor& EdgeSet,
                  const framework::Tensor& node_features,
                  framework::Tensor* patch, int max_depth);
};

template <typename DeviceContext, typename T>
class Col2TreeFunctor {
 public:
  void operator()(const DeviceContext& context,
                  const framework::Tensor& EdgeSet,
                  const framework::Tensor& out_grad,
                  framework::Tensor* in_grad, int max_depth);
};
}  // namespace math
}  // namespace operators
}  // namespace paddle
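The three weights defined by TreeNode implement the continuous binary tree interpolation from the TBCNN paper: eta_t decays with depth inside the patch, while eta_l and eta_r split the remainder according to the node's position among its siblings. A small worked sketch of the same formulas (function and variable names are illustrative, not from the commit):

def eta_weights(index, pclen, depth, filter_depth):
    # Same arithmetic as TreeNode::eta_t / eta_l / eta_r: index is the node's
    # 1-based position among its siblings, pclen the sibling count, depth its
    # depth inside the patch.
    eta_t = (filter_depth - depth) / filter_depth
    eta_l = (1.0 - eta_t) * (0.5 if pclen == 1 else
                             (index - 1.0) / (pclen - 1.0))
    eta_r = (1.0 - eta_t) * (1.0 - eta_l)
    return eta_l, eta_r, eta_t

# The patch root (depth 0, a lone "sibling") gets all top weight:
assert eta_weights(1, 1, 0, 2.0) == (0.0, 0.0, 1.0)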
paddle/fluid/operators/tree_conv_op.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/tree_conv_op.h"
#include <string>
namespace paddle {
namespace operators {
class TreeConvOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("NodesVector",
             "(Tensor) The feature vector of every node on the tree. "
             "The shape of the feature vector must be "
             "[max_tree_node_size, feature_size].");
    AddInput("EdgeSet",
             "(Tensor) The Edges of Tree. The edge must be directional. "
             "The shape of the edge set must be [max_tree_node_size, 2].");
    AddInput("Filter",
             "(Tensor) The feature detector. "
             "The shape of the filter is "
             "[feature_size, 3, output_size, num_filters].");
    AddOutput("Out",
              "(Tensor) The feature vector of subtrees. "
              "The shape of the output tensor is [max_tree_node_size, "
              "output_size, num_filters]. "
              "The output tensor could be a new feature "
              "vector for next tree convolution layers.");
    AddAttr<int>("max_depth",
                 "(int, default: 2) The depth of feature detector.")
        .SetDefault(2)
        .GreaterThan(1);
    AddComment(R"DOC(
**Tree-Based Convolution Operator**

Tree-Based Convolution is a kind of convolution based on tree structure.
Tree-Based Convolution is a part of Tree-Based Convolution Neural Network(TBCNN),
which is used to classify tree structures, such as Abstract Syntax Tree.
Tree-Based Convolution proposed a kind of data structure called continuous binary tree,
which regards multiway tree as binary tree.
The paper of Tree-Based Convolution Operator is here:
https://arxiv.org/abs/1409.5718v1
)DOC");
  }
};

class TreeConvOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasOutput("Out"));
    auto edge_dims = ctx->GetInputDim("EdgeSet");
    auto vector_dims = ctx->GetInputDim("NodesVector");
    auto filter_dims = ctx->GetInputDim("Filter");
    PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2");
    PADDLE_ENFORCE_EQ(edge_dims.size(), 3,
                      "The dimension of EdgeSet Tensor should be 3");
    PADDLE_ENFORCE_EQ(vector_dims.size(), 3,
                      "The dimension of NodesVector Tensor should be 3");
    PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
                      "The dimension of Filter Tensor should be 4");
    PADDLE_ENFORCE_EQ(filter_dims[1], 3, "Input(Filter) dim[1] should be 3");
    PADDLE_ENFORCE_EQ(
        filter_dims[0], vector_dims[2],
        "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]");
    auto output_dims = framework::make_ddim(
        {vector_dims[0], vector_dims[1], filter_dims[2], filter_dims[3]});
    ctx->SetOutputDim("Out", output_dims);
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(ctx.Input<Tensor>("NodesVector")->type(),
                                   ctx.device_context());
  }
};

class TreeConvGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    auto vectors_dims = ctx->GetInputDim("NodesVector");
    auto filter_dims = ctx->GetInputDim("Filter");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "the gradient of output(Out) must not be null");
    if (ctx->HasOutput(framework::GradVarName("Filter"))) {
      ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
    }
    if (ctx->HasOutput(framework::GradVarName("NodesVector"))) {
      ctx->SetOutputDim(framework::GradVarName("NodesVector"), vectors_dims);
    }
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(ctx.Input<Tensor>("NodesVector")->type(),
                                   ctx.device_context());
  }
};
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(tree_conv, ops::TreeConvOp, ops::TreeConvOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(tree_conv_grad, ops::TreeConvGradOp);
REGISTER_OP_CPU_KERNEL(
    tree_conv, ops::TreeConvKernel<paddle::platform::CPUDeviceContext, float>,
    ops::TreeConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    tree_conv_grad,
    ops::TreeConvGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::TreeConvGradKernel<paddle::platform::CPUDeviceContext, double>);
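As a quick sanity check of the InferShape contract above, under assumed sizes (a hedged sketch, not part of the commit):

# NodesVector: [batch, max_tree_node_size, feature_size]
# Filter:      [feature_size, 3, output_size, num_filters]
vector_dims = (2, 10, 5)    # batch = 2, 10 nodes, feature_size = 5
filter_dims = (5, 3, 6, 4)  # 3 continuous-binary-tree channels, out = 6, 4 filters
assert filter_dims[0] == vector_dims[2]  # enforced by PADDLE_ENFORCE_EQ above
out_dims = (vector_dims[0], vector_dims[1], filter_dims[2], filter_dims[3])
assert out_dims == (2, 10, 6, 4)         # [batch, nodes, output_size, num_filters]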
paddle/fluid/operators/tree_conv_op.cu
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/tree_conv_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    tree_conv, ops::TreeConvKernel<paddle::platform::CUDADeviceContext, float>,
    ops::TreeConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    tree_conv_grad,
    ops::TreeConvGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::TreeConvGradKernel<paddle::platform::CUDADeviceContext, double>);
paddle/fluid/operators/tree_conv_op.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/tree2col.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;

template <typename DeviceContext, typename T>
class TreeConvKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    math::Tree2ColFunctor<DeviceContext, T> tree2col;
    math::SetConstant<DeviceContext, T> constant;

    auto* Edges = ctx.Input<Tensor>("EdgeSet");
    auto* Embeddings = ctx.Input<Tensor>("NodesVector");
    auto* Filter = ctx.Input<Tensor>("Filter");
    auto* output_emb = ctx.Output<Tensor>("Out");
    int max_depth = ctx.Attr<int>("max_depth");

    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);

    Tensor W;
    W.ShareDataWith(*Filter);
    W.Resize(framework::flatten_to_2d(Filter->dims(), 2));

    int batch_size = static_cast<int>(Edges->dims()[0]);
    int n = static_cast<int>(Embeddings->dims()[1]);
    int out_size = static_cast<int>(Filter->dims()[2]);
    int num_filters = static_cast<int>(Filter->dims()[3]);
    output_emb->mutable_data<T>({batch_size, n, out_size, num_filters},
                                ctx.GetPlace());

    auto edge_set_slicedim = framework::slice_ddim(
        Edges->dims(), 1, static_cast<int>(Edges->dims().size()));
    auto embedding_slicedim = framework::slice_ddim(
        Embeddings->dims(), 1, static_cast<int>(Embeddings->dims().size()));
    auto output_slicedim = framework::slice_ddim(
        output_emb->dims(), 1, static_cast<int>(output_emb->dims().size()));
    output_slicedim = framework::flatten_to_2d(output_slicedim, 1);

    for (int idx = 0; idx < batch_size; idx++) {
      auto edge_set = Edges->Slice(idx, idx + 1).Resize(edge_set_slicedim);
      auto embeddings =
          Embeddings->Slice(idx, idx + 1).Resize(embedding_slicedim);
      auto out_vec = output_emb->Slice(idx, idx + 1).Resize(output_slicedim);
      Tensor patch;
      tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth);
      constant(dev_ctx, &out_vec, 0);
      blas.MatMul(patch, W, &out_vec);
    }
  }
};

template <typename DeviceContext, typename T>
class TreeConvGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out_g = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto* in_g = ctx.Output<Tensor>(framework::GradVarName("NodesVector"));
    auto* filter_g = ctx.Output<Tensor>(framework::GradVarName("Filter"));
    int max_depth = ctx.Attr<int>("max_depth");
    auto* Embeddings = ctx.Input<Tensor>("NodesVector");
    auto* edges = ctx.Input<Tensor>("EdgeSet");
    auto* Filter = ctx.Input<Tensor>("Filter");
    math::Tree2ColFunctor<DeviceContext, T> tree2col;
    math::Col2TreeFunctor<DeviceContext, T> col2tree;
    math::SetConstant<DeviceContext, T> constant;
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);

    Tensor W;
    W.ShareDataWith(*Filter);
    W.Resize(framework::flatten_to_2d(Filter->dims(), 1));

    int batch_size = static_cast<int>(Embeddings->dims()[0]);

    auto edge_set_slicedim = framework::slice_ddim(
        edges->dims(), 1, static_cast<int>(edges->dims().size()));
    auto embedding_slicedim = framework::slice_ddim(
        Embeddings->dims(), 1, static_cast<int>(Embeddings->dims().size()));
    auto out_grad_dims = framework::slice_ddim(
        out_g->dims(), 1, static_cast<int>(out_g->dims().size()));
    out_grad_dims = framework::flatten_to_2d(out_grad_dims, 1);

    if (filter_g) {
      filter_g->mutable_data<T>(Filter->dims(), ctx.GetPlace());
      Tensor f_g;
      f_g.ShareDataWith(*filter_g);
      f_g.Resize(framework::flatten_to_2d(Filter->dims(), 2));
      constant(dev_ctx, filter_g, 0);
      for (int batch_id = 0; batch_id < batch_size; batch_id++) {
        auto edge_set =
            edges->Slice(batch_id, batch_id + 1).Resize(edge_set_slicedim);
        auto embeddings = Embeddings->Slice(batch_id, batch_id + 1)
                              .Resize(embedding_slicedim);
        auto out_grad =
            out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims);
        Tensor patch;
        tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth);
        blas.MatMul(patch, true, out_grad, false, T(1.0), &f_g, T(1.0));
      }
    }
    if (in_g) {
      auto input_grad_dims = framework::slice_ddim(
          in_g->dims(), 1, static_cast<int>(in_g->dims().size()));
      in_g->mutable_data<T>(Embeddings->dims(), ctx.GetPlace());
      constant(dev_ctx, in_g, 0);
      for (int batch_id = 0; batch_id < batch_size; batch_id++) {
        auto edge_set =
            edges->Slice(batch_id, batch_id + 1).Resize(edge_set_slicedim);
        auto out_grad =
            out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims);
        auto in_grad =
            in_g->Slice(batch_id, batch_id + 1).Resize(input_grad_dims);
        Tensor in_grad_temp;
        col2tree(dev_ctx, edge_set, out_grad, &in_grad_temp, max_depth);
        blas.MatMul(in_grad_temp, false, W, true, &in_grad);
      }
    }
  }
};
}  // namespace operators
}  // namespace paddle
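Per batch element, TreeConvKernel reduces tree convolution to a single GEMM: tree2col yields a patch matrix of shape [num_patches, 3 * feature_size], and the filter is flattened to [3 * feature_size, output_size * num_filters]. A hedged NumPy sketch of that step (helper names are illustrative):

import numpy as np

def tree_conv_forward_sketch(patch, filter_w):
    # patch:    [num_patches, 3 * feature_size], output of tree2col
    # filter_w: [feature_size, 3, output_size, num_filters]
    fs, _, out, nf = filter_w.shape
    W2d = filter_w.reshape(fs * 3, out * nf)  # flatten_to_2d(Filter->dims(), 2)
    out_vec = patch @ W2d                     # blas.MatMul(patch, W, &out_vec)
    return out_vec.reshape(patch.shape[0], out, nf)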
python/paddle/fluid/async_executor.py
...
@@ -200,7 +200,6 @@ class AsyncExecutor(object):
                              local_path,
                              self.instance.get_worker_index(),
                              self.instance.get_node_cnt() / 2,
-                             file_cnt,
                              multi_processes=process_num)
         self.instance.barrier_worker()  #wait for download_data
...
python/paddle/fluid/layers/nn.py
...
@@ -183,6 +183,7 @@ __all__ = [
     'psroi_pool',
     'teacher_student_sigmoid_loss',
     'huber_loss',
+    'tree_conv',
 ]

 kIgnoreIndex = -100
...
@@ -864,12 +865,14 @@ def dynamic_gru(input,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
-                h_0=None):
+                h_0=None,
+                origin_mode=False):
     """
     **Gated Recurrent Unit (GRU) Layer**

-    Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on
-    Sequence Modeling <https://arxiv.org/abs/1412.3555>`_ .
+    if origin_mode is False, then the equation of a gru step is from paper
+    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
+    Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_ .

     The formula is as follows:
...
@@ -883,6 +886,21 @@ def dynamic_gru(input,
         h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}

+    if origin_mode is True, then the equation is from paper
+    `Learning Phrase Representations using RNN Encoder-Decoder for Statistical
+    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
+
+    .. math::
+
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+
+        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+
     The :math:`\odot` is the element-wise product of the vectors. :math:`act_g`
     is the update gate and reset gate activation function and :math:`sigmoid`
     is usually used for it. :math:`act_c` is the activation function for
...
@@ -980,7 +998,8 @@ def dynamic_gru(input,
         attrs={
             'is_reverse': is_reverse,
             'gate_activation': gate_activation,
-            'activation': candidate_activation
+            'activation': candidate_activation,
+            'origin_mode': origin_mode
         })
     return hidden
...
@@ -991,9 +1010,14 @@ def gru_unit(input,
              param_attr=None,
              bias_attr=None,
              activation='tanh',
-             gate_activation='sigmoid'):
+             gate_activation='sigmoid',
+             origin_mode=False):
     """
-    GRU unit layer. The equation of a gru step is:
+    **GRU unit layer**
+
+    if origin_mode is True, then the equation of a gru step is from paper
+    `Learning Phrase Representations using RNN Encoder-Decoder for Statistical
+    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_

         .. math::
             u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
...
@@ -1002,7 +1026,21 @@ def gru_unit(input,
             m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)

-            h_t & = dot((1-u_t), m_t) + dot(u_t, h_{t-1})
+            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
+
+    if origin_mode is False, then the equation of a gru step is from paper
+    `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
+    Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
+
+        .. math::
+            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
+
+            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
+
+            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
+
+            h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)

     The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms
     of the equation above, the :math:`z_t` is split into 3 parts -
...
@@ -9893,3 +9931,73 @@ def huber_loss(input, label, delta):
             'Residual': residual},
         attrs={'delta': delta})
     return out
+
+
+@templatedoc()
+def tree_conv(nodes_vector,
+              edge_set,
+              output_size,
+              num_filters=1,
+              max_depth=2,
+              act='tanh',
+              param_attr=None,
+              bias_attr=None,
+              name=None):
+    """
+    ${comment}
+
+    Args:
+        nodes_vector(${nodes_vector_type}): ${nodes_vector_comment}
+        edge_set(${edge_set_type}): ${edge_set_comment}
+        output_size(int): output feature width
+        num_filters(int): number of filters, Default 1
+        max_depth(int): max depth of filters, Default 2
+        act(str): activation function, Default tanh
+        param_attr(ParamAttr): the parameter attribute for the filters, Default None
+        bias_attr(ParamAttr): the parameter attribute for the bias of this layer, Default None
+        name(str): a name of this layer(optional). If set None, the layer will be named automatically, Default None
+
+    Returns:
+        out(${out_type}): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+          nodes_vector = layers.data(name='vectors', shape=[None, 10, 5], dtype='float32')
+          # None for batch size, 10 for max_node_size of dataset, 5 for vector width
+          edge_set = layers.data(name='edge_set', shape=[None, 10, 2], dtype='float32')
+          # None for batch size, 10 for max_node_size of dataset, 2 for every edge has two nodes
+          # edges must be directional
+          out_vector = layers.tree_conv(nodes_vector, edge_set, 6, 1, 2, 'tanh',
+              ParamAttr(initializer=Constant(1.0)), ParamAttr(initializer=Constant(1.0)))
+          # the shape of output will be [None, 10, 6, 1],
+          # None for batch size, 10 for max_node_size of dataset, 6 for output size, 1 for 1 filter
+          out_vector = layers.reshape(out_vector, shape=[-1, 10, 6])
+          # after reshape, the output tensor can be nodes_vector for the next tree convolution
+          out_vector_2 = layers.tree_conv(out_vector, edge_set, 3, 4, 2, 'tanh',
+              ParamAttr(initializer=Constant(1.0)), ParamAttr(initializer=Constant(1.0)))
+          # the output tensor can also be pooled (the pooling in the paper is called global pooling)
+          pooled = layers.reduce_max(out_vector, dim=2)  # global pooling
+    """
+    helper = LayerHelper("tree_conv", **locals())
+    dtype = helper.input_dtype('nodes_vector')
+    feature_size = nodes_vector.shape[2]
+    W_shape = [feature_size, 3, output_size, num_filters]
+    W = helper.create_parameter(
+        attr=param_attr, shape=W_shape, dtype=dtype, is_bias=False)
+    if name is None:
+        out = helper.create_variable_for_type_inference(dtype=dtype)
+    else:
+        out = helper.create_variable(name=name, dtype=dtype, persistable=False)
+    helper.append_op(
+        type='tree_conv',
+        inputs={'NodesVector': nodes_vector,
+                'EdgeSet': edge_set,
+                'Filter': W},
+        outputs={'Out': out, },
+        attrs={'max_depth': max_depth})
+    if helper.bias_attr:
+        pre_activation = helper.append_bias_op(out)
+    else:
+        pre_activation = out
+    return helper.append_activation(pre_activation)
...
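The only difference between the two GRU modes touched by this change is which side of the update gate mixes the candidate state with the previous hidden state; a short NumPy check mirroring the dynamic_gru docstring above (a sketch, not library code):

import numpy as np

u = np.array([0.2, 0.8])        # update gate
c = np.array([1.0, 1.0])        # candidate state \tilde{h_t}
h_prev = np.array([0.4, 0.4])   # previous hidden state

h_default = u * c + (1 - u) * h_prev  # origin_mode=False (arXiv:1412.3555)
h_origin = (1 - u) * c + u * h_prev   # origin_mode=True  (arXiv:1406.1078)
# The two modes simply swap the roles of u and (1 - u):
assert np.allclose(h_default + h_origin, c + h_prev)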
python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
...
@@ -231,14 +231,17 @@ def infer(use_cuda, inference_program, params_dirname):
     # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one
     # level of detail info, indicating that `data` consists of two sequences
     # of length 3 and 2, respectively.
-    user_id = fluid.create_lod_tensor([[1]], [[1]], place)
-    gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
-    age_id = fluid.create_lod_tensor([[0]], [[1]], place)
-    job_id = fluid.create_lod_tensor([[10]], [[1]], place)
-    movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
-    category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
-    movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]],
-                                          [[5]], place)
+    user_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)
+    gender_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)
+    age_id = fluid.create_lod_tensor([[np.int64(0)]], [[1]], place)
+    job_id = fluid.create_lod_tensor([[np.int64(10)]], [[1]], place)
+    movie_id = fluid.create_lod_tensor([[np.int64(783)]], [[1]], place)
+    category_id = fluid.create_lod_tensor(
+        [np.array([10, 8, 9], dtype='int64')], [[3]], place)
+    movie_title = fluid.create_lod_tensor(
+        [np.array([1069, 4140, 2923, 710, 988], dtype='int64')], [[5]], place)

     results = inferencer.infer(
         {
...
python/paddle/fluid/tests/book/test_recommender_system.py
...
@@ -271,26 +271,30 @@ def infer(use_cuda, save_dirname=None):
     # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one
     # level of detail info, indicating that `data` consists of two sequences
     # of length 3 and 2, respectively.
-    user_id = fluid.create_lod_tensor([[1]], [[1]], place)
+    user_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)

     assert feed_target_names[1] == "gender_id"
-    gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
+    gender_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)

     assert feed_target_names[2] == "age_id"
-    age_id = fluid.create_lod_tensor([[0]], [[1]], place)
+    age_id = fluid.create_lod_tensor([[np.int64(0)]], [[1]], place)

     assert feed_target_names[3] == "job_id"
-    job_id = fluid.create_lod_tensor([[10]], [[1]], place)
+    job_id = fluid.create_lod_tensor([[np.int64(10)]], [[1]], place)

     assert feed_target_names[4] == "movie_id"
-    movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
+    movie_id = fluid.create_lod_tensor([[np.int64(783)]], [[1]], place)

     assert feed_target_names[5] == "category_id"
-    category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
+    category_id = fluid.create_lod_tensor(
+        [np.array([10, 8, 9], dtype='int64')], [[3]], place)

     assert feed_target_names[6] == "movie_title"
-    movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]],
-                                          [[5]], place)
+    movie_title = fluid.create_lod_tensor(
+        [np.array([1069, 4140, 2923, 710, 988], dtype='int64')], [[5]], place)

     # Construct feed as a dictionary of {feed_target_name: feed_target_data}
     # and results will contain a list of data corresponding to fetch_targets.
...
python/paddle/fluid/tests/unittests/test_auc_op.py
...
@@ -24,7 +24,7 @@ class TestAucOp(OpTest):
     def setUp(self):
         self.op_type = "auc"
         pred = np.random.random((128, 2)).astype("float32")
-        labels = np.random.randint(0, 2, (128, 1))
+        labels = np.random.randint(0, 2, (128, 1)).astype("int64")
         num_thresholds = 200

         stat_pos = np.zeros((num_thresholds + 1, )).astype("int64")
...
python/paddle/fluid/tests/unittests/test_gru_op.py
...
@@ -31,7 +31,8 @@ def gru(
         is_reverse,
         act_state,
         act_gate,
-        dtype='float32'):
+        dtype='float32',
+        origin_mode=False):
     def _seq_to_batch(lod, is_reverse):
         idx_in_seq_list = []
         seq_lens = lod[0]
...
@@ -66,7 +67,10 @@ def gru(
         w_c = w.flatten()[D * D * 2:].reshape((D, D))
         c = act_state(np.dot(r_h_p, w_c) + g[:, D * 2:])
         g = np.hstack((u_r, c))
-        h = u * c + (1 - u) * h_p
+        if origin_mode:
+            h = (1 - u) * c + u * h_p
+        else:
+            h = u * c + (1 - u) * h_p
         return g, r_h_p, h

     T = sum(lod[0])
...
@@ -110,6 +114,7 @@ class TestGRUOp(OpTest):
         self.act_state = 'tanh'
         self.act_gate = 'sigmoid'
         self.dtype = 'float64'
+        self.origin_mode = False
         self.set_confs()

         T = sum(self.lod[0])
...
@@ -126,7 +131,8 @@ class TestGRUOp(OpTest):
         batch_gate, batch_reset_hidden_prev, batch_hidden, hidden = gru(
             input, self.lod, h0, weight, bias, self.is_reverse,
-            ACTIVATION[self.act_state], ACTIVATION[self.act_gate], self.dtype)
+            ACTIVATION[self.act_state], ACTIVATION[self.act_gate], self.dtype,
+            self.origin_mode)
         self.inputs = {'Input': (input, self.lod), 'Weight': weight}

         if self.with_bias:
...
@@ -145,7 +151,8 @@ class TestGRUOp(OpTest):
         self.attrs = {
             'activation': self.act_state,
             'gate_activation': self.act_gate,
-            'is_reverse': self.is_reverse
+            'is_reverse': self.is_reverse,
+            'origin_mode': self.origin_mode
         }

     def test_check_output(self):
...
@@ -155,12 +162,24 @@ class TestGRUOp(OpTest):
         self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])


+class TestGRUOriginMode(TestGRUOp):
+    def set_confs(self):
+        self.origin_mode = True
+
+
 class TestGRUOp2(TestGRUOp):
     def set_confs(self):
         self.D = 19
         self.dtype = 'float32'


+class TestGRUOp2OriginMode(TestGRUOp):
+    def set_confs(self):
+        self.D = 19
+        self.dtype = 'float32'
+        self.origin_mode = True
+
+
 class TestGRUOpNoInitial(TestGRUOp):
     def set_confs(self):
         self.with_h0 = False
...
@@ -182,5 +201,11 @@ class TestGRUOpReverse(TestGRUOp):
         self.is_reverse = True


+class TestGRUOpReverseOriginMode(TestGRUOp):
+    def set_confs(self):
+        self.is_reverse = True
+        self.origin_mode = True
+
+
 if __name__ == "__main__":
     unittest.main()
python/paddle/fluid/tests/unittests/test_gru_unit_op.py
...
@@ -53,7 +53,7 @@ class TestGRUUnitOp(OpTest):
         GRUActivationType.relu: relu,
     }

-    def set_inputs(self):
+    def set_inputs(self, origin_mode=False):
         batch_size = self.batch_size
         frame_size = self.frame_size
         self.op_type = 'gru_unit'
...
@@ -68,10 +68,11 @@ class TestGRUUnitOp(OpTest):
         }
         self.attrs = {
             'activation': GRUActivationType.tanh,
-            'gate_activation': GRUActivationType.sigmoid
+            'gate_activation': GRUActivationType.sigmoid,
+            'origin_mode': origin_mode
         }

-    def set_outputs(self):
+    def set_outputs(self, origin_mode=False):
         # GRU calculations
         batch_size = self.batch_size
         frame_size = self.frame_size
...
@@ -93,7 +94,10 @@ class TestGRUUnitOp(OpTest):
         c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
                                                     g[:, frame_size * 2:])
         g = np.hstack((u_r, c))
-        h = u * c + (1 - u) * h_p
+        if origin_mode:
+            h = (1 - u) * c + u * h_p
+        else:
+            h = u * c + (1 - u) * h_p
         self.outputs = {
             'Gate': g.astype('float64'),
             'ResetHiddenPrev': r_h_p.astype('float64'),
...
@@ -111,8 +115,14 @@ class TestGRUUnitOp(OpTest):
         self.check_grad(['Input', 'HiddenPrev', 'Weight'], ['Hidden'])


+class TestGRUUnitOpOriginMode(TestGRUUnitOp):
+    def setUp(self):
+        self.set_inputs(origin_mode=True)
+        self.set_outputs(origin_mode=True)
+
+
 class TestGRUUnitOpWithBias(TestGRUUnitOp):
-    def set_inputs(self):
+    def set_inputs(self, origin_mode=False):
         batch_size = self.batch_size
         frame_size = self.frame_size
         super(TestGRUUnitOpWithBias, self).set_inputs()
...
@@ -120,7 +130,8 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp):
             -0.1, 0.1, (1, frame_size * 3)).astype('float64')
         self.attrs = {
             'activation': GRUActivationType.identity,
-            'gate_activation': GRUActivationType.sigmoid
+            'gate_activation': GRUActivationType.sigmoid,
+            'origin_mode': origin_mode
         }

     def test_check_grad(self):
...
@@ -132,5 +143,11 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp):
             no_grad_set=set('Input'))


+class TestGRUUnitOpWithBiasOriginMode(TestGRUUnitOpWithBias):
+    def setUp(self):
+        self.set_inputs(origin_mode=True)
+        self.set_outputs(origin_mode=True)
+
+
 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/test_nce.py
...
@@ -68,7 +68,8 @@ class TestNCE(OpTest):
         weight = np.random.randn(num_classes, dim).astype(np.float32)
         bias = np.random.randn(num_classes).astype(np.float32)
         sample_weight = np.random.randn(batch_size).astype(np.float32)
-        labels = np.random.randint(0, num_classes, (batch_size, num_true_class))
+        labels = np.random.randint(0, num_classes,
+                                   (batch_size, num_true_class)).astype("int64")
         self.attrs = {
             'num_total_classes': num_classes,
             'num_neg_samples': num_neg_samples,
...
python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
...
@@ -24,14 +24,14 @@ import os

 def Lenet(data, class_dim):
-    conv1 = fluid.layers.conv2d(data, 32, 5, 1, act=None)
+    conv1 = fluid.layers.conv2d(data, 4, 5, 1, act=None)
     bn1 = fluid.layers.batch_norm(conv1, act='relu')
     pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2)
-    conv2 = fluid.layers.conv2d(pool1, 50, 5, 1, act=None)
+    conv2 = fluid.layers.conv2d(pool1, 16, 5, 1, act=None)
     bn2 = fluid.layers.batch_norm(conv2, act='relu')
     pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2)

-    fc1 = fluid.layers.fc(pool2, size=500, act='relu')
+    fc1 = fluid.layers.fc(pool2, size=50, act='relu')
     fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax')

     return fc2
...
python/paddle/fluid/tests/unittests/test_tree_conv_op.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from op_test import OpTest


def collect_node_patch(og, max_depth):
    """
    The naive method to construct patches
    :param og: original graph
    :param max_depth: the depth of convolution filters
    :return: convolution patches
    """

    def gen(node, max_depth):
        collected = [(node, 1, 1, 0, max_depth)]

        def recurse_helper(node, depth):
            if depth > max_depth:
                return
            l = len(og[node])
            for idx, c in enumerate(og[node], 1):
                if depth + 1 < max_depth:
                    collected.append((c, idx, l, depth + 1, max_depth))
                    recurse_helper(c, depth + 1)

        recurse_helper(node, 0)
        return collected

    res = []
    for u in range(1, len(og)):
        lis = gen(u, max_depth)
        if len(lis) > 0:
            res.append(lis)
    return res


class TestTreeConvOp(OpTest):
    def setUp(self):
        self.n = 17
        self.fea_size = 3
        self.output_size = 1
        self.max_depth = 2
        self.batch_size = 1
        self.num_filters = 1
        adj_array = [
            1, 2, 1, 3, 1, 4, 1, 5, 2, 6, 2, 7, 2, 8, 4, 9, 4, 10, 5, 11, 6,
            12, 6, 13, 9, 14, 9, 15, 9, 16, 9, 17
        ]
        adj = np.array(adj_array).reshape((1, self.n - 1, 2)).astype('int32')
        adj = np.tile(adj, (self.batch_size, 1, 1))
        self.op_type = 'tree_conv'
        vectors = np.random.random(
            (self.batch_size, self.n, self.fea_size)).astype('float32')
        self.inputs = {
            'EdgeSet': adj,
            'NodesVector': vectors,
            'Filter': np.random.random((self.fea_size, 3, self.output_size,
                                        self.num_filters)).astype('float32')
        }
        self.attrs = {'max_depth': self.max_depth}
        vectors = []
        for i in range(self.batch_size):
            vector = self.get_output_naive(i)
            vectors.append(vector)
        self.outputs = {'Out': np.array(vectors).astype('float32'), }

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(
            ['NodesVector', 'Filter'], 'Out', max_relative_error=0.5)

    def get_output_naive(self, batch_id):
        og = [[] for i in range(1, self.n + 2)]
        st = np.array(self.inputs['EdgeSet'][batch_id]).tolist()
        for e in st:
            og[e[0]].append(e[1])
        patches = collect_node_patch(og, self.max_depth)
        W = np.array(self.inputs['Filter']).astype('float32')
        W = np.transpose(W, axes=[1, 0, 2, 3])
        vec = []
        for i, patch in enumerate(patches, 1):
            result = np.zeros((1, W.shape[2], W.shape[3]))
            for v in patch:
                eta_t = float(v[4] - v[3]) / float(v[4])
                eta_l = (1.0 - eta_t) * (0.5 if v[2] == 1 else
                                         float(v[1] - 1.0) / float(v[2] - 1.0))
                eta_r = (1.0 - eta_t) * (1.0 - eta_l)
                x = self.inputs['NodesVector'][batch_id][v[0] - 1]
                eta = np.array([eta_l, eta_r, eta_t]).reshape(
                    (3, 1)).astype('float32')
                Wconvi = np.tensordot(eta, W, axes=([0], [0]))
                x = np.array(x).reshape((1, 1, self.fea_size))
                res = np.tensordot(x, Wconvi, axes=2)
                result = result + res
            vec.append(result)
        vec = np.concatenate(vec, axis=0)
        vec = np.concatenate(
            [
                vec, np.zeros(
                    (self.n - vec.shape[0], W.shape[2], W.shape[3]),
                    dtype='float32')
            ],
            axis=0)
        return vec