PaddlePaddle / Paddle
Commit 54f9d44e
Authored Jan 20, 2019 by liuwei1031

Merge remote-tracking branch 'upstream/develop' into develop

Parents: a4dc3d2b, 62d36ce0

Showing 56 changed files with 1,593 additions and 214 deletions (+1593 -214)
cmake/generic.cmake  +2 -2
paddle/fluid/API.spec  +3 -2
paddle/fluid/framework/details/all_reduce_deps_pass.cc  +3 -3
paddle/fluid/framework/details/all_reduce_op_handle.cc  +4 -4
paddle/fluid/framework/details/broadcast_op_handle.cc  +17 -17
paddle/fluid/framework/details/data_balance_op_handle.cc  +2 -2
paddle/fluid/framework/details/fetch_op_handle.cc  +3 -3
paddle/fluid/framework/details/fuse_vars_op_handle.cc  +2 -2
paddle/fluid/framework/details/gather_op_handle.cc  +8 -8
paddle/fluid/framework/details/memory_early_delete_pass.cc  +2 -2
paddle/fluid/framework/details/multi_devices_graph_print_pass.cc  +4 -4
paddle/fluid/framework/details/reduce_op_handle.cc  +17 -17
paddle/fluid/framework/details/rpc_op_handle.cc  +1 -1
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc  +1 -1
paddle/fluid/framework/details/var_handle.h  +7 -0
paddle/fluid/inference/api/analysis_config.cc  +3 -2
paddle/fluid/inference/api/analysis_predictor.cc  +1 -0
paddle/fluid/inference/api/analysis_predictor.h  +2 -0
paddle/fluid/inference/api/analysis_predictor_tester.cc  +2 -1
paddle/fluid/inference/api/api_impl.cc  +5 -2
paddle/fluid/inference/api/api_impl.h  +2 -0
paddle/fluid/inference/api/details/zero_copy_tensor.cc  +12 -6
paddle/fluid/inference/api/paddle_api.h  +28 -3
paddle/fluid/inference/tests/api/CMakeLists.txt  +6 -1
paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc  +182 -0
paddle/fluid/operators/CMakeLists.txt  +1 -1
paddle/fluid/operators/controlflow/while_op.cc  +6 -16
paddle/fluid/operators/distributed/CMakeLists.txt  +3 -3
paddle/fluid/operators/gru_op.cc  +7 -2
paddle/fluid/operators/gru_op.cu.cc  +2 -1
paddle/fluid/operators/gru_op.h  +2 -1
paddle/fluid/operators/gru_unit_op.cc  +7 -0
paddle/fluid/operators/gru_unit_op.h  +23 -7
paddle/fluid/operators/math/CMakeLists.txt  +1 -0
paddle/fluid/operators/math/detail/gru_cpu_kernel.h  +26 -20
paddle/fluid/operators/math/detail/gru_gpu_kernel.h  +6 -4
paddle/fluid/operators/math/detail/gru_kernel.h  +60 -25
paddle/fluid/operators/math/gru_compute.cc  +8 -4
paddle/fluid/operators/math/gru_compute.cu  +8 -6
paddle/fluid/operators/math/gru_compute.h  +4 -2
paddle/fluid/operators/math/tree2col.cc  +197 -0
paddle/fluid/operators/math/tree2col.cu  +208 -0
paddle/fluid/operators/math/tree2col.h  +90 -0
paddle/fluid/operators/tree_conv_op.cc  +129 -0
paddle/fluid/operators/tree_conv_op.cu  +24 -0
paddle/fluid/operators/tree_conv_op.h  +146 -0
python/paddle/fluid/async_executor.py  +0 -1
python/paddle/fluid/layers/nn.py  +115 -7
python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py  +11 -8
python/paddle/fluid/tests/book/test_recommender_system.py  +12 -8
python/paddle/fluid/tests/unittests/test_auc_op.py  +1 -1
python/paddle/fluid/tests/unittests/test_gru_op.py  +29 -4
python/paddle/fluid/tests/unittests/test_gru_unit_op.py  +23 -6
python/paddle/fluid/tests/unittests/test_nce.py  +2 -1
python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py  +3 -3
python/paddle/fluid/tests/unittests/test_tree_conv_op.py  +120 -0
cmake/generic.cmake
@@ -748,7 +748,7 @@ function(grpc_library TARGET_NAME)
   #FIXME(putcn): the follwoing line is supposed to generate *.pb.h and cc, but
   # somehow it didn't. line 602 to 604 is to patching this. Leaving this here
   # for now to enable dist CI.
-  protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
+  paddle_protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
   set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
   set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
   cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
@@ -791,7 +791,7 @@ function(brpc_library TARGET_NAME)
   get_filename_component(PROTO_WE ${brpc_library_PROTO} NAME_WE)
   get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
-  protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}")
+  paddle_protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}")
   cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}")
   cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}")
 endfunction()
paddle/fluid/API.spec
@@ -70,8 +70,8 @@ paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param
 paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
 paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
 paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
-paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None))
-paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid'))
+paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False))
+paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False))
 paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
@@ -215,6 +215,7 @@ paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', '
 paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0))
 paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.tree_conv ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -82,13 +82,13 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
       PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
                      op1->DebugString(), op2->DebugString());

-      auto l_it = vars.find(i0->name_);
-      auto r_it = vars.find(i1->name_);
+      auto l_it = vars.find(i0->name());
+      auto r_it = vars.find(i1->name());

       if (l_it->second < r_it->second) return true;

       if (l_it->second == r_it->second) {
-        return i0->name_ < i1->name_;
+        return i0->name() < i1->name();
       }

       return false;
paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -70,9 +70,9 @@ void AllReduceOpHandle::RunImpl() {
       auto *s = local_scopes_[i];
       auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
       auto &lod_tensor =
-          local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
+          local_scope.FindVar(in_var_handles[i]->name())->Get<LoDTensor>();
       lod_tensors.emplace_back(&lod_tensor);
-      PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+      PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
                         "The name of input and output should be equal.");
     }
@@ -134,7 +134,7 @@ void AllReduceOpHandle::RunImpl() {
       auto &trg = *this->local_scopes_[0]
                        ->FindVar(kLocalExecScopeName)
                        ->Get<Scope *>()
-                       ->FindVar(out_var_handles[0]->name_)
+                       ->FindVar(out_var_handles[0]->name())
                        ->GetMutable<framework::LoDTensor>();

       // Reduce All Tensor to trg in CPU
@@ -145,7 +145,7 @@ void AllReduceOpHandle::RunImpl() {
         auto &scope =
             *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
         auto &p = places_[i];
-        auto *var = scope.FindVar(out_var_handles[i]->name_);
+        auto *var = scope.FindVar(out_var_handles[i]->name());
         auto *dev_ctx = dev_ctxes_.at(p);

         RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -56,11 +56,11 @@ void BroadcastOpHandle::BroadcastOneVar(
     const std::vector<VarHandle *> &out_var_handles,
     const std::vector<const Scope *> &var_scopes) {
   auto *in_var =
-      var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_);
+      var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name());
   PADDLE_ENFORCE_NOT_NULL(in_var);
   Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
   if (UNLIKELY(!in_tensor.IsInitialized())) {
-    VLOG(3) << "in var " << in_var_handle.name_ << "not inited, return!";
+    VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!";
     return;
   }
@@ -71,9 +71,9 @@ void BroadcastOpHandle::BroadcastOneVar(
       if (out_var_handle->IsTheSameVar(in_var_handle)) {
         continue;
       }
-      auto &out_p = out_var_handle->place_;
-      auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
-                          ->FindVar(out_var_handle->name_);
+      auto &out_p = out_var_handle->place();
+      auto *out_var = var_scopes.at(out_var_handle->scope_idx())
+                          ->FindVar(out_var_handle->name());

       RunAndRecordEvent(out_p, [in_tensor, out_var] {
         paddle::framework::TensorCopy(
@@ -91,11 +91,11 @@ void BroadcastOpHandle::BroadcastOneVar(
     size_t numel = static_cast<size_t>(in_tensor.numel());

     for (auto out_var_handle : out_var_handles) {
-      Variable *out_var = var_scopes.at(out_var_handle->scope_idx_)
-                              ->FindVar(out_var_handle->name_);
+      Variable *out_var = var_scopes.at(out_var_handle->scope_idx())
+                              ->FindVar(out_var_handle->name());

       int dst_id =
-          boost::get<platform::CUDAPlace>(out_var_handle->place_).device;
+          boost::get<platform::CUDAPlace>(out_var_handle->place()).device;

       auto &nccl_ctx = nccl_ctxs_->at(dst_id);
@@ -106,7 +106,7 @@ void BroadcastOpHandle::BroadcastOneVar(
       } else {
         send_recv_buffer = VariableVisitor::GetMutableTensor(out_var)
                                .Resize(in_tensor.dims())
-                               .mutable_data(out_var_handle->place_);
+                               .mutable_data(out_var_handle->place());
       }

       broadcast_calls.emplace_back(
@@ -126,11 +126,11 @@ void BroadcastOpHandle::BroadcastOneVar(
       }

       if (!out_handle->IsTheSameVar(in_var_handle)) {
-        auto out_var = var_scopes.at(in_var_handle.scope_idx_)
-                           ->FindVar(out_var_handles[0]->name_);
+        auto out_var = var_scopes.at(in_var_handle.scope_idx())
+                           ->FindVar(out_var_handles[0]->name());
         paddle::framework::TensorCopy(
-            in_tensor, in_var_handle.place_,
-            *(dev_ctxes_.at(in_var_handle.place_)),
+            in_tensor, in_var_handle.place(),
+            *(dev_ctxes_.at(in_var_handle.place())),
             &VariableVisitor::GetMutableTensor(out_var));
       }
     });
@@ -148,7 +148,7 @@ void BroadcastOpHandle::InitOutputValue(
     var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
   }
   auto *in_var =
-      var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_);
+      var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name());

   Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
@@ -158,9 +158,9 @@ void BroadcastOpHandle::InitOutputValue(
     if (out_var_handle->IsTheSameVar(in_var_handle)) {
       continue;
     }
-    auto t_out_p = out_var_handle->place_;
-    auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
-                        ->FindVar(out_var_handle->name_);
+    auto t_out_p = out_var_handle->place();
+    auto *out_var = var_scopes.at(out_var_handle->scope_idx())
+                        ->FindVar(out_var_handle->name());
     PADDLE_ENFORCE_NOT_NULL(out_var);
     if (is_gpu_place(in_tensor.place())) {
       PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -100,13 +100,13 @@ void DataBalanceOpHandle::RunImpl() {
   std::vector<std::vector<LoDTensor *>> lod_tensors(data_num);
   std::vector<int> device_sizes;
   for (int i = 0; i < static_cast<int>(in_var_handles.size()); ++i) {
-    PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+    PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
                       "The name of input and output should be equal.");
     int place_idx = i / data_num;
     int data_idx = i % data_num;
     auto *local_scope =
         local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get<Scope *>();
-    auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name_);
+    auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name());
     PADDLE_ENFORCE(tensor_var->IsType<LoDTensor>());
     auto *tensor = tensor_var->GetMutable<LoDTensor>();
     lod_tensors[data_idx].push_back(tensor);
paddle/fluid/framework/details/fetch_op_handle.cc
@@ -52,12 +52,12 @@ void FetchOpHandle::RunImpl() {
   for (size_t i = 0; i < inputs_.size(); ++i) {
     auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
-    auto &scope = scopes.at(var_handle->scope_idx_);
+    auto &scope = scopes.at(var_handle->scope_idx());
     auto *var = scope->FindVar(kLocalExecScopeName)
                     ->Get<Scope *>()
-                    ->FindVar(var_handle->name_);
+                    ->FindVar(var_handle->name());
     PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
-                            var_handle->name_);
+                            var_handle->name());

     auto &t = var->Get<framework::LoDTensor>();
     if (platform::is_gpu_place(t.place())) {
paddle/fluid/framework/details/fuse_vars_op_handle.cc
@@ -29,14 +29,14 @@ void FuseVarsOpHandle::RunImpl() {
   auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();

   auto out_var_handle = out_var_handles[0];
-  auto out_var = scope->Var(out_var_handle->name_);
+  auto out_var = scope->Var(out_var_handle->name());

   auto out_tensor = out_var->GetMutable<LoDTensor>();
   out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);

   int64_t s = 0;
   for (size_t i = 1; i < out_var_handles.size(); ++i) {
-    auto out_name = out_var_handles[i]->name_;
+    auto out_name = out_var_handles[i]->name();
     auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
     auto numel = this->inputs_numel_.at(out_name);
     out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
paddle/fluid/framework/details/gather_op_handle.cc
@@ -49,7 +49,7 @@ void GatherOpHandle::RunImpl() {
   auto in_0_handle = in_var_handles[0];
   auto pre_in_var =
-      var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
+      var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
   PADDLE_ENFORCE_NOT_NULL(pre_in_var);

   PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
@@ -65,7 +65,7 @@ void GatherOpHandle::RunImpl() {
   // Gather the inputs
   for (auto *in_handle : in_var_handles) {
     auto *in_var =
-        var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
+        var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
     PADDLE_ENFORCE_NOT_NULL(in_var);
     VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
@@ -77,7 +77,7 @@ void GatherOpHandle::RunImpl() {
   }

   // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
-  platform::Place t_out_p = out_var_handle->place_;
+  platform::Place t_out_p = out_var_handle->place();
   if (platform::is_gpu_place(pre_in_value.place())) {
     PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
                    "Places of input and output must be all on GPU.");
@@ -85,8 +85,8 @@ void GatherOpHandle::RunImpl() {
     t_out_p = platform::CPUPlace();
   }

-  auto out_var =
-      var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_);
+  auto out_var = var_scopes.at(out_var_handle->scope_idx())
+                     ->FindVar(out_var_handle->name());
   PADDLE_ENFORCE_NOT_NULL(out_var);
   auto out_value = out_var->GetMutable<framework::SelectedRows>();
   out_value->set_height(pre_in_value.height());
@@ -99,9 +99,9 @@ void GatherOpHandle::RunImpl() {
   Tensor *out_tensor = out_value->mutable_value();

   // copy
-  auto dev_ctx = dev_ctxes_.at(out_var_handle->place_);
-  RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx,
-                                             t_out_p] {
+  auto dev_ctx = dev_ctxes_.at(out_var_handle->place());
+  RunAndRecordEvent(out_var_handle->place(), [in_tensors, out_tensor, &dev_ctx,
+                                              t_out_p] {
     int s = 0, e = 0;
     for (size_t j = 0; j < in_tensors.size(); ++j) {
       e += in_tensors[j].dims()[0];
paddle/fluid/framework/details/memory_early_delete_pass.cc
@@ -33,7 +33,7 @@ static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) {
     queue.pop();
     for (auto* op : var->PendingOps()) {
      auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
-      if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) {
+      if (compute_op != nullptr && compute_op->GetPlace() == var_in->place()) {
        return compute_op;
      }
      for (auto* out_var : op->Outputs()) {
@@ -64,7 +64,7 @@ std::unique_ptr<ir::Graph> MemoryEarlyDeletePass::ApplyImpl(
     for (auto& var : vars) {
       auto* var_handle = dynamic_cast<VarHandle*>(var);
       auto var_name = var->Node()->Name();
-      auto& var_place = var_handle->place_;
+      auto& var_place = var_handle->place();
       if (unlived_vars.count(var_name) == 0) continue;
       if (!unlived_vars[var_name].empty()) {
         if (compute_op != nullptr &&
paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
@@ -52,11 +52,11 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
     vars[var_ptr] = cur_var_id;

     if (var_handle_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
+      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name()
            << "\\n"
-           << var_handle_ptr->place_ << "\\n"
-           << "scope: " << var_handle_ptr->scope_idx_ << "\\n"
-           << "v" << var_handle_ptr->version_ << "\"]" << std::endl;
+           << var_handle_ptr->place() << "\\n"
+           << "scope: " << var_handle_ptr->scope_idx() << "\\n"
+           << "v" << var_handle_ptr->version() << "\"]" << std::endl;
     } else if (dummy_ptr) {
       sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
     }
paddle/fluid/framework/details/reduce_op_handle.cc
@@ -60,8 +60,8 @@ void ReduceOpHandle::GatherSelectedRows(
       *CollectiveContext::GetInstance();

   // 1. gather local selected rows, merge them
-  std::string gathered_var_name = out_var_handle->name_ + "_gathered_tmp";
-  auto scope = local_scopes_.at(out_var_handle->scope_idx_);
+  std::string gathered_var_name = out_var_handle->name() + "_gathered_tmp";
+  auto scope = local_scopes_.at(out_var_handle->scope_idx());
   auto gathered_var_mid = scope->Var(gathered_var_name);
   auto gathered_select_rows =
       gathered_var_mid->GetMutable<framework::SelectedRows>();
@@ -73,7 +73,7 @@ void ReduceOpHandle::GatherSelectedRows(
   // merge them
   auto merged_dev_ctx = dynamic_cast<DevCtx *>(dev_ctxes.at(out_place));
   std::string merged_var_name =
-      GetRemoteVarName(out_var_handle->name_, collective_context.trainer_id_);
+      GetRemoteVarName(out_var_handle->name(), collective_context.trainer_id_);
   auto merged_select_rows =
       scope->Var(merged_var_name)->GetMutable<SelectedRows>();
   operators::math::scatter::MergeAdd<DevCtx, DataType> merge_func;
@@ -101,7 +101,7 @@ void ReduceOpHandle::GatherSelectedRows(
       operators::distributed::RemoteVar var;
       var.trainer_id_ = i;
-      var.var_name_ = GetRemoteVarName(out_var_handle->name_, i);
+      var.var_name_ = GetRemoteVarName(out_var_handle->name(), i);
       var.ep_ = collective_context.endpoints_[i];

       vars.push_back(var);
@@ -166,7 +166,7 @@ void ReduceOpHandle::RunImpl() {
   }

   auto pre_in_var =
-      var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
+      var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());

   PADDLE_ENFORCE_NOT_NULL(pre_in_var);

   // Wait input done, this Wait is asynchronous operation
@@ -175,15 +175,15 @@ void ReduceOpHandle::RunImpl() {
   // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
   std::vector<platform::Place> in_places;  // used to get dev_ctx
   for (auto *in_handle : in_var_handles) {
-    in_places.emplace_back(in_handle->place_);
+    in_places.emplace_back(in_handle->place());
     auto in_var =
-        var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
+        var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
     PADDLE_ENFORCE_NOT_NULL(in_var);
     VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var);
   }

-  auto out_var =
-      var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_);
+  auto out_var = var_scopes.at(out_var_handle->scope_idx())
+                     ->FindVar(out_var_handle->name());
   PADDLE_ENFORCE_NOT_NULL(out_var);

   // NOTE: The tensors' Place of input and output must be all on GPU or all on
@@ -191,9 +191,9 @@ void ReduceOpHandle::RunImpl() {
   auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place();
   platform::Place t_out_p;
   if (platform::is_gpu_place(in_p)) {
-    PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place_),
+    PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place()),
                    "Places of input and output must be all on GPU.");
-    t_out_p = out_var_handle->place_;
+    t_out_p = out_var_handle->place();
   } else {
     t_out_p = platform::CPUPlace();
   }
@@ -253,7 +253,7 @@ void ReduceOpHandle::RunImpl() {
       auto &reduce_sum_trg = *this->local_scopes_[0]
                                   ->FindVar(kLocalExecScopeName)
                                   ->Get<Scope *>()
-                                  ->FindVar(out_var_handle->name_)
+                                  ->FindVar(out_var_handle->name())
                                   ->GetMutable<framework::LoDTensor>();
       ReduceLoDTensor func(lod_tensors, &reduce_sum_trg);
       VisitDataType(lod_tensors[0]->type(), func);
@@ -269,9 +269,9 @@ void ReduceOpHandle::RunImpl() {
       auto pre_in = pre_in_var->Get<framework::LoDTensor>();
       VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
       VariableVisitor::GetMutableTensor(out_var).mutable_data(
-          out_var_handle->place_, pre_in.type());
+          out_var_handle->place(), pre_in.type());

-      auto out_p = out_var_handle->place_;
+      auto out_p = out_var_handle->place();
       int root_id = boost::get<platform::CUDAPlace>(out_p).device;
       std::vector<std::function<void()>> all_reduce_calls;
       for (size_t i = 0; i < var_scopes.size(); ++i) {
@@ -286,7 +286,7 @@ void ReduceOpHandle::RunImpl() {
         if (root_id == dev_id) {
           recvbuffer =
               out_var->GetMutable<framework::LoDTensor>()->mutable_data(
-                  out_var_handle->place_);
+                  out_var_handle->place());
         }

         int type = platform::ToNCCLDataType(lod_tensor.type());
@@ -320,8 +320,8 @@ std::vector<const T *> ReduceOpHandle::GetInputValues(
     const std::vector<const Scope *> &var_scopes) const {
   std::vector<const T *> in_selected_rows;
   for (auto *in_handle : in_var_handles) {
-    auto &in_sr = var_scopes.at(in_handle->scope_idx_)
-                      ->FindVar(in_handle->name_)
+    auto &in_sr = var_scopes.at(in_handle->scope_idx())
+                      ->FindVar(in_handle->name())
                       ->Get<T>();
     in_selected_rows.emplace_back(&in_sr);
   }
paddle/fluid/framework/details/rpc_op_handle.cc
@@ -30,7 +30,7 @@ RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc,
 void RPCOpHandle::RunImpl() {
   for (auto *in : inputs_) {
-    auto &p = static_cast<VarHandle *>(in)->place_;
+    auto &p = static_cast<VarHandle *>(in)->place();
     if (ir::IsControlDepVar(*in->Node())) {
       continue;
     }
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -68,7 +68,7 @@ struct ScaleLossGradFunctor {
 void ScaleLossGradOpHandle::RunImpl() {
   // Doesn't wait any event
-  std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
+  std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name();
   auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();

   auto *tensor = local_scope.FindVar(var_name)->GetMutable<LoDTensor>();
paddle/fluid/framework/details/var_handle.h
@@ -111,15 +111,22 @@ struct VarHandle : public VarHandleBase {
   // version field currently is not used, however, just store the version to
   // debug easily.
+ private:
   size_t version_;
   size_t scope_idx_;
   std::string name_;
   platform::Place place_;

+ public:
   bool IsTheSameVar(const VarHandle &o) const {
     return o.generated_op_ == generated_op_ && o.name_ == name_ &&
            o.scope_idx_ == scope_idx_;
   }
+
+  size_t version() const { return version_; }
+  size_t scope_idx() const { return scope_idx_; }
+  const std::string &name() const { return name_; }
+  const platform::Place &place() const { return place_; }
 };

 // Dummy Variable. It is used to represent dependencies between operators
paddle/fluid/inference/api/analysis_config.cc
@@ -127,6 +127,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
   use_tensorrt_ = true;
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
+  tensorrt_min_subgraph_size_ = min_subgraph_size;

   Update();
 }
@@ -145,8 +146,8 @@ void contrib::AnalysisConfig::Update() {
       LOG(ERROR)
           << "TensorRT engine is not available when EnableGpu() not actived.";
     } else {
-      // Append after the infer_clean pass.
-      pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
+      // Append after the Affine_channel_conv_fuse pass.
+      pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
     }
   }
paddle/fluid/inference/api/analysis_predictor.cc
@@ -561,6 +561,7 @@ AnalysisPredictor::~AnalysisPredictor() {
 }

 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
+  std::lock_guard<std::mutex> lk(clone_mutex_);
   auto *x = new AnalysisPredictor(config_);
   x->Init(scope_, inference_program_);
   return std::unique_ptr<PaddlePredictor>(x);
paddle/fluid/inference/api/analysis_predictor.h
@@ -115,6 +115,8 @@ class AnalysisPredictor : public PaddlePredictor {
   // concurrency problems, wrong results and memory leak, so cache them.
   std::vector<framework::LoDTensor> feed_tensors_;
   details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
+  // A mutex help to make Clone thread safe.
+  std::mutex clone_mutex_;

  private:
   // Some status here that help to determine the status inside the predictor.
paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -179,8 +179,9 @@ TEST(AnalysisPredictor, Clone) {
     threads.emplace_back([&predictors, &inputs, i] {
       LOG(INFO) << "thread #" << i << " running";
       std::vector<PaddleTensor> outputs;
+      auto predictor = predictors.front()->Clone();
       for (int j = 0; j < 10; j++) {
-        ASSERT_TRUE(predictors[i]->Run(inputs, &outputs));
+        ASSERT_TRUE(predictor->Run(inputs, &outputs));
       }
     });
   }
paddle/fluid/inference/api/api_impl.cc
@@ -161,13 +161,16 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
 }

 std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
+  std::lock_guard<std::mutex> lk(clone_mutex_);
   VLOG(3) << "Predictor::clone";
   std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
-
-  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(scope_)) {
+  // Hot fix the bug that result diff in multi-thread.
+  // TODO(Superjomn) re-implement a real clone here.
+  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
     LOG(ERROR) << "fail to call Init";
     return nullptr;
   }
+
 #ifdef __clang__
   // fix clang compile error
   return cls;
paddle/fluid/inference/api/api_impl.h
@@ -74,6 +74,8 @@ class NativePaddlePredictor : public PaddlePredictor {
   // Do not use unique_ptr, use parent scope to delete
   framework::Scope *sub_scope_{nullptr};
   details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
+  // A mutex to make Clone thread safe.
+  std::mutex clone_mutex_;
 };

 }  // namespace paddle
paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -33,9 +33,15 @@ void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
   tensor->Resize(framework::make_ddim(shape));
 }

+#define EAGER_GET_TENSOR    \
+  if (!tensor_) {           \
+    tensor_ = FindTensor(); \
+  }                         \
+  auto *tensor = static_cast<framework::LoDTensor *>(tensor_);
+
 template <typename T>
 T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
-  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  EAGER_GET_TENSOR;
   switch (static_cast<int>(place)) {
     case static_cast<int>(PaddlePlace::kCPU): {
       return tensor->mutable_data<T>(platform::CPUPlace());
@@ -52,7 +58,7 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
 template <typename T>
 T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
-  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  EAGER_GET_TENSOR;
   auto *res = tensor->data<T>();

   if (platform::is_cpu_place(tensor->place())) {
@@ -87,13 +93,13 @@ void *ZeroCopyTensor::FindTensor() const {
 }

 std::vector<int64_t> ZeroCopyTensor::shape() const {
-  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
-  PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_);
+  EAGER_GET_TENSOR;
+  PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
   return framework::vectorize(tensor->dims());
 }

 void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
-  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  EAGER_GET_TENSOR;
   framework::LoD lod;
   for (auto &level : x) {
     lod.emplace_back(level);
@@ -102,8 +108,8 @@ void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
 }

 std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
+  EAGER_GET_TENSOR;
   std::vector<std::vector<size_t>> res;
-  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
   for (auto &level : tensor->lod()) {
     res.emplace_back(level);
   }
paddle/fluid/inference/api/paddle_api.h
@@ -146,6 +146,9 @@ class ZeroCopyTensor {
   bool input_or_output_;
   friend class AnalysisPredictor;
   void *scope_{nullptr};
+  // The corresponding tensor pointer inside Paddle workspace is cached for
+  // performance.
+  mutable void *tensor_{nullptr};
 };

 /** A simple Inference API for Paddle.
@@ -167,18 +170,40 @@ class PaddlePredictor {
                    std::vector<PaddleTensor>* output_data,
                    int batch_size = -1) = 0;

-  /** Zero copy input and output optimization.
-   * Get the input or output tensors, and operate on their memory directly,
-   * without copy.
+  /** \brief Get a mutable tensor directly.
+   *
+   * NOTE Only works in AnalysisPredictor.
+   *
+   * One can also use this to modify any temporary variable related tensors in
+   * the predictor.
+   *
    */
   virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
       const std::string& name) {
     return nullptr;
   }
+
+  /**
+   * \brief Get an immutable tensor without copy.
+   *
+   * NOTE Only works in AnalysisPredictor.
+   * One can use this API to get any temporary tensors in the predictor and
+   * read it.
+   */
   virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
       const std::string& name) {
     return nullptr;
   }
+  /**
+   * \brief Run the predictor with zero-copied inputs and outputs.
+   *
+   * NOTE Only works in AnalysisPredictor.
+   *
+   * This will save the IO copy for transfering inputs and outputs to predictor
+   * workspace and get some performance improvement.
+   * To use it, one should call the `AnalysisConfig.SwitchUseFeedFetchOp(true)`
+   * and then use the `GetInputTensor` and `GetOutputTensor` to directly write
+   * or read the input/output tensors.
+   */
   virtual bool ZeroCopyRun() { return false; }

   /** Clone a predictor that share the model weights, the Cloned predictor
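The documentation added above describes the zero-copy workflow but does not show it end to end. The following is a minimal C++ sketch of that workflow, not part of the patch: the model path, the tensor names "x" and "out", the input shape, and the exact AnalysisConfig switches (in particular passing false to SwitchUseFeedFetchOp) are illustrative assumptions; only Reshape, mutable_data, data, GetInputTensor, GetOutputTensor, and ZeroCopyRun come from the headers changed in this commit.

#include <algorithm>
#include <string>
#include <vector>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Hypothetical driver: push one CPU float tensor through the zero-copy path.
void RunZeroCopy(const std::string &model_dir, const std::vector<float> &buf) {
  paddle::contrib::AnalysisConfig config;
  config.SetModel(model_dir);
  config.DisableGpu();
  config.SwitchSpecifyInputNames(true);
  // Assumption: bypass the feed/fetch ops so GetInputTensor / GetOutputTensor
  // address the graph variables directly.
  config.SwitchUseFeedFetchOp(false);

  auto predictor =
      paddle::CreatePaddlePredictor<paddle::contrib::AnalysisConfig>(config);

  auto input = predictor->GetInputTensor("x");  // "x" is a placeholder name
  input->Reshape({1, static_cast<int>(buf.size())});
  float *in_data = input->mutable_data<float>(paddle::PaddlePlace::kCPU);
  std::copy(buf.begin(), buf.end(), in_data);

  predictor->ZeroCopyRun();  // run without the feed/fetch IO copies

  auto output = predictor->GetOutputTensor("out");  // placeholder name
  paddle::PaddlePlace place;
  int size = 0;
  const float *out_data = output->data<float>(&place, &size);
  // out_data now points at `size` floats inside the predictor workspace.
  (void)out_data;
}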
paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -84,7 +84,12 @@ inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_te
 # MM DNN
 set(MM_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mm_dnn")
 download_model_and_data(${MM_DNN_INSTALL_DIR} "MM_DNN_model.tar.gz" "MM_DNN_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc)
+inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc SERIAL)
+
+# Pyramid DNN
+set(PYRAMID_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/pyramid_dnn")
+download_model_and_data(${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" "PyramidDNN_data.txt.tar.gz")
+inference_analysis_api_test(test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} analyzer_pyramid_dnn_tester.cc)

 # text_classification
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc (new file, 0 → 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/tests/api/tester_helper.h"

namespace paddle {
namespace inference {

using contrib::AnalysisConfig;

struct DataRecord {
  std::vector<std::vector<int64_t>> query_basic, query_phrase, title_basic,
      title_phrase;
  std::vector<size_t> lod1, lod2, lod3, lod4;
  size_t batch_iter{0}, batch_size{1}, num_samples;  // total number of samples
  DataRecord() = default;
  explicit DataRecord(const std::string &path, int batch_size = 1)
      : batch_size(batch_size) {
    Load(path);
  }
  DataRecord NextBatch() {
    DataRecord data;
    size_t batch_end = batch_iter + batch_size;
    // NOTE skip the final batch, if no enough data is provided.
    if (batch_end <= query_basic.size()) {
      GetInputPerBatch(query_basic, &data.query_basic, &data.lod1, batch_iter,
                       batch_end);
      GetInputPerBatch(query_phrase, &data.query_phrase, &data.lod2, batch_iter,
                       batch_end);
      GetInputPerBatch(title_basic, &data.title_basic, &data.lod3, batch_iter,
                       batch_end);
      GetInputPerBatch(title_phrase, &data.title_phrase, &data.lod4, batch_iter,
                       batch_end);
    }
    batch_iter += batch_size;
    return data;
  }
  void Load(const std::string &path) {
    std::ifstream file(path);
    std::string line;
    int num_lines = 0;
    while (std::getline(file, line)) {
      std::vector<std::string> data;
      split(line, ';', &data);
      // load query data
      std::vector<int64_t> query_basic_data;
      split_to_int64(data[1], ' ', &query_basic_data);
      std::vector<int64_t> query_phrase_data;
      split_to_int64(data[2], ' ', &query_phrase_data);
      // load title data
      std::vector<int64_t> title_basic_data;
      split_to_int64(data[3], ' ', &title_basic_data);
      std::vector<int64_t> title_phrase_data;
      split_to_int64(data[4], ' ', &title_phrase_data);
      // filter the empty data
      bool flag =
          data[1].size() && data[2].size() && data[3].size() && data[4].size();
      if (flag) {
        query_basic.push_back(std::move(query_basic_data));
        query_phrase.push_back(std::move(query_phrase_data));
        title_basic.push_back(std::move(title_basic_data));
        title_phrase.push_back(std::move(title_phrase_data));
        num_lines++;
      }
    }
    num_samples = num_lines;
  }
};

void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
                   int batch_size) {
  PaddleTensor query_basic_tensor, query_phrase_tensor, title_basic_tensor,
      title_phrase_tensor;
  query_basic_tensor.name = "query_basic";
  query_phrase_tensor.name = "query_phrase";
  title_basic_tensor.name = "pos_title_basic";
  title_phrase_tensor.name = "pos_title_phrase";
  auto one_batch = data->NextBatch();
  // assign data
  TensorAssignData<int64_t>(&query_basic_tensor, one_batch.query_basic,
                            one_batch.lod1);
  TensorAssignData<int64_t>(&query_phrase_tensor, one_batch.query_phrase,
                            one_batch.lod2);
  TensorAssignData<int64_t>(&title_basic_tensor, one_batch.title_basic,
                            one_batch.lod3);
  TensorAssignData<int64_t>(&title_phrase_tensor, one_batch.title_phrase,
                            one_batch.lod4);
  // Set inputs.
  input_slots->assign({query_basic_tensor, query_phrase_tensor,
                       title_basic_tensor, title_phrase_tensor});
  for (auto &tensor : *input_slots) {
    tensor.dtype = PaddleDType::INT64;
  }
}

void SetConfig(contrib::AnalysisConfig *cfg) {
  cfg->SetModel(FLAGS_infer_model);
  cfg->DisableGpu();
  cfg->SwitchSpecifyInputNames();
  cfg->SwitchIrOptim();
}

void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
  std::vector<PaddleTensor> input_slots;
  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
  for (int bid = 0; bid < epoch; ++bid) {
    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
    (*inputs).emplace_back(input_slots);
  }
}

// Easy for profiling independently.
TEST(Analyzer_Pyramid_DNN, profile) {
  contrib::AnalysisConfig cfg;
  SetConfig(&cfg);
  std::vector<PaddleTensor> outputs;

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                 input_slots_all, &outputs, FLAGS_num_threads);

  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
    size_t size = GetSize(outputs[0]);
    PADDLE_ENFORCE_GT(size, 0);
    float *result = static_cast<float *>(outputs[0].data.data());
    // output is probability, which is in (0, 1).
    for (size_t i = 0; i < size; i++) {
      EXPECT_GT(result[i], 0);
      EXPECT_LT(result[i], 1);
    }
  }
}

// Check the fuse status
TEST(Analyzer_Pyramid_DNN, fuse_statis) {
  contrib::AnalysisConfig cfg;
  SetConfig(&cfg);

  int num_ops;
  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
  auto fuse_statis = GetFuseStatis(
      static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
}

// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_Pyramid_DNN, compare) {
  contrib::AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareNativeAndAnalysis(
      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}

// Compare Deterministic result
TEST(Analyzer_Pyramid_DNN, compare_determine) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                       input_slots_all);
}

}  // namespace inference
}  // namespace paddle
paddle/fluid/operators/CMakeLists.txt
@@ -65,7 +65,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)
     set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
paddle/fluid/operators/controlflow/while_op.cc
@@ -58,7 +58,6 @@ class WhileOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope &scope,
                const platform::Place &dev_place) const override {
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
-
     auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
     PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
@@ -73,27 +72,18 @@ class WhileOp : public framework::OperatorBase {
     PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
                    "Condition of while op must in CPU memory.");
-    bool is_test = Attr<bool>("is_test");
     auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
     VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);

+    bool is_test = Attr<bool>("is_test");
     auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
-    if (!is_test) {
-      while (cond.data<bool>()[0]) {
-        auto &current_scope = scope.NewScope();
-        step_scopes->push_back(&current_scope);
-        executor.RunPreparedContext(ctx.get(), &current_scope, false, true,
-                                    true);
-      }
-    } else {
-      auto &current_scope = scope.NewScope();
-      executor.CreateVariables(*program, &current_scope, block->ID());
-      while (cond.data<bool>()[0]) {
-        executor.RunPreparedContext(ctx.get(), &current_scope, false, false,
-                                    false);
-      }
-      scope.DeleteScope(&current_scope);
+    while (cond.data<bool>()[0]) {
+      auto &current_scope = scope.NewScope();
+      step_scopes->push_back(&current_scope);
+      executor.RunPreparedContext(ctx.get(), &current_scope, false, true,
+                                  true);
+      if (is_test) {
+        scope.DeleteScope(&current_scope);
+      }
     }
   }
 };
paddle/fluid/operators/distributed/CMakeLists.txt
@@ -7,7 +7,7 @@ if(WITH_GRPC)
 else()
   set(cc_generic_services "true")
 endif()
-configure_file(send_recv.proto.in ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto @ONLY)
+configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)

 # FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
@@ -19,8 +19,8 @@ if(WITH_GRPC)
       variable_response.cc
       collective_client.cc collective_server.cc
       ${GRPC_SRCS}
-      PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto
-      DEPS lod_tensor selected_rows_functor memory ${GRPC_DEPS})
+      PROTO send_recv.proto
+      DEPS lod_tensor selected_rows_functor memory)

   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS})
paddle/fluid/operators/gru_op.cc
@@ -137,6 +137,10 @@ class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, defalut: False) "
                   "whether to compute reversed GRU.")
         .SetDefault(false);
+    AddAttr<bool>("origin_mode",
+                  "bool"
+                  "use origin mode in article https://arxiv.org/abs/1412.3555")
+        .SetDefault(false);
     AddComment(R"DOC(
 GRU Operator implements part calculations of the complete GRU as following:
@@ -221,6 +225,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
  public:
   void BatchCompute(const framework::ExecutionContext& context) const {
     using DeviceContext = paddle::platform::CPUDeviceContext;
+    bool origin_mode = context.Attr<bool>("origin_mode");
     auto* input = context.Input<LoDTensor>("Input");
     auto* h0 = context.Input<Tensor>("H0");
     auto* weight = context.Input<Tensor>("Weight");
@@ -327,7 +332,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
         math::detail::forward_final_output(
             math::detail::forward::gru_finalOutput<T>(), gru_value, frame_size,
-            cur_batch_size, active_node);
+            cur_batch_size, active_node, origin_mode);

         gru_value.prev_out_value = gru_value.output_value;
       }
@@ -351,7 +356,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
         math::GRUUnitFunctor<DeviceContext, T>::compute(
             dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-            active_gate);
+            active_gate, origin_mode);

         gru_value.prev_out_value = gru_value.output_value;
       }
paddle/fluid/operators/gru_op.cu.cc
@@ -21,6 +21,7 @@ template <typename DeviceContext, typename T>
 class GRUKernel : public framework::OpKernel<T> {
  public:
   void BatchCompute(const framework::ExecutionContext& context) const {
+    bool origin_mode = context.Attr<bool>("origin_mode");
     auto* input = context.Input<LoDTensor>("Input");
     auto* h0 = context.Input<Tensor>("H0");
     auto* weight = context.Input<Tensor>("Weight");
@@ -87,7 +88,7 @@ class GRUKernel : public framework::OpKernel<T> {
       gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
       math::GRUUnitFunctor<DeviceContext, T>::compute(
           dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-          active_gate);
+          active_gate, origin_mode);
       gru_value.prev_out_value = gru_value.output_value;
     }
paddle/fluid/operators/gru_op.h
@@ -41,6 +41,7 @@ template <typename DeviceContext, typename T>
 class GRUGradKernel : public framework::OpKernel<T> {
  public:
   void BatchCompute(const framework::ExecutionContext& context) const {
+    bool origin_mode = context.Attr<bool>("origin_mode");
     auto* h0 = context.Input<Tensor>("H0");
     auto* weight = context.Input<Tensor>("Weight");
     const T* weight_data = weight->data<T>();
@@ -146,7 +147,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
       math::GRUUnitGradFunctor<DeviceContext, T>::compute(
           dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size,
-          active_node, active_gate);
+          active_node, active_gate, origin_mode);
     }
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
paddle/fluid/operators/gru_unit_op.cc
@@ -111,6 +111,13 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
                  "The activation type used in update gate and reset gate.")
         .SetDefault(sigmoid)
         .InEnum({identity, sigmoid, tanh, relu});
+    AddAttr<bool>("origin_mode",
+                  "bool"
+                  "use origin mode in article <Learning Phrase Representations "
+                  "using RNN Encoder–Decoder\n"
+                  "for Statistical Machine "
+                  "Translation>(https://arxiv.org/pdf/1406.1078.pdf)")
+        .SetDefault(false);
     AddComment(R"DOC(
 GRUUnit Operator implements partial calculations of the GRU unit as following:
paddle/fluid/operators/gru_unit_op.h
@@ -113,7 +113,11 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     auto c = g.slice(c_offsets, extents);  // output candidate

     // calculate final output
-    h.device(place) = u * (c - h_p) + h_p;
+    if (context.Attr<bool>("origin_mode")) {
+      h.device(place) = c + u * (h_p - c);  // (1 - u) * c + u * h_p
+    } else {
+      h.device(place) = u * (c - h_p) + h_p;  // u * c + (1 - u) * h_p
+    }
   }
 };
@@ -180,11 +184,19 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     auto c = g.slice(c_offsets, extents);  // output candidate

     // backward for unactivated update gate
-    ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
-                   d_g.slice(u_offsets, extents), d_h * (c - h_p));
-    // backward for unactivated output candidate
-    ActGradCompute(context.Attr<int>("activation"), place, c, c,
-                   d_g.slice(c_offsets, extents), d_h * u);
+    if (context.Attr<bool>("origin_mode")) {
+      ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
+                     d_g.slice(u_offsets, extents), d_h * (h_p - c));
+      // backward for unactivated output candidate
+      ActGradCompute(context.Attr<int>("activation"), place, c, c,
+                     d_g.slice(c_offsets, extents), d_h * (1 - u));
+    } else {
+      ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
+                     d_g.slice(u_offsets, extents), d_h * (c - h_p));
+      // backward for unactivated output candidate
+      ActGradCompute(context.Attr<int>("activation"), place, c, c,
+                     d_g.slice(c_offsets, extents), d_h * u);
+    }
     // backward for reset_hidden_prev
     auto blas = math::GetBlas<DeviceContext, T>(context);
     blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
@@ -213,7 +225,11 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
       T* hidden_prev_grad_data =
           hidden_prev_grad->mutable_data<T>(context.GetPlace());
       auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
-      d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
+      if (context.Attr<bool>("origin_mode")) {
+        d_h_p.device(place) = d_r_h_p * r + d_h * u;
+      } else {
+        d_h_p.device(place) = d_r_h_p * r + d_h * (1 - u);
+      }
       blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
                 gate_grad_data, frame_size * 3, weight_data, frame_size * 2, 1,
                 hidden_prev_grad_data, frame_size);
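The branches added to GRUUnitKernel above switch between two conventions for combining the update gate u, the candidate state c, and the previous hidden state h_{t-1}. The following is a short LaTeX sketch of the two forward formulas as read from the inline comments in the hunk; it is a summary for clarity, not part of the patch:

% default (origin_mode = false):  h = u * (c - h_prev) + h_prev
h_t = u_t \odot c_t + (1 - u_t) \odot h_{t-1}

% origin_mode = true (Cho et al., https://arxiv.org/pdf/1406.1078.pdf):
% h = c + u * (h_prev - c)
h_t = (1 - u_t) \odot c_t + u_t \odot h_{t-1}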
paddle/fluid/operators/math/CMakeLists.txt
@@ -60,6 +60,7 @@ math_library(matrix_bit_code)
 math_library(unpooling)
 math_library(vol2col)
 math_library(prelu)
+math_library(tree2col DEPS math_function)

 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
 cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
paddle/fluid/operators/math/detail/gru_cpu_kernel.h
浏览文件 @
54f9d44e
...
...
@@ -56,7 +56,8 @@ template <class OpFinalOutput, typename T>
void
hl_naive_gru_forward_final_output
(
OpFinalOutput
op_final_output
,
T
*
gate_value
,
T
*
prev_output_value
,
T
*
output_value
,
int
frame_size
,
ActivationType
active_node
)
{
ActivationType
active_node
,
bool
origin_mode
)
{
T
r_value_update_gate
;
T
r_value_frame_state
;
T
r_prev_out
=
0
;
...
...
@@ -72,7 +73,7 @@ void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
    }
    op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
                    &r_output, active_node);
                    &r_output, active_node, origin_mode);
    frame_state[i] = r_value_frame_state;
    output_value[i] = r_output;
...
...
@@ -146,7 +147,8 @@ template <class OpFinalOutput, typename T>
void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
                                     T *gate_value, T *prev_output_value,
                                     T *output_value, int frame_size,
                                     ActivationType active_node) {
                                     ActivationType active_node,
                                     bool origin_mode) {
#ifdef __AVX__
  __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f);
  __m256 r_value_frame_state, r_value_frame_state_last = _mm256_set1_ps(0.0f);
...
...
@@ -180,7 +182,7 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
    }
    op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
                    &r_output, active_node);
                    &r_output, active_node, origin_mode);
    _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
                     r_value_frame_state);
...
...
@@ -190,7 +192,7 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
  if (rest > 0) {
    i = n - block;
    op_final_output(&r_value_update_gate_last, &r_value_frame_state_last,
                    &r_prev_out_last, &r_output, active_node);
                    &r_prev_out_last, &r_output, active_node, origin_mode);
    _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
                     r_value_frame_state_last);
...
...
@@ -227,17 +229,18 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
template <class OpFinalOutput, typename T>
inline void forward_final_output(OpFinalOutput op_final_output,
                                 GRUMetaValue<T> value, int frame_size,
                                 int batch_size, ActivationType active_node) {
                                 int batch_size, ActivationType active_node,
                                 bool origin_mode) {
  for (int b = 0; b < batch_size; b++) {
    if (OpFinalOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
        (sizeof(T) == 4)) {
      hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
                                      value.prev_out_value, value.output_value,
                                      frame_size, active_node);
                                      frame_size, active_node, origin_mode);
    } else {
      hl_naive_gru_forward_final_output(op_final_output, value.gate_value,
                                        value.prev_out_value,
                                        value.output_value, frame_size,
                                        active_node);
                                        value.output_value, frame_size,
                                        active_node, origin_mode);
    }
    value.gate_value += frame_size * 3;
...
...
@@ -253,7 +256,8 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
                                      T *gate_grad, T *prev_out_value,
                                      T *prev_out_grad, T *output_grad,
                                      int frame_size,
                                      ActivationType active_node) {
                                      ActivationType active_node,
                                      bool origin_mode) {
  T r_update_gate_value;
  T r_update_gate_grad;
  T r_frame_state_value;
...
...
@@ -279,7 +283,7 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
    op_state_grad(&r_update_gate_value, &r_update_gate_grad,
                  &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
                  &r_prev_out_grad, &r_out_grad, active_node);
                  &r_prev_out_grad, &r_out_grad, active_node, origin_mode);
    update_gate_grad[i] = r_update_gate_grad;
    frame_state_grad[i] = r_frame_state_grad;
...
...
@@ -338,8 +342,8 @@ template <class OpStateGrad, typename T>
void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
                                    T *gate_grad, T *prev_out_value,
                                    T *prev_out_grad, T *output_grad,
                                    int frame_size, ActivationType active_node) {
                                    int frame_size, ActivationType active_node,
                                    bool origin_mode) {
#ifdef __AVX__
  __m256 r_update_gate_value;
  __m256 r_update_gate_grad;
...
...
@@ -368,7 +372,7 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
    op_state_grad(&r_update_gate_value, &r_update_gate_grad,
                  &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
                  &r_prev_out_grad, &r_out_grad, active_node);
                  &r_prev_out_grad, &r_out_grad, active_node, origin_mode);
    update_gate_grad[i] = r_update_gate_grad;
    frame_state_grad[i] = r_frame_state_grad;
...
...
@@ -431,16 +435,18 @@ template <class OpStateGrad, typename T>
inline void backward_state_grad(OpStateGrad op_state_grad,
                                GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                                int frame_size, int batch_size,
                                ActivationType active_node) {
                                ActivationType active_node, bool origin_mode) {
  for (int b = 0; b < batch_size; b++) {
    if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
      hl_avx_gru_backward_state_grad(
          op_state_grad, value.gate_value, grad.gate_grad,
          value.prev_out_value, grad.prev_out_grad, grad.output_grad,
          frame_size, active_node);
      hl_avx_gru_backward_state_grad(op_state_grad, value.gate_value,
                                     grad.gate_grad, value.prev_out_value,
                                     grad.prev_out_grad, grad.output_grad,
                                     frame_size, active_node, origin_mode);
    } else {
      hl_naive_gru_backward_state_grad(
          op_state_grad, value.gate_value, grad.gate_grad,
          value.prev_out_value, grad.prev_out_grad, grad.output_grad,
          frame_size, active_node);
      hl_naive_gru_backward_state_grad(op_state_grad, value.gate_value,
                                       grad.gate_grad, value.prev_out_value,
                                       grad.prev_out_grad, grad.output_grad,
                                       frame_size, active_node, origin_mode);
    }
    value.gate_value += frame_size * 3;
...
...
paddle/fluid/operators/math/detail/gru_gpu_kernel.h
...
...
@@ -72,7 +72,8 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
                                        T *gate_value, T *prev_output_value,
                                        T *output_value, int frame_size,
                                        int batch_size,
                                        ActivationType active_node) {
                                        ActivationType active_node,
                                        bool origin_mode) {
  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frame_idx >= frame_size) return;

  int batch_idx = 0;
...
@@ -94,7 +95,7 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
  }
  op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
                  &r_output, active_node);
                  &r_output, active_node, origin_mode);
  gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
  output_value[frame_idx] = r_output;
...
...
@@ -109,7 +110,8 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
                                       T *gate_grad, T *prev_out_value,
                                       T *prev_out_grad, T *output_grad,
                                       int frame_size, int batch_size,
                                       ActivationType active_node) {
                                       ActivationType active_node,
                                       bool origin_mode) {
  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frame_idx >= frame_size) return;
  int batch_idx = 0;
...
...
@@ -139,7 +141,7 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
    op_state_grad(&r_update_gate_value, &r_update_gate_grad,
                  &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
                  &r_prev_out_grad, &r_out_grad, active_node);
                  &r_out_grad, active_node, origin_mode);
    gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
    gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad;
...
...
paddle/fluid/operators/math/detail/gru_kernel.h
...
...
@@ -57,10 +57,16 @@ class gru_finalOutput {
 public:
  HOSTDEVICE void operator()(T *value_update_gate, T *value_frame_state,
                             T *prev_out, T *value_output,
                             ActivationType act_input) {
                             ActivationType act_input, bool origin_mode) {
    *value_frame_state = activation(*value_frame_state, act_input);
    *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
                    ((*value_update_gate) * (*value_frame_state));
    if (origin_mode) {
      *value_output = ((*value_update_gate) * (*prev_out)) +
                      *value_frame_state -
                      ((*value_update_gate) * (*value_frame_state));
    } else {
      *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
                      ((*value_update_gate) * (*value_frame_state));
    }
  }
#ifndef __NVCC__
#ifndef __AVX__
...
...
@@ -69,11 +75,20 @@ class gru_finalOutput {
  static const bool avx = true;
  HOSTDEVICE void operator()(__m256 *value_update_gate,
                             __m256 *value_frame_state, __m256 *prev_out,
                             __m256 *value_output, ActivationType act_input) {
                             __m256 *value_output, ActivationType act_input,
                             bool origin_mode) {
    *value_frame_state = activation(*value_frame_state, act_input);
    *value_output = _mm256_add_ps(
        _mm256_sub_ps(*prev_out, _mm256_mul_ps(*value_update_gate, *prev_out)),
        _mm256_mul_ps(*value_update_gate, *value_frame_state));
    if (origin_mode) {
      *value_output = _mm256_sub_ps(
          _mm256_add_ps(_mm256_mul_ps(*value_update_gate, *prev_out),
                        *value_frame_state),
          _mm256_mul_ps(*value_update_gate, *value_frame_state));
    } else {
      *value_output = _mm256_add_ps(
          _mm256_sub_ps(*prev_out,
                        _mm256_mul_ps(*value_update_gate, *prev_out)),
          _mm256_mul_ps(*value_update_gate, *value_frame_state));
    }
  }
#endif
#endif
...
...
@@ -88,13 +103,23 @@ class gru_stateGrad {
  HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate,
                             T *value_frame_state, T *grad_frame_state,
                             T *value_prev_out, T *grad_prev_out,
                             T *grad_output, ActivationType act_input) {
    *grad_update_gate = (*grad_output * (*value_frame_state));
    *grad_update_gate -= (*grad_output * (*value_prev_out));
    *grad_prev_out -= (*grad_output * (*value_update_gate));
    *grad_prev_out += *grad_output;
    *grad_frame_state = activation(*grad_output * (*value_update_gate),
                                   *value_frame_state, act_input);
                             T *grad_output, ActivationType act_input,
                             bool origin_mode) {
    if (origin_mode) {
      *grad_update_gate =
          (*grad_output) * ((*value_prev_out) - (*value_frame_state));
      *grad_prev_out += (*grad_output * (*value_update_gate));
      *grad_frame_state = activation(
          *grad_output * (static_cast<T>(1.0) - (*value_update_gate)),
          *value_frame_state, act_input);
    } else {
      *grad_update_gate =
          (*grad_output) * ((*value_frame_state) - (*value_prev_out));
      *grad_prev_out +=
          (*grad_output * (static_cast<T>(1.0) - *value_update_gate));
      *grad_frame_state = activation(*grad_output * (*value_update_gate),
                                     *value_frame_state, act_input);
    }
  }
#ifndef __NVCC__
#ifndef __AVX__
...
...
@@ -106,17 +131,27 @@ class gru_stateGrad {
                             __m256 *value_frame_state,
                             __m256 *grad_frame_state, __m256 *value_prev_out,
                             __m256 *grad_prev_out, __m256 *grad_output,
                             ActivationType act_input) {
    *grad_update_gate = _mm256_mul_ps(*grad_output, *value_frame_state);
    *grad_update_gate = _mm256_sub_ps(
        *grad_update_gate, _mm256_mul_ps(*grad_output, *value_prev_out));
    *grad_prev_out = _mm256_add_ps(
        _mm256_sub_ps(*grad_prev_out,
                      _mm256_mul_ps(*grad_output, *value_update_gate)),
        *grad_output);
    *grad_frame_state = activation(
        _mm256_mul_ps(*grad_output, *value_update_gate), *value_frame_state,
        act_input);
                             ActivationType act_input, bool origin_mode) {
    if (origin_mode) {
      *grad_update_gate = _mm256_mul_ps(
          *grad_output, _mm256_sub_ps(*value_prev_out, *value_frame_state));
      *grad_prev_out = _mm256_add_ps(
          *grad_prev_out, _mm256_mul_ps(*grad_output, *value_update_gate));
      *grad_frame_state = activation(
          _mm256_mul_ps(*grad_output, _mm256_sub_ps(_mm256_set1_ps(1.0f),
                                                    *value_update_gate)),
          *value_frame_state, act_input);
    } else {
      *grad_update_gate = _mm256_mul_ps(
          *grad_output, _mm256_sub_ps(*value_frame_state, *value_prev_out));
      *grad_prev_out = _mm256_add_ps(
          *grad_prev_out,
          _mm256_mul_ps(*grad_output, _mm256_sub_ps(_mm256_set1_ps(1.0f),
                                                    *value_update_gate)));
      *grad_frame_state = activation(
          _mm256_mul_ps(*grad_output, *value_update_gate), *value_frame_state,
          act_input);
    }
  }
#endif
#endif
...
...
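A small NumPy sketch (not part of the patch) that numerically checks the gradient factors used by gru_stateGrad above; all names are illustrative:

import numpy as np

def final_output(u, c, h_prev, origin_mode):
    # matches gru_finalOutput: origin_mode -> u*h_prev + (1-u)*c, default -> u*c + (1-u)*h_prev
    return u * h_prev + (1 - u) * c if origin_mode else u * c + (1 - u) * h_prev

u, c, h_prev, eps, d_out = 0.3, 0.8, -0.5, 1e-6, 1.0
for origin_mode in (True, False):
    if origin_mode:
        d_u = d_out * (h_prev - c)   # grad_update_gate in the origin_mode branch
    else:
        d_u = d_out * (c - h_prev)   # grad_update_gate in the default branch
    num_d_u = (final_output(u + eps, c, h_prev, origin_mode) -
               final_output(u - eps, c, h_prev, origin_mode)) / (2 * eps)
    assert abs(num_d_u - d_u) < 1e-5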
paddle/fluid/operators/math/gru_compute.cc
...
...
@@ -23,7 +23,8 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
  static void compute(const platform::CPUDeviceContext &context,
                      GRUMetaValue<T> value, int frame_size, int batch_size,
                      const detail::ActivationType active_node,
                      const detail::ActivationType active_gate) {
                      const detail::ActivationType active_gate,
                      bool origin_mode) {
#ifndef __NVCC__
    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
    if (value.prev_out_value) {
...
...
@@ -43,7 +44,8 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
    }
    detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
                                 frame_size, batch_size, active_node);
                                 frame_size, batch_size, active_node,
                                 origin_mode);
#endif
  }
};
...
...
@@ -54,10 +56,12 @@ struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
                      GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                      int frame_size, int batch_size,
                      const detail::ActivationType active_node,
                      const detail::ActivationType active_gate) {
                      const detail::ActivationType active_gate,
                      bool origin_mode) {
#ifndef __NVCC__
    detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
                                grad, frame_size, batch_size, active_node);
                                grad, frame_size, batch_size, active_node,
                                origin_mode);
    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
    if (value.prev_out_value && grad.prev_out_grad) {
      blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
...
...
paddle/fluid/operators/math/gru_compute.cu
...
...
@@ -24,7 +24,8 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
  static void compute(const platform::CUDADeviceContext &context,
                      GRUMetaValue<T> value, int frame_size, int batch_size,
                      const detail::ActivationType active_node,
                      const detail::ActivationType active_gate) {
                      const detail::ActivationType active_gate,
                      bool origin_mode) {
    auto stream = context.stream();
    dim3 threads;
    dim3 grid;
...
...
@@ -73,14 +74,14 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
          T><<<grid, threads, 0, stream>>>(
          detail::forward::gru_finalOutput<T>(), value.gate_value,
          value.prev_out_value, value.output_value, frame_size, batch_size,
          active_node);
          active_node, origin_mode);
    } else {
      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
                                      /* is_batch= */ true,
                                      T><<<grid, threads, 0, stream>>>(
          detail::forward::gru_finalOutput<T>(), value.gate_value,
          value.prev_out_value, value.output_value, frame_size, batch_size,
          active_node);
          active_node, origin_mode);
    }
  }
};
...
...
@@ -91,7 +92,8 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
                      GRUMetaValue<T> value, GRUMetaGrad<T> grad,
                      int frame_size, int batch_size,
                      const detail::ActivationType active_node,
                      const detail::ActivationType active_gate) {
                      const detail::ActivationType active_gate,
                      bool origin_mode) {
    auto stream = context.stream();
    dim3 threads;
    dim3 grid;
...
...
@@ -111,14 +113,14 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
                                     /* is_batch= */ false><<<grid, threads, 0, stream>>>(
          detail::backward::gru_stateGrad<T>(), value.gate_value,
          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
          grad.output_grad, frame_size, batch_size, active_node);
          grad.output_grad, frame_size, batch_size, active_node, origin_mode);
    } else {
      detail::KeGruBackwardStateGrad<detail::backward::gru_stateGrad<T>,
                                     /* is_batch= */ true><<<grid, threads, 0, stream>>>(
          detail::backward::gru_stateGrad<T>(), value.gate_value,
          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
          grad.output_grad, frame_size, batch_size, active_node);
          grad.output_grad, frame_size, batch_size, active_node, origin_mode);
    }
    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
...
...
paddle/fluid/operators/math/gru_compute.h
...
...
@@ -44,7 +44,8 @@ struct GRUUnitFunctor {
  static void compute(const DeviceContext &context, GRUMetaValue<T> value,
                      int frame_size, int batch_size,
                      const detail::ActivationType active_node,
                      const detail::ActivationType active_gate);
                      const detail::ActivationType active_gate,
                      bool origin_mode);
};

template <typename DeviceContext, typename T>
...
...
@@ -52,7 +53,8 @@ struct GRUUnitGradFunctor {
  static void compute(const DeviceContext &context, GRUMetaValue<T> value,
                      GRUMetaGrad<T> grad, int frame_size, int batch_size,
                      const detail::ActivationType active_node,
                      const detail::ActivationType active_gate);
                      const detail::ActivationType active_gate,
                      bool origin_mode);
};
}  // namespace math
...
...
paddle/fluid/operators/math/tree2col.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/math/tree2col.h"
#include <deque>
#include <stack>
namespace paddle {
namespace operators {
namespace math {
using Tensor = framework::Tensor;
std::vector<TreeNode> Tree2ColUtil::construct_patch(
    size_t root, int max_depth, const std::vector<std::vector<int>> &tr) {
  std::stack<TreeNode, std::deque<TreeNode>> stack;
  std::unordered_map<int, bool> visited;
  std::vector<TreeNode> patch;

  stack.push(TreeNode(root, 1, 1, 0));
  patch.emplace_back(TreeNode(root, 1, 1, 0));
  visited[root] = true;

  while (!stack.empty()) {
    TreeNode &u = stack.top();
    bool end = true;
    size_t node = u.get_node(), sz = tr[node].size();
    visited[node] = true;
    for (size_t i = 0; i < sz; i++) {
      size_t v = tr[node][i];
      if (!visited[v] && static_cast<int>(u.get_depth()) + 1 < max_depth) {
        visited[v] = true;
        stack.push(TreeNode(v, i, sz, u.get_depth() + 1));
        patch.push_back(TreeNode(v, i + 1, sz, u.get_depth() + 1));
        end = false;
      }
    }
    if (end) {
      stack.pop();
    }
  }
  return patch;
}
void Tree2ColUtil::construct_tree(const paddle::Tensor &EdgeSet,
                                  std::vector<std::vector<int>> *tr,
                                  size_t *node_count) {
  auto edge_set_dims = EdgeSet.dims();
  PADDLE_ENFORCE_EQ(edge_set_dims[1], 2);
  int64_t edge_count = EdgeSet.numel();

  const int *edge_data = EdgeSet.data<int>();
  for (int64_t i = 0; i < edge_count; i += 2) {
    int u = edge_data[i], v = edge_data[i + 1];
    if (u != 0 && v != 0) (*node_count)++;
  }
  (*node_count)++;

  tr->resize(static_cast<size_t>(*node_count + 1));

  for (int64_t i = 0; i < edge_count; i += 2) {
    int u = edge_data[i], v = edge_data[i + 1];
    if (u != 0 && v != 0) {
      tr->at(u).push_back(v);
    } else {
      break;
    }
  }
}
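A Python sketch of the same two helpers, assuming the edge tensor is a flat list of (u, v) pairs and that node id 0 marks padding; names are illustrative, not the patch's API:

def construct_tree(edge_pairs):
    # adjacency list keyed by parent node id
    tr, node_count = {}, 0
    for u, v in edge_pairs:
        if u == 0 or v == 0:
            break
        tr.setdefault(u, []).append(v)
        node_count += 1
    return tr, node_count + 1

def construct_patch(root, max_depth, tr):
    # DFS limited to max_depth levels; each entry is (node, index, sibling_count, depth)
    patch, stack, visited = [(root, 1, 1, 0)], [(root, 0)], {root}
    while stack:
        node, depth = stack.pop()
        children = tr.get(node, [])
        for i, v in enumerate(children):
            if v not in visited and depth + 1 < max_depth:
                visited.add(v)
                stack.append((v, depth + 1))
                patch.append((v, i + 1, len(children), depth + 1))
    return patch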
template <typename T>
class Tree2ColFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext &context,
                  const framework::Tensor &EdgeSet,
                  const framework::Tensor &node_features,
                  framework::Tensor *patch, int max_depth) {
    std::vector<std::vector<int>> tr;
    auto feature_dims = node_features.dims();
    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
    math::SetConstant<platform::CPUDeviceContext, T> constant;
    int64_t feature_size = feature_dims[1];
    size_t patch_elem_size = 3 * static_cast<size_t>(feature_size);
    size_t node_count = 0, patch_count = 0, patch_size;
    Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count);
    std::vector<std::vector<TreeNode>> processing_list;
    for (size_t u = 1; u <= node_count; u++) {
      std::vector<TreeNode> temp_patch =
          Tree2ColUtil::construct_patch(u, max_depth, tr);
      if (!temp_patch.empty()) {
        processing_list.emplace_back(temp_patch);
      }
    }
    patch_size = processing_list.size();

    T *patch_data =
        patch->mutable_data<T>({static_cast<int64_t>(patch_size),
                                static_cast<int64_t>(patch_elem_size)},
                               cpu_place);
    constant(context, patch, 0);
    const T *features = node_features.data<T>();

    for (auto &patch_item : processing_list) {
      size_t pointer_base = patch_count * patch_elem_size;
      for (auto &v : patch_item) {
        T eta_l = v.eta_l<T>(max_depth), eta_r = v.eta_r<T>(max_depth),
          eta_t = v.eta_t<T>(max_depth);
        size_t id = v.get_node() - 1;
        for (int i = 0; i < feature_size; i++) {
          patch_data[pointer_base + i * 3] +=
              eta_l * features[id * feature_size + i];
          patch_data[pointer_base + i * 3 + 1] +=
              eta_r * features[id * feature_size + i];
          patch_data[pointer_base + i * 3 + 2] +=
              eta_t * features[id * feature_size + i];
        }
      }
      patch_count++;
    }
    patch->Resize({static_cast<int64_t>(patch_count),
                   static_cast<int64_t>(patch_elem_size)});
  }
};
template <typename T>
class Col2TreeFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext &context,
                  const framework::Tensor &EdgeSet,
                  const framework::Tensor &out_grad,
                  framework::Tensor *in_grad, int max_depth) {
    std::vector<std::vector<int>> tr;
    auto output_dims = out_grad.dims();
    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
    math::SetConstant<platform::CPUDeviceContext, T> constant;
    int64_t output_size = output_dims[1];
    size_t grad_elem_size = 3 * static_cast<size_t>(output_size);
    size_t node_count = 0, grad_count = 0;
    Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count);
    std::vector<std::vector<TreeNode>> processing_list;
    std::vector<std::vector<TreeNode>> grad_list;
    grad_list.resize(node_count);
    for (size_t u = 1; u <= node_count; u++) {
      std::vector<TreeNode> tmp =
          Tree2ColUtil::construct_patch(u, max_depth, tr);
      if (!tmp.empty()) {
        processing_list.push_back(tmp);
      }
    }
    for (size_t patch_id = 0; patch_id < processing_list.size(); patch_id++) {
      for (auto v : processing_list[patch_id]) {
        grad_list[v.get_node() - 1].push_back(v.change_node(patch_id + 1));
      }
    }
    T *grad_data =
        in_grad->mutable_data<T>({static_cast<int64_t>(node_count),
                                  static_cast<int64_t>(grad_elem_size)},
                                 cpu_place);

    constant(context, in_grad, 0);
    const T *out_g = out_grad.data<T>();
    for (auto &patch_item : grad_list) {
      size_t pointer_base = grad_count * grad_elem_size;
      for (auto &v : patch_item) {
        T eta_l = v.eta_l<T>(max_depth), eta_r = v.eta_r<T>(max_depth),
          eta_t = v.eta_t<T>(max_depth);
        size_t id = v.get_node() - 1;
        for (int i = 0; i < output_size; i++) {
          grad_data[pointer_base + i * 3] +=
              eta_l * out_g[id * output_size + i];
          grad_data[pointer_base + i * 3 + 1] +=
              eta_r * out_g[id * output_size + i];
          grad_data[pointer_base + i * 3 + 2] +=
              eta_t * out_g[id * output_size + i];
        }
      }
      grad_count++;
    }
  }
};
template class Tree2ColFunctor<platform::CPUDeviceContext, float>;
template class Tree2ColFunctor<platform::CPUDeviceContext, double>;
template class Col2TreeFunctor<platform::CPUDeviceContext, float>;
template class Col2TreeFunctor<platform::CPUDeviceContext, double>;
}  // namespace math
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/math/tree2col.cu
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stack>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/tree2col.h"
namespace paddle {
namespace operators {
namespace math {
using Tensor = framework::Tensor;
using Node = paddle::operators::math::TreeNode;
template <typename T>
__global__ void tree2col(const T* eta, const int* node, const int* index,
                         const T* vectors, T* result, int feature_size,
                         int n) {
  const int thread_id =
      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  const int patch_id = thread_id / feature_size;
  const int j = thread_id % feature_size;
  if (patch_id < n) {
    const int begin_o = patch_id * 3 * feature_size;
    const int begin = index[patch_id * 2], end = index[patch_id * 2 + 1];
    T res_l = 0, res_r = 0, res_t = 0;
    for (int i = begin; i < end; i++) {
      const int id = node[i];
      const T vec = vectors[id * feature_size + j];
      res_l += eta[i * 3] * vec;
      res_r += eta[i * 3 + 1] * vec;
      res_t += eta[i * 3 + 2] * vec;
    }
    result[begin_o + j * 3] = res_l;
    result[begin_o + j * 3 + 1] = res_r;
    result[begin_o + j * 3 + 2] = res_t;
  }
}
template <typename T>
class Tree2ColFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const paddle::platform::CUDADeviceContext& context,
                  const framework::Tensor& EdgeSet,
                  const framework::Tensor& node_features,
                  framework::Tensor* patch, int max_depth) {
    std::vector<std::vector<int>> tr;
    auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
    auto cpu_place = platform::CPUPlace();
    auto stream = context.stream();
    auto feature_dims = node_features.dims();
    math::SetConstant<platform::CUDADeviceContext, T> constant;

    Tensor EdgeSet_cpu;
    framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu);
    int64_t feature_size = feature_dims[1];
    size_t patch_elem_size = 3 * static_cast<size_t>(feature_size);
    size_t node_count = 0, patch_count = 0, total_size = 0;
    size_t max_size = feature_dims[0];
    Tree2ColUtil::construct_tree(EdgeSet_cpu, &tr, &node_count);

    std::vector<std::vector<Node>> processing_list;
    for (size_t u = 1; u <= node_count; u++) {
      std::vector<Node> tmp = Tree2ColUtil::construct_patch(u, max_depth, tr);
      if (!tmp.empty()) {
        processing_list.push_back(tmp);
        total_size += tmp.size();
      }
    }

    size_t patch_size = processing_list.size();
    Tensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu;
    int* node = node_cpu.mutable_data<int>({static_cast<int64_t>(total_size)},
                                           cpu_place);
    T* eta = eta_cpu.mutable_data<T>({static_cast<int64_t>(total_size * 3)},
                                     cpu_place);
    int* index = index_cpu.mutable_data<int>(
        {static_cast<int64_t>(patch_size * 2)}, cpu_place);

    int idx = 0, index_idx = 0;
    for (auto& tmp : processing_list) {
      index[index_idx++] = idx;
      for (auto& v : tmp) {
        node[idx] = static_cast<int>(v.node - 1);
        eta[idx * 3] = v.eta_l<T>(max_depth);
        eta[idx * 3 + 1] = v.eta_r<T>(max_depth);
        eta[idx * 3 + 2] = v.eta_t<T>(max_depth);
        idx++;
      }
      index[index_idx++] = idx;
    }
    framework::TensorCopy(node_cpu, gpu_place, context, &node_gpu);
    framework::TensorCopy(eta_cpu, gpu_place, context, &eta_gpu);
    framework::TensorCopy(index_cpu, gpu_place, context, &index_gpu);

    int elem_size = patch_size * feature_size;
    int blocks = (elem_size + 1024 - 1) / 1024;
    int block_x = 512;
    int block_y = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(block_x, block_y);

    patch->mutable_data<T>(
        {static_cast<int64_t>(max_size), static_cast<int64_t>(patch_elem_size)},
        gpu_place);
    constant(context, patch, 0);
    tree2col<T><<<grid, threads, 0, stream>>>(
        eta_gpu.data<T>(), node_gpu.data<int>(), index_gpu.data<int>(),
        node_features.data<T>(), patch->data<T>(), feature_size, patch_size);
  }
};
template <typename T>
class Col2TreeFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& EdgeSet,
                  const framework::Tensor& patch_grad,
                  framework::Tensor* embedding_grad, int max_depth) {
    std::vector<std::vector<int>> tr;
    auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
    auto cpu_place = platform::CPUPlace();
    auto stream = context.stream();
    auto output_dims = patch_grad.dims();
    math::SetConstant<platform::CUDADeviceContext, T> constant;

    Tensor EdgeSet_cpu;
    framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu);
    int64_t output_size = output_dims[1];
    size_t patch_elem_size = 3 * static_cast<size_t>(output_size);
    size_t node_count = 0, patch_count = 0;
    size_t max_size = output_dims[0];
    Tree2ColUtil::construct_tree(EdgeSet_cpu, &tr, &node_count);
    std::vector<std::vector<Node>> processing_list;
    std::vector<std::vector<Node>> grad_list;
    grad_list.resize(node_count);
    size_t total_size = 0, grad_size = node_count;
    for (size_t u = 1; u <= node_count; u++) {
      std::vector<Node> tmp = Tree2ColUtil::construct_patch(u, max_depth, tr);
      if (!tmp.empty()) {
        processing_list.push_back(tmp);
      }
    }
    for (size_t patch_id = 0; patch_id < processing_list.size(); patch_id++) {
      for (auto v : processing_list[patch_id]) {
        grad_list[v.get_node() - 1].push_back(v.change_node(patch_id + 1));
      }
    }
    for (auto& tmp : grad_list) {
      total_size += tmp.size();
    }

    Tensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu;
    int* node = node_cpu.mutable_data<int>({static_cast<int64_t>(total_size)},
                                           cpu_place);
    T* eta = eta_cpu.mutable_data<T>({static_cast<int64_t>(total_size * 3)},
                                     cpu_place);
    int* index = index_cpu.mutable_data<int>(
        {static_cast<int64_t>(grad_size * 2)}, cpu_place);

    size_t idx = 0, index_idx = 0;
    for (auto& tmp : grad_list) {
      index[index_idx++] = idx;
      for (auto& v : tmp) {
        node[idx] = static_cast<int>(v.node - 1);
        eta[idx * 3] = v.eta_l<T>(max_depth);
        eta[idx * 3 + 1] = v.eta_r<T>(max_depth);
        eta[idx * 3 + 2] = v.eta_t<T>(max_depth);
        idx++;
      }
      index[index_idx++] = idx;
    }
    framework::TensorCopy(node_cpu, gpu_place, &node_gpu);
    framework::TensorCopy(eta_cpu, gpu_place, &eta_gpu);
    framework::TensorCopy(index_cpu, gpu_place, &index_gpu);

    int elem_size = output_size * grad_size;
    int blocks = (elem_size + 1024 - 1) / 1024;
    int block_x = 512;
    int block_y = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(block_x, block_y);

    embedding_grad->mutable_data<T>(
        {static_cast<int64_t>(max_size), static_cast<int64_t>(patch_elem_size)},
        gpu_place);

    constant(context, embedding_grad, 0);
    tree2col<T><<<grid, threads, 0, stream>>>(
        eta_gpu.data<T>(), node_gpu.data<int>(), index_gpu.data<int>(),
        patch_grad.data<T>(), embedding_grad->data<T>(), output_size,
        grad_size);
  }
};
template class Tree2ColFunctor<platform::CUDADeviceContext, float>;
template class Tree2ColFunctor<platform::CUDADeviceContext, double>;
template class Col2TreeFunctor<platform::CUDADeviceContext, float>;
template class Col2TreeFunctor<platform::CUDADeviceContext, double>;
}  // namespace math
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/math/tree2col.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <array>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
namespace operators {
namespace math {
class TreeNode {
 public:
  size_t node;
  explicit TreeNode(size_t node = 0, size_t index = 0, size_t pclen = 0,
                    size_t depth = 0)
      : node(node), index(index), pclen(pclen), depth(depth) {}
  template <typename T>
  T eta_t(T filter_depth) {
    return ((filter_depth - this->depth) / filter_depth);
  }
  template <typename T>
  T eta_l(T filter_depth) {
    T temp;
    if (this->pclen == 1) {
      temp = 0.5;
    } else {
      temp = (this->index - 1.0) / (this->pclen - 1.0);
    }
    return (1.0 - this->eta_t<T>(filter_depth)) * temp;
  }
  template <typename T>
  T eta_r(T filter_depth) {
    return (1.0 - this->eta_t<T>(filter_depth)) *
           (1.0 - this->eta_l<T>(filter_depth));
  }
  TreeNode change_node(size_t v) {
    return TreeNode(v, this->index, this->pclen, this->depth);
  }
  size_t get_node() { return this->node; }
  size_t get_depth() { return this->depth; }

 private:
  size_t index, pclen, depth;
};
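In the notation of the class above (index, pclen, depth and the filter depth d), these member templates implement the continuous-binary-tree weights:

\eta^t = \frac{d - \mathrm{depth}}{d}, \qquad
\eta^l = (1 - \eta^t)\,\frac{\mathrm{index} - 1}{\mathrm{pclen} - 1}\ \ (\text{the ratio is taken as } 1/2 \text{ when pclen} = 1), \qquad
\eta^r = (1 - \eta^t)(1 - \eta^l)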
class Tree2ColUtil {
 public:
  static std::vector<TreeNode> construct_patch(
      size_t root, int max_depth, const std::vector<std::vector<int>> &tr);

  static void construct_tree(const Tensor &EdgeSet,
                             std::vector<std::vector<int>> *tr,
                             size_t *node_count);
};
template <typename DeviceContext, typename T>
class Tree2ColFunctor {
 public:
  void operator()(const DeviceContext &context,
                  const framework::Tensor &EdgeSet,
                  const framework::Tensor &node_features,
                  framework::Tensor *patch, int max_depth);
};
template <typename DeviceContext, typename T>
class Col2TreeFunctor {
 public:
  void operator()(const DeviceContext &context,
                  const framework::Tensor &EdgeSet,
                  const framework::Tensor &out_grad,
                  framework::Tensor *in_grad, int max_depth);
};
}  // namespace math
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/tree_conv_op.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/tree_conv_op.h"
#include <string>
namespace paddle {
namespace operators {
class TreeConvOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("NodesVector",
             "(Tensor) The feature vector of every node on the tree. "
             "The shape of the feature vector must be "
             "[max_tree_node_size, feature_size].");
    AddInput("EdgeSet",
             "(Tensor) The Edges of Tree. The edge must be directional. "
             "The shape of the edge set must be [max_tree_node_size, 2].");
    AddInput("Filter",
             "(Tensor) The feature detector. "
             "The shape of the filter is "
             "[feature_size, 3, output_size, num_filters].");
    AddOutput("Out",
              "(Tensor) The feature vector of subtrees. "
              "The shape of the output tensor is [max_tree_node_size, "
              "output_size, num_filters]. "
              "The output tensor could be a new feature "
              "vector for next tree convolution layers.");
    AddAttr<int>("max_depth",
                 "(int, default: 2) The depth of feature detector.")
        .SetDefault(2)
        .GreaterThan(1);
    AddComment(R"DOC(
**Tree-Based Convolution Operator**

Tree-Based Convolution is a kind of convolution based on tree structure.
Tree-Based Convolution is a part of Tree-Based Convolution Neural Network(TBCNN),
which is used to classify tree structures, such as Abstract Syntax Tree.
Tree-Based Convolution proposed a kind of data structure called continuous binary tree,
which regards multiway tree as binary tree.
The paper of Tree-Based Convolution Operator is here:
https://arxiv.org/abs/1409.5718v1
)DOC");
  }
};
class TreeConvOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasOutput("Out"));
    auto edge_dims = ctx->GetInputDim("EdgeSet");
    auto vector_dims = ctx->GetInputDim("NodesVector");
    auto filter_dims = ctx->GetInputDim("Filter");
    PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2");
    PADDLE_ENFORCE_EQ(edge_dims.size(), 3,
                      "The dimension of EdgeSet Tensor should be 3");
    PADDLE_ENFORCE_EQ(vector_dims.size(), 3,
                      "The dimension of NodesVector Tensor should be 3");
    PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
                      "The dimension of Filter Tensor should be 4");
    PADDLE_ENFORCE_EQ(filter_dims[1], 3, "Input(Filter) dim[1] should be 3");
    PADDLE_ENFORCE_EQ(
        filter_dims[0], vector_dims[2],
        "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]");
    auto output_dims = framework::make_ddim(
        {vector_dims[0], vector_dims[1], filter_dims[2], filter_dims[3]});
    ctx->SetOutputDim("Out", output_dims);
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(ctx.Input<Tensor>("NodesVector")->type(),
                                   ctx.device_context());
  }
};
class TreeConvGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext *ctx) const override {
    auto vectors_dims = ctx->GetInputDim("NodesVector");
    auto filter_dims = ctx->GetInputDim("Filter");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "the gradient of output(Out) must not be null");
    if (ctx->HasOutput(framework::GradVarName("Filter"))) {
      ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
    }
    if (ctx->HasOutput(framework::GradVarName("NodesVector"))) {
      ctx->SetOutputDim(framework::GradVarName("NodesVector"), vectors_dims);
    }
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(ctx.Input<Tensor>("NodesVector")->type(),
                                   ctx.device_context());
  }
};
}  // namespace operators
}  // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(tree_conv, ops::TreeConvOp, ops::TreeConvOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(tree_conv_grad, ops::TreeConvGradOp);
REGISTER_OP_CPU_KERNEL(
    tree_conv, ops::TreeConvKernel<paddle::platform::CPUDeviceContext, float>,
    ops::TreeConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    tree_conv_grad,
    ops::TreeConvGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::TreeConvGradKernel<paddle::platform::CPUDeviceContext, double>);
paddle/fluid/operators/tree_conv_op.cu
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/tree_conv_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    tree_conv, ops::TreeConvKernel<paddle::platform::CUDADeviceContext, float>,
    ops::TreeConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    tree_conv_grad,
    ops::TreeConvGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::TreeConvGradKernel<paddle::platform::CUDADeviceContext, double>);
paddle/fluid/operators/tree_conv_op.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/tree2col.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
template <typename DeviceContext, typename T>
class TreeConvKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    math::Tree2ColFunctor<DeviceContext, T> tree2col;
    math::SetConstant<DeviceContext, T> constant;

    auto *Edges = ctx.Input<Tensor>("EdgeSet");
    auto *Embeddings = ctx.Input<Tensor>("NodesVector");
    auto *Filter = ctx.Input<Tensor>("Filter");
    auto *output_emb = ctx.Output<Tensor>("Out");
    int max_depth = ctx.Attr<int>("max_depth");

    auto &dev_ctx = ctx.template device_context<DeviceContext>();
    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);

    Tensor W;
    W.ShareDataWith(*Filter);
    W.Resize(framework::flatten_to_2d(Filter->dims(), 2));

    int batch_size = static_cast<int>(Edges->dims()[0]);
    int n = static_cast<int>(Embeddings->dims()[1]);
    int out_size = static_cast<int>(Filter->dims()[2]);
    int num_filters = static_cast<int>(Filter->dims()[3]);
    output_emb->mutable_data<T>({batch_size, n, out_size, num_filters},
                                ctx.GetPlace());

    auto edge_set_slicedim = framework::slice_ddim(
        Edges->dims(), 1, static_cast<int>(Edges->dims().size()));
    auto embedding_slicedim = framework::slice_ddim(
        Embeddings->dims(), 1, static_cast<int>(Embeddings->dims().size()));
    auto output_slicedim = framework::slice_ddim(
        output_emb->dims(), 1, static_cast<int>(output_emb->dims().size()));
    output_slicedim = framework::flatten_to_2d(output_slicedim, 1);

    for (int idx = 0; idx < batch_size; idx++) {
      auto edge_set = Edges->Slice(idx, idx + 1).Resize(edge_set_slicedim);
      auto embeddings =
          Embeddings->Slice(idx, idx + 1).Resize(embedding_slicedim);
      auto out_vec = output_emb->Slice(idx, idx + 1).Resize(output_slicedim);
      Tensor patch;
      tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth);
      constant(dev_ctx, &out_vec, 0);
      blas.MatMul(patch, W, &out_vec);
    }
  }
};
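Per sample, the kernel above is an im2col-style gather (tree2col) followed by a single GEMM. A NumPy sketch of that contraction; the shapes below are assumptions chosen only for illustration:

import numpy as np

n, feature_size, out_size, num_filters = 10, 5, 6, 1
patch = np.random.randn(n, 3 * feature_size)                 # what tree2col produces per sample
W = np.random.randn(feature_size, 3, out_size, num_filters)  # the Filter tensor

# W.Resize(flatten_to_2d(Filter->dims(), 2)) collapses the first two axes
W2d = W.reshape(3 * feature_size, out_size * num_filters)
out = (patch @ W2d).reshape(n, out_size, num_filters)        # per-sample slice of Out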
template <typename DeviceContext, typename T>
class TreeConvGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    auto *out_g = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto *in_g = ctx.Output<Tensor>(framework::GradVarName("NodesVector"));
    auto *filter_g = ctx.Output<Tensor>(framework::GradVarName("Filter"));
    int max_depth = ctx.Attr<int>("max_depth");
    auto *Embeddings = ctx.Input<Tensor>("NodesVector");
    auto *edges = ctx.Input<Tensor>("EdgeSet");
    auto *Filter = ctx.Input<Tensor>("Filter");
    math::Tree2ColFunctor<DeviceContext, T> tree2col;
    math::Col2TreeFunctor<DeviceContext, T> col2tree;
    math::SetConstant<DeviceContext, T> constant;
    auto &dev_ctx = ctx.template device_context<DeviceContext>();
    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);

    Tensor W;
    W.ShareDataWith(*Filter);
    W.Resize(framework::flatten_to_2d(Filter->dims(), 1));

    int batch_size = static_cast<int>(Embeddings->dims()[0]);

    auto edge_set_slicedim = framework::slice_ddim(
        edges->dims(), 1, static_cast<int>(edges->dims().size()));
    auto embedding_slicedim = framework::slice_ddim(
        Embeddings->dims(), 1, static_cast<int>(Embeddings->dims().size()));
    auto out_grad_dims = framework::slice_ddim(
        out_g->dims(), 1, static_cast<int>(out_g->dims().size()));
    out_grad_dims = framework::flatten_to_2d(out_grad_dims, 1);
    if (filter_g) {
      filter_g->mutable_data<T>(Filter->dims(), ctx.GetPlace());
      Tensor f_g;
      f_g.ShareDataWith(*filter_g);
      f_g.Resize(framework::flatten_to_2d(Filter->dims(), 2));
      constant(dev_ctx, filter_g, 0);
      for (int batch_id = 0; batch_id < batch_size; batch_id++) {
        auto edge_set =
            edges->Slice(batch_id, batch_id + 1).Resize(edge_set_slicedim);
        auto embeddings = Embeddings->Slice(batch_id, batch_id + 1)
                              .Resize(embedding_slicedim);
        auto out_grad =
            out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims);
        Tensor patch;
        tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth);
        blas.MatMul(patch, true, out_grad, false, T(1.0), &f_g, T(1.0));
      }
    }
    if (in_g) {
      auto input_grad_dims = framework::slice_ddim(
          in_g->dims(), 1, static_cast<int>(in_g->dims().size()));
      in_g->mutable_data<T>(Embeddings->dims(), ctx.GetPlace());
      constant(dev_ctx, in_g, 0);
      for (int batch_id = 0; batch_id < batch_size; batch_id++) {
        auto edge_set =
            edges->Slice(batch_id, batch_id + 1).Resize(edge_set_slicedim);
        auto out_grad =
            out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims);
        auto in_grad =
            in_g->Slice(batch_id, batch_id + 1).Resize(input_grad_dims);
        Tensor in_grad_temp;
        col2tree(dev_ctx, edge_set, out_grad, &in_grad_temp, max_depth);
        blas.MatMul(in_grad_temp, false, W, true, &in_grad);
      }
    }
  }
};
}  // namespace operators
}  // namespace paddle
python/paddle/fluid/async_executor.py
...
...
@@ -200,7 +200,6 @@ class AsyncExecutor(object):
            local_path,
            self.instance.get_worker_index(),
            self.instance.get_node_cnt() / 2,
            file_cnt,
            multi_processes=process_num)
        self.instance.barrier_worker()  #wait for download_data
...
...
python/paddle/fluid/layers/nn.py
...
...
@@ -183,6 +183,7 @@ __all__ = [
    'psroi_pool',
    'teacher_student_sigmoid_loss',
    'huber_loss',
    'tree_conv',
]

kIgnoreIndex = -100
...
...
@@ -864,12 +865,14 @@ def dynamic_gru(input,
                is_reverse=False,
                gate_activation='sigmoid',
                candidate_activation='tanh',
                h_0=None):
                h_0=None,
                origin_mode=False):
"""
**Gated Recurrent Unit (GRU) Layer**
Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on
Sequence Modeling <https://arxiv.org/abs/1412.3555>`_ .
if origin_mode is False, then the equation of a gru step is from paper
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_ .
The formula is as follows:
...
...
@@ -883,6 +886,21 @@ def dynamic_gru(input,
        h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}

    if origin_mode is True then the equation is from paper
    Learning Phrase Representations using RNN Encoder-Decoder for Statistical
    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_

    .. math::

        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)

        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)

        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)

        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
The :math:`\odot` is the element-wise product of the vectors. :math:`act_g`
is the update gate and reset gate activation function and :math:`sigmoid`
is usually used for it. :math:`act_c` is the activation function for
...
...
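A hedged usage sketch of the new flag from the Python side (variable names and sizes are illustrative; dynamic_gru expects its input width to be three times the hidden size):

import paddle.fluid as fluid

data = fluid.layers.data(name='sequence', shape=[1], dtype='int64', lod_level=1)
emb = fluid.layers.embedding(input=data, size=[10000, 512])
x = fluid.layers.fc(input=emb, size=512 * 3)
hidden = fluid.layers.dynamic_gru(input=x, size=512, origin_mode=True)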
@@ -980,7 +998,8 @@ def dynamic_gru(input,
        attrs={
            'is_reverse': is_reverse,
            'gate_activation': gate_activation,
            'activation': candidate_activation
            'activation': candidate_activation,
            'origin_mode': origin_mode
        })
    return hidden
...
...
@@ -991,9 +1010,14 @@ def gru_unit(input,
             param_attr=None,
             bias_attr=None,
             activation='tanh',
             gate_activation='sigmoid'):
             gate_activation='sigmoid',
             origin_mode=False):
"""
GRU unit layer. The equation of a gru step is:
**GRU unit layer**
if origin_mode is True, then the equation of a gru step is from paper
`Learning Phrase Representations using RNN Encoder-Decoder for Statistical
Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
.. math::
u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
...
...
@@ -1002,7 +1026,21 @@ def gru_unit(input,
m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
h_t & = dot((1-u_t), m_t) + dot(u_t, h_{t-1})
h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
if origin_mode is False, then the equation of a gru step is from paper
`Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
Modeling <https://arxiv.org/pdf/1412.3555.pdf>`_
.. math::
u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
h_t & = dot((1-u_t), h_{t-1}) + dot(u_t, m_t)
The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms
of the equation above, the :math:`z_t` is split into 3 parts -
...
...
@@ -9893,3 +9931,73 @@ def huber_loss(input, label, delta):
                'Residual': residual},
        attrs={'delta': delta})
    return out
@templatedoc()
def tree_conv(nodes_vector,
              edge_set,
              output_size,
              num_filters=1,
              max_depth=2,
              act='tanh',
              param_attr=None,
              bias_attr=None,
              name=None):
"""
${comment}
Args:
nodes_vector(${nodes_vector_type}): ${nodes_vector_comment}
edge_set(${edge_set_type}): ${edge_set_comment}
output_size(int): output feature width
num_filters(int): number of filters, Default 1
max_depth(int): max depth of filters, Default 2
act(str): activation function, Default tanh
param_attr(ParamAttr): the parameter attribute for the filters, Default None
bias_attr(ParamAttr): the parameter attribute for the bias of this layer, Default None
name(str): a name of this layer(optional). If set None, the layer will be named automatically, Default None
Returns:
out(${out_type}): ${out_comment}
Examples:
.. code-block:: python
          nodes_vector = layers.data(name='vectors', shape=[None, 10, 5], dtype='float32')
# None for batch size, 10 for max_node_size of dataset, 5 for vector width
edge_set = layers.data(name='edge_set', shape=[None, 10, 2], dtype='float32')
# None for batch size, 10 for max_node_size of dataset, 2 for every edge has two nodes
# edges must be directional
          out_vector = layers.tree_conv(nodes_vector, edge_set, 6, 1, 2, 'tanh',
              ParamAttr(initializer=Constant(1.0)), ParamAttr(initializer=Constant(1.0)))
# the shape of output will be [None, 10, 6, 1],
# None for batch size, 10 for max_node_size of dataset, 6 for output size, 1 for 1 filter
out_vector = layers.reshape(out_vector, shape=[None, 10, 6])
# After reshape, output tensor could be nodes_vector for next tree convolution
          out_vector_2 = layers.tree_conv(out_vector, edge_set, 3, 4, 2, 'tanh',
              ParamAttr(initializer=Constant(1.0)), ParamAttr(initializer=Constant(1.0)))
# also output tensor could be pooling(the pooling in paper called global pooling)
pooled = layers.reduce_max(out_vector, dims=2) # global pooling
"""
    helper = LayerHelper("tree_conv", **locals())
    dtype = helper.input_dtype('nodes_vector')
    feature_size = nodes_vector.shape[2]
    W_shape = [feature_size, 3, output_size, num_filters]
    W = helper.create_parameter(
        attr=param_attr, shape=W_shape, dtype=dtype, is_bias=False)
    if name == None:
        out = helper.create_variable_for_type_inference(dtype=dtype)
    else:
        out = helper.create_variable(name=name, dtype=dtype, persistable=False)
    helper.append_op(
        type='tree_conv',
        inputs={'NodesVector': nodes_vector,
                'EdgeSet': edge_set,
                'Filter': W},
        outputs={'Out': out, },
        attrs={'max_depth': max_depth})
    if helper.bias_attr:
        pre_activation = helper.append_bias_op(out)
    else:
        pre_activation = out
    return helper.append_activation(pre_activation)
python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
...
...
@@ -231,14 +231,17 @@ def infer(use_cuda, inference_program, params_dirname):
# Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one
# level of detail info, indicating that `data` consists of two sequences
# of length 3 and 2, respectively.
    user_id = fluid.create_lod_tensor([[1]], [[1]], place)
    gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
    age_id = fluid.create_lod_tensor([[0]], [[1]], place)
    job_id = fluid.create_lod_tensor([[10]], [[1]], place)
    movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
    category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
    movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]],
                                          place)
    user_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)
    gender_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)
    age_id = fluid.create_lod_tensor([[np.int64(0)]], [[1]], place)
    job_id = fluid.create_lod_tensor([[np.int64(10)]], [[1]], place)
    movie_id = fluid.create_lod_tensor([[np.int64(783)]], [[1]], place)
    category_id = fluid.create_lod_tensor(
        [np.array([10, 8, 9], dtype='int64')], [[3]], place)
    movie_title = fluid.create_lod_tensor(
        [np.array([1069, 4140, 2923, 710, 988], dtype='int64')], [[5]], place)

    results = inferencer.infer(
        {
...
...
python/paddle/fluid/tests/book/test_recommender_system.py
...
...
@@ -271,26 +271,30 @@ def infer(use_cuda, save_dirname=None):
# Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one
# level of detail info, indicating that `data` consists of two sequences
# of length 3 and 2, respectively.
    user_id = fluid.create_lod_tensor([[1]], [[1]], place)
    user_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)

    assert feed_target_names[1] == "gender_id"
    gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
    gender_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)

    assert feed_target_names[2] == "age_id"
    age_id = fluid.create_lod_tensor([[0]], [[1]], place)
    age_id = fluid.create_lod_tensor([[np.int64(0)]], [[1]], place)

    assert feed_target_names[3] == "job_id"
    job_id = fluid.create_lod_tensor([[10]], [[1]], place)
    job_id = fluid.create_lod_tensor([[np.int64(10)]], [[1]], place)

    assert feed_target_names[4] == "movie_id"
    movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
    movie_id = fluid.create_lod_tensor([[np.int64(783)]], [[1]], place)

    assert feed_target_names[5] == "category_id"
    category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
    category_id = fluid.create_lod_tensor(
        [np.array([10, 8, 9], dtype='int64')], [[3]], place)

    assert feed_target_names[6] == "movie_title"
    movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]],
                                          place)
    movie_title = fluid.create_lod_tensor(
        [np.array([1069, 4140, 2923, 710, 988], dtype='int64')], [[5]], place)
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
...
...
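Both recommender tests make the same change: the id feeds are now built from int64 values rather than plain Python ints, since the embedding lookups behind these feeds expect int64 indices. A minimal sketch of the two equivalent ways used above to build such a tensor, assuming a CPU place:

import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()

# A one-element sequence holding a single int64 id.
user_id = fluid.create_lod_tensor([[np.int64(1)]], [[1]], place)

# A length-3 sequence built from an int64 ndarray instead of a nested list.
category_id = fluid.create_lod_tensor(
    [np.array([10, 8, 9], dtype='int64')], [[3]], place)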
python/paddle/fluid/tests/unittests/test_auc_op.py
@@ -24,7 +24,7 @@ class TestAucOp(OpTest):
     def setUp(self):
         self.op_type = "auc"
         pred = np.random.random((128, 2)).astype("float32")
-        labels = np.random.randint(0, 2, (128, 1))
+        labels = np.random.randint(0, 2, (128, 1)).astype("int64")
         num_thresholds = 200

         stat_pos = np.zeros((num_thresholds + 1, )).astype("int64")
python/paddle/fluid/tests/unittests/test_gru_op.py
@@ -31,7 +31,8 @@ def gru(
         is_reverse,
         act_state,
         act_gate,
-        dtype='float32'):
+        dtype='float32',
+        origin_mode=False):
     def _seq_to_batch(lod, is_reverse):
         idx_in_seq_list = []
         seq_lens = lod[0]
@@ -66,7 +67,10 @@ def gru(
         w_c = w.flatten()[D * D * 2:].reshape((D, D))
         c = act_state(np.dot(r_h_p, w_c) + g[:, D * 2:])
         g = np.hstack((u_r, c))
-        h = u * c + (1 - u) * h_p
+        if origin_mode:
+            h = (1 - u) * c + u * h_p
+        else:
+            h = u * c + (1 - u) * h_p
         return g, r_h_p, h

     T = sum(lod[0])
@@ -110,6 +114,7 @@ class TestGRUOp(OpTest):
         self.act_state = 'tanh'
         self.act_gate = 'sigmoid'
         self.dtype = 'float64'
+        self.origin_mode = False
         self.set_confs()

         T = sum(self.lod[0])
@@ -126,7 +131,8 @@ class TestGRUOp(OpTest):
         batch_gate, batch_reset_hidden_prev, batch_hidden, hidden = gru(
             input, self.lod, h0, weight, bias, self.is_reverse,
-            ACTIVATION[self.act_state], ACTIVATION[self.act_gate], self.dtype)
+            ACTIVATION[self.act_state], ACTIVATION[self.act_gate], self.dtype,
+            self.origin_mode)
         self.inputs = {'Input': (input, self.lod), 'Weight': weight}

         if self.with_bias:
@@ -145,7 +151,8 @@ class TestGRUOp(OpTest):
         self.attrs = {
             'activation': self.act_state,
             'gate_activation': self.act_gate,
-            'is_reverse': self.is_reverse
+            'is_reverse': self.is_reverse,
+            'origin_mode': self.origin_mode
         }

     def test_check_output(self):
@@ -155,12 +162,24 @@ class TestGRUOp(OpTest):
         self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])


+class TestGRUOriginMode(TestGRUOp):
+    def set_confs(self):
+        self.origin_mode = True
+
+
 class TestGRUOp2(TestGRUOp):
     def set_confs(self):
         self.D = 19
         self.dtype = 'float32'


+class TestGRUOp2OriginMode(TestGRUOp):
+    def set_confs(self):
+        self.D = 19
+        self.dtype = 'float32'
+        self.origin_mode = True
+
+
 class TestGRUOpNoInitial(TestGRUOp):
     def set_confs(self):
         self.with_h0 = False
@@ -182,5 +201,11 @@ class TestGRUOpReverse(TestGRUOp):
         self.is_reverse = True


+class TestGRUOpReverseOriginMode(TestGRUOp):
+    def set_confs(self):
+        self.is_reverse = True
+        self.origin_mode = True
+
+
 if __name__ == "__main__":
     unittest.main()
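The new origin_mode flag switches the reference implementation between the two common GRU state-update conventions: by default h = u * c + (1 - u) * h_prev, while origin_mode uses h = (1 - u) * c + u * h_prev, which appears to match the formulation in the original GRU paper. A self-contained numpy sketch of the single-step difference, with names mirroring the reference code above:

import numpy as np

def gru_step(u, c, h_prev, origin_mode=False):
    # u: update gate, c: candidate state, h_prev: previous hidden state.
    if origin_mode:
        return (1 - u) * c + u * h_prev
    return u * c + (1 - u) * h_prev

u = np.array([0.2, 0.8])
c = np.ones(2)
h_prev = np.zeros(2)
print(gru_step(u, c, h_prev))                    # [0.2 0.8]
print(gru_step(u, c, h_prev, origin_mode=True))  # [0.8 0.2]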
python/paddle/fluid/tests/unittests/test_gru_unit_op.py
@@ -53,7 +53,7 @@ class TestGRUUnitOp(OpTest):
         GRUActivationType.relu: relu,
     }

-    def set_inputs(self):
+    def set_inputs(self, origin_mode=False):
         batch_size = self.batch_size
         frame_size = self.frame_size
         self.op_type = 'gru_unit'
@@ -68,10 +68,11 @@ class TestGRUUnitOp(OpTest):
         }
         self.attrs = {
             'activation': GRUActivationType.tanh,
-            'gate_activation': GRUActivationType.sigmoid
+            'gate_activation': GRUActivationType.sigmoid,
+            'origin_mode': origin_mode
         }

-    def set_outputs(self):
+    def set_outputs(self, origin_mode=False):
         # GRU calculations
         batch_size = self.batch_size
         frame_size = self.frame_size
@@ -93,7 +94,10 @@ class TestGRUUnitOp(OpTest):
         c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
                                                     g[:, frame_size * 2:])
         g = np.hstack((u_r, c))
-        h = u * c + (1 - u) * h_p
+        if origin_mode:
+            h = (1 - u) * c + u * h_p
+        else:
+            h = u * c + (1 - u) * h_p
         self.outputs = {
             'Gate': g.astype('float64'),
             'ResetHiddenPrev': r_h_p.astype('float64'),
@@ -111,8 +115,14 @@ class TestGRUUnitOp(OpTest):
         self.check_grad(['Input', 'HiddenPrev', 'Weight'], ['Hidden'])


+class TestGRUUnitOpOriginMode(TestGRUUnitOp):
+    def setUp(self):
+        self.set_inputs(origin_mode=True)
+        self.set_outputs(origin_mode=True)
+
+
 class TestGRUUnitOpWithBias(TestGRUUnitOp):
-    def set_inputs(self):
+    def set_inputs(self, origin_mode=False):
         batch_size = self.batch_size
         frame_size = self.frame_size
         super(TestGRUUnitOpWithBias, self).set_inputs()
@@ -120,7 +130,8 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp):
             -0.1, 0.1, (1, frame_size * 3)).astype('float64')
         self.attrs = {
             'activation': GRUActivationType.identity,
-            'gate_activation': GRUActivationType.sigmoid
+            'gate_activation': GRUActivationType.sigmoid,
+            'origin_mode': origin_mode
         }

     def test_check_grad(self):
@@ -132,5 +143,11 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp):
             no_grad_set=set('Input'))


+class TestGRUUnitOpWithBiasOriginMode(TestGRUUnitOpWithBias):
+    def setUp(self):
+        self.set_inputs(origin_mode=True)
+        self.set_outputs(origin_mode=True)
+
+
 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/test_nce.py
@@ -68,7 +68,8 @@ class TestNCE(OpTest):
         weight = np.random.randn(num_classes, dim).astype(np.float32)
         bias = np.random.randn(num_classes).astype(np.float32)
         sample_weight = np.random.randn(batch_size).astype(np.float32)
-        labels = np.random.randint(0, num_classes, (batch_size, num_true_class))
+        labels = np.random.randint(
+            0, num_classes, (batch_size, num_true_class)).astype("int64")

         self.attrs = {
             'num_total_classes': num_classes,
             'num_neg_samples': num_neg_samples,
python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -24,14 +24,14 @@ import os

 def Lenet(data, class_dim):
-    conv1 = fluid.layers.conv2d(data, 32, 5, 1, act=None)
+    conv1 = fluid.layers.conv2d(data, 4, 5, 1, act=None)
     bn1 = fluid.layers.batch_norm(conv1, act='relu')
     pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2)
-    conv2 = fluid.layers.conv2d(pool1, 50, 5, 1, act=None)
+    conv2 = fluid.layers.conv2d(pool1, 16, 5, 1, act=None)
     bn2 = fluid.layers.batch_norm(conv2, act='relu')
     pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2)

-    fc1 = fluid.layers.fc(pool2, size=500, act='relu')
+    fc1 = fluid.layers.fc(pool2, size=50, act='relu')
     fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax')

     return fc2
python/paddle/fluid/tests/unittests/test_tree_conv_op.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from op_test import OpTest


def collect_node_patch(og, max_depth):
    """
    The naive method to construct patches
    :param og: original graph
    :param max_depth: the depth of convolution filters
    :return: convolution patches
    """

    def gen(node, max_depth):
        collected = [(node, 1, 1, 0, max_depth)]

        def recurse_helper(node, depth):
            if depth > max_depth:
                return
            l = len(og[node])
            for idx, c in enumerate(og[node], 1):
                if depth + 1 < max_depth:
                    collected.append((c, idx, l, depth + 1, max_depth))
                recurse_helper(c, depth + 1)

        recurse_helper(node, 0)
        return collected

    res = []
    for u in range(1, len(og)):
        lis = gen(u, max_depth)
        if len(lis) > 0:
            res.append(lis)
    return res


class TestTreeConvOp(OpTest):
    def setUp(self):
        self.n = 17
        self.fea_size = 3
        self.output_size = 1
        self.max_depth = 2
        self.batch_size = 1
        self.num_filters = 1
        adj_array = [
            1, 2, 1, 3, 1, 4, 1, 5, 2, 6, 2, 7, 2, 8, 4, 9, 4, 10, 5, 11, 6,
            12, 6, 13, 9, 14, 9, 15, 9, 16, 9, 17
        ]
        adj = np.array(adj_array).reshape((1, self.n - 1, 2)).astype('int32')
        adj = np.tile(adj, (self.batch_size, 1, 1))
        self.op_type = 'tree_conv'
        vectors = np.random.random(
            (self.batch_size, self.n, self.fea_size)).astype('float32')
        self.inputs = {
            'EdgeSet': adj,
            'NodesVector': vectors,
            'Filter': np.random.random((self.fea_size, 3, self.output_size,
                                        self.num_filters)).astype('float32')
        }
        self.attrs = {'max_depth': self.max_depth}
        vectors = []
        for i in range(self.batch_size):
            vector = self.get_output_naive(i)
            vectors.append(vector)
        self.outputs = {'Out': np.array(vectors).astype('float32'), }

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(
            ['NodesVector', 'Filter'], 'Out', max_relative_error=0.5)

    def get_output_naive(self, batch_id):
        og = [[] for i in range(1, self.n + 2)]
        st = np.array(self.inputs['EdgeSet'][batch_id]).tolist()
        for e in st:
            og[e[0]].append(e[1])
        patches = collect_node_patch(og, self.max_depth)
        W = np.array(self.inputs['Filter']).astype('float32')
        W = np.transpose(W, axes=[1, 0, 2, 3])
        vec = []
        for i, patch in enumerate(patches, 1):
            result = np.zeros((1, W.shape[2], W.shape[3]))
            for v in patch:
                eta_t = float(v[4] - v[3]) / float(v[4])
                eta_l = (1.0 - eta_t) * (0.5 if v[2] == 1 else
                                         float(v[1] - 1.0) / float(v[2] - 1.0))
                eta_r = (1.0 - eta_t) * (1.0 - eta_l)
                x = self.inputs['NodesVector'][batch_id][v[0] - 1]
                eta = np.array([eta_l, eta_r, eta_t]).reshape(
                    (3, 1)).astype('float32')
                Wconvi = np.tensordot(eta, W, axes=([0], [0]))
                x = np.array(x).reshape((1, 1, self.fea_size))
                res = np.tensordot(x, Wconvi, axes=2)
                result = result + res
            vec.append(result)
        vec = np.concatenate(vec, axis=0)
        vec = np.concatenate(
            [
                vec, np.zeros(
                    (self.n - vec.shape[0], W.shape[2], W.shape[3]),
                    dtype='float32')
            ],
            axis=0)
        return vec
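The reference output in get_output_naive weights each node of a patch in the spirit of the continuous binary tree used by tree-based convolution: eta_t depends on the node's depth, while eta_l and eta_r depend on its position among its parent's children. To make the patch construction concrete, here is a small worked example of collect_node_patch on a three-node tree (node 1 with children 2 and 3), assuming the test module above is importable as test_tree_conv_op:

from test_tree_conv_op import collect_node_patch

# og[u] lists the children of node u; index 0 is unused because nodes are 1-based.
og = [[], [2, 3], [], []]
patches = collect_node_patch(og, max_depth=2)
# Each tuple is (node, child_index, num_children_of_parent, depth, max_depth).
# The patch rooted at node 1 is:
#   [(1, 1, 1, 0, 2), (2, 1, 2, 1, 2), (3, 2, 2, 1, 2)]
print(patches)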