Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
cc7f5514
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
cc7f5514
编写于
10月 17, 2018
作者:
Q
Qiao Longfei
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into optimize-pyreader
test=develop
上级
305d211a
6447155d
变更
94
隐藏空白更改
内联
并排
Showing
94 changed file
with
4511 addition
and
870 deletion
+4511
-870
cmake/inference_lib.cmake
cmake/inference_lib.cmake
+3
-2
paddle/fluid/API.spec
paddle/fluid/API.spec
+4
-1
paddle/fluid/framework/details/op_handle_base.h
paddle/fluid/framework/details/op_handle_base.h
+2
-1
paddle/fluid/framework/executor.cc
paddle/fluid/framework/executor.cc
+44
-42
paddle/fluid/framework/executor.h
paddle/fluid/framework/executor.h
+19
-25
paddle/fluid/framework/feed_fetch_method.cc
paddle/fluid/framework/feed_fetch_method.cc
+1
-2
paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+35
-104
paddle/fluid/framework/naive_executor.cc
paddle/fluid/framework/naive_executor.cc
+1
-1
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+11
-3
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+4
-0
paddle/fluid/framework/parallel_executor.h
paddle/fluid/framework/parallel_executor.h
+1
-1
paddle/fluid/framework/scope.cc
paddle/fluid/framework/scope.cc
+17
-12
paddle/fluid/framework/scope.h
paddle/fluid/framework/scope.h
+5
-0
paddle/fluid/framework/tensor_util.cc
paddle/fluid/framework/tensor_util.cc
+20
-0
paddle/fluid/framework/tensor_util_test.cc
paddle/fluid/framework/tensor_util_test.cc
+14
-0
paddle/fluid/framework/var_desc.h
paddle/fluid/framework/var_desc.h
+1
-0
paddle/fluid/framework/variable.h
paddle/fluid/framework/variable.h
+5
-1
paddle/fluid/framework/variable_test.cc
paddle/fluid/framework/variable_test.cc
+6
-5
paddle/fluid/inference/analysis/analyzer.cc
paddle/fluid/inference/analysis/analyzer.cc
+1
-1
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+18
-0
paddle/fluid/inference/api/analysis_predictor.h
paddle/fluid/inference/api/analysis_predictor.h
+1
-0
paddle/fluid/inference/api/api_impl.cc
paddle/fluid/inference/api/api_impl.cc
+5
-0
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+1
-0
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+4
-2
paddle/fluid/inference/tensorrt/convert/pad_op.cc
paddle/fluid/inference/tensorrt/convert/pad_op.cc
+68
-0
paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
+52
-0
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+1
-1
paddle/fluid/operators/adadelta_op.cc
paddle/fluid/operators/adadelta_op.cc
+12
-0
paddle/fluid/operators/adadelta_op.h
paddle/fluid/operators/adadelta_op.h
+11
-0
paddle/fluid/operators/adagrad_op.h
paddle/fluid/operators/adagrad_op.h
+20
-13
paddle/fluid/operators/adam_op.h
paddle/fluid/operators/adam_op.h
+9
-16
paddle/fluid/operators/adamax_op.cc
paddle/fluid/operators/adamax_op.cc
+10
-0
paddle/fluid/operators/adamax_op.h
paddle/fluid/operators/adamax_op.h
+11
-0
paddle/fluid/operators/decayed_adagrad_op.cc
paddle/fluid/operators/decayed_adagrad_op.cc
+10
-0
paddle/fluid/operators/decayed_adagrad_op.h
paddle/fluid/operators/decayed_adagrad_op.h
+11
-0
paddle/fluid/operators/fill_constant_op.cc
paddle/fluid/operators/fill_constant_op.cc
+8
-1
paddle/fluid/operators/ftrl_op.cc
paddle/fluid/operators/ftrl_op.cc
+10
-0
paddle/fluid/operators/ftrl_op.h
paddle/fluid/operators/ftrl_op.h
+11
-0
paddle/fluid/operators/fusion_lstm_op.cc
paddle/fluid/operators/fusion_lstm_op.cc
+102
-261
paddle/fluid/operators/isfinite_op.cc
paddle/fluid/operators/isfinite_op.cc
+3
-1
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+7
-5
paddle/fluid/operators/math/algorithm.h
paddle/fluid/operators/math/algorithm.h
+44
-0
paddle/fluid/operators/math/cpu_lstm_compute.h
paddle/fluid/operators/math/cpu_lstm_compute.h
+0
-64
paddle/fluid/operators/math/cpu_vec.h
paddle/fluid/operators/math/cpu_vec.h
+16
-19
paddle/fluid/operators/math/cpu_vec_test.cc
paddle/fluid/operators/math/cpu_vec_test.cc
+6
-10
paddle/fluid/operators/math/jit_kernel.cc
paddle/fluid/operators/math/jit_kernel.cc
+41
-0
paddle/fluid/operators/math/jit_kernel.h
paddle/fluid/operators/math/jit_kernel.h
+142
-0
paddle/fluid/operators/math/jit_kernel_blas.cc
paddle/fluid/operators/math/jit_kernel_blas.cc
+391
-0
paddle/fluid/operators/math/jit_kernel_exp.cc
paddle/fluid/operators/math/jit_kernel_exp.cc
+400
-0
paddle/fluid/operators/math/jit_kernel_lstm.cc
paddle/fluid/operators/math/jit_kernel_lstm.cc
+308
-0
paddle/fluid/operators/math/jit_kernel_macro.h
paddle/fluid/operators/math/jit_kernel_macro.h
+111
-0
paddle/fluid/operators/math/jit_kernel_test.cc
paddle/fluid/operators/math/jit_kernel_test.cc
+749
-0
paddle/fluid/operators/math/selected_rows_functor.cc
paddle/fluid/operators/math/selected_rows_functor.cc
+70
-20
paddle/fluid/operators/math/selected_rows_functor.h
paddle/fluid/operators/math/selected_rows_functor.h
+113
-0
paddle/fluid/operators/math/selected_rows_functor_test.cc
paddle/fluid/operators/math/selected_rows_functor_test.cc
+171
-0
paddle/fluid/operators/math/sequence_pooling.cc
paddle/fluid/operators/math/sequence_pooling.cc
+18
-3
paddle/fluid/operators/momentum_op.cc
paddle/fluid/operators/momentum_op.cc
+5
-0
paddle/fluid/operators/momentum_op.cu
paddle/fluid/operators/momentum_op.cu
+11
-0
paddle/fluid/operators/momentum_op.h
paddle/fluid/operators/momentum_op.h
+6
-0
paddle/fluid/operators/parallel_do_op.cc
paddle/fluid/operators/parallel_do_op.cc
+20
-1
paddle/fluid/operators/reader/blocking_queue.h
paddle/fluid/operators/reader/blocking_queue.h
+6
-3
paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
+6
-4
paddle/fluid/operators/reader/reader_blocking_queue_test.cc
paddle/fluid/operators/reader/reader_blocking_queue_test.cc
+24
-0
paddle/fluid/operators/reshape_op.cc
paddle/fluid/operators/reshape_op.cc
+3
-2
paddle/fluid/operators/rmsprop_op.cc
paddle/fluid/operators/rmsprop_op.cc
+5
-0
paddle/fluid/operators/rmsprop_op.h
paddle/fluid/operators/rmsprop_op.h
+229
-41
paddle/fluid/operators/sequence_concat_op.cc
paddle/fluid/operators/sequence_concat_op.cc
+4
-2
paddle/fluid/operators/sequence_unpad_op.cc
paddle/fluid/operators/sequence_unpad_op.cc
+153
-0
paddle/fluid/operators/sequence_unpad_op.cu
paddle/fluid/operators/sequence_unpad_op.cu
+30
-0
paddle/fluid/operators/sequence_unpad_op.h
paddle/fluid/operators/sequence_unpad_op.h
+104
-0
paddle/fluid/operators/sgd_op.cc
paddle/fluid/operators/sgd_op.cc
+16
-13
paddle/fluid/operators/sgd_op.cu
paddle/fluid/operators/sgd_op.cu
+6
-0
paddle/fluid/platform/cpu_info.cc
paddle/fluid/platform/cpu_info.cc
+1
-1
paddle/fluid/platform/cpu_info.h
paddle/fluid/platform/cpu_info.h
+1
-1
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+15
-5
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+5
-3
paddle/fluid/platform/enforce.h
paddle/fluid/platform/enforce.h
+7
-0
paddle/fluid/platform/gpu_info.cc
paddle/fluid/platform/gpu_info.cc
+18
-0
paddle/fluid/platform/gpu_info.h
paddle/fluid/platform/gpu_info.h
+6
-0
paddle/fluid/platform/init.cc
paddle/fluid/platform/init.cc
+1
-1
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+2
-2
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+94
-44
paddle/fluid/train/demo/README.md
paddle/fluid/train/demo/README.md
+1
-1
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+12
-12
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+2
-1
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+186
-2
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+12
-0
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+19
-2
python/paddle/fluid/tests/unittests/dist_simnet_bow.py
python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+17
-5
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+74
-4
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+21
-0
python/paddle/fluid/tests/unittests/test_rmsprop_op.py
python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+139
-92
python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py
...on/paddle/fluid/tests/unittests/test_sequence_unpad_op.py
+75
-0
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+16
-11
未找到文件。
cmake/inference_lib.cmake
浏览文件 @
cc7f5514
...
@@ -18,7 +18,7 @@ function(copy TARGET)
...
@@ -18,7 +18,7 @@ function(copy TARGET)
set
(
oneValueArgs
""
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS DSTS DEPS
)
set
(
multiValueArgs SRCS DSTS DEPS
)
cmake_parse_arguments
(
copy_lib
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
cmake_parse_arguments
(
copy_lib
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
set
(
inference_lib_dist_dep
${
TARGET
}
${
inference
_lib_dist_dep
}
PARENT_SCOPE
)
set
(
fluid_lib_dist_dep
${
TARGET
}
${
fluid
_lib_dist_dep
}
PARENT_SCOPE
)
list
(
LENGTH copy_lib_SRCS copy_lib_SRCS_len
)
list
(
LENGTH copy_lib_SRCS copy_lib_SRCS_len
)
list
(
LENGTH copy_lib_DSTS copy_lib_DSTS_len
)
list
(
LENGTH copy_lib_DSTS copy_lib_DSTS_len
)
...
@@ -185,7 +185,8 @@ copy(cmake_cache
...
@@ -185,7 +185,8 @@ copy(cmake_cache
SRCS
${
CMAKE_CURRENT_BINARY_DIR
}
/CMakeCache.txt
SRCS
${
CMAKE_CURRENT_BINARY_DIR
}
/CMakeCache.txt
DSTS
${
FLUID_INSTALL_DIR
}
)
DSTS
${
FLUID_INSTALL_DIR
}
)
add_custom_target
(
inference_lib_dist DEPENDS
${
inference_lib_dist_dep
}
)
# This command generates a complete fluid library for both train and inference
add_custom_target
(
fluid_lib_dist DEPENDS
${
fluid_lib_dist_dep
}
)
# paddle fluid version
# paddle fluid version
execute_process
(
execute_process
(
...
...
paddle/fluid/API.spec
浏览文件 @
cc7f5514
...
@@ -75,7 +75,8 @@ paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'outp
...
@@ -75,7 +75,8 @@ paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'outp
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
...
@@ -84,6 +85,7 @@ paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name']
...
@@ -84,6 +85,7 @@ paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name']
paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
...
@@ -127,6 +129,7 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None
...
@@ -127,6 +129,7 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None
paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.margin_rank_loss ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None))
paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None))
paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None))
paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
...
...
paddle/fluid/framework/details/op_handle_base.h
浏览文件 @
cc7f5514
...
@@ -64,7 +64,8 @@ class OpHandleBase {
...
@@ -64,7 +64,8 @@ class OpHandleBase {
virtual
bool
IsMultiDeviceTransfer
()
{
return
false
;
}
virtual
bool
IsMultiDeviceTransfer
()
{
return
false
;
}
const
platform
::
DeviceContext
*
DeviceContext
(
platform
::
Place
place
)
{
const
platform
::
DeviceContext
*
DeviceContext
(
platform
::
Place
place
)
{
return
dev_ctxes_
[
place
];
auto
it
=
dev_ctxes_
.
find
(
place
);
return
it
!=
dev_ctxes_
.
end
()
?
it
->
second
:
nullptr
;
}
}
void
SetDeviceContext
(
platform
::
Place
place
,
platform
::
DeviceContext
*
ctx_
)
{
void
SetDeviceContext
(
platform
::
Place
place
,
platform
::
DeviceContext
*
ctx_
)
{
...
...
paddle/fluid/framework/executor.cc
浏览文件 @
cc7f5514
...
@@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
...
@@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
VLOG
(
5
)
<<
"destroy ExecutorPrepareContext"
;
VLOG
(
5
)
<<
"destroy ExecutorPrepareContext"
;
}
}
template
<
typename
RefCntMap
>
static
void
DeleteUnusedTensors
(
const
Scope
&
scope
,
const
OperatorBase
*
op
,
GarbageCollector
<
Tensor
>*
gc
,
RefCntMap
*
ref_cnts
)
{
std
::
unordered_set
<
Tensor
*>
erase_tensors
;
auto
handler
=
[
&
](
const
VariableNameMap
&
name_map
)
{
for
(
auto
&
name_pair
:
name_map
)
{
for
(
auto
&
name
:
name_pair
.
second
)
{
auto
it
=
ref_cnts
->
find
(
name
);
if
(
it
==
ref_cnts
->
end
())
continue
;
if
((
it
->
second
)
--
==
1
)
{
auto
*
var
=
scope
.
FindVar
(
name
);
if
(
var
!=
nullptr
)
{
VLOG
(
10
)
<<
"Erase tensor
\'
"
<<
name
<<
"
\'
"
;
if
(
var
->
IsType
<
LoDTensor
>
())
{
erase_tensors
.
insert
(
var
->
GetMutable
<
LoDTensor
>
());
}
else
if
(
var
->
IsType
<
SelectedRows
>
())
{
erase_tensors
.
insert
(
var
->
GetMutable
<
SelectedRows
>
()
->
mutable_value
());
}
}
}
}
}
};
handler
(
op
->
Inputs
());
handler
(
op
->
Outputs
());
if
(
!
erase_tensors
.
empty
())
{
gc
->
Add
(
erase_tensors
);
}
}
Executor
::
Executor
(
const
platform
::
Place
&
place
)
:
place_
(
place
)
{}
Executor
::
Executor
(
const
platform
::
Place
&
place
)
:
place_
(
place
)
{}
void
Executor
::
Close
()
{
void
Executor
::
Close
()
{
...
@@ -66,7 +101,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
...
@@ -66,7 +101,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
}
else
if
(
var_type
==
proto
::
VarType
::
FETCH_LIST
)
{
}
else
if
(
var_type
==
proto
::
VarType
::
FETCH_LIST
)
{
var
->
GetMutable
<
FeedFetchList
>
();
var
->
GetMutable
<
FeedFetchList
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
STEP_SCOPES
)
{
}
else
if
(
var_type
==
proto
::
VarType
::
STEP_SCOPES
)
{
var
->
GetMutable
<
std
::
vector
<
framework
::
Scope
>>
();
var
->
GetMutable
<
std
::
vector
<
framework
::
Scope
*
>>
();
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_RANK_TABLE
)
{
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_RANK_TABLE
)
{
var
->
GetMutable
<
LoDRankTable
>
();
var
->
GetMutable
<
LoDRankTable
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_TENSOR_ARRAY
)
{
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_TENSOR_ARRAY
)
{
...
@@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
...
@@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
}
}
int64_t
max_memory_size
=
GetEagerDeletionThreshold
();
int64_t
max_memory_size
=
GetEagerDeletionThreshold
();
std
::
unique_ptr
<
GarbageCollector
<
Tensor
>>
gc
;
std
::
unique_ptr
<
GarbageCollector
<
Tensor
>>
gc
;
if
(
max_memory_size
>=
0
)
{
// WhileOp would set keep_kids to false
// WhileGradOp would need the scopes created in WhileOp
// Perhaps, we should not perform eager deletion in WhileOp
// The scopes and variables created by WhileOp would be deleted
// in WhileGradOp.
if
(
max_memory_size
>=
0
&&
!
keep_kids
)
{
ctx
->
ResetReferenceCount
();
ctx
->
ResetReferenceCount
();
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
if
(
platform
::
is_gpu_place
(
place_
))
{
if
(
platform
::
is_gpu_place
(
place_
))
{
...
@@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
...
@@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
op
->
Run
(
*
local_scope
,
place_
);
op
->
Run
(
*
local_scope
,
place_
);
if
(
gc
!=
nullptr
)
{
if
(
gc
!=
nullptr
)
{
std
::
vector
<
std
::
string
>
erase_vars
;
DeleteUnusedTensors
(
*
local_scope
,
op
.
get
(),
gc
.
get
(),
for
(
auto
&
input
:
op
->
Inputs
())
{
&
(
ctx
->
cur_ref_cnts_
));
for
(
auto
&
input_name
:
input
.
second
)
{
auto
it
=
ctx
->
cur_ref_cnts_
.
find
(
input_name
);
if
(
it
==
ctx
->
cur_ref_cnts_
.
end
())
continue
;
if
(
it
->
second
==
1
)
{
// should delete it
erase_vars
.
emplace_back
(
input_name
);
ctx
->
cur_ref_cnts_
.
erase
(
input_name
);
}
else
{
--
(
it
->
second
);
}
}
}
for
(
auto
&
output
:
op
->
Outputs
())
{
for
(
auto
&
output_name
:
output
.
second
)
{
auto
it
=
ctx
->
cur_ref_cnts_
.
find
(
output_name
);
if
(
it
==
ctx
->
cur_ref_cnts_
.
end
())
continue
;
if
(
it
->
second
==
1
)
{
erase_vars
.
emplace_back
(
output_name
);
ctx
->
cur_ref_cnts_
.
erase
(
output_name
);
}
else
{
--
(
it
->
second
);
}
}
}
if
(
!
erase_vars
.
empty
())
{
std
::
vector
<
framework
::
LoDTensor
*>
erase_tensors
;
for
(
auto
&
name
:
erase_vars
)
{
auto
*
var
=
local_scope
->
FindVar
(
name
);
if
(
var
==
nullptr
)
continue
;
if
(
var
->
IsType
<
framework
::
LoDTensor
>
())
{
auto
*
tensor
=
var
->
GetMutable
<
framework
::
LoDTensor
>
();
erase_tensors
.
push_back
(
tensor
);
}
}
if
(
!
erase_tensors
.
empty
())
gc
->
Add
(
erase_tensors
);
}
}
}
if
(
FLAGS_benchmark
)
{
if
(
FLAGS_benchmark
)
{
...
...
paddle/fluid/framework/executor.h
浏览文件 @
cc7f5514
...
@@ -32,38 +32,32 @@ template <typename T>
...
@@ -32,38 +32,32 @@ template <typename T>
std
::
unordered_map
<
std
::
string
,
T
>
GetNonPersistableReferenceCount
(
std
::
unordered_map
<
std
::
string
,
T
>
GetNonPersistableReferenceCount
(
const
ProgramDesc
&
prog
,
size_t
block_id
)
{
const
ProgramDesc
&
prog
,
size_t
block_id
)
{
auto
&
block
=
prog
.
Block
(
block_id
);
auto
&
block
=
prog
.
Block
(
block_id
);
std
::
unordered_set
<
std
::
string
>
ignored_vars
;
std
::
unordered_map
<
std
::
string
,
T
>
ref_cnts
;
std
::
unordered_map
<
std
::
string
,
T
>
ref_cnts
;
for
(
auto
var_desc
:
block
.
AllVars
())
{
auto
update_ref_cnts
=
[
&
](
OpDesc
*
op_desc
,
const
VariableNameMap
&
name_map
)
{
auto
type
=
var_desc
->
Proto
()
->
type
().
type
();
for
(
auto
&
name_pair
:
name_map
)
{
if
(
type
!=
proto
::
VarType
::
LOD_TENSOR
||
var_desc
->
Persistable
())
{
for
(
auto
&
name
:
name_pair
.
second
)
{
ignored_vars
.
insert
(
var_desc
->
Name
());
// ignore persistable vars
auto
*
var_desc
=
block
.
FindVar
(
name
);
}
if
(
var_desc
==
nullptr
||
var_desc
->
Persistable
())
continue
;
}
auto
type
=
var_desc
->
Proto
()
->
type
().
type
();
if
(
type
!=
proto
::
VarType
::
LOD_TENSOR
&&
for
(
auto
op_desc
:
block
.
AllOps
())
{
type
!=
proto
::
VarType
::
SELECTED_ROWS
)
{
for
(
auto
&
input
:
op_desc
->
Inputs
())
{
continue
;
for
(
auto
&
input_name
:
input
.
second
)
{
if
(
!
ignored_vars
.
count
(
input_name
))
{
if
(
ref_cnts
.
count
(
input_name
))
++
ref_cnts
[
input_name
];
else
ref_cnts
[
input_name
]
=
1
;
}
}
}
}
for
(
auto
&
output
:
op_desc
->
Outputs
())
{
auto
it
=
ref_cnts
.
find
(
name
);
for
(
auto
output_name
:
output
.
second
)
{
if
(
it
!=
ref_cnts
.
end
())
{
if
(
!
ignored_vars
.
count
(
output_name
))
{
++
it
->
second
;
if
(
ref_cnts
.
count
(
output_name
))
}
else
{
++
ref_cnts
[
output_name
];
ref_cnts
[
name
]
=
1
;
else
ref_cnts
[
output_name
]
=
1
;
}
}
}
}
}
}
};
for
(
auto
op_desc
:
block
.
AllOps
())
{
update_ref_cnts
(
op_desc
,
op_desc
->
Inputs
());
update_ref_cnts
(
op_desc
,
op_desc
->
Outputs
());
}
}
return
ref_cnts
;
return
ref_cnts
;
}
}
...
...
paddle/fluid/framework/feed_fetch_method.cc
浏览文件 @
cc7f5514
...
@@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
...
@@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
// be created.
// be created.
VLOG
(
3
)
<<
"SetFeedVariable name="
<<
var_name
<<
" index="
<<
index
;
VLOG
(
3
)
<<
"SetFeedVariable name="
<<
var_name
<<
" index="
<<
index
;
Variable
*
g_feed_value
=
scope
->
Var
(
var_name
);
Variable
*
g_feed_value
=
scope
->
Var
(
var_name
);
auto
&
feed_inputs
=
auto
&
feed_inputs
=
*
(
g_feed_value
->
GetMutable
<
FeedFetchList
>
());
*
(
g_feed_value
->
GetMutable
<
std
::
vector
<
paddle
::
framework
::
LoDTensor
>>
());
if
(
index
>=
feed_inputs
.
size
())
{
if
(
index
>=
feed_inputs
.
size
())
{
feed_inputs
.
resize
(
index
+
1
);
feed_inputs
.
resize
(
index
+
1
);
}
}
...
...
paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
浏览文件 @
cc7f5514
...
@@ -44,89 +44,6 @@ namespace ir {
...
@@ -44,89 +44,6 @@ namespace ir {
GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name); \
GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name)
GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name)
template
<
typename
UnaryOperation
>
LoDTensor
tensor_apply
(
const
LoDTensor
&
vec
,
UnaryOperation
f
)
{
LoDTensor
vec_y
;
vec_y
.
Resize
(
vec
.
dims
());
const
float
*
x
=
vec
.
data
<
float
>
();
float
*
y
=
vec_y
.
mutable_data
<
float
>
(
platform
::
CPUPlace
());
for
(
int64_t
i
=
0
;
i
<
vec
.
numel
();
i
++
)
{
y
[
i
]
=
f
(
x
[
i
]);
}
return
vec_y
;
}
void
tensor_apply_inplace
(
LoDTensor
*
vec
,
float
(
*
f
)(
float
))
{
float
*
data
=
vec
->
mutable_data
<
float
>
(
platform
::
CPUPlace
());
for
(
int64_t
i
=
0
;
i
<
vec
->
numel
();
i
++
)
{
data
[
i
]
=
f
(
data
[
i
]);
}
}
template
<
typename
BinaryOperation
>
LoDTensor
tensor_apply_eltwise
(
const
LoDTensor
&
vec_a
,
const
LoDTensor
&
vec_b
,
BinaryOperation
f
)
{
PADDLE_ENFORCE_EQ
(
vec_a
.
dims
(),
vec_b
.
dims
());
LoDTensor
vec_y
;
vec_y
.
Resize
(
vec_a
.
dims
());
const
float
*
a
=
vec_a
.
data
<
float
>
();
const
float
*
b
=
vec_b
.
data
<
float
>
();
float
*
y
=
vec_y
.
mutable_data
<
float
>
(
platform
::
CPUPlace
());
for
(
int64_t
i
=
0
;
i
<
vec_a
.
numel
();
i
++
)
{
y
[
i
]
=
f
(
a
[
i
],
b
[
i
]);
}
return
vec_y
;
}
template
<
typename
BinaryOperation
>
LoDTensor
tensor_apply_eltwise_broadcast
(
const
LoDTensor
&
vec_a
,
const
LoDTensor
&
vec_b
,
BinaryOperation
f
)
{
PADDLE_ENFORCE_EQ
(
vec_a
.
dims
().
size
(),
2
);
PADDLE_ENFORCE_EQ
(
vec_b
.
dims
().
size
(),
2
);
PADDLE_ENFORCE_EQ
(
vec_a
.
dims
()[
0
],
vec_b
.
dims
()[
0
]);
PADDLE_ENFORCE_EQ
(
vec_b
.
dims
()[
1
],
1
);
LoDTensor
vec_y
;
vec_y
.
Resize
(
vec_a
.
dims
());
const
float
*
a
=
vec_a
.
data
<
float
>
();
const
float
*
b
=
vec_b
.
data
<
float
>
();
float
*
y
=
vec_y
.
mutable_data
<
float
>
(
platform
::
CPUPlace
());
size_t
a_height
=
vec_a
.
dims
()[
0
];
size_t
a_width
=
vec_a
.
dims
()[
1
];
for
(
size_t
h
=
0
;
h
<
a_height
;
h
++
)
{
for
(
size_t
w
=
0
;
w
<
a_width
;
++
w
)
{
*
(
y
++
)
=
f
(
*
(
a
++
),
b
[
h
]);
}
}
return
vec_y
;
}
// reshape to two dimensions {A, B * C * ...}
void
make_tensor_2d
(
LoDTensor
*
tensor_to_reshape
)
{
auto
dims_count
=
tensor_to_reshape
->
dims
().
size
();
PADDLE_ENFORCE_GT
(
dims_count
,
0
);
int
size2
=
1
;
for
(
int
i
=
1
;
i
<
dims_count
;
i
++
)
{
size2
*=
tensor_to_reshape
->
dims
()[
i
];
}
tensor_to_reshape
->
Resize
(
make_ddim
({
tensor_to_reshape
->
dims
()[
0
],
size2
}));
}
void
recompute_conv_weights
(
LoDTensor
*
weights
,
LoDTensor
*
tmp
)
{
// remember the weights tensor shape {A, B, C, ...}
auto
weights_shape
=
weights
->
dims
();
// reduce the weights to 2d {A, B * C * ...}
make_tensor_2d
(
weights
);
// make tmp tensor 2d by adding 1 as second dim {A, 1}
make_tensor_2d
(
tmp
);
*
weights
=
tensor_apply_eltwise_broadcast
(
*
weights
,
*
tmp
,
std
::
multiplies
<
float
>
());
// reshape weights to the original dims {A, B, C, ...}
weights
->
Resize
(
weights_shape
);
}
void
recompute_bias_and_weights
(
const
Scope
*
scope
,
void
recompute_bias_and_weights
(
const
Scope
*
scope
,
ir
::
Node
*
conv_weight
,
//
ir
::
Node
*
conv_weight
,
//
const
ir
::
Node
&
bn_scale
,
//
const
ir
::
Node
&
bn_scale
,
//
...
@@ -135,6 +52,13 @@ void recompute_bias_and_weights(const Scope* scope,
...
@@ -135,6 +52,13 @@ void recompute_bias_and_weights(const Scope* scope,
const
ir
::
Node
&
bn_variance
,
//
const
ir
::
Node
&
bn_variance
,
//
LoDTensor
*
eltwise_y_in_tensor
,
//
LoDTensor
*
eltwise_y_in_tensor
,
//
float
epsilon
)
{
float
epsilon
)
{
using
EigenVectorArrayMap
=
Eigen
::
Map
<
Eigen
::
Array
<
float
,
Eigen
::
Dynamic
,
1
>>
;
using
ConstEigenVectorArrayMap
=
Eigen
::
Map
<
const
Eigen
::
Array
<
float
,
Eigen
::
Dynamic
,
1
>>
;
using
EigenMatrixArrayMap
=
Eigen
::
Map
<
Eigen
::
Array
<
float
,
Eigen
::
Dynamic
,
Eigen
::
Dynamic
,
Eigen
::
RowMajor
>>
;
// Re-compute bias of conv2d from BN
// Re-compute bias of conv2d from BN
PADDLE_ENFORCE_EQ
(
eltwise_y_in_tensor
->
dims
(),
bn_bias_tensor
.
dims
());
PADDLE_ENFORCE_EQ
(
eltwise_y_in_tensor
->
dims
(),
bn_bias_tensor
.
dims
());
...
@@ -143,31 +67,38 @@ void recompute_bias_and_weights(const Scope* scope,
...
@@ -143,31 +67,38 @@ void recompute_bias_and_weights(const Scope* scope,
scope
->
FindVar
(
bn_variance
.
Name
())
->
GetMutable
<
LoDTensor
>
();
scope
->
FindVar
(
bn_variance
.
Name
())
->
GetMutable
<
LoDTensor
>
();
auto
*
mean_tensor
=
scope
->
FindVar
(
bn_mean
.
Name
())
->
GetMutable
<
LoDTensor
>
();
auto
*
mean_tensor
=
scope
->
FindVar
(
bn_mean
.
Name
())
->
GetMutable
<
LoDTensor
>
();
auto
std_tensor
=
LoDTensor
();
ConstEigenVectorArrayMap
scale_array
(
scale_tensor
->
data
<
float
>
(),
std_tensor
.
Resize
(
bn_bias_tensor
.
dims
());
scale_tensor
->
numel
(),
1
);
std_tensor
=
EigenVectorArrayMap
variance_array
(
tensor_apply
(
*
variance_tensor
,
[
&
](
float
x
)
{
return
x
+
epsilon
;
});
variance_tensor
->
mutable_data
<
float
>
(
platform
::
CPUPlace
()),
variance_tensor
->
numel
(),
1
);
ConstEigenVectorArrayMap
mean_array
(
mean_tensor
->
data
<
float
>
(),
mean_tensor
->
numel
(),
1
);
ConstEigenVectorArrayMap
bn_bias_array
(
bn_bias_tensor
.
data
<
float
>
(),
bn_bias_tensor
.
numel
(),
1
);
using
EigenVectorArrayMap
=
// variance will not be used anymore, so make it std_array and then tmp_array
Eigen
::
Map
<
Eigen
::
Array
<
float
,
Eigen
::
Dynamic
,
1
>>
;
variance_array
+=
epsilon
;
variance_array
=
variance_array
.
sqrt
();
variance_array
=
scale_array
/
variance_array
;
EigenVectorArrayMap
eltwise_y_in_array
(
eltwise_y_in_tensor
->
mutable_data
<
float
>
(
platform
::
CPUPlace
()),
eltwise_y_in_tensor
->
numel
(),
1
);
EigenVectorArrayMap
std_vec
(
eltwise_y_in_array
=
std_tensor
.
mutable_data
<
float
>
(
platform
::
CPUPlace
()),
std_tensor
.
numel
(),
((
eltwise_y_in_array
-
mean_array
)
*
variance_array
)
+
bn_bias_array
;
1
);
std_vec
=
std_vec
.
sqrt
();
auto
tmp_tensor
=
tensor_apply_eltwise
(
*
scale_tensor
,
std_tensor
,
std
::
divides
<
float
>
());
auto
tensor_minus
=
tensor_apply_eltwise
(
*
eltwise_y_in_tensor
,
*
mean_tensor
,
std
::
minus
<
float
>
());
auto
tensor_mul
=
tensor_apply_eltwise
(
tensor_minus
,
tmp_tensor
,
std
::
multiplies
<
float
>
());
*
eltwise_y_in_tensor
=
tensor_apply_eltwise
(
tensor_mul
,
bn_bias_tensor
,
std
::
plus
<
float
>
());
// Re-compute weight of conv2d from BN
// Re-compute weight of conv2d from BN
auto
*
current_param
=
auto
*
weights
=
scope
->
FindVar
(
conv_weight
->
Name
())
->
GetMutable
<
LoDTensor
>
();
scope
->
FindVar
(
conv_weight
->
Name
())
->
GetMutable
<
LoDTensor
>
();
auto
weights_shape
=
weights
->
dims
();
recompute_conv_weights
(
current_param
,
&
tmp_tensor
);
auto
weights_shape_2d
=
flatten_to_2d
(
weights_shape
,
1
);
EigenMatrixArrayMap
weights_array_2d
(
weights
->
mutable_data
<
float
>
(
platform
::
CPUPlace
()),
weights_shape_2d
[
0
],
weights_shape_2d
[
1
]);
weights_array_2d
.
colwise
()
*=
variance_array
;
}
}
std
::
unique_ptr
<
ir
::
Graph
>
ConvBNFusePass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
ConvBNFusePass
::
ApplyImpl
(
...
...
paddle/fluid/framework/naive_executor.cc
浏览文件 @
cc7f5514
...
@@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
...
@@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
}
else
if
(
var_type
==
proto
::
VarType
::
FETCH_LIST
)
{
}
else
if
(
var_type
==
proto
::
VarType
::
FETCH_LIST
)
{
var
->
GetMutable
<
FeedFetchList
>
();
var
->
GetMutable
<
FeedFetchList
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
STEP_SCOPES
)
{
}
else
if
(
var_type
==
proto
::
VarType
::
STEP_SCOPES
)
{
var
->
GetMutable
<
std
::
vector
<
framework
::
Scope
>>
();
var
->
GetMutable
<
std
::
vector
<
framework
::
Scope
*
>>
();
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_RANK_TABLE
)
{
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_RANK_TABLE
)
{
var
->
GetMutable
<
LoDRankTable
>
();
var
->
GetMutable
<
LoDRankTable
>
();
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_TENSOR_ARRAY
)
{
}
else
if
(
var_type
==
proto
::
VarType
::
LOD_TENSOR_ARRAY
)
{
...
...
paddle/fluid/framework/operator.cc
浏览文件 @
cc7f5514
...
@@ -149,9 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
...
@@ -149,9 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform
::
SetDeviceId
(
dev_id
);
platform
::
SetDeviceId
(
dev_id
);
#endif
#endif
}
}
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
RecordEvent
record_event
(
Type
(),
pool
.
Get
(
place
));
// The profile has a process-wide mutex, results in serious performance issue
RunImpl
(
scope
,
place
);
// in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern.
if
(
platform
::
IsProfileEnabled
())
{
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
RecordEvent
record_event
(
Type
(),
pool
.
Get
(
place
));
RunImpl
(
scope
,
place
);
}
else
{
RunImpl
(
scope
,
place
);
}
VLOG
(
3
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
VLOG
(
3
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
}
}
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
cc7f5514
...
@@ -307,6 +307,10 @@ ParallelExecutor::~ParallelExecutor() {
...
@@ -307,6 +307,10 @@ ParallelExecutor::~ParallelExecutor() {
}
}
}
}
}
}
// member_ must be destructed before gcs_ since the destructor of
// ReferenceCountOpHandle use raw pointers of gcs_ inside.
member_
.
reset
();
}
}
}
// namespace framework
}
// namespace framework
...
...
paddle/fluid/framework/parallel_executor.h
浏览文件 @
cc7f5514
...
@@ -75,7 +75,7 @@ class ParallelExecutor {
...
@@ -75,7 +75,7 @@ class ParallelExecutor {
private:
private:
void
BCastParamsToDevices
(
const
std
::
unordered_set
<
std
::
string
>
&
vars
)
const
;
void
BCastParamsToDevices
(
const
std
::
unordered_set
<
std
::
string
>
&
vars
)
const
;
ParallelExecutorPrivate
*
member_
;
std
::
unique_ptr
<
ParallelExecutorPrivate
>
member_
;
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
// ref_cnts_ is only initialized when ParallelExecutor constructs, and then
// ref_cnts_ is only initialized when ParallelExecutor constructs, and then
...
...
paddle/fluid/framework/scope.cc
浏览文件 @
cc7f5514
...
@@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() {
...
@@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() {
Scope
::~
Scope
()
{
DropKids
();
}
Scope
::~
Scope
()
{
DropKids
();
}
Scope
&
Scope
::
NewScope
()
const
{
Scope
&
Scope
::
NewScope
()
const
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
kids_
.
push_back
(
new
Scope
(
this
));
kids_
.
push_back
(
new
Scope
(
this
));
return
*
kids_
.
back
();
return
*
kids_
.
back
();
}
}
Variable
*
Scope
::
Var
(
const
std
::
string
&
name
)
{
Variable
*
Scope
::
Var
(
const
std
::
string
&
name
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
return
VarInternal
(
name
);
return
VarInternal
(
name
);
}
}
Variable
*
Scope
::
Var
(
std
::
string
*
name
)
{
Variable
*
Scope
::
Var
(
std
::
string
*
name
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
auto
new_name
=
string
::
Sprintf
(
"%p.%d"
,
this
,
vars_
.
size
());
auto
new_name
=
string
::
Sprintf
(
"%p.%d"
,
this
,
vars_
.
size
());
if
(
name
!=
nullptr
)
{
if
(
name
!=
nullptr
)
{
*
name
=
new_name
;
*
name
=
new_name
;
...
@@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) {
...
@@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) {
}
}
Variable
*
Scope
::
FindVar
(
const
std
::
string
&
name
)
const
{
Variable
*
Scope
::
FindVar
(
const
std
::
string
&
name
)
const
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
return
FindVarInternal
(
name
);
return
FindVarInternal
(
name
);
}
}
Variable
*
Scope
::
FindLocalVar
(
const
std
::
string
&
name
)
const
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
return
FindVarLocally
(
name
);
}
const
Scope
*
Scope
::
FindScope
(
const
Variable
*
var
)
const
{
const
Scope
*
Scope
::
FindScope
(
const
Variable
*
var
)
const
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
return
FindScopeInternal
(
var
);
return
FindScopeInternal
(
var
);
}
}
void
Scope
::
DropKids
()
{
void
Scope
::
DropKids
()
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
for
(
Scope
*
s
:
kids_
)
delete
s
;
for
(
Scope
*
s
:
kids_
)
delete
s
;
kids_
.
clear
();
kids_
.
clear
();
}
}
bool
Scope
::
HasKid
(
const
Scope
*
scope
)
const
{
bool
Scope
::
HasKid
(
const
Scope
*
scope
)
const
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
auto
it
=
std
::
find
(
this
->
kids_
.
begin
(),
this
->
kids_
.
end
(),
scope
);
auto
it
=
std
::
find
(
this
->
kids_
.
begin
(),
this
->
kids_
.
end
(),
scope
);
return
it
!=
this
->
kids_
.
end
();
return
it
!=
this
->
kids_
.
end
();
}
}
std
::
vector
<
std
::
string
>
Scope
::
LocalVarNames
()
const
{
std
::
vector
<
std
::
string
>
Scope
::
LocalVarNames
()
const
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
std
::
vector
<
std
::
string
>
known_vars
;
std
::
vector
<
std
::
string
>
known_vars
;
known_vars
.
reserve
(
this
->
vars_
.
size
());
known_vars
.
reserve
(
this
->
vars_
.
size
());
for
(
auto
&
p
:
vars_
)
{
for
(
auto
&
p
:
vars_
)
{
...
@@ -101,7 +106,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
...
@@ -101,7 +106,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
}
}
void
Scope
::
DeleteScope
(
Scope
*
scope
)
const
{
void
Scope
::
DeleteScope
(
Scope
*
scope
)
const
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
auto
it
=
std
::
find
(
this
->
kids_
.
begin
(),
this
->
kids_
.
end
(),
scope
);
auto
it
=
std
::
find
(
this
->
kids_
.
begin
(),
this
->
kids_
.
end
(),
scope
);
PADDLE_ENFORCE
(
it
!=
this
->
kids_
.
end
(),
"Cannot find %p as kid scope"
,
scope
);
PADDLE_ENFORCE
(
it
!=
this
->
kids_
.
end
(),
"Cannot find %p as kid scope"
,
scope
);
this
->
kids_
.
erase
(
it
);
this
->
kids_
.
erase
(
it
);
...
@@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const {
...
@@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const {
}
}
void
Scope
::
EraseVars
(
const
std
::
vector
<
std
::
string
>&
var_names
)
{
void
Scope
::
EraseVars
(
const
std
::
vector
<
std
::
string
>&
var_names
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
std
::
set
<
std
::
string
>
var_set
(
var_names
.
begin
(),
var_names
.
end
());
std
::
set
<
std
::
string
>
var_set
(
var_names
.
begin
(),
var_names
.
end
());
for
(
auto
it
=
vars_
.
begin
();
it
!=
vars_
.
end
();)
{
for
(
auto
it
=
vars_
.
begin
();
it
!=
vars_
.
end
();)
{
if
(
var_set
.
find
(
it
->
first
)
!=
var_set
.
end
())
{
if
(
var_set
.
find
(
it
->
first
)
!=
var_set
.
end
())
{
...
@@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
...
@@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void
Scope
::
Rename
(
const
std
::
string
&
origin_name
,
void
Scope
::
Rename
(
const
std
::
string
&
origin_name
,
const
std
::
string
&
new_name
)
const
{
const
std
::
string
&
new_name
)
const
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
RenameInternal
(
origin_name
,
new_name
);
RenameInternal
(
origin_name
,
new_name
);
}
}
std
::
string
Scope
::
Rename
(
const
std
::
string
&
origin_name
)
const
{
std
::
string
Scope
::
Rename
(
const
std
::
string
&
origin_name
)
const
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
auto
new_name
=
string
::
Sprintf
(
"%p.%d"
,
this
,
vars_
.
size
());
auto
new_name
=
string
::
Sprintf
(
"%p.%d"
,
this
,
vars_
.
size
());
RenameInternal
(
origin_name
,
new_name
);
RenameInternal
(
origin_name
,
new_name
);
return
new_name
;
return
new_name
;
...
...
paddle/fluid/framework/scope.h
浏览文件 @
cc7f5514
...
@@ -63,6 +63,11 @@ class Scope {
...
@@ -63,6 +63,11 @@ class Scope {
/// Caller doesn't own the returned Variable.
/// Caller doesn't own the returned Variable.
Variable
*
FindVar
(
const
std
::
string
&
name
)
const
;
Variable
*
FindVar
(
const
std
::
string
&
name
)
const
;
/// Find a variable in the current scope.
/// Return nullptr if cannot find.
/// Caller doesn't own the returned Variable.
Variable
*
FindLocalVar
(
const
std
::
string
&
name
)
const
;
const
Scope
*
parent
()
const
{
return
parent_
;
}
const
Scope
*
parent
()
const
{
return
parent_
;
}
/// Find the scope or an ancestor scope that contains the given variable.
/// Find the scope or an ancestor scope that contains the given variable.
...
...
paddle/fluid/framework/tensor_util.cc
浏览文件 @
cc7f5514
...
@@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
...
@@ -36,6 +36,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto
size
=
src
.
numel
()
*
SizeOfType
(
src
.
type
());
auto
size
=
src
.
numel
()
*
SizeOfType
(
src
.
type
());
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_cpu_place
(
dst_place
))
{
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_cpu_place
(
dst_place
))
{
if
(
src_ptr
==
dst_ptr
)
{
VLOG
(
3
)
<<
"Skip copy the same data async from "
<<
src_place
<<
" to "
<<
dst_place
;
return
;
}
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
),
src_ptr
,
size
);
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
),
src_ptr
,
size
);
}
}
...
@@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
...
@@ -71,6 +76,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto
stream
=
auto
stream
=
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
();
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
();
if
(
platform
::
is_same_place
(
src_place
,
dst_place
))
{
if
(
platform
::
is_same_place
(
src_place
,
dst_place
))
{
if
(
src_ptr
==
dst_ptr
)
{
VLOG
(
3
)
<<
"Skip copy the same data async from "
<<
src_place
<<
" to "
<<
dst_place
;
return
;
}
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
stream
);
stream
);
}
else
{
}
else
{
...
@@ -114,6 +124,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
...
@@ -114,6 +124,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
auto
dst_ptr
=
dst
->
mutable_data
(
dst_place
,
src
.
type
());
auto
dst_ptr
=
dst
->
mutable_data
(
dst_place
,
src
.
type
());
auto
size
=
src
.
numel
()
*
SizeOfType
(
src
.
type
());
auto
size
=
src
.
numel
()
*
SizeOfType
(
src
.
type
());
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_cpu_place
(
dst_place
))
{
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_cpu_place
(
dst_place
))
{
if
(
src_ptr
==
dst_ptr
)
{
VLOG
(
3
)
<<
"Skip copy the same data from "
<<
src_place
<<
" to "
<<
dst_place
;
return
;
}
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
),
src_ptr
,
size
);
boost
::
get
<
platform
::
CPUPlace
>
(
src_place
),
src_ptr
,
size
);
}
}
...
@@ -130,6 +145,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
...
@@ -130,6 +145,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_cpu_place
,
src_ptr
,
size
,
nullptr
);
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_cpu_place
,
src_ptr
,
size
,
nullptr
);
}
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
}
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
platform
::
is_gpu_place
(
dst_place
))
{
if
(
src_ptr
==
dst_ptr
&&
platform
::
is_same_place
(
src_place
,
dst_place
))
{
VLOG
(
3
)
<<
"Skip copy the same data from "
<<
src_place
<<
" to "
<<
dst_place
;
return
;
}
auto
src_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
src_place
);
auto
src_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
src_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
nullptr
);
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
nullptr
);
...
...
paddle/fluid/framework/tensor_util_test.cc
浏览文件 @
cc7f5514
...
@@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) {
...
@@ -41,6 +41,11 @@ TEST(TensorCopy, Tensor) {
EXPECT_EQ
(
src_ptr
[
i
],
dst_ptr
[
i
]);
EXPECT_EQ
(
src_ptr
[
i
],
dst_ptr
[
i
]);
}
}
TensorCopy
(
dst_tensor
,
*
cpu_place
,
&
dst_tensor
);
for
(
size_t
i
=
0
;
i
<
9
;
++
i
)
{
EXPECT_EQ
(
src_ptr
[
i
],
dst_ptr
[
i
]);
}
EXPECT_TRUE
(
dst_tensor
.
layout
()
==
src_tensor
.
layout
());
EXPECT_TRUE
(
dst_tensor
.
layout
()
==
src_tensor
.
layout
());
Tensor
slice_tensor
=
src_tensor
.
Slice
(
1
,
2
);
Tensor
slice_tensor
=
src_tensor
.
Slice
(
1
,
2
);
...
@@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) {
...
@@ -82,6 +87,15 @@ TEST(TensorCopy, Tensor) {
EXPECT_EQ
(
src_ptr
[
i
],
dst_ptr
[
i
]);
EXPECT_EQ
(
src_ptr
[
i
],
dst_ptr
[
i
]);
}
}
// Copy the same tensor
TensorCopy
(
gpu_tensor
,
*
gpu_place
,
gpu_ctx
,
&
gpu_tensor
);
gpu_ctx
.
Wait
();
const
int
*
dst_ptr_tmp
=
dst_tensor
.
data
<
int
>
();
EXPECT_NE
(
src_ptr
,
dst_ptr_tmp
);
for
(
size_t
i
=
0
;
i
<
9
;
++
i
)
{
EXPECT_EQ
(
src_ptr
[
i
],
dst_ptr_tmp
[
i
]);
}
Tensor
slice_tensor
=
src_tensor
.
Slice
(
1
,
2
);
Tensor
slice_tensor
=
src_tensor
.
Slice
(
1
,
2
);
// CPU Slice Tensor to GPU Tensor
// CPU Slice Tensor to GPU Tensor
...
...
paddle/fluid/framework/var_desc.h
浏览文件 @
cc7f5514
...
@@ -59,6 +59,7 @@ class VarDesc {
...
@@ -59,6 +59,7 @@ class VarDesc {
public:
public:
explicit
VarDesc
(
const
std
::
string
&
name
)
{
explicit
VarDesc
(
const
std
::
string
&
name
)
{
desc_
.
set_name
(
name
);
desc_
.
set_name
(
name
);
// TODO(paddle-dev): Why default to lodtensor.
desc_
.
mutable_type
()
->
set_type
(
proto
::
VarType
::
LOD_TENSOR
);
desc_
.
mutable_type
()
->
set_type
(
proto
::
VarType
::
LOD_TENSOR
);
}
}
...
...
paddle/fluid/framework/variable.h
浏览文件 @
cc7f5514
...
@@ -38,8 +38,12 @@ class Variable {
...
@@ -38,8 +38,12 @@ class Variable {
template
<
typename
T
>
template
<
typename
T
>
T
*
GetMutable
()
{
T
*
GetMutable
()
{
if
(
!
IsType
<
T
>
()
)
{
if
(
!
holder_
)
{
holder_
.
reset
(
new
PlaceholderImpl
<
T
>
(
new
T
()));
holder_
.
reset
(
new
PlaceholderImpl
<
T
>
(
new
T
()));
}
else
{
PADDLE_ENFORCE
(
IsType
<
T
>
(),
"Variable must be type %s, the holding type is %s"
,
typeid
(
T
).
name
(),
holder_
->
Type
().
name
());
}
}
return
static_cast
<
T
*>
(
holder_
->
Ptr
());
return
static_cast
<
T
*>
(
holder_
->
Ptr
());
}
}
...
...
paddle/fluid/framework/variable_test.cc
浏览文件 @
cc7f5514
...
@@ -33,9 +33,10 @@ TEST(Variable, GetMutable) {
...
@@ -33,9 +33,10 @@ TEST(Variable, GetMutable) {
const
Tensor
&
tt
=
v
->
Get
<
Tensor
>
();
const
Tensor
&
tt
=
v
->
Get
<
Tensor
>
();
EXPECT_EQ
(
1234
,
tt
.
content_
);
EXPECT_EQ
(
1234
,
tt
.
content_
);
std
::
string
*
s
=
v
->
GetMutable
<
std
::
string
>
();
try
{
*
s
=
"hello"
;
v
->
GetMutable
<
std
::
string
>
();
}
catch
(
std
::
exception
&
e
)
{
const
std
::
string
&
ss
=
v
->
Get
<
std
::
string
>
();
return
;
EXPECT_EQ
(
"hello"
,
ss
);
}
EXPECT_TRUE
(
false
);
}
}
paddle/fluid/inference/analysis/analyzer.cc
浏览文件 @
cc7f5514
...
@@ -70,7 +70,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
...
@@ -70,7 +70,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
auto
trt_teller
=
[
&
](
const
Node
*
node
)
{
auto
trt_teller
=
[
&
](
const
Node
*
node
)
{
std
::
unordered_set
<
std
::
string
>
teller_set
(
std
::
unordered_set
<
std
::
string
>
teller_set
(
{
"mul"
,
"conv2d"
,
"pool2d"
,
"relu"
,
"softmax"
,
"sigmoid"
,
{
"mul"
,
"conv2d"
,
"pool2d"
,
"relu"
,
"softmax"
,
"sigmoid"
,
"depthwise_conv2d"
,
"batch_norm"
,
"concat"
,
"tanh"
,
"depthwise_conv2d"
,
"batch_norm"
,
"concat"
,
"tanh"
,
"pad"
,
"elementwise_add"
,
"dropout"
});
"elementwise_add"
,
"dropout"
});
if
(
!
node
->
IsFunction
())
return
false
;
if
(
!
node
->
IsFunction
())
return
false
;
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
cc7f5514
...
@@ -25,9 +25,11 @@
...
@@ -25,9 +25,11 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool
(
profile
);
DECLARE_bool
(
profile
);
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
paddle
{
...
@@ -47,6 +49,9 @@ bool AnalysisPredictor::Init(
...
@@ -47,6 +49,9 @@ bool AnalysisPredictor::Init(
}
}
#endif
#endif
// no matter with or without MKLDNN
paddle
::
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
if
(
config_
.
use_gpu
)
{
if
(
config_
.
use_gpu
)
{
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device
);
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device
);
LOG
(
WARNING
)
<<
"ir optimize only supports CPU currently, enable_ir_optim "
LOG
(
WARNING
)
<<
"ir optimize only supports CPU currently, enable_ir_optim "
...
@@ -335,6 +340,19 @@ bool AnalysisPredictor::LoadProgramDesc() {
...
@@ -335,6 +340,19 @@ bool AnalysisPredictor::LoadProgramDesc() {
}
}
return
true
;
return
true
;
}
}
AnalysisPredictor
::~
AnalysisPredictor
()
{
#if !defined(_WIN32)
if
(
FLAGS_profile
)
{
platform
::
DisableProfiler
(
platform
::
EventSortingKey
::
kTotal
,
"./profile.log"
);
}
#endif
if
(
sub_scope_
)
{
scope_
->
DeleteScope
(
sub_scope_
);
}
}
std
::
unique_ptr
<
PaddlePredictor
>
AnalysisPredictor
::
Clone
()
{
std
::
unique_ptr
<
PaddlePredictor
>
AnalysisPredictor
::
Clone
()
{
auto
*
x
=
new
AnalysisPredictor
(
config_
);
auto
*
x
=
new
AnalysisPredictor
(
config_
);
x
->
Init
(
scope_
,
inference_program_
);
x
->
Init
(
scope_
,
inference_program_
);
...
...
paddle/fluid/inference/api/analysis_predictor.h
浏览文件 @
cc7f5514
...
@@ -72,6 +72,7 @@ class AnalysisPredictor : public PaddlePredictor {
...
@@ -72,6 +72,7 @@ class AnalysisPredictor : public PaddlePredictor {
template
<
typename
T
>
template
<
typename
T
>
void
GetFetchOne
(
const
framework
::
LoDTensor
&
fetchs
,
void
GetFetchOne
(
const
framework
::
LoDTensor
&
fetchs
,
PaddleTensor
*
output_data
);
PaddleTensor
*
output_data
);
~
AnalysisPredictor
();
private:
private:
contrib
::
AnalysisConfig
config_
;
contrib
::
AnalysisConfig
config_
;
...
...
paddle/fluid/inference/api/api_impl.cc
浏览文件 @
cc7f5514
...
@@ -23,9 +23,11 @@ limitations under the License. */
...
@@ -23,9 +23,11 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool
(
profile
,
false
,
"Turn on profiler for fluid"
);
DEFINE_bool
(
profile
,
false
,
"Turn on profiler for fluid"
);
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
paddle
{
namespace
{
namespace
{
...
@@ -72,6 +74,9 @@ bool NativePaddlePredictor::Init(
...
@@ -72,6 +74,9 @@ bool NativePaddlePredictor::Init(
}
}
#endif
#endif
// no matter with or without MKLDNN
paddle
::
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
if
(
config_
.
use_gpu
)
{
if
(
config_
.
use_gpu
)
{
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device
);
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device
);
}
else
{
}
else
{
...
...
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
浏览文件 @
cc7f5514
...
@@ -185,3 +185,4 @@ USE_TRT_CONVERTER(softmax);
...
@@ -185,3 +185,4 @@ USE_TRT_CONVERTER(softmax);
USE_TRT_CONVERTER
(
batch_norm
);
USE_TRT_CONVERTER
(
batch_norm
);
USE_TRT_CONVERTER
(
concat
);
USE_TRT_CONVERTER
(
concat
);
USE_TRT_CONVERTER
(
dropout
);
USE_TRT_CONVERTER
(
dropout
);
USE_TRT_CONVERTER
(
pad
);
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
浏览文件 @
cc7f5514
# Add TRT tests
# Add TRT tests
nv_library
(
tensorrt_converter
nv_library
(
tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
pad_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry
)
DEPS tensorrt_engine operator scope framework_proto op_registry
)
nv_test
(
test_op_converter SRCS test_op_converter.cc DEPS
nv_test
(
test_op_converter SRCS test_op_converter.cc DEPS
...
@@ -26,6 +26,8 @@ nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
...
@@ -26,6 +26,8 @@ nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine batch_norm_op SERIAL
)
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine batch_norm_op SERIAL
)
nv_test
(
test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
nv_test
(
test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine concat_op SERIAL
)
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine concat_op SERIAL
)
nv_test
(
test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
nv_test
(
test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine dropout_op SERIAL
)
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine dropout_op SERIAL
)
nv_test
(
test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine pad_op SERIAL
)
paddle/fluid/inference/tensorrt/convert/pad_op.cc
0 → 100644
浏览文件 @
cc7f5514
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
/*
* PadOp.
*/
class
PadOpConverter
:
public
OpConverter
{
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
VLOG
(
4
)
<<
"convert a fluid transpose op to tensorrt tranpose layer"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
// Declare inputs
auto
*
input
=
engine_
->
GetITensor
(
op_desc
.
Input
(
"X"
)[
0
]);
const
std
::
vector
<
int
>
paddings
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"paddings"
));
const
float
pad_value
=
boost
::
get
<
float
>
(
op_desc
.
GetAttr
(
"pad_value"
));
nvinfer1
::
Dims
input_shape
=
input
->
getDimensions
();
int
nbDims
=
input_shape
.
nbDims
;
int
pad_size
=
static_cast
<
int
>
(
paddings
.
size
());
PADDLE_ENFORCE_GE
(
nbDims
,
2
);
PADDLE_ENFORCE_EQ
((
nbDims
+
1
)
*
2
,
pad_size
);
PADDLE_ENFORCE
(
pad_value
==
0.0
,
"The pad layer of TRT only support zero."
);
nvinfer1
::
DimsHW
pre_pad
(
paddings
[
pad_size
-
4
],
paddings
[
pad_size
-
2
]);
nvinfer1
::
DimsHW
post_pad
(
paddings
[
pad_size
-
3
],
paddings
[
pad_size
-
1
]);
auto
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Padding
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
input
),
pre_pad
,
post_pad
);
PADDLE_ENFORCE
(
layer
!=
nullptr
);
auto
output_name
=
op_desc
.
Output
(
"Out"
)[
0
];
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
layer
->
setName
((
"scale (Output: "
+
output_name
+
")"
).
c_str
());
layer
->
getOutput
(
0
)
->
setName
(
output_name
.
c_str
());
if
(
test_mode
)
{
// the test framework can not determine which is the
// output, so place the declaration inside.
engine_
->
DeclareOutput
(
output_name
);
}
}
};
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
REGISTER_TRT_OP_CONVERTER
(
pad
,
PadOpConverter
);
paddle/fluid/inference/tensorrt/convert/test_pad_op.cc
0 → 100644
浏览文件 @
cc7f5514
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
TEST
(
PadConverter
,
main
)
{
framework
::
Scope
scope
;
std
::
unordered_set
<
std
::
string
>
parameters
;
TRTConvertValidation
validator
(
10
,
parameters
,
scope
,
1000
);
validator
.
DeclInputVar
(
"pad-X"
,
nvinfer1
::
Dims3
(
3
,
2
,
2
));
validator
.
DeclOutputVar
(
"pad-Out"
,
nvinfer1
::
Dims3
(
3
,
3
,
5
));
// Prepare Op description
framework
::
OpDesc
desc
;
desc
.
SetType
(
"pad"
);
desc
.
SetInput
(
"X"
,
{
"pad-X"
});
desc
.
SetOutput
(
"Out"
,
{
"pad-Out"
});
std
::
vector
<
int
>
paddings
=
{
0
,
0
,
0
,
0
,
0
,
1
,
1
,
2
};
float
pad_value
=
0.0
;
desc
.
SetAttr
(
"paddings"
,
paddings
);
desc
.
SetAttr
(
"pad_value"
,
pad_value
);
LOG
(
INFO
)
<<
"set OP"
;
validator
.
SetOp
(
*
desc
.
Proto
());
LOG
(
INFO
)
<<
"execute"
;
validator
.
Execute
(
2
);
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
USE_OP
(
pad
);
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
cc7f5514
...
@@ -300,7 +300,7 @@ op_library(flatten_op DEPS reshape_op)
...
@@ -300,7 +300,7 @@ op_library(flatten_op DEPS reshape_op)
op_library
(
sequence_pad_op DEPS sequence_padding
)
op_library
(
sequence_pad_op DEPS sequence_padding
)
op_library
(
unstack_op DEPS stack_op
)
op_library
(
unstack_op DEPS stack_op
)
op_library
(
fake_quantize_op DEPS memory
)
op_library
(
fake_quantize_op DEPS memory
)
op_library
(
fusion_lstm_op DEPS
cpu_lstm_compute
)
op_library
(
fusion_lstm_op DEPS
jit_kernel
)
if
(
WITH_GPU
)
if
(
WITH_GPU
)
op_library
(
conv_op DEPS vol2col depthwise_conv im2col
)
op_library
(
conv_op DEPS vol2col depthwise_conv im2col
)
op_library
(
layer_norm_op DEPS cub
)
op_library
(
layer_norm_op DEPS cub
)
...
...
paddle/fluid/operators/adadelta_op.cc
浏览文件 @
cc7f5514
...
@@ -18,6 +18,7 @@ namespace paddle {
...
@@ -18,6 +18,7 @@ namespace paddle {
namespace
operators
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
Tensor
=
framework
::
Tensor
;
class
AdadeltaOp
:
public
framework
::
OperatorWithKernel
{
class
AdadeltaOp
:
public
framework
::
OperatorWithKernel
{
public:
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
...
@@ -31,6 +32,16 @@ class AdadeltaOp : public framework::OperatorWithKernel {
...
@@ -31,6 +32,16 @@ class AdadeltaOp : public framework::OperatorWithKernel {
"Input(AvgSquaredGrad) of AdadeltaOp should not be null."
);
"Input(AvgSquaredGrad) of AdadeltaOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"AvgSquaredUpdate"
),
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"AvgSquaredUpdate"
),
"Input(AvgSquaredUpdate) of AdadeltaOp should not be null."
);
"Input(AvgSquaredUpdate) of AdadeltaOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Param"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Param"
).
front
(),
ctx
->
GetInputsVarType
(
"Param"
).
front
());
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Grad"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Grad"
).
front
(),
ctx
->
GetInputsVarType
(
"Grad"
).
front
());
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
"Output(ParamOut) of AdadeltaOp should not be null."
);
"Output(ParamOut) of AdadeltaOp should not be null."
);
...
@@ -56,6 +67,7 @@ class AdadeltaOp : public framework::OperatorWithKernel {
...
@@ -56,6 +67,7 @@ class AdadeltaOp : public framework::OperatorWithKernel {
ctx
->
SetOutputDim
(
"AvgSquaredGradOut"
,
param_dim
);
ctx
->
SetOutputDim
(
"AvgSquaredGradOut"
,
param_dim
);
ctx
->
SetOutputDim
(
"AvgSquaredUpdateOut"
,
param_dim
);
ctx
->
SetOutputDim
(
"AvgSquaredUpdateOut"
,
param_dim
);
}
}
framework
::
OpKernelType
GetExpectedKernelType
(
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
input_data_type
=
auto
input_data_type
=
...
...
paddle/fluid/operators/adadelta_op.h
浏览文件 @
cc7f5514
...
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
...
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
class
AdadeltaOpKernel
:
public
framework
::
OpKernel
<
T
>
{
class
AdadeltaOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
const
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
PADDLE_ENFORCE
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Grad"
).
front
(),
grad_var
->
Type
().
name
());
auto
param_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
param_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
avg_squared_grad_out_tensor
=
auto
avg_squared_grad_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"AvgSquaredGradOut"
);
ctx
.
Output
<
framework
::
Tensor
>
(
"AvgSquaredGradOut"
);
...
...
paddle/fluid/operators/adagrad_op.h
浏览文件 @
cc7f5514
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#pragma once
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
...
@@ -21,25 +22,31 @@ namespace operators {
...
@@ -21,25 +22,31 @@ namespace operators {
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
struct
SparseAdagradFunctor
{
struct
SparseAdagradFunctor
{
void
operator
()(
const
DeviceContext
&
context
,
void
operator
()(
const
DeviceContext
&
context
,
const
framework
::
SelectedRows
&
grad
,
const
framework
::
SelectedRows
&
grad
,
const
framework
::
Tensor
&
learning_rate
,
T
epsilon
,
const
framework
::
Tensor
&
learning_rate
,
T
epsilon
,
framework
::
Tensor
*
moment
,
framework
::
Tensor
*
param
);
framework
::
Tensor
*
moment
,
framework
::
Tensor
*
param
);
};
};
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
AdagradOpKernel
:
public
framework
::
OpKernel
<
T
>
{
class
AdagradOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
param_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
auto
*
moment_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"MomentOut"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
auto
*
param_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
*
moment_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"MomentOut"
);
param_out_tensor
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
param_out_tensor
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
moment_out_tensor
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
moment_out_tensor
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
epsilon
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"epsilon"
));
T
epsilon
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"epsilon"
));
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
if
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
())
{
if
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
())
{
auto
param
=
framework
::
EigenVector
<
T
>::
Flatten
(
auto
param
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
));
*
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
));
...
@@ -47,16 +54,16 @@ class AdagradOpKernel : public framework::OpKernel<T> {
...
@@ -47,16 +54,16 @@ class AdagradOpKernel : public framework::OpKernel<T> {
*
ctx
.
Input
<
framework
::
Tensor
>
(
"Grad"
));
*
ctx
.
Input
<
framework
::
Tensor
>
(
"Grad"
));
auto
moment
=
framework
::
EigenVector
<
T
>::
Flatten
(
auto
moment
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
ctx
.
Input
<
framework
::
Tensor
>
(
"Moment"
));
*
ctx
.
Input
<
framework
::
Tensor
>
(
"Moment"
));
auto
*
learning_rate
=
ctx
.
Input
<
framework
::
Tensor
>
(
"LearningRate"
);
auto
*
learning_rate
=
ctx
.
Input
<
framework
::
Tensor
>
(
"LearningRate"
);
auto
param_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
param_out_tensor
);
auto
param_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
param_out_tensor
);
auto
moment_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
moment_out_tensor
);
auto
moment_out
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
moment_out_tensor
);
auto
*
place
=
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
auto
*
place
=
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
moment_out
.
device
(
*
place
)
=
moment
+
grad
*
grad
;
moment_out
.
device
(
*
place
)
=
moment
+
grad
*
grad
;
Eigen
::
DSizes
<
int
,
1
>
m_dsize
(
moment_out_tensor
->
numel
());
Eigen
::
DSizes
<
int
,
1
>
m_dsize
(
moment_out_tensor
->
numel
());
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()))
{
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()))
{
auto
*
lr
=
learning_rate
->
data
<
T
>
();
auto
*
lr
=
learning_rate
->
data
<
T
>
();
param_out
.
device
(
*
place
)
=
param_out
.
device
(
*
place
)
=
param
-
lr
[
0
]
*
grad
/
(
moment_out
.
sqrt
()
+
epsilon
);
param
-
lr
[
0
]
*
grad
/
(
moment_out
.
sqrt
()
+
epsilon
);
}
else
{
}
else
{
...
@@ -66,10 +73,10 @@ class AdagradOpKernel : public framework::OpKernel<T> {
...
@@ -66,10 +73,10 @@ class AdagradOpKernel : public framework::OpKernel<T> {
lr
.
broadcast
(
m_dsize
)
*
grad
/
(
moment_out
.
sqrt
()
+
epsilon
);
lr
.
broadcast
(
m_dsize
)
*
grad
/
(
moment_out
.
sqrt
()
+
epsilon
);
}
}
}
else
if
(
grad_var
->
IsType
<
framework
::
SelectedRows
>
())
{
}
else
if
(
grad_var
->
IsType
<
framework
::
SelectedRows
>
())
{
auto
*
param_tensor
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
auto
*
param_tensor
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
PADDLE_ENFORCE_EQ
(
param_tensor
,
param_out_tensor
);
PADDLE_ENFORCE_EQ
(
param_tensor
,
param_out_tensor
);
auto
*
moment_tensor
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Moment"
);
auto
*
moment_tensor
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Moment"
);
PADDLE_ENFORCE_EQ
(
moment_tensor
,
moment_out_tensor
);
PADDLE_ENFORCE_EQ
(
moment_tensor
,
moment_out_tensor
);
SparseAdagradFunctor
<
DeviceContext
,
T
>
functor
;
SparseAdagradFunctor
<
DeviceContext
,
T
>
functor
;
...
...
paddle/fluid/operators/adam_op.h
浏览文件 @
cc7f5514
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/platform/for_range.h"
...
@@ -199,23 +200,9 @@ struct SparseAdamFunctor {
...
@@ -199,23 +200,9 @@ struct SparseAdamFunctor {
row_numel_
(
row_numel
),
row_numel_
(
row_numel
),
row_count_
(
row_count
)
{}
row_count_
(
row_count
)
{}
inline
HOSTDEVICE
int64_t
BinarySearchInRows
(
int64_t
row
)
const
{
int64_t
beg
=
0
,
end
=
row_count_
-
1
;
while
(
beg
<=
end
)
{
auto
mid
=
((
beg
+
end
)
>>
1
);
if
(
rows_
[
mid
]
==
row
)
return
mid
;
else
if
(
rows_
[
mid
]
<
row
)
beg
=
mid
+
1
;
else
end
=
mid
-
1
;
}
return
-
1
;
}
inline
HOSTDEVICE
void
operator
()(
size_t
i
)
const
{
inline
HOSTDEVICE
void
operator
()(
size_t
i
)
const
{
int64_t
row
=
i
/
row_numel_
;
auto
row_idx
=
auto
row_idx
=
BinarySearchInRows
(
row
);
math
::
BinarySearch
<
int64_t
>
(
rows_
,
row_count_
,
i
/
row_numel_
);
T
g
=
row_idx
>=
0
?
grad_
[
row_idx
*
row_numel_
+
i
%
row_numel_
]
:
0
;
T
g
=
row_idx
>=
0
?
grad_
[
row_idx
*
row_numel_
+
i
%
row_numel_
]
:
0
;
// The following code is the same as dense
// The following code is the same as dense
...
@@ -244,6 +231,12 @@ template <typename DeviceContext, typename T>
...
@@ -244,6 +231,12 @@ template <typename DeviceContext, typename T>
class
AdamOpKernel
:
public
framework
::
OpKernel
<
T
>
{
class
AdamOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
using
paddle
::
framework
::
LoDTensor
;
using
paddle
::
framework
::
LoDTensor
;
using
paddle
::
operators
::
detail
::
Ref
;
using
paddle
::
operators
::
detail
::
Ref
;
...
...
paddle/fluid/operators/adamax_op.cc
浏览文件 @
cc7f5514
...
@@ -35,6 +35,16 @@ class AdamaxOp : public framework::OperatorWithKernel {
...
@@ -35,6 +35,16 @@ class AdamaxOp : public framework::OperatorWithKernel {
"Input(LearningRate) of AdamaxOp should not be null."
);
"Input(LearningRate) of AdamaxOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Beta1Pow"
),
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Beta1Pow"
),
"Input(Beta1Pow) of AdamaxOp should not be null."
);
"Input(Beta1Pow) of AdamaxOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Param"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Param"
).
front
(),
ctx
->
GetInputsVarType
(
"Param"
).
front
());
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Grad"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Grad"
).
front
(),
ctx
->
GetInputsVarType
(
"Grad"
).
front
());
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
"Output(ParamOut) of AdamaxOp should not be null."
);
"Output(ParamOut) of AdamaxOp should not be null."
);
...
...
paddle/fluid/operators/adamax_op.h
浏览文件 @
cc7f5514
...
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
...
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
class
AdamaxOpKernel
:
public
framework
::
OpKernel
<
T
>
{
class
AdamaxOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
const
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
PADDLE_ENFORCE
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Grad"
).
front
(),
grad_var
->
Type
().
name
());
auto
param_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
param_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
moment_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"MomentOut"
);
auto
moment_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"MomentOut"
);
auto
inf_norm_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"InfNormOut"
);
auto
inf_norm_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"InfNormOut"
);
...
...
paddle/fluid/operators/decayed_adagrad_op.cc
浏览文件 @
cc7f5514
...
@@ -32,6 +32,16 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
...
@@ -32,6 +32,16 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE
(
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"LearningRate"
),
ctx
->
HasInput
(
"LearningRate"
),
"Input(LearningRate) of DecayedAdagradOp should not be null."
);
"Input(LearningRate) of DecayedAdagradOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Param"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Param"
).
front
(),
ctx
->
GetInputsVarType
(
"Param"
).
front
());
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Grad"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Grad"
).
front
(),
ctx
->
GetInputsVarType
(
"Grad"
).
front
());
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
"Output(ParamOut) of DecayedAdagradOp should not be null."
);
"Output(ParamOut) of DecayedAdagradOp should not be null."
);
...
...
paddle/fluid/operators/decayed_adagrad_op.h
浏览文件 @
cc7f5514
...
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
...
@@ -23,6 +23,17 @@ template <typename DeviceContext, typename T>
class
DecayedAdagradOpKernel
:
public
framework
::
OpKernel
<
T
>
{
class
DecayedAdagradOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
const
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
PADDLE_ENFORCE
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Grad"
).
front
(),
grad_var
->
Type
().
name
());
auto
param_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
param_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
moment_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"MomentOut"
);
auto
moment_out_tensor
=
ctx
.
Output
<
framework
::
Tensor
>
(
"MomentOut"
);
...
...
paddle/fluid/operators/fill_constant_op.cc
浏览文件 @
cc7f5514
...
@@ -70,6 +70,12 @@ class FillConstantOp : public framework::OperatorBase {
...
@@ -70,6 +70,12 @@ class FillConstantOp : public framework::OperatorBase {
}
}
};
};
class
FillConstantOpVarTypeInference
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{}
};
class
FillConstantOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
class
FillConstantOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
public:
void
Make
()
override
{
void
Make
()
override
{
...
@@ -102,4 +108,5 @@ Fill up a variable with specified constant value.
...
@@ -102,4 +108,5 @@ Fill up a variable with specified constant value.
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
fill_constant
,
ops
::
FillConstantOp
,
REGISTER_OPERATOR
(
fill_constant
,
ops
::
FillConstantOp
,
ops
::
FillConstantInferShape
,
ops
::
FillConstantOpMaker
,
ops
::
FillConstantInferShape
,
ops
::
FillConstantOpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
);
paddle
::
framework
::
EmptyGradOpMaker
,
ops
::
FillConstantOpVarTypeInference
);
paddle/fluid/operators/ftrl_op.cc
浏览文件 @
cc7f5514
...
@@ -34,6 +34,16 @@ class FTRLOp : public framework::OperatorWithKernel {
...
@@ -34,6 +34,16 @@ class FTRLOp : public framework::OperatorWithKernel {
"Input(Grad) of FTRL should not be null."
);
"Input(Grad) of FTRL should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"LearningRate"
),
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"LearningRate"
),
"Input(LearningRate) of FTRL should not be null."
);
"Input(LearningRate) of FTRL should not be null."
);
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Param"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Param"
).
front
(),
ctx
->
GetInputsVarType
(
"Param"
).
front
());
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Grad"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Grad"
).
front
(),
ctx
->
GetInputsVarType
(
"Grad"
).
front
());
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
"Output(ParamOut) of FTRL should not be null."
);
"Output(ParamOut) of FTRL should not be null."
);
...
...
paddle/fluid/operators/ftrl_op.h
浏览文件 @
cc7f5514
...
@@ -28,6 +28,17 @@ template <typename DeviceContext, typename T>
...
@@ -28,6 +28,17 @@ template <typename DeviceContext, typename T>
class
FTRLOpKernel
:
public
framework
::
OpKernel
<
T
>
{
class
FTRLOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
const
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
PADDLE_ENFORCE
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Grad"
).
front
(),
grad_var
->
Type
().
name
());
auto
*
param_out
=
ctx
.
Output
<
Tensor
>
(
"ParamOut"
);
auto
*
param_out
=
ctx
.
Output
<
Tensor
>
(
"ParamOut"
);
auto
*
sq_accum_out
=
ctx
.
Output
<
Tensor
>
(
"SquaredAccumOut"
);
auto
*
sq_accum_out
=
ctx
.
Output
<
Tensor
>
(
"SquaredAccumOut"
);
auto
*
lin_accum_out
=
ctx
.
Output
<
Tensor
>
(
"LinearAccumOut"
);
auto
*
lin_accum_out
=
ctx
.
Output
<
Tensor
>
(
"LinearAccumOut"
);
...
...
paddle/fluid/operators/fusion_lstm_op.cc
浏览文件 @
cc7f5514
...
@@ -15,11 +15,9 @@ limitations under the License. */
...
@@ -15,11 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/fusion_lstm_op.h"
#include "paddle/fluid/operators/fusion_lstm_op.h"
#include <string>
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -219,121 +217,55 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
...
@@ -219,121 +217,55 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
template
<
typename
T
>
template
<
typename
T
>
class
FuisonLSTMKernel
:
public
framework
::
OpKernel
<
T
>
{
class
FuisonLSTMKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
#define INIT_VEC_FUNC \
#define INIT_BASE_DEFINES \
std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; \
using DeviceContext = paddle::platform::CPUDeviceContext; \
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto* x = ctx.Input<LoDTensor>("X"); \
auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); \
auto* c0 = ctx.Input<Tensor>("C0"); \
if (platform::jit::MayIUse(platform::jit::avx)) { \
auto* wx = ctx.Input<Tensor>("WeightX"); \
math::VecActivations<T, platform::jit::avx> act_functor; \
auto* wh = ctx.Input<Tensor>("WeightH"); \
act_gate = act_functor(act_gate_str); \
auto* bias = ctx.Input<Tensor>("Bias"); \
act_cell = act_functor(act_cell_str); \
auto* xx = ctx.Output<LoDTensor>("XX"); \
act_cand = act_functor(act_cand_str); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
} else { \
auto* cell_out = ctx.Output<LoDTensor>("Cell"); \
math::VecActivations<T, platform::jit::isa_any> act_functor; \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
act_gate = act_functor(act_gate_str); \
bool use_peepholes = ctx.Attr<bool>("use_peepholes"); \
act_cell = act_functor(act_cell_str); \
auto x_dims = x->dims();
/* T x M*/
\
act_cand = act_functor(act_cand_str); \
auto wh_dims = wh->dims();
/* D x 4D*/
\
}
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
#define INIT_BASE_INPUT_OUTPUT \
const int D4 = wh_dims[1]
auto* x = ctx.Input<LoDTensor>("X"); \
auto* h0 = ctx.Input<Tensor>("H0"); \
#define INIT_OTHER_DEFINES \
auto* c0 = ctx.Input<Tensor>("C0"); \
const T* x_data = x->data<T>(); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
const T* wx_data = wx->data<T>(); \
auto* wh = ctx.Input<Tensor>("WeightH"); \
const T* wh_data = wh->data<T>(); \
auto* bias = ctx.Input<Tensor>("Bias"); \
/* diagonal weight*/
\
auto* xx = ctx.Output<LoDTensor>("XX"); \
const T* wp_data = bias->data<T>() + D4; \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
/* for peephole only*/
\
auto* cell_out = ctx.Output<LoDTensor>("Cell"); \
T* checked_cell_data = nullptr; \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
auto place = ctx.GetPlace(); \
bool use_peepholes = ctx.Attr<bool>("use_peepholes");
if (use_peepholes) { \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/
\
#define INIT_BASE_SIZES \
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
auto x_dims = x->dims();
/* T x M*/
\
checked_cell_data = checked_cell->mutable_data<T>(place); \
auto wh_dims = wh->dims();
/* D x 4D*/
\
} \
const int M = x_dims[1]; \
const auto& ker = \
const int D = wh_dims[0]; \
math::jitkernel::KernelPool::Instance() \
const int D2 = D * 2; \
.template Get<math::jitkernel::LSTMKernel<T>, const std::string&, \
const int D3 = D * 3; \
const std::string&, const std::string&>( \
const int D4 = wh_dims[1];
ctx.Attr<std::string>("gate_activation"), \
ctx.Attr<std::string>("candidate_activation"), \
#define INIT_BASE_INPUT_DATAS \
ctx.Attr<std::string>("cell_activation"), D, use_peepholes)
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
// Wh GEMM
const T* wh_data = wh->data<T>(); \
/* diagonal weight*/
\
const T* wc_data = bias->data<T>() + D4; \
/* for peephole only*/
\
T* checked_cell_data = nullptr; \
auto place = ctx.GetPlace(); \
if (use_peepholes) { \
/* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/
\
auto* checked_cell = ctx.Output<Tensor>("CheckedCell"); \
checked_cell_data = checked_cell->mutable_data<T>(place); \
}
/// Compute LSTM
#define GEMM_WH_ADDON(bs, prev, out) \
#define GEMM_WH_ADDON(bs, prev, out) \
blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
wh_data, D4, static_cast<T>(1), out, D4)
wh_data, D4, static_cast<T>(1), out, D4)
#define GET_Ct(ct_1, gates, ct) \
/* C_t = C_t-1 * fgated + cand_gated * igated*/
\
act_cand(D, gates, gates); \
blas.VMUL(D, gates, gates + D, gates + D); \
blas.VMUL(D, ct_1, gates + D2, gates + D2); \
blas.VADD(D, gates + D, gates + D2, ct)
#define GET_Ht(ct, gates, ht) \
/* H_t = act_cell(C_t) * ogated */
\
act_cell(D, ct, gates + D2); \
blas.VMUL(D, gates + D2, gates + D3, ht)
#define GET_Ct_NOH0C0(gates, ct) \
/* C_t = igated * cgated*/
\
act_gate(D, gates + D, gates + D); \
act_cand(D, gates, gates); \
blas.VMUL(D, gates, gates + D, ct)
#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \
GET_Ct_NOH0C0(gates, ct); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \
GET_Ct_NOH0C0(gates, ct); \
/* get outgated, put W_oc * C_t on igated */
\
blas.VMUL(D, wc_data + D2, ct, gates + D); \
blas.VADD(D, gates + D, gates + D3, gates + D3); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt(gates, ct_1, ct, ht) \
act_gate(D3, gates + D, gates + D); \
GET_Ct(ct_1, gates, ct); \
GET_Ht(ct, gates, ht)
#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \
/* get fgated and igated*/
\
blas.VMUL(D, wc_data, ct_1, checked_cell_data); \
blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \
blas.VADD(D2, checked_cell_data, gates + D, gates + D); \
act_gate(D2, gates + D, gates + D); \
GET_Ct(ct_1, gates, ct); \
/* get ogated*/
\
blas.VMUL(D, wc_data + D2, ct, gates + D); \
blas.VADD(D, gates + D, gates + D3, gates + D3); \
act_gate(D, gates + D3, gates + D3); \
GET_Ht(ct, gates, ht)
void
SeqCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
void
SeqCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
using
DeviceContext
=
paddle
::
platform
::
CPUDeviceContext
;
INIT_BASE_DEFINES
;
INIT_BASE_INPUT_OUTPUT
INIT_OTHER_DEFINES
;
INIT_BASE_SIZES
INIT_VEC_FUNC
INIT_BASE_INPUT_DATAS
auto
x_lod
=
x
->
lod
();
auto
x_lod
=
x
->
lod
();
const
int
total_T
=
x_dims
[
0
];
const
int
total_T
=
x_dims
[
0
];
const
int
N
=
x_lod
[
0
].
size
()
-
1
;
const
int
N
=
x_lod
[
0
].
size
()
-
1
;
...
@@ -357,89 +289,47 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -357,89 +289,47 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
gate_offset
=
-
D
;
gate_offset
=
-
D
;
}
}
#define MOVE_ONE_STEP \
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
prev_h_data = h_out_data; \
int
bid
=
is_reverse
?
N
-
1
-
i
:
i
;
prev_c_data = c_out_data; \
int
seq_len
=
x_lod
[
0
][
bid
+
1
]
-
x_lod
[
0
][
bid
];
xx_data = xx_data + xx_offset; \
const
T
*
prev_c_data
=
nullptr
;
h_out_data = h_out_data + gate_offset; \
const
T
*
prev_h_data
=
nullptr
;
c_out_data = c_out_data + gate_offset
int
tstart
=
0
;
if
(
h0_data
)
{
#define PROCESS_H0C0_DEFINES \
prev_h_data
=
h0_data
+
bid
*
D
;
int bid = is_reverse ? N - 1 - i : i; \
prev_c_data
=
c0_data
+
bid
*
D
;
int seq_len = x_lod[0][bid + 1] - x_lod[0][bid]; \
const T* prev_c_data = nullptr; \
const T* prev_h_data = nullptr; \
int tstart = 0
#define PROCESS_H0C0_PEEPHOLE \
PROCESS_H0C0_DEFINES; \
if (h0_data) { \
prev_h_data = h0_data + bid * D; \
prev_c_data = c0_data + bid * D; \
} else { \
COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \
MOVE_ONE_STEP; \
tstart = 1; \
}
#define PROCESS_H0C0 \
PROCESS_H0C0_DEFINES; \
if (h0_data) { \
prev_h_data = h0_data + bid * D; \
prev_c_data = c0_data + bid * D; \
} else { \
COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \
MOVE_ONE_STEP; \
tstart = 1; \
}
if
(
use_peepholes
)
{
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
PROCESS_H0C0_PEEPHOLE
for
(
int
step
=
tstart
;
step
<
seq_len
;
++
step
)
{
GEMM_WH_ADDON
(
1
,
prev_h_data
,
xx_data
);
COMPUTE_CtHt_PEEPHOLE
(
xx_data
,
prev_c_data
,
c_out_data
,
h_out_data
);
MOVE_ONE_STEP
;
}
}
}
else
{
// TODO(TJ): unly workaround, clean me
std
::
function
<
void
(
T
*
,
const
T
*
,
T
*
,
T
*
)
>
compute_ctht
;
if
(
platform
::
jit
::
MayIUse
(
platform
::
jit
::
avx
)
&&
act_gate_str
==
"sigmoid"
&&
act_cand_str
==
"tanh"
&&
act_cell_str
==
"tanh"
&&
D
==
8
)
{
compute_ctht
=
math
::
lstm_compute_ctht
<
T
>
;
}
else
{
}
else
{
compute_ctht
=
[
&
](
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
)
{
ker
->
ComputeC1H1
(
xx_data
,
c_out_data
,
h_out_data
,
wp_data
);
COMPUTE_CtHt
(
gates
,
ct_1
,
ct
,
ht
);
tstart
=
1
;
};
// move one step
prev_h_data
=
h_out_data
;
prev_c_data
=
c_out_data
;
xx_data
=
xx_data
+
xx_offset
;
h_out_data
=
h_out_data
+
gate_offset
;
c_out_data
=
c_out_data
+
gate_offset
;
}
}
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
for
(
int
step
=
tstart
;
step
<
seq_len
;
++
step
)
{
PROCESS_H0C0
GEMM_WH_ADDON
(
1
,
prev_h_data
,
xx_data
);
for
(
int
step
=
tstart
;
step
<
seq_len
;
++
step
)
{
ker
->
ComputeCtHt
(
xx_data
,
prev_c_data
,
c_out_data
,
h_out_data
,
wp_data
,
GEMM_WH_ADDON
(
1
,
prev_h_data
,
xx_data
);
checked_cell_data
);
compute_ctht
(
xx_data
,
prev_c_data
,
c_out_data
,
h_out_data
);
// move one step
MOVE_ONE_STEP
;
prev_h_data
=
h_out_data
;
}
prev_c_data
=
c_out_data
;
xx_data
=
xx_data
+
xx_offset
;
h_out_data
=
h_out_data
+
gate_offset
;
c_out_data
=
c_out_data
+
gate_offset
;
}
}
}
}
#undef PROCESS_H0C0_DEFINES
#undef PROCESS_H0C0_PEEPHOLE
#undef PROCESS_H0C0
#undef MOVE_ONE_STEP
}
}
void
BatchCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
void
BatchCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
using
DeviceContext
=
platform
::
CPUDeviceContext
;
INIT_BASE_DEFINES
;
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
if
(
x
->
lod
()[
0
].
size
()
==
2
)
{
if
(
x
->
lod
()[
0
].
size
()
==
2
)
{
xx
->
Resize
({
x_dims
[
0
],
D4
});
xx
->
Resize
({
x_dims
[
0
],
D4
});
SeqCompute
(
ctx
);
SeqCompute
(
ctx
);
return
;
return
;
}
}
INIT_VEC_FUNC
INIT_OTHER_DEFINES
;
INIT_BASE_INPUT_DATAS
auto
*
reordered_h0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedH0"
);
auto
*
reordered_h0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedH0"
);
auto
*
reordered_c0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedC0"
);
auto
*
reordered_c0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedC0"
);
...
@@ -487,8 +377,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -487,8 +377,8 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
prev_c_data
=
reordered_c0_data
;
prev_c_data
=
reordered_c0_data
;
size_t
sz
=
sizeof
(
T
)
*
D
;
size_t
sz
=
sizeof
(
T
)
*
D
;
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
std
::
memcpy
(
reordered_h0_data
,
h0_data
+
seq_order
[
i
]
*
D
,
sz
);
blas
.
VCOPY
(
sz
,
h0_data
+
seq_order
[
i
]
*
D
,
reordered_h0_data
);
std
::
memcpy
(
reordered_c0_data
,
c0_data
+
seq_order
[
i
]
*
D
,
sz
);
blas
.
VCOPY
(
sz
,
c0_data
+
seq_order
[
i
]
*
D
,
reordered_c0_data
);
reordered_h0_data
+=
D
;
reordered_h0_data
+=
D
;
reordered_c0_data
+=
D
;
reordered_c0_data
+=
D
;
}
}
...
@@ -498,13 +388,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -498,13 +388,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
T
*
cur_h_out_data
=
batched_h_out_data
;
T
*
cur_h_out_data
=
batched_h_out_data
;
T
*
cur_c_out_data
=
batched_c_out_data
;
T
*
cur_c_out_data
=
batched_c_out_data
;
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
GET_Ct_NOH0C0
(
cur_in_data
,
cur_c_out_data
);
ker
->
ComputeC1H1
(
cur_in_data
,
cur_c_out_data
,
cur_h_out_data
,
wp_data
);
if
(
use_peepholes
)
{
blas
.
VMUL
(
D
,
wc_data
+
D2
,
cur_c_out_data
,
cur_in_data
+
D
);
blas
.
VADD
(
D
,
cur_in_data
+
D
,
cur_in_data
+
D3
,
cur_in_data
+
D3
);
}
act_gate
(
D
,
cur_in_data
+
D3
,
cur_in_data
+
D3
);
GET_Ht
(
cur_c_out_data
,
cur_in_data
,
cur_h_out_data
);
cur_in_data
+=
D4
;
cur_in_data
+=
D4
;
cur_c_out_data
+=
D
;
cur_c_out_data
+=
D
;
cur_h_out_data
+=
D
;
cur_h_out_data
+=
D
;
...
@@ -513,71 +397,37 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -513,71 +397,37 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
prev_h_data
=
batched_h_out_data
;
prev_h_data
=
batched_h_out_data
;
prev_c_data
=
batched_c_out_data
;
prev_c_data
=
batched_c_out_data
;
}
}
// compute kernel part
const
auto
&
batch_starts
=
batched_lod
[
0
];
const
auto
&
batch_starts
=
batched_lod
[
0
];
const
int
max_seq_len
=
batch_starts
.
size
()
-
1
;
const
int
max_seq_len
=
batch_starts
.
size
()
-
1
;
const
int
offset
=
tstart
*
max_bs
*
D
;
const
int
offset
=
tstart
*
max_bs
*
D
;
batched_input_data
=
batched_input_data
+
offset
*
4
;
batched_input_data
=
batched_input_data
+
offset
*
4
;
batched_h_out_data
=
batched_h_out_data
+
offset
;
batched_h_out_data
=
batched_h_out_data
+
offset
;
batched_c_out_data
=
batched_c_out_data
+
offset
;
batched_c_out_data
=
batched_c_out_data
+
offset
;
for
(
int
step
=
tstart
;
step
<
max_seq_len
;
++
step
)
{
#define DEFINE_CUR \
const
int
cur_bs
=
batch_starts
[
step
+
1
]
-
batch_starts
[
step
];
T* cur_in_data = batched_input_data; \
GEMM_WH_ADDON
(
cur_bs
,
prev_h_data
,
batched_input_data
);
T* cur_prev_c_data = prev_c_data; \
T
*
cur_in_data
=
batched_input_data
;
T* cur_c_out_data = batched_c_out_data; \
T
*
cur_prev_c_data
=
prev_c_data
;
T* cur_h_out_data = batched_h_out_data
T
*
cur_c_out_data
=
batched_c_out_data
;
T
*
cur_h_out_data
=
batched_h_out_data
;
#define MOVE_ONE_BATCH \
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
cur_in_data += D4; \
ker
->
ComputeCtHt
(
cur_in_data
,
cur_prev_c_data
,
cur_c_out_data
,
cur_prev_c_data += D; \
cur_h_out_data
,
wp_data
,
checked_cell_data
);
cur_c_out_data += D; \
// move one batch
cur_h_out_data += D
cur_in_data
+=
D4
;
cur_prev_c_data
+=
D
;
#define MOVE_ONE_STEP \
cur_c_out_data
+=
D
;
prev_c_data = batched_c_out_data; \
cur_h_out_data
+=
D
;
prev_h_data = batched_h_out_data; \
batched_c_out_data = cur_c_out_data; \
batched_h_out_data = cur_h_out_data; \
batched_input_data = cur_in_data
if
(
use_peepholes
)
{
for
(
int
step
=
tstart
;
step
<
max_seq_len
;
++
step
)
{
const
int
cur_bs
=
batch_starts
[
step
+
1
]
-
batch_starts
[
step
];
GEMM_WH_ADDON
(
cur_bs
,
prev_h_data
,
batched_input_data
);
DEFINE_CUR
;
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
COMPUTE_CtHt_PEEPHOLE
(
cur_in_data
,
cur_prev_c_data
,
cur_c_out_data
,
cur_h_out_data
);
MOVE_ONE_BATCH
;
}
MOVE_ONE_STEP
;
}
}
else
{
// TODO(TJ): unly workaround, clean me
std
::
function
<
void
(
T
*
,
const
T
*
,
T
*
,
T
*
)
>
compute_ctht
;
if
(
platform
::
jit
::
MayIUse
(
platform
::
jit
::
avx
)
&&
act_gate_str
==
"sigmoid"
&&
act_cand_str
==
"tanh"
&&
act_cell_str
==
"tanh"
&&
D
==
8
)
{
compute_ctht
=
math
::
lstm_compute_ctht
<
T
>
;
}
else
{
compute_ctht
=
[
&
](
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
)
{
COMPUTE_CtHt
(
gates
,
ct_1
,
ct
,
ht
);
};
}
for
(
int
step
=
tstart
;
step
<
max_seq_len
;
++
step
)
{
const
int
cur_bs
=
batch_starts
[
step
+
1
]
-
batch_starts
[
step
];
GEMM_WH_ADDON
(
cur_bs
,
prev_h_data
,
batched_input_data
);
DEFINE_CUR
;
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
compute_ctht
(
cur_in_data
,
cur_prev_c_data
,
cur_c_out_data
,
cur_h_out_data
);
MOVE_ONE_BATCH
;
}
MOVE_ONE_STEP
;
}
}
// move one step
prev_c_data
=
batched_c_out_data
;
prev_h_data
=
batched_h_out_data
;
batched_c_out_data
=
cur_c_out_data
;
batched_h_out_data
=
cur_h_out_data
;
batched_input_data
=
cur_in_data
;
}
}
#undef MOVE_ONE_STEP
#undef MOVE_ONE_BATCH
#undef DEFINE_CUR
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
batched_h_out
->
set_lod
(
batched_lod
);
batched_h_out
->
set_lod
(
batched_lod
);
...
@@ -594,18 +444,9 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
...
@@ -594,18 +444,9 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
}
}
}
}
#undef COMPUTE_CtHt_PEEPHOLE
#undef COMPUTE_CtHt
#undef GET_Ct_NOH0C0
#undef COMPUTE_CtHt_NOH0C0
#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0
#undef GET_Ht
#undef GET_Ct
#undef GEMM_WH_ADDON
#undef GEMM_WH_ADDON
#undef INIT_BASE_INPUT_DATAS
#undef INIT_OTHER_DEFINES
#undef INIT_BASE_SIZES
#undef INIT_BASE_DEFINES
#undef INIT_BASE_INPUT_OUTPUT
#undef INIT_VEC_FUNC
};
};
}
// namespace operators
}
// namespace operators
...
...
paddle/fluid/operators/isfinite_op.cc
浏览文件 @
cc7f5514
...
@@ -60,7 +60,7 @@ class OverflowOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -60,7 +60,7 @@ class OverflowOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor) 1-dim tensor, contains a bool scalar. The output "
"(Tensor) 1-dim tensor, contains a bool scalar. The output "
"tensor of overflow operator."
);
"tensor of overflow operator."
);
AddComment
(
string
::
Sprintf
(
R"DOC(
AddComment
(
string
::
Sprintf
(
R"DOC(
Overflow operator.
Overflow
%s
operator.
$$Out = any(X)$$
$$Out = any(X)$$
...
@@ -69,6 +69,8 @@ Out = Inf if any X contains Inf,
...
@@ -69,6 +69,8 @@ Out = Inf if any X contains Inf,
Out = Nan if any X contains Nan,
Out = Nan if any X contains Nan,
Out = 0 if no Inf/Nan detected.
Out = 0 if no Inf/Nan detected.
If X contains both Inf/Nan, it will return the first indicator it meeted.
If X contains both Inf/Nan, it will return the first indicator it meeted.
%s
)DOC"
,
)DOC"
,
GetName
(),
GetComments
()));
GetName
(),
GetComments
()));
}
}
...
...
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
cc7f5514
...
@@ -3,8 +3,8 @@ add_subdirectory(detail)
...
@@ -3,8 +3,8 @@ add_subdirectory(detail)
endif
(
NOT WIN32
)
endif
(
NOT WIN32
)
function
(
math_library TARGET
)
function
(
math_library TARGET
)
# math_library is a function to create math library.
# math_library is a function to create math library.
# The interface is the same as cc_library.
# The interface is the same as cc_library.
# But it handle split GPU/CPU code and link some common library.
# But it handle split GPU/CPU code and link some common library.
set
(
cc_srcs
)
set
(
cc_srcs
)
set
(
cu_srcs
)
set
(
cu_srcs
)
...
@@ -45,15 +45,13 @@ math_library(im2col)
...
@@ -45,15 +45,13 @@ math_library(im2col)
if
(
NOT WIN32
)
# windows do not support avx functions yet.
if
(
NOT WIN32
)
# windows do not support avx functions yet.
math_library
(
gru_compute DEPS activation_functions math_function
)
math_library
(
gru_compute DEPS activation_functions math_function
)
math_library
(
lstm_compute DEPS activation_functions
)
math_library
(
lstm_compute DEPS activation_functions
)
# TODO(TJ): ugly workaround, clean me
cc_library
(
cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info
)
endif
(
NOT WIN32
)
endif
(
NOT WIN32
)
cc_library
(
blas SRCS blas.cc DEPS cblas framework_proto device_context
)
cc_library
(
blas SRCS blas.cc DEPS cblas framework_proto device_context
)
math_library
(
math_function DEPS blas
)
math_library
(
math_function DEPS blas
)
math_library
(
maxouting
)
math_library
(
maxouting
)
math_library
(
pooling
)
math_library
(
pooling
)
math_library
(
selected_rows_functor DEPS selected_rows math_function
)
math_library
(
selected_rows_functor DEPS selected_rows math_function
blas
)
math_library
(
sequence2batch
)
math_library
(
sequence2batch
)
math_library
(
sequence_padding
)
math_library
(
sequence_padding
)
math_library
(
sequence_pooling DEPS math_function
)
math_library
(
sequence_pooling DEPS math_function
)
...
@@ -76,3 +74,7 @@ if(WITH_GPU)
...
@@ -76,3 +74,7 @@ if(WITH_GPU)
endif
()
endif
()
cc_test
(
concat_test SRCS concat_test.cc DEPS concat
)
cc_test
(
concat_test SRCS concat_test.cc DEPS concat
)
cc_test
(
cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info
)
cc_test
(
cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info
)
cc_library
(
jit_kernel
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_lstm.cc
DEPS cpu_info cblas activation_functions
)
cc_test
(
jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel
)
paddle/fluid/operators/math/algorithm.h
0 → 100644
浏览文件 @
cc7f5514
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cstdint> // for int64_t
#include <numeric>
#include "paddle/fluid/platform/hostdevice.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
template
<
typename
T
>
HOSTDEVICE
inline
int64_t
BinarySearch
(
const
T
*
x
,
int64_t
num
,
const
T
&
val
)
{
int64_t
beg
=
0
,
end
=
num
-
1
;
while
(
beg
<=
end
)
{
auto
mid
=
((
beg
+
end
)
>>
1
);
if
(
x
[
mid
]
==
val
)
return
mid
;
else
if
(
x
[
mid
]
<
val
)
beg
=
mid
+
1
;
else
end
=
mid
-
1
;
}
return
-
1
;
}
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/cpu_lstm_compute.h
已删除
100644 → 0
浏览文件 @
305d211a
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/cpu_info.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace
paddle
{
namespace
operators
{
namespace
math
{
// TODO(TJ): ugly workaround, clean me
template
<
typename
T
>
void
lstm_compute_ctht
(
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
)
{
// gates: W_ch, W_ih, W_fh, W_oh
vec_sigmoid
<
T
,
platform
::
jit
::
avx
>
(
24
,
gates
+
8
,
gates
+
8
);
vec_tanh
<
T
,
platform
::
jit
::
avx
>
(
8
,
gates
,
gates
);
const
T
*
i
=
gates
+
8
,
*
f
=
gates
+
16
,
*
o
=
gates
+
24
;
const
T
min
=
SIGMOID_THRESHOLD_MIN
;
const
T
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
d
=
0
;
d
<
8
;
++
d
)
{
// C_t = C_t-1 * fgated + cand_gated * igated
ct
[
d
]
=
ct_1
[
d
]
*
f
[
d
]
+
gates
[
d
]
*
i
[
d
];
// H_t = act_cell(C_t) * ogated
T
tmp
=
ct
[
d
]
*
2
;
tmp
=
static_cast
<
T
>
(
0
)
-
((
tmp
<
min
)
?
min
:
((
tmp
>
max
)
?
max
:
tmp
));
vec_exp
<
T
>
(
1
,
&
tmp
,
&
tmp
);
tmp
=
static_cast
<
T
>
(
2
)
/
(
static_cast
<
T
>
(
1
)
+
tmp
)
-
static_cast
<
T
>
(
1
);
ht
[
d
]
=
tmp
*
o
[
d
];
}
}
#ifdef __AVX__
namespace
detail
{
namespace
forward
{
namespace
avx
{
__m256
Sigmoid
(
const
__m256
a
);
__m256
Tanh
(
const
__m256
a
);
}
// namespace avx
}
// namespace forward
}
// namespace detail
template
<
>
void
lstm_compute_ctht
<
float
>
(
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
float
*
ht
);
#endif
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/cpu_vec.h
浏览文件 @
cc7f5514
...
@@ -125,10 +125,8 @@ inline void vec_scal<float, platform::jit::avx2>(const int n, const float a,
...
@@ -125,10 +125,8 @@ inline void vec_scal<float, platform::jit::avx2>(const int n, const float a,
}
}
template
<
>
template
<
>
inline
void
vec_scal
<
float
,
platform
::
jit
::
avx512_common
>
(
const
int
n
,
inline
void
vec_scal
<
float
,
platform
::
jit
::
avx512f
>
(
const
int
n
,
const
float
a
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
const
float
*
x
,
float
*
y
)
{
// TODO(TJ): enable me
// TODO(TJ): enable me
vec_scal
<
float
,
platform
::
jit
::
avx2
>
(
n
,
a
,
x
,
y
);
vec_scal
<
float
,
platform
::
jit
::
avx2
>
(
n
,
a
,
x
,
y
);
}
}
...
@@ -181,10 +179,10 @@ inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a,
...
@@ -181,10 +179,10 @@ inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a,
}
}
template
<
>
template
<
>
inline
void
vec_bias_sub
<
float
,
platform
::
jit
::
avx512
_common
>
(
const
int
n
,
inline
void
vec_bias_sub
<
float
,
platform
::
jit
::
avx512
f
>
(
const
int
n
,
const
float
a
,
const
float
a
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
// TODO(TJ): enable me
// TODO(TJ): enable me
vec_bias_sub
<
float
,
platform
::
jit
::
avx2
>
(
n
,
a
,
x
,
y
);
vec_bias_sub
<
float
,
platform
::
jit
::
avx2
>
(
n
,
a
,
x
,
y
);
}
}
...
@@ -242,7 +240,7 @@ inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
...
@@ -242,7 +240,7 @@ inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
}
}
template
<
>
template
<
>
inline
void
vec_cross
<
float
,
platform
::
jit
::
avx512
_common
>
(
inline
void
vec_cross
<
float
,
platform
::
jit
::
avx512
f
>
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
const
float
*
z
,
float
*
out
)
{
const
int
n
,
const
float
*
x
,
const
float
*
y
,
const
float
*
z
,
float
*
out
)
{
// TODO(TJ): enable me
// TODO(TJ): enable me
vec_cross
<
float
,
platform
::
jit
::
avx
>
(
n
,
x
,
y
,
z
,
out
);
vec_cross
<
float
,
platform
::
jit
::
avx
>
(
n
,
x
,
y
,
z
,
out
);
...
@@ -296,10 +294,10 @@ inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a,
...
@@ -296,10 +294,10 @@ inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a,
}
}
template
<
>
template
<
>
inline
void
vec_add_bias
<
float
,
platform
::
jit
::
avx512
_common
>
(
const
int
n
,
inline
void
vec_add_bias
<
float
,
platform
::
jit
::
avx512
f
>
(
const
int
n
,
const
float
a
,
const
float
a
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
// TODO(TJ): enable me
// TODO(TJ): enable me
vec_add_bias
<
float
,
platform
::
jit
::
avx2
>
(
n
,
a
,
x
,
y
);
vec_add_bias
<
float
,
platform
::
jit
::
avx2
>
(
n
,
a
,
x
,
y
);
}
}
...
@@ -390,9 +388,9 @@ inline void vec_sigmoid<float, platform::jit::avx2>(const int n, const float* x,
...
@@ -390,9 +388,9 @@ inline void vec_sigmoid<float, platform::jit::avx2>(const int n, const float* x,
}
}
template
<
>
template
<
>
inline
void
vec_sigmoid
<
float
,
platform
::
jit
::
avx512
_common
>
(
const
int
n
,
inline
void
vec_sigmoid
<
float
,
platform
::
jit
::
avx512
f
>
(
const
int
n
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
// TODO(TJ): enable me
// TODO(TJ): enable me
vec_sigmoid
<
float
,
platform
::
jit
::
avx2
>
(
n
,
x
,
y
);
vec_sigmoid
<
float
,
platform
::
jit
::
avx2
>
(
n
,
x
,
y
);
}
}
...
@@ -454,9 +452,8 @@ inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
...
@@ -454,9 +452,8 @@ inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
}
}
template
<
>
template
<
>
inline
void
vec_relu
<
float
,
platform
::
jit
::
avx512_common
>
(
const
int
n
,
inline
void
vec_relu
<
float
,
platform
::
jit
::
avx512f
>
(
const
int
n
,
const
float
*
x
,
const
float
*
x
,
float
*
y
)
{
float
*
y
)
{
// TODO(TJ): enable me
// TODO(TJ): enable me
vec_relu
<
float
,
platform
::
jit
::
avx2
>
(
n
,
x
,
y
);
vec_relu
<
float
,
platform
::
jit
::
avx2
>
(
n
,
x
,
y
);
}
}
...
...
paddle/fluid/operators/math/cpu_vec_test.cc
浏览文件 @
cc7f5514
...
@@ -110,7 +110,7 @@ TEST(CpuVecTest, sigmoid) {
...
@@ -110,7 +110,7 @@ TEST(CpuVecTest, sigmoid) {
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
>
,
ref_sigmoid
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
>
,
ref_sigmoid
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx
>
,
ref_sigmoid
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx
>
,
ref_sigmoid
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx2
>
,
ref_sigmoid
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx2
>
,
ref_sigmoid
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx512
_common
>
,
TestAndBench
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx512
f
>
,
ref_sigmoid
<
float
>
);
ref_sigmoid
<
float
>
);
}
}
TestAndBench
<
double
>
(
30
,
vec_sigmoid
<
double
>
,
ref_sigmoid
<
double
>
);
TestAndBench
<
double
>
(
30
,
vec_sigmoid
<
double
>
,
ref_sigmoid
<
double
>
);
...
@@ -123,8 +123,7 @@ TEST(CpuVecTest, tanh) {
...
@@ -123,8 +123,7 @@ TEST(CpuVecTest, tanh) {
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
>
,
ref_tanh
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
>
,
ref_tanh
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx
>
,
ref_tanh
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx
>
,
ref_tanh
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx2
>
,
ref_tanh
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx2
>
,
ref_tanh
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx512_common
>
,
TestAndBench
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx512f
>
,
ref_tanh
<
float
>
);
ref_tanh
<
float
>
);
}
}
TestAndBench
<
double
>
(
30
,
vec_tanh
<
double
>
,
ref_tanh
<
double
>
);
TestAndBench
<
double
>
(
30
,
vec_tanh
<
double
>
,
ref_tanh
<
double
>
);
}
}
...
@@ -136,8 +135,7 @@ TEST(CpuVecTest, relu) {
...
@@ -136,8 +135,7 @@ TEST(CpuVecTest, relu) {
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
>
,
ref_relu
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
>
,
ref_relu
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx
>
,
ref_relu
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx
>
,
ref_relu
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx2
>
,
ref_relu
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx2
>
,
ref_relu
<
float
>
);
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx512_common
>
,
TestAndBench
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx512f
>
,
ref_relu
<
float
>
);
ref_relu
<
float
>
);
}
}
TestAndBench
<
double
>
(
30
,
vec_relu
<
double
>
,
ref_relu
<
double
>
);
TestAndBench
<
double
>
(
30
,
vec_relu
<
double
>
,
ref_relu
<
double
>
);
}
}
...
@@ -170,7 +168,7 @@ TEST(CpuVecTest, inplace_sigmoid) {
...
@@ -170,7 +168,7 @@ TEST(CpuVecTest, inplace_sigmoid) {
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
>
,
ref_sigmoid
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
>
,
ref_sigmoid
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx
>
,
ref_sigmoid
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx
>
,
ref_sigmoid
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx2
>
,
ref_sigmoid
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx2
>
,
ref_sigmoid
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx512
_common
>
,
TestInplace
<
float
>
(
sz
,
vec_sigmoid
<
float
,
jit
::
avx512
f
>
,
ref_sigmoid
<
float
>
);
ref_sigmoid
<
float
>
);
}
}
TestInplace
<
double
>
(
30
,
vec_sigmoid
<
double
>
,
ref_sigmoid
<
double
>
);
TestInplace
<
double
>
(
30
,
vec_sigmoid
<
double
>
,
ref_sigmoid
<
double
>
);
...
@@ -183,8 +181,7 @@ TEST(CpuVecTest, inplace_tanh) {
...
@@ -183,8 +181,7 @@ TEST(CpuVecTest, inplace_tanh) {
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
>
,
ref_tanh
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
>
,
ref_tanh
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx
>
,
ref_tanh
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx
>
,
ref_tanh
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx2
>
,
ref_tanh
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx2
>
,
ref_tanh
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx512_common
>
,
TestInplace
<
float
>
(
sz
,
vec_tanh
<
float
,
jit
::
avx512f
>
,
ref_tanh
<
float
>
);
ref_tanh
<
float
>
);
}
}
TestInplace
<
double
>
(
30
,
vec_tanh
<
double
>
,
ref_tanh
<
double
>
);
TestInplace
<
double
>
(
30
,
vec_tanh
<
double
>
,
ref_tanh
<
double
>
);
}
}
...
@@ -196,8 +193,7 @@ TEST(CpuVecTest, inplace_relu) {
...
@@ -196,8 +193,7 @@ TEST(CpuVecTest, inplace_relu) {
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
>
,
ref_relu
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
>
,
ref_relu
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx
>
,
ref_relu
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx
>
,
ref_relu
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx2
>
,
ref_relu
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx2
>
,
ref_relu
<
float
>
);
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx512_common
>
,
TestInplace
<
float
>
(
sz
,
vec_relu
<
float
,
jit
::
avx512f
>
,
ref_relu
<
float
>
);
ref_relu
<
float
>
);
}
}
TestInplace
<
double
>
(
30
,
vec_relu
<
double
>
,
ref_relu
<
double
>
);
TestInplace
<
double
>
(
30
,
vec_relu
<
double
>
,
ref_relu
<
double
>
);
}
}
paddle/fluid/operators/math/
cpu_lstm_compute
.cc
→
paddle/fluid/operators/math/
jit_kernel
.cc
浏览文件 @
cc7f5514
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <iostream>
#include <string>
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
#ifdef __AVX__
namespace
jitkernel
{
template
<
>
void
lstm_compute_ctht
<
float
>
(
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
namespace
jit
=
platform
::
jit
;
float
*
ht
)
{
namespace
act
=
detail
::
forward
::
avx
;
KernelPool
&
KernelPool
::
Instance
()
{
// gates: W_ch, W_ih, W_fh, W_oh
static
thread_local
KernelPool
g_jit_kernels
;
__m256
c
,
i
,
f
,
o
;
return
g_jit_kernels
;
c
=
_mm256_loadu_ps
(
gates
);
}
i
=
_mm256_loadu_ps
(
gates
+
8
);
f
=
_mm256_loadu_ps
(
gates
+
16
);
std
::
shared_ptr
<
const
Kernel
>
KernelPool
::
Get
(
const
std
::
string
&
key
)
const
{
o
=
_mm256_loadu_ps
(
gates
+
24
);
if
(
kers_
.
find
(
key
)
==
kers_
.
end
())
{
return
nullptr
;
/* C_t = C_t-1 * fgated + cand_gated * igated*/
}
c
=
_mm256_mul_ps
(
act
::
Tanh
(
c
),
act
::
Sigmoid
(
i
));
return
kers_
.
at
(
key
);
i
=
_mm256_loadu_ps
(
ct_1
);
f
=
_mm256_mul_ps
(
i
,
act
::
Sigmoid
(
f
));
f
=
_mm256_add_ps
(
c
,
f
);
_mm256_storeu_ps
(
ct
,
f
);
/* H_t = act_cell(C_t) * ogated */
o
=
_mm256_mul_ps
(
act
::
Tanh
(
f
),
act
::
Sigmoid
(
o
));
_mm256_storeu_ps
(
ht
,
o
);
}
}
#endif
}
// namespace jitkernel
}
// namespace math
}
// namespace math
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel.h
0 → 100644
浏览文件 @
cc7f5514
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <memory> // for shared_ptr
#include <string>
#include <unordered_map>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/macros.h"
// Note: Only support on CPU yet.
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
jitkernel
{
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
#define AVX_FLOAT_BLOCK 8
#define AVX2_FLOAT_BLOCK 8
#define AVX512_FLOAT_BLOCK 16
typedef
enum
{
kLT8
,
kEQ8
,
kGT8LT16
,
kEQ16
,
kGT16
}
jit_block
;
class
Kernel
{
public:
Kernel
()
=
default
;
virtual
~
Kernel
()
=
default
;
int
num_
{
0
};
int
end_
{
0
};
int
rest_
{
0
};
DISABLE_COPY_AND_ASSIGN
(
Kernel
);
};
class
KernelPool
{
public:
static
KernelPool
&
Instance
();
template
<
typename
Ker
,
typename
...
ARGS
>
std
::
shared_ptr
<
const
Ker
>
Get
(
ARGS
...
args
);
std
::
shared_ptr
<
const
Kernel
>
Get
(
const
std
::
string
&
key
)
const
;
private:
KernelPool
()
=
default
;
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
const
Kernel
>>
kers_
;
DISABLE_COPY_AND_ASSIGN
(
KernelPool
);
};
template
<
typename
T
>
class
VMulKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
=
0
;
};
template
<
typename
T
>
class
VAddKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
=
0
;
};
template
<
typename
T
>
class
VScalKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
(
const
T
a
,
T
*
x
)
const
=
0
;
};
template
<
typename
T
>
class
VAddBiasKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VActKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VReluKernel
:
public
VActKernel
<
T
>
{
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VIdentityKernel
:
public
VActKernel
<
T
>
{
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VExpKernel
:
public
VActKernel
<
T
>
{
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VSigmoidKernel
:
public
VActKernel
<
T
>
{
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VTanhKernel
:
public
VActKernel
<
T
>
{
public:
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
LSTMKernel
:
public
Kernel
{
public:
virtual
void
ComputeCtHt
(
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
,
/* below only used in peephole*/
const
T
*
wp_data
=
nullptr
,
T
*
checked
=
nullptr
)
const
=
0
;
// compute c1 and h1 without c0 or h0
virtual
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
/* below only used in peephole*/
const
T
*
wp_data
=
nullptr
)
const
=
0
;
};
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel_blas.cc
0 → 100644
浏览文件 @
cc7f5514
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
jitkernel
{
namespace
jit
=
platform
::
jit
;
/* VMUL JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VMulKernelImpl
:
public
VMulKernel
<
T
>
{
public:
explicit
VMulKernelImpl
(
int
d
)
:
VMulKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
z
[
i
]
=
x
[
i
]
*
y
[
i
];
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
template <> \
void VMulKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
platform::dynload::vsMul(this->num_, x, y, z); \
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VMulKernelImpl<double, isa, block>::Compute( \
const double* x, const double* y, double* z) const { \
platform::dynload::vdMul(this->num_, x, y, z); \
}
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT16
);
FOR_EACH_ISA_BLOCK
(
MKL_DOUBLE
);
#endif
#define INTRI8_FLOAT(isa) \
template <> \
void VMulKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx, tmpy; \
tmpx = _mm256_loadu_ps(x); \
tmpy = _mm256_loadu_ps(y); \
tmpx = _mm256_mul_ps(tmpx, tmpy); \
_mm256_storeu_ps(z, tmpx); \
}
// avx > for > mkl
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
/* VADD JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VAddKernelImpl
:
public
VAddKernel
<
T
>
{
public:
explicit
VAddKernelImpl
(
int
d
)
:
VAddKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
z
[
i
]
=
x
[
i
]
+
y
[
i
];
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
template <> \
void VAddKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
platform::dynload::vsAdd(this->num_, x, y, z); \
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VAddKernelImpl<double, isa, block>::Compute( \
const double* x, const double* y, double* z) const { \
platform::dynload::vdAdd(this->num_, x, y, z); \
}
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT16
);
FOR_EACH_ISA_BLOCK
(
MKL_DOUBLE
);
#endif
#define INTRI8_FLOAT(isa) \
template <> \
void VAddKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx, tmpy; \
tmpx = _mm256_loadu_ps(x); \
tmpy = _mm256_loadu_ps(y); \
tmpx = _mm256_add_ps(tmpx, tmpy); \
_mm256_storeu_ps(z, tmpx); \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
/* VSCAL JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VScalKernelImpl
:
public
VScalKernel
<
T
>
{
public:
explicit
VScalKernelImpl
(
int
d
)
:
VScalKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
a
*
x
[
i
];
}
}
void
Compute
(
const
T
a
,
T
*
x
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
x
[
i
]
=
a
*
x
[
i
];
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
template <> \
void VScalKernelImpl<float, isa, block>::Compute(const float a, float* x) \
const { \
platform::dynload::cblas_sscal(this->num_, a, x, 1); \
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VScalKernelImpl<double, isa, block>::Compute(const double a, double* x) \
const { \
platform::dynload::cblas_dscal(this->num_, a, x, 1); \
}
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT16
);
FOR_EACH_ISA_BLOCK
(
MKL_DOUBLE
);
#endif
#define INTRI8_FLOAT(isa) \
template <> \
void VScalKernelImpl<float, isa, kEQ8>::Compute( \
const float a, const float* x, float* y) const { \
__m256 tmp; \
__m256 scalar = _mm256_set1_ps(a); \
tmp = _mm256_loadu_ps(x); \
tmp = _mm256_mul_ps(tmp, scalar); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI8_INPLACE_FLOAT(isa) \
template <> \
void VScalKernelImpl<float, isa, kEQ8>::Compute(const float a, float* x) \
const { \
__m256 tmp; \
__m256 scalar = _mm256_set1_ps(a); \
tmp = _mm256_loadu_ps(x); \
tmp = _mm256_mul_ps(tmp, scalar); \
_mm256_storeu_ps(x, tmp); \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
INTRI8_INPLACE_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI8_INPLACE_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
INTRI8_INPLACE_FLOAT
(
jit
::
avx512f
);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef INTRI8_INPLACE_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
/* VAddBias JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VAddBiasKernelImpl
:
public
VAddBiasKernel
<
T
>
{
public:
explicit
VAddBiasKernelImpl
(
int
d
)
:
VAddBiasKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
x
[
i
]
+
a
;
}
}
};
#define INTRI8_FLOAT(isa) \
template <> \
void VAddBiasKernelImpl<float, isa, kEQ8>::Compute( \
const float a, const float* x, float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \
tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a)); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VAddBiasKernelImpl<float, isa, kEQ16>::Compute( \
const float a, const float* x, float* y) const { \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a)); \
tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a)); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
INTRI16_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI16_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
INTRI16_FLOAT
(
jit
::
avx512f
);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
/* VRelu JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VReluKernelImpl
:
public
VReluKernel
<
T
>
{
public:
explicit
VReluKernelImpl
(
int
d
)
:
VReluKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
x
[
i
]
>
0
?
x
[
i
]
:
0
;
}
}
};
#define INTRI8_FLOAT(isa) \
template <> \
void VReluKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 tmp = _mm256_loadu_ps(x); \
tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VReluKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VReluKernelImpl<float, isa, kGT8LT16>::VReluKernelImpl(int d) \
: VReluKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - AVX_FLOAT_BLOCK; \
} \
template <> \
void VReluKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + this->rest_); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + this->rest_, tmp1); \
}
#define INTRI_GT16_FLOAT(isa) \
template <> \
VReluKernelImpl<float, isa, kGT16>::VReluKernelImpl(int d) \
: VReluKernel<float>() { \
this->num_ = d; \
this->end_ = d - d % AVX_FLOAT_BLOCK; \
this->rest_ = d - AVX_FLOAT_BLOCK; \
} \
template <> \
void VReluKernelImpl<float, isa, kGT16>::Compute(const float* x, float* y) \
const { \
__m256 zeros = _mm256_setzero_ps(); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + i, tmp); \
} \
__m256 tmp = _mm256_loadu_ps(x + this->rest_); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + this->rest_, tmp); \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
INTRI16_FLOAT
(
jit
::
avx
);
INTRI_GT8LT16_FLOAT
(
jit
::
avx
);
INTRI_GT16_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI16_FLOAT
(
jit
::
avx2
);
INTRI_GT8LT16_FLOAT
(
jit
::
avx2
);
INTRI_GT16_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
// TODO(TJ): refine avx512
INTRI8_FLOAT
(
jit
::
avx512f
);
INTRI16_FLOAT
(
jit
::
avx512f
);
INTRI_GT8LT16_FLOAT
(
jit
::
avx512f
);
INTRI_GT16_FLOAT
(
jit
::
avx512f
);
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_GT8LT16_FLOAT
#undef INTRI_GT16_FLOAT
/* An empty JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VIdentityKernelImpl
:
public
VIdentityKernel
<
T
>
{
public:
explicit
VIdentityKernelImpl
(
int
d
)
:
VIdentityKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{}
};
REGISTER_JITKERNEL
(
vmul
,
VMulKernel
);
REGISTER_JITKERNEL
(
vadd
,
VAddKernel
);
REGISTER_JITKERNEL
(
vscal
,
VScalKernel
);
REGISTER_JITKERNEL
(
vaddb
,
VAddBiasKernel
);
REGISTER_JITKERNEL
(
vrelu
,
VReluKernel
);
REGISTER_JITKERNEL
(
videntity
,
VIdentityKernel
);
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel_exp.cc
0 → 100644
浏览文件 @
cc7f5514
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <cmath> // for exp
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace
paddle
{
namespace
operators
{
namespace
math
{
#ifdef __AVX__
namespace
detail
{
__m256
Exp
(
__m256
a
);
}
// namespace detail
#endif
namespace
jitkernel
{
namespace
jit
=
platform
::
jit
;
/* VExp JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VExpKernelImpl
:
public
VExpKernel
<
T
>
{
public:
explicit
VExpKernelImpl
(
int
d
)
:
VExpKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
std
::
exp
(
x
[
i
]);
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
template <> \
void VExpKernelImpl<float, isa, block>::Compute(const float* x, float* y) \
const { \
platform::dynload::vsExp(this->num_, x, y); \
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VExpKernelImpl<double, isa, block>::Compute(const double* x, double* y) \
const { \
platform::dynload::vdExp(this->num_, x, y); \
}
FOR_EACH_ISA
(
MKL_FLOAT
,
kLT8
);
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT8LT16
);
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT16
);
FOR_EACH_ISA_BLOCK
(
MKL_DOUBLE
);
#endif
#define INTRI8_FLOAT(isa) \
template <> \
void VExpKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 tmp = _mm256_loadu_ps(x); \
_mm256_storeu_ps(y, detail::Exp(tmp)); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VExpKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
const { \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = detail::Exp(tmp0); \
tmp1 = detail::Exp(tmp1); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
INTRI16_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI16_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
INTRI16_FLOAT
(
jit
::
avx512f
);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
REGISTER_JITKERNEL
(
vexp
,
VExpKernel
);
/* VSigmoid JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VSigmoidKernelImpl
:
public
VSigmoidKernel
<
T
>
{
public:
explicit
VSigmoidKernelImpl
(
int
d
)
:
VSigmoidKernel
<
T
>
()
{
this
->
num_
=
d
;
vexp_
=
KernelPool
::
Instance
().
template
Get
<
VExpKernel
<
T
>
>
(
d
);
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
const
T
min
=
SIGMOID_THRESHOLD_MIN
;
const
T
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
static_cast
<
T
>
(
0
)
-
y
[
i
];
}
vexp_
->
Compute
(
y
,
y
);
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
y
[
i
]);
}
}
private:
std
::
shared_ptr
<
const
VExpKernel
<
T
>>
vexp_
;
};
#define INTRI_SIGMOID(tmp, min, max) \
tmp = _mm256_max_ps(tmp, min); \
tmp = _mm256_min_ps(tmp, max); \
tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); \
tmp = detail::Exp(tmp); \
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp)
#define INTRI8_FLOAT(isa) \
template <> \
void VSigmoidKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VSigmoidKernelImpl<float, isa, kEQ16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_SIGMOID(tmp0, min, max); \
INTRI_SIGMOID(tmp1, min, max); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT8LT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
vexp_ = \
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y, tmp); \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
}
#define INTRI_GT16_FLOAT(isa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \
vexp_ = \
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y + i, tmp); \
} \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
INTRI16_FLOAT
(
jit
::
avx
);
INTRI_GT8LT16_FLOAT
(
jit
::
avx
);
INTRI_GT16_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI16_FLOAT
(
jit
::
avx2
);
// INTRI_GT8LT16_FLOAT(jit::avx2);
// INTRI_GT16_FLOAT(jit::avx2);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
INTRI16_FLOAT
(
jit
::
avx512f
);
// INTRI_GT8LT16_FLOAT(jit::avx512f);
// INTRI_GT16_FLOAT(jit::avx512f);
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_GT8LT16_FLOAT
#undef INTRI_GT16_FLOAT
#undef INTRI_VSIGMOID
REGISTER_JITKERNEL
(
vsigmoid
,
VSigmoidKernel
);
/* VTanh JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VTanhKernelImpl
:
public
VTanhKernel
<
T
>
{
public:
explicit
VTanhKernelImpl
(
int
d
)
:
VTanhKernel
<
T
>
()
{
this
->
num_
=
d
;
vscal_
=
KernelPool
::
Instance
().
template
Get
<
VScalKernel
<
T
>
>
(
d
);
vsigmoid_
=
KernelPool
::
Instance
().
template
Get
<
VSigmoidKernel
<
T
>
>
(
d
);
vaddbias_
=
KernelPool
::
Instance
().
template
Get
<
VAddBiasKernel
<
T
>
>
(
d
);
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
vscal_
->
Compute
(
static_cast
<
T
>
(
2
),
x
,
y
);
vsigmoid_
->
Compute
(
y
,
y
);
vscal_
->
Compute
(
static_cast
<
T
>
(
2
),
y
);
vaddbias_
->
Compute
(
static_cast
<
T
>
(
-
1
),
y
,
y
);
}
private:
std
::
shared_ptr
<
const
VScalKernel
<
T
>>
vscal_
;
std
::
shared_ptr
<
const
VSigmoidKernel
<
T
>>
vsigmoid_
;
std
::
shared_ptr
<
const
VAddBiasKernel
<
T
>>
vaddbias_
;
};
#define INTRI_VTANH(tmp) \
tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), tmp); \
tmp = _mm256_min_ps(tmp, _mm256_set1_ps(EXP_MAX_INPUT)); \
tmp = detail::Exp(tmp); \
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \
tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f))
#define INTRI8_FLOAT(isa) \
template <> \
void VTanhKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VTanhKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
const { \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_VTANH(tmp0); \
INTRI_VTANH(tmp1); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VTanhKernelImpl<float, isa, kGT8LT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
vscal_ = \
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \
this->rest_); \
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \
this->rest_); \
} \
template <> \
void VTanhKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y, tmp); \
x += AVX_FLOAT_BLOCK; \
y += AVX_FLOAT_BLOCK; \
vscal_->Compute(2.f, x, y); \
vsigmoid_->Compute(y, y); \
vscal_->Compute(2.f, y); \
vaddbias_->Compute(-1.f, y, y); \
}
#define INTRI_GT16_FLOAT(isa) \
template <> \
VTanhKernelImpl<float, isa, kGT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \
this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \
vscal_ = \
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \
this->rest_); \
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \
this->rest_); \
} \
template <> \
void VTanhKernelImpl<float, isa, kGT16>::Compute(const float* x, float* y) \
const { \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y + i, tmp); \
} \
x += this->end_; \
y += this->end_; \
vscal_->Compute(2.f, x, y); \
vsigmoid_->Compute(y, y); \
vscal_->Compute(2.f, y); \
vaddbias_->Compute(-1.f, y, y); \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
INTRI16_FLOAT
(
jit
::
avx
);
INTRI_GT8LT16_FLOAT
(
jit
::
avx
);
INTRI_GT16_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
INTRI16_FLOAT
(
jit
::
avx2
);
// maybe use avx at gt8lt16 and gt16
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
INTRI16_FLOAT
(
jit
::
avx512f
);
// maybe use avx at gt8lt16 and gt16
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_GT8LT16_FLOAT
#undef INTRI_GT16_FLOAT
#undef INTRI_VTANH
REGISTER_JITKERNEL
(
vtanh
,
VTanhKernel
);
#undef JITKERNEL_NEW_ACT_IMPL
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel_lstm.cc
0 → 100644
浏览文件 @
cc7f5514
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace
paddle
{
namespace
operators
{
namespace
math
{
#ifdef __AVX__
namespace
detail
{
__m256
Exp
(
__m256
a
);
}
// namespace detail
#endif
namespace
jitkernel
{
namespace
jit
=
platform
::
jit
;
#ifdef __AVX__
typedef
enum
{
kSigmoid
,
kRelu
,
kTanh
,
kIdentity
}
act_type
;
class
AVXAct
{
public:
virtual
~
AVXAct
()
=
default
;
virtual
__m256
Compute
(
__m256
x
)
const
=
0
;
};
template
<
act_type
type
>
class
AVXActImpl
:
public
AVXAct
{
public:
__m256
Compute
(
__m256
x
)
const
override
{
PADDLE_THROW
(
"Unkown type!"
);
}
};
template
<
>
__m256
AVXActImpl
<
kSigmoid
>::
Compute
(
__m256
x
)
const
{
__m256
ones
=
_mm256_set1_ps
(
1.0
f
);
x
=
_mm256_max_ps
(
x
,
_mm256_set1_ps
(
SIGMOID_THRESHOLD_MIN
));
x
=
_mm256_min_ps
(
x
,
_mm256_set1_ps
(
SIGMOID_THRESHOLD_MAX
));
x
=
_mm256_sub_ps
(
_mm256_set1_ps
(
0.0
f
),
x
);
x
=
detail
::
Exp
(
x
);
x
=
_mm256_add_ps
(
ones
,
x
);
return
_mm256_div_ps
(
ones
,
x
);
}
template
<
>
__m256
AVXActImpl
<
kTanh
>::
Compute
(
__m256
x
)
const
{
__m256
ones
=
_mm256_set1_ps
(
1.0
f
);
x
=
_mm256_mul_ps
(
_mm256_set1_ps
(
-
2.0
f
),
x
);
x
=
_mm256_min_ps
(
x
,
_mm256_set1_ps
(
EXP_MAX_INPUT
));
x
=
detail
::
Exp
(
x
);
x
=
_mm256_add_ps
(
ones
,
x
);
x
=
_mm256_div_ps
(
_mm256_set1_ps
(
2.0
f
),
x
);
return
_mm256_sub_ps
(
x
,
ones
);
}
template
<
>
__m256
AVXActImpl
<
kRelu
>::
Compute
(
__m256
x
)
const
{
return
_mm256_max_ps
(
x
,
_mm256_setzero_ps
());
}
template
<
>
__m256
AVXActImpl
<
kIdentity
>::
Compute
(
__m256
x
)
const
{
return
x
;
}
#endif
template
<
typename
T
>
static
std
::
shared_ptr
<
const
VActKernel
<
T
>>
GetActKernel
(
const
std
::
string
&
type
,
int
n
)
{
if
(
type
==
"sigmoid"
)
{
return
std
::
dynamic_pointer_cast
<
const
VActKernel
<
T
>>
(
KernelPool
::
Instance
().
template
Get
<
VSigmoidKernel
<
T
>
>
(
n
));
}
else
if
(
type
==
"relu"
)
{
return
std
::
dynamic_pointer_cast
<
const
VActKernel
<
T
>>
(
KernelPool
::
Instance
().
template
Get
<
VReluKernel
<
T
>
>
(
n
));
}
else
if
(
type
==
"tanh"
)
{
return
std
::
dynamic_pointer_cast
<
const
VActKernel
<
T
>>
(
KernelPool
::
Instance
().
template
Get
<
VTanhKernel
<
T
>
>
(
n
));
}
else
if
(
type
==
"identity"
||
type
==
""
)
{
return
std
::
dynamic_pointer_cast
<
const
VActKernel
<
T
>>
(
KernelPool
::
Instance
().
template
Get
<
VIdentityKernel
<
T
>
>
(
n
));
}
PADDLE_THROW
(
"Not support type: %s"
,
type
);
return
nullptr
;
}
/* LSTM JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
class
LSTMKernelImpl
:
public
LSTMKernel
<
T
>
{
public:
explicit
LSTMKernelImpl
(
const
std
::
string
&
act_gate
,
const
std
::
string
&
act_cand
,
const
std
::
string
&
act_cell
,
int
d
)
:
LSTMKernel
<
T
>
()
{
d_
=
d
;
d2_
=
d
*
2
;
d3_
=
d
*
3
;
act_gate_d3_
=
GetActKernel
<
T
>
(
act_gate
,
d3_
);
act_gate_d_
=
GetActKernel
<
T
>
(
act_gate
,
d
);
act_cand_d_
=
GetActKernel
<
T
>
(
act_cand
,
d
);
act_cell_d_
=
GetActKernel
<
T
>
(
act_cell
,
d
);
vmul_d_
=
KernelPool
::
Instance
().
template
Get
<
VMulKernel
<
T
>
>
(
d
);
vadd_d_
=
KernelPool
::
Instance
().
template
Get
<
VAddKernel
<
T
>
>
(
d
);
#ifdef __AVX__
auto
GetAVXAct
=
[
&
](
const
std
::
string
&
type
)
->
std
::
unique_ptr
<
AVXAct
>
{
if
(
type
==
"sigmoid"
)
{
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kSigmoid
>
());
}
else
if
(
type
==
"relu"
)
{
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kRelu
>
());
}
else
if
(
type
==
"tanh"
)
{
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kTanh
>
());
}
else
if
(
type
==
"identity"
||
type
==
""
)
{
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kIdentity
>
());
}
PADDLE_THROW
(
"Not support type: %s"
,
type
);
};
avx_act_gate_
=
GetAVXAct
(
act_gate
);
avx_act_cand_
=
GetAVXAct
(
act_cand
);
avx_act_cell_
=
GetAVXAct
(
act_cell
);
#endif
}
void
ComputeCtHt
(
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
,
T
*
checked
)
const
override
{
// gates: W_ch, W_ih, W_fh, W_oh
act_gate_d3_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
/* C_t = C_t-1 * fgated + cand_gated * igated */
act_cand_d_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
);
/* H_t = act_cell(C_t) * ogated */
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
);
}
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
)
const
override
{
/* C_t = igated * cgated*/
act_gate_d_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
act_cand_d_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
);
/* H_t = act_cell(C_t) * ogated */
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
);
}
private:
int
d_
,
d2_
,
d3_
;
std
::
shared_ptr
<
const
VActKernel
<
T
>>
act_gate_d3_
,
act_gate_d_
,
act_cand_d_
,
act_cell_d_
;
std
::
shared_ptr
<
const
VMulKernel
<
T
>>
vmul_d_
;
std
::
shared_ptr
<
const
VAddKernel
<
T
>>
vadd_d_
;
#ifdef __AVX__
std
::
unique_ptr
<
const
AVXAct
>
avx_act_gate_
,
avx_act_cand_
,
avx_act_cell_
;
#endif
};
#define INTRI8_FLOAT(isa) \
template <> \
void LSTMKernelImpl<float, isa, kEQ8>::ComputeCtHt( \
float* gates, const float* ct_1, float* ct, float* ht, \
const float* wp_data, float* checked) const { \
/* gates: W_ch, W_ih, W_fh, W_oh */
\
__m256 c, i, f, o; \
c = _mm256_loadu_ps(gates); \
i = _mm256_loadu_ps(gates + 8); \
f = _mm256_loadu_ps(gates + 16); \
o = _mm256_loadu_ps(gates + 24); \
/* C_t = C_t-1 * fgated + cand_gated * igated*/
\
c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \
i = _mm256_loadu_ps(ct_1); \
f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \
f = _mm256_add_ps(c, f); \
_mm256_storeu_ps(ct, f); \
/* H_t = act_cell(C_t) * ogated */
\
o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \
_mm256_storeu_ps(ht, o); \
}
// TODO(TJ): optimize keq16
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
#endif
/* Peephole JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
class
PeepholeKernelImpl
:
public
LSTMKernel
<
T
>
{
public:
explicit
PeepholeKernelImpl
(
const
std
::
string
&
act_gate
,
const
std
::
string
&
act_cand
,
const
std
::
string
&
act_cell
,
int
d
)
:
LSTMKernel
<
T
>
()
{
d_
=
d
;
d2_
=
d
*
2
;
d3_
=
d
*
3
;
act_gate_d_
=
GetActKernel
<
T
>
(
act_gate
,
d
);
act_cand_d_
=
GetActKernel
<
T
>
(
act_cand
,
d
);
act_cell_d_
=
GetActKernel
<
T
>
(
act_cell
,
d
);
vmul_d_
=
KernelPool
::
Instance
().
template
Get
<
VMulKernel
<
T
>
>
(
d
);
vadd_d_
=
KernelPool
::
Instance
().
template
Get
<
VAddKernel
<
T
>
>
(
d
);
vadd_d2_
=
KernelPool
::
Instance
().
template
Get
<
VAddKernel
<
T
>
>
(
d2_
);
act_gate_d2_
=
GetActKernel
<
T
>
(
act_gate
,
d2_
);
}
void
ComputeCtHt
(
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
,
T
*
checked
)
const
override
{
/* get fgated and igated*/
vmul_d_
->
Compute
(
wp_data
,
ct_1
,
checked
);
vmul_d_
->
Compute
(
wp_data
+
d_
,
ct_1
,
checked
+
d_
);
vadd_d2_
->
Compute
(
checked
,
gates
+
d_
,
gates
+
d_
);
act_gate_d2_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
/* C_t = C_t-1 * fgated + cand_gated * igated*/
act_cand_d_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
);
/* get ogated*/
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
);
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
/* H_t = act_cell(C_t) * ogated */
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
);
}
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
)
const
override
{
/* C_t = igated * cgated*/
act_gate_d_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
act_cand_d_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
);
/* get outgated, put W_oc * C_t on igated */
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
);
/* H_t = act_cell(C_t) * ogated */
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
);
}
private:
int
d_
,
d2_
,
d3_
;
std
::
shared_ptr
<
const
VActKernel
<
T
>>
act_gate_d2_
,
act_gate_d_
,
act_cand_d_
,
act_cell_d_
;
std
::
shared_ptr
<
const
VMulKernel
<
T
>>
vmul_d_
;
std
::
shared_ptr
<
const
VAddKernel
<
T
>>
vadd_d_
,
vadd_d2_
;
};
#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \
template <> \
std::shared_ptr<const LSTMKernel<ker_dtype>> \
KernelPool::Get<LSTMKernel<ker_dtype>, const std::string&, \
const std::string&, const std::string&, int, bool>( \
const std::string& act_gate, const std::string& act_cand, \
const std::string& act_cell, int d, bool use_peephole)
#define JITKERNEL_KEY_LSTM(ker_key, dtype_key) \
#ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \
(use_peephole ? "p" : "n")
#define JITKERNEL_NEW_LSTM_IMPL(ker, dtype, isa, k) \
if (use_peephole) { \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<PeepholeKernelImpl<dtype, isa, k>>( \
act_gate, act_cand, act_cell, d)); \
} else { \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<ker##Impl<dtype, isa, k>>(act_gate, act_cand, \
act_cell, d)); \
}
REGISTER_JITKERNEL_ARGS
(
lstm
,
LSTMKernel
,
JITKERNEL_DECLARE_LSTM
,
JITKERNEL_KEY_LSTM
,
JITKERNEL_NEW_LSTM_IMPL
);
#undef INTRI8_FLOAT
#undef JITKERNEL_DECLARE_LSTM
#undef JITKERNEL_KEY_LSTM
#undef JITKERNEL_NEW_LSTM_IMPL
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel_macro.h
0 → 100644
浏览文件 @
cc7f5514
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
jitkernel
{
namespace
jit
=
platform
::
jit
;
#define SEARCH_BLOCK(macro_, ker, dtype, isa) \
if (d < AVX_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kLT8); \
} else if (d == AVX_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kEQ8); \
} else if (d > AVX_FLOAT_BLOCK && d < AVX512_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kGT8LT16); \
} else if (d == AVX512_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kEQ16); \
} else { \
macro_(ker, dtype, isa, kGT16); \
}
#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \
if (jit::MayIUse(jit::avx512f)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \
} else if (jit::MayIUse(jit::avx2)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx2); \
} else if (jit::MayIUse(jit::avx)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx); \
} else { \
SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \
}
#define JITKERNEL_DECLARE(ker_class, ker_dtype) \
template <> \
std::shared_ptr<const ker_class<ker_dtype>> \
KernelPool::Get<ker_class<ker_dtype>, int>(int d)
#define JITKERNEL_KEY(ker_key, dtype_key) \
#ker_key #dtype_key + std::to_string(d)
#define JITKERNEL_NEW_IMPL(ker, dtype, isa, k) \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<ker##Impl<dtype, isa, k>>(d))
#define JITKERNEL_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key, \
marco_declare, macro_key, macro_impl) \
marco_declare(ker_class, ker_dtype) { \
std::string key = macro_key(ker_key, dtype_key); \
if (kers_.find(key) == kers_.end()) { \
std::shared_ptr<ker_class<ker_dtype>> p; \
SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype); \
kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)}); \
return p; \
} \
return std::dynamic_pointer_cast<const ker_class<ker_dtype>>( \
kers_.at(key)); \
}
#define REGISTER_JITKERNEL(ker_key, ker_class) \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, JITKERNEL_DECLARE, \
JITKERNEL_KEY, JITKERNEL_NEW_IMPL); \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, JITKERNEL_DECLARE, \
JITKERNEL_KEY, JITKERNEL_NEW_IMPL)
#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_declare, macro_key, \
macro_impl) \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, marco_declare, macro_key, \
macro_impl); \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, marco_declare, \
macro_key, macro_impl)
#define FOR_EACH_ISA(macro_, block) \
macro_(jit::avx512f, block); \
macro_(jit::avx2, block); \
macro_(jit::avx, block); \
macro_(jit::isa_any, block)
#define FOR_EACH_BLOCK(macro_, isa) \
macro_(isa, kLT8); \
macro_(isa, kEQ8); \
macro_(isa, kGT8LT16); \
macro_(isa, kEQ16); \
macro_(isa, kGT16)
#define FOR_EACH_ISA_BLOCK(macro_) \
FOR_EACH_BLOCK(macro_, jit::avx512f); \
FOR_EACH_BLOCK(macro_, jit::avx2); \
FOR_EACH_BLOCK(macro_, jit::avx); \
FOR_EACH_BLOCK(macro_, jit::isa_any)
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel_test.cc
0 → 100644
浏览文件 @
cc7f5514
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <sys/time.h>
#include <cmath> // for exp
#include <cstring> // for memcpy
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
constexpr
int
repeat
=
20000
;
inline
double
GetCurrentUS
()
{
struct
timeval
time
;
gettimeofday
(
&
time
,
NULL
);
return
1e+6
*
time
.
tv_sec
+
time
.
tv_usec
;
}
template
<
typename
T
>
void
RandomVec
(
const
int
n
,
T
*
a
,
const
T
lower
=
static_cast
<
T
>
(
-
20.
f
),
const
T
upper
=
static_cast
<
T
>
(
20.
f
))
{
static
unsigned
int
seed
=
100
;
std
::
mt19937
rng
(
seed
++
);
std
::
uniform_real_distribution
<
double
>
uniform_dist
(
0
,
1
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
a
[
i
]
=
static_cast
<
T
>
(
uniform_dist
(
rng
)
*
(
upper
-
lower
)
+
lower
);
}
}
void
vrelu_ref
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
>
0.
f
?
x
[
i
]
:
0.
f
;
}
}
#if defined __AVX__ || defined __AVX2__
void
vrelu_intri8
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
__m256
tmp
=
_mm256_loadu_ps
(
x
);
tmp
=
_mm256_max_ps
(
tmp
,
_mm256_setzero_ps
());
_mm256_storeu_ps
(
y
,
tmp
);
}
#endif
TEST
(
JitKernel
,
vrelu
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
10.
f
,
1.
f
);
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VReluKernel
<
float
>
>
(
d
);
const
float
*
x_data
=
x
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vrelu_ref
(
d
,
x_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
#if defined __AVX__ || defined __AVX2__
if
(
d
==
8
)
{
auto
si0
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vrelu_intri8
(
d
,
x_data
,
zref_data
);
}
auto
si1
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
}
#endif
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
void
vaddbias_ref
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
+
a
;
}
}
TEST
(
JitKernel
,
vaddbias
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
64
,
100
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VAddBiasKernel
<
float
>
>
(
d
);
const
float
a
=
2.
f
;
const
float
*
x_data
=
x
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vaddbias_ref
(
d
,
a
,
x_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
a
,
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
void
vexp_ref
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
std
::
exp
(
x
[
i
]);
}
}
#ifdef PADDLE_WITH_MKLML
void
vexp_mkl
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
paddle
::
platform
::
dynload
::
vsExp
(
n
,
x
,
y
);
}
#endif
TEST
(
JitKernel
,
vexp
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VExpKernel
<
float
>
>
(
d
);
const
float
*
x_data
=
x
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vexp_ref
(
d
,
x_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
#ifdef PADDLE_WITH_MKLML
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vexp_mkl
(
d
,
x_data
,
zref_data
);
}
auto
tmkle
=
GetCurrentUS
();
#endif
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
inline
float
_sigmoid
(
float
x
)
{
const
float
min
=
SIGMOID_THRESHOLD_MIN
;
const
float
max
=
SIGMOID_THRESHOLD_MAX
;
float
tmp
=
(
x
<
min
)
?
min
:
((
x
>
max
)
?
max
:
x
);
return
1.
f
/
(
1.
f
+
std
::
exp
(
-
tmp
));
}
void
vsigmoid_ref
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
_sigmoid
(
x
[
i
]);
}
}
void
vsigmoid_better
(
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VExpKernel
<
float
>>&
vexp
,
const
int
n
,
const
float
*
x
,
float
*
y
)
{
const
float
min
=
SIGMOID_THRESHOLD_MIN
;
const
float
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
0.
f
-
y
[
i
];
}
vexp
->
Compute
(
y
,
y
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
1.
f
/
(
1.
f
+
y
[
i
]);
}
}
TEST
(
JitKernel
,
vsigmoid
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VSigmoidKernel
<
float
>
>
(
d
);
const
auto
&
vexp
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VExpKernel
<
float
>
>
(
d
);
const
float
*
x_data
=
x
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vsigmoid_better
(
vexp
,
d
,
x_data
,
zref_data
);
}
auto
tmkle
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vsigmoid_ref
(
d
,
x_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
inline
float
_tanh
(
float
x
)
{
return
2.
f
*
_sigmoid
(
2.
f
*
x
)
-
1.
f
;
}
void
vtanh_ref
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
_tanh
(
x
[
i
]);
}
}
void
vtanh_better
(
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VScalKernel
<
float
>>&
vscal
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VSigmoidKernel
<
float
>>&
vsigmoid
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VAddBiasKernel
<
float
>>&
vaddbias
,
const
int
n
,
const
float
*
x
,
float
*
y
)
{
vscal
->
Compute
(
2.
f
,
x
,
y
);
vsigmoid
->
Compute
(
y
,
y
);
vscal
->
Compute
(
2.
f
,
y
);
vaddbias
->
Compute
(
-
1.
f
,
y
,
y
);
}
TEST
(
JitKernel
,
vtanh
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VTanhKernel
<
float
>
>
(
d
);
const
auto
&
vscal
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VScalKernel
<
float
>
>
(
d
);
const
auto
&
vsigmoid
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VSigmoidKernel
<
float
>
>
(
d
);
const
auto
&
vaddbias
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VAddBiasKernel
<
float
>
>
(
d
);
const
float
*
x_data
=
x
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vtanh_better
(
vscal
,
vsigmoid
,
vaddbias
,
d
,
x_data
,
zref_data
);
}
auto
tmkle
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vtanh_ref
(
d
,
x_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
void
lstm_ctht_ref
(
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VSigmoidKernel
<
float
>>&
vsigmoid_3d
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VTanhKernel
<
float
>>&
vtanh_d
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VExpKernel
<
float
>>&
vexp_1
,
const
int
d
,
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
float
*
ht
)
{
vsigmoid_3d
->
Compute
(
gates
+
d
,
gates
+
d
);
vtanh_d
->
Compute
(
gates
,
gates
);
const
float
*
i
=
gates
+
d
,
*
f
=
gates
+
d
*
2
,
*
o
=
gates
+
d
*
3
;
const
float
min
=
SIGMOID_THRESHOLD_MIN
;
const
float
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
k
=
0
;
k
<
d
;
++
k
)
{
// C_t = C_t-1 * fgated + cand_gated * igated
ct
[
k
]
=
ct_1
[
k
]
*
f
[
k
]
+
gates
[
k
]
*
i
[
k
];
// H_t = act_cell(C_t) * ogated
float
tmp
=
ct
[
k
]
*
2
;
tmp
=
0.
f
-
((
tmp
<
min
)
?
min
:
((
tmp
>
max
)
?
max
:
tmp
));
vexp_1
->
Compute
(
&
tmp
,
&
tmp
);
tmp
=
2.
f
/
(
1.
f
+
tmp
)
-
1.
f
;
ht
[
k
]
=
tmp
*
o
[
k
];
}
}
void
lstm_ctht_better
(
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VSigmoidKernel
<
float
>>&
vsigmoid_3d
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VTanhKernel
<
float
>>&
vtanh_d
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VMulKernel
<
float
>>&
vmul_d
,
const
std
::
shared_ptr
<
const
paddle
::
operators
::
math
::
jitkernel
::
VAddKernel
<
float
>>&
vadd_d
,
const
int
d
,
float
*
gates
,
const
float
*
ct_1
,
float
*
ct
,
float
*
ht
)
{
int
d2
=
d
*
2
;
vsigmoid_3d
->
Compute
(
gates
+
d
,
gates
+
d
);
vtanh_d
->
Compute
(
gates
,
gates
);
vmul_d
->
Compute
(
gates
,
gates
+
d
,
gates
+
d
);
vmul_d
->
Compute
(
ct_1
,
gates
+
d2
,
gates
+
d2
);
vadd_d
->
Compute
(
gates
+
d
,
gates
+
d2
,
ct
);
/* H_t = act_cell(C_t) * ogated */
vtanh_d
->
Compute
(
ct
,
gates
+
d2
);
vmul_d
->
Compute
(
gates
+
d2
,
gates
+
d
*
3
,
ht
);
}
TEST
(
JitKernel
,
lstm
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
32
,
64
,
100
})
{
int
d4
=
d
*
4
;
int
d3
=
d
*
3
;
std
::
vector
<
float
>
x
(
d4
),
xref
(
d4
);
std
::
vector
<
float
>
ct_1
(
d
),
ct_tgt
(
d
),
ht_tgt
(
d
);
std
::
vector
<
float
>
ct_ref
(
d
),
ht_ref
(
d
);
RandomVec
<
float
>
(
d4
,
x
.
data
(),
-
2.
f
,
2.
f
);
RandomVec
<
float
>
(
d
,
ct_1
.
data
(),
-
2.
f
,
2.
f
);
memcpy
(
xref
.
data
(),
x
.
data
(),
sizeof
(
float
)
*
d4
);
std
::
string
act_gate
=
"sigmoid"
,
act_cand
=
"tanh"
,
act_cell
=
"tanh"
;
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
()
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
std
::
string
&
,
const
std
::
string
&
,
const
std
::
string
&>
(
act_gate
,
act_cand
,
act_cell
,
d
,
false
);
// below kernels are used to compute refer
const
auto
&
vsigmoid_3d
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VSigmoidKernel
<
float
>
>
(
d3
);
const
auto
&
vtanh_d
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VTanhKernel
<
float
>
>
(
d
);
const
auto
&
vexp_1
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VExpKernel
<
float
>
>
(
1
);
const
auto
&
vmul_d
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VMulKernel
<
float
>
>
(
d
);
const
auto
&
vadd_d
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VAddKernel
<
float
>
>
(
d
);
float
*
x_data
=
x
.
data
();
float
*
xref_data
=
xref
.
data
();
const
float
*
ct_1_data
=
ct_1
.
data
();
float
*
ct_tgt_data
=
ct_tgt
.
data
();
float
*
ht_tgt_data
=
ht_tgt
.
data
();
float
*
ct_ref_data
=
ct_ref
.
data
();
float
*
ht_ref_data
=
ht_ref
.
data
();
// compute once to check correctness
lstm_ctht_ref
(
vsigmoid_3d
,
vtanh_d
,
vexp_1
,
d
,
xref_data
,
ct_1_data
,
ct_ref_data
,
ht_ref_data
);
ker
->
ComputeCtHt
(
x_data
,
ct_1_data
,
ct_tgt_data
,
ht_tgt_data
);
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ct_tgt_data
[
i
],
ct_ref_data
[
i
],
1e-3
);
EXPECT_NEAR
(
ht_tgt_data
[
i
],
ht_ref_data
[
i
],
1e-3
);
}
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
lstm_ctht_better
(
vsigmoid_3d
,
vtanh_d
,
vmul_d
,
vadd_d
,
d
,
xref_data
,
ct_1_data
,
ct_ref_data
,
ht_ref_data
);
}
auto
tmkle
=
GetCurrentUS
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
lstm_ctht_ref
(
vsigmoid_3d
,
vtanh_d
,
vexp_1
,
d
,
xref_data
,
ct_1_data
,
ct_ref_data
,
ht_ref_data
);
}
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
ComputeCtHt
(
x_data
,
ct_1_data
,
ct_tgt_data
,
ht_tgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
}
}
void
vscal_ref
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
a
*
x
[
i
];
}
}
void
vscal_inp_ref
(
const
int
n
,
const
float
a
,
float
*
x
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
x
[
i
]
=
a
*
x
[
i
];
}
}
#if defined __AVX__ || defined __AVX2__
void
vscal_intri8
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
__m256
tmp
;
__m256
scalar
=
_mm256_set1_ps
(
a
);
tmp
=
_mm256_loadu_ps
(
x
);
tmp
=
_mm256_mul_ps
(
tmp
,
scalar
);
_mm256_storeu_ps
(
y
,
tmp
);
}
void
vscal_inp_intri8
(
const
int
n
,
const
float
a
,
float
*
x
)
{
__m256
tmp
;
__m256
scalar
=
_mm256_set1_ps
(
a
);
tmp
=
_mm256_loadu_ps
(
x
);
tmp
=
_mm256_mul_ps
(
tmp
,
scalar
);
_mm256_storeu_ps
(
x
,
tmp
);
}
#endif
#ifdef PADDLE_WITH_MKLML
void
vscal_inp_mkl
(
const
int
n
,
const
float
a
,
float
*
x
)
{
paddle
::
platform
::
dynload
::
cblas_sscal
(
n
,
a
,
x
,
1
);
}
#endif
TEST
(
JitKernel
,
vscal
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
());
std
::
memcpy
(
y
.
data
(),
x
.
data
(),
sizeof
(
float
)
*
d
);
float
a
=
2.
f
;
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VScalKernel
<
float
>
>
(
d
);
const
float
*
x_data
=
x
.
data
();
float
*
y_data
=
y
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vscal_ref
(
d
,
a
,
x_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
auto
trefs1
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vscal_inp_ref
(
d
,
a
,
y_data
);
}
auto
trefe1
=
GetCurrentUS
();
#ifdef PADDLE_WITH_MKLML
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vscal_inp_mkl
(
d
,
a
,
y_data
);
}
auto
tmkle
=
GetCurrentUS
();
#endif
#if defined __AVX__ || defined __AVX2__
if
(
d
==
8
)
{
auto
si0
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vscal_intri8
(
d
,
a
,
x_data
,
zref_data
);
}
auto
si1
=
GetCurrentUS
();
auto
si2
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vscal_inp_intri8
(
d
,
a
,
y_data
);
}
auto
si3
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
<<
" us, inplace: "
<<
(
si3
-
si2
)
/
repeat
;
}
#endif
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
a
,
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
auto
ttgts1
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
a
,
y_data
);
}
auto
ttgte1
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, inplace takes: "
<<
(
trefe1
-
trefs1
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl inplace takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
"us, tgt inplace takes: "
<<
(
ttgte1
-
ttgts1
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
void
vmul_ref
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
z
[
i
]
=
x
[
i
]
*
y
[
i
];
}
}
#if defined __AVX__ || defined __AVX2__
void
vmul_intri8
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
__m256
tmpx
,
tmpy
;
tmpx
=
_mm256_loadu_ps
(
x
);
tmpy
=
_mm256_loadu_ps
(
y
);
tmpx
=
_mm256_mul_ps
(
tmpx
,
tmpy
);
_mm256_storeu_ps
(
z
,
tmpx
);
}
#endif
#ifdef PADDLE_WITH_MKLML
void
vmul_mkl
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
paddle
::
platform
::
dynload
::
vsMul
(
n
,
x
,
y
,
z
);
}
#endif
TEST
(
JitKernel
,
vmul
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
());
RandomVec
<
float
>
(
d
,
y
.
data
());
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VMulKernel
<
float
>
>
(
d
);
const
float
*
x_data
=
x
.
data
();
const
float
*
y_data
=
y
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vmul_ref
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
#ifdef PADDLE_WITH_MKLML
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vmul_mkl
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
tmkle
=
GetCurrentUS
();
#endif
#if defined __AVX__ || defined __AVX2__
if
(
d
==
8
)
{
auto
si0
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vmul_intri8
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
si1
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
}
#endif
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
y_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
void
vadd_ref
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
z
[
i
]
=
x
[
i
]
+
y
[
i
];
}
}
#if defined __AVX__ || defined __AVX2__
void
vadd_intri8
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
__m256
tmpx
,
tmpy
;
tmpx
=
_mm256_loadu_ps
(
x
);
tmpy
=
_mm256_loadu_ps
(
y
);
tmpx
=
_mm256_add_ps
(
tmpx
,
tmpy
);
_mm256_storeu_ps
(
z
,
tmpx
);
}
#endif
#ifdef PADDLE_WITH_MKLML
void
vadd_mkl
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
paddle
::
platform
::
dynload
::
vsAdd
(
n
,
x
,
y
,
z
);
}
#endif
TEST
(
JitKernel
,
vadd
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
());
RandomVec
<
float
>
(
d
,
y
.
data
());
const
auto
&
ker
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VAddKernel
<
float
>
>
(
d
);
const
float
*
x_data
=
x
.
data
();
const
float
*
y_data
=
y
.
data
();
float
*
ztgt_data
=
ztgt
.
data
();
float
*
zref_data
=
zref
.
data
();
auto
trefs
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vadd_ref
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
trefe
=
GetCurrentUS
();
#ifdef PADDLE_WITH_MKLML
auto
tmkls
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vadd_mkl
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
tmkle
=
GetCurrentUS
();
#endif
#if defined __AVX__ || defined __AVX2__
if
(
d
==
8
)
{
auto
si0
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
vadd_intri8
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
si1
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
}
#endif
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
y_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
}
}
TEST
(
JitKernel
,
pool
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
const
int
frame_size
=
4
;
std
::
string
act_gate
=
"sigmoid"
,
act_cand
=
"tanh"
,
act_cell
=
"tanh"
;
const
auto
&
plstm1
=
jit
::
KernelPool
::
Instance
()
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
std
::
string
&
,
const
std
::
string
&
,
const
std
::
string
&>
(
act_gate
,
act_cand
,
act_cell
,
frame_size
,
false
);
const
auto
&
plstm2
=
jit
::
KernelPool
::
Instance
()
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
std
::
string
&
,
const
std
::
string
&
,
const
std
::
string
&>
(
act_gate
,
act_cand
,
act_cell
,
frame_size
,
false
);
const
auto
&
peephole
=
jit
::
KernelPool
::
Instance
()
.
template
Get
<
jit
::
LSTMKernel
<
float
>,
const
std
::
string
&
,
const
std
::
string
&
,
const
std
::
string
&>
(
act_gate
,
act_cand
,
act_cell
,
frame_size
,
true
);
EXPECT_TRUE
(
plstm1
!=
peephole
);
const
auto
&
pvmul_f
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VMulKernel
<
float
>
>
(
4
);
EXPECT_TRUE
(
std
::
dynamic_pointer_cast
<
const
jit
::
Kernel
>
(
plstm2
)
!=
std
::
dynamic_pointer_cast
<
const
jit
::
Kernel
>
(
pvmul_f
));
const
auto
&
pvmul_d
=
jit
::
KernelPool
::
Instance
().
template
Get
<
jit
::
VMulKernel
<
double
>
>
(
4
);
EXPECT_TRUE
(
std
::
dynamic_pointer_cast
<
const
jit
::
Kernel
>
(
pvmul_f
)
!=
std
::
dynamic_pointer_cast
<
const
jit
::
Kernel
>
(
pvmul_d
));
const
auto
&
pvmul_from_key
=
jit
::
KernelPool
::
Instance
().
Get
(
"vmulf4"
);
EXPECT_EQ
(
pvmul_f
,
pvmul_from_key
);
const
auto
&
pvmul_from_key2
=
jit
::
KernelPool
::
Instance
().
Get
(
"vmulf5"
);
EXPECT_TRUE
(
pvmul_from_key2
==
nullptr
);
}
paddle/fluid/operators/math/selected_rows_functor.cc
浏览文件 @
cc7f5514
...
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <map>
#include <set>
#include <set>
#include <vector>
#include <vector>
#include "paddle/fluid/operators/math/
math_function
.h"
#include "paddle/fluid/operators/math/
blas
.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -150,6 +151,45 @@ template struct SelectedRowsAddTo<platform::CPUDeviceContext, double>;
...
@@ -150,6 +151,45 @@ template struct SelectedRowsAddTo<platform::CPUDeviceContext, double>;
template
struct
SelectedRowsAddTo
<
platform
::
CPUDeviceContext
,
int
>;
template
struct
SelectedRowsAddTo
<
platform
::
CPUDeviceContext
,
int
>;
template
struct
SelectedRowsAddTo
<
platform
::
CPUDeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddTo
<
platform
::
CPUDeviceContext
,
int64_t
>;
template
<
typename
T
>
struct
SelectedRowsSumTo
<
platform
::
CPUDeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
std
::
vector
<
framework
::
SelectedRows
*>&
input1
,
const
std
::
vector
<
int64_t
>&
input2_offsets
,
framework
::
SelectedRows
*
input2
)
{
// Ensure all selected rows have the same height
size_t
size
=
0u
;
for
(
auto
iter
=
input1
.
begin
();
iter
!=
input1
.
end
();
++
iter
)
{
auto
&
in_rows
=
(
*
iter
)
->
rows
();
size
+=
in_rows
.
end
()
-
in_rows
.
begin
();
auto
in1_height
=
(
*
iter
)
->
height
();
PADDLE_ENFORCE_EQ
(
in1_height
,
input2
->
height
());
}
// concat rows
std
::
vector
<
int64_t
>
in2_rows
;
in2_rows
.
reserve
(
in2_rows
.
size
()
+
size
);
for
(
auto
iter
=
input1
.
begin
();
iter
!=
input1
.
end
();
++
iter
)
{
const
framework
::
Vector
<
int64_t
>&
in_rows
=
(
*
iter
)
->
rows
();
in2_rows
.
insert
(
in2_rows
.
end
(),
in_rows
.
begin
(),
in_rows
.
end
());
}
input2
->
set_rows
(
in2_rows
);
auto
*
in2_value
=
input2
->
mutable_value
();
auto
*
in2_data
=
in2_value
->
data
<
T
>
();
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
context
);
size_t
offset
=
0u
;
for
(
size_t
i
=
0u
;
i
!=
input1
.
size
();
++
i
)
{
auto
&
in_value
=
input1
[
i
]
->
value
();
const
auto
*
in_data
=
in_value
.
data
<
T
>
();
offset
+=
input2_offsets
[
i
];
blas
.
VCOPY
(
in_value
.
numel
(),
in_data
,
in2_data
+
offset
);
}
}
};
template
struct
SelectedRowsSumTo
<
platform
::
CPUDeviceContext
,
float
>;
template
struct
SelectedRowsSumTo
<
platform
::
CPUDeviceContext
,
double
>;
template
<
typename
T
>
template
<
typename
T
>
struct
SelectedRowsAddToTensor
<
platform
::
CPUDeviceContext
,
T
>
{
struct
SelectedRowsAddToTensor
<
platform
::
CPUDeviceContext
,
T
>
{
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
...
@@ -207,35 +247,45 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
...
@@ -207,35 +247,45 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
const
framework
::
SelectedRows
&
input
,
const
framework
::
SelectedRows
&
input
,
framework
::
SelectedRows
*
output
)
{
framework
::
SelectedRows
*
output
)
{
framework
::
SelectedRows
&
out
=
*
output
;
framework
::
SelectedRows
&
out
=
*
output
;
auto
input_rows
=
input
.
rows
();
std
::
vector
<
int64_t
>
input_rows
(
input
.
rows
());
std
::
set
<
int64_t
>
row_set
(
input_rows
.
begin
(),
input_rows
.
end
());
std
::
vector
<
int64_t
>
merge_rows
(
row_set
.
begin
(),
row_set
.
end
());
auto
input_width
=
input
.
value
().
dims
()[
1
];
std
::
map
<
int64_t
,
std
::
vector
<
int64_t
>>
merge_row_map
;
out
.
set_rows
(
merge_rows
);
for
(
size_t
i
=
0
;
i
<
input_rows
.
size
();
++
i
)
{
merge_row_map
[
input_rows
[
i
]].
push_back
(
i
);
}
std
::
vector
<
int64_t
>
merge_rows
(
merge_row_map
.
size
());
size_t
idx
=
0
;
int64_t
input_width
=
input
.
value
().
dims
()[
1
];
out
.
set_height
(
input
.
height
());
out
.
set_height
(
input
.
height
());
out
.
mutable_value
()
->
mutable_data
<
T
>
(
T
*
out_data
=
out
.
mutable_value
()
->
mutable_data
<
T
>
(
framework
::
make_ddim
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
merge_rows
.
size
()),
input_width
}),
{
static_cast
<
int64_t
>
(
merge_rows
.
size
()),
input_width
}),
context
.
GetPlace
());
context
.
GetPlace
());
const
T
*
in_data
=
input
.
value
().
data
<
T
>
();
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
,
out
.
mutable_value
(),
0.0
);
for
(
auto
&
row_pair
:
merge_row_map
)
{
auto
*
out_ptr
=
out_data
+
idx
*
input_width
;
auto
*
out_data
=
out
.
mutable_value
()
->
data
<
T
>
();
auto
&
rows
=
row_pair
.
second
;
auto
*
input_data
=
input
.
value
().
data
<
T
>
();
merge_rows
[
idx
]
=
row_pair
.
first
;
++
idx
;
for
(
size_t
i
=
0
;
i
<
input_rows
.
size
();
i
++
)
{
// rows.size() is always larger than 0
size_t
out_i
=
FindPos
(
merge_rows
,
input_rows
[
i
]);
std
::
memcpy
(
out_ptr
,
in_data
+
rows
[
0
]
*
input_width
,
for
(
int64_t
j
=
0
;
j
<
input_width
;
j
++
)
{
sizeof
(
T
)
*
input_width
);
out_data
[
out_i
*
input_width
+
j
]
+=
input_data
[
i
*
input_width
+
j
];
for
(
size_t
i
=
1
;
i
<
rows
.
size
();
++
i
)
{
auto
*
in_ptr
=
in_data
+
rows
[
i
]
*
input_width
;
for
(
int64_t
j
=
0
;
j
<
input_width
;
++
j
)
{
out_ptr
[
j
]
+=
in_ptr
[
j
];
}
}
}
}
}
out
.
set_rows
(
merge_rows
);
}
}
};
};
template
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
float
>;
template
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
double
>;
template
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
int
>;
template
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
int
>;
template
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
int64_t
>;
template
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
int64_t
>;
...
...
paddle/fluid/operators/math/selected_rows_functor.h
浏览文件 @
cc7f5514
...
@@ -12,8 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,8 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#pragma once
#pragma once
#include <map>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
#define INLINE_FOR2(sizei, sizej) \
#define INLINE_FOR2(sizei, sizej) \
...
@@ -49,6 +55,15 @@ struct SelectedRowsAddTo {
...
@@ -49,6 +55,15 @@ struct SelectedRowsAddTo {
const
int64_t
input2_offset
,
framework
::
SelectedRows
*
input2
);
const
int64_t
input2_offset
,
framework
::
SelectedRows
*
input2
);
};
};
// input2 = [all input in input1] + input2
template
<
typename
DeviceContext
,
typename
T
>
struct
SelectedRowsSumTo
{
void
operator
()(
const
DeviceContext
&
context
,
const
std
::
vector
<
framework
::
SelectedRows
*>&
input1
,
const
std
::
vector
<
int64_t
>&
input2_offsets
,
framework
::
SelectedRows
*
input2
);
};
// input2 = input1 + input2
// input2 = input1 + input2
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
struct
SelectedRowsAddToTensor
{
struct
SelectedRowsAddToTensor
{
...
@@ -70,6 +85,104 @@ struct MergeAdd {
...
@@ -70,6 +85,104 @@ struct MergeAdd {
framework
::
SelectedRows
*
output
);
framework
::
SelectedRows
*
output
);
};
};
template
<
>
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
float
>
{
framework
::
SelectedRows
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
SelectedRows
&
input
)
{
framework
::
SelectedRows
out
;
(
*
this
)(
context
,
input
,
&
out
);
return
out
;
}
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
SelectedRows
&
input
,
framework
::
SelectedRows
*
output
)
{
framework
::
SelectedRows
&
out
=
*
output
;
std
::
vector
<
int64_t
>
input_rows
(
input
.
rows
());
std
::
map
<
int64_t
,
std
::
vector
<
int64_t
>>
merge_row_map
;
for
(
size_t
i
=
0
;
i
<
input_rows
.
size
();
++
i
)
{
merge_row_map
[
input_rows
[
i
]].
push_back
(
i
);
}
std
::
vector
<
int64_t
>
merge_rows
(
merge_row_map
.
size
());
size_t
idx
=
0
;
int64_t
input_width
=
input
.
value
().
dims
()[
1
];
out
.
set_height
(
input
.
height
());
auto
*
out_data
=
out
.
mutable_value
()
->
mutable_data
<
float
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
merge_rows
.
size
()),
input_width
}),
context
.
GetPlace
());
auto
*
in_data
=
input
.
value
().
data
<
float
>
();
auto
blas
=
GetBlas
<
platform
::
CPUDeviceContext
,
float
>
(
context
);
for
(
auto
&
row_pair
:
merge_row_map
)
{
auto
*
out_ptr
=
out_data
+
idx
*
input_width
;
auto
&
rows
=
row_pair
.
second
;
merge_rows
[
idx
]
=
row_pair
.
first
;
++
idx
;
// rows.size() is always larger than 0
blas
.
VCOPY
(
input_width
,
in_data
+
rows
[
0
]
*
input_width
,
out_ptr
);
for
(
size_t
i
=
1
;
i
<
rows
.
size
();
++
i
)
{
blas
.
AXPY
(
input_width
,
1.
,
in_data
+
rows
[
i
]
*
input_width
,
out_ptr
);
}
}
out
.
set_rows
(
merge_rows
);
}
};
template
<
>
struct
MergeAdd
<
platform
::
CPUDeviceContext
,
double
>
{
framework
::
SelectedRows
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
SelectedRows
&
input
)
{
framework
::
SelectedRows
out
;
(
*
this
)(
context
,
input
,
&
out
);
return
out
;
}
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
SelectedRows
&
input
,
framework
::
SelectedRows
*
output
)
{
framework
::
SelectedRows
&
out
=
*
output
;
std
::
vector
<
int64_t
>
input_rows
(
input
.
rows
());
std
::
map
<
int64_t
,
std
::
vector
<
int64_t
>>
merge_row_map
;
for
(
size_t
i
=
0
;
i
<
input_rows
.
size
();
++
i
)
{
merge_row_map
[
input_rows
[
i
]].
push_back
(
i
);
}
std
::
vector
<
int64_t
>
merge_rows
(
merge_row_map
.
size
());
size_t
idx
=
0
;
int64_t
input_width
=
input
.
value
().
dims
()[
1
];
out
.
set_height
(
input
.
height
());
auto
*
out_data
=
out
.
mutable_value
()
->
mutable_data
<
double
>
(
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
merge_rows
.
size
()),
input_width
}),
context
.
GetPlace
());
auto
*
in_data
=
input
.
value
().
data
<
double
>
();
auto
blas
=
GetBlas
<
platform
::
CPUDeviceContext
,
double
>
(
context
);
for
(
auto
&
row_pair
:
merge_row_map
)
{
auto
*
out_ptr
=
out_data
+
idx
*
input_width
;
auto
&
rows
=
row_pair
.
second
;
merge_rows
[
idx
]
=
row_pair
.
first
;
++
idx
;
// rows.size() is always larger than 0
blas
.
VCOPY
(
input_width
,
in_data
+
rows
[
0
]
*
input_width
,
out_ptr
);
for
(
size_t
i
=
1
;
i
<
rows
.
size
();
++
i
)
{
blas
.
AXPY
(
input_width
,
1.
,
in_data
+
rows
[
i
]
*
input_width
,
out_ptr
);
}
}
out
.
set_rows
(
merge_rows
);
}
};
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
struct
Add
{
struct
Add
{
framework
::
SelectedRows
operator
()(
const
DeviceContext
&
context
,
framework
::
SelectedRows
operator
()(
const
DeviceContext
&
context
,
...
...
paddle/fluid/operators/math/selected_rows_functor_test.cc
浏览文件 @
cc7f5514
...
@@ -219,3 +219,174 @@ TEST(selected_rows_functor, cpu_add_to) {
...
@@ -219,3 +219,174 @@ TEST(selected_rows_functor, cpu_add_to) {
// row9: 2.0 + 3.0
// row9: 2.0 + 3.0
EXPECT_EQ
(
tensor1_data
[
9
*
row_numel
+
6
],
5.0
);
EXPECT_EQ
(
tensor1_data
[
9
*
row_numel
+
6
],
5.0
);
}
}
TEST
(
selected_rows_functor
,
cpu_merge_add_float
)
{
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
paddle
::
operators
::
math
::
SetConstant
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
functor
;
int64_t
height
=
10
;
int64_t
row_numel
=
10
;
std
::
vector
<
int64_t
>
rows
{
0
,
4
,
4
,
7
};
std
::
unique_ptr
<
paddle
::
framework
::
SelectedRows
>
selected_rows
{
new
paddle
::
framework
::
SelectedRows
(
rows
,
height
)};
auto
*
in_value
=
selected_rows
->
mutable_value
();
in_value
->
mutable_data
<
float
>
(
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
rows
.
size
()),
row_numel
}),
cpu_place
);
functor
(
ctx
,
in_value
,
1.0
);
std
::
unique_ptr
<
paddle
::
framework
::
SelectedRows
>
output
{
new
paddle
::
framework
::
SelectedRows
()};
paddle
::
operators
::
math
::
scatter
::
MergeAdd
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
merge_add_functor
;
merge_add_functor
(
ctx
,
*
selected_rows
,
output
.
get
());
auto
out_height
=
output
->
height
();
EXPECT_EQ
(
out_height
,
height
);
auto
&
out_rows
=
output
->
rows
();
EXPECT_EQ
(
out_rows
[
0
],
0
);
EXPECT_EQ
(
out_rows
[
1
],
4
);
EXPECT_EQ
(
out_rows
[
2
],
7
);
auto
*
out_data
=
output
->
value
().
data
<
float
>
();
EXPECT_EQ
(
out_data
[
0
*
row_numel
],
1.0
);
EXPECT_EQ
(
out_data
[
1
*
row_numel
],
2.0
);
EXPECT_EQ
(
out_data
[
2
*
row_numel
],
1.0
);
}
TEST
(
selected_rows_functor
,
cpu_merge_add_int
)
{
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
paddle
::
operators
::
math
::
SetConstant
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
functor
;
int64_t
height
=
10
;
int64_t
row_numel
=
10
;
std
::
vector
<
int64_t
>
rows
{
0
,
4
,
4
,
7
};
std
::
unique_ptr
<
paddle
::
framework
::
SelectedRows
>
selected_rows
{
new
paddle
::
framework
::
SelectedRows
(
rows
,
height
)};
auto
*
in_value
=
selected_rows
->
mutable_value
();
in_value
->
mutable_data
<
int
>
(
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
rows
.
size
()),
row_numel
}),
cpu_place
);
functor
(
ctx
,
in_value
,
1
);
std
::
unique_ptr
<
paddle
::
framework
::
SelectedRows
>
output
{
new
paddle
::
framework
::
SelectedRows
()};
paddle
::
operators
::
math
::
scatter
::
MergeAdd
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
merge_add_functor
;
merge_add_functor
(
ctx
,
*
selected_rows
,
output
.
get
());
auto
out_height
=
output
->
height
();
EXPECT_EQ
(
out_height
,
height
);
auto
&
out_rows
=
output
->
rows
();
EXPECT_EQ
(
out_rows
[
0
],
0
);
EXPECT_EQ
(
out_rows
[
1
],
4
);
EXPECT_EQ
(
out_rows
[
2
],
7
);
auto
*
out_data
=
output
->
value
().
data
<
int
>
();
EXPECT_EQ
(
out_data
[
0
*
row_numel
],
1
);
EXPECT_EQ
(
out_data
[
1
*
row_numel
],
2
);
EXPECT_EQ
(
out_data
[
2
*
row_numel
],
1
);
}
TEST
(
selected_rows_functor
,
cpu_sum_to
)
{
paddle
::
platform
::
CPUPlace
cpu_place
;
paddle
::
platform
::
CPUDeviceContext
ctx
(
cpu_place
);
paddle
::
operators
::
math
::
SetConstant
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
functor
;
int64_t
height
=
10
;
int64_t
row_numel
=
10
;
std
::
vector
<
int64_t
>
rows1
{
0
,
4
,
7
};
std
::
unique_ptr
<
paddle
::
framework
::
SelectedRows
>
selected_rows1
{
new
paddle
::
framework
::
SelectedRows
(
rows1
,
height
)};
auto
*
in1_value
=
selected_rows1
->
mutable_value
();
in1_value
->
mutable_data
<
float
>
(
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
rows1
.
size
()),
row_numel
}),
cpu_place
);
functor
(
ctx
,
in1_value
,
1.0
);
std
::
vector
<
int64_t
>
rows2
{
0
,
5
,
7
,
9
};
std
::
unique_ptr
<
paddle
::
framework
::
SelectedRows
>
selected_rows2
{
new
paddle
::
framework
::
SelectedRows
(
rows2
,
height
)};
auto
*
in2_value
=
selected_rows2
->
mutable_value
();
in2_value
->
mutable_data
<
float
>
(
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
rows2
.
size
()),
row_numel
}),
cpu_place
);
functor
(
ctx
,
in2_value
,
2.0
);
std
::
unique_ptr
<
paddle
::
framework
::
SelectedRows
>
output
{
new
paddle
::
framework
::
SelectedRows
()};
output
->
set_height
(
height
);
auto
*
out_value
=
output
->
mutable_value
();
// simplely concat two SelectedRows
out_value
->
mutable_data
<
float
>
(
paddle
::
framework
::
make_ddim
({
7
,
10
}),
cpu_place
);
paddle
::
operators
::
math
::
SelectedRowsSumTo
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
sum_to_functor
;
sum_to_functor
(
ctx
,
std
::
vector
<
paddle
::
framework
::
SelectedRows
*>
(
{
selected_rows1
.
get
(),
selected_rows2
.
get
()}),
std
::
vector
<
int64_t
>
({
0
,
in1_value
->
numel
()}),
output
.
get
());
auto
out_height
=
output
->
height
();
EXPECT_EQ
(
out_height
,
height
);
auto
&
out_rows
=
output
->
rows
();
// input1 rows
EXPECT_EQ
(
out_rows
[
0
],
0
);
EXPECT_EQ
(
out_rows
[
1
],
4
);
EXPECT_EQ
(
out_rows
[
2
],
7
);
// input2 rows
EXPECT_EQ
(
out_rows
[
3
],
0
);
EXPECT_EQ
(
out_rows
[
4
],
5
);
EXPECT_EQ
(
out_rows
[
5
],
7
);
EXPECT_EQ
(
out_rows
[
6
],
9
);
auto
*
out_data
=
output
->
value
().
data
<
float
>
();
// input1 value
EXPECT_EQ
(
out_data
[
0
*
row_numel
+
0
],
1.0
);
EXPECT_EQ
(
out_data
[
0
*
row_numel
+
8
],
1.0
);
EXPECT_EQ
(
out_data
[
1
*
row_numel
+
1
],
1.0
);
EXPECT_EQ
(
out_data
[
2
*
row_numel
+
6
],
1.0
);
// input2 value
EXPECT_EQ
(
out_data
[
3
*
row_numel
+
3
],
2.0
);
EXPECT_EQ
(
out_data
[
3
*
row_numel
+
8
],
2.0
);
EXPECT_EQ
(
out_data
[
4
*
row_numel
+
4
],
2.0
);
EXPECT_EQ
(
out_data
[
5
*
row_numel
+
7
],
2.0
);
EXPECT_EQ
(
out_data
[
6
*
row_numel
+
9
],
2.0
);
std
::
unique_ptr
<
paddle
::
framework
::
Tensor
>
tensor1
{
new
paddle
::
framework
::
Tensor
()};
tensor1
->
mutable_data
<
float
>
(
paddle
::
framework
::
make_ddim
({
height
,
row_numel
}),
cpu_place
);
functor
(
ctx
,
tensor1
.
get
(),
3.0
);
paddle
::
operators
::
math
::
SelectedRowsAddToTensor
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
add_to_tensor_functor
;
add_to_tensor_functor
(
ctx
,
*
output
,
tensor1
.
get
());
auto
*
tensor1_data
=
tensor1
->
data
<
float
>
();
// row0: 1.0 + 2.0 + 3.0
EXPECT_EQ
(
tensor1_data
[
0
*
row_numel
+
0
],
6.0
);
// row1: 3.0
EXPECT_EQ
(
tensor1_data
[
1
*
row_numel
+
1
],
3.0
);
// row4 : 1.0 + 3.0
EXPECT_EQ
(
tensor1_data
[
4
*
row_numel
+
6
],
4.0
);
// row5: 2.0 + 3.0
EXPECT_EQ
(
tensor1_data
[
5
*
row_numel
+
7
],
5.0
);
// row6: 3.0
EXPECT_EQ
(
tensor1_data
[
6
*
row_numel
+
1
],
3.0
);
// row7: 1.0 + 2.0 + 3.0
EXPECT_EQ
(
tensor1_data
[
7
*
row_numel
+
3
],
6.0
);
// row9: 2.0 + 3.0
EXPECT_EQ
(
tensor1_data
[
9
*
row_numel
+
6
],
5.0
);
}
paddle/fluid/operators/math/sequence_pooling.cc
浏览文件 @
cc7f5514
...
@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/math/sequence_pooling.h"
#include <string>
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_pooling.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -180,6 +182,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
...
@@ -180,6 +182,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
}
}
auto
lod
=
input
.
lod
()[
0
];
auto
lod
=
input
.
lod
()[
0
];
auto
&
place
=
*
context
.
eigen_device
();
auto
&
place
=
*
context
.
eigen_device
();
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
context
);
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
lod
.
size
())
-
1
;
++
i
)
{
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
lod
.
size
())
-
1
;
++
i
)
{
Tensor
in_t
=
Tensor
in_t
=
input
.
Slice
(
static_cast
<
int
>
(
lod
[
i
]),
static_cast
<
int
>
(
lod
[
i
+
1
]));
input
.
Slice
(
static_cast
<
int
>
(
lod
[
i
]),
static_cast
<
int
>
(
lod
[
i
+
1
]));
...
@@ -191,7 +194,14 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
...
@@ -191,7 +194,14 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
if
(
pooltype
==
"AVERAGE"
)
{
if
(
pooltype
==
"AVERAGE"
)
{
out_e
.
device
(
place
)
=
in_e
.
mean
(
Eigen
::
array
<
int
,
1
>
({{
0
}}));
out_e
.
device
(
place
)
=
in_e
.
mean
(
Eigen
::
array
<
int
,
1
>
({{
0
}}));
}
else
if
(
pooltype
==
"SUM"
)
{
}
else
if
(
pooltype
==
"SUM"
)
{
out_e
.
device
(
place
)
=
in_e
.
sum
(
Eigen
::
array
<
int
,
1
>
({{
0
}}));
if
(
h
>
0
)
{
const
T
*
in_data
=
in_t
.
data
<
T
>
();
T
*
out_data
=
out_t
.
mutable_data
<
T
>
(
context
.
GetPlace
());
blas
.
VCOPY
(
w
,
in_data
,
out_data
);
for
(
int64_t
r
=
1
;
r
!=
h
;
++
r
)
{
blas
.
AXPY
(
w
,
1.
,
in_data
+
r
*
w
,
out_data
);
}
}
}
else
if
(
pooltype
==
"SQRT"
)
{
}
else
if
(
pooltype
==
"SQRT"
)
{
out_e
.
device
(
place
)
=
in_e
.
sum
(
Eigen
::
array
<
int
,
1
>
({{
0
}}))
/
out_e
.
device
(
place
)
=
in_e
.
sum
(
Eigen
::
array
<
int
,
1
>
({{
0
}}))
/
std
::
sqrt
(
static_cast
<
T
>
(
h
));
std
::
sqrt
(
static_cast
<
T
>
(
h
));
...
@@ -223,6 +233,7 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
...
@@ -223,6 +233,7 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
}
}
auto
lod
=
in_grad
->
lod
()[
0
];
auto
lod
=
in_grad
->
lod
()[
0
];
auto
&
place
=
*
context
.
eigen_device
();
auto
&
place
=
*
context
.
eigen_device
();
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
context
);
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
lod
.
size
())
-
1
;
++
i
)
{
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
lod
.
size
())
-
1
;
++
i
)
{
auto
in_g_t
=
in_grad
->
Slice
(
static_cast
<
int
>
(
lod
[
i
]),
auto
in_g_t
=
in_grad
->
Slice
(
static_cast
<
int
>
(
lod
[
i
]),
static_cast
<
int
>
(
lod
[
i
+
1
]));
static_cast
<
int
>
(
lod
[
i
+
1
]));
...
@@ -237,7 +248,11 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
...
@@ -237,7 +248,11 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
if
(
pooltype
==
"AVERAGE"
)
{
if
(
pooltype
==
"AVERAGE"
)
{
in_g_e
.
device
(
place
)
=
(
out_g_e
/
static_cast
<
T
>
(
h
)).
broadcast
(
bcast
);
in_g_e
.
device
(
place
)
=
(
out_g_e
/
static_cast
<
T
>
(
h
)).
broadcast
(
bcast
);
}
else
if
(
pooltype
==
"SUM"
)
{
}
else
if
(
pooltype
==
"SUM"
)
{
in_g_e
.
device
(
place
)
=
(
out_g_e
).
broadcast
(
bcast
);
const
T
*
out_g_data
=
out_g_t
.
data
<
T
>
();
T
*
in_g_data
=
in_g_t
.
mutable_data
<
T
>
(
context
.
GetPlace
());
for
(
int
r
=
0
;
r
!=
h
;
++
r
)
{
blas
.
VCOPY
(
w
,
out_g_data
,
in_g_data
+
r
*
w
);
}
}
else
if
(
pooltype
==
"SQRT"
)
{
}
else
if
(
pooltype
==
"SQRT"
)
{
in_g_e
.
device
(
place
)
=
in_g_e
.
device
(
place
)
=
(
out_g_e
/
std
::
sqrt
(
static_cast
<
T
>
(
h
))).
broadcast
(
bcast
);
(
out_g_e
/
std
::
sqrt
(
static_cast
<
T
>
(
h
))).
broadcast
(
bcast
);
...
...
paddle/fluid/operators/momentum_op.cc
浏览文件 @
cc7f5514
...
@@ -33,6 +33,11 @@ class MomentumOp : public framework::OperatorWithKernel {
...
@@ -33,6 +33,11 @@ class MomentumOp : public framework::OperatorWithKernel {
"Input(velocity) of Momentum should not be null."
);
"Input(velocity) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"LearningRate"
),
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"LearningRate"
),
"Input(LearningRate) of Momentum should not be null."
);
"Input(LearningRate) of Momentum should not be null."
);
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Param"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Param"
).
front
(),
ctx
->
GetInputsVarType
(
"Param"
).
front
());
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
"Output(ParamOut) of Momentum should not be null."
);
"Output(ParamOut) of Momentum should not be null."
);
...
...
paddle/fluid/operators/momentum_op.cu
浏览文件 @
cc7f5514
...
@@ -46,6 +46,17 @@ template <typename T>
...
@@ -46,6 +46,17 @@ template <typename T>
class
MomentumOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
class
MomentumOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
const
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
PADDLE_ENFORCE
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Grad"
).
front
(),
grad_var
->
Type
().
name
());
auto
param_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
param_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"VelocityOut"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"VelocityOut"
);
auto
param
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
auto
param
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
...
...
paddle/fluid/operators/momentum_op.h
浏览文件 @
cc7f5514
...
@@ -23,6 +23,12 @@ template <typename T>
...
@@ -23,6 +23,12 @@ template <typename T>
class
MomentumOpKernel
:
public
framework
::
OpKernel
<
T
>
{
class
MomentumOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
auto
param_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
param_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"VelocityOut"
);
auto
velocity_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"VelocityOut"
);
auto
param
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
auto
param
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
...
...
paddle/fluid/operators/parallel_do_op.cc
浏览文件 @
cc7f5514
...
@@ -397,6 +397,24 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
...
@@ -397,6 +397,24 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
}
}
};
};
class
ParallelDoGradOpVarTypeInference
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
framework
::
BlockDesc
*
sub_block
=
boost
::
get
<
framework
::
BlockDesc
*>
(
op_desc
.
GetAttr
(
kParallelBlock
));
for
(
auto
&
out_vars
:
op_desc
.
Outputs
())
{
for
(
auto
&
out_var
:
out_vars
.
second
)
{
auto
&
var
=
block
->
FindRecursiveOrCreateVar
(
out_var
);
auto
sub_var
=
sub_block
->
FindRecursiveOrCreateVar
(
out_var
);
if
(
sub_var
.
GetType
()
!=
var
.
GetType
())
{
var
.
SetType
(
sub_var
.
GetType
());
}
}
}
}
};
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
...
@@ -404,4 +422,5 @@ REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp,
...
@@ -404,4 +422,5 @@ REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp,
paddle
::
operators
::
ParallelDoOpProtoMaker
,
paddle
::
operators
::
ParallelDoOpProtoMaker
,
paddle
::
operators
::
ParallelDoGradOpDescMaker
);
paddle
::
operators
::
ParallelDoGradOpDescMaker
);
REGISTER_OPERATOR
(
parallel_do_grad
,
paddle
::
operators
::
ParallelDoGradOp
,
REGISTER_OPERATOR
(
parallel_do_grad
,
paddle
::
operators
::
ParallelDoGradOp
,
paddle
::
operators
::
ParallelDoGradOpShapeInference
);
paddle
::
operators
::
ParallelDoGradOpShapeInference
,
paddle
::
operators
::
ParallelDoGradOpVarTypeInference
);
paddle/fluid/operators/reader/blocking_queue.h
浏览文件 @
cc7f5514
...
@@ -31,8 +31,8 @@ class BlockingQueue {
...
@@ -31,8 +31,8 @@ class BlockingQueue {
// is a workaround and a simplified version of framework::Channel as it
// is a workaround and a simplified version of framework::Channel as it
// doesn't support GPU and it implements on buffered blocking queue.
// doesn't support GPU and it implements on buffered blocking queue.
public:
public:
explicit
BlockingQueue
(
size_t
capacity
)
explicit
BlockingQueue
(
size_t
capacity
,
bool
speed_test_mode
=
false
)
:
capacity_
(
capacity
),
closed_
(
false
)
{
:
capacity_
(
capacity
),
speed_test_mode_
(
speed_test_mode
),
closed_
(
false
)
{
PADDLE_ENFORCE_GT
(
PADDLE_ENFORCE_GT
(
capacity_
,
0
,
capacity_
,
0
,
"The capacity of a reader::BlockingQueue must be greater than 0."
);
"The capacity of a reader::BlockingQueue must be greater than 0."
);
...
@@ -72,7 +72,9 @@ class BlockingQueue {
...
@@ -72,7 +72,9 @@ class BlockingQueue {
if
(
!
queue_
.
empty
())
{
if
(
!
queue_
.
empty
())
{
PADDLE_ENFORCE_NOT_NULL
(
elem
);
PADDLE_ENFORCE_NOT_NULL
(
elem
);
*
elem
=
queue_
.
front
();
*
elem
=
queue_
.
front
();
queue_
.
pop_front
();
if
(
LIKELY
(
!
speed_test_mode_
))
{
queue_
.
pop_front
();
}
send_cv_
.
notify_one
();
send_cv_
.
notify_one
();
return
true
;
return
true
;
}
else
{
}
else
{
...
@@ -114,6 +116,7 @@ class BlockingQueue {
...
@@ -114,6 +116,7 @@ class BlockingQueue {
private:
private:
size_t
capacity_
;
size_t
capacity_
;
bool
speed_test_mode_
;
bool
closed_
;
bool
closed_
;
std
::
deque
<
T
>
queue_
;
std
::
deque
<
T
>
queue_
;
...
...
paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
浏览文件 @
cc7f5514
...
@@ -33,8 +33,9 @@ class LoDTensorBlockingQueue {
...
@@ -33,8 +33,9 @@ class LoDTensorBlockingQueue {
private:
private:
LoDTensorBlockingQueue
(
size_t
capacity
,
LoDTensorBlockingQueue
(
size_t
capacity
,
const
std
::
vector
<
framework
::
DDim
>&
dims
)
const
std
::
vector
<
framework
::
DDim
>&
dims
,
:
queue_
(
capacity
),
dims_
(
dims
)
{}
bool
speed_test_mode
=
false
)
:
queue_
(
capacity
,
speed_test_mode
),
dims_
(
dims
)
{}
public:
public:
bool
Push
(
const
std
::
vector
<
framework
::
LoDTensor
>&
lod_tensor_vec
)
{
bool
Push
(
const
std
::
vector
<
framework
::
LoDTensor
>&
lod_tensor_vec
)
{
...
@@ -69,11 +70,12 @@ class LoDTensorBlockingQueue {
...
@@ -69,11 +70,12 @@ class LoDTensorBlockingQueue {
class
LoDTensorBlockingQueueHolder
{
class
LoDTensorBlockingQueueHolder
{
public:
public:
void
InitOnce
(
size_t
capacity
,
const
std
::
vector
<
framework
::
DDim
>&
dims
)
{
void
InitOnce
(
size_t
capacity
,
const
std
::
vector
<
framework
::
DDim
>&
dims
,
bool
speed_test_mode
=
false
)
{
PADDLE_ENFORCE
(
PADDLE_ENFORCE
(
queue_
==
nullptr
,
queue_
==
nullptr
,
"LoDTensorBlockingQueueHolder::InitOnce() can only be called once"
);
"LoDTensorBlockingQueueHolder::InitOnce() can only be called once"
);
queue_
.
reset
(
new
LoDTensorBlockingQueue
(
capacity
,
dims
));
queue_
.
reset
(
new
LoDTensorBlockingQueue
(
capacity
,
dims
,
speed_test_mode
));
}
}
inline
const
std
::
shared_ptr
<
LoDTensorBlockingQueue
>&
GetQueue
()
const
{
inline
const
std
::
shared_ptr
<
LoDTensorBlockingQueue
>&
GetQueue
()
const
{
...
...
paddle/fluid/operators/reader/reader_blocking_queue_test.cc
浏览文件 @
cc7f5514
...
@@ -217,3 +217,27 @@ TEST(BlockingQueue, MyClassTest) {
...
@@ -217,3 +217,27 @@ TEST(BlockingQueue, MyClassTest) {
q
.
Receive
(
&
b
);
q
.
Receive
(
&
b
);
EXPECT_EQ
(
a
.
val_
,
b
.
val_
);
EXPECT_EQ
(
a
.
val_
,
b
.
val_
);
}
}
TEST
(
BlockingQueue
,
speed_test_mode
)
{
size_t
queue_size
=
10
;
BlockingQueue
<
size_t
>
q1
(
queue_size
,
false
);
for
(
size_t
i
=
0
;
i
<
queue_size
;
++
i
)
{
q1
.
Send
(
i
);
}
size_t
b
;
for
(
size_t
i
=
0
;
i
<
queue_size
;
++
i
)
{
q1
.
Receive
(
&
b
);
EXPECT_EQ
(
b
,
i
);
}
EXPECT_EQ
(
q1
.
Size
(),
0
);
BlockingQueue
<
size_t
>
q2
(
queue_size
,
true
);
for
(
size_t
i
=
0
;
i
<
queue_size
;
++
i
)
{
q2
.
Send
(
i
);
}
for
(
size_t
i
=
0
;
i
<
queue_size
;
++
i
)
{
q2
.
Receive
(
&
b
);
EXPECT_EQ
(
b
,
0
);
}
EXPECT_EQ
(
q2
.
Size
(),
queue_size
);
}
paddle/fluid/operators/reshape_op.cc
浏览文件 @
cc7f5514
...
@@ -164,7 +164,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of
...
@@ -164,7 +164,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of
[2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input.
[2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input.
3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while
3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while
Attr(shape) still should be set correctly to gurantee shape inference in
Attr(shape) still should be set correctly to gurantee shape inference in
compile-time.
compile-time.
)DOC"
);
)DOC"
);
...
@@ -259,7 +259,6 @@ class Reshape2Op : public ReshapeOp {
...
@@ -259,7 +259,6 @@ class Reshape2Op : public ReshapeOp {
:
ReshapeOp
(
type
,
inputs
,
outputs
,
attrs
)
{}
:
ReshapeOp
(
type
,
inputs
,
outputs
,
attrs
)
{}
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
ReshapeOp
::
InferShape
(
ctx
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"XShape"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"XShape"
),
"Output(XShape) of ReshapeOp should not be null."
);
"Output(XShape) of ReshapeOp should not be null."
);
const
auto
&
x_dims
=
ctx
->
GetInputDim
(
"X"
);
const
auto
&
x_dims
=
ctx
->
GetInputDim
(
"X"
);
...
@@ -270,6 +269,8 @@ class Reshape2Op : public ReshapeOp {
...
@@ -270,6 +269,8 @@ class Reshape2Op : public ReshapeOp {
}
}
ctx
->
SetOutputDim
(
"XShape"
,
framework
::
make_ddim
(
xshape_dims
));
ctx
->
SetOutputDim
(
"XShape"
,
framework
::
make_ddim
(
xshape_dims
));
ctx
->
ShareLoD
(
"X"
,
/*->*/
"XShape"
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"XShape"
);
ReshapeOp
::
InferShape
(
ctx
);
}
}
};
};
...
...
paddle/fluid/operators/rmsprop_op.cc
浏览文件 @
cc7f5514
...
@@ -32,6 +32,11 @@ class RmspropOp : public framework::OperatorWithKernel {
...
@@ -32,6 +32,11 @@ class RmspropOp : public framework::OperatorWithKernel {
"Input(Grad) of RmspropOp should not be null."
);
"Input(Grad) of RmspropOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Moment"
),
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Moment"
),
"Input(Moment) of RmspropOp should not be null."
);
"Input(Moment) of RmspropOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
GetInputsVarType
(
"Param"
).
front
()
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
"The input var's type should be LoDTensor, but the received is %s"
,
ctx
->
Inputs
(
"Param"
).
front
(),
ctx
->
GetInputsVarType
(
"Param"
).
front
());
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"ParamOut"
),
"Output(param_out) of RmspropOp should not be null."
);
"Output(param_out) of RmspropOp should not be null."
);
...
...
paddle/fluid/operators/rmsprop_op.h
浏览文件 @
cc7f5514
...
@@ -13,66 +13,254 @@ See the License for the specific language governing permissions and
...
@@ -13,66 +13,254 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#pragma once
#pragma once
#include <math.h>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
template
<
typename
T
,
int
MajorType
=
Eigen
::
RowMajor
,
template
<
typename
T
,
int
MajorType
=
Eigen
::
RowMajor
,
typename
IndexType
=
Eigen
::
DenseIndex
>
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenVector
=
framework
::
EigenVector
<
T
,
MajorType
,
IndexType
>
;
using
EigenVector
=
framework
::
EigenVector
<
T
,
MajorType
,
IndexType
>
;
template
<
typename
T
>
struct
DenseRmspropGradFunctor
{
inline
explicit
DenseRmspropGradFunctor
(
const
T
*
grad
)
:
grad_
(
grad
)
{}
HOSTDEVICE
inline
T
operator
()(
int64_t
idx
)
const
{
return
grad_
[
idx
];
}
const
T
*
grad_
;
};
template
<
typename
T
>
struct
SparseRmspropGradFunctor
{
inline
SparseRmspropGradFunctor
(
const
T
*
grad
,
const
int64_t
*
rows
,
int64_t
row_numel
,
int64_t
row_count
)
:
grad_
(
grad
),
rows_
(
rows
),
row_numel_
(
row_numel
),
row_count_
(
row_count
)
{}
HOSTDEVICE
inline
T
operator
()(
int64_t
idx
)
const
{
auto
row_idx
=
math
::
BinarySearch
(
rows_
,
row_count_
,
idx
/
row_numel_
);
return
row_idx
>=
0
?
grad_
[
row_idx
*
row_numel_
+
idx
%
row_numel_
]
:
0
;
}
const
T
*
grad_
;
const
int64_t
*
rows_
;
int64_t
row_numel_
;
int64_t
row_count_
;
};
template
<
typename
T
,
typename
GradFunctor
>
struct
UncenteredRmspropFunctor
{
UncenteredRmspropFunctor
(
T
*
param
,
T
*
ms
,
T
*
mom
,
const
T
*
lr
,
T
rho
,
T
epsilon
,
T
momentum
,
const
GradFunctor
&
grad_functor
)
:
param_
(
param
),
ms_
(
ms
),
mom_
(
mom
),
lr_
(
lr
),
rho_
(
rho
),
epsilon_
(
epsilon
),
momentum_
(
momentum
),
grad_functor_
(
grad_functor
)
{}
HOSTDEVICE
inline
void
operator
()(
int64_t
idx
)
const
{
T
g
=
grad_functor_
(
idx
);
T
ms_out
=
rho_
*
ms_
[
idx
]
+
(
1
-
rho_
)
*
g
*
g
;
T
mom_out
=
momentum_
*
mom_
[
idx
]
+
lr_
[
0
]
*
g
/
sqrt
(
ms_out
+
epsilon_
);
param_
[
idx
]
-=
mom_out
;
ms_
[
idx
]
=
ms_out
;
mom_
[
idx
]
=
mom_out
;
}
T
*
param_
;
T
*
ms_
;
T
*
mom_
;
const
T
*
lr_
;
T
rho_
;
T
epsilon_
;
T
momentum_
;
GradFunctor
grad_functor_
;
};
template
<
typename
T
,
typename
GradFunctor
>
struct
CenteredRmspropFunctor
{
CenteredRmspropFunctor
(
T
*
param
,
T
*
ms
,
T
*
mom
,
T
*
mean_grad
,
const
T
*
lr
,
T
rho
,
T
epsilon
,
T
momentum
,
const
GradFunctor
&
grad_functor
)
:
param_
(
param
),
ms_
(
ms
),
mom_
(
mom
),
mean_grad_
(
mean_grad
),
lr_
(
lr
),
rho_
(
rho
),
epsilon_
(
epsilon
),
momentum_
(
momentum
),
grad_functor_
(
grad_functor
)
{}
HOSTDEVICE
inline
void
operator
()(
int64_t
idx
)
const
{
T
g
=
grad_functor_
(
idx
);
T
ms_out
=
rho_
*
ms_
[
idx
]
+
(
1
-
rho_
)
*
g
*
g
;
T
mg_out
=
rho_
*
mean_grad_
[
idx
]
+
(
1
-
rho_
)
*
g
;
T
mom_out
=
momentum_
*
mom_
[
idx
]
+
lr_
[
0
]
*
g
/
sqrt
(
ms_out
-
mg_out
*
mg_out
+
epsilon_
);
param_
[
idx
]
-=
mom_out
;
ms_
[
idx
]
=
ms_out
;
mom_
[
idx
]
=
mom_out
;
mean_grad_
[
idx
]
=
mg_out
;
}
T
*
param_
;
T
*
ms_
;
T
*
mom_
;
T
*
mean_grad_
;
const
T
*
lr_
;
T
rho_
;
T
epsilon_
;
T
momentum_
;
GradFunctor
grad_functor_
;
};
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
RmspropOpKernel
:
public
framework
::
OpKernel
<
T
>
{
class
RmspropOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
param_out
=
ctx
.
Output
<
Tensor
>
(
"ParamOut"
);
using
LoDTensor
=
framework
::
LoDTensor
;
auto
*
moment_out
=
ctx
.
Output
<
Tensor
>
(
"MomentOut"
);
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
auto
*
mean_square_out
=
ctx
.
Output
<
Tensor
>
(
"MeanSquareOut"
);
auto
*
param_out
=
ctx
.
Output
<
LoDTensor
>
(
"ParamOut"
);
auto
*
moment_out
=
ctx
.
Output
<
LoDTensor
>
(
"MomentOut"
);
auto
*
mean_square_out
=
ctx
.
Output
<
LoDTensor
>
(
"MeanSquareOut"
);
auto
grad
=
ctx
.
Input
<
Tensor
>
(
"Grad"
);
auto
epsilon
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"epsilon"
));
auto
rho
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"decay"
));
auto
momentum
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"momentum"
));
bool
centered
=
ctx
.
Attr
<
bool
>
(
"centered"
);
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
&
p_tensor
=
*
ctx
.
Input
<
LoDTensor
>
(
"Param"
);
moment_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
&
ms_tensor
=
*
ctx
.
Input
<
LoDTensor
>
(
"MeanSquare"
);
mean_square_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
&
lr_tensor
=
*
ctx
.
Input
<
LoDTensor
>
(
"LearningRate"
);
auto
&
mom_tensor
=
*
ctx
.
Input
<
LoDTensor
>
(
"Moment"
);
float
epsilon
=
ctx
.
Attr
<
float
>
(
"epsilon"
);
PADDLE_ENFORCE_EQ
(
&
p_tensor
,
param_out
,
float
rho
=
ctx
.
Attr
<
float
>
(
"decay"
);
"Param and ParamOut must be the same Tensor"
);
float
momentum
=
ctx
.
Attr
<
float
>
(
"momentum"
);
PADDLE_ENFORCE_EQ
(
&
mom_tensor
,
moment_out
,
bool
centered
=
ctx
.
Attr
<
bool
>
(
"centered"
);
"Moment and MomentOut must be the same Tensor"
);
PADDLE_ENFORCE_EQ
(
&
ms_tensor
,
mean_square_out
,
"MeanSquare and MeanSquareOut must be the same Tensor"
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
size_t
limit
=
static_cast
<
size_t
>
(
ms_tensor
.
numel
());
if
(
grad_var
->
IsType
<
LoDTensor
>
())
{
auto
&
grad_tensor
=
grad_var
->
Get
<
LoDTensor
>
();
if
(
std
::
is_same
<
DeviceContext
,
platform
::
CPUDeviceContext
>::
value
)
{
auto
&
place
=
*
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
auto
lr_value
=
lr_tensor
.
data
<
T
>
()[
0
];
auto
p
=
EigenVector
<
T
>::
Flatten
(
p_tensor
);
auto
ms
=
EigenVector
<
T
>::
Flatten
(
ms_tensor
);
auto
g
=
EigenVector
<
T
>::
Flatten
(
grad_tensor
);
auto
mom
=
EigenVector
<
T
>::
Flatten
(
mom_tensor
);
auto
p_out
=
EigenVector
<
T
>::
Flatten
(
*
param_out
);
auto
mom_out
=
EigenVector
<
T
>::
Flatten
(
*
moment_out
);
auto
ms_out
=
EigenVector
<
T
>::
Flatten
(
*
mean_square_out
);
ms_out
.
device
(
place
)
=
rho
*
ms
+
(
1
-
rho
)
*
g
*
g
;
if
(
centered
)
{
auto
&
mg_tensor
=
*
ctx
.
Input
<
LoDTensor
>
(
"MeanGrad"
);
auto
mg
=
EigenVector
<
T
>::
Flatten
(
mg_tensor
);
auto
*
mean_grad_out
=
ctx
.
Output
<
LoDTensor
>
(
"MeanGradOut"
);
PADDLE_ENFORCE
(
&
mg_tensor
,
mean_grad_out
,
"MeanGrad and MeanGradOut must be the same Tensor"
);
auto
mg_out
=
EigenVector
<
T
>::
Flatten
(
*
mean_grad_out
);
mg_out
.
device
(
place
)
=
rho
*
mg
+
(
1
-
rho
)
*
g
;
mom_out
.
device
(
place
)
=
momentum
*
mom
+
lr_value
*
g
/
(
ms_out
-
mg_out
.
square
()
+
epsilon
).
sqrt
();
}
else
{
mom_out
.
device
(
place
)
=
momentum
*
mom
+
lr_value
*
g
/
(
ms_out
+
epsilon
).
sqrt
();
}
p_out
.
device
(
place
)
=
p
-
mom_out
;
}
else
{
DenseRmspropGradFunctor
<
T
>
grad_func
(
grad_tensor
.
data
<
T
>
());
platform
::
ForRange
<
DeviceContext
>
for_range
(
dev_ctx
,
limit
);
if
(
centered
)
{
auto
&
mg_tensor
=
*
ctx
.
Input
<
LoDTensor
>
(
"MeanGrad"
);
auto
*
mean_grad_out
=
ctx
.
Output
<
LoDTensor
>
(
"MeanGradOut"
);
PADDLE_ENFORCE
(
&
mg_tensor
,
mean_grad_out
,
"MeanGrad and MeanGradOut must be the same Tensor"
);
for_range
(
CenteredRmspropFunctor
<
T
,
DenseRmspropGradFunctor
<
T
>>
(
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
mean_square_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
moment_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
mean_grad_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
lr_tensor
.
data
<
T
>
(),
rho
,
epsilon
,
momentum
,
grad_func
));
}
else
{
for_range
(
UncenteredRmspropFunctor
<
T
,
DenseRmspropGradFunctor
<
T
>>
(
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
mean_square_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
moment_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
lr_tensor
.
data
<
T
>
(),
rho
,
epsilon
,
momentum
,
grad_func
));
}
}
}
else
if
(
grad_var
->
IsType
<
framework
::
SelectedRows
>
())
{
auto
&
grad
=
grad_var
->
Get
<
framework
::
SelectedRows
>
();
auto
*
merged_grad
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
()
->
GetMutable
<
framework
::
SelectedRows
>
();
math
::
scatter
::
MergeAdd
<
DeviceContext
,
T
>
merge_func
;
merge_func
(
dev_ctx
,
grad
,
merged_grad
);
platform
::
ForRange
<
DeviceContext
>
for_range
(
dev_ctx
,
limit
);
const
int64_t
*
rows
;
#ifdef PADDLE_WITH_CUDA
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
rows
=
merged_grad
->
rows
().
CUDAData
(
ctx
.
GetPlace
());
}
else
{
#endif
rows
=
merged_grad
->
rows
().
data
();
#ifdef PADDLE_WITH_CUDA
}
#endif
auto
&
merged_tensor
=
merged_grad
->
value
();
int64_t
row_count
=
merged_grad
->
rows
().
size
();
int64_t
row_numel
=
merged_tensor
.
numel
()
/
row_count
;
SparseRmspropGradFunctor
<
T
>
grad_func
(
merged_tensor
.
data
<
T
>
(),
rows
,
row_numel
,
row_count
);
auto
p
=
EigenVector
<
T
>::
Flatten
(
*
ctx
.
Input
<
Tensor
>
(
"Param"
));
if
(
centered
)
{
auto
ms
=
EigenVector
<
T
>::
Flatten
(
*
ctx
.
Input
<
Tensor
>
(
"MeanSquare"
));
auto
&
mg_tensor
=
*
ctx
.
Input
<
LoDTensor
>
(
"MeanGrad"
);
auto
lr
=
EigenVector
<
T
>::
Flatten
(
*
ctx
.
Input
<
Tensor
>
(
"LearningRate"
));
auto
*
mean_grad_out
=
ctx
.
Output
<
LoDTensor
>
(
"MeanGradOut"
);
auto
g
=
EigenVector
<
T
>::
Flatten
(
*
grad
);
PADDLE_ENFORCE
(
&
mg_tensor
,
mean_grad_out
,
auto
mom
=
EigenVector
<
T
>::
Flatten
(
*
ctx
.
Input
<
Tensor
>
(
"Moment"
));
"MeanGrad and MeanGradOut must be the same Tensor"
);
for_range
(
CenteredRmspropFunctor
<
T
,
SparseRmspropGradFunctor
<
T
>>
(
auto
p_out
=
EigenVector
<
T
>::
Flatten
(
*
param_out
);
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
auto
mom_out
=
EigenVector
<
T
>::
Flatten
(
*
moment_out
);
mean_square_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
auto
ms_out
=
EigenVector
<
T
>::
Flatten
(
*
mean_square_out
);
moment_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
auto
&
place
=
*
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
mean_grad_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
lr_tensor
.
data
<
T
>
(),
rho
,
epsilon
,
momentum
,
grad_func
));
Eigen
::
DSizes
<
int
,
1
>
grad_dsize
(
static_cast
<
int
>
(
grad
->
numel
()));
}
else
{
for_range
(
UncenteredRmspropFunctor
<
T
,
SparseRmspropGradFunctor
<
T
>>
(
ms_out
.
device
(
place
)
=
rho
*
ms
+
(
1
-
rho
)
*
g
*
g
;
param_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
if
(
centered
)
{
mean_square_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
auto
mg
=
EigenVector
<
T
>::
Flatten
(
*
ctx
.
Input
<
Tensor
>
(
"MeanGrad"
));
moment_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
lr_tensor
.
data
<
T
>
(),
auto
*
mean_grad_out
=
ctx
.
Output
<
Tensor
>
(
"MeanGradOut"
);
rho
,
epsilon
,
momentum
,
grad_func
));
mean_grad_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
}
auto
mg_out
=
EigenVector
<
T
>::
Flatten
(
*
mean_grad_out
);
mg_out
.
device
(
place
)
=
rho
*
mg
+
(
1
-
rho
)
*
g
;
mom_out
.
device
(
place
)
=
momentum
*
mom
+
lr
.
broadcast
(
grad_dsize
)
*
g
/
(
ms_out
-
mg_out
.
square
()
+
epsilon
).
sqrt
();
}
else
{
}
else
{
mom_out
.
device
(
place
)
=
PADDLE_THROW
(
"RMSProp only supports LoDTensor or SelectedRows gradient"
);
momentum
*
mom
+
lr
.
broadcast
(
grad_dsize
)
*
g
/
(
ms_out
+
epsilon
).
sqrt
();
}
}
p_out
.
device
(
place
)
=
p
-
mom_out
;
}
}
};
};
...
...
paddle/fluid/operators/sequence_concat_op.cc
浏览文件 @
cc7f5514
...
@@ -90,11 +90,13 @@ REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel,
...
@@ -90,11 +90,13 @@ REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel,
paddle
::
framework
::
DefaultGradOpDescMaker
<
false
>
);
paddle
::
framework
::
DefaultGradOpDescMaker
<
false
>
);
template
<
typename
T
>
template
<
typename
T
>
using
Kernel
=
op
::
SeqConcatKernel
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
;
using
Kernel
=
op
::
SeqConcatKernel
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
;
REGISTER_OP_CPU_KERNEL
(
sequence_concat
,
Kernel
<
float
>
,
Kernel
<
double
>
);
REGISTER_OP_CPU_KERNEL
(
sequence_concat
,
Kernel
<
float
>
,
Kernel
<
double
>
,
Kernel
<
int64_t
>
);
REGISTER_OPERATOR
(
sequence_concat_grad
,
paddle
::
framework
::
OperatorWithKernel
,
REGISTER_OPERATOR
(
sequence_concat_grad
,
paddle
::
framework
::
OperatorWithKernel
,
op
::
SeqConcatGradShapeInferer
);
op
::
SeqConcatGradShapeInferer
);
template
<
typename
T
>
template
<
typename
T
>
using
GradKernel
=
using
GradKernel
=
op
::
SeqConcatGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
;
op
::
SeqConcatGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
;
REGISTER_OP_CPU_KERNEL
(
sequence_concat_grad
,
GradKernel
<
float
>
,
REGISTER_OP_CPU_KERNEL
(
sequence_concat_grad
,
GradKernel
<
float
>
,
GradKernel
<
double
>
);
GradKernel
<
double
>
,
GradKernel
<
int64_t
>
);
paddle/fluid/operators/sequence_unpad_op.cc
0 → 100644
浏览文件 @
cc7f5514
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_unpad_op.h"
namespace
paddle
{
namespace
operators
{
class
SequenceUnpadOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
protected:
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of SequenceUnpadOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Length"
),
"Input(Length) of SequenceUnpadOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of SequenceUnpadOp should not be null."
);
auto
x_dims
=
ctx
->
GetInputDim
(
"X"
);
PADDLE_ENFORCE_GE
(
x_dims
.
size
(),
2
,
"The rank of Input(X) can't be less than 2."
);
auto
len_dims
=
ctx
->
GetInputDim
(
"Length"
);
PADDLE_ENFORCE
(
len_dims
.
size
()
==
2
&&
len_dims
[
1
]
==
1
,
"The shape of Input(Length) should be [batch_size, 1]."
);
PADDLE_ENFORCE
(
len_dims
[
0
]
==
x_dims
[
0
],
"Input(X) and Input(Length) should have the same first dimension."
);
int64_t
out_dim_0
=
-
1
;
if
(
ctx
->
IsRuntime
())
{
out_dim_0
=
x_dims
[
0
]
*
x_dims
[
1
];
}
std
::
vector
<
int64_t
>
out_dims_vec
{
out_dim_0
};
if
(
x_dims
.
size
()
==
2
)
{
out_dims_vec
.
push_back
(
1
);
}
else
{
for
(
size_t
i
=
2
;
i
<
x_dims
.
size
();
++
i
)
{
out_dims_vec
.
push_back
(
x_dims
[
i
]);
}
}
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
out_dims_vec
));
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
"X"
));
return
framework
::
OpKernelType
(
data_type
,
ctx
.
device_context
());
}
};
class
SequenceUnpadOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"(LoDTensor, default LoDTensor<float>) Input tensor which "
"contains the padded sequences with equal length."
);
AddInput
(
"Length"
,
"(LoDTensor) The input tensor which specifies the actual ength of "
"sequences after unpadding."
);
AddOutput
(
"Out"
,
"(LoDTensor) The output tensor which contains unpadded sequences."
);
AddComment
(
R"DOC(
Sequence Unpad Operator
This operator removes the padding data in the input sequences and convert
them into sequences with actual length as output, identitied by lod
information.
Example:
Given input tensor Input(X):
X.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0],
[ 6.0, 7.0, 8.0, 9.0, 10.0],
[11.0, 12.0, 13.0, 14.0, 15.0]],
`
in which there are 3 sequences padded to length 5, and the acutal length
specified by Input(Length):
Length.data = [[2], [3], [4]],
after unpadding, Output(Out) will be:
Out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
Out.lod = [[0, 2, 5, 9]]
)DOC"
);
}
};
class
SequenceUnpadGradOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of SequenceUnpadGradOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Out"
)),
"Input(Out@GRAD) of SequenceUnpadGradOp should not be null."
);
if
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"X"
)))
{
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
ctx
->
GetInputDim
(
"X"
));
ctx
->
ShareLoD
(
"X"
,
/*->*/
framework
::
GradVarName
(
"X"
));
}
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
"X"
));
return
framework
::
OpKernelType
(
data_type
,
ctx
.
device_context
());
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
sequence_unpad
,
ops
::
SequenceUnpadOp
,
ops
::
SequenceUnpadOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
sequence_unpad_grad
,
ops
::
SequenceUnpadGradOp
);
REGISTER_OP_CPU_KERNEL
(
sequence_unpad
,
ops
::
SequenceUnpadOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
SequenceUnpadOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
,
ops
::
SequenceUnpadOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
,
ops
::
SequenceUnpadOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int64_t
>
);
REGISTER_OP_CPU_KERNEL
(
sequence_unpad_grad
,
ops
::
SequenceUnpadGradOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
SequenceUnpadGradOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
,
ops
::
SequenceUnpadGradOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
,
ops
::
SequenceUnpadGradOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int64_t
>
);
paddle/fluid/operators/sequence_unpad_op.cu
0 → 100644
浏览文件 @
cc7f5514
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_unpad_op.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
sequence_unpad
,
ops
::
SequenceUnpadOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
SequenceUnpadOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
SequenceUnpadOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
SequenceUnpadOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
REGISTER_OP_CUDA_KERNEL
(
sequence_unpad_grad
,
ops
::
SequenceUnpadGradOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
SequenceUnpadGradOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
SequenceUnpadGradOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
SequenceUnpadGradOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
paddle/fluid/operators/sequence_unpad_op.h
0 → 100644
浏览文件 @
cc7f5514
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_padding.h"
namespace
paddle
{
namespace
operators
{
using
LoDTensor
=
framework
::
LoDTensor
;
using
LoD
=
framework
::
LoD
;
template
<
typename
DeviceContext
,
typename
T
>
class
SequenceUnpadOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
x_t
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
auto
*
len_t
=
ctx
.
Input
<
LoDTensor
>
(
"Length"
);
auto
*
out_t
=
ctx
.
Output
<
LoDTensor
>
(
"Out"
);
out_t
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
const
int64_t
*
seq_len_ptr
=
nullptr
;
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
LoDTensor
seq_len_cpu
;
seq_len_cpu
.
Resize
(
len_t
->
dims
());
seq_len_ptr
=
seq_len_cpu
.
mutable_data
<
int64_t
>
(
platform
::
CPUPlace
());
framework
::
TensorCopy
(
*
len_t
,
platform
::
CPUPlace
(),
ctx
.
template
device_context
<
DeviceContext
>(),
&
seq_len_cpu
);
}
else
{
seq_len_ptr
=
len_t
->
data
<
int64_t
>
();
}
size_t
batch_size
=
x_t
->
dims
()[
0
];
std
::
vector
<
size_t
>
out_lod0
(
batch_size
+
1
,
0
);
for
(
size_t
i
=
0
;
i
<
batch_size
;
++
i
)
{
out_lod0
[
i
+
1
]
=
out_lod0
[
i
]
+
seq_len_ptr
[
i
];
}
framework
::
LoD
out_lod
;
out_lod
.
push_back
(
out_lod0
);
out_t
->
set_lod
(
out_lod
);
std
::
vector
<
int64_t
>
out_dims_vec
{
static_cast
<
int64_t
>
(
out_lod0
.
back
())};
if
(
x_t
->
dims
().
size
()
==
2
)
{
out_dims_vec
.
push_back
(
1
);
}
else
{
for
(
size_t
i
=
2
;
i
<
x_t
->
dims
().
size
();
++
i
)
{
out_dims_vec
.
push_back
(
x_t
->
dims
()[
i
]);
}
}
out_t
->
Resize
(
framework
::
make_ddim
(
out_dims_vec
));
int64_t
padded_length
=
x_t
->
dims
()[
1
];
math
::
UnpaddingLoDTensorFunctor
<
DeviceContext
,
T
>
()(
ctx
.
template
device_context
<
DeviceContext
>(),
*
x_t
,
out_t
,
padded_length
,
0
,
false
,
math
::
kBatchLengthWidth
);
}
};
template
<
typename
DeviceContext
,
typename
T
>
class
SequenceUnpadGradOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
d_x
=
ctx
.
Output
<
LoDTensor
>
(
framework
::
GradVarName
(
"X"
));
if
(
d_x
)
{
const
auto
*
d_out
=
ctx
.
Input
<
LoDTensor
>
(
framework
::
GradVarName
(
"Out"
));
const
auto
*
x_t
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
d_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
int
padded_length
=
x_t
->
dims
()[
1
];
LoDTensor
zero_pads
;
zero_pads
.
Resize
({
1
,
1
});
zero_pads
.
mutable_data
<
T
>
(
ctx
.
GetPlace
());
math
::
SetConstant
<
DeviceContext
,
T
>
set_zero
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
set_zero
(
dev_ctx
,
&
zero_pads
,
static_cast
<
T
>
(
0
));
math
::
PaddingLoDTensorFunctor
<
DeviceContext
,
T
>
()(
ctx
.
template
device_context
<
DeviceContext
>(),
*
d_out
,
d_x
,
zero_pads
,
padded_length
,
0
,
false
,
math
::
kBatchLengthWidth
);
}
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/sgd_op.cc
浏览文件 @
cc7f5514
...
@@ -21,7 +21,7 @@ class SGDOp : public framework::OperatorWithKernel {
...
@@ -21,7 +21,7 @@ class SGDOp : public framework::OperatorWithKernel {
public:
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Param"
),
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Param"
),
"Input(Param) of SGDOp should not be null."
);
"Input(Param) of SGDOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Grad"
),
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Grad"
),
...
@@ -42,7 +42,7 @@ class SGDOp : public framework::OperatorWithKernel {
...
@@ -42,7 +42,7 @@ class SGDOp : public framework::OperatorWithKernel {
protected:
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
"Param"
));
auto
data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
"Param"
));
return
framework
::
OpKernelType
(
data_type
,
ctx
.
device_context
());
return
framework
::
OpKernelType
(
data_type
,
ctx
.
device_context
());
}
}
...
@@ -50,17 +50,20 @@ class SGDOp : public framework::OperatorWithKernel {
...
@@ -50,17 +50,20 @@ class SGDOp : public framework::OperatorWithKernel {
class
SGDOpInferVarType
:
public
framework
::
VarTypeInference
{
class
SGDOpInferVarType
:
public
framework
::
VarTypeInference
{
public:
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
framework
::
BlockDesc
*
block
)
const
override
{
auto
input_var
=
op_desc
.
Input
(
"Param"
)[
0
];
auto
input_var_n
=
op_desc
.
Input
(
"Param"
)[
0
];
for
(
auto
&
out_var
:
op_desc
.
Output
(
"ParamOut"
))
{
auto
in_var_type
=
block
->
FindRecursiveOrCreateVar
(
input_var_n
).
GetType
();
if
(
block
->
FindRecursiveOrCreateVar
(
input_var
).
GetType
()
==
PADDLE_ENFORCE
(
in_var_type
==
framework
::
proto
::
VarType
::
SELECTED_ROWS
||
framework
::
proto
::
VarType
::
SELECTED_ROWS
)
{
in_var_type
==
framework
::
proto
::
VarType
::
LOD_TENSOR
,
block
->
FindRecursiveOrCreateVar
(
out_var
).
SetType
(
"The input Var's type should be LoDtensor or SelectedRows,"
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
" but the received var(%s)'s type is %s"
,
}
else
{
input_var_n
,
in_var_type
);
block
->
FindRecursiveOrCreateVar
(
out_var
).
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
for
(
auto
&
out_var_n
:
op_desc
.
Output
(
"ParamOut"
))
{
auto
&
out_var
=
block
->
FindRecursiveOrCreateVar
(
out_var_n
);
if
(
out_var
.
GetType
()
!=
in_var_type
)
{
out_var
.
SetType
(
in_var_type
);
}
}
}
}
}
}
...
...
paddle/fluid/operators/sgd_op.cu
浏览文件 @
cc7f5514
...
@@ -56,6 +56,12 @@ template <typename T>
...
@@ -56,6 +56,12 @@ template <typename T>
class
SGDOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
class
SGDOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
auto
*
param_var
=
ctx
.
InputVar
(
"Param"
);
PADDLE_ENFORCE
(
param_var
->
IsType
<
framework
::
LoDTensor
>
(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s"
,
ctx
.
Inputs
(
"Param"
).
front
(),
param_var
->
Type
().
name
());
auto
*
param
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
auto
*
param
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Param"
);
auto
*
param_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
*
param_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"ParamOut"
);
auto
*
learning_rate
=
ctx
.
Input
<
framework
::
Tensor
>
(
"LearningRate"
);
auto
*
learning_rate
=
ctx
.
Input
<
framework
::
Tensor
>
(
"LearningRate"
);
...
...
paddle/fluid/platform/cpu_info.cc
浏览文件 @
cc7f5514
...
@@ -128,7 +128,7 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
...
@@ -128,7 +128,7 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
return
cpu
.
has
(
Cpu
::
tAVX
);
return
cpu
.
has
(
Cpu
::
tAVX
);
case
avx2
:
case
avx2
:
return
cpu
.
has
(
Cpu
::
tAVX2
);
return
cpu
.
has
(
Cpu
::
tAVX2
);
case
avx512
_common
:
case
avx512
f
:
return
cpu
.
has
(
Cpu
::
tAVX512F
);
return
cpu
.
has
(
Cpu
::
tAVX512F
);
case
avx512_core
:
case
avx512_core
:
return
true
&&
cpu
.
has
(
Cpu
::
tAVX512F
)
&&
cpu
.
has
(
Cpu
::
tAVX512BW
)
&&
return
true
&&
cpu
.
has
(
Cpu
::
tAVX512F
)
&&
cpu
.
has
(
Cpu
::
tAVX512BW
)
&&
...
...
paddle/fluid/platform/cpu_info.h
浏览文件 @
cc7f5514
...
@@ -43,7 +43,7 @@ typedef enum {
...
@@ -43,7 +43,7 @@ typedef enum {
sse42
,
sse42
,
avx
,
avx
,
avx2
,
avx2
,
avx512
_common
,
avx512
f
,
avx512_core
,
avx512_core
,
avx512_core_vnni
,
avx512_core_vnni
,
avx512_mic
,
avx512_mic
,
...
...
paddle/fluid/platform/device_context.cc
浏览文件 @
cc7f5514
...
@@ -198,9 +198,9 @@ class CudnnHolder {
...
@@ -198,9 +198,9 @@ class CudnnHolder {
CUDADeviceContext
::
CUDADeviceContext
(
CUDAPlace
place
)
CUDADeviceContext
::
CUDADeviceContext
(
CUDAPlace
place
)
:
place_
(
place
),
cudnn_holder_
(
nullptr
)
{
:
place_
(
place
),
cudnn_holder_
(
nullptr
)
{
SetDeviceId
(
place_
.
device
);
SetDeviceId
(
place_
.
device
);
compute_capability
=
GetCUDAComputeCapability
(
place_
.
device
);
compute_capability
_
=
GetCUDAComputeCapability
(
place_
.
device
);
multi_process
=
GetCUDAMultiProcessors
(
place_
.
device
);
multi_process
_
=
GetCUDAMultiProcessors
(
place_
.
device
);
max_threads_per_mp
=
GetCUDAMaxThreadsPerMultiProcessor
(
place_
.
device
);
max_threads_per_mp
_
=
GetCUDAMaxThreadsPerMultiProcessor
(
place_
.
device
);
PADDLE_ENFORCE
(
cudaStreamCreate
(
&
stream_
));
PADDLE_ENFORCE
(
cudaStreamCreate
(
&
stream_
));
eigen_stream_
.
reset
(
new
EigenCudaStreamDevice
());
eigen_stream_
.
reset
(
new
EigenCudaStreamDevice
());
eigen_stream_
->
Reinitialize
(
&
stream_
,
place
);
eigen_stream_
->
Reinitialize
(
&
stream_
,
place
);
...
@@ -211,6 +211,16 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
...
@@ -211,6 +211,16 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
cudnn_holder_
.
reset
(
new
CudnnHolder
(
&
stream_
,
place
));
cudnn_holder_
.
reset
(
new
CudnnHolder
(
&
stream_
,
place
));
}
}
driver_version_
=
GetCUDADriverVersion
(
place_
.
device
);
runtime_version_
=
GetCUDARuntimeVersion
(
place_
.
device
);
LOG
(
INFO
)
<<
"device: "
<<
place_
.
device
<<
", CUDA Capability: "
<<
compute_capability_
<<
", Driver Version: "
<<
driver_version_
/
1000
<<
"."
<<
(
driver_version_
%
100
)
/
10
<<
", Runtime Version: "
<<
runtime_version_
/
1000
<<
"."
<<
(
runtime_version_
%
100
)
/
10
;
callback_manager_
.
reset
(
new
StreamCallbackManager
(
stream_
));
callback_manager_
.
reset
(
new
StreamCallbackManager
(
stream_
));
}
}
...
@@ -232,11 +242,11 @@ void CUDADeviceContext::Wait() const {
...
@@ -232,11 +242,11 @@ void CUDADeviceContext::Wait() const {
}
}
int
CUDADeviceContext
::
GetComputeCapability
()
const
{
int
CUDADeviceContext
::
GetComputeCapability
()
const
{
return
compute_capability
;
return
compute_capability
_
;
}
}
int
CUDADeviceContext
::
GetMaxPhysicalThreadCount
()
const
{
int
CUDADeviceContext
::
GetMaxPhysicalThreadCount
()
const
{
return
multi_process
*
max_threads_per_mp
;
return
multi_process
_
*
max_threads_per_mp_
;
}
}
Eigen
::
GpuDevice
*
CUDADeviceContext
::
eigen_device
()
const
{
Eigen
::
GpuDevice
*
CUDADeviceContext
::
eigen_device
()
const
{
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
cc7f5514
...
@@ -135,9 +135,11 @@ class CUDADeviceContext : public DeviceContext {
...
@@ -135,9 +135,11 @@ class CUDADeviceContext : public DeviceContext {
cudaStream_t
stream_
;
cudaStream_t
stream_
;
cublasHandle_t
cublas_handle_
;
cublasHandle_t
cublas_handle_
;
int
compute_capability
;
int
compute_capability_
;
int
multi_process
;
int
runtime_version_
;
int
max_threads_per_mp
;
int
driver_version_
;
int
multi_process_
;
int
max_threads_per_mp_
;
mutable
std
::
mutex
mtx_
;
mutable
std
::
mutex
mtx_
;
...
...
paddle/fluid/platform/enforce.h
浏览文件 @
cc7f5514
...
@@ -130,6 +130,13 @@ struct EOFException : public std::exception {
...
@@ -130,6 +130,13 @@ struct EOFException : public std::exception {
#define UNLIKELY(condition) (condition == 0)
#define UNLIKELY(condition) (condition == 0)
#endif
#endif
#if !defined(_WIN32)
#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
#else
// there is no equivalent intrinsics in msvc.
#define LIKELY(condition) (condition != 0)
#endif
template
<
typename
...
Args
>
template
<
typename
...
Args
>
inline
typename
std
::
enable_if
<
sizeof
...(
Args
)
!=
0
,
void
>::
type
throw_on_error
(
inline
typename
std
::
enable_if
<
sizeof
...(
Args
)
!=
0
,
void
>::
type
throw_on_error
(
bool
stat
,
const
Args
&
...
args
)
{
bool
stat
,
const
Args
&
...
args
)
{
...
...
paddle/fluid/platform/gpu_info.cc
浏览文件 @
cc7f5514
...
@@ -46,6 +46,24 @@ int GetCUDAComputeCapability(int id) {
...
@@ -46,6 +46,24 @@ int GetCUDAComputeCapability(int id) {
return
device_prop
.
major
*
10
+
device_prop
.
minor
;
return
device_prop
.
major
*
10
+
device_prop
.
minor
;
}
}
int
GetCUDARuntimeVersion
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetCUDADeviceCount
(),
"id must less than GPU count"
);
int
runtime_version
=
0
;
PADDLE_ENFORCE
(
cudaRuntimeGetVersion
(
&
runtime_version
),
"cudaRuntimeGetVersion failed in "
"paddle::platform::cudaRuntimeGetVersion"
);
return
runtime_version
;
}
int
GetCUDADriverVersion
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetCUDADeviceCount
(),
"id must less than GPU count"
);
int
driver_version
=
0
;
PADDLE_ENFORCE
(
cudaDriverGetVersion
(
&
driver_version
),
"cudaDriverGetVersion failed in "
"paddle::platform::GetCUDADriverVersion"
);
return
driver_version
;
}
int
GetCUDAMultiProcessors
(
int
id
)
{
int
GetCUDAMultiProcessors
(
int
id
)
{
PADDLE_ENFORCE_LT
(
id
,
GetCUDADeviceCount
(),
"id must less than GPU count"
);
PADDLE_ENFORCE_LT
(
id
,
GetCUDADeviceCount
(),
"id must less than GPU count"
);
int
count
;
int
count
;
...
...
paddle/fluid/platform/gpu_info.h
浏览文件 @
cc7f5514
...
@@ -29,6 +29,12 @@ int GetCUDADeviceCount();
...
@@ -29,6 +29,12 @@ int GetCUDADeviceCount();
//! Get the compute capability of the ith GPU (format: major * 10 + minor)
//! Get the compute capability of the ith GPU (format: major * 10 + minor)
int
GetCUDAComputeCapability
(
int
i
);
int
GetCUDAComputeCapability
(
int
i
);
//! Get the runtime version of the ith GPU
int
GetCUDARuntimeVersion
(
int
id
);
//! Get the driver version of the ith GPU
int
GetCUDADriverVersion
(
int
id
);
//! Get the MultiProcessors of the ith GPU.
//! Get the MultiProcessors of the ith GPU.
int
GetCUDAMultiProcessors
(
int
i
);
int
GetCUDAMultiProcessors
(
int
i
);
...
...
paddle/fluid/platform/init.cc
浏览文件 @
cc7f5514
...
@@ -116,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
...
@@ -116,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
platform
::
SetNumThreads
(
FLAGS_paddle_num_threads
);
#endif
#endif
if
(
platform
::
jit
::
MayIUse
(
platform
::
jit
::
avx512
_common
))
{
if
(
platform
::
jit
::
MayIUse
(
platform
::
jit
::
avx512
f
))
{
#ifndef __AVX512F__
#ifndef __AVX512F__
LOG
(
WARNING
)
<<
"AVX512F is available, Please re-compile on local machine"
;
LOG
(
WARNING
)
<<
"AVX512F is available, Please re-compile on local machine"
;
#endif
#endif
...
...
paddle/fluid/platform/profiler.cc
浏览文件 @
cc7f5514
...
@@ -370,8 +370,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
...
@@ -370,8 +370,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
std
::
vector
<
std
::
vector
<
Event
>>
merged_events_list
;
std
::
vector
<
std
::
vector
<
Event
>>
merged_events_list
;
if
(
merge_thread
)
{
if
(
merge_thread
)
{
std
::
vector
<
Event
>
merged_events
;
std
::
vector
<
Event
>
merged_events
;
for
(
in
t
i
=
0
;
i
<
events
.
size
();
++
i
)
{
for
(
size_
t
i
=
0
;
i
<
events
.
size
();
++
i
)
{
for
(
in
t
j
=
0
;
j
<
events
[
i
].
size
();
++
j
)
{
for
(
size_
t
j
=
0
;
j
<
events
[
i
].
size
();
++
j
)
{
merged_events
.
push_back
(
events
[
i
][
j
]);
merged_events
.
push_back
(
events
[
i
][
j
]);
}
}
}
}
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
cc7f5514
...
@@ -57,6 +57,10 @@ limitations under the License. */
...
@@ -57,6 +57,10 @@ limitations under the License. */
#include "pybind11/stl.h"
#include "pybind11/stl.h"
DEFINE_bool
(
reader_queue_speed_test_mode
,
false
,
"If set true, the queue.pop will only get data from queue but not "
"remove the data from queue for speed testing"
);
// disable auto conversion to list in Python
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE
(
paddle
::
framework
::
LoDTensorArray
);
PYBIND11_MAKE_OPAQUE
(
paddle
::
framework
::
LoDTensorArray
);
...
@@ -170,14 +174,14 @@ PYBIND11_PLUGIN(core) {
...
@@ -170,14 +174,14 @@ PYBIND11_PLUGIN(core) {
A LoDTensor X can look like the example below. It contains 2 sequences.
A LoDTensor X can look like the example below. It contains 2 sequences.
The first has length 2 and the second has length 3, as described by x.lod.
The first has length 2 and the second has length 3, as described by x.lod.
The first tensor dimension
6
=2+3 is calculated from LoD if it's available.
The first tensor dimension
5
=2+3 is calculated from LoD if it's available.
It means the total number of sequence element. In X, each element has 2
It means the total number of sequence element. In X, each element has 2
columns, hence [
6
, 2].
columns, hence [
5
, 2].
x.lod = [[2, 3]]
x.lod = [[2, 3]]
x.data = [[1, 2], [3, 4],
x.data = [[1, 2], [3, 4],
[5, 6], [7, 8], [9, 10]
, [11, 12]
]
[5, 6], [7, 8], [9, 10]]
x.shape = [
6
, 2]
x.shape = [
5
, 2]
LoD can have multiple levels (for example, a paragraph can have multiple
LoD can have multiple levels (for example, a paragraph can have multiple
sentences and a sentence can have multiple words). In the following
sentences and a sentence can have multiple words). In the following
...
@@ -380,7 +384,8 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -380,7 +384,8 @@ All parameter, weight, gradient are variables in Paddle.
return
make_ddim
(
shape
);
return
make_ddim
(
shape
);
});
});
auto
*
holder
=
var
.
GetMutable
<
LoDTensorBlockingQueueHolder
>
();
auto
*
holder
=
var
.
GetMutable
<
LoDTensorBlockingQueueHolder
>
();
holder
->
InitOnce
(
capacity
,
dims
);
holder
->
InitOnce
(
capacity
,
dims
,
FLAGS_reader_queue_speed_test_mode
);
return
holder
->
GetQueue
();
return
holder
->
GetQueue
();
},
},
py
::
return_value_policy
::
copy
);
py
::
return_value_policy
::
copy
);
...
@@ -667,16 +672,17 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -667,16 +672,17 @@ All parameter, weight, gradient are variables in Paddle.
ExecutionStrategy allows the user to more preciously control how to run
ExecutionStrategy allows the user to more preciously control how to run
the program in ParallelExecutor by setting the property.
the program in ParallelExecutor by setting the property.
The available properties include:
Examples:
use_cuda (bool): Whether to use CUDA or not. Default True.
.. code-block:: python
num_threads (int): The number of threads that used to run the
operators in ParallelExecutor. If it is not set, it will be
exec_strategy = fluid.ExecutionStrategy()
set in ParallelExecutor according to the device count.
exec_strategy.num_threads = 4
Default 0.
allow_op_delay (bool): Whether to delay the communication operators
train_exe = fluid.ParallelExecutor(use_cuda=True,
to run. Default False.
loss_name=loss.name,
num_iteration_per_drop_scope (int): how many iterations between
exec_strategy=exec_strategy)
the two dropping local scopes. Default 100.
train_loss, = train_exe.run([loss.name], feed=feed_dict)
)DOC"
);
)DOC"
);
...
@@ -686,19 +692,34 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -686,19 +692,34 @@ All parameter, weight, gradient are variables in Paddle.
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
num_threads_
;
},
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
num_threads_
;
},
[](
ExecutionStrategy
&
self
,
size_t
num_threads
)
{
[](
ExecutionStrategy
&
self
,
size_t
num_threads
)
{
self
.
num_threads_
=
num_threads
;
self
.
num_threads_
=
num_threads
;
})
},
R"DOC(The type is INT, num_threads represents the size of thread pool that
used to run the operators of the current program in ParallelExecutor.
If :math:`num\_threads=1`, all the operators will execute one by one,
but the order maybe difference between iterations.
If it is not set, it will be set in ParallelExecutor according to the
device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU,
:math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor.
if it is not set, ParallelExecutor will get the cpu count by calling
`multiprocessing.cpu_count()`. Default 0.)DOC"
)
.
def_property
(
.
def_property
(
"use_cuda"
,
"use_cuda"
,
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
use_cuda_
;
},
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
use_cuda_
;
},
[](
ExecutionStrategy
&
self
,
bool
use_cuda
)
{
[](
ExecutionStrategy
&
self
,
bool
use_cuda
)
{
self
.
use_cuda_
=
use_cuda
;
self
.
use_cuda_
=
use_cuda
;
})
})
// FIXME(chengduo): Doesn't add doc for 'use_cuda', use_cuda may
// make user confuse, because ParallelExecutor has a parameter named
// 'use_cuda' too, in current implementation, ParallelExecutor's
// 'use_cuda' will rewrite ExecutionStrategy's 'use_cuda'.
.
def_property
(
.
def_property
(
"allow_op_delay"
,
"allow_op_delay"
,
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
allow_op_delay_
;
},
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
allow_op_delay_
;
},
[](
ExecutionStrategy
&
self
,
bool
allow_op_delay
)
{
[](
ExecutionStrategy
&
self
,
bool
allow_op_delay
)
{
self
.
allow_op_delay_
=
allow_op_delay
;
self
.
allow_op_delay_
=
allow_op_delay
;
})
},
R"DOC(The type is BOOL, allow_op_delay represents whether to delay the
communication operators to run, it may make the execution faster.
Note that in some models, allow_op_delay may cause program hang. Default False.)DOC"
)
.
def_property
(
.
def_property
(
"num_iteration_per_drop_scope"
,
"num_iteration_per_drop_scope"
,
[](
const
ExecutionStrategy
&
self
)
{
[](
const
ExecutionStrategy
&
self
)
{
...
@@ -706,7 +727,19 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -706,7 +727,19 @@ All parameter, weight, gradient are variables in Paddle.
},
},
[](
ExecutionStrategy
&
self
,
size_t
num_iteration_per_drop_scope
)
{
[](
ExecutionStrategy
&
self
,
size_t
num_iteration_per_drop_scope
)
{
self
.
num_iteration_per_drop_scope_
=
num_iteration_per_drop_scope
;
self
.
num_iteration_per_drop_scope_
=
num_iteration_per_drop_scope
;
});
},
R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
many iterations to clean up the temp variables which
is generated during execution. It may make the execution faster,
because the temp variable's shape maybe the same between two iterations. Default 100.
NOTES:
1. If you fetch data when calling the 'run', the ParallelExecutor
will clean up the temp variables at the end of the current iteration.
2. In some NLP model, it may cause the GPU memory is insufficient,
in this case, you should reduce `num_iteration_per_drop_scope`.
)DOC"
);
exec_strategy
.
def_property
(
exec_strategy
.
def_property
(
"use_experimental_executor"
,
"use_experimental_executor"
,
[](
const
ExecutionStrategy
&
self
)
{
[](
const
ExecutionStrategy
&
self
)
{
...
@@ -721,20 +754,17 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -721,20 +754,17 @@ All parameter, weight, gradient are variables in Paddle.
BuildStrategy allows the user to more preciously control how to
BuildStrategy allows the user to more preciously control how to
build the SSA Graph in ParallelExecutor by setting the property.
build the SSA Graph in ParallelExecutor by setting the property.
The available properties include:
Examples:
reduce_strategy (str): There are two reduce strategies, 'AllReduce'
.. code-block:: python
and 'Reduce'. If you want that all parameters will be optimized
on all devices, you can choose 'AllReduce'; if you choose
build_strategy = fluid.BuildStrategy()
'Reduce', all parameters will be evenly allocated to different
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
devices for optimization, and then broadcast the optimized
parameter to other devices. Default 'AllReduce'.
train_exe = fluid.ParallelExecutor(use_cuda=True,
gradient_scale_strategy (str): There are two ways of defining loss@grad,
loss_name=loss.name,
'CoeffNumDevice' and 'Customized'. By default, ParallelExecutor
build_strategy=build_strategy)
sets the loss@grad according to the number of devices. If you want
to customize loss@grad, you can choose 'Customized'.
train_loss, = train_exe.run([loss.name], feed=feed_dict)
Default 'CoeffNumDevice'.
debug_graphviz_path (str): Whether to write the SSA Graph to file in the
form of graphviz. It is useful for debugging. Default "".
)DOC"
);
)DOC"
);
py
::
enum_
<
BuildStrategy
::
ReduceStrategy
>
(
build_strategy
,
"ReduceStrategy"
)
py
::
enum_
<
BuildStrategy
::
ReduceStrategy
>
(
build_strategy
,
"ReduceStrategy"
)
...
@@ -753,31 +783,51 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -753,31 +783,51 @@ All parameter, weight, gradient are variables in Paddle.
[](
const
BuildStrategy
&
self
)
{
return
self
.
reduce_
;
},
[](
const
BuildStrategy
&
self
)
{
return
self
.
reduce_
;
},
[](
BuildStrategy
&
self
,
BuildStrategy
::
ReduceStrategy
strategy
)
{
[](
BuildStrategy
&
self
,
BuildStrategy
::
ReduceStrategy
strategy
)
{
self
.
reduce_
=
strategy
;
self
.
reduce_
=
strategy
;
})
},
R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor,
'AllReduce' and 'Reduce'. If you want that all the parameters'
optimization are done on all devices independently, you should choose 'AllReduce';
if you choose 'Reduce', all the parameters' optimization will be evenly distributed
to different devices, and then broadcast the optimized parameter to other devices.
In some models, `Reduce` is faster. Default 'AllReduce'. )DOC"
)
.
def_property
(
.
def_property
(
"gradient_scale_strategy"
,
"gradient_scale_strategy"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
gradient_scale_
;
},
[](
const
BuildStrategy
&
self
)
{
return
self
.
gradient_scale_
;
},
[](
BuildStrategy
&
self
,
[](
BuildStrategy
&
self
,
BuildStrategy
::
GradientScaleStrategy
strategy
)
{
BuildStrategy
::
GradientScaleStrategy
strategy
)
{
self
.
gradient_scale_
=
strategy
;
self
.
gradient_scale_
=
strategy
;
})
},
R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in
ParallelExecutor, 'CoeffNumDevice', 'One' and 'Customized'. By default,
ParallelExecutor sets the :math:`loss@grad` according to the number of devices.
If you want to customize :math:`loss@grad`, you can choose 'Customized'.
Default 'CoeffNumDevice'.)DOC"
)
.
def_property
(
.
def_property
(
"debug_graphviz_path"
,
"debug_graphviz_path"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
debug_graphviz_path_
;
},
[](
const
BuildStrategy
&
self
)
{
return
self
.
debug_graphviz_path_
;
},
[](
BuildStrategy
&
self
,
const
std
::
string
&
path
)
{
[](
BuildStrategy
&
self
,
const
std
::
string
&
path
)
{
self
.
debug_graphviz_path_
=
path
;
self
.
debug_graphviz_path_
=
path
;
})
},
R"DOC(The type is STR, debug_graphviz_path indicate the path that
writing the SSA Graph to file in the form of graphviz, you.
It is useful for debugging. Default "")DOC"
)
.
def_property
(
.
def_property
(
"enable_data_balance"
,
"enable_data_balance"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
enable_data_balance_
;
},
[](
const
BuildStrategy
&
self
)
{
return
self
.
enable_data_balance_
;
},
[](
BuildStrategy
&
self
,
bool
b
)
{
self
.
enable_data_balance_
=
b
;
})
[](
BuildStrategy
&
self
,
bool
b
)
{
.
def_property
(
"fuse_elewise_add_act_ops"
,
self
.
enable_data_balance_
=
b
;
[](
const
BuildStrategy
&
self
)
{
})
// FIXME(chengudo): enable_data_balance seems not important
return
self
.
fuse_elewise_add_act_ops_
;
.
def_property
(
},
"fuse_elewise_add_act_ops"
,
[](
BuildStrategy
&
self
,
bool
b
)
{
[](
const
BuildStrategy
&
self
)
{
self
.
fuse_elewise_add_act_ops_
=
b
;
return
self
.
fuse_elewise_add_act_ops_
;
})
},
[](
BuildStrategy
&
self
,
bool
b
)
{
self
.
fuse_elewise_add_act_ops_
=
b
;
},
R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
to fuse elementwise_add_op and activation_op,
it may make the execution faster. Default False)DOC"
)
.
def
(
"_create_passes_from_strategy"
,
.
def
(
"_create_passes_from_strategy"
,
[](
BuildStrategy
&
self
)
->
std
::
shared_ptr
<
ir
::
PassBuilder
>
{
[](
BuildStrategy
&
self
)
->
std
::
shared_ptr
<
ir
::
PassBuilder
>
{
return
self
.
CreatePassesFromStrategy
();
return
self
.
CreatePassesFromStrategy
();
...
...
paddle/fluid/train/demo/README.md
浏览文件 @
cc7f5514
...
@@ -15,7 +15,7 @@ cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
...
@@ -15,7 +15,7 @@ cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
-DWITH_MKL=OFF \
-DWITH_MKL=OFF \
-DWITH_MKLDNN=OFF
-DWITH_MKLDNN=OFF
make -j8
make -j8
make -j8
inference
_lib_dist
make -j8
fluid
_lib_dist
```
```
### step 2. generate program desc
### step 2. generate program desc
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
cc7f5514
...
@@ -648,25 +648,25 @@ function gen_capi_package() {
...
@@ -648,25 +648,25 @@ function gen_capi_package() {
fi
fi
}
}
function
gen_fluid_
inference_
lib
()
{
function
gen_fluid_lib
()
{
mkdir
-p
${
PADDLE_ROOT
}
/build
mkdir
-p
${
PADDLE_ROOT
}
/build
cd
${
PADDLE_ROOT
}
/build
cd
${
PADDLE_ROOT
}
/build
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
&&
${
WITH_INFERENCE
:-
ON
}
==
"ON"
]]
;
then
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
&&
${
WITH_INFERENCE
:-
ON
}
==
"ON"
]]
;
then
cat
<<
EOF
cat
<<
EOF
========================================
========================================
Generating fluid
inference library
...
Generating fluid
library for train and inference
...
========================================
========================================
EOF
EOF
cmake ..
-DWITH_DISTRIBUTE
=
OFF
cmake ..
-DWITH_DISTRIBUTE
=
OFF
make
-j
`
nproc
`
inference
_lib_dist
make
-j
`
nproc
`
fluid
_lib_dist
fi
fi
}
}
function
tar_fluid_
inference_
lib
()
{
function
tar_fluid_lib
()
{
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
&&
${
WITH_INFERENCE
:-
ON
}
==
"ON"
]]
;
then
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
&&
${
WITH_INFERENCE
:-
ON
}
==
"ON"
]]
;
then
cat
<<
EOF
cat
<<
EOF
========================================
========================================
Taring fluid
inference library
...
Taring fluid
library for train and inference
...
========================================
========================================
EOF
EOF
cd
${
PADDLE_ROOT
}
/build
cd
${
PADDLE_ROOT
}
/build
...
@@ -675,11 +675,11 @@ EOF
...
@@ -675,11 +675,11 @@ EOF
fi
fi
}
}
function
test_fluid_
inference_
lib
()
{
function
test_fluid_lib
()
{
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
&&
${
WITH_INFERENCE
:-
ON
}
==
"ON"
]]
;
then
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
&&
${
WITH_INFERENCE
:-
ON
}
==
"ON"
]]
;
then
cat
<<
EOF
cat
<<
EOF
========================================
========================================
Testing fluid
inference library
...
Testing fluid
library for inference
...
========================================
========================================
EOF
EOF
cd
${
PADDLE_ROOT
}
/paddle/fluid/inference/api/demo_ci
cd
${
PADDLE_ROOT
}
/paddle/fluid/inference/api/demo_ci
...
@@ -731,9 +731,9 @@ function main() {
...
@@ -731,9 +731,9 @@ function main() {
;;
;;
fluid_inference_lib
)
fluid_inference_lib
)
cmake_gen
${
PYTHON_ABI
:-
""
}
cmake_gen
${
PYTHON_ABI
:-
""
}
gen_fluid_
inference_
lib
gen_fluid_lib
tar_fluid_
inference_
lib
tar_fluid_lib
test_fluid_
inference_
lib
test_fluid_lib
;;
;;
check_style
)
check_style
)
check_style
check_style
...
@@ -744,8 +744,8 @@ function main() {
...
@@ -744,8 +744,8 @@ function main() {
assert_api_not_changed
${
PYTHON_ABI
:-
""
}
assert_api_not_changed
${
PYTHON_ABI
:-
""
}
run_test
run_test
gen_capi_package
gen_capi_package
gen_fluid_
inference_
lib
gen_fluid_lib
test_fluid_
inference_
lib
test_fluid_lib
assert_api_spec_approvals
assert_api_spec_approvals
;;
;;
maccheck
)
maccheck
)
...
...
python/paddle/fluid/__init__.py
浏览文件 @
cc7f5514
...
@@ -113,7 +113,8 @@ def __bootstrap__():
...
@@ -113,7 +113,8 @@ def __bootstrap__():
'use_pinned_memory'
,
'check_nan_inf'
,
'benchmark'
,
'warpctc_dir'
,
'use_pinned_memory'
,
'check_nan_inf'
,
'benchmark'
,
'warpctc_dir'
,
'eager_delete_scope'
,
'use_mkldnn'
,
'initial_cpu_memory_in_mb'
,
'eager_delete_scope'
,
'use_mkldnn'
,
'initial_cpu_memory_in_mb'
,
'init_allocated_mem'
,
'free_idle_memory'
,
'paddle_num_threads'
,
'init_allocated_mem'
,
'free_idle_memory'
,
'paddle_num_threads'
,
"dist_threadpool_size"
,
'cpu_deterministic'
,
'eager_delete_tensor_gb'
'dist_threadpool_size'
,
'cpu_deterministic'
,
'eager_delete_tensor_gb'
,
'reader_queue_speed_test_mode'
]
]
if
core
.
is_compiled_with_dist
():
if
core
.
is_compiled_with_dist
():
read_env_flags
.
append
(
'rpc_deadline'
)
read_env_flags
.
append
(
'rpc_deadline'
)
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
cc7f5514
...
@@ -56,6 +56,7 @@ __all__ = [
...
@@ -56,6 +56,7 @@ __all__ = [
'sequence_expand'
,
'sequence_expand'
,
'sequence_expand_as'
,
'sequence_expand_as'
,
'sequence_pad'
,
'sequence_pad'
,
'sequence_unpad'
,
'lstm_unit'
,
'lstm_unit'
,
'reduce_sum'
,
'reduce_sum'
,
'reduce_mean'
,
'reduce_mean'
,
...
@@ -64,6 +65,7 @@ __all__ = [
...
@@ -64,6 +65,7 @@ __all__ = [
'reduce_prod'
,
'reduce_prod'
,
'sequence_first_step'
,
'sequence_first_step'
,
'sequence_last_step'
,
'sequence_last_step'
,
'sequence_slice'
,
'dropout'
,
'dropout'
,
'split'
,
'split'
,
'ctc_greedy_decoder'
,
'ctc_greedy_decoder'
,
...
@@ -107,6 +109,7 @@ __all__ = [
...
@@ -107,6 +109,7 @@ __all__ = [
'log'
,
'log'
,
'crop'
,
'crop'
,
'rank_loss'
,
'rank_loss'
,
'margin_rank_loss'
,
'elu'
,
'elu'
,
'relu6'
,
'relu6'
,
'pow'
,
'pow'
,
...
@@ -1901,6 +1904,76 @@ def sequence_last_step(input):
...
@@ -1901,6 +1904,76 @@ def sequence_last_step(input):
return
sequence_pool
(
input
=
input
,
pool_type
=
"last"
)
return
sequence_pool
(
input
=
input
,
pool_type
=
"last"
)
def
sequence_slice
(
input
,
offset
,
length
,
name
=
None
):
"""
**Sequence Slice Layer**
The layer crops a subsequence from given sequence with given start
offset and subsequence length.
It only supports sequence data (LoDTensor with lod_level equal to 1).
.. code-block:: text
- Case:
Given the input Variable **input**:
input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]],
input.lod = [[3, 2]],
input.dims = (5, 2),
with offset.data = [[0], [1]] and length.data = [[2], [1]],
the output Variable will be
out.data = [[a1, a2], [b1, b2], [e1, e2]],
out.lod = [[2, 1]],
out.dims = (3, 2).
NOTE: The first dimension size of **input**, **offset** and **length**
should be equal. The **offset** should start from 0.
Args:
input(Variable): The input Variable which consists of the complete
sequences.
offset(Variable): The offset to slice each sequence.
length(Variable): The length of each subsequence.
name(str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
Returns:
Variable: The output subsequences.
Examples:
.. code-block:: python
import numpy as np
seqs = fluid.layers.data(name='x', shape=[10, 5],
dtype='float32', lod_level=1)
offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32"))
length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32"))
subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
length=length)
"""
helper
=
LayerHelper
(
"sequence_slice"
,
**
locals
())
dtype
=
helper
.
input_dtype
()
out
=
helper
.
create_tmp_variable
(
dtype
)
offset
.
stop_gradient
=
True
length
.
stop_gradient
=
True
helper
.
append_op
(
type
=
"sequence_slice"
,
inputs
=
{
"X"
:
input
,
"Offset"
:
offset
,
"Length"
:
length
},
outputs
=
{
"Out"
:
out
})
return
out
@
templatedoc
()
@
templatedoc
()
def
pool2d
(
input
,
def
pool2d
(
input
,
pool_size
=-
1
,
pool_size
=-
1
,
...
@@ -2792,7 +2865,7 @@ def sequence_expand_as(x, y, name=None):
...
@@ -2792,7 +2865,7 @@ def sequence_expand_as(x, y, name=None):
@
templatedoc
()
@
templatedoc
()
def
sequence_pad
(
x
,
pad_value
,
maxlen
=
None
):
def
sequence_pad
(
x
,
pad_value
,
maxlen
=
None
,
name
=
None
):
"""
"""
${comment}
${comment}
...
@@ -2806,7 +2879,9 @@ def sequence_pad(x, pad_value, maxlen=None):
...
@@ -2806,7 +2879,9 @@ def sequence_pad(x, pad_value, maxlen=None):
None or any positive int. When it is None, all sequences will be
None or any positive int. When it is None, all sequences will be
padded up to the length of the longest one among them; when it a
padded up to the length of the longest one among them; when it a
certain positive value, it must be greater than the length of the
certain positive value, it must be greater than the length of the
longest original sequence."
longest original sequence.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Returns:
Variable: The padded sequence batch and the original lengths before
Variable: The padded sequence batch and the original lengths before
...
@@ -2843,6 +2918,66 @@ def sequence_pad(x, pad_value, maxlen=None):
...
@@ -2843,6 +2918,66 @@ def sequence_pad(x, pad_value, maxlen=None):
return
out
,
length
return
out
,
length
def
sequence_unpad
(
x
,
length
,
name
=
None
):
"""
**Sequence Unpad Layer**
This layer removes the padding data in the input sequences and convert
them into sequences with actual length as output, identitied by lod
information.
.. code-block:: text
Example:
Given input Variable **x**:
x.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0],
[ 6.0, 7.0, 8.0, 9.0, 10.0],
[11.0, 12.0, 13.0, 14.0, 15.0]],
in which there are 3 sequences padded to length 5, and the acutal length
specified by input Variable **length**:
length.data = [[2], [3], [4]],
after unpadding, the output Variable will be:
out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
out.lod = [[2, 3, 4]]
Args:
x(Variable): Input Variable which contains the padded sequences with
equal length.
length(Variable): The Variable that specifies the actual ength of
sequences after unpadding.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The Variable contains the unpadded sequences.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[10, 5], dtype='float32')
len = fluid.layers.data(name='length', shape=[1], dtype='int64')
out = fluid.layers.sequence_unpad(x=x, length=len)
"""
helper
=
LayerHelper
(
'sequence_unpad'
,
input
=
x
,
**
locals
())
dtype
=
helper
.
input_dtype
()
out
=
helper
.
create_tmp_variable
(
dtype
)
length
.
stop_gradient
=
True
helper
.
append_op
(
type
=
'sequence_unpad'
,
inputs
=
{
'X'
:
x
,
'Length'
:
length
},
outputs
=
{
'Out'
:
out
})
return
out
def
beam_search
(
pre_ids
,
def
beam_search
(
pre_ids
,
pre_scores
,
pre_scores
,
ids
,
ids
,
...
@@ -5827,6 +5962,54 @@ def rank_loss(label, left, right, name=None):
...
@@ -5827,6 +5962,54 @@ def rank_loss(label, left, right, name=None):
return
out
return
out
def
margin_rank_loss
(
label
,
left
,
right
,
margin
=
0.1
,
name
=
None
):
"""
Margin Ranking Loss Layer for ranking problem,
which compares left score and right score passed in.
The ranking loss can be defined as following equation:
.. math::
rank\_loss &= max(0, -label * (left - right) + margin)
Args:
label (Variable): Indicates whether the left is ranked higher than the right or not.
left (Variable): Ranking score for left.
right (Variable): Ranking score for right.
margin (float): Indicates the given margin.
name (str|None): A name for this layer (optional). If set None, the layer
will be named automatically.
Returns:
Variable: The ranking loss.
Raises:
ValueError: Any of label, left, and right is not a Variable.
Examples:
.. code-block:: python
label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32")
left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32")
right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32")
out = fluid.layers.margin_rank_loss(label, left, right)
"""
helper
=
LayerHelper
(
'margin_rank_loss'
,
**
locals
())
if
not
isinstance
(
label
,
Variable
):
raise
ValueError
(
"The label should be a Variable."
)
if
not
isinstance
(
left
,
Variable
):
raise
ValueError
(
"The left should be a Variable."
)
if
not
isinstance
(
right
,
Variable
):
raise
ValueError
(
"The right should be a Variable."
)
out
=
helper
.
create_tmp_variable
(
left
.
dtype
)
act
=
helper
.
create_tmp_variable
(
left
.
dtype
)
helper
.
append_op
(
type
=
'margin_rank_loss'
,
inputs
=
{
"Label"
:
label
,
"X1"
:
left
,
"X2"
:
right
},
outputs
=
{
'Out'
:
out
,
'Activated'
:
act
},
attrs
=
{
'margin'
:
margin
})
return
out
def
pad2d
(
input
,
def
pad2d
(
input
,
paddings
=
[
0
,
0
,
0
,
0
],
paddings
=
[
0
,
0
,
0
,
0
],
mode
=
'constant'
,
mode
=
'constant'
,
...
@@ -6290,6 +6473,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
...
@@ -6290,6 +6473,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
outputs
=
{
'Out'
:
out
},
outputs
=
{
'Out'
:
out
},
attrs
=
{
'win_size'
:
win_size
,
attrs
=
{
'win_size'
:
win_size
,
'pad_value'
:
pad_value
})
'pad_value'
:
pad_value
})
return
out
def
sequence_mask
(
x
,
maxlen
=
None
,
dtype
=
'int64'
,
name
=
None
):
def
sequence_mask
(
x
,
maxlen
=
None
,
dtype
=
'int64'
,
name
=
None
):
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
cc7f5514
...
@@ -659,6 +659,9 @@ class AdamaxOptimizer(Optimizer):
...
@@ -659,6 +659,9 @@ class AdamaxOptimizer(Optimizer):
optimizer = fluid.optimizer.Adamax(learning_rate=0.2)
optimizer = fluid.optimizer.Adamax(learning_rate=0.2)
optimizer.minimize(cost)
optimizer.minimize(cost)
Notes:
Currently, AdamaxOptimizer doesn't support sparse parameter optimization.
"""
"""
_moment_acc_str
=
"moment"
_moment_acc_str
=
"moment"
_inf_norm_acc_str
=
"inf_norm"
_inf_norm_acc_str
=
"inf_norm"
...
@@ -778,6 +781,9 @@ class DecayedAdagradOptimizer(Optimizer):
...
@@ -778,6 +781,9 @@ class DecayedAdagradOptimizer(Optimizer):
optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2)
optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2)
optimizer.minimize(cost)
optimizer.minimize(cost)
Notes:
Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.
"""
"""
_moment_acc_str
=
"moment"
_moment_acc_str
=
"moment"
...
@@ -858,6 +864,9 @@ class AdadeltaOptimizer(Optimizer):
...
@@ -858,6 +864,9 @@ class AdadeltaOptimizer(Optimizer):
optimizer = fluid.optimizer.Adadelta(
optimizer = fluid.optimizer.Adadelta(
learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
_, params_grads = optimizer.minimize(cost)
_, params_grads = optimizer.minimize(cost)
Notes:
Currently, AdadeltaOptimizer doesn't support sparse parameter optimization.
"""
"""
_avg_squared_grad_acc_str
=
"_avg_squared_grad"
_avg_squared_grad_acc_str
=
"_avg_squared_grad"
...
@@ -1126,6 +1135,9 @@ class FtrlOptimizer(Optimizer):
...
@@ -1126,6 +1135,9 @@ class FtrlOptimizer(Optimizer):
optimizer = fluid.optimizer.Ftrl(0.0001)
optimizer = fluid.optimizer.Ftrl(0.0001)
_, params_grads = optimizer.minimize(cost)
_, params_grads = optimizer.minimize(cost)
Notes:
Currently, FtrlOptimizer doesn't support sparse parameter optimization.
"""
"""
_squared_acc_str
=
"squared"
_squared_acc_str
=
"squared"
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
cc7f5514
...
@@ -31,15 +31,32 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy
...
@@ -31,15 +31,32 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy
class
ParallelExecutor
(
object
):
class
ParallelExecutor
(
object
):
"""
"""
ParallelExecutor can run program in parallel.
ParallelExecutor is designed for data parallelism, which focuses on distributing
the data across different nodes and every node operates on the data in parallel.
If you use ParallelExecutor to run the current program on GPU, the node means GPU
device, and ParallelExecutor will get the available GPU device automatically on
the current machine. If you use ParallelExecutor to run the current program on CPU,
the node means the CPU device, and you can specify the CPU device number by adding
'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable
is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number
of CPUs in the system.
Args:
Args:
use_cuda (bool): Whether to use CUDA or not.
use_cuda (bool): Whether to use CUDA or not.
loss_name (str): The loss name must set in training. Default None.
loss_name (str): The loss name must set in training. Default None.
main_program (Program): The program that need to run, if not provided,
main_program (Program): The program that need to run, if not provided,
then default_main_program will be used. Default None.
then default_main_program will be used. Default None.
share_vars_from(ParallelExecutor): If provi
ed
, it will share variables
share_vars_from(ParallelExecutor): If provi
de
, it will share variables
from the specified ParallelExecutor. Default None.
from the specified ParallelExecutor. Default None.
exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run
the program in ParallelExecutor, for example how many threads are used to
execute the program, how many iterations to clean up the temp variables
which is generated during execution. For more information, please refer
to fluid.ExecutionStrategy. Default None.
build_strategy(BuildStrategy): build_strategy is used to control how to
build the SSA Graph in ParallelExecutor by setting the property,
for example reduce_strategy, gradient_scale_strategy. For more information,
please refer to fluid.BuildStrategy. Default None.
num_trainers(int): If greater than 1, NCCL will be initialized with
num_trainers(int): If greater than 1, NCCL will be initialized with
multiple rank of nodes, each node should have same number of GPUs.
multiple rank of nodes, each node should have same number of GPUs.
Distributed training will be enabled then. Default 1.
Distributed training will be enabled then. Default 1.
...
...
python/paddle/fluid/tests/unittests/dist_simnet_bow.py
浏览文件 @
cc7f5514
...
@@ -81,7 +81,10 @@ def get_optimizer():
...
@@ -81,7 +81,10 @@ def get_optimizer():
return
optimizer
return
optimizer
def
train_network
(
batch_size
,
is_distributed
=
False
,
is_sparse
=
False
):
def
train_network
(
batch_size
,
is_distributed
=
False
,
is_sparse
=
False
,
is_self_contained_lr
=
False
):
# query
# query
q
=
fluid
.
layers
.
data
(
q
=
fluid
.
layers
.
data
(
name
=
"query_ids"
,
shape
=
[
1
],
dtype
=
"int64"
,
lod_level
=
1
)
name
=
"query_ids"
,
shape
=
[
1
],
dtype
=
"int64"
,
lod_level
=
1
)
...
@@ -93,7 +96,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
...
@@ -93,7 +96,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
param_attr
=
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.01
),
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.01
),
name
=
"__emb__"
,
name
=
"__emb__"
,
learning_rate
=
emb_lr
),
learning_rate
=
emb_lr
)
if
is_self_contained_lr
else
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.01
),
name
=
"__emb__"
),
is_sparse
=
is_sparse
)
is_sparse
=
is_sparse
)
## vsum
## vsum
q_sum
=
fluid
.
layers
.
sequence_pool
(
input
=
q_emb
,
pool_type
=
'sum'
)
q_sum
=
fluid
.
layers
.
sequence_pool
(
input
=
q_emb
,
pool_type
=
'sum'
)
...
@@ -119,7 +124,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
...
@@ -119,7 +124,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
param_attr
=
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.01
),
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.01
),
name
=
"__emb__"
,
name
=
"__emb__"
,
learning_rate
=
emb_lr
),
learning_rate
=
emb_lr
)
if
is_self_contained_lr
else
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.01
),
name
=
"__emb__"
),
is_sparse
=
is_sparse
)
is_sparse
=
is_sparse
)
## vsum
## vsum
pt_sum
=
fluid
.
layers
.
sequence_pool
(
input
=
pt_emb
,
pool_type
=
'sum'
)
pt_sum
=
fluid
.
layers
.
sequence_pool
(
input
=
pt_emb
,
pool_type
=
'sum'
)
...
@@ -144,7 +151,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
...
@@ -144,7 +151,9 @@ def train_network(batch_size, is_distributed=False, is_sparse=False):
param_attr
=
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.01
),
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.01
),
name
=
"__emb__"
,
name
=
"__emb__"
,
learning_rate
=
emb_lr
),
learning_rate
=
emb_lr
)
if
is_self_contained_lr
else
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.01
),
name
=
"__emb__"
),
is_sparse
=
is_sparse
)
is_sparse
=
is_sparse
)
## vsum
## vsum
nt_sum
=
fluid
.
layers
.
sequence_pool
(
input
=
nt_emb
,
pool_type
=
'sum'
)
nt_sum
=
fluid
.
layers
.
sequence_pool
(
input
=
nt_emb
,
pool_type
=
'sum'
)
...
@@ -220,7 +229,10 @@ class TestDistSimnetBow2x2(TestDistRunnerBase):
...
@@ -220,7 +229,10 @@ class TestDistSimnetBow2x2(TestDistRunnerBase):
def
get_model
(
self
,
batch_size
=
2
):
def
get_model
(
self
,
batch_size
=
2
):
# Train program
# Train program
avg_cost
,
acc
,
predict
=
\
avg_cost
,
acc
,
predict
=
\
train_network
(
batch_size
,
bool
(
int
(
os
.
environ
[
"IS_DISTRIBUTED"
])),
bool
(
int
(
os
.
environ
[
"IS_SPARSE"
])))
train_network
(
batch_size
,
bool
(
int
(
os
.
environ
[
"IS_DISTRIBUTED"
])),
bool
(
int
(
os
.
environ
[
"IS_SPARSE"
])),
bool
(
int
(
os
.
environ
[
"IS_SELF_CONTAINED_LR"
])))
inference_program
=
fluid
.
default_main_program
().
clone
()
inference_program
=
fluid
.
default_main_program
().
clone
()
...
...
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
浏览文件 @
cc7f5514
...
@@ -25,7 +25,11 @@ class TestDistSimnetBowDense2x2(TestDistBase):
...
@@ -25,7 +25,11 @@ class TestDistSimnetBowDense2x2(TestDistBase):
self
.
_enforce_place
=
"CPU"
self
.
_enforce_place
=
"CPU"
def
test_simnet_bow
(
self
):
def
test_simnet_bow
(
self
):
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'0'
}
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'0'
,
'IS_SELF_CONTAINED_LR'
:
'1'
}
self
.
check_with_place
(
self
.
check_with_place
(
"dist_simnet_bow.py"
,
"dist_simnet_bow.py"
,
delta
=
1e-5
,
delta
=
1e-5
,
...
@@ -39,7 +43,11 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
...
@@ -39,7 +43,11 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
self
.
_enforce_place
=
"CPU"
self
.
_enforce_place
=
"CPU"
def
test_simnet_bow
(
self
):
def
test_simnet_bow
(
self
):
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'0'
}
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'0'
,
'IS_SELF_CONTAINED_LR'
:
'1'
}
self
.
check_with_place
(
self
.
check_with_place
(
"dist_simnet_bow.py"
,
"dist_simnet_bow.py"
,
delta
=
100
,
delta
=
100
,
...
@@ -53,7 +61,11 @@ class TestDistSimnetBowSparse2x2(TestDistBase):
...
@@ -53,7 +61,11 @@ class TestDistSimnetBowSparse2x2(TestDistBase):
self
.
_enforce_place
=
"CPU"
self
.
_enforce_place
=
"CPU"
def
test_simnet_bow
(
self
):
def
test_simnet_bow
(
self
):
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'1'
}
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'1'
,
'IS_SELF_CONTAINED_LR'
:
'1'
}
self
.
check_with_place
(
self
.
check_with_place
(
"dist_simnet_bow.py"
,
"dist_simnet_bow.py"
,
delta
=
1e-5
,
delta
=
1e-5
,
...
@@ -67,7 +79,11 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
...
@@ -67,7 +79,11 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
self
.
_enforce_place
=
"CPU"
self
.
_enforce_place
=
"CPU"
def
test_simnet_bow
(
self
):
def
test_simnet_bow
(
self
):
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'1'
}
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'1'
,
'IS_SELF_CONTAINED_LR'
:
'1'
}
self
.
check_with_place
(
self
.
check_with_place
(
"dist_simnet_bow.py"
,
"dist_simnet_bow.py"
,
delta
=
100
,
delta
=
100
,
...
@@ -75,5 +91,59 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
...
@@ -75,5 +91,59 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
need_envs
=
need_envs
)
need_envs
=
need_envs
)
class
TestDistSimnetBow2x2LookupTableSync
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
self
.
_enforce_place
=
"CPU"
def
test_simnet_bow
(
self
):
need_envs
=
{
"IS_DISTRIBUTED"
:
'1'
,
"IS_SPARSE"
:
'1'
,
'IS_SELF_CONTAINED_LR'
:
'1'
}
self
.
check_with_place
(
"dist_simnet_bow.py"
,
delta
=
1e-5
,
check_error_log
=
False
,
need_envs
=
need_envs
)
class
TestDistSimnetBow2x2LookupTableAsync
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
False
self
.
_enforce_place
=
"CPU"
def
test_simnet_bow
(
self
):
need_envs
=
{
"IS_DISTRIBUTED"
:
'1'
,
"IS_SPARSE"
:
'1'
,
'IS_SELF_CONTAINED_LR'
:
'1'
}
self
.
check_with_place
(
"dist_simnet_bow.py"
,
delta
=
100
,
check_error_log
=
False
,
need_envs
=
need_envs
)
class
TestDistSimnetBow2x2LookupTableNotContainLRSync
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
self
.
_enforce_place
=
"CPU"
def
test_simnet_bow
(
self
):
need_envs
=
{
"IS_DISTRIBUTED"
:
'1'
,
"IS_SPARSE"
:
'1'
,
'IS_SELF_CONTAINED_LR'
:
'0'
}
self
.
check_with_place
(
"dist_simnet_bow.py"
,
delta
=
1e-5
,
check_error_log
=
False
,
need_envs
=
need_envs
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
cc7f5514
...
@@ -194,6 +194,14 @@ class TestBook(unittest.TestCase):
...
@@ -194,6 +194,14 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
layers
.
sequence_expand
(
x
=
x
,
y
=
y
,
ref_level
=
1
))
self
.
assertIsNotNone
(
layers
.
sequence_expand
(
x
=
x
,
y
=
y
,
ref_level
=
1
))
print
(
str
(
program
))
print
(
str
(
program
))
def
test_sequence_unpad
(
self
):
program
=
Program
()
with
program_guard
(
program
):
x
=
layers
.
data
(
name
=
'x'
,
shape
=
[
10
,
5
],
dtype
=
'float32'
)
length
=
layers
.
data
(
name
=
'length'
,
shape
=
[
1
],
dtype
=
'int64'
)
self
.
assertIsNotNone
(
layers
.
sequence_unpad
(
x
=
x
,
length
=
length
))
print
(
str
(
program
))
def
test_lstm_unit
(
self
):
def
test_lstm_unit
(
self
):
program
=
Program
()
program
=
Program
()
with
program_guard
(
program
):
with
program_guard
(
program
):
...
@@ -406,6 +414,19 @@ class TestBook(unittest.TestCase):
...
@@ -406,6 +414,19 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
out
)
self
.
assertIsNotNone
(
out
)
print
(
str
(
program
))
print
(
str
(
program
))
def
test_sequence_slice
(
self
):
program
=
Program
()
with
program_guard
(
program
):
import
numpy
as
np
seqs
=
layers
.
data
(
name
=
'x'
,
shape
=
[
10
,
5
],
dtype
=
'float32'
,
lod_level
=
1
)
offset
=
layers
.
assign
(
input
=
np
.
array
([[
0
,
1
]]).
astype
(
'int32'
))
length
=
layers
.
assign
(
input
=
np
.
array
([[
2
,
1
]]).
astype
(
'int32'
))
out
=
layers
.
sequence_slice
(
input
=
seqs
,
offset
=
offset
,
length
=
length
)
self
.
assertIsNotNone
(
out
)
print
(
str
(
program
))
def
test_lod_reset
(
self
):
def
test_lod_reset
(
self
):
program
=
Program
()
program
=
Program
()
with
program_guard
(
program
):
with
program_guard
(
program
):
...
...
python/paddle/fluid/tests/unittests/test_rmsprop_op.py
浏览文件 @
cc7f5514
...
@@ -19,33 +19,76 @@ import unittest
...
@@ -19,33 +19,76 @@ import unittest
import
numpy
as
np
import
numpy
as
np
import
paddle.fluid.core
as
core
import
paddle.fluid.core
as
core
from
paddle.fluid.op
import
Operator
from
paddle.fluid.op
import
Operator
import
paddle.fluid
as
fluid
def
create_selected_rows_and_tensor
(
scope
,
place
,
height
,
row_num
,
embedding_size
):
sr
=
scope
.
var
(
"@selected_rows@"
).
get_selected_rows
()
tensor
=
scope
.
var
(
"grad"
).
get_tensor
()
rows
=
np
.
random
.
random_integers
(
low
=
0
,
high
=
height
-
1
,
size
=
[
row_num
,
]).
astype
(
'int64'
)
sr_val
=
np
.
random
.
random
(
size
=
[
row_num
,
embedding_size
]).
astype
(
'float32'
)
sr
.
set_height
(
height
)
sr
.
set_rows
(
rows
)
sr
.
get_tensor
().
set
(
sr_val
,
place
)
tensor_val
=
np
.
zeros
(
shape
=
[
height
,
embedding_size
],
dtype
=
'float32'
)
for
i
in
range
(
row_num
):
row
=
rows
[
i
]
tensor_val
[
row
,
:]
=
tensor_val
[
row
,
:]
+
sr_val
[
i
,
:]
tensor
.
set
(
tensor_val
,
place
)
return
tensor_val
,
sr_val
class
TestBase
(
unittest
.
TestCase
):
class
TestBase
(
unittest
.
TestCase
):
def
setup
(
self
,
centered
,
epsilon
=
1e-6
):
def
setup
(
self
,
place
,
is_sparse
,
centered
,
size
,
row_num
=
None
,
epsilon
=
1e-6
):
np
.
random
.
seed
(
5
)
# fix seed
np
.
random
.
seed
(
5
)
# fix seed
self
.
scope
=
fluid
.
global_scope
()
self
.
place
=
place
self
.
param_name
=
"param"
self
.
param_name
=
"param"
self
.
param
=
np
.
random
.
random
(
(
123
,
321
)
).
astype
(
"float32"
)
self
.
param
=
np
.
random
.
random
(
size
).
astype
(
"float32"
)
self
.
mean_square_name
=
"mean_square"
self
.
mean_square_name
=
"mean_square"
self
.
mean_square
=
np
.
random
.
random
((
123
,
321
)).
astype
(
"float32"
)
self
.
mean_square
=
np
.
random
.
uniform
(
low
=
1
,
high
=
2
,
size
=
size
).
astype
(
"float32"
)
self
.
mean_grad_name
=
"mean_grad"
self
.
mean_grad_name
=
"mean_grad"
self
.
mean_grad
=
np
.
random
.
random
(
(
123
,
321
)
).
astype
(
"float32"
)
self
.
mean_grad
=
np
.
random
.
random
(
size
).
astype
(
"float32"
)
self
.
lr_name
=
"lr"
self
.
lr_name
=
"lr"
self
.
learning_rate
=
np
.
array
([
0.01
]).
astype
(
"float32"
)
self
.
learning_rate
=
np
.
array
([
0.01
]).
astype
(
"float32"
)
self
.
grad_name
=
"grad"
self
.
grad_name
=
"grad"
self
.
grad
=
np
.
random
.
random
((
123
,
321
)).
astype
(
"float32"
)
self
.
is_sparse
=
is_sparse
if
self
.
is_sparse
:
self
.
grad_sr_name
=
"@selected_rows@"
self
.
grad
,
self
.
grad_sr
=
create_selected_rows_and_tensor
(
self
.
scope
,
place
,
size
[
0
],
row_num
,
size
[
1
])
else
:
self
.
grad
=
np
.
random
.
random
(
size
).
astype
(
"float32"
)
grad_tensor
=
self
.
scope
.
var
(
self
.
grad_name
).
get_tensor
()
grad_tensor
.
set
(
self
.
grad
,
place
)
self
.
moment_name
=
"moment"
self
.
moment_name
=
"moment"
self
.
moment
=
np
.
zeros
((
123
,
321
)).
astype
(
"float32"
)
self
.
moment
=
np
.
random
.
uniform
(
low
=
0
,
high
=
1
,
size
=
size
).
astype
(
"float32"
)
self
.
epsilon
=
epsilon
self
.
epsilon
=
epsilon
self
.
decay
=
0.9
self
.
decay
=
0.9
self
.
momentum
=
0.
0
self
.
momentum
=
0.
1
self
.
centered
=
centered
self
.
centered
=
centered
self
.
ms_out
=
self
.
decay
*
self
.
mean_square
+
(
1
-
self
.
decay
self
.
ms_out
=
self
.
decay
*
self
.
mean_square
+
(
1
-
self
.
decay
...
@@ -61,118 +104,122 @@ class TestBase(unittest.TestCase):
...
@@ -61,118 +104,122 @@ class TestBase(unittest.TestCase):
self
.
param_out
=
self
.
param
-
self
.
moment_out
self
.
param_out
=
self
.
param
-
self
.
moment_out
def
check
(
self
,
actual_t
,
expect_t
,
place
,
out_name
,
atol
=
1e-5
,
equal_nan
=
False
):
self
.
assertTrue
(
np
.
allclose
(
actual_t
,
expect_t
,
atol
=
atol
,
equal_nan
=
equal_nan
),
"Output ("
+
out_name
+
") has diff at "
+
str
(
place
)
+
"
\n
Expect "
+
str
(
expect_t
)
+
"
\n
"
+
"But Got"
+
str
(
actual_t
))
class
TestRmspropOp
(
TestBase
):
def
check_with_place
(
self
,
place
,
centered
,
epsilon
):
self
.
setup
(
centered
,
epsilon
)
scope
=
core
.
Scope
()
# create and initialize Param Variable
# create and initialize Param Variable
param
=
scope
.
var
(
self
.
param_name
).
get_tensor
()
self
.
param_tensor
=
self
.
scope
.
var
(
self
.
param_name
).
get_tensor
()
param
.
set
(
self
.
param
,
place
)
self
.
param_tensor
.
set
(
self
.
param
,
place
)
mean_square
=
scope
.
var
(
self
.
mean_square_name
).
get_tensor
()
self
.
mean_square_tensor
=
self
.
scope
.
var
(
mean_square
.
set
(
self
.
mean_square
,
place
)
self
.
mean_square_name
).
get_tensor
()
self
.
mean_square_tensor
.
set
(
self
.
mean_square
,
place
)
lr
=
scope
.
var
(
self
.
lr_name
).
get_tensor
()
lr
=
s
elf
.
s
cope
.
var
(
self
.
lr_name
).
get_tensor
()
lr
.
set
(
self
.
learning_rate
,
place
)
lr
.
set
(
self
.
learning_rate
,
place
)
grad
=
scope
.
var
(
self
.
grad
_name
).
get_tensor
()
self
.
moment_tensor
=
self
.
scope
.
var
(
self
.
moment
_name
).
get_tensor
()
grad
.
set
(
self
.
grad
,
place
)
self
.
moment_tensor
.
set
(
self
.
moment
,
place
)
moment
=
scope
.
var
(
self
.
moment_name
).
get_tensor
()
if
self
.
centered
:
moment
.
set
(
self
.
moment
,
place
)
self
.
mean_grad_tensor
=
self
.
scope
.
var
(
self
.
mean_grad_name
).
get_tensor
()
self
.
mean_grad_tensor
.
set
(
self
.
mean_grad
,
place
)
# create and run sgd operator
def
check
(
self
,
actual_t
,
expect_t
,
place
,
out_name
,
atol
=
1e-5
):
self
.
assertTrue
(
np
.
allclose
(
actual_t
,
expect_t
,
atol
=
atol
),
"Output ("
+
out_name
+
") has diff at "
+
str
(
place
)
+
"
\n
Expect "
+
str
(
expect_t
)
+
"
\n
"
+
"But Got"
+
str
(
actual_t
))
if
self
.
centered
:
mean_grad
=
scope
.
var
(
self
.
mean_grad_name
).
get_tensor
()
class
TestRmspropOp
(
TestBase
):
mean_grad
.
set
(
self
.
mean_grad
,
place
)
def
check_with_place
(
self
,
place
,
rmsprop_op
=
Operator
(
is_sparse
,
"rmsprop"
,
centered
,
Param
=
self
.
param_name
,
size
,
Grad
=
self
.
grad_name
,
row_num
=
None
,
MeanSquare
=
self
.
mean_square_name
,
epsilon
=
1e-6
):
MeanGrad
=
self
.
mean_grad_name
,
self
.
setup
(
place
,
is_sparse
,
centered
,
size
,
row_num
,
epsilon
)
Moment
=
self
.
moment_name
,
self
.
run_and_check
()
LearningRate
=
self
.
lr_name
,
ParamOut
=
self
.
param_name
,
def
run_and_check
(
self
):
MeanSquareOut
=
self
.
mean_square_name
,
grad_name
=
self
.
grad_sr_name
if
self
.
is_sparse
else
self
.
grad_name
MomentOut
=
self
.
moment_name
,
MeanGradOut
=
self
.
mean_grad_name
,
kwargs
=
{
epsilon
=
self
.
epsilon
,
'Param'
:
self
.
param_name
,
decay
=
self
.
decay
,
'Grad'
:
grad_name
,
momentum
=
self
.
momentum
,
'MeanSquare'
:
self
.
mean_square_name
,
centered
=
True
)
'Moment'
:
self
.
moment_name
,
else
:
'LearningRate'
:
self
.
lr_name
,
rmsprop_op
=
Operator
(
'ParamOut'
:
self
.
param_name
,
"rmsprop"
,
'MeanSquareOut'
:
self
.
mean_square_name
,
Param
=
self
.
param_name
,
'MomentOut'
:
self
.
moment_name
,
Grad
=
self
.
grad_name
,
'epsilon'
:
self
.
epsilon
,
MeanSquare
=
self
.
mean_square_name
,
'decay'
:
self
.
decay
,
Moment
=
self
.
moment_name
,
'momentum'
:
self
.
momentum
,
LearningRate
=
self
.
lr_name
,
'centered'
:
self
.
centered
ParamOut
=
self
.
param_name
,
}
MeanSquareOut
=
self
.
mean_square_name
,
MomentOut
=
self
.
moment_name
,
epsilon
=
self
.
epsilon
,
decay
=
self
.
decay
,
momentum
=
self
.
momentum
,
centered
=
False
)
rmsprop_op
.
run
(
scope
,
place
)
atol
=
1e-5
equal_nan
=
False
if
self
.
centered
:
if
self
.
centered
:
atol
=
1e-3
kwargs
[
'MeanGrad'
]
=
self
.
mean_grad_name
equal_nan
=
True
kwargs
[
'MeanGradOut'
]
=
self
.
mean_grad_name
rmsprop_op
=
Operator
(
'rmsprop'
,
**
kwargs
)
atol
=
1e-6
rmsprop_op
.
run
(
self
.
scope
,
self
.
place
)
self
.
check
(
self
.
check
(
np
.
array
(
mean_square
),
self
.
ms_out
,
place
,
self
.
mean_square_name
)
np
.
array
(
self
.
mean_square_tensor
),
self
.
ms_out
,
self
.
place
,
self
.
mean_square_name
,
atol
=
atol
)
self
.
check
(
self
.
check
(
np
.
array
(
moment
),
np
.
array
(
self
.
moment_tensor
),
self
.
moment_out
,
self
.
moment_out
,
place
,
self
.
place
,
self
.
moment_name
,
self
.
moment_name
,
atol
=
atol
,
atol
=
atol
)
equal_nan
=
equal_nan
)
self
.
check
(
self
.
check
(
np
.
array
(
param
),
np
.
array
(
self
.
param_tensor
),
self
.
param_out
,
self
.
param_out
,
place
,
self
.
place
,
self
.
param_name
,
self
.
param_name
,
atol
=
atol
,
atol
=
atol
)
equal_nan
=
equal_nan
)
if
self
.
centered
:
if
self
.
centered
:
self
.
check
(
self
.
check
(
np
.
array
(
mean_grad
),
self
.
mg_out
,
place
,
self
.
mean_grad_name
)
np
.
array
(
self
.
mean_grad_tensor
),
self
.
mg_out
,
self
.
place
,
self
.
mean_grad_name
)
def
test_rmsprop
(
self
):
def
test_rmsprop
(
self
):
places
=
[
core
.
CPUPlace
()]
places
=
[
core
.
CPUPlace
()]
if
core
.
is_compiled_with_cuda
():
if
core
.
is_compiled_with_cuda
():
places
.
append
(
core
.
CUDAPlace
(
0
))
places
.
append
(
core
.
CUDAPlace
(
0
))
size
=
(
128
,
320
)
for
place
in
places
:
for
place
in
places
:
self
.
check_with_place
(
place
,
False
,
1e-6
)
for
centered
in
[
False
,
True
]:
self
.
check_with_place
(
place
,
False
,
1e-10
)
with
fluid
.
scope_guard
(
core
.
Scope
()):
self
.
check_with_place
(
place
,
True
,
1e-6
)
self
.
check_with_place
(
self
.
check_with_place
(
place
,
True
,
1e-10
)
place
,
is_sparse
=
False
,
centered
=
centered
,
size
=
size
)
with
fluid
.
scope_guard
(
core
.
Scope
()):
self
.
check_with_place
(
place
,
is_sparse
=
True
,
centered
=
centered
,
row_num
=
512
,
size
=
size
)
with
fluid
.
scope_guard
(
core
.
Scope
()):
self
.
check_with_place
(
place
,
is_sparse
=
True
,
centered
=
centered
,
row_num
=
60
,
size
=
size
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py
0 → 100644
浏览文件 @
cc7f5514
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
six
import
numpy
as
np
from
op_test
import
OpTest
class
TestSequenceUnpadOp
(
OpTest
):
def
init
(
self
):
self
.
length
=
[
2
,
3
,
4
]
self
.
x_shape
=
(
3
,
5
)
self
.
dtype
=
"float32"
def
compute
(
self
):
assert
len
(
self
.
length
)
==
self
.
x_shape
[
0
]
x
=
np
.
random
.
random
(
self
.
x_shape
).
astype
(
self
.
dtype
)
out_lod
=
[
self
.
length
]
out
=
x
[
0
,
0
:
self
.
length
[
0
]]
for
i
in
six
.
moves
.
xrange
(
1
,
x
.
shape
[
0
]):
out
=
np
.
append
(
out
,
x
[
i
,
0
:
self
.
length
[
i
]],
axis
=
0
)
out_shape
=
(
sum
(
self
.
length
),
)
if
len
(
self
.
x_shape
)
==
2
:
out_shape
=
out_shape
+
(
1
,
)
else
:
out_shape
=
out_shape
+
self
.
x_shape
[
2
:]
self
.
inputs
=
{
'X'
:
x
,
'Length'
:
np
.
array
(
self
.
length
).
astype
(
'int64'
).
reshape
(
-
1
,
1
)
}
self
.
outputs
=
{
'Out'
:
(
out
.
reshape
(
out_shape
),
out_lod
)}
def
setUp
(
self
):
self
.
op_type
=
'sequence_unpad'
self
.
init
()
self
.
compute
()
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
self
.
check_grad
([
"X"
],
"Out"
)
class
TestSequenceUnpadOp2
(
TestSequenceUnpadOp
):
def
init
(
self
):
self
.
length
=
[
2
,
3
,
4
]
self
.
x_shape
=
(
3
,
5
,
4
,
3
)
self
.
dtype
=
"float32"
class
TestSequenceUnpadOp3
(
TestSequenceUnpadOp
):
def
init
(
self
):
self
.
length
=
[
5
,
2
,
3
,
4
]
self
.
x_shape
=
(
4
,
5
,
3
,
3
,
6
)
self
.
dtype
=
"float64"
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
cc7f5514
...
@@ -788,7 +788,8 @@ in a single call.")
...
@@ -788,7 +788,8 @@ in a single call.")
tuple: (main_program, startup_program), of type "Program"
tuple: (main_program, startup_program), of type "Program"
"""
"""
pserver_prog
=
self
.
get_pserver_program
(
endpoint
)
pserver_prog
=
self
.
get_pserver_program
(
endpoint
)
pserver_startup
=
self
.
get_startup_program
(
endpoint
)
pserver_startup
=
self
.
get_startup_program
(
endpoint
,
pserver_program
=
pserver_prog
)
return
pserver_prog
,
pserver_startup
return
pserver_prog
,
pserver_startup
def
get_startup_program
(
self
,
def
get_startup_program
(
self
,
...
@@ -1118,6 +1119,7 @@ to transpile() call.")
...
@@ -1118,6 +1119,7 @@ to transpile() call.")
def
_split_table_grad_and_add_send_vars
(
self
,
program
,
pserver_endpoints
):
def
_split_table_grad_and_add_send_vars
(
self
,
program
,
pserver_endpoints
):
# 2. add split_ids_op and send_op to send gradient to pservers
# 2. add split_ids_op and send_op to send gradient to pservers
# there should only be one table_name
# there should only be one table_name
all_ops
=
program
.
global_block
().
ops
all_ops
=
program
.
global_block
().
ops
table_grad_name
=
grad_var_name
(
self
.
table_name
)
table_grad_name
=
grad_var_name
(
self
.
table_name
)
...
@@ -1142,7 +1144,7 @@ to transpile() call.")
...
@@ -1142,7 +1144,7 @@ to transpile() call.")
if
self
.
sync_mode
else
[]
if
self
.
sync_mode
else
[]
},
},
attrs
=
{
attrs
=
{
"sync_mode"
:
self
.
sync_mode
,
"sync_mode"
:
not
self
.
sync_mode
,
"epmap"
:
pserver_endpoints
,
"epmap"
:
pserver_endpoints
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
,
OP_ROLE_VAR_ATTR_NAME
:
[
OP_ROLE_VAR_ATTR_NAME
:
[
...
@@ -1188,7 +1190,15 @@ to transpile() call.")
...
@@ -1188,7 +1190,15 @@ to transpile() call.")
def
_create_table_optimize_block
(
self
,
pserver_index
,
pserver_program
,
def
_create_table_optimize_block
(
self
,
pserver_index
,
pserver_program
,
pre_block_idx
,
grad_to_block_id
):
pre_block_idx
,
grad_to_block_id
):
# STEP: create table optimize block
# STEP: create table optimize block
table_opt_block
=
pserver_program
.
_create_block
(
pre_block_idx
)
# create table param and grad var in pserver program
# create table param and grad var in pserver program
# create table optimize block in pserver program
table_opt_op
=
[
op
for
op
in
self
.
optimize_ops
if
'Param'
in
op
.
input_names
and
op
.
input
(
"Param"
)[
0
]
==
self
.
table_name
][
0
]
origin_param_var
=
self
.
origin_program
.
global_block
().
vars
[
origin_param_var
=
self
.
origin_program
.
global_block
().
vars
[
self
.
table_name
]
self
.
table_name
]
...
@@ -1204,19 +1214,16 @@ to transpile() call.")
...
@@ -1204,19 +1214,16 @@ to transpile() call.")
dtype
=
origin_param_var
.
dtype
,
dtype
=
origin_param_var
.
dtype
,
type
=
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
,
type
=
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
,
persistable
=
True
)
persistable
=
True
)
# parameter must be selected rows
# parameter must be selected rows
param_var
.
desc
.
set_type
(
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
)
param_var
.
desc
.
set_type
(
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
)
grad_var
=
pserver_program
.
global_block
().
_clone_variable
(
grad_var
=
pserver_program
.
global_block
().
_clone_variable
(
self
.
origin_program
.
global_block
().
vars
[
grad_var_name
(
self
.
origin_program
.
global_block
().
vars
[
grad_var_name
(
self
.
table_name
)])
self
.
table_name
)])
# create table optimize block in pserver program
lr_var
=
pserver_program
.
global_block
().
_clone_variable
(
table_opt_op
=
[
self
.
origin_program
.
global_block
().
vars
[
table_opt_op
.
input
(
op
for
op
in
self
.
optimize_ops
"LearningRate"
)[
0
]])
if
'Param'
in
op
.
input_names
and
op
.
input
(
"Param"
)[
0
]
==
self
.
table_name
][
0
]
table_opt_block
=
pserver_program
.
_create_block
(
pre_block_idx
)
if
self
.
sync_mode
:
if
self
.
sync_mode
:
# create grad vars in pserver program
# create grad vars in pserver program
...
@@ -1248,8 +1255,6 @@ to transpile() call.")
...
@@ -1248,8 +1255,6 @@ to transpile() call.")
grad_var
=
pserver_program
.
global_block
().
_rename_var
(
grad_var
=
pserver_program
.
global_block
().
_rename_var
(
origin_grad_name
,
splited_grad_name
)
origin_grad_name
,
splited_grad_name
)
lr_var
=
pserver_program
.
global_block
().
vars
[
table_opt_op
.
input
(
"LearningRate"
)[
0
]]
inputs
=
{
inputs
=
{
"Param"
:
[
param_var
],
"Param"
:
[
param_var
],
"Grad"
:
[
grad_var
],
"Grad"
:
[
grad_var
],
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录