Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
d0c9f7e8
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
d0c9f7e8
编写于
9月 25, 2020
作者:
S
smallv0221
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into yxp0925
Merge branch 'develop'.
上级
a35a2ee9
09f19532
变更
91
隐藏空白更改
内联
并排
Showing
91 changed file
with
2345 addition
and
520 deletion
+2345
-520
cmake/generic.cmake
cmake/generic.cmake
+1
-0
cmake/third_party.cmake
cmake/third_party.cmake
+5
-4
paddle/fluid/framework/device_worker.h
paddle/fluid/framework/device_worker.h
+4
-1
paddle/fluid/framework/distributed_strategy.proto
paddle/fluid/framework/distributed_strategy.proto
+1
-0
paddle/fluid/framework/fleet/fleet_wrapper.cc
paddle/fluid/framework/fleet/fleet_wrapper.cc
+50
-0
paddle/fluid/framework/fleet/fleet_wrapper.h
paddle/fluid/framework/fleet/fleet_wrapper.h
+8
-0
paddle/fluid/framework/fleet/heter_wrapper.cc
paddle/fluid/framework/fleet/heter_wrapper.cc
+48
-11
paddle/fluid/framework/heterxpu_trainer.cc
paddle/fluid/framework/heterxpu_trainer.cc
+119
-8
paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
...d/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
+21
-0
paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
.../framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
+8
-0
paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc
...framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc
+8
-0
paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
+7
-0
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+13
-0
paddle/fluid/framework/pull_dense_worker.cc
paddle/fluid/framework/pull_dense_worker.cc
+17
-3
paddle/fluid/framework/trainer.h
paddle/fluid/framework/trainer.h
+15
-2
paddle/fluid/framework/trainer_factory.cc
paddle/fluid/framework/trainer_factory.cc
+2
-1
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+19
-3
paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc
.../tests/api/analyzer_seq_pool1_compare_determine_tester.cc
+40
-0
paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc
.../inference/tests/api/analyzer_seq_pool1_compare_tester.cc
+39
-0
paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
...s/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
+46
-0
paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc
...erence/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc
+48
-0
paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc
.../inference/tests/api/analyzer_seq_pool1_profile_tester.cc
+42
-0
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h
...id/inference/tests/api/analyzer_seq_pool1_tester_helper.h
+7
-72
paddle/fluid/inference/tests/api/analyzer_transformer_compare_tester.cc
...nference/tests/api/analyzer_transformer_compare_tester.cc
+44
-0
paddle/fluid/inference/tests/api/analyzer_transformer_fuse_tester.cc
...d/inference/tests/api/analyzer_transformer_fuse_tester.cc
+36
-0
paddle/fluid/inference/tests/api/analyzer_transformer_profile_tester.cc
...nference/tests/api/analyzer_transformer_profile_tester.cc
+45
-0
paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h
.../inference/tests/api/analyzer_transformer_tester_helper.h
+8
-53
paddle/fluid/memory/allocation/retry_allocator_test.cc
paddle/fluid/memory/allocation/retry_allocator_test.cc
+3
-1
paddle/fluid/operators/activation_op.cc
paddle/fluid/operators/activation_op.cc
+39
-3
paddle/fluid/operators/activation_op.cu
paddle/fluid/operators/activation_op.cu
+13
-1
paddle/fluid/operators/activation_op.h
paddle/fluid/operators/activation_op.h
+21
-0
paddle/fluid/operators/batch_norm_op.cc
paddle/fluid/operators/batch_norm_op.cc
+34
-21
paddle/fluid/operators/benchmark/op_tester.cc
paddle/fluid/operators/benchmark/op_tester.cc
+20
-12
paddle/fluid/operators/benchmark/op_tester_config.cc
paddle/fluid/operators/benchmark/op_tester_config.cc
+15
-5
paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
+4
-2
paddle/fluid/operators/instance_norm_op.cc
paddle/fluid/operators/instance_norm_op.cc
+3
-3
paddle/fluid/operators/jit/benchmark.cc
paddle/fluid/operators/jit/benchmark.cc
+9
-3
paddle/fluid/operators/jit/gen/embseqpool.cc
paddle/fluid/operators/jit/gen/embseqpool.cc
+25
-5
paddle/fluid/operators/jit/gen/matmul.cc
paddle/fluid/operators/jit/gen/matmul.cc
+20
-4
paddle/fluid/operators/jit/gen/matmul.h
paddle/fluid/operators/jit/gen/matmul.h
+4
-1
paddle/fluid/operators/jit/gen/seqpool.cc
paddle/fluid/operators/jit/gen/seqpool.cc
+8
-2
paddle/fluid/operators/jit/gen/seqpool.h
paddle/fluid/operators/jit/gen/seqpool.h
+7
-2
paddle/fluid/operators/jit/gen/sgd.cc
paddle/fluid/operators/jit/gen/sgd.cc
+18
-3
paddle/fluid/operators/jit/gen/vbroadcast.cc
paddle/fluid/operators/jit/gen/vbroadcast.cc
+5
-1
paddle/fluid/operators/jit/gen_base.cc
paddle/fluid/operators/jit/gen_base.cc
+8
-3
paddle/fluid/operators/jit/helper.cc
paddle/fluid/operators/jit/helper.cc
+17
-6
paddle/fluid/operators/jit/helper.h
paddle/fluid/operators/jit/helper.h
+18
-7
paddle/fluid/operators/jit/more/mix/mix.cc
paddle/fluid/operators/jit/more/mix/mix.cc
+2
-1
paddle/fluid/operators/jit/more/mkl/mkl.h
paddle/fluid/operators/jit/more/mkl/mkl.h
+51
-10
paddle/fluid/operators/jit/refer/refer.h
paddle/fluid/operators/jit/refer/refer.h
+42
-9
paddle/fluid/operators/jit/test.cc
paddle/fluid/operators/jit/test.cc
+9
-2
paddle/fluid/operators/math/blas_impl.cu.h
paddle/fluid/operators/math/blas_impl.cu.h
+29
-0
paddle/fluid/operators/matmul_v2_op.cu
paddle/fluid/operators/matmul_v2_op.cu
+6
-4
paddle/fluid/operators/matmul_v2_op.h
paddle/fluid/operators/matmul_v2_op.h
+31
-24
paddle/fluid/operators/norm_utils.cu.h
paddle/fluid/operators/norm_utils.cu.h
+58
-17
paddle/fluid/operators/utils.h
paddle/fluid/operators/utils.h
+12
-6
paddle/fluid/platform/cuda_primitives.h
paddle/fluid/platform/cuda_primitives.h
+38
-0
paddle/scripts/installation_validate.py
paddle/scripts/installation_validate.py
+1
-0
paddle/scripts/paddle_build.bat
paddle/scripts/paddle_build.bat
+10
-7
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+0
-5
python/paddle/distributed/fleet/base/distributed_strategy.py
python/paddle/distributed/fleet/base/distributed_strategy.py
+23
-0
python/paddle/distributed/fleet/meta_optimizers/__init__.py
python/paddle/distributed/fleet/meta_optimizers/__init__.py
+1
-0
python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
...ributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
+146
-0
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+1
-1
python/paddle/fluid/reader.py
python/paddle/fluid/reader.py
+45
-44
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+2
-3
python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
...paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
+63
-0
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
...sts/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
+106
-0
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
...nference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
+92
-0
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
...ference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
+81
-0
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
...ttests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
+73
-0
python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py
...ts/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py
+5
-17
python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
...n/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+24
-0
python/paddle/fluid/tests/unittests/test_activation_op.py
python/paddle/fluid/tests/unittests/test_activation_op.py
+53
-0
python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
...d/tests/unittests/test_buffer_shared_memory_reuse_pass.py
+11
-3
python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py
...e/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py
+15
-22
python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
.../fluid/tests/unittests/test_fleet_distributed_strategy.py
+10
-0
python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py
...sts/unittests/test_fleet_fp16_allreduce_meta_optimizer.py
+91
-0
python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
+4
-1
python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
...addle/fluid/tests/unittests/test_imperative_se_resnext.py
+20
-5
python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
+74
-25
python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
...d/tests/unittests/test_multiprocess_dataloader_dynamic.py
+0
-1
python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
.../test_multiprocess_dataloader_iterable_dataset_dynamic.py
+0
-1
python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
+36
-0
python/paddle/fluid/tests/unittests/test_paddle_save_load.py
python/paddle/fluid/tests/unittests/test_paddle_save_load.py
+23
-26
python/paddle/tensor/linalg.py
python/paddle/tensor/linalg.py
+2
-2
tools/check_file_diff_approvals.sh
tools/check_file_diff_approvals.sh
+1
-1
tools/dockerfile/Dockerfile.centos
tools/dockerfile/Dockerfile.centos
+2
-4
tools/dockerfile/ci_dockerfile.sh
tools/dockerfile/ci_dockerfile.sh
+10
-1
tools/sampcd_processor.py
tools/sampcd_processor.py
+12
-11
tools/wlist.json
tools/wlist.json
+38
-24
未找到文件。
cmake/generic.cmake
浏览文件 @
d0c9f7e8
...
@@ -721,6 +721,7 @@ function(proto_library TARGET_NAME)
...
@@ -721,6 +721,7 @@ function(proto_library TARGET_NAME)
set
(
proto_hdrs
)
set
(
proto_hdrs
)
paddle_protobuf_generate_cpp
(
proto_srcs proto_hdrs
${
proto_library_SRCS
}
)
paddle_protobuf_generate_cpp
(
proto_srcs proto_hdrs
${
proto_library_SRCS
}
)
cc_library
(
${
TARGET_NAME
}
SRCS
${
proto_srcs
}
DEPS
${
proto_library_DEPS
}
protobuf
)
cc_library
(
${
TARGET_NAME
}
SRCS
${
proto_srcs
}
DEPS
${
proto_library_DEPS
}
protobuf
)
add_dependencies
(
extern_xxhash
${
TARGET_NAME
}
)
endfunction
()
endfunction
()
function
(
py_proto_compile TARGET_NAME
)
function
(
py_proto_compile TARGET_NAME
)
...
...
cmake/third_party.cmake
浏览文件 @
d0c9f7e8
...
@@ -39,6 +39,7 @@ set(third_party_deps)
...
@@ -39,6 +39,7 @@ set(third_party_deps)
# REPOSITORY ${TARGET_REPOSITORY}
# REPOSITORY ${TARGET_REPOSITORY}
# TAG ${TARGET_TAG}
# TAG ${TARGET_TAG}
# DIR ${TARGET_SOURCE_DIR})
# DIR ${TARGET_SOURCE_DIR})
FUNCTION
(
cache_third_party TARGET
)
FUNCTION
(
cache_third_party TARGET
)
SET
(
options
""
)
SET
(
options
""
)
SET
(
oneValueArgs URL REPOSITORY TAG DIR
)
SET
(
oneValueArgs URL REPOSITORY TAG DIR
)
...
@@ -269,6 +270,10 @@ if(WITH_PSLIB)
...
@@ -269,6 +270,10 @@ if(WITH_PSLIB)
endif
()
endif
()
endif
(
WITH_PSLIB
)
endif
(
WITH_PSLIB
)
if
(
NOT WIN32 AND NOT APPLE
)
include
(
external/gloo
)
list
(
APPEND third_party_deps extern_gloo
)
endif
()
if
(
WITH_BOX_PS
)
if
(
WITH_BOX_PS
)
include
(
external/box_ps
)
include
(
external/box_ps
)
...
@@ -276,10 +281,6 @@ if(WITH_BOX_PS)
...
@@ -276,10 +281,6 @@ if(WITH_BOX_PS)
endif
(
WITH_BOX_PS
)
endif
(
WITH_BOX_PS
)
if
(
WITH_DISTRIBUTE
)
if
(
WITH_DISTRIBUTE
)
if
(
WITH_GLOO
)
include
(
external/gloo
)
list
(
APPEND third_party_deps extern_gloo
)
endif
()
if
(
WITH_GRPC
)
if
(
WITH_GRPC
)
list
(
APPEND third_party_deps extern_grpc
)
list
(
APPEND third_party_deps extern_grpc
)
...
...
paddle/fluid/framework/device_worker.h
浏览文件 @
d0c9f7e8
...
@@ -74,7 +74,9 @@ class PullDenseWorker {
...
@@ -74,7 +74,9 @@ class PullDenseWorker {
virtual
void
Initialize
(
const
TrainerDesc
&
param
);
virtual
void
Initialize
(
const
TrainerDesc
&
param
);
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
void
AddStream
(
const
cudaStream_t
stream
)
{
copy_streams_
.
push_back
(
stream
);
}
void
AddStream
(
const
cudaStream_t
stream
)
{
copy_streams_
.
push_back
(
stream
);
}
#endif
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
void
AddPlace
(
const
paddle
::
platform
::
Place
place
)
{
void
AddPlace
(
const
paddle
::
platform
::
Place
place
)
{
places_
.
push_back
(
place
);
places_
.
push_back
(
place
);
}
}
...
@@ -135,9 +137,9 @@ class PullDenseWorker {
...
@@ -135,9 +137,9 @@ class PullDenseWorker {
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
std
::
vector
<
cudaStream_t
>
copy_streams_
;
std
::
vector
<
cudaStream_t
>
copy_streams_
;
#endif
std
::
vector
<
paddle
::
platform
::
Place
>
places_
;
std
::
vector
<
paddle
::
platform
::
Place
>
places_
;
std
::
vector
<
Scope
*>
thread_scopes_
;
std
::
vector
<
Scope
*>
thread_scopes_
;
#endif
};
};
// should incorporate different type of device
// should incorporate different type of device
...
@@ -161,6 +163,7 @@ class DeviceWorker {
...
@@ -161,6 +163,7 @@ class DeviceWorker {
virtual
void
SetDataFeed
(
DataFeed
*
data_feed
);
virtual
void
SetDataFeed
(
DataFeed
*
data_feed
);
virtual
void
SetWorkerNum
(
int
num
)
{}
virtual
void
SetWorkerNum
(
int
num
)
{}
virtual
void
CacheProgram
(
const
ProgramDesc
&
main_program
)
{}
virtual
void
CacheProgram
(
const
ProgramDesc
&
main_program
)
{}
virtual
void
GetXpuOpIndex
()
{}
virtual
void
SetNeedDumpField
(
bool
need_dump_field
)
{
virtual
void
SetNeedDumpField
(
bool
need_dump_field
)
{
need_dump_field_
=
need_dump_field
;
need_dump_field_
=
need_dump_field
;
}
}
...
...
paddle/fluid/framework/distributed_strategy.proto
浏览文件 @
d0c9f7e8
...
@@ -127,6 +127,7 @@ message DistributedStrategy {
...
@@ -127,6 +127,7 @@ message DistributedStrategy {
optional
int32
conv_workspace_size_limit
=
22
[
default
=
4000
];
optional
int32
conv_workspace_size_limit
=
22
[
default
=
4000
];
optional
bool
cudnn_batchnorm_spatial_persistent
=
23
[
default
=
true
];
optional
bool
cudnn_batchnorm_spatial_persistent
=
23
[
default
=
true
];
optional
bool
adaptive_localsgd
=
24
[
default
=
false
];
optional
bool
adaptive_localsgd
=
24
[
default
=
false
];
optional
bool
fp16_allreduce
=
25
[
default
=
false
];
optional
RecomputeConfig
recompute_configs
=
101
;
optional
RecomputeConfig
recompute_configs
=
101
;
optional
AMPConfig
amp_configs
=
102
;
optional
AMPConfig
amp_configs
=
102
;
...
...
paddle/fluid/framework/fleet/fleet_wrapper.cc
浏览文件 @
d0c9f7e8
...
@@ -745,7 +745,57 @@ void FleetWrapper::PushDenseVarsAsync(
...
@@ -745,7 +745,57 @@ void FleetWrapper::PushDenseVarsAsync(
push_sparse_status
->
push_back
(
std
::
move
(
status
));
push_sparse_status
->
push_back
(
std
::
move
(
status
));
}
}
}
}
#endif
#ifdef PADDLE_WITH_XPU
void
FleetWrapper
::
PushDenseVarsAsync
(
const
Scope
&
scope
,
const
uint64_t
table_id
,
const
std
::
vector
<
std
::
string
>&
var_names
,
std
::
vector
<::
std
::
future
<
int32_t
>>*
push_sparse_status
,
float
scale_datanorm
,
int
batch_size
,
const
paddle
::
platform
::
Place
&
place
)
{
#ifdef PADDLE_WITH_PSLIB
std
::
vector
<
paddle
::
ps
::
Region
>
regions
;
for
(
auto
&
t
:
var_names
)
{
Variable
*
var
=
scope
.
FindVar
(
t
);
LoDTensor
*
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
int
count
=
tensor
->
numel
();
float
*
g_data
=
tensor
->
data
<
float
>
();
Variable
*
pin_var
=
scope
.
FindVar
(
t
+
"pin"
);
LoDTensor
*
pin_tensor
=
pin_var
->
GetMutable
<
LoDTensor
>
();
float
*
pin_g
=
pin_tensor
->
mutable_data
<
float
>
(
tensor
->
dims
(),
platform
::
CPUPlace
());
memory
::
Copy
(
platform
::
CPUPlace
(),
pin_g
,
BOOST_GET_CONST
(
platform
::
XPUPlace
,
place
),
g_data
,
sizeof
(
float
)
*
count
);
float
*
g
=
pin_g
;
if
(
scale_datanorm
>=
0
)
{
if
(
t
.
find
(
".batch_size@GRAD"
)
!=
std
::
string
::
npos
||
t
.
find
(
".batch_sum@GRAD"
)
!=
std
::
string
::
npos
)
{
Eigen
::
Map
<
Eigen
::
MatrixXf
>
mat
(
g
,
1
,
count
);
float
scale
=
1.0
/
batch_size
;
mat
*=
scale
;
}
else
if
(
t
.
find
(
".batch_square_sum@GRAD"
)
!=
std
::
string
::
npos
)
{
VLOG
(
3
)
<<
"epsilon: "
<<
scale_datanorm
;
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
g
[
i
]
=
(
g
[
i
]
-
batch_size
*
scale_datanorm
)
/
batch_size
+
batch_size
*
scale_datanorm
;
}
}
}
paddle
::
ps
::
Region
reg
(
g
,
count
);
regions
.
emplace_back
(
std
::
move
(
reg
));
}
auto
status
=
pslib_ptr_
->
_worker_ptr
->
push_dense
(
regions
.
data
(),
regions
.
size
(),
table_id
);
if
(
push_sparse_status
)
{
push_sparse_status
->
push_back
(
std
::
move
(
status
));
}
#endif
}
#endif
#endif
void
FleetWrapper
::
PushDenseVarsAsync
(
void
FleetWrapper
::
PushDenseVarsAsync
(
const
Scope
&
scope
,
const
uint64_t
table_id
,
const
Scope
&
scope
,
const
uint64_t
table_id
,
...
...
paddle/fluid/framework/fleet/fleet_wrapper.h
浏览文件 @
d0c9f7e8
...
@@ -160,6 +160,14 @@ class FleetWrapper {
...
@@ -160,6 +160,14 @@ class FleetWrapper {
float
scale_datanorm
,
int
batch_size
,
float
scale_datanorm
,
int
batch_size
,
const
paddle
::
platform
::
Place
&
place
,
cudaStream_t
stream
,
const
paddle
::
platform
::
Place
&
place
,
cudaStream_t
stream
,
cudaEvent_t
event
);
cudaEvent_t
event
);
#endif
#ifdef PADDLE_WITH_XPU
void
PushDenseVarsAsync
(
const
Scope
&
scope
,
const
uint64_t
table_id
,
const
std
::
vector
<
std
::
string
>&
var_names
,
std
::
vector
<::
std
::
future
<
int32_t
>>*
push_sparse_status
,
float
scale_datanorm
,
int
batch_size
,
const
paddle
::
platform
::
Place
&
place
);
#endif
#endif
void
PushDenseVarsAsync
(
void
PushDenseVarsAsync
(
const
Scope
&
scope
,
const
uint64_t
table_id
,
const
Scope
&
scope
,
const
uint64_t
table_id
,
...
...
paddle/fluid/framework/fleet/heter_wrapper.cc
浏览文件 @
d0c9f7e8
...
@@ -113,30 +113,66 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
...
@@ -113,30 +113,66 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
if
(
platform
::
is_cpu_place
(
tensor
->
place
()))
{
if
(
platform
::
is_cpu_place
(
tensor
->
place
()))
{
memcpy
(
data_ptr
,
tensor
->
data
<
void
>
(),
memcpy
(
data_ptr
,
tensor
->
data
<
void
>
(),
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()));
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()));
#ifdef PADDLE_WITH_CUDA
}
else
{
}
else
{
#ifdef PADDLE_WITH_CUDA
memory
::
Copy
(
platform
::
CPUPlace
(),
data_ptr
,
memory
::
Copy
(
platform
::
CPUPlace
(),
data_ptr
,
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
tensor
->
place
()),
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
tensor
->
place
()),
tensor
->
data
<
void
>
(),
tensor
->
data
<
void
>
(),
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()),
nullptr
);
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()),
nullptr
);
}
#else
}
#endif
#endif
#ifdef PADDLE_WITH_XPU
memory
::
Copy
(
platform
::
CPUPlace
(),
data_ptr
,
BOOST_GET_CONST
(
platform
::
XPUPlace
,
tensor
->
place
()),
tensor
->
data
<
void
>
(),
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()));
#endif
}
}
}
// void HeterWrapper::DeSerializeToTensor(Scope* scope,
// const HeterRequest* request) {
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
void
HeterWrapper
::
DeSerializeToTensor
(
Scope
*
scope
,
void
HeterWrapper
::
DeSerializeToTensor
(
Scope
*
scope
,
const
VariableMessage
&
req_var
,
const
VariableMessage
&
req_var
,
platform
::
Place
place
,
platform
::
Place
place
,
cudaStream_t
stream
)
{
cudaStream_t
stream
)
{
// const VariableMessage& req_var = request->vars();
auto
*
var
=
scope
->
FindVar
(
req_var
.
varname
());
auto
*
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
std
::
vector
<
int
>
vec_dim
;
for
(
auto
&
x
:
req_var
.
dims
())
{
vec_dim
.
push_back
(
x
);
}
tensor
->
Resize
(
make_ddim
(
vec_dim
));
LoD
lod
;
for
(
int
i
=
0
;
i
<
req_var
.
lod_level
();
++
i
)
{
framework
::
Vector
<
size_t
>
v
;
for
(
int
j
=
0
;
j
<
req_var
.
lod
(
i
).
lod_data_size
();
++
j
)
{
v
.
push_back
(
req_var
.
lod
(
i
).
lod_data
(
j
));
}
lod
.
push_back
(
v
);
}
tensor
->
set_lod
(
lod
);
void
*
tensor_data
=
tensor
->
mutable_data
(
place
,
ToVarType
(
req_var
.
data_type
()));
#ifdef PADDLE_WITH_CUDA
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
),
tensor_data
,
platform
::
CPUPlace
(),
req_var
.
data
().
data
(),
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()),
stream
);
#else
#else
memcpy
(
tensor_data
,
req_var
.
data
().
data
(),
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()));
#endif
}
#endif
// void HeterWrapper::DeSerializeToTensor(Scope* scope,
// const HeterRequest* request) {
void
HeterWrapper
::
DeSerializeToTensor
(
Scope
*
scope
,
void
HeterWrapper
::
DeSerializeToTensor
(
Scope
*
scope
,
const
VariableMessage
&
req_var
,
const
VariableMessage
&
req_var
,
platform
::
Place
place
)
{
platform
::
Place
place
)
{
#endif
// const VariableMessage& req_var = request->vars();
// const VariableMessage& req_var = request->vars();
auto
*
var
=
scope
->
FindVar
(
req_var
.
varname
());
auto
*
var
=
scope
->
FindVar
(
req_var
.
varname
());
auto
*
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
auto
*
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
...
@@ -160,10 +196,10 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
...
@@ -160,10 +196,10 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
void
*
tensor_data
=
void
*
tensor_data
=
tensor
->
mutable_data
(
place
,
ToVarType
(
req_var
.
data_type
()));
tensor
->
mutable_data
(
place
,
ToVarType
(
req_var
.
data_type
()));
#ifdef PADDLE_WITH_
CUDA
#ifdef PADDLE_WITH_
XPU
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
CUDA
Place
,
place
),
tensor_data
,
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
XPU
Place
,
place
),
tensor_data
,
platform
::
CPUPlace
(),
req_var
.
data
().
data
(),
platform
::
CPUPlace
(),
req_var
.
data
().
data
(),
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
())
,
stream
);
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()));
#else
#else
memcpy
(
tensor_data
,
req_var
.
data
().
data
(),
memcpy
(
tensor_data
,
req_var
.
data
().
data
(),
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()));
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()));
...
@@ -184,7 +220,8 @@ framework::proto::VarType::Type HeterWrapper::ToVarType(
...
@@ -184,7 +220,8 @@ framework::proto::VarType::Type HeterWrapper::ToVarType(
case
VariableMessage
::
BOOL
:
case
VariableMessage
::
BOOL
:
return
framework
::
proto
::
VarType
::
BOOL
;
// NOLINT
return
framework
::
proto
::
VarType
::
BOOL
;
// NOLINT
default:
default:
VLOG
(
0
)
<<
"Not support type "
<<
type
;
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"ToVarType:Unsupported type %d"
,
type
));
}
}
}
}
...
...
paddle/fluid/framework/heterxpu_trainer.cc
浏览文件 @
d0c9f7e8
...
@@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
#include <cstdlib>
#include <ctime>
#include <string>
#include <vector>
#include "io/fs.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/framework/trainer.h"
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
...
@@ -34,6 +46,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
...
@@ -34,6 +46,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
int
place_num
=
trainer_desc
.
worker_places_size
();
int
place_num
=
trainer_desc
.
worker_places_size
();
for
(
int
i
=
0
;
i
<
place_num
;
++
i
)
{
for
(
int
i
=
0
;
i
<
place_num
;
++
i
)
{
int
num
=
trainer_desc
.
worker_places
(
i
);
int
num
=
trainer_desc
.
worker_places
(
i
);
#ifdef PADDLE_WITH_CUDA
platform
::
CUDAPlace
place
=
platform
::
CUDAPlace
(
num
);
platform
::
CUDAPlace
place
=
platform
::
CUDAPlace
(
num
);
platform
::
CUDADeviceGuard
guard
(
place
.
device
);
platform
::
CUDADeviceGuard
guard
(
place
.
device
);
cudaStream_t
stream
;
cudaStream_t
stream
;
...
@@ -44,6 +57,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
...
@@ -44,6 +57,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
event
,
cudaEventDisableTiming
));
cudaEventCreateWithFlags
(
&
event
,
cudaEventDisableTiming
));
events_
.
push_back
(
event
);
events_
.
push_back
(
event
);
#endif
#ifdef PADDLE_WITH_XPU
platform
::
XPUPlace
place
=
platform
::
XPUPlace
(
num
);
places_
.
push_back
(
place
);
#endif
}
}
// thread_num_ = trainer_desc.thread_num();
// thread_num_ = trainer_desc.thread_num();
// SetDataset(dataset);
// SetDataset(dataset);
...
@@ -95,11 +113,17 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
...
@@ -95,11 +113,17 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
void
HeterXpuTrainer
::
CreateThreadParam
(
const
ProgramDesc
&
program
,
int
num
)
{
void
HeterXpuTrainer
::
CreateThreadParam
(
const
ProgramDesc
&
program
,
int
num
)
{
auto
place
=
places_
[
num
];
auto
place
=
places_
[
num
];
Scope
*
scope
=
place_scopes_
[
num
];
Scope
*
scope
=
place_scopes_
[
num
];
#ifdef PADDLE_WITH_CUDA
auto
stream
=
copy_streams_
[
num
];
auto
stream
=
copy_streams_
[
num
];
auto
event
=
events_
[
num
];
auto
event
=
events_
[
num
];
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
).
device
;
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
).
device
;
platform
::
CUDADeviceGuard
guard
(
dev_id
);
platform
::
CUDADeviceGuard
guard
(
dev_id
);
#endif
#ifdef PADDLE_WITH_XPU
xpu_set_device
(
BOOST_GET_CONST
(
platform
::
XPUPlace
,
place
).
device
);
#endif
auto
&
block
=
program
.
Block
(
0
);
auto
&
block
=
program
.
Block
(
0
);
for
(
auto
&
var
:
block
.
AllVars
())
{
for
(
auto
&
var
:
block
.
AllVars
())
{
if
(
var
->
Persistable
())
{
if
(
var
->
Persistable
())
{
...
@@ -116,13 +140,28 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
...
@@ -116,13 +140,28 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place, stream); \
HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place, stream); \
} \
} \
} while (0)
} while (0)
#define HeterMemcpyXpuFunc(cpp_type, proto_type) \
do { \
if (root_tensor->type() == proto_type) { \
HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place); \
} \
} while (0)
#ifdef PADDLE_WITH_CUDA
_ForEachDataType_
(
HeterMemcpyFunc
);
_ForEachDataType_
(
HeterMemcpyFunc
);
#endif
#ifdef PADDLE_WITH_XPU
_ForEachDataType_
(
HeterMemcpyXpuFunc
);
#endif
}
}
}
}
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventRecord
(
event
,
stream
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventRecord
(
event
,
stream
));
cudaEventSynchronize
(
event
);
cudaEventSynchronize
(
event
);
#endif
}
}
#ifdef PADDLE_WITH_CUDA
template
<
typename
T
>
template
<
typename
T
>
void
HeterXpuTrainer
::
HeterMemCpy
(
LoDTensor
*
thread_tensor
,
void
HeterXpuTrainer
::
HeterMemCpy
(
LoDTensor
*
thread_tensor
,
LoDTensor
*
root_tensor
,
LoDTensor
*
root_tensor
,
...
@@ -141,6 +180,27 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
...
@@ -141,6 +180,27 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
root_ptr
,
sizeof
(
T
)
*
root_tensor
->
numel
(),
stream
);
root_ptr
,
sizeof
(
T
)
*
root_tensor
->
numel
(),
stream
);
}
}
}
}
#endif
#ifdef PADDLE_WITH_XPU
template
<
typename
T
>
void
HeterXpuTrainer
::
HeterMemCpy
(
LoDTensor
*
thread_tensor
,
LoDTensor
*
root_tensor
,
const
paddle
::
platform
::
Place
&
thread_place
)
{
T
*
thread_ptr
=
thread_tensor
->
mutable_data
<
T
>
(
root_tensor
->
dims
(),
thread_place
);
T
*
root_ptr
=
root_tensor
->
data
<
T
>
();
if
(
platform
::
is_cpu_place
(
root_tensor
->
place
()))
{
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
XPUPlace
,
thread_place
),
thread_ptr
,
platform
::
CPUPlace
(),
root_ptr
,
sizeof
(
T
)
*
root_tensor
->
numel
());
}
else
{
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
XPUPlace
,
thread_place
),
thread_ptr
,
BOOST_GET_CONST
(
platform
::
XPUPlace
,
root_tensor
->
place
()),
root_ptr
,
sizeof
(
T
)
*
root_tensor
->
numel
());
}
}
#endif
void
HeterXpuTrainer
::
DumpWork
(
int
tid
)
{}
void
HeterXpuTrainer
::
DumpWork
(
int
tid
)
{}
...
@@ -171,13 +231,16 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
...
@@ -171,13 +231,16 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
CreateThreadParam
(
main_program
,
i
);
CreateThreadParam
(
main_program
,
i
);
pull_dense_worker_
->
AddThreadScope
(
scope
);
pull_dense_worker_
->
AddThreadScope
(
scope
);
pull_dense_worker_
->
AddPlace
(
places_
[
i
]);
pull_dense_worker_
->
AddPlace
(
places_
[
i
]);
#ifdef PADDLE_WITH_CUDA
pull_dense_worker_
->
AddStream
(
copy_streams_
[
i
]);
pull_dense_worker_
->
AddStream
(
copy_streams_
[
i
]);
#endif
}
}
pull_dense_worker_
->
Start
();
pull_dense_worker_
->
Start
();
#ifdef PADDLE_WITH_CUDA
for
(
auto
&
stream
:
copy_streams_
)
{
for
(
auto
&
stream
:
copy_streams_
)
{
cudaStreamSynchronize
(
stream
);
cudaStreamSynchronize
(
stream
);
}
}
#endif
op_names_
.
clear
();
op_names_
.
clear
();
for
(
auto
&
op_desc
:
block
.
AllOps
())
{
for
(
auto
&
op_desc
:
block
.
AllOps
())
{
std
::
unique_ptr
<
OperatorBase
>
local_op
=
OpRegistry
::
CreateOp
(
*
op_desc
);
std
::
unique_ptr
<
OperatorBase
>
local_op
=
OpRegistry
::
CreateOp
(
*
op_desc
);
...
@@ -220,10 +283,12 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
...
@@ -220,10 +283,12 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
OperatorBase
*
local_op_ptr
=
local_op
.
release
();
OperatorBase
*
local_op_ptr
=
local_op
.
release
();
(
context
->
ops_
).
push_back
(
local_op_ptr
);
(
context
->
ops_
).
push_back
(
local_op_ptr
);
}
}
#ifdef PADDLE_WITH_CUDA
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
).
device
;
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
).
device
;
platform
::
CUDADeviceGuard
guard
(
dev_id
);
platform
::
CUDADeviceGuard
guard
(
dev_id
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
context
->
event_
,
cudaEventDisableTiming
));
cudaEventCreateWithFlags
(
&
context
->
event_
,
cudaEventDisableTiming
));
#endif
object_pool_
.
Push
(
context
);
object_pool_
.
Push
(
context
);
}
}
}
}
...
@@ -267,12 +332,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
...
@@ -267,12 +332,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
} \
} \
} while (0)
} while (0)
_ForEachDataType_
(
MergeCallback
);
_ForEachDataType_
(
MergeCallback
);
if
(
platform
::
is_gpu_place
(
thread_tensor
->
place
()))
{
if
(
!
platform
::
is_cpu_place
(
thread_tensor
->
place
()))
{
#ifdef PADDLE_WITH_CUDA
auto
dev_id
=
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
thread_tensor
->
place
()).
device
;
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
thread_tensor
->
place
()).
device
;
platform
::
CUDADeviceGuard
guard
(
dev_id
);
platform
::
CUDADeviceGuard
guard
(
dev_id
);
cudaMemset
(
thread_tensor
->
data
<
void
>
(),
0
,
cudaMemset
(
thread_tensor
->
data
<
void
>
(),
0
,
thread_tensor
->
numel
()
*
SizeOfType
(
thread_tensor
->
type
()));
thread_tensor
->
numel
()
*
SizeOfType
(
thread_tensor
->
type
()));
#endif
#ifdef PADDLE_WITH_XPU
auto
place
=
thread_tensor
->
place
();
xpu_set_device
(
BOOST_GET_CONST
(
platform
::
XPUPlace
,
place
).
device
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
DeviceContext
*
dev_ctx
=
pool
.
Get
(
place
);
const
platform
::
XPUDeviceContext
*
xpu_ctx
=
reinterpret_cast
<
const
platform
::
XPUDeviceContext
*>
(
dev_ctx
);
xpu
::
memset
(
xpu_ctx
->
x_context
(),
thread_tensor
->
data
<
void
>
(),
0
,
thread_tensor
->
numel
()
*
SizeOfType
(
thread_tensor
->
type
()));
#endif
}
else
{
}
else
{
memset
(
thread_tensor
->
data
<
void
>
(),
0
,
memset
(
thread_tensor
->
data
<
void
>
(),
0
,
thread_tensor
->
numel
()
*
SizeOfType
(
thread_tensor
->
type
()));
thread_tensor
->
numel
()
*
SizeOfType
(
thread_tensor
->
type
()));
...
@@ -281,12 +359,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
...
@@ -281,12 +359,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
auto
*
merge_var
=
response
->
add_vars
();
auto
*
merge_var
=
response
->
add_vars
();
heter_ptr_
->
SerializeToReq
(
need_merge_var_names_
[
i
],
root_scope_
,
heter_ptr_
->
SerializeToReq
(
need_merge_var_names_
[
i
],
root_scope_
,
merge_var
);
merge_var
);
if
(
platform
::
is_gpu_place
(
root_tensor
->
place
()))
{
if
(
!
platform
::
is_cpu_place
(
root_tensor
->
place
()))
{
#ifdef PADDLE_WITH_CUDA
auto
dev_id
=
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
root_tensor
->
place
()).
device
;
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
root_tensor
->
place
()).
device
;
platform
::
CUDADeviceGuard
guard
(
dev_id
);
platform
::
CUDADeviceGuard
guard
(
dev_id
);
cudaMemset
(
root_tensor
->
data
<
void
>
(),
0
,
cudaMemset
(
root_tensor
->
data
<
void
>
(),
0
,
root_tensor
->
numel
()
*
SizeOfType
(
root_tensor
->
type
()));
root_tensor
->
numel
()
*
SizeOfType
(
root_tensor
->
type
()));
#endif
#ifdef PADDLE_WITH_XPU
auto
place
=
root_tensor
->
place
();
xpu_set_device
(
BOOST_GET_CONST
(
platform
::
XPUPlace
,
place
).
device
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
DeviceContext
*
dev_ctx
=
pool
.
Get
(
place
);
const
platform
::
XPUDeviceContext
*
xpu_ctx
=
reinterpret_cast
<
const
platform
::
XPUDeviceContext
*>
(
dev_ctx
);
xpu
::
memset
(
xpu_ctx
->
x_context
(),
root_tensor
->
data
<
void
>
(),
0
,
root_tensor
->
numel
()
*
SizeOfType
(
root_tensor
->
type
()));
#endif
}
else
{
}
else
{
memset
(
root_tensor
->
data
<
void
>
(),
0
,
memset
(
root_tensor
->
data
<
void
>
(),
0
,
root_tensor
->
numel
()
*
SizeOfType
(
root_tensor
->
type
()));
root_tensor
->
numel
()
*
SizeOfType
(
root_tensor
->
type
()));
...
@@ -346,11 +437,12 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
...
@@ -346,11 +437,12 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
OperatorBase
*
local_op_ptr
=
local_op
.
release
();
OperatorBase
*
local_op_ptr
=
local_op
.
release
();
(
context
->
ops_
).
push_back
(
local_op_ptr
);
(
context
->
ops_
).
push_back
(
local_op_ptr
);
}
}
#ifdef PADDLE_WITH_CUDA
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
).
device
;
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
).
device
;
platform
::
CUDADeviceGuard
guard
(
dev_id
);
platform
::
CUDADeviceGuard
guard
(
dev_id
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
context
->
event_
,
cudaEventDisableTiming
));
cudaEventCreateWithFlags
(
&
context
->
event_
,
cudaEventDisableTiming
));
#endif
}
}
context
->
Reset
();
context
->
Reset
();
...
@@ -359,15 +451,22 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
...
@@ -359,15 +451,22 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
auto
deserial_timer
=
auto
deserial_timer
=
std
::
make_shared
<
paddle
::
ps
::
CostTimer
>
(
"xpu_service_deserial"
);
std
::
make_shared
<
paddle
::
ps
::
CostTimer
>
(
"xpu_service_deserial"
);
for
(
int
i
=
0
;
i
<
request
->
vars_size
();
++
i
)
{
for
(
int
i
=
0
;
i
<
request
->
vars_size
();
++
i
)
{
#ifdef PADDLE_WITH_CUDA
heter_ptr_
->
DeSerializeToTensor
(
context
->
scope_
,
request
->
vars
(
i
),
place
,
heter_ptr_
->
DeSerializeToTensor
(
context
->
scope_
,
request
->
vars
(
i
),
place
,
copy_streams_
[
context
->
place_num_
]);
copy_streams_
[
context
->
place_num_
]);
#endif
#ifdef PADDLE_WITH_XPU
heter_ptr_
->
DeSerializeToTensor
(
context
->
scope_
,
request
->
vars
(
i
),
place
);
#endif
}
}
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventRecord
(
context
->
event_
,
copy_streams_
[
context
->
place_num_
]));
cudaEventRecord
(
context
->
event_
,
copy_streams_
[
context
->
place_num_
]));
while
(
cudaEventQuery
(
context
->
event_
)
!=
cudaSuccess
)
{
while
(
cudaEventQuery
(
context
->
event_
)
!=
cudaSuccess
)
{
VLOG
(
3
)
<<
"wait for kernel"
;
VLOG
(
3
)
<<
"wait for kernel"
;
bthread_yield
();
bthread_yield
();
}
}
#endif
}
}
{
{
...
@@ -378,6 +477,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
...
@@ -378,6 +477,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
op
->
Run
(
*
(
context
->
scope_
),
place
);
op
->
Run
(
*
(
context
->
scope_
),
place
);
}
}
}
}
#ifdef PADDLE_WITH_CUDA
auto
*
dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
auto
*
dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
PADDLE_ENFORCE_CUDA_SUCCESS
(
...
@@ -391,6 +491,10 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
...
@@ -391,6 +491,10 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
bthread_yield
();
bthread_yield
();
}
}
}
}
#endif
#ifdef PADDLE_WITH_XPU
xpu_wait
();
#endif
for
(
int
i
=
0
;
i
<
trainer_desc_
.
xpu_send_list_size
();
++
i
)
{
for
(
int
i
=
0
;
i
<
trainer_desc_
.
xpu_send_list_size
();
++
i
)
{
const
std
::
string
&
varname
=
trainer_desc_
.
xpu_send_list
(
i
);
const
std
::
string
&
varname
=
trainer_desc_
.
xpu_send_list
(
i
);
...
@@ -407,11 +511,19 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
...
@@ -407,11 +511,19 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
++
i
)
{
++
i
)
{
uint64_t
tid
=
uint64_t
tid
=
static_cast
<
uint64_t
>
(
param_
.
program_config
(
0
).
push_dense_table_id
(
i
));
static_cast
<
uint64_t
>
(
param_
.
program_config
(
0
).
push_dense_table_id
(
i
));
#ifdef PADDLE_WITH_CUDA
fleet_ptr_
->
PushDenseVarsAsync
(
fleet_ptr_
->
PushDenseVarsAsync
(
*
(
context
->
scope_
),
tid
,
dense_grad_names_
[
tid
],
*
(
context
->
scope_
),
tid
,
dense_grad_names_
[
tid
],
&
(
context
->
push_dense_status_
),
scale_datanorm_
,
request
->
cur_batch
(),
&
(
context
->
push_dense_status_
),
scale_datanorm_
,
request
->
cur_batch
(),
places_
[
context
->
place_num_
],
copy_streams_
[
context
->
place_num_
],
places_
[
context
->
place_num_
],
copy_streams_
[
context
->
place_num_
],
context
->
event_
);
context
->
event_
);
#endif
#ifdef PADDLE_WITH_XPU
fleet_ptr_
->
PushDenseVarsAsync
(
*
(
context
->
scope_
),
tid
,
dense_grad_names_
[
tid
],
&
(
context
->
push_dense_status_
),
scale_datanorm_
,
request
->
cur_batch
(),
places_
[
context
->
place_num_
]);
#endif
}
}
for
(
int
i
=
0
;
i
<
param_
.
program_config
(
0
).
push_dense_table_id_size
();
for
(
int
i
=
0
;
i
<
param_
.
program_config
(
0
).
push_dense_table_id_size
();
++
i
)
{
++
i
)
{
...
@@ -453,7 +565,6 @@ void HeterXpuTrainer::Finalize() {
...
@@ -453,7 +565,6 @@ void HeterXpuTrainer::Finalize() {
pull_dense_worker_
->
Stop
();
pull_dense_worker_
->
Stop
();
root_scope_
->
DropKids
();
root_scope_
->
DropKids
();
}
}
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
#endif
#endif
paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
浏览文件 @
d0c9f7e8
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h"
#include <vector>
#include <vector>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -103,12 +104,32 @@ REGISTER_PASS(conv_activation_mkldnn_fuse_pass,
...
@@ -103,12 +104,32 @@ REGISTER_PASS(conv_activation_mkldnn_fuse_pass,
REGISTER_PASS
(
conv_relu_mkldnn_fuse_pass
,
REGISTER_PASS
(
conv_relu_mkldnn_fuse_pass
,
paddle
::
framework
::
ir
::
ConvActivationFusePass
);
paddle
::
framework
::
ir
::
ConvActivationFusePass
);
REGISTER_PASS_CAPABILITY
(
conv_relu_mkldnn_fuse_pass
)
.
AddCombination
(
paddle
::
framework
::
compatible
::
OpVersionComparatorCombination
()
.
EQ
(
"conv2d"
,
0
)
.
EQ
(
"relu"
,
0
));
REGISTER_PASS
(
conv_leaky_relu_mkldnn_fuse_pass
,
REGISTER_PASS
(
conv_leaky_relu_mkldnn_fuse_pass
,
paddle
::
framework
::
ir
::
Conv2DLeakyReLUFusePass
);
paddle
::
framework
::
ir
::
Conv2DLeakyReLUFusePass
);
REGISTER_PASS_CAPABILITY
(
conv_leaky_relu_mkldnn_fuse_pass
)
.
AddCombination
(
paddle
::
framework
::
compatible
::
OpVersionComparatorCombination
()
.
EQ
(
"conv2d"
,
0
)
.
LE
(
"leaky_relu"
,
1
));
REGISTER_PASS
(
conv_relu6_mkldnn_fuse_pass
,
REGISTER_PASS
(
conv_relu6_mkldnn_fuse_pass
,
paddle
::
framework
::
ir
::
Conv2DReLU6FusePass
);
paddle
::
framework
::
ir
::
Conv2DReLU6FusePass
);
REGISTER_PASS_CAPABILITY
(
conv_relu6_mkldnn_fuse_pass
)
.
AddCombination
(
paddle
::
framework
::
compatible
::
OpVersionComparatorCombination
()
.
EQ
(
"conv2d"
,
0
)
.
EQ
(
"relu6"
,
0
));
REGISTER_PASS
(
conv_swish_mkldnn_fuse_pass
,
REGISTER_PASS
(
conv_swish_mkldnn_fuse_pass
,
paddle
::
framework
::
ir
::
Conv2DSwishFusePass
);
paddle
::
framework
::
ir
::
Conv2DSwishFusePass
);
REGISTER_PASS_CAPABILITY
(
conv_swish_mkldnn_fuse_pass
)
.
AddCombination
(
paddle
::
framework
::
compatible
::
OpVersionComparatorCombination
()
.
EQ
(
"conv2d"
,
0
)
.
EQ
(
"swish"
,
0
));
paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
浏览文件 @
d0c9f7e8
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h"
#include <vector>
#include <vector>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -123,3 +124,10 @@ void ConvConcatReLUFusePass::ApplyImpl(ir::Graph* graph) const {
...
@@ -123,3 +124,10 @@ void ConvConcatReLUFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS
(
conv_concat_relu_mkldnn_fuse_pass
,
REGISTER_PASS
(
conv_concat_relu_mkldnn_fuse_pass
,
paddle
::
framework
::
ir
::
ConvConcatReLUFusePass
);
paddle
::
framework
::
ir
::
ConvConcatReLUFusePass
);
REGISTER_PASS_CAPABILITY
(
conv_concat_relu_mkldnn_fuse_pass
)
.
AddCombination
(
paddle
::
framework
::
compatible
::
OpVersionComparatorCombination
()
.
EQ
(
"conv2d"
,
0
)
.
EQ
(
"concat"
,
0
)
.
EQ
(
"relu"
,
0
));
paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc
浏览文件 @
d0c9f7e8
...
@@ -15,6 +15,7 @@
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h"
#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h"
#include <paddle/fluid/string/pretty_log.h>
#include <paddle/fluid/string/pretty_log.h>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -98,3 +99,10 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const {
...
@@ -98,3 +99,10 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const {
REGISTER_PASS
(
matmul_transpose_reshape_fuse_pass
,
REGISTER_PASS
(
matmul_transpose_reshape_fuse_pass
,
paddle
::
framework
::
ir
::
MatmulTransposeReshapeMKLDNNPass
);
paddle
::
framework
::
ir
::
MatmulTransposeReshapeMKLDNNPass
);
REGISTER_PASS_CAPABILITY
(
matmul_transpose_reshape_fuse_pass
)
.
AddCombination
(
paddle
::
framework
::
compatible
::
OpVersionComparatorCombination
()
.
EQ
(
"matmul"
,
0
)
.
EQ
(
"transpose"
,
0
)
.
EQ
(
"reshape"
,
0
));
paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
浏览文件 @
d0c9f7e8
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#include <vector>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/string/pretty_log.h"
#include "paddle/fluid/string/pretty_log.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -90,3 +91,9 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const {
...
@@ -90,3 +91,9 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS
(
scale_matmul_fuse_pass
,
REGISTER_PASS
(
scale_matmul_fuse_pass
,
paddle
::
framework
::
ir
::
ScaleMatmulFusePass
);
paddle
::
framework
::
ir
::
ScaleMatmulFusePass
);
REGISTER_PASS_CAPABILITY
(
scale_matmul_fuse_pass
)
.
AddCombination
(
paddle
::
framework
::
compatible
::
OpVersionComparatorCombination
()
.
EQ
(
"scale"
,
0
)
.
EQ
(
"matmul"
,
0
));
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
浏览文件 @
d0c9f7e8
...
@@ -17,6 +17,7 @@
...
@@ -17,6 +17,7 @@
#include <string>
#include <string>
#include <unordered_set>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
...
@@ -255,3 +256,15 @@ void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const {
...
@@ -255,3 +256,15 @@ void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS
(
seq_concat_fc_fuse_pass
,
REGISTER_PASS
(
seq_concat_fc_fuse_pass
,
paddle
::
framework
::
ir
::
SeqConcatFcFusePass
);
paddle
::
framework
::
ir
::
SeqConcatFcFusePass
);
REGISTER_PASS_CAPABILITY
(
seq_concat_fc_fuse_pass
)
.
AddCombination
(
paddle
::
framework
::
compatible
::
OpVersionComparatorCombination
()
.
EQ
(
"sequence_expand"
,
0
)
.
EQ
(
"concat"
,
0
)
.
EQ
(
"mul"
,
0
)
.
EQ
(
"elementwise_add"
,
0
)
.
EQ
(
"sigmoid"
,
0
)
.
EQ
(
"tanh"
,
0
)
.
EQ
(
"relu"
,
0
)
.
EQ
(
"identity"
,
0
)
.
EQ
(
"fusion_seqexpand_concat_fc"
,
0
));
paddle/fluid/framework/pull_dense_worker.cc
浏览文件 @
d0c9f7e8
...
@@ -62,13 +62,15 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
...
@@ -62,13 +62,15 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
fleet_ptr_
=
FleetWrapper
::
GetInstance
();
fleet_ptr_
=
FleetWrapper
::
GetInstance
();
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
copy_streams_
.
clear
();
copy_streams_
.
clear
();
#endif
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
places_
.
clear
();
places_
.
clear
();
thread_scopes_
.
clear
();
thread_scopes_
.
clear
();
#endif
#endif
}
}
void
PullDenseWorker
::
CreatePinVar
()
{
void
PullDenseWorker
::
CreatePinVar
()
{
#if
def PADDLE_WITH_CUDA
#if
(defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_PSLIB)
// for (auto& v : dense_value_names_) {
// for (auto& v : dense_value_names_) {
// for (auto& name : v.second) {
// for (auto& name : v.second) {
for
(
int
i
=
0
;
i
<
dwp_param_
.
program_config
(
0
).
pull_dense_table_id_size
();
for
(
int
i
=
0
;
i
<
dwp_param_
.
program_config
(
0
).
pull_dense_table_id_size
();
...
@@ -83,8 +85,13 @@ void PullDenseWorker::CreatePinVar() {
...
@@ -83,8 +85,13 @@ void PullDenseWorker::CreatePinVar() {
auto
*
ptr
=
root_scope_
->
Var
(
name
+
"pin"
);
auto
*
ptr
=
root_scope_
->
Var
(
name
+
"pin"
);
InitializeVariable
(
ptr
,
proto
::
VarType
::
LOD_TENSOR
);
InitializeVariable
(
ptr
,
proto
::
VarType
::
LOD_TENSOR
);
LoDTensor
*
pin_tensor
=
ptr
->
GetMutable
<
LoDTensor
>
();
LoDTensor
*
pin_tensor
=
ptr
->
GetMutable
<
LoDTensor
>
();
#ifdef PADDLE_WITH_CUDA
pin_tensor
->
mutable_data
<
float
>
(
tensor
->
dims
(),
pin_tensor
->
mutable_data
<
float
>
(
tensor
->
dims
(),
platform
::
CUDAPinnedPlace
());
platform
::
CUDAPinnedPlace
());
#endif
#ifdef PADDLE_WITH_XPU
pin_tensor
->
mutable_data
<
float
>
(
tensor
->
dims
(),
platform
::
CPUPlace
());
#endif
}
}
}
}
#endif
#endif
...
@@ -107,7 +114,7 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
...
@@ -107,7 +114,7 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
exit
(
-
1
);
exit
(
-
1
);
}
}
status_vec
->
resize
(
0
);
status_vec
->
resize
(
0
);
#if
def PADDLE_WITH_CUDA
#if
(defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
// for (auto& v : dense_value_names_) {
// for (auto& v : dense_value_names_) {
...
@@ -125,9 +132,16 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
...
@@ -125,9 +132,16 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
Variable
*
var
=
thread_scopes_
[
i
]
->
FindVar
(
name
);
Variable
*
var
=
thread_scopes_
[
i
]
->
FindVar
(
name
);
LoDTensor
*
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
LoDTensor
*
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
float
*
w
=
tensor
->
data
<
float
>
();
float
*
w
=
tensor
->
data
<
float
>
();
#ifdef PADDLE_WITH_CUDA
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
places_
[
i
]),
w
,
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
places_
[
i
]),
w
,
platform
::
CUDAPinnedPlace
(),
pin_w
,
platform
::
CUDAPinnedPlace
(),
pin_w
,
sizeof
(
float
)
*
tensor
->
numel
(),
copy_streams_
[
i
]);
sizeof
(
float
)
*
tensor
->
numel
(),
copy_streams_
[
i
]);
#endif
#ifdef PADDLE_WITH_XPU
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
XPUPlace
,
places_
[
i
]),
w
,
platform
::
CPUPlace
(),
pin_w
,
sizeof
(
float
)
*
tensor
->
numel
());
#endif
}
}
}
}
}
}
...
@@ -148,7 +162,7 @@ void PullDenseWorker::PullDense(bool force_update) {
...
@@ -148,7 +162,7 @@ void PullDenseWorker::PullDense(bool force_update) {
uint64_t
tid
=
static_cast
<
uint64_t
>
(
uint64_t
tid
=
static_cast
<
uint64_t
>
(
dwp_param_
.
program_config
(
0
).
pull_dense_table_id
(
i
));
dwp_param_
.
program_config
(
0
).
pull_dense_table_id
(
i
));
if
(
force_update
||
CheckUpdateParam
(
tid
))
{
if
(
force_update
||
CheckUpdateParam
(
tid
))
{
#if
def PADDLE_WITH_CUDA
#if
(defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
VLOG
(
3
)
<<
"pull dense "
<<
force_update
<<
" "
<<
tid
;
VLOG
(
3
)
<<
"pull dense "
<<
force_update
<<
" "
<<
tid
;
fleet_ptr_
->
PullDenseVarsAsync
(
*
root_scope_
,
tid
,
dense_value_names_
[
tid
],
fleet_ptr_
->
PullDenseVarsAsync
(
*
root_scope_
,
tid
,
dense_value_names_
[
tid
],
&
pull_dense_status_
,
false
);
&
pull_dense_status_
,
false
);
...
...
paddle/fluid/framework/trainer.h
浏览文件 @
d0c9f7e8
...
@@ -138,7 +138,8 @@ class DistMultiTrainer : public MultiTrainer {
...
@@ -138,7 +138,8 @@ class DistMultiTrainer : public MultiTrainer {
std
::
shared_ptr
<
paddle
::
framework
::
PullDenseWorker
>
pull_dense_worker_
;
std
::
shared_ptr
<
paddle
::
framework
::
PullDenseWorker
>
pull_dense_worker_
;
};
};
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
class
HeterServiceContext
{
class
HeterServiceContext
{
public:
public:
HeterServiceContext
()
{}
HeterServiceContext
()
{}
...
@@ -151,7 +152,9 @@ class HeterServiceContext {
...
@@ -151,7 +152,9 @@ class HeterServiceContext {
void
Reset
()
{
push_dense_status_
.
clear
();
}
void
Reset
()
{
push_dense_status_
.
clear
();
}
int
place_num_
;
int
place_num_
;
Scope
*
scope_
{
nullptr
};
Scope
*
scope_
{
nullptr
};
#ifdef PADDLE_WITH_CUDA
cudaEvent_t
event_
;
cudaEvent_t
event_
;
#endif
std
::
vector
<
OperatorBase
*>
ops_
;
std
::
vector
<
OperatorBase
*>
ops_
;
std
::
vector
<::
std
::
future
<
int32_t
>>
push_dense_status_
;
std
::
vector
<::
std
::
future
<
int32_t
>>
push_dense_status_
;
};
};
...
@@ -178,10 +181,18 @@ class HeterXpuTrainer : public TrainerBase {
...
@@ -178,10 +181,18 @@ class HeterXpuTrainer : public TrainerBase {
virtual
void
CacheProgram
(
const
ProgramDesc
&
main_program
)
{
virtual
void
CacheProgram
(
const
ProgramDesc
&
main_program
)
{
new
(
&
program_
)
ProgramDesc
(
main_program
);
new
(
&
program_
)
ProgramDesc
(
main_program
);
}
}
virtual
std
::
string
GetDumpPath
(
int
tid
)
{
return
""
;
}
virtual
void
InitDumpEnv
()
{}
template
<
typename
T
>
template
<
typename
T
>
#ifdef PADDLE_WITH_CUDA
void
HeterMemCpy
(
LoDTensor
*
tensor
,
LoDTensor
*
root_tensor
,
void
HeterMemCpy
(
LoDTensor
*
tensor
,
LoDTensor
*
root_tensor
,
const
paddle
::
platform
::
Place
&
thread_place
,
const
paddle
::
platform
::
Place
&
thread_place
,
cudaStream_t
stream
);
cudaStream_t
stream
);
#endif
#ifdef PADDLE_WITH_XPU
void
HeterMemCpy
(
LoDTensor
*
thread_tensor
,
LoDTensor
*
root_tensor
,
const
paddle
::
platform
::
Place
&
thread_place
);
#endif
void
CreateThreadParam
(
const
ProgramDesc
&
program
,
int
num
);
void
CreateThreadParam
(
const
ProgramDesc
&
program
,
int
num
);
template
<
typename
T
>
template
<
typename
T
>
void
MergeToRootScope
(
LoDTensor
*
root_tensor
,
LoDTensor
*
thread_tensor
);
void
MergeToRootScope
(
LoDTensor
*
root_tensor
,
LoDTensor
*
thread_tensor
);
...
@@ -207,9 +218,11 @@ class HeterXpuTrainer : public TrainerBase {
...
@@ -207,9 +218,11 @@ class HeterXpuTrainer : public TrainerBase {
std
::
vector
<
std
::
string
>
op_names_
;
std
::
vector
<
std
::
string
>
op_names_
;
std
::
vector
<
Scope
*>
place_scopes_
;
std
::
vector
<
Scope
*>
place_scopes_
;
BtObjectPool
<
HeterServiceContext
>
object_pool_
;
BtObjectPool
<
HeterServiceContext
>
object_pool_
;
std
::
vector
<
cudaStream_t
>
copy_streams_
;
std
::
vector
<
platform
::
Place
>
places_
;
std
::
vector
<
platform
::
Place
>
places_
;
#ifdef PADDLE_WITH_CUDA
std
::
vector
<
cudaStream_t
>
copy_streams_
;
std
::
vector
<
cudaEvent_t
>
events_
;
std
::
vector
<
cudaEvent_t
>
events_
;
#endif
};
};
#endif
#endif
...
...
paddle/fluid/framework/trainer_factory.cc
浏览文件 @
d0c9f7e8
...
@@ -63,7 +63,8 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
...
@@ -63,7 +63,8 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
REGISTER_TRAINER_CLASS
(
MultiTrainer
);
REGISTER_TRAINER_CLASS
(
MultiTrainer
);
REGISTER_TRAINER_CLASS
(
DistMultiTrainer
);
REGISTER_TRAINER_CLASS
(
DistMultiTrainer
);
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS
(
HeterXpuTrainer
);
REGISTER_TRAINER_CLASS
(
HeterXpuTrainer
);
#endif
#endif
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
...
...
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
d0c9f7e8
...
@@ -132,9 +132,17 @@ if(NOT APPLE AND WITH_MKLML)
...
@@ -132,9 +132,17 @@ if(NOT APPLE AND WITH_MKLML)
# seq_pool1
# seq_pool1
set
(
SEQ_POOL1_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/seq_pool"
)
set
(
SEQ_POOL1_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/seq_pool"
)
download_model_and_data
(
${
SEQ_POOL1_INSTALL_DIR
}
"seq_pool1_model_.tar.gz"
"seq_pool1_data.txt.tar.gz"
)
download_model_and_data
(
${
SEQ_POOL1_INSTALL_DIR
}
"seq_pool1_model_.tar.gz"
"seq_pool1_data.txt.tar.gz"
)
inference_analysis_api_test
(
test_analyzer_seq_pool1
${
SEQ_POOL1_INSTALL_DIR
}
analyzer_seq_pool1_tester.cc
)
inference_analysis_api_test
(
test_analyzer_seq_pool1_compare_determine
${
SEQ_POOL1_INSTALL_DIR
}
analyzer_seq_pool1_compare_determine_tester.cc
)
inference_analysis_api_test
(
test_analyzer_seq_pool1
${
SEQ_POOL1_INSTALL_DIR
}
analyzer_seq_pool1_compare_tester.cc
)
inference_analysis_api_test
(
test_analyzer_seq_pool1_fuse_compare_zero_copy
${
SEQ_POOL1_INSTALL_DIR
}
analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
)
inference_analysis_api_test
(
test_analyzer_seq_pool1_fuse_statis
${
SEQ_POOL1_INSTALL_DIR
}
analyzer_seq_pool1_fuse_statis_tester.cc
)
inference_analysis_api_test
(
test_analyzer_seq_pool1_profile
${
SEQ_POOL1_INSTALL_DIR
}
analyzer_seq_pool1_profile_tester.cc
)
if
(
NOT WIN32
)
if
(
NOT WIN32
)
set_tests_properties
(
test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150
)
set_tests_properties
(
test_analyzer_seq_pool1_compare_determine PROPERTIES TIMEOUT 120
)
set_tests_properties
(
test_analyzer_seq_pool1 PROPERTIES TIMEOUT 120
)
set_tests_properties
(
test_analyzer_seq_pool1_fuse_compare_zero_copy PROPERTIES TIMEOUT 120
)
set_tests_properties
(
test_analyzer_seq_pool1_fuse_statis PROPERTIES TIMEOUT 120
)
set_tests_properties
(
test_analyzer_seq_pool1_profile PROPERTIES TIMEOUT 120
)
endif
()
endif
()
else
()
else
()
# TODO: fix this test on MACOS and OPENBLAS, the reason is that
# TODO: fix this test on MACOS and OPENBLAS, the reason is that
...
@@ -215,7 +223,15 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
...
@@ -215,7 +223,15 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
# transformer, the dataset only works on batch_size=8 now
# transformer, the dataset only works on batch_size=8 now
set
(
TRANSFORMER_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/transformer"
)
set
(
TRANSFORMER_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/transformer"
)
download_model_and_data
(
${
TRANSFORMER_INSTALL_DIR
}
"temp/transformer_model.tar.gz"
"temp/transformer_data.txt.tar.gz"
)
download_model_and_data
(
${
TRANSFORMER_INSTALL_DIR
}
"temp/transformer_model.tar.gz"
"temp/transformer_data.txt.tar.gz"
)
inference_analysis_test
(
test_analyzer_transformer SRCS analyzer_transformer_tester.cc
inference_analysis_test
(
test_analyzer_transformer SRCS analyzer_transformer_compare_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
TRANSFORMER_INSTALL_DIR
}
/model --infer_data=
${
TRANSFORMER_INSTALL_DIR
}
/data.txt --batch_size=8
--cpu_num_threads=
${
CPU_NUM_THREADS_ON_CI
}
)
inference_analysis_test
(
test_analyzer_transformer_fuse SRCS analyzer_transformer_fuse_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
TRANSFORMER_INSTALL_DIR
}
/model --infer_data=
${
TRANSFORMER_INSTALL_DIR
}
/data.txt --batch_size=8
--cpu_num_threads=
${
CPU_NUM_THREADS_ON_CI
}
)
inference_analysis_test
(
test_analyzer_transformer_profile SRCS analyzer_transformer_profile_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
TRANSFORMER_INSTALL_DIR
}
/model --infer_data=
${
TRANSFORMER_INSTALL_DIR
}
/data.txt --batch_size=8
ARGS --infer_model=
${
TRANSFORMER_INSTALL_DIR
}
/model --infer_data=
${
TRANSFORMER_INSTALL_DIR
}
/data.txt --batch_size=8
--cpu_num_threads=
${
CPU_NUM_THREADS_ON_CI
}
)
--cpu_num_threads=
${
CPU_NUM_THREADS_ON_CI
}
)
...
...
paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
namespace
seq_pool1_tester
{
// Compare Deterministic result
TEST
(
Analyzer_seq_pool1_compare_determine
,
compare_determine
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
CompareDeterministic
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
}
// namespace seq_pool1_tester
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
namespace
seq_pool1_tester
{
TEST
(
Analyzer_seq_pool1_compare
,
compare
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
}
// namespace seq_pool1_tester
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
namespace
seq_pool1_tester
{
// Compare the result of AnalysisConfig against AnalysisConfig + ZeroCopy.
// Two identically configured AnalysisConfig objects are passed to
// CompareAnalysisAndZeroCopy together with the single output variable name.
TEST(Analyzer_seq_pool1_compare_zero_copy, compare_zero_copy) {
  using TensorBatches = std::vector<std::vector<PaddleTensor>>;

  AnalysisConfig first_cfg;
  SetConfig(&first_cfg);

  AnalysisConfig second_cfg;
  SetConfig(&second_cfg);

  TensorBatches input_slots_all;
  SetInput(&input_slots_all);

  // Only out_var_name is requested as output.
  std::vector<std::string> fetch_names;
  fetch_names.push_back(std::string(out_var_name));

  auto *first_base = reinterpret_cast<PaddlePredictor::Config *>(&first_cfg);
  auto *second_base = reinterpret_cast<PaddlePredictor::Config *>(&second_cfg);
  CompareAnalysisAndZeroCopy(first_base, second_base, input_slots_all,
                             fetch_names);
}
}
// namespace seq_pool1_tester
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
namespace
seq_pool1_tester
{
// Check the fuse status: after building an analysis predictor for seq_pool1,
// verify which fuse passes fired, how often, and the resulting op count.
TEST(Analyzer_seq_pool1_fuse_statis, fuse_statis) {
  AnalysisConfig analysis_cfg;
  SetConfig(&analysis_cfg);

  auto analysis_predictor =
      CreatePaddlePredictor<AnalysisConfig>(analysis_cfg);

  // num_ops is an out-parameter of GetFuseStatis.
  int num_ops;
  auto fuse_statis = GetFuseStatis(analysis_predictor.get(), &num_ops);

  // Every expected fuse entry must exist before .at() is used below.
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse"));
  ASSERT_TRUE(fuse_statis.count("squared_mat_sub_fuse"));
  ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse"));

  // Expected number of times each fuse was applied.
  ASSERT_EQ(fuse_statis.at("fc_fuse"), 10);
  EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2);
  EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2);
  EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2);

  LOG(INFO) << "num_ops: " << num_ops;
  EXPECT_EQ(num_ops, 171);
}
}
// namespace seq_pool1_tester
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
namespace
seq_pool1_tester
{
void
profile
(
bool
use_mkldnn
=
false
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
,
use_mkldnn
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
}
// Profiles seq_pool1 with the default (non-MKLDNN) configuration.
TEST(Analyzer_seq_pool1_profile, profile) {
  profile(/* use_mkldnn = */ false);
}
}
// namespace seq_pool1_tester
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester
.cc
→
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester
_helper.h
浏览文件 @
d0c9f7e8
...
@@ -11,15 +11,20 @@ distributed under the License is distributed on an "AS IS" BASIS,
...
@@ -11,15 +11,20 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#pragma once
#include <algorithm>
#include <algorithm>
#include <fstream>
#include <fstream>
#include <iostream>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
namespace
analysis
{
namespace
analysis
{
namespace
seq_pool1_tester
{
// diff: similarity_norm.tmp_0, for speed: fc_4.tmp_1
// diff: similarity_norm.tmp_0, for speed: fc_4.tmp_1
static
const
char
out_var_name
[]
=
"reduce_sum_0.tmp_0"
;
static
const
char
out_var_name
[]
=
"reduce_sum_0.tmp_0"
;
...
@@ -164,77 +169,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
...
@@ -164,77 +169,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
cfg
->
pass_builder
()
->
InsertPass
(
2
,
"seqpool_concat_fuse_pass"
);
cfg
->
pass_builder
()
->
InsertPass
(
2
,
"seqpool_concat_fuse_pass"
);
}
}
void
profile
(
bool
use_mkldnn
=
false
)
{
}
// namespace seq_pool1_tester
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
,
use_mkldnn
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
}
TEST
(
Analyzer_seq_pool1
,
profile
)
{
profile
();
}
// Compare result of NativeConfig and AnalysisConfig
TEST
(
Analyzer_seq_pool1
,
compare
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
// Compare Deterministic result
TEST
(
Analyzer_seq_pool1
,
compare_determine
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
CompareDeterministic
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
// Check the fuse status
TEST
(
Analyzer_seq_pool1
,
fuse_statis
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
int
num_ops
;
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
fuse_statis
=
GetFuseStatis
(
predictor
.
get
(),
&
num_ops
);
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"seqpool_concat_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"squared_mat_sub_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"repeated_fc_relu_fuse"
));
ASSERT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
10
);
EXPECT_EQ
(
fuse_statis
.
at
(
"seqpool_concat_fuse"
),
2
);
EXPECT_EQ
(
fuse_statis
.
at
(
"squared_mat_sub_fuse"
),
2
);
EXPECT_EQ
(
fuse_statis
.
at
(
"repeated_fc_relu_fuse"
),
2
);
LOG
(
INFO
)
<<
"num_ops: "
<<
num_ops
;
EXPECT_EQ
(
num_ops
,
171
);
}
// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
TEST
(
Analyzer_seq_pool1
,
compare_zero_copy
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
AnalysisConfig
cfg1
;
SetConfig
(
&
cfg1
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
std
::
vector
<
std
::
string
>
outputs_name
;
outputs_name
.
emplace_back
(
out_var_name
);
CompareAnalysisAndZeroCopy
(
reinterpret_cast
<
PaddlePredictor
::
Config
*>
(
&
cfg
),
reinterpret_cast
<
PaddlePredictor
::
Config
*>
(
&
cfg1
),
input_slots_all
,
outputs_name
);
}
}
// namespace analysis
}
// namespace analysis
}
// namespace inference
}
// namespace inference
}
// namespace paddle
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_transformer_compare_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
namespace
transformer_tester
{
void
compare
(
bool
use_mkldnn
=
false
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
if
(
use_mkldnn
)
{
cfg
.
EnableMKLDNN
();
cfg
.
pass_builder
()
->
AppendPass
(
"fc_mkldnn_pass"
);
}
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
// Native-vs-analysis comparison with the default (non-MKLDNN) configuration.
TEST(Analyzer_Transformer, compare) {
  compare(/* use_mkldnn = */ false);
}
#ifdef PADDLE_WITH_MKLDNN
// Same comparison with MKLDNN enabled; only compiled in MKLDNN builds.
TEST(Analyzer_Transformer, compare_mkldnn) {
  const bool use_mkldnn = true;
  compare(use_mkldnn);
}
#endif
}
// namespace transformer_tester
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_transformer_fuse_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
namespace
transformer_tester
{
// Check the fuse status: builds an analysis predictor for the transformer
// model and collects its fuse statistics.
// NOTE(review): the collected statistics and num_ops are not asserted on here.
TEST(Analyzer_Transformer, fuse_statis) {
  AnalysisConfig analysis_cfg;
  SetConfig(&analysis_cfg);

  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(analysis_cfg);
  auto *analysis_predictor =
      static_cast<AnalysisPredictor *>(base_predictor.get());

  // num_ops is an out-parameter of GetFuseStatis.
  int num_ops;
  auto fuse_statis = GetFuseStatis(analysis_predictor, &num_ops);
}
}
// namespace transformer_tester
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_transformer_profile_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
namespace
transformer_tester
{
void
profile
(
bool
use_mkldnn
=
false
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
outputs
;
if
(
use_mkldnn
)
{
cfg
.
EnableMKLDNN
();
cfg
.
pass_builder
()
->
AppendPass
(
"fc_mkldnn_pass"
);
}
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
}
// Profiles the transformer model with the default (non-MKLDNN) configuration.
TEST(Analyzer_Transformer, profile) {
  profile(/* use_mkldnn = */ false);
}
#ifdef PADDLE_WITH_MKLDNN
// Profiles the transformer model with MKLDNN enabled; only compiled in
// MKLDNN builds.
TEST(Analyzer_Transformer, profile_mkldnn) {
  const bool use_mkldnn = true;
  profile(use_mkldnn);
}
#endif
}
// namespace transformer_tester
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_transformer_tester
.cc
→
paddle/fluid/inference/tests/api/analyzer_transformer_tester
_helper.h
浏览文件 @
d0c9f7e8
...
@@ -11,11 +11,16 @@
...
@@ -11,11 +11,16 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#pragma once
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace
paddle
{
namespace
paddle
{
namespace
inference
{
namespace
inference
{
namespace
analysis
{
namespace
transformer_tester
{
struct
DataRecord
{
struct
DataRecord
{
std
::
vector
<
std
::
vector
<
int64_t
>>
src_word
,
src_pos
,
trg_word
,
init_idx
;
std
::
vector
<
std
::
vector
<
int64_t
>>
src_word
,
src_pos
,
trg_word
,
init_idx
;
...
@@ -182,57 +187,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
...
@@ -182,57 +187,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
}
}
}
}
// Easy for profiling independently.
}
// namespace transformer_tester
void
profile
(
bool
use_mkldnn
=
false
)
{
}
// namespace analysis
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
outputs
;
if
(
use_mkldnn
)
{
cfg
.
EnableMKLDNN
();
cfg
.
pass_builder
()
->
AppendPass
(
"fc_mkldnn_pass"
);
}
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
}
TEST
(
Analyzer_Transformer
,
profile
)
{
profile
();
}
#ifdef PADDLE_WITH_MKLDNN
TEST
(
Analyzer_Transformer
,
profile_mkldnn
)
{
profile
(
true
);
}
#endif
// Check the fuse status
TEST
(
Analyzer_Transformer
,
fuse_statis
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
int
num_ops
;
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
fuse_statis
=
GetFuseStatis
(
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
()),
&
num_ops
);
}
// Compare result of NativeConfig and AnalysisConfig
void
compare
(
bool
use_mkldnn
=
false
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
if
(
use_mkldnn
)
{
cfg
.
EnableMKLDNN
();
cfg
.
pass_builder
()
->
AppendPass
(
"fc_mkldnn_pass"
);
}
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
TEST
(
Analyzer_Transformer
,
compare
)
{
compare
();
}
#ifdef PADDLE_WITH_MKLDNN
TEST
(
Analyzer_Transformer
,
compare_mkldnn
)
{
compare
(
true
/* use_mkldnn */
);
}
#endif
}
// namespace inference
}
// namespace inference
}
// namespace paddle
}
// namespace paddle
paddle/fluid/memory/allocation/retry_allocator_test.cc
浏览文件 @
d0c9f7e8
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include <algorithm>
#include <algorithm>
#include <chrono> // NOLINT
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
#include <condition_variable> // NOLINT
...
@@ -20,6 +21,7 @@
...
@@ -20,6 +21,7 @@
#include <string>
#include <string>
#include <thread> // NOLINT
#include <thread> // NOLINT
#include <vector>
#include <vector>
#include "gtest/gtest.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
...
@@ -45,7 +47,7 @@ TEST(RetryAllocator, RetryAllocator) {
...
@@ -45,7 +47,7 @@ TEST(RetryAllocator, RetryAllocator) {
size_t
thread_num
=
4
;
size_t
thread_num
=
4
;
size_t
sleep_time
=
40
;
size_t
sleep_time
=
40
;
size_t
extra_time
=
1
0
;
size_t
extra_time
=
2
0
;
// Reserve to perform more tests in the future
// Reserve to perform more tests in the future
std
::
vector
<
std
::
shared_ptr
<
Allocator
>>
allocators
;
std
::
vector
<
std
::
shared_ptr
<
Allocator
>>
allocators
;
...
...
paddle/fluid/operators/activation_op.cc
浏览文件 @
d0c9f7e8
...
@@ -763,10 +763,28 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
...
@@ -763,10 +763,28 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
}
}
};
};
//
// AbsGrad: dx = dy if x >= 0 else -dy
// AbsDoubleGrad: ddy = ddx if x >= 0 else -ddx
//
// Grad-op maker that describes the "abs_grad_grad" op: it wires the forward
// input X and the incoming DDX into the op and exposes DDOut as its output.
template <typename T>
class AbsDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
 public:
  using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;

 protected:
  void Apply(GradOpPtr<T> op) const override {
    op->SetType("abs_grad_grad");

    // input1: x
    op->SetInput("X", this->Input("X"));

    // input2: ddx, i.e. the gradient flowing into the X@GRAD output
    const auto x_grad_name = framework::GradVarName("X");
    op->SetInput("DDX", this->OutputGrad(x_grad_name));

    op->SetAttrMap(this->Attrs());

    // output: ddy, i.e. the gradient of the Out@GRAD input
    const auto out_grad_name = framework::GradVarName("Out");
    op->SetOutput("DDOut", this->InputGrad(out_grad_name));
  }
};
// ReluGrad: dx = dy if y >= 0 else 0
// ReluGrad: dx = dy if y >= 0 else 0
// ReluGradGrad: ddy = ddx if y >= 0 else 0
// ReluGradGrad: ddy = ddx if y >= 0 else 0
//
template
<
typename
T
>
template
<
typename
T
>
class
ReluDoubleGradMaker
:
public
::
paddle
::
framework
::
SingleGradOpMaker
<
T
>
{
class
ReluDoubleGradMaker
:
public
::
paddle
::
framework
::
SingleGradOpMaker
<
T
>
{
public:
public:
...
@@ -1214,7 +1232,13 @@ REGISTER_OPERATOR(
...
@@ -1214,7 +1232,13 @@ REGISTER_OPERATOR(
std
::
conditional
<
ops
::
CanInplaceAct
<
ops
::
AbsGradFunctor
<
float
>>
(),
std
::
conditional
<
ops
::
CanInplaceAct
<
ops
::
AbsGradFunctor
<
float
>>
(),
ops
::
ActFwdInplaceInferer
,
void
>::
type
);
ops
::
ActFwdInplaceInferer
,
void
>::
type
);
REGISTER_OPERATOR
(
abs_grad
,
ops
::
ActivationOpGrad
,
REGISTER_OPERATOR
(
abs_grad
,
ops
::
ActivationOpGrad
,
ops
::
ActivationGradOpInplaceInferer
);
ops
::
ActivationGradOpInplaceInferer
,
ops
::
AbsDoubleGradMaker
<
paddle
::
framework
::
OpDesc
>
,
ops
::
AbsDoubleGradMaker
<
paddle
::
imperative
::
OpBase
>
);
REGISTER_OPERATOR
(
abs_grad_grad
,
ops
::
ActivationOpDoubleGrad
<
ops
::
AbsGradGradFunctor
<
float
>::
FwdDeps
()
>
,
ops
::
ActivationDoubleGradOpInplaceInferer
);
REGISTER_OP_CPU_KERNEL
(
abs
,
REGISTER_OP_CPU_KERNEL
(
abs
,
ops
::
ActivationKernel
<
paddle
::
platform
::
CPUDeviceContext
,
ops
::
ActivationKernel
<
paddle
::
platform
::
CPUDeviceContext
,
...
@@ -1234,6 +1258,18 @@ REGISTER_OP_CPU_KERNEL(
...
@@ -1234,6 +1258,18 @@ REGISTER_OP_CPU_KERNEL(
ops
::
AbsGradFunctor
<
int
>>
,
ops
::
AbsGradFunctor
<
int
>>
,
ops
::
ActivationGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
ops
::
ActivationGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
ops
::
AbsGradFunctor
<
int64_t
>>
);
ops
::
AbsGradFunctor
<
int64_t
>>
);
REGISTER_OP_CPU_KERNEL
(
abs_grad_grad
,
ops
::
ActivationDoubleGradKernel
<
plat
::
CPUDeviceContext
,
ops
::
AbsGradGradFunctor
<
float
>>
,
ops
::
ActivationDoubleGradKernel
<
plat
::
CPUDeviceContext
,
ops
::
AbsGradGradFunctor
<
double
>>
,
ops
::
ActivationDoubleGradKernel
<
plat
::
CPUDeviceContext
,
ops
::
AbsGradGradFunctor
<
plat
::
float16
>>
,
ops
::
ActivationDoubleGradKernel
<
plat
::
CPUDeviceContext
,
ops
::
AbsGradGradFunctor
<
int
>>
,
ops
::
ActivationDoubleGradKernel
<
plat
::
CPUDeviceContext
,
ops
::
AbsGradGradFunctor
<
int64_t
>>
);
/* ========================================================================== */
/* ========================================================================== */
/* ========================== register checkpoint ===========================*/
/* ========================== register checkpoint ===========================*/
...
...
paddle/fluid/operators/activation_op.cu
浏览文件 @
d0c9f7e8
...
@@ -160,7 +160,7 @@ REGISTER_OP_CUDA_KERNEL(
...
@@ -160,7 +160,7 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
ExpGradFunctor
<
plat
::
float16
>>
);
ops
::
ExpGradFunctor
<
plat
::
float16
>>
);
/* ========================================================================== */
/* ========================================================================== */
/* ==========================
exp
register ============================ */
/* ==========================
abs
register ============================ */
REGISTER_OP_CUDA_KERNEL
(
REGISTER_OP_CUDA_KERNEL
(
abs
,
ops
::
ActivationKernel
<
plat
::
CUDADeviceContext
,
ops
::
AbsFunctor
<
float
>>
,
abs
,
ops
::
ActivationKernel
<
plat
::
CUDADeviceContext
,
ops
::
AbsFunctor
<
float
>>
,
...
@@ -180,4 +180,16 @@ REGISTER_OP_CUDA_KERNEL(
...
@@ -180,4 +180,16 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
AbsGradFunctor
<
int64_t
>>
,
ops
::
AbsGradFunctor
<
int64_t
>>
,
ops
::
ActivationGradKernel
<
plat
::
CUDADeviceContext
,
ops
::
ActivationGradKernel
<
plat
::
CUDADeviceContext
,
ops
::
AbsGradFunctor
<
plat
::
float16
>>
);
ops
::
AbsGradFunctor
<
plat
::
float16
>>
);
REGISTER_OP_CUDA_KERNEL
(
abs_grad_grad
,
ops
::
ActivationDoubleGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
ops
::
AbsGradGradFunctor
<
float
>>
,
ops
::
ActivationDoubleGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
ops
::
AbsGradGradFunctor
<
double
>>
,
ops
::
ActivationDoubleGradKernel
<
plat
::
CUDADeviceContext
,
ops
::
AbsGradGradFunctor
<
plat
::
float16
>>
,
ops
::
ActivationDoubleGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
ops
::
AbsGradGradFunctor
<
int
>>
,
ops
::
ActivationDoubleGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
ops
::
AbsGradGradFunctor
<
int64_t
>>
);
/* ========================================================================== */
/* ========================================================================== */
paddle/fluid/operators/activation_op.h
浏览文件 @
d0c9f7e8
...
@@ -1430,6 +1430,27 @@ class ActivationDoubleGradKernel
...
@@ -1430,6 +1430,27 @@ class ActivationDoubleGradKernel
}
}
};
};
// Double-grad functor for abs: writes ddOut = ddX * sign(X) element-wise.
// Out, dOut and dX are accepted for signature compatibility but not used.
template <typename T>
struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device>
  void operator()(const Device& dev, const framework::Tensor* X,
                  const framework::Tensor* Out, const framework::Tensor* ddX,
                  framework::Tensor* ddOut, framework::Tensor* dOut,
                  framework::Tensor* dX) const {
    auto* eigen_dev = dev.eigen_device();

    // Both inputs are fetched through GET_DATA_SAFELY before ddOut is
    // checked, exactly as in the original.
    auto ddx_flat = framework::EigenVector<T>::Flatten(
        GET_DATA_SAFELY(ddX, "Input", "DDX", "AbsGradGrad"));
    auto x_flat = framework::EigenVector<T>::Flatten(
        GET_DATA_SAFELY(X, "Input", "X", "AbsGradGrad"));

    // Nothing to compute when the caller did not request ddOut.
    if (!ddOut) {
      return;
    }

    auto ddout_flat = framework::EigenVector<T>::Flatten(
        GET_DATA_SAFELY(ddOut, "Output", "DDOut", "AbsGradGrad"));
    ddout_flat.device(*eigen_dev) = ddx_flat * x_flat.sign();
  }

  // The backward computation depends on the forward input X.
  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
};
template
<
typename
T
>
template
<
typename
T
>
struct
ReluGradGradFunctor
:
public
BaseActivationFunctor
<
T
>
{
struct
ReluGradGradFunctor
:
public
BaseActivationFunctor
<
T
>
{
template
<
typename
Device
>
template
<
typename
Device
>
...
...
paddle/fluid/operators/batch_norm_op.cc
浏览文件 @
d0c9f7e8
...
@@ -839,6 +839,7 @@ void BatchNormDoubleGradMaker<T>::Apply(GradOpPtr<T> op) const {
...
@@ -839,6 +839,7 @@ void BatchNormDoubleGradMaker<T>::Apply(GradOpPtr<T> op) const {
op
->
SetInput
(
"SavedMean"
,
this
->
Input
(
"SavedMean"
));
op
->
SetInput
(
"SavedMean"
,
this
->
Input
(
"SavedMean"
));
op
->
SetInput
(
"SavedVariance"
,
this
->
Input
(
"SavedVariance"
));
op
->
SetInput
(
"SavedVariance"
,
this
->
Input
(
"SavedVariance"
));
if
(
BOOST_GET_CONST
(
bool
,
this
->
GetAttr
(
"use_global_stats"
)))
{
if
(
BOOST_GET_CONST
(
bool
,
this
->
GetAttr
(
"use_global_stats"
)))
{
op
->
SetInput
(
"Mean"
,
this
->
Input
(
"Mean"
));
op
->
SetInput
(
"Variance"
,
this
->
Input
(
"Variance"
));
op
->
SetInput
(
"Variance"
,
this
->
Input
(
"Variance"
));
}
}
op
->
SetInput
(
"DDX"
,
this
->
OutputGrad
(
framework
::
GradVarName
(
"X"
)));
op
->
SetInput
(
"DDX"
,
this
->
OutputGrad
(
framework
::
GradVarName
(
"X"
)));
...
@@ -868,14 +869,19 @@ void BatchNormDoubleGradOp::InferShape(
...
@@ -868,14 +869,19 @@ void BatchNormDoubleGradOp::InferShape(
"BatchNormDoubleGrad"
);
"BatchNormDoubleGrad"
);
}
}
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"DDX"
),
"Input"
,
"DDX"
,
"BatchNormDoubleGrad"
);
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"DY"
),
"Input"
,
"DY"
,
"BatchNormDoubleGrad"
);
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"DY"
),
"Input"
,
"DY"
,
"BatchNormDoubleGrad"
);
// check output
// check output
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"DX"
),
"Output"
,
"DX"
,
"BatchNormDoubleGrad"
);
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"DX"
),
"Output"
,
"DX"
,
"BatchNormDoubleGrad"
);
const
auto
x_dims
=
ctx
->
GetInputDim
(
"X"
);
const
auto
x_dims
=
ctx
->
GetInputDim
(
"X"
);
const
int
C
=
x_dims
[
1
];
const
DataLayout
data_layout
=
framework
::
StringToDataLayout
(
ctx
->
Attrs
().
Get
<
std
::
string
>
(
"data_layout"
));
const
int
C
=
((
this
->
IsMKLDNNType
()
==
true
)
||
(
data_layout
==
DataLayout
::
kNCHW
)
?
x_dims
[
1
]
:
x_dims
[
x_dims
.
size
()
-
1
]);
if
(
ctx
->
HasOutput
(
"DX"
))
{
if
(
ctx
->
HasOutput
(
"DX"
))
{
ctx
->
SetOutputDim
(
"DX"
,
x_dims
);
ctx
->
SetOutputDim
(
"DX"
,
x_dims
);
}
}
...
@@ -957,7 +963,9 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
...
@@ -957,7 +963,9 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
Tensor
inv_var_tensor
;
Tensor
inv_var_tensor
;
if
(
use_global_stats
)
{
if
(
use_global_stats
)
{
const
auto
*
running_mean
=
ctx
.
Input
<
Tensor
>
(
"Mean"
);
const
auto
*
running_variance
=
ctx
.
Input
<
Tensor
>
(
"Variance"
);
const
auto
*
running_variance
=
ctx
.
Input
<
Tensor
>
(
"Variance"
);
mean_data
=
running_mean
->
data
<
T
>
();
inv_var_tensor
.
Resize
({
C
});
inv_var_tensor
.
Resize
({
C
});
T
*
running_inv_var_data
=
inv_var_tensor
.
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
running_inv_var_data
=
inv_var_tensor
.
mutable_data
<
T
>
(
ctx
.
GetPlace
());
...
@@ -1077,12 +1085,12 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
...
@@ -1077,12 +1085,12 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
// (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
// (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
// np.sum(dy,
// np.sum(dy,
// axis=(n,h,w)) * (x - mean) *
// axis=(n,h,w)) * (x - mean) *
// (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var -
// (np.mean(ddx, axis=(n,h,w)) - ddx)
)
+ ddr * (dy * inv_var -
// inv_var
// inv_var
// *
// *
// np.mean(dy, axis=(n,h,w)) -
// np.mean(dy, axis=(n,h,w)) -
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// axis=(n,h,w)))
)
// axis=(n,h,w)))
if
(
ddX
)
{
if
(
ddX
)
{
dx_arr
+=
dx_arr
+=
...
@@ -1176,7 +1184,8 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
...
@@ -1176,7 +1184,8 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
C
,
sample_size
);
C
,
sample_size
);
ddy_arr
.
setZero
();
ddy_arr
.
setZero
();
if
(
use_global_stats
)
{
if
(
use_global_stats
)
{
// math: ddy = r * ddx * inv_var
// math: ddy = r * ddx * inv_var + ddbias +
// ddscale * (x - mean) * inv_var
if
(
ddX
)
{
if
(
ddX
)
{
ddy_arr
=
scale_tile_data
*
ddx_arr
*
inv_var_tile_data
;
ddy_arr
=
scale_tile_data
*
ddx_arr
*
inv_var_tile_data
;
}
}
...
@@ -1196,25 +1205,29 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
...
@@ -1196,25 +1205,29 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
.
replicate
(
1
,
sample_size
)
/
.
replicate
(
1
,
sample_size
)
/
sample_size
);
sample_size
);
}
}
if
(
ddScale
&&
ddBias
)
{
}
ConstEigenVectorArrayMap
<
T
>
ddscale_arr
(
ddScale
->
data
<
T
>
(),
C
);
if
(
ddScale
)
{
Tensor
ddscale_tile
;
ConstEigenVectorArrayMap
<
T
>
ddscale_arr
(
ddScale
->
data
<
T
>
(),
C
);
ddscale_tile
.
Resize
({
C
,
sample_size
});
Tensor
ddscale_tile
;
EigenArrayMap
<
T
>
ddscale_tile_data
(
ddscale_tile
.
Resize
({
C
,
sample_size
});
ddscale_tile
.
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
C
,
sample_size
);
EigenArrayMap
<
T
>
ddscale_tile_data
(
ddscale_tile_data
=
ddscale_arr
.
replicate
(
1
,
sample_size
);
ddscale_tile
.
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
C
,
sample_size
);
ddscale_tile_data
=
ddscale_arr
.
replicate
(
1
,
sample_size
);
ddy_arr
+=
x_sub_mean_mul_invstd_arr
*
ddscale_tile_data
;
}
ConstEigenVectorArrayMap
<
T
>
ddbias_arr
(
ddBias
->
data
<
T
>
(),
C
);
if
(
ddBias
)
{
Tensor
ddbias_tile
;
ConstEigenVectorArrayMap
<
T
>
ddbias_arr
(
ddBias
->
data
<
T
>
(),
C
);
ddbias_tile
.
Resize
({
C
,
sample_size
});
Tensor
ddbias_tile
;
EigenArrayMap
<
T
>
ddbias_tile_data
(
ddbias_tile
.
Resize
({
C
,
sample_size
});
ddbias_tile
.
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
C
,
sample_size
);
EigenArrayMap
<
T
>
ddbias_tile_data
(
ddbias_tile_data
=
ddbias_arr
.
replicate
(
1
,
sample_size
);
ddbias_tile
.
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
C
,
sample_size
);
ddbias_tile_data
=
ddbias_arr
.
replicate
(
1
,
sample_size
);
ddy_arr
+=
x_sub_mean_mul_invstd_arr
*
ddscale_tile_data
;
ddy_arr
+=
ddbias_tile_data
;
ddy_arr
+=
ddbias_tile_data
;
}
}
}
if
(
data_layout
==
DataLayout
::
kNCHW
)
{
if
(
data_layout
==
DataLayout
::
kNCHW
)
{
VLOG
(
3
)
<<
"Transform batchnorm output from NHWC to NCHW"
;
VLOG
(
3
)
<<
"Transform batchnorm output from NHWC to NCHW"
;
TransToChannelFirst
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
(
TransToChannelFirst
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
(
...
...
paddle/fluid/operators/benchmark/op_tester.cc
浏览文件 @
d0c9f7e8
...
@@ -47,8 +47,8 @@ void OpTester::Init(const OpTesterConfig &config) {
...
@@ -47,8 +47,8 @@ void OpTester::Init(const OpTesterConfig &config) {
CreateInputVarDesc
();
CreateInputVarDesc
();
CreateOutputVarDesc
();
CreateOutputVarDesc
();
}
else
{
}
else
{
PADDLE_THROW
(
platform
::
errors
::
NotFound
(
"Operator '%s' is not registered."
,
PADDLE_THROW
(
platform
::
errors
::
NotFound
(
config_
.
op_type
));
"Operator '%s' is not registered in OpTester."
,
config_
.
op_type
));
}
}
if
(
config_
.
device_id
>=
0
)
{
if
(
config_
.
device_id
>=
0
)
{
...
@@ -81,7 +81,8 @@ void OpTester::Run() {
...
@@ -81,7 +81,8 @@ void OpTester::Run() {
platform
::
EnableProfiler
(
platform
::
ProfilerState
::
kAll
);
platform
::
EnableProfiler
(
platform
::
ProfilerState
::
kAll
);
platform
::
SetDeviceId
(
config_
.
device_id
);
platform
::
SetDeviceId
(
config_
.
device_id
);
#else
#else
PADDLE_THROW
(
"'CUDAPlace' is not supported in CPU only device."
);
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"'CUDAPlace' is not supported in CPU only device."
));
#endif
#endif
}
}
...
@@ -162,7 +163,8 @@ framework::proto::VarType::Type OpTester::TransToVarType(std::string str) {
...
@@ -162,7 +163,8 @@ framework::proto::VarType::Type OpTester::TransToVarType(std::string str) {
}
else
if
(
str
==
"fp64"
)
{
}
else
if
(
str
==
"fp64"
)
{
return
framework
::
proto
::
VarType
::
FP64
;
return
framework
::
proto
::
VarType
::
FP64
;
}
else
{
}
else
{
PADDLE_THROW
(
"Unsupported dtype %s."
,
str
.
c_str
());
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupported dtype %s in OpTester."
,
str
.
c_str
()));
}
}
}
}
...
@@ -233,8 +235,8 @@ void OpTester::CreateOpDesc() {
...
@@ -233,8 +235,8 @@ void OpTester::CreateOpDesc() {
case
framework
::
proto
::
AttrType
::
INTS
:
case
framework
::
proto
::
AttrType
::
INTS
:
case
framework
::
proto
::
AttrType
::
FLOATS
:
case
framework
::
proto
::
AttrType
::
FLOATS
:
case
framework
::
proto
::
AttrType
::
STRINGS
:
case
framework
::
proto
::
AttrType
::
STRINGS
:
PADDLE_THROW
(
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
platform
::
errors
::
Unimplemented
(
"Not supported STRINGS type
yet."
));
"Unsupported STRINGS type in OpTester
yet."
));
break
;
break
;
case
framework
::
proto
::
AttrType
::
LONG
:
{
case
framework
::
proto
::
AttrType
::
LONG
:
{
int64_t
value
=
StringTo
<
int64_t
>
(
value_str
);
int64_t
value
=
StringTo
<
int64_t
>
(
value_str
);
...
@@ -242,7 +244,8 @@ void OpTester::CreateOpDesc() {
...
@@ -242,7 +244,8 @@ void OpTester::CreateOpDesc() {
}
break
;
}
break
;
case
framework
::
proto
::
AttrType
::
LONGS
:
case
framework
::
proto
::
AttrType
::
LONGS
:
default:
default:
PADDLE_THROW
(
"Unsupport attr type %d"
,
type
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupport attr type %d in OpTester."
,
type
));
}
}
}
}
}
}
...
@@ -299,7 +302,8 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor,
...
@@ -299,7 +302,8 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor,
}
}
is
.
close
();
is
.
close
();
}
else
{
}
else
{
PADDLE_THROW
(
"Unsupported initializer %s."
,
initializer
.
c_str
());
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupported initializer %s in OpTester."
,
initializer
.
c_str
()));
}
}
if
(
!
platform
::
is_cpu_place
(
place_
))
{
if
(
!
platform
::
is_cpu_place
(
place_
))
{
...
@@ -351,7 +355,8 @@ void OpTester::CreateVariables(framework::Scope *scope) {
...
@@ -351,7 +355,8 @@ void OpTester::CreateVariables(framework::Scope *scope) {
static_cast
<
double
>
(
1.0
),
item
.
second
.
initializer
,
static_cast
<
double
>
(
1.0
),
item
.
second
.
initializer
,
item
.
second
.
filename
);
item
.
second
.
filename
);
}
else
{
}
else
{
PADDLE_THROW
(
"Unsupported dtype %d."
,
data_type
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupported dtype %d in OpTester."
,
data_type
));
}
}
VLOG
(
3
)
<<
"Set lod for tensor "
<<
var_name
;
VLOG
(
3
)
<<
"Set lod for tensor "
<<
var_name
;
...
@@ -473,7 +478,8 @@ std::string OpTester::DebugString() {
...
@@ -473,7 +478,8 @@ std::string OpTester::DebugString() {
<<
"
\n
"
;
<<
"
\n
"
;
}
break
;
}
break
;
default:
default:
PADDLE_THROW
(
"Unsupport attr type %d"
,
attr_type
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupport attr type %d in OpTester."
,
attr_type
));
}
}
ss
<<
GenSpaces
(
--
count
)
<<
"}
\n
"
;
ss
<<
GenSpaces
(
--
count
)
<<
"}
\n
"
;
}
}
...
@@ -484,8 +490,10 @@ std::string OpTester::DebugString() {
...
@@ -484,8 +490,10 @@ std::string OpTester::DebugString() {
TEST
(
op_tester
,
base
)
{
TEST
(
op_tester
,
base
)
{
if
(
!
FLAGS_op_config_list
.
empty
())
{
if
(
!
FLAGS_op_config_list
.
empty
())
{
std
::
ifstream
fin
(
FLAGS_op_config_list
,
std
::
ios
::
in
|
std
::
ios
::
binary
);
std
::
ifstream
fin
(
FLAGS_op_config_list
,
std
::
ios
::
in
|
std
::
ios
::
binary
);
PADDLE_ENFORCE
(
static_cast
<
bool
>
(
fin
),
"Cannot open file %s"
,
PADDLE_ENFORCE_EQ
(
FLAGS_op_config_list
.
c_str
());
static_cast
<
bool
>
(
fin
),
true
,
platform
::
errors
::
InvalidArgument
(
"OpTester cannot open file %s"
,
FLAGS_op_config_list
.
c_str
()));
std
::
vector
<
OpTesterConfig
>
op_configs
;
std
::
vector
<
OpTesterConfig
>
op_configs
;
while
(
!
fin
.
eof
())
{
while
(
!
fin
.
eof
())
{
VLOG
(
4
)
<<
"Reading config "
<<
op_configs
.
size
()
<<
"..."
;
VLOG
(
4
)
<<
"Reading config "
<<
op_configs
.
size
()
<<
"..."
;
...
...
paddle/fluid/operators/benchmark/op_tester_config.cc
浏览文件 @
d0c9f7e8
...
@@ -78,7 +78,8 @@ void OpInputConfig::ParseDType(std::istream& is) {
...
@@ -78,7 +78,8 @@ void OpInputConfig::ParseDType(std::istream& is) {
}
else
if
(
dtype_str
==
"fp64"
||
dtype_str
==
"double"
)
{
}
else
if
(
dtype_str
==
"fp64"
||
dtype_str
==
"double"
)
{
dtype
=
"fp64"
;
dtype
=
"fp64"
;
}
else
{
}
else
{
PADDLE_THROW
(
"Unsupported dtype %s"
,
dtype_str
.
c_str
());
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupported dtype %s in OpInputConfig."
,
dtype_str
.
c_str
()));
}
}
VLOG
(
4
)
<<
"dtype of input "
<<
name
<<
" is: "
<<
dtype
;
VLOG
(
4
)
<<
"dtype of input "
<<
name
<<
" is: "
<<
dtype
;
}
}
...
@@ -91,7 +92,9 @@ void OpInputConfig::ParseInitializer(std::istream& is) {
...
@@ -91,7 +92,9 @@ void OpInputConfig::ParseInitializer(std::istream& is) {
const
std
::
vector
<
std
::
string
>
supported_initializers
=
{
"random"
,
"natural"
,
const
std
::
vector
<
std
::
string
>
supported_initializers
=
{
"random"
,
"natural"
,
"zeros"
,
"file"
};
"zeros"
,
"file"
};
if
(
!
Has
(
supported_initializers
,
initializer_str
))
{
if
(
!
Has
(
supported_initializers
,
initializer_str
))
{
PADDLE_THROW
(
"Unsupported initializer %s"
,
initializer_str
.
c_str
());
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupported initializer %s in OpInputConfig."
,
initializer_str
.
c_str
()));
}
}
initializer
=
initializer_str
;
initializer
=
initializer_str
;
...
@@ -126,7 +129,12 @@ void OpInputConfig::ParseLoD(std::istream& is) {
...
@@ -126,7 +129,12 @@ void OpInputConfig::ParseLoD(std::istream& is) {
}
}
}
}
EraseEndSep
(
&
lod_str
);
EraseEndSep
(
&
lod_str
);
PADDLE_ENFORCE_GE
(
lod_str
.
length
(),
4U
);
PADDLE_ENFORCE_GE
(
lod_str
.
length
(),
4U
,
platform
::
errors
::
InvalidArgument
(
"The length of lod string should be "
"equal to or larger than 4. But length of lod string is %zu."
,
lod_str
.
length
()));
VLOG
(
4
)
<<
"lod: "
<<
lod_str
<<
", length: "
<<
lod_str
.
length
();
VLOG
(
4
)
<<
"lod: "
<<
lod_str
<<
", length: "
<<
lod_str
.
length
();
// Parse the lod_str
// Parse the lod_str
...
@@ -153,8 +161,10 @@ void OpInputConfig::ParseLoD(std::istream& is) {
...
@@ -153,8 +161,10 @@ void OpInputConfig::ParseLoD(std::istream& is) {
OpTesterConfig
::
OpTesterConfig
(
const
std
::
string
&
filename
)
{
OpTesterConfig
::
OpTesterConfig
(
const
std
::
string
&
filename
)
{
std
::
ifstream
fin
(
filename
,
std
::
ios
::
in
|
std
::
ios
::
binary
);
std
::
ifstream
fin
(
filename
,
std
::
ios
::
in
|
std
::
ios
::
binary
);
PADDLE_ENFORCE
(
static_cast
<
bool
>
(
fin
),
"Cannot open file %s"
,
PADDLE_ENFORCE_EQ
(
filename
.
c_str
());
static_cast
<
bool
>
(
fin
),
true
,
platform
::
errors
::
InvalidArgument
(
"OpTesterConfig cannot open file %s."
,
filename
.
c_str
()));
Init
(
fin
);
Init
(
fin
);
}
}
...
...
paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
浏览文件 @
d0c9f7e8
...
@@ -54,8 +54,10 @@ class CSyncCommStreamOp : public framework::OperatorBase {
...
@@ -54,8 +54,10 @@ class CSyncCommStreamOp : public framework::OperatorBase {
class
CSyncCommStreamOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
class
CSyncCommStreamOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
public:
void
Make
()
{
void
Make
()
{
AddInput
(
"X"
,
"(Tensor) Dependency of the variable need to sync"
);
AddInput
(
"X"
,
"(Tensor) Dependency of the variable need to sync"
)
AddOutput
(
"Out"
,
"(Tensor) Dependency of the variable need to sync"
);
.
AsDuplicable
();
AddOutput
(
"Out"
,
"(Tensor) Dependency of the variable need to sync"
)
.
AsDuplicable
();
AddAttr
<
int
>
(
"ring_id"
,
"(int default 0) ring id."
).
SetDefault
(
0
);
AddAttr
<
int
>
(
"ring_id"
,
"(int default 0) ring id."
).
SetDefault
(
0
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
CSyncCommStream Operator
CSyncCommStream Operator
...
...
paddle/fluid/operators/instance_norm_op.cc
浏览文件 @
d0c9f7e8
...
@@ -520,11 +520,11 @@ class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T>
...
@@ -520,11 +520,11 @@ class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T>
// (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW *
// (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW *
// np.sum(dy,
// np.sum(dy,
// axis=(h,w)) * (x - mean) *
// axis=(h,w)) * (x - mean) *
// (np.mean(ddx, axis=(h,w)) - ddx)
+ ddr * (dy * inv_var - inv_var
// (np.mean(ddx, axis=(h,w)) - ddx)
) + ddr * (dy * inv_var -
// *
//
inv_var
*
// np.mean(dy, axis=(h,w)) -
// np.mean(dy, axis=(h,w)) -
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// axis=(h,w)))
)
// axis=(h,w)))
Tensor
x_sub_mean_mul_invstd
;
Tensor
x_sub_mean_mul_invstd
;
x_sub_mean_mul_invstd
.
Resize
({
sample_size
,
NxC
});
x_sub_mean_mul_invstd
.
Resize
({
sample_size
,
NxC
});
...
...
paddle/fluid/operators/jit/benchmark.cc
浏览文件 @
d0c9f7e8
...
@@ -136,7 +136,6 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) {
...
@@ -136,7 +136,6 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) {
}
}
using
Tensor
=
paddle
::
framework
::
Tensor
;
using
Tensor
=
paddle
::
framework
::
Tensor
;
template
<
typename
KernelTuple
,
typename
PlaceType
>
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchKernelXYZN
()
{
void
BenchKernelXYZN
()
{
using
T
=
typename
KernelTuple
::
data_type
;
using
T
=
typename
KernelTuple
::
data_type
;
...
@@ -320,8 +319,15 @@ void BenchKernelSgd() {
...
@@ -320,8 +319,15 @@ void BenchKernelSgd() {
const
T
lr
=
0.1
;
const
T
lr
=
0.1
;
auto
UnDuplicatedRandomVec
=
[](
int
n
,
const
int64_t
lower
,
auto
UnDuplicatedRandomVec
=
[](
int
n
,
const
int64_t
lower
,
const
int64_t
upper
)
->
std
::
vector
<
int64_t
>
{
const
int64_t
upper
)
->
std
::
vector
<
int64_t
>
{
PADDLE_ENFORCE_LE
(
static_cast
<
size_t
>
(
upper
-
lower
),
n
-
1
);
PADDLE_ENFORCE_LE
(
PADDLE_ENFORCE_GT
(
n
,
0
);
static_cast
<
size_t
>
(
upper
-
lower
),
n
-
1
,
paddle
::
platform
::
errors
::
InvalidArgument
(
"The range of Sgd (upper - lower) should be equal to or lower "
"than n-1 (Sgd size -1). But upper - lower is %d and n-1 is %d."
,
static_cast
<
size_t
>
(
upper
-
lower
),
(
n
-
1
)));
PADDLE_ENFORCE_GT
(
n
,
0
,
paddle
::
platform
::
errors
::
InvalidArgument
(
"The Sgd size should be larger than 0. But the n is %d."
,
n
));
std
::
vector
<
int64_t
>
all
,
out
;
std
::
vector
<
int64_t
>
all
,
out
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
all
.
push_back
(
i
);
all
.
push_back
(
i
);
...
...
paddle/fluid/operators/jit/gen/embseqpool.cc
浏览文件 @
d0c9f7e8
...
@@ -132,11 +132,31 @@ class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
...
@@ -132,11 +132,31 @@ class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
}
}
std
::
unique_ptr
<
GenBase
>
CreateJitCode
(
std
::
unique_ptr
<
GenBase
>
CreateJitCode
(
const
emb_seq_pool_attr_t
&
attr
)
const
override
{
const
emb_seq_pool_attr_t
&
attr
)
const
override
{
PADDLE_ENFORCE_GT
(
attr
.
table_height
,
0
);
PADDLE_ENFORCE_GT
(
attr
.
table_height
,
0
,
PADDLE_ENFORCE_GT
(
attr
.
table_width
,
0
);
platform
::
errors
::
InvalidArgument
(
PADDLE_ENFORCE_GT
(
attr
.
index_height
,
0
);
"The attribute table_height of EmbSeqPool should "
PADDLE_ENFORCE_GT
(
attr
.
index_width
,
0
);
"be larger than 0. But it is %d."
,
PADDLE_ENFORCE_GT
(
attr
.
out_width
,
0
);
attr
.
table_height
));
PADDLE_ENFORCE_GT
(
attr
.
table_width
,
0
,
platform
::
errors
::
InvalidArgument
(
"The attribute table_width of EmbSeqPool should "
"be larger than 0. But it is %d."
,
attr
.
table_width
));
PADDLE_ENFORCE_GT
(
attr
.
index_height
,
0
,
platform
::
errors
::
InvalidArgument
(
"The attribute index_height of EmbSeqPool should "
"be larger than 0. But it is %d."
,
attr
.
index_height
));
PADDLE_ENFORCE_GT
(
attr
.
index_width
,
0
,
platform
::
errors
::
InvalidArgument
(
"The attribute index_width of EmbSeqPool should "
"be larger than 0. But it is %d."
,
attr
.
index_width
));
PADDLE_ENFORCE_GT
(
attr
.
out_width
,
0
,
platform
::
errors
::
InvalidArgument
(
"The attribute out_width of EmbSeqPool should be "
"larger than 0. But it is %d."
,
attr
.
out_width
));
return
make_unique
<
EmbSeqPoolJitCode
>
(
attr
,
CodeSize
(
attr
));
return
make_unique
<
EmbSeqPoolJitCode
>
(
attr
,
CodeSize
(
attr
));
}
}
};
};
...
...
paddle/fluid/operators/jit/gen/matmul.cc
浏览文件 @
d0c9f7e8
...
@@ -29,7 +29,11 @@ void MatMulJitCode::genCode() {
...
@@ -29,7 +29,11 @@ void MatMulJitCode::genCode() {
preCode
();
preCode
();
int
block
,
rest
;
int
block
,
rest
;
const
auto
groups
=
packed_groups
(
n_
,
k_
,
&
block
,
&
rest
);
const
auto
groups
=
packed_groups
(
n_
,
k_
,
&
block
,
&
rest
);
PADDLE_ENFORCE_GT
(
groups
.
front
(),
0
);
PADDLE_ENFORCE_GT
(
groups
.
front
(),
0
,
platform
::
errors
::
InvalidArgument
(
"The number of rest registers should "
"be larger than 0. But it is %d."
,
groups
.
front
()));
const
int
block_len
=
sizeof
(
float
)
*
block
;
const
int
block_len
=
sizeof
(
float
)
*
block
;
const
int
x_reg_idx
=
(
block
==
ZMM_FLOAT_BLOCK
?
32
:
16
)
-
1
;
const
int
x_reg_idx
=
(
block
==
ZMM_FLOAT_BLOCK
?
32
:
16
)
-
1
;
...
@@ -118,9 +122,21 @@ class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
...
@@ -118,9 +122,21 @@ class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
}
}
std
::
unique_ptr
<
GenBase
>
CreateJitCode
(
std
::
unique_ptr
<
GenBase
>
CreateJitCode
(
const
matmul_attr_t
&
attr
)
const
override
{
const
matmul_attr_t
&
attr
)
const
override
{
PADDLE_ENFORCE_GT
(
attr
.
m
,
0
);
PADDLE_ENFORCE_GT
(
PADDLE_ENFORCE_GT
(
attr
.
n
,
0
);
attr
.
m
,
0
,
platform
::
errors
::
InvalidArgument
(
PADDLE_ENFORCE_GT
(
attr
.
k
,
0
);
"The attribute m (first matrix's row) of MatMul should "
"be larger than 0. But it is %d."
,
attr
.
m
));
PADDLE_ENFORCE_GT
(
attr
.
n
,
0
,
platform
::
errors
::
InvalidArgument
(
"The attribute n (first matrix's col) of MatMul should "
"be larger than 0. But it is %d."
,
attr
.
n
));
PADDLE_ENFORCE_GT
(
attr
.
k
,
0
,
platform
::
errors
::
InvalidArgument
(
"The attribute k (second matrix's col) of MatMul should "
"be larger than 0. But it is %d."
,
attr
.
k
));
return
make_unique
<
MatMulJitCode
>
(
attr
,
CodeSize
(
attr
));
return
make_unique
<
MatMulJitCode
>
(
attr
,
CodeSize
(
attr
));
}
}
};
};
...
...
paddle/fluid/operators/jit/gen/matmul.h
浏览文件 @
d0c9f7e8
...
@@ -33,7 +33,10 @@ class MatMulJitCode : public JitCode {
...
@@ -33,7 +33,10 @@ class MatMulJitCode : public JitCode {
size_t
code_size
=
256
*
1024
,
size_t
code_size
=
256
*
1024
,
void
*
code_ptr
=
nullptr
)
void
*
code_ptr
=
nullptr
)
:
JitCode
(
code_size
,
code_ptr
),
m_
(
attr
.
m
),
n_
(
attr
.
n
),
k_
(
attr
.
k
)
{
:
JitCode
(
code_size
,
code_ptr
),
m_
(
attr
.
m
),
n_
(
attr
.
n
),
k_
(
attr
.
k
)
{
PADDLE_ENFORCE_EQ
(
m_
,
1
,
"Only support m==1 yet"
);
PADDLE_ENFORCE_EQ
(
m_
,
1
,
platform
::
errors
::
Unimplemented
(
"Jitcode of matmul only support m==1 (first "
"matrix's row) now. But m is %d."
,
m_
));
this
->
genCode
();
this
->
genCode
();
}
}
...
...
paddle/fluid/operators/jit/gen/seqpool.cc
浏览文件 @
d0c9f7e8
...
@@ -70,8 +70,14 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
...
@@ -70,8 +70,14 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
}
}
std
::
unique_ptr
<
GenBase
>
CreateJitCode
(
std
::
unique_ptr
<
GenBase
>
CreateJitCode
(
const
seq_pool_attr_t
&
attr
)
const
override
{
const
seq_pool_attr_t
&
attr
)
const
override
{
PADDLE_ENFORCE_GT
(
attr
.
w
,
0
);
PADDLE_ENFORCE_GT
(
attr
.
w
,
0
,
platform
::
errors
::
InvalidArgument
(
PADDLE_ENFORCE_GT
(
attr
.
h
,
0
);
"The attribute width of SeqPool should "
"be larger than 0. But it is %d."
,
attr
.
w
));
PADDLE_ENFORCE_GT
(
attr
.
h
,
0
,
platform
::
errors
::
InvalidArgument
(
"The attribute height of SeqPool should "
"be larger than 0. But it is %d."
,
attr
.
h
));
return
make_unique
<
SeqPoolJitCode
>
(
attr
,
CodeSize
(
attr
));
return
make_unique
<
SeqPoolJitCode
>
(
attr
,
CodeSize
(
attr
));
}
}
};
};
...
...
paddle/fluid/operators/jit/gen/seqpool.h
浏览文件 @
d0c9f7e8
...
@@ -127,8 +127,13 @@ class SeqPoolJitCode : public JitCode {
...
@@ -127,8 +127,13 @@ class SeqPoolJitCode : public JitCode {
vmovss
(
xmm_t
(
reg_idx
+
max_num_regs
),
ptr
[
reg_ptr_src_i
]);
vmovss
(
xmm_t
(
reg_idx
+
max_num_regs
),
ptr
[
reg_ptr_src_i
]);
reg_idx
++
;
reg_idx
++
;
}
}
PADDLE_ENFORCE_EQ
(
reg_idx
,
rest_used_num_regs
,
PADDLE_ENFORCE_EQ
(
"All heights should use same regs"
);
reg_idx
,
rest_used_num_regs
,
platform
::
errors
::
InvalidArgument
(
"All heights of SeqPool should use the same number of registers."
"It equals to the numbr of rest registers. But use %d registers "
"and the numbr of rest registers is %d."
,
reg_idx
,
rest_used_num_regs
));
for
(
int
i
=
0
;
i
<
reg_idx
;
++
i
)
{
for
(
int
i
=
0
;
i
<
reg_idx
;
++
i
)
{
vaddps
(
xmm_t
(
i
),
xmm_t
(
i
),
xmm_t
(
i
+
max_num_regs
));
vaddps
(
xmm_t
(
i
),
xmm_t
(
i
),
xmm_t
(
i
+
max_num_regs
));
}
}
...
...
paddle/fluid/operators/jit/gen/sgd.cc
浏览文件 @
d0c9f7e8
...
@@ -116,9 +116,24 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> {
...
@@ -116,9 +116,24 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> {
size_t
CodeSize
(
const
sgd_attr_t
&
attr
)
const
override
{
return
96
+
32
*
8
;
}
size_t
CodeSize
(
const
sgd_attr_t
&
attr
)
const
override
{
return
96
+
32
*
8
;
}
std
::
unique_ptr
<
GenBase
>
CreateJitCode
(
std
::
unique_ptr
<
GenBase
>
CreateJitCode
(
const
sgd_attr_t
&
attr
)
const
override
{
const
sgd_attr_t
&
attr
)
const
override
{
PADDLE_ENFORCE_EQ
(
attr
.
param_width
,
attr
.
grad_width
);
PADDLE_ENFORCE_EQ
(
attr
.
param_width
,
attr
.
grad_width
,
PADDLE_ENFORCE_LE
(
attr
.
selected_rows_size
,
attr
.
grad_height
);
platform
::
errors
::
InvalidArgument
(
PADDLE_ENFORCE_GE
(
attr
.
selected_rows_size
,
0
);
"The attribute param_width of Sgd should be "
"equal to the attribute grad_width. But param_width "
"is %d and grad_width is %d."
,
attr
.
param_width
,
attr
.
grad_width
));
PADDLE_ENFORCE_LE
(
attr
.
selected_rows_size
,
attr
.
grad_height
,
platform
::
errors
::
InvalidArgument
(
"The attribute selected_rows_size of Sgd should be "
"equal to or less than the attribute grad_height. "
"But selected_rows_size is %d and grad_height is %d."
,
attr
.
selected_rows_size
,
attr
.
grad_height
));
PADDLE_ENFORCE_GE
(
attr
.
selected_rows_size
,
0
,
platform
::
errors
::
InvalidArgument
(
"The attribute selected_rows_size of Sgd should be "
"equal to or larger than 0. But selected_rows_size is %d."
,
attr
.
selected_rows_size
));
return
make_unique
<
SgdJitCode
>
(
attr
,
CodeSize
(
attr
));
return
make_unique
<
SgdJitCode
>
(
attr
,
CodeSize
(
attr
));
}
}
};
};
...
...
paddle/fluid/operators/jit/gen/vbroadcast.cc
浏览文件 @
d0c9f7e8
...
@@ -76,7 +76,11 @@ class VBroadcastCreator : public JitCodeCreator<int64_t> {
...
@@ -76,7 +76,11 @@ class VBroadcastCreator : public JitCodeCreator<int64_t> {
return
96
+
(
w
/
YMM_FLOAT_BLOCK
)
*
16
*
8
;
return
96
+
(
w
/
YMM_FLOAT_BLOCK
)
*
16
*
8
;
}
}
std
::
unique_ptr
<
GenBase
>
CreateJitCode
(
const
int64_t
&
w
)
const
override
{
std
::
unique_ptr
<
GenBase
>
CreateJitCode
(
const
int64_t
&
w
)
const
override
{
PADDLE_ENFORCE_GT
(
w
,
0
);
PADDLE_ENFORCE_GT
(
w
,
0
,
platform
::
errors
::
InvalidArgument
(
"The width of VBroadcast should be larger than 0. But w is %d."
,
w
));
return
make_unique
<
VBroadcastJitCode
>
(
w
,
CodeSize
(
w
));
return
make_unique
<
VBroadcastJitCode
>
(
w
,
CodeSize
(
w
));
}
}
};
};
...
...
paddle/fluid/operators/jit/gen_base.cc
浏览文件 @
d0c9f7e8
...
@@ -49,9 +49,14 @@ void GenBase::dumpCode(const unsigned char* code) const {
...
@@ -49,9 +49,14 @@ void GenBase::dumpCode(const unsigned char* code) const {
void
*
GenBase
::
operator
new
(
size_t
size
)
{
void
*
GenBase
::
operator
new
(
size_t
size
)
{
void
*
ptr
;
void
*
ptr
;
constexpr
size_t
alignment
=
32ul
;
constexpr
size_t
alignment
=
32ul
;
PADDLE_ENFORCE_EQ
(
posix_memalign
(
&
ptr
,
alignment
,
size
),
0
,
PADDLE_ENFORCE_EQ
(
"GenBase Alloc %ld error!"
,
size
);
posix_memalign
(
&
ptr
,
alignment
,
size
),
0
,
PADDLE_ENFORCE
(
ptr
,
"Fail to allocate GenBase CPU memory: size = %d ."
,
size
);
platform
::
errors
::
InvalidArgument
(
"Jitcode generator (GenBase) allocate %ld memory error!"
,
size
));
PADDLE_ENFORCE_NOT_NULL
(
ptr
,
platform
::
errors
::
InvalidArgument
(
"Fail to allocate jitcode generator "
"(GenBase) CPU memory: size = %d ."
,
size
));
return
ptr
;
return
ptr
;
}
}
...
...
paddle/fluid/operators/jit/helper.cc
浏览文件 @
d0c9f7e8
...
@@ -66,7 +66,8 @@ const char* to_string(KernelType kt) {
...
@@ -66,7 +66,8 @@ const char* to_string(KernelType kt) {
ONE_CASE
(
kEmbSeqPool
);
ONE_CASE
(
kEmbSeqPool
);
ONE_CASE
(
kSgd
);
ONE_CASE
(
kSgd
);
default:
default:
PADDLE_THROW
(
"Not support type: %d, or forget to add it."
,
kt
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"JIT kernel do not support type: %d."
,
kt
));
return
"NOT JITKernel"
;
return
"NOT JITKernel"
;
}
}
return
nullptr
;
return
nullptr
;
...
@@ -79,7 +80,8 @@ const char* to_string(SeqPoolType tp) {
...
@@ -79,7 +80,8 @@ const char* to_string(SeqPoolType tp) {
ONE_CASE
(
kAvg
);
ONE_CASE
(
kAvg
);
ONE_CASE
(
kSqrt
);
ONE_CASE
(
kSqrt
);
default:
default:
PADDLE_THROW
(
"Not support type: %d, or forget to add it."
,
tp
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"SeqPool JIT kernel do not support type: %d."
,
tp
));
return
"NOT PoolType"
;
return
"NOT PoolType"
;
}
}
return
nullptr
;
return
nullptr
;
...
@@ -100,7 +102,8 @@ KernelType to_kerneltype(const std::string& act) {
...
@@ -100,7 +102,8 @@ KernelType to_kerneltype(const std::string& act) {
}
else
if
(
lower
==
"tanh"
||
lower
==
"vtanh"
)
{
}
else
if
(
lower
==
"tanh"
||
lower
==
"vtanh"
)
{
return
kVTanh
;
return
kVTanh
;
}
}
PADDLE_THROW
(
"Not support type: %s, or forget to add this case"
,
act
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Act JIT kernel do not support type: %s."
,
act
));
return
kNone
;
return
kNone
;
}
}
...
@@ -109,12 +112,19 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
...
@@ -109,12 +112,19 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
int
block
,
rest
;
int
block
,
rest
;
const
auto
groups
=
packed_groups
(
n
,
k
,
&
block
,
&
rest
);
const
auto
groups
=
packed_groups
(
n
,
k
,
&
block
,
&
rest
);
std
::
for_each
(
groups
.
begin
(),
groups
.
end
(),
[
&
](
int
i
)
{
std
::
for_each
(
groups
.
begin
(),
groups
.
end
(),
[
&
](
int
i
)
{
PADDLE_ENFORCE_GT
(
i
,
0
,
"each element of groups should be larger than 0."
);
PADDLE_ENFORCE_GT
(
i
,
0
,
platform
::
errors
::
InvalidArgument
(
"Each element of groups should be larger than "
"0. However the element: %d doesn't satify."
,
i
));
});
});
int
sum
=
std
::
accumulate
(
groups
.
begin
(),
groups
.
end
(),
0
);
int
sum
=
std
::
accumulate
(
groups
.
begin
(),
groups
.
end
(),
0
);
std
::
memset
(
dst
,
0
,
k
*
sum
*
block
*
sizeof
(
float
));
std
::
memset
(
dst
,
0
,
k
*
sum
*
block
*
sizeof
(
float
));
PADDLE_ENFORCE_GE
(
sum
*
block
,
n
,
PADDLE_ENFORCE_GE
(
sum
*
block
,
n
,
"The packed n should be equal to or larger than n"
);
platform
::
errors
::
InvalidArgument
(
"The packed n (sum * block) should be equal to or "
"larger than n (matmul row size). "
"However, the packed n is %d and n is %d."
,
sum
*
block
,
n
));
const
int
block_len
=
sizeof
(
float
)
*
block
;
const
int
block_len
=
sizeof
(
float
)
*
block
;
int
n_offset
=
0
;
int
n_offset
=
0
;
...
@@ -136,7 +146,8 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
...
@@ -136,7 +146,8 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
template
<
typename
T
>
template
<
typename
T
>
typename
std
::
enable_if
<!
std
::
is_same
<
T
,
float
>::
value
>::
type
pack_weights
(
typename
std
::
enable_if
<!
std
::
is_same
<
T
,
float
>::
value
>::
type
pack_weights
(
const
T
*
src
,
T
*
dst
,
int
n
,
int
k
)
{
const
T
*
src
,
T
*
dst
,
int
n
,
int
k
)
{
PADDLE_THROW
(
"Only support pack with float type."
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Only supports pack weights with float type."
));
}
}
}
// namespace jit
}
// namespace jit
...
...
paddle/fluid/operators/jit/helper.h
浏览文件 @
d0c9f7e8
...
@@ -85,8 +85,10 @@ inline const Kernel* GetReferKernel() {
...
@@ -85,8 +85,10 @@ inline const Kernel* GetReferKernel() {
auto
&
ref_pool
=
ReferKernelPool
::
Instance
().
AllKernels
();
auto
&
ref_pool
=
ReferKernelPool
::
Instance
().
AllKernels
();
KernelKey
kkey
(
KernelTuple
::
kernel_type
,
platform
::
CPUPlace
());
KernelKey
kkey
(
KernelTuple
::
kernel_type
,
platform
::
CPUPlace
());
auto
ref_iter
=
ref_pool
.
find
(
kkey
);
auto
ref_iter
=
ref_pool
.
find
(
kkey
);
PADDLE_ENFORCE
(
ref_iter
!=
ref_pool
.
end
(),
PADDLE_ENFORCE_NE
(
"Every Kernel should have reference function."
);
ref_iter
,
ref_pool
.
end
(),
platform
::
errors
::
PreconditionNotMet
(
"Every Refer Kernel of jitcode should have reference function."
));
auto
&
ref_impls
=
ref_iter
->
second
;
auto
&
ref_impls
=
ref_iter
->
second
;
for
(
auto
&
impl
:
ref_impls
)
{
for
(
auto
&
impl
:
ref_impls
)
{
auto
i
=
dynamic_cast
<
const
ReferKernel
<
KernelTuple
>*>
(
impl
.
get
());
auto
i
=
dynamic_cast
<
const
ReferKernel
<
KernelTuple
>*>
(
impl
.
get
());
...
@@ -101,7 +103,9 @@ template <typename KernelTuple>
...
@@ -101,7 +103,9 @@ template <typename KernelTuple>
inline
typename
KernelTuple
::
func_type
GetReferFunc
()
{
inline
typename
KernelTuple
::
func_type
GetReferFunc
()
{
auto
ker
=
GetReferKernel
<
KernelTuple
>
();
auto
ker
=
GetReferKernel
<
KernelTuple
>
();
auto
p
=
dynamic_cast
<
const
ReferKernel
<
KernelTuple
>*>
(
ker
);
auto
p
=
dynamic_cast
<
const
ReferKernel
<
KernelTuple
>*>
(
ker
);
PADDLE_ENFORCE
(
p
,
"The Refer kernel should exsit"
);
PADDLE_ENFORCE_NOT_NULL
(
p
,
platform
::
errors
::
InvalidArgument
(
"Get the reference code of kernel in CPU "
"failed. The Refer kernel should exsit."
));
return
p
->
GetFunc
();
return
p
->
GetFunc
();
}
}
...
@@ -132,7 +136,9 @@ std::vector<const Kernel*> GetAllCandidateKernels(
...
@@ -132,7 +136,9 @@ std::vector<const Kernel*> GetAllCandidateKernels(
// The last implementation should be reference function on CPUPlace.
// The last implementation should be reference function on CPUPlace.
auto
ref
=
GetReferKernel
<
KernelTuple
>
();
auto
ref
=
GetReferKernel
<
KernelTuple
>
();
PADDLE_ENFORCE
(
ref
!=
nullptr
,
"Refer Kernel can not be empty."
);
PADDLE_ENFORCE_NOT_NULL
(
ref
,
platform
::
errors
::
InvalidArgument
(
"Get all candicate kernel in CPU failed. "
"The Refer Kernel can not be empty."
));
res
.
emplace_back
(
ref
);
res
.
emplace_back
(
ref
);
return
res
;
return
res
;
}
}
...
@@ -147,11 +153,14 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
...
@@ -147,11 +153,14 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
std
::
string
name
=
k
->
ImplType
();
std
::
string
name
=
k
->
ImplType
();
if
(
name
==
"JitCode"
)
{
if
(
name
==
"JitCode"
)
{
auto
i
=
dynamic_cast
<
const
GenBase
*>
(
k
);
auto
i
=
dynamic_cast
<
const
GenBase
*>
(
k
);
PADDLE_ENFORCE
(
i
,
"jitcode kernel cast can not fail."
);
PADDLE_ENFORCE_NOT_NULL
(
i
,
platform
::
errors
::
InvalidArgument
(
"Generate jitcode kernel (GenBase) failed."
));
res
.
emplace_back
(
std
::
make_pair
(
name
,
i
->
template
getCode
<
Func
>()));
res
.
emplace_back
(
std
::
make_pair
(
name
,
i
->
template
getCode
<
Func
>()));
}
else
{
}
else
{
auto
i
=
dynamic_cast
<
const
KernelMore
<
KernelTuple
>*>
(
k
);
auto
i
=
dynamic_cast
<
const
KernelMore
<
KernelTuple
>*>
(
k
);
PADDLE_ENFORCE
(
i
,
"kernel cast can not fail."
);
PADDLE_ENFORCE_NOT_NULL
(
i
,
platform
::
errors
::
InvalidArgument
(
"Kernel cast (KernelMore) failed."
));
res
.
emplace_back
(
std
::
make_pair
(
name
,
i
->
GetFunc
()));
res
.
emplace_back
(
std
::
make_pair
(
name
,
i
->
GetFunc
()));
}
}
}
}
...
@@ -173,7 +182,9 @@ template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
...
@@ -173,7 +182,9 @@ template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
typename
KernelTuple
::
func_type
GetDefaultBestFunc
(
typename
KernelTuple
::
func_type
GetDefaultBestFunc
(
const
typename
KernelTuple
::
attr_type
&
attr
)
{
const
typename
KernelTuple
::
attr_type
&
attr
)
{
auto
funcs
=
GetAllCandidateFuncs
<
KernelTuple
,
PlaceType
>
(
attr
);
auto
funcs
=
GetAllCandidateFuncs
<
KernelTuple
,
PlaceType
>
(
attr
);
PADDLE_ENFORCE_GE
(
funcs
.
size
(),
1UL
);
PADDLE_ENFORCE_GE
(
funcs
.
size
(),
1UL
,
platform
::
errors
::
InvalidArgument
(
"The candicate jit kernel is at least one in CPU."
));
// Here could do some runtime benchmark of this attr and return the best one.
// Here could do some runtime benchmark of this attr and return the best one.
// But yet just get the first one as the default best one,
// But yet just get the first one as the default best one,
// which is searched in order and tuned by offline.
// which is searched in order and tuned by offline.
...
...
paddle/fluid/operators/jit/more/mix/mix.cc
浏览文件 @
d0c9f7e8
...
@@ -95,7 +95,8 @@ void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT
...
@@ -95,7 +95,8 @@ void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT
}
else
if
(
type
==
kVIdentity
)
{
}
else
if
(
type
==
kVIdentity
)
{
return
KernelFuncs
<
VIdentityTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
d
);
return
KernelFuncs
<
VIdentityTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
d
);
}
}
PADDLE_THROW
(
"Not support type: %s"
,
type
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Act JIT kernel do not support type: %s"
,
type
));
return
nullptr
;
return
nullptr
;
}
}
...
...
paddle/fluid/operators/jit/more/mkl/mkl.h
浏览文件 @
d0c9f7e8
...
@@ -103,11 +103,24 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
...
@@ -103,11 +103,24 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
template
<
typename
T
>
template
<
typename
T
>
void
EmbSeqPool
(
const
T
*
table
,
const
int64_t
*
idx
,
T
*
out
,
void
EmbSeqPool
(
const
T
*
table
,
const
int64_t
*
idx
,
T
*
out
,
const
emb_seq_pool_attr_t
*
attr
)
{
const
emb_seq_pool_attr_t
*
attr
)
{
PADDLE_ENFORCE_EQ
(
attr
->
table_width
*
attr
->
index_width
,
attr
->
out_width
);
PADDLE_ENFORCE_EQ
(
attr
->
table_width
*
attr
->
index_width
,
attr
->
out_width
,
platform
::
errors
::
InvalidArgument
(
"The attribute table_width * index_width of EmbSeqPool should "
"be equal to out_width. But table_width * index_width is %d, "
"out_width is %d."
,
attr
->
table_width
*
attr
->
index_width
,
attr
->
out_width
));
auto
check_idx_value_valid
=
[
&
](
int64_t
i
)
{
auto
check_idx_value_valid
=
[
&
](
int64_t
i
)
{
PADDLE_ENFORCE_LT
(
idx
[
i
],
attr
->
table_height
,
"idx value: %d, i: %d"
,
PADDLE_ENFORCE_LT
(
idx
[
i
],
i
);
idx
[
i
],
attr
->
table_height
,
PADDLE_ENFORCE_GE
(
idx
[
i
],
0
,
"idx value: %d, i: %d"
,
idx
[
i
],
i
);
platform
::
errors
::
InvalidArgument
(
"The idx shoud be lower than the attribute table_height of "
"EmbSeqPool. But %dth of idx is %d and table_height is %d."
,
i
,
idx
[
i
],
attr
->
table_height
));
PADDLE_ENFORCE_GE
(
idx
[
i
],
0
,
platform
::
errors
::
InvalidArgument
(
"The idx shoud be equal to or larger than "
"the 0. But %dth of idx is %d."
,
i
,
idx
[
i
]));
};
};
for
(
int64_t
w
=
0
;
w
!=
attr
->
index_width
;
++
w
)
{
for
(
int64_t
w
=
0
;
w
!=
attr
->
index_width
;
++
w
)
{
...
@@ -168,22 +181,50 @@ void Softmax(const T* x, T* y, int n, int bs, int remain = 1) {
...
@@ -168,22 +181,50 @@ void Softmax(const T* x, T* y, int n, int bs, int remain = 1) {
template
<
typename
T
>
template
<
typename
T
>
void
Sgd
(
const
T
*
lr
,
const
T
*
param
,
const
T
*
grad
,
const
int64_t
*
rows
,
void
Sgd
(
const
T
*
lr
,
const
T
*
param
,
const
T
*
grad
,
const
int64_t
*
rows
,
T
*
out
,
const
sgd_attr_t
*
attr
)
{
T
*
out
,
const
sgd_attr_t
*
attr
)
{
PADDLE_ENFORCE_EQ
(
attr
->
param_width
,
attr
->
grad_width
);
PADDLE_ENFORCE_EQ
(
attr
->
param_width
,
attr
->
grad_width
,
PADDLE_ENFORCE_LE
(
attr
->
selected_rows_size
,
attr
->
grad_height
);
platform
::
errors
::
InvalidArgument
(
"The attribute param_width of Sgd should be "
"equal to the attribute grad_width. But param_width "
"is %d and grad_width is %d."
,
attr
->
param_width
,
attr
->
grad_width
));
PADDLE_ENFORCE_LE
(
attr
->
selected_rows_size
,
attr
->
grad_height
,
platform
::
errors
::
InvalidArgument
(
"The attribute selected_rows_size of Sgd should be "
"equal to or less than the attribute grad_height. "
"But selected_rows_size is %d and grad_height is %d."
,
attr
->
selected_rows_size
,
attr
->
grad_height
));
T
scalar
=
-
lr
[
0
];
T
scalar
=
-
lr
[
0
];
int
width
=
attr
->
grad_width
;
int
width
=
attr
->
grad_width
;
if
(
out
==
param
)
{
if
(
out
==
param
)
{
for
(
int64_t
i
=
0
;
i
<
attr
->
selected_rows_size
;
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
attr
->
selected_rows_size
;
++
i
)
{
auto
h_idx
=
rows
[
i
];
auto
h_idx
=
rows
[
i
];
PADDLE_ENFORCE_LT
(
h_idx
,
attr
->
param_height
);
PADDLE_ENFORCE_LT
(
h_idx
,
attr
->
param_height
,
PADDLE_ENFORCE_GE
(
h_idx
,
0
);
platform
::
errors
::
InvalidArgument
(
"The rows of Sgd should be "
"less than the attribute. But %dth of rows "
"is %d and grad_width is %d."
,
i
,
h_idx
,
attr
->
param_height
));
PADDLE_ENFORCE_GE
(
h_idx
,
0
,
platform
::
errors
::
InvalidArgument
(
"The rows of Sgd should be "
"larger than 0. But %dth of rows "
"is %d."
,
i
,
h_idx
));
VAXPY
(
scalar
,
grad
+
i
*
width
,
out
+
h_idx
*
width
,
width
);
VAXPY
(
scalar
,
grad
+
i
*
width
,
out
+
h_idx
*
width
,
width
);
}
}
}
else
{
}
else
{
for
(
int64_t
i
=
0
;
i
<
attr
->
selected_rows_size
;
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
attr
->
selected_rows_size
;
++
i
)
{
auto
h_idx
=
rows
[
i
];
auto
h_idx
=
rows
[
i
];
PADDLE_ENFORCE_LT
(
h_idx
,
attr
->
param_height
);
PADDLE_ENFORCE_LT
(
h_idx
,
attr
->
param_height
,
PADDLE_ENFORCE_GE
(
h_idx
,
0
);
platform
::
errors
::
InvalidArgument
(
"The rows of Sgd should be "
"less than the attribute. But %dth of rows "
"is %d and grad_width is %d."
,
i
,
h_idx
,
attr
->
param_height
));
PADDLE_ENFORCE_GE
(
h_idx
,
0
,
platform
::
errors
::
InvalidArgument
(
"The rows of Sgd should be "
"larger than 0. But %dth of rows "
"is %d."
,
i
,
h_idx
));
VScal
(
&
scalar
,
grad
+
i
*
width
,
out
+
h_idx
*
width
,
width
);
VScal
(
&
scalar
,
grad
+
i
*
width
,
out
+
h_idx
*
width
,
width
);
VAdd
(
param
+
h_idx
*
width
,
out
+
h_idx
*
width
,
out
+
h_idx
*
width
,
VAdd
(
param
+
h_idx
*
width
,
out
+
h_idx
*
width
,
out
+
h_idx
*
width
,
width
);
width
);
...
...
paddle/fluid/operators/jit/refer/refer.h
浏览文件 @
d0c9f7e8
...
@@ -147,7 +147,8 @@ void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT
...
@@ -147,7 +147,8 @@ void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT
}
else
if
(
type
==
kVIdentity
)
{
}
else
if
(
type
==
kVIdentity
)
{
return
VIdentity
<
T
>
;
return
VIdentity
<
T
>
;
}
}
PADDLE_THROW
(
"Not support type: %s"
,
type
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Act JIT kernel do not support type: %s."
,
type
));
return
nullptr
;
return
nullptr
;
}
}
...
@@ -465,12 +466,25 @@ void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) {
...
@@ -465,12 +466,25 @@ void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) {
template
<
typename
T
>
template
<
typename
T
>
void
EmbSeqPool
(
const
T
*
table
,
const
int64_t
*
idx
,
T
*
out
,
void
EmbSeqPool
(
const
T
*
table
,
const
int64_t
*
idx
,
T
*
out
,
const
emb_seq_pool_attr_t
*
attr
)
{
const
emb_seq_pool_attr_t
*
attr
)
{
PADDLE_ENFORCE_EQ
(
attr
->
table_width
*
attr
->
index_width
,
attr
->
out_width
);
PADDLE_ENFORCE_EQ
(
attr
->
table_width
*
attr
->
index_width
,
attr
->
out_width
,
platform
::
errors
::
InvalidArgument
(
"The attribute table_width * index_width of EmbSeqPool should "
"be equal to out_width. But table_width * index_width is %d and "
"out_width is %d."
,
attr
->
table_width
*
attr
->
index_width
,
attr
->
out_width
));
auto
check_idx_value_valid
=
[
&
](
int64_t
i
)
{
auto
check_idx_value_valid
=
[
&
](
int64_t
i
)
{
PADDLE_ENFORCE_LT
(
idx
[
i
],
attr
->
table_height
,
"idx value: %d, i: %d"
,
PADDLE_ENFORCE_LT
(
idx
[
i
],
i
);
idx
[
i
],
attr
->
table_height
,
PADDLE_ENFORCE_GE
(
idx
[
i
],
0
,
"idx value: %d, i: %d"
,
idx
[
i
],
i
);
platform
::
errors
::
InvalidArgument
(
"The idx shoud be lower than the attribute table_height of "
"EmbSeqPool. But %dth of idx is %d and table_height is %d."
,
i
,
idx
[
i
],
attr
->
table_height
));
PADDLE_ENFORCE_GE
(
idx
[
i
],
0
,
platform
::
errors
::
InvalidArgument
(
"The idx shoud be equal to or larger than "
"the 0. But %dth of idx is %d."
,
i
,
idx
[
i
]));
};
};
for
(
int64_t
w
=
0
;
w
!=
attr
->
index_width
;
++
w
)
{
for
(
int64_t
w
=
0
;
w
!=
attr
->
index_width
;
++
w
)
{
...
@@ -505,12 +519,31 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out,
...
@@ -505,12 +519,31 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out,
template
<
typename
T
>
template
<
typename
T
>
void
Sgd
(
const
T
*
lr
,
const
T
*
param
,
const
T
*
grad
,
const
int64_t
*
rows
,
void
Sgd
(
const
T
*
lr
,
const
T
*
param
,
const
T
*
grad
,
const
int64_t
*
rows
,
T
*
out
,
const
sgd_attr_t
*
attr
)
{
T
*
out
,
const
sgd_attr_t
*
attr
)
{
PADDLE_ENFORCE_EQ
(
attr
->
param_width
,
attr
->
grad_width
);
PADDLE_ENFORCE_EQ
(
attr
->
param_width
,
attr
->
grad_width
,
PADDLE_ENFORCE_LE
(
attr
->
selected_rows_size
,
attr
->
grad_height
);
platform
::
errors
::
InvalidArgument
(
"The attribute param_width of Sgd should be "
"equal to the attribute grad_width. But param_width "
"is %d and grad_width is %d."
,
attr
->
param_width
,
attr
->
grad_width
));
PADDLE_ENFORCE_LE
(
attr
->
selected_rows_size
,
attr
->
grad_height
,
platform
::
errors
::
InvalidArgument
(
"The attribute selected_rows_size of Sgd should be "
"equal to or less than the attribute grad_height. "
"But selected_rows_size is %d and grad_height is %d."
,
attr
->
selected_rows_size
,
attr
->
grad_height
));
for
(
int64_t
i
=
0
;
i
<
attr
->
selected_rows_size
;
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
attr
->
selected_rows_size
;
++
i
)
{
auto
h_idx
=
rows
[
i
];
auto
h_idx
=
rows
[
i
];
PADDLE_ENFORCE_LT
(
h_idx
,
attr
->
param_height
);
PADDLE_ENFORCE_LT
(
h_idx
,
attr
->
param_height
,
PADDLE_ENFORCE_GE
(
h_idx
,
0
);
platform
::
errors
::
InvalidArgument
(
"The rows of Sgd should be "
"less than the attribute. But %dth of rows "
"is %d and grad_width is %d."
,
i
,
h_idx
,
attr
->
param_height
));
PADDLE_ENFORCE_GE
(
h_idx
,
0
,
platform
::
errors
::
InvalidArgument
(
"The rows of Sgd should be "
"larger than 0. But %dth of rows "
"is %d."
,
i
,
h_idx
));
for
(
int64_t
j
=
0
;
j
<
attr
->
grad_width
;
++
j
)
{
for
(
int64_t
j
=
0
;
j
<
attr
->
grad_width
;
++
j
)
{
out
[
h_idx
*
attr
->
grad_width
+
j
]
=
out
[
h_idx
*
attr
->
grad_width
+
j
]
=
param
[
h_idx
*
attr
->
grad_width
+
j
]
-
param
[
h_idx
*
attr
->
grad_width
+
j
]
-
...
...
paddle/fluid/operators/jit/test.cc
浏览文件 @
d0c9f7e8
...
@@ -850,8 +850,15 @@ void TestKernelSgd() {
...
@@ -850,8 +850,15 @@ void TestKernelSgd() {
const
T
lr
=
0.1
;
const
T
lr
=
0.1
;
auto
UnDuplicatedRandomVec
=
[](
int
n
,
const
int64_t
lower
,
auto
UnDuplicatedRandomVec
=
[](
int
n
,
const
int64_t
lower
,
const
int64_t
upper
)
->
std
::
vector
<
int64_t
>
{
const
int64_t
upper
)
->
std
::
vector
<
int64_t
>
{
PADDLE_ENFORCE_LE
(
static_cast
<
size_t
>
(
upper
-
lower
),
n
-
1
);
PADDLE_ENFORCE_LE
(
static_cast
<
size_t
>
(
upper
-
lower
),
n
-
1
,
PADDLE_ENFORCE_GT
(
n
,
0
);
paddle
::
platform
::
errors
::
InvalidArgument
(
"The range of Sgd (upper - lower) should be lower "
"than n-1 (Sgd size -1). But the upper - lower is %d "
"and n-1 is %d."
,
static_cast
<
size_t
>
(
upper
-
lower
),
n
-
1
));
PADDLE_ENFORCE_GT
(
n
,
0
,
paddle
::
platform
::
errors
::
InvalidArgument
(
"The Sgd size should be larger than 0. But the n is %d."
,
n
));
std
::
vector
<
int64_t
>
all
,
out
;
std
::
vector
<
int64_t
>
all
,
out
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
all
.
push_back
(
i
);
all
.
push_back
(
i
);
...
...
paddle/fluid/operators/math/blas_impl.cu.h
浏览文件 @
d0c9f7e8
...
@@ -420,6 +420,22 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
...
@@ -420,6 +420,22 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
});
});
}
}
template
<
>
template
<
>
inline
void
Blas
<
platform
::
CUDADeviceContext
>::
GEMV
(
bool
trans_a
,
int
M
,
int
N
,
platform
::
float16
alpha
,
const
platform
::
float16
*
A
,
const
platform
::
float16
*
B
,
platform
::
float16
beta
,
platform
::
float16
*
C
)
const
{
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if
(
trans_a
)
{
this
->
template
GEMM
<
platform
::
float16
>(
CblasNoTrans
,
CblasNoTrans
,
1
,
N
,
M
,
alpha
,
B
,
A
,
beta
,
C
);
}
else
{
this
->
template
GEMM
<
platform
::
float16
>(
CblasNoTrans
,
CblasNoTrans
,
M
,
1
,
N
,
alpha
,
A
,
B
,
beta
,
C
);
}
}
template
<
>
template
<
>
template
<
typename
T
>
template
<
typename
T
>
void
Blas
<
platform
::
CUDADeviceContext
>::
BatchedGEMM
(
void
Blas
<
platform
::
CUDADeviceContext
>::
BatchedGEMM
(
...
@@ -479,6 +495,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
...
@@ -479,6 +495,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
}
}
}
}
template
<
>
template
<
>
inline
void
Blas
<
platform
::
CUDADeviceContext
>::
BatchedGEMM
(
CBLAS_TRANSPOSE
transA
,
CBLAS_TRANSPOSE
transB
,
int
M
,
int
N
,
int
K
,
platform
::
float16
alpha
,
const
platform
::
float16
**
A
,
const
platform
::
float16
**
B
,
platform
::
float16
beta
,
platform
::
float16
**
C
,
int
batchCount
)
const
{
for
(
int
k
=
0
;
k
<
batchCount
;
++
k
)
{
this
->
template
GEMM
<
platform
::
float16
>(
transA
,
transB
,
M
,
N
,
K
,
alpha
,
A
[
k
],
B
[
k
],
beta
,
C
[
k
]);
}
}
template
<
>
template
<
>
template
<
typename
T
>
template
<
typename
T
>
void
Blas
<
platform
::
CUDADeviceContext
>::
TRSM
(
CBLAS_SIDE
side
,
CBLAS_UPLO
uplo
,
void
Blas
<
platform
::
CUDADeviceContext
>::
TRSM
(
CBLAS_SIDE
side
,
CBLAS_UPLO
uplo
,
...
...
paddle/fluid/operators/matmul_v2_op.cu
浏览文件 @
d0c9f7e8
...
@@ -17,10 +17,12 @@ limitations under the License. */
...
@@ -17,10 +17,12 @@ limitations under the License. */
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
namespace
plf
=
paddle
::
platform
;
namespace
plf
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
matmul_v2
,
REGISTER_OP_CUDA_KERNEL
(
ops
::
MatMulV2Kernel
<
plf
::
CUDADeviceContext
,
float
>
,
matmul_v2
,
ops
::
MatMulV2Kernel
<
plf
::
CUDADeviceContext
,
float
>
,
ops
::
MatMulV2Kernel
<
plf
::
CUDADeviceContext
,
double
>
);
ops
::
MatMulV2Kernel
<
plf
::
CUDADeviceContext
,
double
>
,
ops
::
MatMulV2Kernel
<
plf
::
CUDADeviceContext
,
plf
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
REGISTER_OP_CUDA_KERNEL
(
matmul_v2_grad
,
ops
::
MatMulV2GradKernel
<
plf
::
CUDADeviceContext
,
float
>
,
matmul_v2_grad
,
ops
::
MatMulV2GradKernel
<
plf
::
CUDADeviceContext
,
float
>
,
ops
::
MatMulV2GradKernel
<
plf
::
CUDADeviceContext
,
double
>
);
ops
::
MatMulV2GradKernel
<
plf
::
CUDADeviceContext
,
double
>
,
ops
::
MatMulV2GradKernel
<
plf
::
CUDADeviceContext
,
plf
::
float16
>
);
paddle/fluid/operators/matmul_v2_op.h
浏览文件 @
d0c9f7e8
...
@@ -163,17 +163,20 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
...
@@ -163,17 +163,20 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
if
(
trans_y
)
{
if
(
trans_y
)
{
const
int
M
=
Y
->
numel
()
/
N
;
const
int
M
=
Y
->
numel
()
/
N
;
VLOG
(
3
)
<<
"MatMul's case 2"
;
VLOG
(
3
)
<<
"MatMul's case 2"
;
blas
.
GEMV
(
false
,
M
,
N
,
1.
,
y_data
,
x_data
,
0.
,
Out
->
data
<
T
>
());
blas
.
GEMV
(
false
,
M
,
N
,
static_cast
<
T
>
(
1
),
y_data
,
x_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
());
}
else
{
}
else
{
const
int
M
=
y_dims
[
y_ndim
-
1
];
const
int
M
=
y_dims
[
y_ndim
-
1
];
const
int
batch_size
=
Y
->
numel
()
/
(
M
*
N
);
const
int
batch_size
=
Y
->
numel
()
/
(
M
*
N
);
if
(
batch_size
==
1
)
{
if
(
batch_size
==
1
)
{
VLOG
(
3
)
<<
"MatMul's case 3"
;
VLOG
(
3
)
<<
"MatMul's case 3"
;
blas
.
GEMV
(
true
,
N
,
M
,
1.
,
y_data
,
x_data
,
0.
,
Out
->
data
<
T
>
());
blas
.
GEMV
(
true
,
N
,
M
,
static_cast
<
T
>
(
1
),
y_data
,
x_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
());
}
else
{
}
else
{
VLOG
(
3
)
<<
"MatMul's case 4"
;
VLOG
(
3
)
<<
"MatMul's case 4"
;
blas
.
BatchedGEMM
(
CblasTrans
,
CblasNoTrans
,
M
,
1
,
N
,
1.0
f
,
y_data
,
blas
.
BatchedGEMM
(
CblasTrans
,
CblasNoTrans
,
M
,
1
,
N
,
static_cast
<
T
>
(
1
),
x_data
,
0
,
Out
->
data
<
T
>
(),
batch_size
,
M
*
N
,
0
);
y_data
,
x_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
(),
batch_size
,
M
*
N
,
0
);
}
}
}
}
return
;
return
;
...
@@ -205,16 +208,19 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
...
@@ -205,16 +208,19 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
const
int
batch_size
=
X
->
numel
()
/
(
M
*
N
);
const
int
batch_size
=
X
->
numel
()
/
(
M
*
N
);
if
(
batch_size
==
1
)
{
if
(
batch_size
==
1
)
{
VLOG
(
3
)
<<
"MatMul's case 5"
;
VLOG
(
3
)
<<
"MatMul's case 5"
;
blas
.
GEMV
(
true
,
N
,
M
,
1.0
f
,
x_data
,
y_data
,
0.0
f
,
Out
->
data
<
T
>
());
blas
.
GEMV
(
true
,
N
,
M
,
static_cast
<
T
>
(
1
),
x_data
,
y_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
());
}
else
{
}
else
{
VLOG
(
3
)
<<
"MatMul's case 6"
;
VLOG
(
3
)
<<
"MatMul's case 6"
;
blas
.
BatchedGEMM
(
CblasTrans
,
CblasNoTrans
,
M
,
1
,
N
,
1.0
f
,
x_data
,
blas
.
BatchedGEMM
(
CblasTrans
,
CblasNoTrans
,
M
,
1
,
N
,
static_cast
<
T
>
(
1
),
y_data
,
0
,
Out
->
data
<
T
>
(),
batch_size
,
M
*
N
,
0
);
x_data
,
y_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
(),
batch_size
,
M
*
N
,
0
);
}
}
}
else
{
}
else
{
const
int
M
=
X
->
numel
()
/
N
;
const
int
M
=
X
->
numel
()
/
N
;
VLOG
(
3
)
<<
"MatMul's case 7"
;
VLOG
(
3
)
<<
"MatMul's case 7"
;
blas
.
GEMV
(
false
,
M
,
N
,
1.0
f
,
x_data
,
y_data
,
0.0
f
,
Out
->
data
<
T
>
());
blas
.
GEMV
(
false
,
M
,
N
,
static_cast
<
T
>
(
1
),
x_data
,
y_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
());
}
}
return
;
return
;
}
}
...
@@ -263,37 +269,38 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
...
@@ -263,37 +269,38 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
if
(
x_batch_size
==
1
&&
y_batch_size
==
1
)
{
if
(
x_batch_size
==
1
&&
y_batch_size
==
1
)
{
VLOG
(
3
)
<<
"MatMul's case 8"
;
VLOG
(
3
)
<<
"MatMul's case 8"
;
blas
.
GEMM
(
trans_x
?
CblasTrans
:
CblasNoTrans
,
blas
.
GEMM
(
trans_x
?
CblasTrans
:
CblasNoTrans
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
1.0
f
,
x_data
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
static_cast
<
T
>
(
1
)
,
y_data
,
0.0
f
,
Out
->
data
<
T
>
());
x_data
,
y_data
,
static_cast
<
T
>
(
0
)
,
Out
->
data
<
T
>
());
}
else
if
(
x_batch_size
==
1
)
{
}
else
if
(
x_batch_size
==
1
)
{
if
(
M
==
1
&&
trans_y
)
{
if
(
M
==
1
&&
trans_y
)
{
VLOG
(
3
)
<<
"MatMul's case 9"
;
VLOG
(
3
)
<<
"MatMul's case 9"
;
blas
.
GEMV
(
false
,
y_batch_size
*
N
,
K
,
1.0
f
,
y_data
,
x_data
,
0.0
f
,
blas
.
GEMV
(
false
,
y_batch_size
*
N
,
K
,
static_cast
<
T
>
(
1
),
y_data
,
x_data
,
Out
->
data
<
T
>
());
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
());
}
else
{
}
else
{
VLOG
(
3
)
<<
"MatMul's case 10"
;
VLOG
(
3
)
<<
"MatMul's case 10"
;
blas
.
BatchedGEMM
(
trans_x
?
CblasTrans
:
CblasNoTrans
,
blas
.
BatchedGEMM
(
trans_x
?
CblasTrans
:
CblasNoTrans
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
1.0
f
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
x_data
,
y_data
,
0
,
Out
->
data
<
T
>
(),
out_batch_size
,
0
,
static_cast
<
T
>
(
1
),
x_data
,
y_data
,
static_cast
<
T
>
(
0
)
,
K
*
N
);
Out
->
data
<
T
>
(),
out_batch_size
,
0
,
K
*
N
);
}
}
}
else
if
(
y_batch_size
==
1
)
{
}
else
if
(
y_batch_size
==
1
)
{
if
(
!
trans_x
)
{
if
(
!
trans_x
)
{
VLOG
(
3
)
<<
"MatMul's case 11"
;
VLOG
(
3
)
<<
"MatMul's case 11"
;
blas
.
GEMM
(
CblasNoTrans
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
blas
.
GEMM
(
CblasNoTrans
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
x_batch_size
*
M
,
N
,
K
,
1.0
f
,
x_data
,
y_data
,
0.0
f
,
x_batch_size
*
M
,
N
,
K
,
static_cast
<
T
>
(
1
),
x_data
,
y_data
,
Out
->
data
<
T
>
());
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
());
}
else
{
}
else
{
VLOG
(
3
)
<<
"MatMul's case 12"
;
VLOG
(
3
)
<<
"MatMul's case 12"
;
blas
.
BatchedGEMM
(
CblasTrans
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
blas
.
BatchedGEMM
(
CblasTrans
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
1.0
f
,
x_data
,
y_data
,
0
,
Out
->
data
<
T
>
(),
out_batch_size
,
static_cast
<
T
>
(
1
),
x_data
,
y_data
,
static_cast
<
T
>
(
0
)
,
M
*
K
,
0
);
Out
->
data
<
T
>
(),
out_batch_size
,
M
*
K
,
0
);
}
}
}
else
if
(
!
is_broadcast_dims
)
{
}
else
if
(
!
is_broadcast_dims
)
{
VLOG
(
3
)
<<
"MatMul's case 13"
;
VLOG
(
3
)
<<
"MatMul's case 13"
;
blas
.
BatchedGEMM
(
trans_x
?
CblasTrans
:
CblasNoTrans
,
blas
.
BatchedGEMM
(
trans_x
?
CblasTrans
:
CblasNoTrans
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
1.0
f
,
x_data
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
y_data
,
0
,
Out
->
data
<
T
>
(),
out_batch_size
,
M
*
K
,
K
*
N
);
static_cast
<
T
>
(
1
),
x_data
,
y_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
(),
out_batch_size
,
M
*
K
,
K
*
N
);
}
else
{
}
else
{
// in the case, can't use stridedgemm
// in the case, can't use stridedgemm
std
::
vector
<
const
T
*>
x_ptr
(
out_batch_size
);
std
::
vector
<
const
T
*>
x_ptr
(
out_batch_size
);
...
@@ -314,9 +321,9 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
...
@@ -314,9 +321,9 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
}
}
VLOG
(
3
)
<<
"MatMul's case 14"
;
VLOG
(
3
)
<<
"MatMul's case 14"
;
blas
.
BatchedGEMM
(
trans_x
?
CblasTrans
:
CblasNoTrans
,
blas
.
BatchedGEMM
(
trans_x
?
CblasTrans
:
CblasNoTrans
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
1.0
f
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
x_ptr
.
data
(),
y_ptr
.
data
(),
0.0
f
,
out
_ptr
.
data
(),
static_cast
<
T
>
(
1
),
x_ptr
.
data
(),
y
_ptr
.
data
(),
out_batch_size
);
static_cast
<
T
>
(
0
),
out_ptr
.
data
(),
out_batch_size
);
}
}
}
}
...
...
paddle/fluid/operators/norm_utils.cu.h
浏览文件 @
d0c9f7e8
...
@@ -40,12 +40,12 @@ using DataLayout = framework::DataLayout;
...
@@ -40,12 +40,12 @@ using DataLayout = framework::DataLayout;
// (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
// (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
// np.sum(dy,
// np.sum(dy,
// axis=(n,h,w)) * (x - mean) *
// axis=(n,h,w)) * (x - mean) *
// (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var -
// (np.mean(ddx, axis=(n,h,w)) - ddx)
)
+ ddr * (dy * inv_var -
// inv_var
// inv_var
// *
// *
// np.mean(dy, axis=(n,h,w)) -
// np.mean(dy, axis=(n,h,w)) -
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// axis=(n,h,w)))
)
// axis=(n,h,w)))
template
<
typename
T
,
int
BlockDim
,
framework
::
DataLayout
layout
>
template
<
typename
T
,
int
BlockDim
,
framework
::
DataLayout
layout
>
__global__
void
DoubleGradComputeDX
(
const
T
*
x
,
const
T
*
mean
,
__global__
void
DoubleGradComputeDX
(
const
T
*
x
,
const
T
*
mean
,
...
@@ -138,7 +138,7 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean,
...
@@ -138,7 +138,7 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean,
?
(
j
/
sample_size
*
C
+
i
)
*
sample_size
+
j
%
sample_size
?
(
j
/
sample_size
*
C
+
i
)
*
sample_size
+
j
%
sample_size
:
j
*
outer_size
+
i
;
:
j
*
outer_size
+
i
;
dx
[
index
]
+=
(
dy
[
index
]
*
var_val
-
dy_sum_val
/
inner_size
*
var_val
-
dx
[
index
]
+=
(
dy
[
index
]
*
var_val
-
dy_sum_val
/
inner_size
*
var_val
-
(
x
[
index
]
-
mean_val
)
*
var_val
*
(
x
[
index
]
-
mean_val
)
*
var_val
*
var_val
*
dy_mul_x_sub_mean_sum_val
*
var_val
/
inner_size
)
*
dy_mul_x_sub_mean_sum_val
*
var_val
/
inner_size
)
*
ddscale
[
i
];
ddscale
[
i
];
}
}
...
@@ -326,19 +326,57 @@ __global__ void DoubleGradComputeDScaleWithGlobal(
...
@@ -326,19 +326,57 @@ __global__ void DoubleGradComputeDScaleWithGlobal(
}
}
// math: dx = ddscale * dy * inv_var
// math: dx = ddscale * dy * inv_var
// math: ddy = scale * ddx * inv_var
template
<
typename
T
,
framework
::
DataLayout
layout
>
template
<
typename
T
,
framework
::
DataLayout
layout
>
__global__
void
DoubleGradComputeDataWithGlobal
(
__global__
void
DoubleGradComputeDXWithGlobal
(
const
T
*
dy
,
const
T
*
ddscale
,
const
T
*
dy
,
const
T
*
scale
,
const
T
*
variance
,
const
double
epsilon
,
const
T
*
variance
,
const
int
C
,
const
int
sample_size
,
const
int
num
,
T
*
dx
)
{
const
double
epsilon
,
const
int
C
,
const
int
sample_size
,
const
int
num
,
T
*
dx
)
{
int
gid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
gid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
stride
=
blockDim
.
x
*
gridDim
.
x
;
int
stride
=
blockDim
.
x
*
gridDim
.
x
;
if
(
scale
!=
nullptr
)
{
if
(
dd
scale
!=
nullptr
)
{
for
(
int
i
=
gid
;
i
<
num
;
i
+=
stride
)
{
for
(
int
i
=
gid
;
i
<
num
;
i
+=
stride
)
{
const
int
c
=
const
int
c
=
layout
==
framework
::
DataLayout
::
kNCHW
?
i
/
sample_size
%
C
:
i
%
C
;
layout
==
framework
::
DataLayout
::
kNCHW
?
i
/
sample_size
%
C
:
i
%
C
;
T
inv_var
=
1.0
/
sqrt
(
variance
[
c
]
+
epsilon
);
T
inv_var
=
1.0
/
sqrt
(
variance
[
c
]
+
epsilon
);
dx
[
i
]
=
dy
[
i
]
*
scale
[
c
]
*
inv_var
;
dx
[
i
]
=
dy
[
i
]
*
ddscale
[
c
]
*
inv_var
;
}
}
}
// math: ddy = scale * ddx * inv_var + ddbias +
// ddscale * (x - mean) * inv_var
template
<
typename
T
,
framework
::
DataLayout
layout
>
__global__
void
DoubleGradComputeDDYWithGlobal
(
const
T
*
ddx
,
const
T
*
scale
,
const
T
*
mean
,
const
T
*
variance
,
const
T
*
x
,
const
T
*
ddbias
,
const
T
*
ddscale
,
const
double
epsilon
,
const
int
C
,
const
int
sample_size
,
const
int
num
,
T
*
ddy
)
{
int
gid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
stride
=
blockDim
.
x
*
gridDim
.
x
;
if
(
ddx
!=
nullptr
)
{
for
(
int
i
=
gid
;
i
<
num
;
i
+=
stride
)
{
const
int
c
=
layout
==
framework
::
DataLayout
::
kNCHW
?
i
/
sample_size
%
C
:
i
%
C
;
T
inv_var
=
1.0
/
sqrt
(
variance
[
c
]
+
epsilon
);
ddy
[
i
]
+=
ddx
[
i
]
*
scale
[
c
]
*
inv_var
;
}
}
__syncthreads
();
if
(
ddscale
!=
nullptr
)
{
for
(
int
i
=
gid
;
i
<
num
;
i
+=
stride
)
{
const
int
c
=
layout
==
framework
::
DataLayout
::
kNCHW
?
i
/
sample_size
%
C
:
i
%
C
;
T
inv_var
=
1.0
/
sqrt
(
variance
[
c
]
+
epsilon
);
ddy
[
i
]
+=
(
x
[
i
]
-
mean
[
c
])
*
inv_var
*
ddscale
[
c
];
}
}
__syncthreads
();
if
(
ddbias
!=
nullptr
)
{
for
(
int
i
=
gid
;
i
<
num
;
i
+=
stride
)
{
const
int
c
=
layout
==
framework
::
DataLayout
::
kNCHW
?
i
/
sample_size
%
C
:
i
%
C
;
ddy
[
i
]
+=
ddbias
[
c
];
}
}
}
}
}
}
...
@@ -383,8 +421,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
...
@@ -383,8 +421,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
const
T
*
mean_data
,
*
variance_data
;
const
T
*
mean_data
,
*
variance_data
;
if
(
use_global_stats
)
{
if
(
use_global_stats
)
{
const
auto
*
running_mean
=
ctx
.
Input
<
Tensor
>
(
"Mean"
);
const
auto
*
running_var
=
ctx
.
Input
<
Tensor
>
(
"Variance"
);
const
auto
*
running_var
=
ctx
.
Input
<
Tensor
>
(
"Variance"
);
const
auto
*
running_mean_data
=
running_mean
->
template
data
<
T
>();
const
auto
*
running_var_data
=
running_var
->
template
data
<
T
>();
const
auto
*
running_var_data
=
running_var
->
template
data
<
T
>();
mean_data
=
running_mean_data
;
variance_data
=
running_var_data
;
variance_data
=
running_var_data
;
}
else
{
}
else
{
const
T
*
smean_data
=
Saved_mean
->
data
<
T
>
();
const
T
*
smean_data
=
Saved_mean
->
data
<
T
>
();
...
@@ -398,12 +439,12 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
...
@@ -398,12 +439,12 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
set_constant
(
dev_ctx
,
dX
,
static_cast
<
T
>
(
0
));
set_constant
(
dev_ctx
,
dX
,
static_cast
<
T
>
(
0
));
if
(
use_global_stats
)
{
if
(
use_global_stats
)
{
if
(
data_layout
==
DataLayout
::
kNHWC
)
{
if
(
data_layout
==
DataLayout
::
kNHWC
)
{
DoubleGradComputeD
ata
WithGlobal
<
DoubleGradComputeD
X
WithGlobal
<
T
,
DataLayout
::
kNHWC
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
T
,
DataLayout
::
kNHWC
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
dy_data
,
ddscale_data
,
variance_data
,
epsilon
,
C
,
sample_size
,
num
,
dy_data
,
ddscale_data
,
variance_data
,
epsilon
,
C
,
sample_size
,
num
,
dx_data
);
dx_data
);
}
else
{
}
else
{
DoubleGradComputeD
ata
WithGlobal
<
DoubleGradComputeD
X
WithGlobal
<
T
,
DataLayout
::
kNCHW
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
T
,
DataLayout
::
kNCHW
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
dy_data
,
ddscale_data
,
variance_data
,
epsilon
,
C
,
sample_size
,
num
,
dy_data
,
ddscale_data
,
variance_data
,
epsilon
,
C
,
sample_size
,
num
,
dx_data
);
dx_data
);
...
@@ -456,15 +497,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
...
@@ -456,15 +497,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
set_constant
(
dev_ctx
,
ddY
,
static_cast
<
T
>
(
0
));
set_constant
(
dev_ctx
,
ddY
,
static_cast
<
T
>
(
0
));
if
(
use_global_stats
)
{
if
(
use_global_stats
)
{
if
(
data_layout
==
DataLayout
::
kNHWC
)
{
if
(
data_layout
==
DataLayout
::
kNHWC
)
{
DoubleGradComputeD
ata
WithGlobal
<
DoubleGradComputeD
DY
WithGlobal
<
T
,
DataLayout
::
kNHWC
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
T
,
DataLayout
::
kNHWC
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
ddx_data
,
scale_data
,
variance_data
,
epsilon
,
C
,
sample_size
,
num
,
ddx_data
,
scale_data
,
mean_data
,
variance_data
,
x_data
,
ddbias_data
,
ddy_data
);
dd
scale_data
,
epsilon
,
C
,
sample_size
,
num
,
dd
y_data
);
}
else
{
}
else
{
DoubleGradComputeD
ata
WithGlobal
<
DoubleGradComputeD
DY
WithGlobal
<
T
,
DataLayout
::
kNCHW
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
T
,
DataLayout
::
kNCHW
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
ddx_data
,
scale_data
,
variance_data
,
epsilon
,
C
,
sample_size
,
num
,
ddx_data
,
scale_data
,
mean_data
,
variance_data
,
x_data
,
ddbias_data
,
ddy_data
);
dd
scale_data
,
epsilon
,
C
,
sample_size
,
num
,
dd
y_data
);
}
}
}
else
{
}
else
{
if
(
data_layout
==
DataLayout
::
kNHWC
)
{
if
(
data_layout
==
DataLayout
::
kNHWC
)
{
...
...
paddle/fluid/operators/utils.h
浏览文件 @
d0c9f7e8
...
@@ -41,7 +41,9 @@ inline std::vector<T> GetDataFromTensor(const framework::Tensor* x) {
...
@@ -41,7 +41,9 @@ inline std::vector<T> GetDataFromTensor(const framework::Tensor* x) {
// NOTE: Converting int64 to int32 may cause data overflow.
// NOTE: Converting int64 to int32 may cause data overflow.
vec_new_data
=
std
::
vector
<
T
>
(
data
,
data
+
x
->
numel
());
vec_new_data
=
std
::
vector
<
T
>
(
data
,
data
+
x
->
numel
());
}
else
{
}
else
{
PADDLE_THROW
(
"The dtype of Tensor must be int32 or int64."
);
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"The dtype of Tensor must be int32 or int64, but received: %s"
,
x
->
type
()));
}
}
return
vec_new_data
;
return
vec_new_data
;
}
}
...
@@ -53,10 +55,11 @@ inline std::vector<T> GetDataFromTensorList(
...
@@ -53,10 +55,11 @@ inline std::vector<T> GetDataFromTensorList(
for
(
size_t
i
=
0
;
i
<
list_tensor
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
list_tensor
.
size
();
++
i
)
{
auto
tensor
=
list_tensor
[
i
];
auto
tensor
=
list_tensor
[
i
];
PADDLE_ENFORCE_EQ
(
tensor
->
dims
(),
framework
::
make_ddim
({
1
}),
PADDLE_ENFORCE_EQ
(
tensor
->
dims
(),
framework
::
make_ddim
({
1
}),
"ShapeError: The shape of Tensor in list must be [1]. "
platform
::
errors
::
InvalidArgument
(
"But received the shape "
"The shape of Tensor in list must be [1]. "
"is [%s]"
,
"But received its shape "
tensor
->
dims
());
"is [%s]"
,
tensor
->
dims
()));
if
(
tensor
->
type
()
==
framework
::
proto
::
VarType
::
INT32
)
{
if
(
tensor
->
type
()
==
framework
::
proto
::
VarType
::
INT32
)
{
if
(
platform
::
is_gpu_place
(
tensor
->
place
()))
{
if
(
platform
::
is_gpu_place
(
tensor
->
place
()))
{
...
@@ -76,7 +79,10 @@ inline std::vector<T> GetDataFromTensorList(
...
@@ -76,7 +79,10 @@ inline std::vector<T> GetDataFromTensorList(
vec_new_data
.
push_back
(
static_cast
<
T
>
(
*
tensor
->
data
<
int64_t
>
()));
vec_new_data
.
push_back
(
static_cast
<
T
>
(
*
tensor
->
data
<
int64_t
>
()));
}
}
}
else
{
}
else
{
PADDLE_THROW
(
"The dtype of Tensor in list must be int32 or int64."
);
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"The dtype of Tensor in list must be int32 or int64, but received: "
"%s"
,
tensor
->
type
()));
}
}
}
}
return
vec_new_data
;
return
vec_new_data
;
...
...
paddle/fluid/platform/cuda_primitives.h
浏览文件 @
d0c9f7e8
...
@@ -134,7 +134,26 @@ USE_CUDA_ATOMIC(Max, int);
...
@@ -134,7 +134,26 @@ USE_CUDA_ATOMIC(Max, int);
USE_CUDA_ATOMIC
(
Max
,
unsigned
int
);
USE_CUDA_ATOMIC
(
Max
,
unsigned
int
);
// CUDA API uses unsigned long long int, we cannot use uint64_t here.
// CUDA API uses unsigned long long int, we cannot use uint64_t here.
// It because unsigned long long int is not necessarily uint64_t
// It because unsigned long long int is not necessarily uint64_t
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
USE_CUDA_ATOMIC
(
Max
,
unsigned
long
long
int
);
// NOLINT
USE_CUDA_ATOMIC
(
Max
,
unsigned
long
long
int
);
// NOLINT
#else
CUDA_ATOMIC_WRAPPER
(
Max
,
unsigned
long
long
int
)
{
if
(
*
address
>=
val
)
{
return
;
}
unsigned
long
long
int
old
=
*
address
,
assumed
;
do
{
assumed
=
old
;
if
(
assumed
>=
val
)
{
break
;
}
old
=
atomicCAS
(
address
,
assumed
,
val
);
}
while
(
assumed
!=
old
);
}
#endif
CUDA_ATOMIC_WRAPPER
(
Max
,
int64_t
)
{
CUDA_ATOMIC_WRAPPER
(
Max
,
int64_t
)
{
// Here, we check long long int must be int64_t.
// Here, we check long long int must be int64_t.
...
@@ -187,7 +206,26 @@ USE_CUDA_ATOMIC(Min, int);
...
@@ -187,7 +206,26 @@ USE_CUDA_ATOMIC(Min, int);
USE_CUDA_ATOMIC
(
Min
,
unsigned
int
);
USE_CUDA_ATOMIC
(
Min
,
unsigned
int
);
// CUDA API uses unsigned long long int, we cannot use uint64_t here.
// CUDA API uses unsigned long long int, we cannot use uint64_t here.
// It because unsigned long long int is not necessarily uint64_t
// It because unsigned long long int is not necessarily uint64_t
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
USE_CUDA_ATOMIC
(
Min
,
unsigned
long
long
int
);
// NOLINT
USE_CUDA_ATOMIC
(
Min
,
unsigned
long
long
int
);
// NOLINT
#else
CUDA_ATOMIC_WRAPPER
(
Min
,
unsigned
long
long
int
)
{
if
(
*
address
<=
val
)
{
return
;
}
unsigned
long
long
int
old
=
*
address
,
assumed
;
do
{
assumed
=
old
;
if
(
assumed
<=
val
)
{
break
;
}
old
=
atomicCAS
(
address
,
assumed
,
val
);
}
while
(
assumed
!=
old
);
}
#endif
CUDA_ATOMIC_WRAPPER
(
Min
,
int64_t
)
{
CUDA_ATOMIC_WRAPPER
(
Min
,
int64_t
)
{
// Here, we check long long int must be int64_t.
// Here, we check long long int must be int64_t.
...
...
paddle/scripts/installation_validate.py
浏览文件 @
d0c9f7e8
...
@@ -15,4 +15,5 @@
...
@@ -15,4 +15,5 @@
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
paddle
as
pd
import
paddle
as
pd
fluid
.
install_check
.
run_check
()
print
(
pd
.
__version__
)
print
(
pd
.
__version__
)
paddle/scripts/paddle_build.bat
浏览文件 @
d0c9f7e8
...
@@ -40,6 +40,7 @@ if not defined WITH_TPCACHE set WITH_TPCACHE=ON
...
@@ -40,6 +40,7 @@ if not defined WITH_TPCACHE set WITH_TPCACHE=ON
rem -------set cache build work directory-----------
rem -------set cache build work directory-----------
rmdir
build
\python
/s/q
if
"
%WITH_CACHE%
"
==
"OFF"
(
if
"
%WITH_CACHE%
"
==
"OFF"
(
rmdir
build
/s/q
rmdir
build
/s/q
goto
:mkbuild
goto
:mkbuild
...
@@ -48,10 +49,10 @@ if "%WITH_CACHE%"=="OFF" (
...
@@ -48,10 +49,10 @@ if "%WITH_CACHE%"=="OFF" (
for
/F
%%
#
in
(
'wmic os get localdatetime
^|
findstr 20'
)
do
set
datetime
=
%%
#
for
/F
%%
#
in
(
'wmic os get localdatetime
^|
findstr 20'
)
do
set
datetime
=
%%
#
set
day_now
=
%datetime
:
~
6
,
2
%
set
day_now
=
%datetime
:
~
6
,
2
%
set
day_before
=-
1
set
day_before
=-
1
set
/p
day_before
=<
day
.txt
set
/p
day_before
=<
%work_dir%
\..\
day.txt
if
%day_now%
NEQ
%day_before%
(
if
%day_now%
NEQ
%day_before%
(
echo
%day_now%
>
day
.txt
echo
%day_now%
>
%work_dir%
\..\
day.txt
type
day
.txt
type
%work_dir%
\..\
day.txt
rmdir
build
/s/q
rmdir
build
/s/q
)
)
git
diff
origin
/develop --stat --name-only
|
findstr
"cmake CMakeLists.txt paddle_build.bat"
git
diff
origin
/develop --stat --name-only
|
findstr
"cmake CMakeLists.txt paddle_build.bat"
...
@@ -208,7 +209,7 @@ echo Build third_party the %build_times% time:
...
@@ -208,7 +209,7 @@ echo Build third_party the %build_times% time:
msbuild
/m /p
:Configuration
=
Release
/verbosity
:quiet
third_party
.vcxproj
msbuild
/m /p
:Configuration
=
Release
/verbosity
:quiet
third_party
.vcxproj
if
%ERRORLEVEL%
NEQ
0
(
if
%ERRORLEVEL%
NEQ
0
(
set
/a
build_times
=
%build_times%
+
1
set
/a
build_times
=
%build_times%
+
1
if
%build_times%
GTR
3
(
if
%build_times%
GTR
2
(
exit
/b
7
exit
/b
7
)
else
(
)
else
(
echo
Build
third_party
failed
,
will
retry
!
echo
Build
third_party
failed
,
will
retry
!
...
@@ -223,7 +224,7 @@ echo Build Paddle the %build_times% time:
...
@@ -223,7 +224,7 @@ echo Build Paddle the %build_times% time:
msbuild
/m
:
%PARALLEL_PROJECT_COUNT%
/p
:TrackFileAccess
=
false
/p
:CLToolExe
=
clcache
.exe
/p
:CLToolPath
=
%PYTHON_ROOT%
\Scripts
/p
:Configuration
=
Release
/verbosity
:minimal
paddle
.sln
msbuild
/m
:
%PARALLEL_PROJECT_COUNT%
/p
:TrackFileAccess
=
false
/p
:CLToolExe
=
clcache
.exe
/p
:CLToolPath
=
%PYTHON_ROOT%
\Scripts
/p
:Configuration
=
Release
/verbosity
:minimal
paddle
.sln
if
%ERRORLEVEL%
NEQ
0
(
if
%ERRORLEVEL%
NEQ
0
(
set
/a
build_times
=
%build_times%
+
1
set
/a
build_times
=
%build_times%
+
1
if
%build_times%
GTR
2
(
if
%build_times%
GTR
1
(
exit
/b
7
exit
/b
7
)
else
(
)
else
(
echo
Build
Paddle
failed
,
will
retry
!
echo
Build
Paddle
failed
,
will
retry
!
...
@@ -301,6 +302,7 @@ goto:eof
...
@@ -301,6 +302,7 @@ goto:eof
call
paddle_winci
\Scripts\deactivate.bat
2
>
NUL
call
paddle_winci
\Scripts\deactivate.bat
2
>
NUL
for
/F
%%
#
in
(
'wmic os get localdatetime
^|
findstr 20'
)
do
set
end
=
%%
#
for
/F
%%
#
in
(
'wmic os get localdatetime
^|
findstr 20'
)
do
set
end
=
%%
#
set
end
=
%end
:
~
4
,
10
%
set
end
=
%end
:
~
4
,
10
%
call
:timestamp
"
%start%
"
"
%end%
"
"1 card TestCases Total"
call
:timestamp
"
%start%
"
"
%end%
"
"TestCases Total"
call
:timestamp
"
%start%
"
"
%end%
"
"TestCases Total"
echo
Running
unit
tests
failed
,
will
exit
!
echo
Running
unit
tests
failed
,
will
exit
!
exit
/b
8
exit
/b
8
...
@@ -313,6 +315,7 @@ echo ========================================
...
@@ -313,6 +315,7 @@ echo ========================================
for
/F
%%
#
in
(
'wmic os get localdatetime
^|
findstr 20'
)
do
set
end
=
%%
#
for
/F
%%
#
in
(
'wmic os get localdatetime
^|
findstr 20'
)
do
set
end
=
%%
#
set
end
=
%end
:
~
4
,
10
%
set
end
=
%end
:
~
4
,
10
%
call
:timestamp
"
%start%
"
"
%end%
"
"1 card TestCases Total"
call
:timestamp
"
%start%
"
"
%end%
"
"TestCases Total"
call
:timestamp
"
%start%
"
"
%end%
"
"TestCases Total"
cd
%work_dir%
\paddle\fluid\inference\api\demo_ci
cd
%work_dir%
\paddle\fluid\inference\api\demo_ci
...
@@ -345,6 +348,8 @@ echo ============================================ >> check_change_of_unitte
...
@@ -345,6 +348,8 @@ echo ============================================ >> check_change_of_unitte
echo
EOF
>>
check_change_of_unittest
.sh
echo
EOF
>>
check_change_of_unittest
.sh
echo
spec_path
=
$
(
pwd
)
/UNITTEST
_PR.spec>>
check_change_of_unittest
.sh
echo
spec_path
=
$
(
pwd
)
/UNITTEST
_PR.spec>>
check_change_of_unittest
.sh
echo
ctest
-N
^|
awk
-F
':'
'{print $2}'
^|
sed
'/
^^
$/d'
^|
sed
'$d'
^>
$
{
spec_path
}>>
check_change_of_unittest
.sh
echo
ctest
-N
^|
awk
-F
':'
'{print $2}'
^|
sed
'/
^^
$/d'
^|
sed
'$d'
^>
$
{
spec_path
}>>
check_change_of_unittest
.sh
echo
num
=
$
(
awk
'END{print NR}'
$
{
spec_path
})>>
check_change_of_unittest
.sh
echo
echo
"Windows 1 card TestCases count is $num"
>>
check_change_of_unittest
.sh
echo
UPSTREAM_URL
=
'https://github.com/PaddlePaddle/Paddle'
>>
check_change_of_unittest
.sh
echo
UPSTREAM_URL
=
'https://github.com/PaddlePaddle/Paddle'
>>
check_change_of_unittest
.sh
echo
origin_upstream_url
=
`git remote -v
^|
awk '{print $1, $2}'
^|
uniq
^|
grep upstream
^|
awk '{print $2}'`
>>
check_change_of_unittest
.sh
echo
origin_upstream_url
=
`git remote -v
^|
awk '{print $1, $2}'
^|
uniq
^|
grep upstream
^|
awk '{print $2}'`
>>
check_change_of_unittest
.sh
echo
if
[
"$origin_upstream_url"
==
""
];
then
>>
check_change_of_unittest
.sh
echo
if
[
"$origin_upstream_url"
==
""
];
then
>>
check_change_of_unittest
.sh
...
@@ -455,8 +460,6 @@ taskkill /f /im cvtres.exe 2>NUL
...
@@ -455,8 +460,6 @@ taskkill /f /im cvtres.exe 2>NUL
taskkill
/f /im
rc
.exe
2
>
NUL
taskkill
/f /im
rc
.exe
2
>
NUL
wmic
process
where
name
=
"op_function_generator.exe"
call
terminate
2
>
NUL
wmic
process
where
name
=
"op_function_generator.exe"
call
terminate
2
>
NUL
taskkill
/f /im
python
.exe
2
>
NUL
taskkill
/f /im
python
.exe
2
>
NUL
call
paddle_winci
\Scripts\deactivate.bat
2
>
NUL
del
%PADDLE_WHL_FILE_WIN%
taskkill
/f /im
python
.exe
2
>
NUL
taskkill
/f /im
python
.exe
2
>
NUL
echo
Windows
CI
run
successfully
!
echo
Windows
CI
run
successfully
!
exit
/b
0
exit
/b
0
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
d0c9f7e8
...
@@ -988,11 +988,6 @@ set +x
...
@@ -988,11 +988,6 @@ set +x
fi
fi
read
testcase
<<<
$(
echo
"
$line
"
|grep
-oEi
"
\w
+$"
)
read
testcase
<<<
$(
echo
"
$line
"
|grep
-oEi
"
\w
+$"
)
if
python
$PADDLE_ROOT
/tools/is_ut_disabled.py
$testcase
;
then
echo
$testcase
" is disabled."
continue
fi
if
[[
"
$is_nightly
"
!=
""
]]
&&
[
${
NIGHTLY_MODE
:-
OFF
}
==
"OFF"
]
;
then
if
[[
"
$is_nightly
"
!=
""
]]
&&
[
${
NIGHTLY_MODE
:-
OFF
}
==
"OFF"
]
;
then
echo
$testcase
" will only run at night."
echo
$testcase
" will only run at night."
continue
continue
...
...
python/paddle/distributed/fleet/base/distributed_strategy.py
浏览文件 @
d0c9f7e8
...
@@ -845,6 +845,29 @@ class DistributedStrategy(object):
...
@@ -845,6 +845,29 @@ class DistributedStrategy(object):
check_configs_key
(
self
.
strategy
.
dgc_configs
,
configs
,
"dgc_configs"
)
check_configs_key
(
self
.
strategy
.
dgc_configs
,
configs
,
"dgc_configs"
)
assign_configs_value
(
self
.
strategy
.
dgc_configs
,
configs
)
assign_configs_value
(
self
.
strategy
.
dgc_configs
,
configs
)
@property
def fp16_allreduce(self):
    """
    Whether gradients are all-reduced in fp16 during training.

    Default Value: False

    Examples:
      .. code-block:: python

        import paddle.distributed.fleet as fleet
        strategy = fleet.DistributedStrategy()
        strategy.fp16_allreduce = True  # by default this is false
    """
    return self.strategy.fp16_allreduce

@fp16_allreduce.setter
@is_strict_auto
def fp16_allreduce(self, flag):
    # Only genuine bool values are stored; anything else is rejected.
    if isinstance(flag, bool):
        self.strategy.fp16_allreduce = flag
    else:
        raise TypeError('fp16_allreduce must be value of bool type')
@
property
@
property
def
gradient_merge
(
self
):
def
gradient_merge
(
self
):
"""
"""
...
...
python/paddle/distributed/fleet/meta_optimizers/__init__.py
浏览文件 @
d0c9f7e8
...
@@ -23,3 +23,4 @@ from .lars_optimizer import LarsOptimizer
...
@@ -23,3 +23,4 @@ from .lars_optimizer import LarsOptimizer
from
.parameter_server_graph_optimizer
import
ParameterServerGraphOptimizer
from
.parameter_server_graph_optimizer
import
ParameterServerGraphOptimizer
from
.dgc_optimizer
import
DGCOptimizer
from
.dgc_optimizer
import
DGCOptimizer
from
.lamb_optimizer
import
LambOptimizer
from
.lamb_optimizer
import
LambOptimizer
from
.fp16_allreduce_optimizer
import
FP16AllReduceOptimizer
python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
0 → 100755
浏览文件 @
d0c9f7e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
paddle.fluid
import
core
,
framework
,
unique_name
from
.meta_optimizer_base
import
MetaOptimizerBase
class FP16AllReduceOptimizer(MetaOptimizerBase):
    """Meta optimizer that wraps another optimizer and routes its gradients
    through fp32 -> fp16 and fp16 -> fp32 cast ops before delegating to it.
    """

    def __init__(self, optimizer):
        super(FP16AllReduceOptimizer, self).__init__(optimizer)
        self.inner_opt = optimizer
        # A meta optimizer is currently not allowed to be the inner optimizer.
        self.meta_optimizers_white_list = [
            "LarsOptimizer", "LambOptimizer", "RecomputeOptimizer",
            "LocalSGDOptimizer", "GradientMergeOptimizer",
            "GraphExecutionOptimizer", "AdaptiveLocalSGDOptimizer"
        ]
        self.meta_optimizers_black_list = ["DGCOptimizer"]

    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                        user_defined_strategy):
        # Pure delegation to the base class.
        super(FP16AllReduceOptimizer, self)._set_basic_info(
            loss, role_maker, user_defined_optimizer, user_defined_strategy)

    def _can_apply(self):
        """Return True only for a collective role maker whose strategy has
        fp16_allreduce switched on."""
        return (bool(self.role_maker._is_collective) and
                bool(self.user_defined_strategy.fp16_allreduce))

    def _disable_strategy(self, dist_strategy):
        dist_strategy.fp16_allreduce = False

    def _enable_strategy(self, dist_strategy, context=None):
        dist_strategy.fp16_allreduce = True

    @staticmethod
    def fp16_compression(param_and_grads):
        """
        Insert cast ops so that fp32 gradients are represented in fp16 and
        then converted back to fp32.

        Args:
            param_and_grads: iterable of (param, grad) pairs.

        Returns:
            list of (param, grad) pairs; for every gradient that was cast,
            grad is the newly created fp32 variable, otherwise the original.
        """
        op_maker = core.op_proto_and_checker_maker
        fp16 = core.VarDesc.VarType.FP16
        fp32 = core.VarDesc.VarType.FP32

        # Pass 1: append the fp32 -> fp16 cast for every eligible gradient.
        # Entries are (param, grad, was_cast).
        staged = []
        for param, grad in param_and_grads:
            if grad is None or grad.dtype != fp32:
                staged.append((param, grad, False))
                continue

            producer = grad.op
            owner = grad.block
            role_vars = producer.all_attrs()[op_maker.kOpRoleVarAttrName()]
            if param.name not in role_vars:
                staged.append((param, grad, False))
                continue

            # Detach (param, grad) from the producing op's op_role_var; the
            # pair is re-attached to the cast op created below.
            role_vars.remove(param.name)
            role_vars.remove(grad.name)
            if len(role_vars) <= 1:
                producer._remove_attr(op_maker.kOpRoleVarAttrName())
            else:
                producer._set_attr(op_maker.kOpRoleVarAttrName(), role_vars)

            half_grad = owner.create_var(
                name=unique_name.generate(grad.name + ".cast_fp16"),
                dtype=fp16,
                persistable=False,
                stop_gradient=True)

            with owner.program._backward_role_guard():
                to_half = owner.append_op(
                    type="cast",
                    inputs={"X": grad},
                    outputs={"Out": half_grad},
                    attrs={"in_dtype": fp32,
                           "out_dtype": fp16},
                    stop_gradient=True)

                to_half._set_attr(op_maker.kOpRoleAttrName(),
                                  op_maker.OpRole.Backward)
                to_half._set_attr(op_maker.kOpRoleVarAttrName(),
                                  [param.name, half_grad.name])
                half_grad.op = to_half

            staged.append((param, half_grad, True))

        # Pass 2: append the fp16 -> fp32 cast for the gradients cast above.
        # NOTE: the two passes are kept as separate loops on purpose. When
        # they are merged, fuse allreduce produces wrong results. This is
        # likely a problem in the fuse allreduce pass that should be fixed
        # in the future.
        result = []
        for param, grad, was_cast in staged:
            if not was_cast:
                result.append((param, grad))
                continue

            owner = grad.block
            full_grad = owner.create_var(
                name=unique_name.generate(grad.name + ".cast_fp32"),
                dtype=fp32,
                persistable=False,
                stop_gradient=True)

            with owner.program._optimized_guard(
                [param, grad]), framework.name_scope('fp16_allreduce'):
                owner.append_op(
                    type="cast",
                    inputs={"X": grad},
                    outputs={"Out": full_grad},
                    attrs={"in_dtype": fp16,
                           "out_dtype": fp32},
                    stop_gradient=True)
            result.append((param, full_grad))

        return result

    def apply_optimize(self, loss, startup_program, params_grads):
        """Run fp16_compression on params_grads, then hand the resulting
        pairs to the wrapped optimizer's apply_optimize."""
        return self.inner_opt.apply_optimize(
            loss,
            startup_program=startup_program,
            params_grads=self.fp16_compression(params_grads))
python/paddle/fluid/executor.py
浏览文件 @
d0c9f7e8
...
@@ -1355,7 +1355,7 @@ class Executor(object):
...
@@ -1355,7 +1355,7 @@ class Executor(object):
if
not
program
.
_fleet_opt
is
None
:
if
not
program
.
_fleet_opt
is
None
:
if
program
.
_fleet_opt
.
get
(
"worker_class"
,
""
)
==
"HeterCpuWorker"
:
if
program
.
_fleet_opt
.
get
(
"worker_class"
,
""
)
==
"HeterCpuWorker"
:
is_heter
=
1
is_heter
=
1
if
program
.
_fleet_opt
(
"trainer"
,
""
)
==
"HeterXpuTrainer"
:
if
program
.
_fleet_opt
.
get
(
"trainer"
,
""
)
==
"HeterXpuTrainer"
:
is_heter
=
1
is_heter
=
1
if
scope
is
None
:
if
scope
is
None
:
scope
=
global_scope
()
scope
=
global_scope
()
...
...
python/paddle/fluid/reader.py
浏览文件 @
d0c9f7e8
...
@@ -167,10 +167,10 @@ class DataLoader(object):
...
@@ -167,10 +167,10 @@ class DataLoader(object):
The variables should be created by :code:`fluid.data()`.
The variables should be created by :code:`fluid.data()`.
:attr:`feed_list` must be set if :attr:`return_list` is
:attr:`feed_list` must be set if :attr:`return_list` is
False. Default None.
False. Default None.
places(list(Place)|tuple(Place)
): a list of Place, to put data
places(list(Place)|tuple(Place)
|optional): a list of Place,
onto, :attr:`places` must be set in both static graph and
to put data onto, :attr:`places` can be None, if
dynamic graph mode, in dynamic graph mode, place number must
:attr:`places` is None, default place(CPUPlace or CUDAPlace(0))
be 1
. Default None.
will be used
. Default None.
return_list (bool): whether the return value on each device is
return_list (bool): whether the return value on each device is
presented as a list. If :attr:`return_list=False`, the return
presented as a list. If :attr:`return_list=False`, the return
value on each device would be a dict of str -> LoDTensor, where
value on each device would be a dict of str -> LoDTensor, where
...
@@ -222,6 +222,8 @@ class DataLoader(object):
...
@@ -222,6 +222,8 @@ class DataLoader(object):
.. code-block:: python
.. code-block:: python
import numpy as np
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid as fluid
from paddle.io import Dataset, BatchSampler, DataLoader
from paddle.io import Dataset, BatchSampler, DataLoader
...
@@ -247,11 +249,48 @@ class DataLoader(object):
...
@@ -247,11 +249,48 @@ class DataLoader(object):
def __len__(self):
def __len__(self):
return self.num_samples
return self.num_samples
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
# get places
# get places
places = fluid.cuda_places() if USE_GPU else fluid.cpu_places()
places = fluid.cuda_places() if USE_GPU else fluid.cpu_places()
# --------------------- dygraph mode --------------------
class SimpleNet(fluid.dygraph.Layer):
def __init__(self):
super(SimpleNet, self).__init__()
self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax')
def forward(self, image, label=None):
return self.fc(image)
with fluid.dygraph.guard(places[0]):
simple_net = SimpleNet()
opt = fluid.optimizer.SGD(learning_rate=1e-3,
parameter_list=simple_net.parameters())
loader = DataLoader(dataset,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
for e in range(EPOCH_NUM):
for i, (image, label) in enumerate(loader()):
out = simple_net(image)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.reduce_mean(loss)
avg_loss.backward()
opt.minimize(avg_loss)
simple_net.clear_gradients()
print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
# -------------------------------------------------------
# -------------------- static graph ---------------------
# -------------------- static graph ---------------------
paddle.enable_static()
def simple_net(image, label):
def simple_net(image, label):
fc_tmp = fluid.layers.fc(image, size=CLASS_NUM, act='softmax')
fc_tmp = fluid.layers.fc(image, size=CLASS_NUM, act='softmax')
cross_entropy = fluid.layers.softmax_with_cross_entropy(image, label)
cross_entropy = fluid.layers.softmax_with_cross_entropy(image, label)
...
@@ -270,11 +309,8 @@ class DataLoader(object):
...
@@ -270,11 +309,8 @@ class DataLoader(object):
prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
loader = DataLoader(dataset,
loader = DataLoader(dataset,
feed_list=[image, label],
feed_list=[image, label],
places=places,
batch_size=BATCH_SIZE,
batch_size=BATCH_SIZE,
shuffle=True,
shuffle=True,
drop_last=True,
drop_last=True,
...
@@ -287,39 +323,6 @@ class DataLoader(object):
...
@@ -287,39 +323,6 @@ class DataLoader(object):
# -------------------------------------------------------
# -------------------------------------------------------
# --------------------- dygraph mode --------------------
class SimpleNet(fluid.dygraph.Layer):
def __init__(self):
super(SimpleNet, self).__init__()
self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax')
def forward(self, image, label=None):
return self.fc(image)
with fluid.dygraph.guard(places[0]):
simple_net = SimpleNet()
opt = fluid.optimizer.SGD(learning_rate=1e-3,
parameter_list=simple_net.parameters())
loader = DataLoader(dataset,
places=places[0],
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
for e in range(EPOCH_NUM):
for i, (image, label) in enumerate(loader()):
out = simple_net(image)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.reduce_mean(loss)
avg_loss.backward()
opt.minimize(avg_loss)
simple_net.clear_gradients()
print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
# -------------------------------------------------------
.. note::
.. note::
For reading iterable dataset with multiprocess Dataloader,
For reading iterable dataset with multiprocess Dataloader,
...
@@ -356,11 +359,9 @@ class DataLoader(object):
...
@@ -356,11 +359,9 @@ class DataLoader(object):
"feed_list should be set when return_list=False"
"feed_list should be set when return_list=False"
self
.
feed_list
=
feed_list
self
.
feed_list
=
feed_list
assert
places
is
not
None
,
"places cannot be None"
if
places
is
None
:
places
=
_current_expected_place
()
self
.
places
=
_convert_places
(
places
)
self
.
places
=
_convert_places
(
places
)
if
in_dygraph_mode
():
assert
len
(
self
.
places
)
==
1
,
\
"Number of places must be 1 in dygraph mode"
assert
num_workers
>=
0
,
"num_workers should be a non-negative value"
assert
num_workers
>=
0
,
"num_workers should be a non-negative value"
if
num_workers
>
0
and
(
sys
.
platform
==
'darwin'
or
if
num_workers
>
0
and
(
sys
.
platform
==
'darwin'
or
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
d0c9f7e8
...
@@ -45,6 +45,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer)
...
@@ -45,6 +45,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_fp16_allreduce_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_private_function
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_private_function
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base
)
...
@@ -334,9 +335,6 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op)
...
@@ -334,9 +335,6 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op)
# disable this unittest temporarily
# disable this unittest temporarily
list
(
REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception
)
list
(
REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception
)
list
(
REMOVE_ITEM TEST_OPS test_sampling_id_op
)
list
(
REMOVE_ITEM TEST_OPS test_sampling_id_op
)
list
(
REMOVE_ITEM TEST_OPS test_paddle_save_load
)
if
(
APPLE OR WIN32
)
if
(
APPLE OR WIN32
)
list
(
REMOVE_ITEM TEST_OPS test_dataset
)
list
(
REMOVE_ITEM TEST_OPS test_dataset
)
...
@@ -458,6 +456,7 @@ if(WITH_DISTRIBUTE)
...
@@ -458,6 +456,7 @@ if(WITH_DISTRIBUTE)
py_test_modules
(
test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_private_function MODULES test_fleet_private_function ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_private_function MODULES test_fleet_private_function ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS
${
dist_ENVS
}
)
...
...
python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
0 → 100644
浏览文件 @
d0c9f7e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
paddle
import
paddle.fluid
as
fluid
from
paddle.distributed.fleet.meta_optimizers
import
FP16AllReduceOptimizer
as
FP16AllReduce
from
test_dist_base
import
TestDistRunnerBase
,
runtime_main
from
dist_mnist
import
cnn_model
# dtype string used for the 'pixel' input declared in get_model below.
DTYPE = "float32"
# Runs at import time; presumably makes the MNIST data available locally
# before the test starts -- confirm against paddle.dataset.mnist.fetch.
paddle.dataset.mnist.fetch()

# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
class TestDistMnist2x2(TestDistRunnerBase):
    """Distributed MNIST runner whose Momentum optimizer is wrapped in
    FP16AllReduce."""

    def get_model(self, batch_size=2):
        """Build the network and return, in order: the inference program,
        the average loss, the train reader, the test reader, the batch
        accuracy and the prediction variable."""
        layers = fluid.layers

        # Inputs
        pixel = layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
        target = layers.data(name='label', shape=[1], dtype='int64')

        # Forward pass and loss
        predict = cnn_model(pixel)
        loss = layers.cross_entropy(input=predict, label=target)
        avg_cost = layers.mean(x=loss)

        # Accuracy
        total = layers.create_tensor(dtype='int64')
        batch_acc = layers.accuracy(input=predict, label=target, total=total)

        # Snapshot of the main program taken before minimize() is called.
        inference_program = fluid.default_main_program().clone()

        # Momentum optimizer wrapped by the fp16 allreduce meta optimizer
        optimizer = FP16AllReduce(
            fluid.optimizer.MomentumOptimizer(
                learning_rate=0.001, momentum=0.9))

        # Both readers are built from the MNIST test split.
        train_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)

        optimizer.minimize(avg_cost)
        return (inference_program, avg_cost, train_reader, test_reader,
                batch_acc, predict)
# When executed as a script, hand the runner class to runtime_main.
if __name__ == "__main__":
    runtime_main(TestDistMnist2x2)
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
0 → 100644
浏览文件 @
d0c9f7e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
inference_pass_test
import
InferencePassTest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.core
import
AnalysisConfig
from
paddle.fluid.core
import
PassVersionChecker
class ConvActivationMkldnnFusePassTest(InferencePassTest):
    """Builds a single conv2d layer with an activation and checks it with
    MKL-DNN enabled; subclasses vary the parameters via set_params()."""

    def setUp(self):
        self.set_params()
        programs = (self.main_program, self.startup_program)
        with fluid.program_guard(*programs):
            image = fluid.data(
                name="data", shape=[-1, 3, 100, 100], dtype="float32")
            conv_kwargs = dict(
                num_filters=self.conv_num_filters,
                filter_size=self.conv_filter_size,
                bias_attr=self.conv_bias_attr,
                act=self.act)
            conv_out = fluid.layers.conv2d(image, **conv_kwargs)

        random_input = np.random.random((1, 3, 100, 100))
        self.feeds = {"data": random_input.astype("float32")}
        self.fetch_list = [conv_out]
        self.enable_mkldnn = True

    def set_params(self):
        # Default case: 3x3 conv, 3 filters, no bias, relu.
        self.conv_num_filters = self.conv_filter_size = 3
        self.conv_bias_attr = False
        self.act = "relu"
        self.pass_name = 'conv_relu_mkldnn_fuse_pass'

    def test_check_output(self):
        # The positional argument is the use_gpu flag.
        self.check_output_with_option(False)

    def test_pass_compatible(self):
        compatible = PassVersionChecker.IsCompatible(self.pass_name)
        self.assertTrue(compatible)
class ConvActivationMkldnnFusePassTest_1(ConvActivationMkldnnFusePassTest):
    """relu case: 5x5 conv, 5 filters, with bias."""

    def set_params(self):
        self.pass_name = 'conv_relu_mkldnn_fuse_pass'
        self.act = "relu"
        self.conv_bias_attr = True
        self.conv_num_filters = self.conv_filter_size = 5
class ConvActivationMkldnnFusePassTest_2(ConvActivationMkldnnFusePassTest):
    """leaky_relu case: 3x3 conv, 3 filters, no bias."""

    def set_params(self):
        self.pass_name = 'conv_leaky_relu_mkldnn_fuse_pass'
        self.act = "leaky_relu"
        self.conv_bias_attr = False
        self.conv_num_filters = self.conv_filter_size = 3
class ConvActivationMkldnnFusePassTest_3(ConvActivationMkldnnFusePassTest):
    """leaky_relu case: 5x5 conv, 5 filters, with bias."""

    def set_params(self):
        self.pass_name = 'conv_leaky_relu_mkldnn_fuse_pass'
        self.act = "leaky_relu"
        self.conv_bias_attr = True
        self.conv_num_filters = self.conv_filter_size = 5
class ConvActivationMkldnnFusePassTest_4(ConvActivationMkldnnFusePassTest):
    """relu6 case: 3x3 conv, 3 filters, no bias."""

    def set_params(self):
        self.pass_name = 'conv_relu6_mkldnn_fuse_pass'
        self.act = "relu6"
        self.conv_bias_attr = False
        self.conv_num_filters = self.conv_filter_size = 3
# Renamed from a second ConvActivationMkldnnFusePassTest_4: reusing that name
# rebinds it at module level, so the earlier relu6 case would be shadowed and
# never collected by unittest.
class ConvActivationMkldnnFusePassTest_5(ConvActivationMkldnnFusePassTest):
    """swish case: 5x5 conv, 5 filters, with bias."""

    def set_params(self):
        self.conv_num_filters = 5
        self.conv_filter_size = 5
        self.conv_bias_attr = True
        self.act = "swish"
        self.pass_name = 'conv_swish_mkldnn_fuse_pass'
# Run every test case defined in this module when executed as a script.
if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
0 → 100644
浏览文件 @
d0c9f7e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
inference_pass_test
import
InferencePassTest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.core
import
AnalysisConfig
from
paddle.fluid.core
import
PassVersionChecker
class ConvConcatReluMkldnnFusePassTest_0(InferencePassTest):
    """Builds two conv2d branches that are concatenated and passed through
    relu, and checks the result with MKL-DNN enabled; subclasses vary the
    parameters via set_params()."""

    def setUp(self):
        self.set_params()
        programs = (self.main_program, self.startup_program)
        with fluid.program_guard(*programs):
            # Creation order of inputs and layers is kept as-is.
            input_1 = fluid.data(
                name="data_1", shape=[-1, 3, 100, 100], dtype="float32")
            input_2 = fluid.data(
                name="data_2", shape=[-1, 3, 100, 100], dtype="float32")
            branch_1 = fluid.layers.conv2d(
                input_1,
                num_filters=self.conv1_num_filters,
                filter_size=self.conv1_filter_size,
                padding=self.conv1_padding,
                bias_attr=self.conv1_bias_attr)
            branch_2 = fluid.layers.conv2d(
                input_2,
                num_filters=self.conv2_num_filters,
                filter_size=self.conv2_filter_size,
                padding=self.conv2_padding,
                bias_attr=self.conv2_bias_attr)
            merged = fluid.layers.concat(
                [branch_1, branch_2], axis=self.concat_axis)
            out = fluid.layers.relu(merged)

        feed_1 = np.random.random((1, 3, 100, 100)).astype("float32")
        feed_2 = np.random.random((1, 3, 100, 100)).astype("float32")
        self.feeds = {"data_1": feed_1, "data_2": feed_2}
        self.fetch_list = [out]
        self.enable_mkldnn = True

    def set_params(self):
        # Both branches: 3x3 conv, 3 filters, no padding, no bias.
        self.conv1_num_filters = self.conv1_filter_size = 3
        self.conv1_padding = 0
        self.conv1_bias_attr = False
        self.conv2_num_filters = self.conv2_filter_size = 3
        self.conv2_padding = 0
        self.conv2_bias_attr = False
        self.concat_axis = 0
        self.pass_name = "conv_concat_relu_mkldnn_fuse_pass"

    def test_check_output(self):
        # The positional argument is the use_gpu flag.
        self.check_output_with_option(False)

    def test_pass_compatible(self):
        compatible = PassVersionChecker.IsCompatible(self.pass_name)
        self.assertTrue(compatible)
class ConvConcatReluMkldnnFusePassTest_1(ConvConcatReluMkldnnFusePassTest_0):
    """Variant with different settings per branch, concatenated on axis 1."""

    def set_params(self):
        self.pass_name = "conv_concat_relu_mkldnn_fuse_pass"
        self.concat_axis = 1
        # First branch: 3x3 conv, 3 filters, no padding, no bias.
        self.conv1_num_filters = self.conv1_filter_size = 3
        self.conv1_padding = 0
        self.conv1_bias_attr = False
        # Second branch: 5x5 conv, 5 filters, padding 1, with bias.
        self.conv2_num_filters = self.conv2_filter_size = 5
        self.conv2_padding = 1
        self.conv2_bias_attr = True
# Run every test case defined in this module when executed as a script.
if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
0 → 100644
浏览文件 @
d0c9f7e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
inference_pass_test
import
InferencePassTest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.core
import
AnalysisConfig
from
paddle.fluid.core
import
PassVersionChecker
class MatmulTransposeReshapeMkldnnFusePassTest(InferencePassTest):
    """Inference test for a matmul -> transpose -> reshape graph.

    ``setUp`` builds the graph from the attributes filled in by
    ``set_params``, fetches the reshape output and sets
    ``enable_mkldnn``. Subclasses override ``set_params`` to vary the
    configuration.
    """

    def setUp(self):
        """Build the test program from the configured parameters."""
        self.set_params()
        with fluid.program_guard(self.main_program, self.startup_program):
            inputs = fluid.data(
                name="data", shape=self.data_shape, dtype="float32")
            weights = fluid.layers.create_parameter(
                shape=self.weight_shape, dtype="float32")
            product = fluid.layers.matmul(
                inputs,
                weights,
                transpose_x=self.transpose_x,
                transpose_y=self.transpose_y)
            # NOTE: the attribute is spelled "tranpose_perm" (sic); the
            # spelling is kept because set_params assigns it under this name.
            permuted = fluid.layers.transpose(product, self.tranpose_perm)
            reshaped = fluid.layers.reshape(permuted, shape=self.reshape_shape)
        self.fetch_list = [reshaped]
        self.enable_mkldnn = True

    def set_params(self):
        """Default configuration: no operand transposition, 100x110 by 110x100 inputs."""
        self.pass_name = 'matmul_transpose_reshape_fuse_pass'
        self.data_shape = [-1, 3, 100, 110]
        self.weight_shape = [1, 3, 110, 100]
        sample = np.random.random((1, 3, 100, 110)).astype("float32")
        self.feeds = {"data": sample}
        self.transpose_x = False
        self.transpose_y = False
        self.tranpose_perm = [0, 2, 1, 3]
        self.reshape_shape = [3, 100, 100]

    def test_check_output(self):
        """Run the output check with GPU usage switched off."""
        self.check_output_with_option(False)

    def test_pass_compatible(self):
        """Assert that the pass named by ``self.pass_name`` is reported as compatible."""
        compatible = PassVersionChecker.IsCompatible(self.pass_name)
        self.assertTrue(compatible)
class MatmulTransposeReshapeMkldnnFusePassTest_1(
        MatmulTransposeReshapeMkldnnFusePassTest):
    """Variant with both matmul operands transposed and a different reshape target.

    Only ``set_params`` is overridden; everything else is inherited.
    """

    def set_params(self):
        """Square 100x100 inputs, transpose_x/transpose_y enabled."""
        self.pass_name = 'matmul_transpose_reshape_fuse_pass'
        self.data_shape = [-1, 3, 100, 100]
        self.weight_shape = [1, 3, 100, 100]
        sample = np.random.random((1, 3, 100, 100)).astype("float32")
        self.feeds = {"data": sample}
        self.transpose_x = self.transpose_y = True
        # Spelling "tranpose_perm" (sic) matches the name the parent class reads.
        self.tranpose_perm = [0, 2, 1, 3]
        self.reshape_shape = [6, 50, 100]
# Script entry point: when this module is executed directly, run its tests
# through unittest.
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
0 → 100644
浏览文件 @
d0c9f7e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
inference_pass_test
import
InferencePassTest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.core
import
AnalysisConfig
from
paddle.fluid.core
import
PassVersionChecker
class ScaleMatmulMkldnnFusePassTest(InferencePassTest):
    """Inference test for a scale -> matmul graph.

    ``setUp`` scales the input by ``self.scale_scale``, multiplies it with
    a created parameter, fetches the matmul output and sets
    ``enable_mkldnn``. Subclasses override ``set_params`` to vary the
    configuration.
    """

    def setUp(self):
        """Build the test program from the configured parameters."""
        self.set_params()
        tensor_shape = [1, 3, 100, 100]
        with fluid.program_guard(self.main_program, self.startup_program):
            inputs = fluid.data(
                name="data", shape=list(tensor_shape), dtype="float32")
            weights = fluid.layers.create_parameter(
                shape=list(tensor_shape), dtype="float32")
            scaled = fluid.layers.scale(inputs, scale=self.scale_scale)
            product = fluid.layers.matmul(
                scaled,
                weights,
                transpose_x=self.transpose_x,
                transpose_y=self.transpose_y)
        self.fetch_list = [product]
        self.enable_mkldnn = True

    def set_params(self):
        """Default configuration: scale factor 2.0, no operand transposition."""
        self.pass_name = "scale_matmul_fuse_pass"
        sample = np.random.random((1, 3, 100, 100)).astype("float32")
        self.feeds = {"data": sample}
        self.scale_scale = 2.0
        self.transpose_x = False
        self.transpose_y = False

    def test_check_output(self):
        """Run the output check with GPU usage switched off."""
        self.check_output_with_option(False)

    def test_pass_compatible(self):
        """Assert that the pass named by ``self.pass_name`` is reported as compatible."""
        compatible = PassVersionChecker.IsCompatible(self.pass_name)
        self.assertTrue(compatible)
class ScaleMatmulMkldnnFusePassTest_1(ScaleMatmulMkldnnFusePassTest):
    """Variant with scale factor 5.0 and both matmul operands transposed.

    Only ``set_params`` is overridden; everything else is inherited.
    """

    def set_params(self):
        """Scale factor 5.0, transpose_x/transpose_y enabled."""
        self.pass_name = "scale_matmul_fuse_pass"
        sample = np.random.random((1, 3, 100, 100)).astype("float32")
        self.feeds = {"data": sample}
        self.scale_scale = 5.0
        self.transpose_x = self.transpose_y = True
# Script entry point: when this module is executed directly, run its tests
# through unittest.
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/ir/inference/test_
mkldnn_conv_relu
_fuse_pass.py
→
python/paddle/fluid/tests/unittests/ir/inference/test_
seq_concat_fc
_fuse_pass.py
浏览文件 @
d0c9f7e8
...
@@ -20,25 +20,13 @@ from inference_pass_test import InferencePassTest
...
@@ -20,25 +20,13 @@ from inference_pass_test import InferencePassTest
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
import
paddle.fluid.core
as
core
from
paddle.fluid.core
import
AnalysisConfig
from
paddle.fluid.core
import
AnalysisConfig
from
paddle.fluid.core
import
PassVersionChecker
class
ConvBnFusePassMKLDNNTest
(
InferencePassTest
):
class
SeqConcatFCFusePassTest
(
InferencePassTest
):
def
setUp
(
self
):
def
test_compatible
(
self
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
self
.
assertTrue
(
data
=
fluid
.
data
(
PassVersionChecker
.
IsCompatible
(
'seq_concat_fc_fuse_pass'
))
name
=
"data"
,
shape
=
[
-
1
,
3
,
100
,
100
],
dtype
=
"float32"
)
conv_out
=
fluid
.
layers
.
conv2d
(
data
,
num_filters
=
3
,
filter_size
=
3
,
bias_attr
=
False
,
act
=
"relu"
)
self
.
feeds
=
{
"data"
:
np
.
random
.
random
((
1
,
3
,
100
,
100
)).
astype
(
"float32"
)
}
self
.
fetch_list
=
[
conv_out
]
self
.
enable_mkldnn
=
True
def
test_check_output
(
self
):
use_gpu
=
False
self
.
check_output_with_option
(
use_gpu
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
浏览文件 @
d0c9f7e8
...
@@ -147,5 +147,29 @@ class TestSquareDoubleGradCheck(unittest.TestCase):
...
@@ -147,5 +147,29 @@ class TestSquareDoubleGradCheck(unittest.TestCase):
self
.
func
(
p
)
self
.
func
(
p
)
class TestAbsDoubleGradCheck(unittest.TestCase):
    """Double-gradient check for ``layers.abs`` on every available place."""

    @prog_scope()
    def func(self, place):
        """Build ``y = abs(x)`` and run the double-gradient check on ``place``."""
        # The shape of the input variable must be fully specified; it must
        # not include -1.
        shape = [2, 3, 7, 9]
        eps = 1e-6
        dtype = np.float64

        x = layers.data('x', shape, False, dtype)
        x.persistable = True
        y = layers.abs(x)
        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
        # abs is not differentiable at 0, so a finite-difference gradient
        # taken at a sample that lands (almost) on 0 is unreliable and makes
        # the check flaky. Move such samples away from the kink.
        x_arr[np.abs(x_arr) < 0.005] = 0.02

        gradient_checker.double_grad_check(
            [x], y, x_init=x_arr, place=place, eps=eps)

    def test_grad(self):
        """Run the check on CPU, and on CUDA device 0 when compiled with CUDA."""
        places = [fluid.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(fluid.CUDAPlace(0))
        for p in places:
            self.func(p)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_activation_op.py
浏览文件 @
d0c9f7e8
...
@@ -28,6 +28,7 @@ from paddle.fluid import compiler, Program, program_guard
...
@@ -28,6 +28,7 @@ from paddle.fluid import compiler, Program, program_guard
class
TestSqrtOpError
(
unittest
.
TestCase
):
class
TestSqrtOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
(),
Program
()):
with
program_guard
(
Program
(),
Program
()):
# The input type of sqrt op must be Variable or numpy.ndarray.
# The input type of sqrt op must be Variable or numpy.ndarray.
in1
=
1
in1
=
1
...
@@ -44,6 +45,7 @@ class TestSqrtOpError(unittest.TestCase):
...
@@ -44,6 +45,7 @@ class TestSqrtOpError(unittest.TestCase):
class
TestActivation
(
OpTest
):
class
TestActivation
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"exp"
self
.
op_type
=
"exp"
self
.
init_dtype
()
self
.
init_dtype
()
self
.
init_kernel_type
()
self
.
init_kernel_type
()
...
@@ -71,6 +73,7 @@ class TestActivation(OpTest):
...
@@ -71,6 +73,7 @@ class TestActivation(OpTest):
class
TestParameter
(
object
):
class
TestParameter
(
object
):
def
test_out_name
(
self
):
def
test_out_name
(
self
):
paddle
.
enable_static
()
with
fluid
.
program_guard
(
fluid
.
Program
()):
with
fluid
.
program_guard
(
fluid
.
Program
()):
np_x
=
np
.
array
([
0.1
])
np_x
=
np
.
array
([
0.1
])
data
=
fluid
.
layers
.
data
(
name
=
"X"
,
shape
=
[
1
])
data
=
fluid
.
layers
.
data
(
name
=
"X"
,
shape
=
[
1
])
...
@@ -92,6 +95,7 @@ class TestParameter(object):
...
@@ -92,6 +95,7 @@ class TestParameter(object):
class
TestSigmoid
(
TestActivation
):
class
TestSigmoid
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"sigmoid"
self
.
op_type
=
"sigmoid"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -112,6 +116,7 @@ class TestSigmoid(TestActivation):
...
@@ -112,6 +116,7 @@ class TestSigmoid(TestActivation):
class
TestLogSigmoid
(
TestActivation
):
class
TestLogSigmoid
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"logsigmoid"
self
.
op_type
=
"logsigmoid"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -180,6 +185,7 @@ class TestLogSigmoidAPI(unittest.TestCase):
...
@@ -180,6 +185,7 @@ class TestLogSigmoidAPI(unittest.TestCase):
class
TestTanh
(
TestActivation
,
TestParameter
):
class
TestTanh
(
TestActivation
,
TestParameter
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"tanh"
self
.
op_type
=
"tanh"
self
.
init_dtype
()
self
.
init_dtype
()
x
=
np
.
random
.
uniform
(
0.1
,
1
,
[
11
,
17
]).
astype
(
self
.
dtype
)
x
=
np
.
random
.
uniform
(
0.1
,
1
,
[
11
,
17
]).
astype
(
self
.
dtype
)
...
@@ -255,6 +261,7 @@ class TestTanhAPI(unittest.TestCase):
...
@@ -255,6 +261,7 @@ class TestTanhAPI(unittest.TestCase):
class
TestAtan
(
TestActivation
,
TestParameter
):
class
TestAtan
(
TestActivation
,
TestParameter
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"atan"
self
.
op_type
=
"atan"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -291,6 +298,7 @@ class TestAtan(TestActivation, TestParameter):
...
@@ -291,6 +298,7 @@ class TestAtan(TestActivation, TestParameter):
class
TestSinh
(
TestActivation
):
class
TestSinh
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"sinh"
self
.
op_type
=
"sinh"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -349,6 +357,7 @@ class TestSinh(TestActivation):
...
@@ -349,6 +357,7 @@ class TestSinh(TestActivation):
class
TestSinhOpError
(
unittest
.
TestCase
):
class
TestSinhOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
with
program_guard
(
Program
()):
# The input type must be Variable.
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
sinh
,
1
)
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
sinh
,
1
)
...
@@ -362,6 +371,7 @@ class TestSinhOpError(unittest.TestCase):
...
@@ -362,6 +371,7 @@ class TestSinhOpError(unittest.TestCase):
class
TestCosh
(
TestActivation
):
class
TestCosh
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"cosh"
self
.
op_type
=
"cosh"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -420,6 +430,7 @@ class TestCosh(TestActivation):
...
@@ -420,6 +430,7 @@ class TestCosh(TestActivation):
class
TestCoshOpError
(
unittest
.
TestCase
):
class
TestCoshOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
with
program_guard
(
Program
()):
# The input type must be Variable.
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
cosh
,
1
)
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
cosh
,
1
)
...
@@ -438,6 +449,7 @@ def ref_tanhshrink(x):
...
@@ -438,6 +449,7 @@ def ref_tanhshrink(x):
class
TestTanhshrink
(
TestActivation
):
class
TestTanhshrink
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"tanh_shrink"
self
.
op_type
=
"tanh_shrink"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -512,6 +524,7 @@ def ref_hardshrink(x, threshold):
...
@@ -512,6 +524,7 @@ def ref_hardshrink(x, threshold):
class
TestHardShrink
(
TestActivation
):
class
TestHardShrink
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"hard_shrink"
self
.
op_type
=
"hard_shrink"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -541,6 +554,7 @@ class TestHardShrink_threshold_negative(TestHardShrink):
...
@@ -541,6 +554,7 @@ class TestHardShrink_threshold_negative(TestHardShrink):
class
TestHardShrinkAPI
(
unittest
.
TestCase
):
class
TestHardShrinkAPI
(
unittest
.
TestCase
):
# test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink
# test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
x_np
=
np
.
random
.
uniform
(
-
1
,
1
,
[
10
,
12
]).
astype
(
'float32'
)
self
.
x_np
=
np
.
random
.
uniform
(
-
1
,
1
,
[
10
,
12
]).
astype
(
'float32'
)
self
.
place
=
paddle
.
CUDAPlace
(
0
)
if
core
.
is_compiled_with_cuda
()
\
self
.
place
=
paddle
.
CUDAPlace
(
0
)
if
core
.
is_compiled_with_cuda
()
\
else
paddle
.
CPUPlace
()
else
paddle
.
CPUPlace
()
...
@@ -662,6 +676,7 @@ def ref_softshrink(x, threshold=0.5):
...
@@ -662,6 +676,7 @@ def ref_softshrink(x, threshold=0.5):
class
TestSoftshrink
(
TestActivation
):
class
TestSoftshrink
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"softshrink"
self
.
op_type
=
"softshrink"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -736,6 +751,7 @@ class TestSoftshrinkAPI(unittest.TestCase):
...
@@ -736,6 +751,7 @@ class TestSoftshrinkAPI(unittest.TestCase):
class
TestSqrt
(
TestActivation
,
TestParameter
):
class
TestSqrt
(
TestActivation
,
TestParameter
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"sqrt"
self
.
op_type
=
"sqrt"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -753,6 +769,7 @@ class TestSqrt(TestActivation, TestParameter):
...
@@ -753,6 +769,7 @@ class TestSqrt(TestActivation, TestParameter):
class
TestRsqrt
(
TestActivation
):
class
TestRsqrt
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"rsqrt"
self
.
op_type
=
"rsqrt"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -770,6 +787,7 @@ class TestRsqrt(TestActivation):
...
@@ -770,6 +787,7 @@ class TestRsqrt(TestActivation):
class
TestAbs
(
TestActivation
):
class
TestAbs
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"abs"
self
.
op_type
=
"abs"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -792,6 +810,7 @@ class TestAbs(TestActivation):
...
@@ -792,6 +810,7 @@ class TestAbs(TestActivation):
class
TestCeil
(
TestActivation
):
class
TestCeil
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"ceil"
self
.
op_type
=
"ceil"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -808,6 +827,7 @@ class TestCeil(TestActivation):
...
@@ -808,6 +827,7 @@ class TestCeil(TestActivation):
class
TestFloor
(
TestActivation
):
class
TestFloor
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"floor"
self
.
op_type
=
"floor"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -826,6 +846,7 @@ class TestFloor(TestActivation):
...
@@ -826,6 +846,7 @@ class TestFloor(TestActivation):
class
TestCos
(
TestActivation
):
class
TestCos
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"cos"
self
.
op_type
=
"cos"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -843,6 +864,7 @@ class TestCos(TestActivation):
...
@@ -843,6 +864,7 @@ class TestCos(TestActivation):
class
TestAcos
(
TestActivation
):
class
TestAcos
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"acos"
self
.
op_type
=
"acos"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -860,6 +882,7 @@ class TestAcos(TestActivation):
...
@@ -860,6 +882,7 @@ class TestAcos(TestActivation):
class
TestSin
(
TestActivation
,
TestParameter
):
class
TestSin
(
TestActivation
,
TestParameter
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"sin"
self
.
op_type
=
"sin"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -877,6 +900,7 @@ class TestSin(TestActivation, TestParameter):
...
@@ -877,6 +900,7 @@ class TestSin(TestActivation, TestParameter):
class
TestAsin
(
TestActivation
):
class
TestAsin
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"asin"
self
.
op_type
=
"asin"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -894,6 +918,7 @@ class TestAsin(TestActivation):
...
@@ -894,6 +918,7 @@ class TestAsin(TestActivation):
class
TestRound
(
TestActivation
):
class
TestRound
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"round"
self
.
op_type
=
"round"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -909,6 +934,7 @@ class TestRound(TestActivation):
...
@@ -909,6 +934,7 @@ class TestRound(TestActivation):
class
TestRelu
(
TestActivation
):
class
TestRelu
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"relu"
self
.
op_type
=
"relu"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -979,6 +1005,7 @@ class TestLeakyRelu(TestActivation):
...
@@ -979,6 +1005,7 @@ class TestLeakyRelu(TestActivation):
return
0.02
return
0.02
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"leaky_relu"
self
.
op_type
=
"leaky_relu"
self
.
init_dtype
()
self
.
init_dtype
()
alpha
=
self
.
get_alpha
()
alpha
=
self
.
get_alpha
()
...
@@ -1084,6 +1111,7 @@ def gelu(x, approximate):
...
@@ -1084,6 +1111,7 @@ def gelu(x, approximate):
class
TestGeluApproximate
(
TestActivation
):
class
TestGeluApproximate
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"gelu"
self
.
op_type
=
"gelu"
self
.
init_dtype
()
self
.
init_dtype
()
approximate
=
True
approximate
=
True
...
@@ -1102,6 +1130,7 @@ class TestGeluApproximate(TestActivation):
...
@@ -1102,6 +1130,7 @@ class TestGeluApproximate(TestActivation):
class
TestGelu
(
TestActivation
):
class
TestGelu
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"gelu"
self
.
op_type
=
"gelu"
self
.
init_dtype
()
self
.
init_dtype
()
approximate
=
False
approximate
=
False
...
@@ -1169,6 +1198,7 @@ class TestGELUAPI(unittest.TestCase):
...
@@ -1169,6 +1198,7 @@ class TestGELUAPI(unittest.TestCase):
class
TestBRelu
(
TestActivation
):
class
TestBRelu
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"brelu"
self
.
op_type
=
"brelu"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1194,6 +1224,7 @@ class TestBRelu(TestActivation):
...
@@ -1194,6 +1224,7 @@ class TestBRelu(TestActivation):
class
TestBReluOpError
(
unittest
.
TestCase
):
class
TestBReluOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
with
program_guard
(
Program
()):
# The input type must be Variable.
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
brelu
,
1
)
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
brelu
,
1
)
...
@@ -1215,6 +1246,7 @@ def ref_relu6(x, threshold=6.0):
...
@@ -1215,6 +1246,7 @@ def ref_relu6(x, threshold=6.0):
class
TestRelu6
(
TestActivation
):
class
TestRelu6
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"relu6"
self
.
op_type
=
"relu6"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1286,6 +1318,7 @@ class TestRelu6API(unittest.TestCase):
...
@@ -1286,6 +1318,7 @@ class TestRelu6API(unittest.TestCase):
class
TestHardSwish
(
TestActivation
):
class
TestHardSwish
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
'hard_swish'
self
.
op_type
=
'hard_swish'
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1310,6 +1343,7 @@ class TestHardSwish(TestActivation):
...
@@ -1310,6 +1343,7 @@ class TestHardSwish(TestActivation):
class
TestHardSwishOpError
(
unittest
.
TestCase
):
class
TestHardSwishOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
with
program_guard
(
Program
()):
# The input type must be Variable.
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
hard_swish
,
1
)
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
hard_swish
,
1
)
...
@@ -1323,6 +1357,7 @@ class TestHardSwishOpError(unittest.TestCase):
...
@@ -1323,6 +1357,7 @@ class TestHardSwishOpError(unittest.TestCase):
class
TestSoftRelu
(
TestActivation
):
class
TestSoftRelu
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"soft_relu"
self
.
op_type
=
"soft_relu"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1348,6 +1383,7 @@ class TestSoftRelu(TestActivation):
...
@@ -1348,6 +1383,7 @@ class TestSoftRelu(TestActivation):
class
TestSoftReluOpError
(
unittest
.
TestCase
):
class
TestSoftReluOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
with
program_guard
(
Program
()):
# The input type must be Variable.
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
soft_relu
,
1
)
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
soft_relu
,
1
)
...
@@ -1366,6 +1402,7 @@ def elu(x, alpha):
...
@@ -1366,6 +1402,7 @@ def elu(x, alpha):
class
TestELU
(
TestActivation
):
class
TestELU
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"elu"
self
.
op_type
=
"elu"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1435,6 +1472,7 @@ class TestELUAPI(unittest.TestCase):
...
@@ -1435,6 +1472,7 @@ class TestELUAPI(unittest.TestCase):
class
TestReciprocal
(
TestActivation
):
class
TestReciprocal
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"reciprocal"
self
.
op_type
=
"reciprocal"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1452,6 +1490,7 @@ class TestReciprocal(TestActivation):
...
@@ -1452,6 +1490,7 @@ class TestReciprocal(TestActivation):
class
TestLog
(
TestActivation
):
class
TestLog
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"log"
self
.
op_type
=
"log"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1478,6 +1517,7 @@ class TestLog(TestActivation):
...
@@ -1478,6 +1517,7 @@ class TestLog(TestActivation):
class
TestLog1p
(
TestActivation
):
class
TestLog1p
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"log1p"
self
.
op_type
=
"log1p"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1522,6 +1562,7 @@ class TestLog1p(TestActivation):
...
@@ -1522,6 +1562,7 @@ class TestLog1p(TestActivation):
class
TestSquare
(
TestActivation
):
class
TestSquare
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"square"
self
.
op_type
=
"square"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1539,6 +1580,7 @@ class TestSquare(TestActivation):
...
@@ -1539,6 +1580,7 @@ class TestSquare(TestActivation):
class
TestPow
(
TestActivation
):
class
TestPow
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"pow"
self
.
op_type
=
"pow"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1557,6 +1599,7 @@ class TestPow(TestActivation):
...
@@ -1557,6 +1599,7 @@ class TestPow(TestActivation):
class
TestPow_factor_tensor
(
TestActivation
):
class
TestPow_factor_tensor
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"pow"
self
.
op_type
=
"pow"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1633,6 +1676,7 @@ class TestPow_factor_tensor(TestActivation):
...
@@ -1633,6 +1676,7 @@ class TestPow_factor_tensor(TestActivation):
class
TestSTanh
(
TestActivation
):
class
TestSTanh
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"stanh"
self
.
op_type
=
"stanh"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1653,6 +1697,7 @@ class TestSTanh(TestActivation):
...
@@ -1653,6 +1697,7 @@ class TestSTanh(TestActivation):
class
TestSTanhOpError
(
unittest
.
TestCase
):
class
TestSTanhOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
with
program_guard
(
Program
()):
# The input type must be Variable.
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
stanh
,
1
)
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
stanh
,
1
)
...
@@ -1673,6 +1718,7 @@ def ref_softplus(x, beta=1, threshold=20):
...
@@ -1673,6 +1718,7 @@ def ref_softplus(x, beta=1, threshold=20):
class
TestSoftplus
(
TestActivation
):
class
TestSoftplus
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"softplus"
self
.
op_type
=
"softplus"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1751,6 +1797,7 @@ def ref_softsign(x):
...
@@ -1751,6 +1797,7 @@ def ref_softsign(x):
class
TestSoftsign
(
TestActivation
):
class
TestSoftsign
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"softsign"
self
.
op_type
=
"softsign"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1818,6 +1865,7 @@ class TestSoftsignAPI(unittest.TestCase):
...
@@ -1818,6 +1865,7 @@ class TestSoftsignAPI(unittest.TestCase):
class
TestThresholdedRelu
(
TestActivation
):
class
TestThresholdedRelu
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"thresholded_relu"
self
.
op_type
=
"thresholded_relu"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1841,6 +1889,7 @@ class TestThresholdedRelu(TestActivation):
...
@@ -1841,6 +1889,7 @@ class TestThresholdedRelu(TestActivation):
class
TestThresholdedReluOpError
(
unittest
.
TestCase
):
class
TestThresholdedReluOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
with
program_guard
(
Program
()):
# The input type must be Variable.
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
thresholded_relu
,
1
)
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
thresholded_relu
,
1
)
...
@@ -1854,6 +1903,7 @@ class TestThresholdedReluOpError(unittest.TestCase):
...
@@ -1854,6 +1903,7 @@ class TestThresholdedReluOpError(unittest.TestCase):
class
TestHardSigmoid
(
TestActivation
):
class
TestHardSigmoid
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"hard_sigmoid"
self
.
op_type
=
"hard_sigmoid"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1883,6 +1933,7 @@ class TestHardSigmoid(TestActivation):
...
@@ -1883,6 +1933,7 @@ class TestHardSigmoid(TestActivation):
class
TestHardSigmoidOpError
(
unittest
.
TestCase
):
class
TestHardSigmoidOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
with
program_guard
(
Program
()):
# The input type must be Variable.
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
hard_sigmoid
,
1
)
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
hard_sigmoid
,
1
)
...
@@ -1896,6 +1947,7 @@ class TestHardSigmoidOpError(unittest.TestCase):
...
@@ -1896,6 +1947,7 @@ class TestHardSigmoidOpError(unittest.TestCase):
class
TestSwish
(
TestActivation
):
class
TestSwish
(
TestActivation
):
def
setUp
(
self
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"swish"
self
.
op_type
=
"swish"
self
.
init_dtype
()
self
.
init_dtype
()
...
@@ -1915,6 +1967,7 @@ class TestSwish(TestActivation):
...
@@ -1915,6 +1967,7 @@ class TestSwish(TestActivation):
class
TestSwishOpError
(
unittest
.
TestCase
):
class
TestSwishOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
with
program_guard
(
Program
()):
# The input type must be Variable.
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
swish
,
1
)
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
swish
,
1
)
...
...
python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
浏览文件 @
d0c9f7e8
...
@@ -115,8 +115,15 @@ class InplaceTestBase(unittest.TestCase):
...
@@ -115,8 +115,15 @@ class InplaceTestBase(unittest.TestCase):
fetch_val2
,
=
exe
.
run
(
compiled_prog
,
fetch_val2
,
=
exe
.
run
(
compiled_prog
,
feed
=
feed_dict
,
feed
=
feed_dict
,
fetch_list
=
[
fetch_var
])
fetch_list
=
[
fetch_var
])
#NOTE(zhiqiu): Temporarily changed from array_equal to allclose.
self
.
assertTrue
(
np
.
array_equal
(
fetch_val1
,
fetch_val2
))
# The root cause is that fuse_all_reduce and fuse_all_optimizer_ops may
# result in diff because of the instruction set on the virtual machine.
# And the related unit tests: test_fuse_all_reduce_pass and test_fuse_optimizer_pass use "almostEqual" in their checks.
# There are also some related issues:
# https://github.com/PaddlePaddle/Paddle/issues/21270
# https://github.com/PaddlePaddle/Paddle/issues/21046
# https://github.com/PaddlePaddle/Paddle/issues/21045
self
.
assertTrue
(
np
.
allclose
(
fetch_val1
,
fetch_val2
))
def
check_multi_card_fetch_var
(
self
):
def
check_multi_card_fetch_var
(
self
):
if
self
.
is_invalid_test
():
if
self
.
is_invalid_test
():
...
@@ -160,7 +167,8 @@ class InplaceTestBase(unittest.TestCase):
...
@@ -160,7 +167,8 @@ class InplaceTestBase(unittest.TestCase):
fetch_vals
.
append
(
fetch_val
)
fetch_vals
.
append
(
fetch_val
)
for
item
in
fetch_vals
:
for
item
in
fetch_vals
:
self
.
assertTrue
(
np
.
array_equal
(
fetch_vals
[
0
],
item
))
# same as above
self
.
assertTrue
(
np
.
allclose
(
fetch_vals
[
0
],
item
))
class
CUDAInplaceTest
(
InplaceTestBase
):
class
CUDAInplaceTest
(
InplaceTestBase
):
...
...
tools/is_ut_disabled
.py
→
python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce
.py
浏览文件 @
d0c9f7e8
...
@@ -11,30 +11,23 @@
...
@@ -11,30 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
""" Check whether ut is disabled. """
import
os
from
__future__
import
print_function
import
sys
import
unittest
from
test_dist_base
import
TestDistBase
def
check_ut
():
class
TestDistMnist2x2FP16AllReduce
(
TestDistBase
):
""" Get disabled unit tests. """
def
_setup_config
(
self
):
disable_ut_file
=
'disable_ut'
self
.
_sync_mode
=
True
cmd
=
'wget -q --no-check-certificate https://sys-p0.bj.bcebos.com/prec/{}'
.
format
(
self
.
_use_reduce
=
False
disable_ut_file
)
self
.
_nccl2_mode
=
True
os
.
system
(
cmd
)
with
open
(
disable_ut_file
)
as
utfile
:
for
u
in
utfile
:
if
u
.
rstrip
(
'
\r\n
'
)
==
sys
.
argv
[
1
]:
exit
(
0
)
exit
(
1
)
def
test_dist_train
(
self
):
import
paddle.fluid
as
fluid
if
fluid
.
core
.
is_compiled_with_cuda
():
self
.
check_with_place
(
"dist_mnist_fp16_allreduce.py"
,
delta
=
1e-5
)
if
__name__
==
'__main__'
:
if
len
(
sys
.
argv
)
!=
2
:
if
__name__
==
"__main__"
:
exit
(
1
)
unittest
.
main
()
try
:
check_ut
()
except
Exception
as
e
:
print
(
e
)
exit
(
1
)
python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
浏览文件 @
d0c9f7e8
...
@@ -102,6 +102,16 @@ class TestStrategyConfig(unittest.TestCase):
...
@@ -102,6 +102,16 @@ class TestStrategyConfig(unittest.TestCase):
strategy
.
dgc
=
"True"
strategy
.
dgc
=
"True"
self
.
assertEqual
(
strategy
.
dgc
,
False
)
self
.
assertEqual
(
strategy
.
dgc
,
False
)
def
test_fp16_allreduce
(
self
):
strategy
=
paddle
.
distributed
.
fleet
.
DistributedStrategy
()
strategy
.
fp16_allreduce
=
True
self
.
assertEqual
(
strategy
.
fp16_allreduce
,
True
)
strategy
.
fp16_allreduce
=
False
self
.
assertEqual
(
strategy
.
fp16_allreduce
,
False
)
with
self
.
assertRaises
(
TypeError
):
strategy
.
fp16_allreduce
=
"True"
self
.
assertEqual
(
strategy
.
fp16_allreduce
,
False
)
def
test_sync_nccl_allreduce
(
self
):
def
test_sync_nccl_allreduce
(
self
):
strategy
=
paddle
.
distributed
.
fleet
.
DistributedStrategy
()
strategy
=
paddle
.
distributed
.
fleet
.
DistributedStrategy
()
strategy
.
sync_nccl_allreduce
=
True
strategy
.
sync_nccl_allreduce
=
True
...
...
python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py
0 → 100644
浏览文件 @
d0c9f7e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle.distributed.fleet
as
fleet
import
paddle.distributed.fleet.base.role_maker
as
role_maker
import
unittest
import
paddle
import
paddle.fluid
as
fluid
import
os
paddle
.
enable_static
()
class
TestFleetFP16CompressOptimizer
(
unittest
.
TestCase
):
def
setUp
(
self
):
os
.
environ
[
"PADDLE_TRAINER_ID"
]
=
"0"
os
.
environ
[
"PADDLE_TRAINER_ENDPOINTS"
]
=
"127.0.0.1:36001"
def
net
(
self
,
main_prog
,
startup_prog
,
dtype
=
'float32'
):
with
fluid
.
program_guard
(
main_prog
,
startup_prog
):
input_x
=
paddle
.
fluid
.
layers
.
data
(
name
=
"x"
,
shape
=
[
32
],
dtype
=
dtype
)
input_y
=
paddle
.
fluid
.
layers
.
data
(
name
=
"y"
,
shape
=
[
1
],
dtype
=
'int64'
)
fc_1
=
paddle
.
fluid
.
layers
.
fc
(
input
=
input_x
,
size
=
64
,
act
=
'tanh'
)
fc_2
=
paddle
.
fluid
.
layers
.
fc
(
input
=
fc_1
,
size
=
64
,
act
=
'tanh'
)
prediction
=
paddle
.
fluid
.
layers
.
fc
(
input
=
[
fc_2
],
size
=
2
,
act
=
'softmax'
)
cost
=
paddle
.
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
input_y
)
avg_cost
=
paddle
.
fluid
.
layers
.
mean
(
x
=
cost
)
strategy
=
paddle
.
distributed
.
fleet
.
DistributedStrategy
()
strategy
.
fp16_allreduce
=
True
return
avg_cost
,
strategy
def
test_fp16_allreduce_optimizer
(
self
):
role
=
role_maker
.
PaddleCloudRoleMaker
(
is_collective
=
True
)
fleet
.
init
(
role
)
train_prog
,
startup_prog
=
fluid
.
Program
(),
fluid
.
Program
()
avg_cost
,
strategy
=
self
.
net
(
train_prog
,
startup_prog
)
optimizer
=
paddle
.
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.01
)
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
strategy
)
optimizer
.
minimize
(
avg_cost
)
ops
=
[
op
.
type
for
op
in
avg_cost
.
block
.
ops
]
cast_out
=
[
op
.
output
(
'Out'
)[
0
]
for
op
in
avg_cost
.
block
.
ops
if
op
.
type
==
'cast'
]
cast_op_count
=
0
for
name
in
ops
:
if
name
==
'cast'
:
cast_op_count
+=
1
self
.
assertIn
(
'cast'
,
ops
)
self
.
assertEqual
(
cast_op_count
,
12
)
# 6 + 6, cast_fp16 + cast_fp32
for
name
in
cast_out
:
self
.
assertIn
(
'cast_fp16'
,
name
)
def
test_fp16_allreduce_not_apply_fp16_net
(
self
):
role
=
role_maker
.
PaddleCloudRoleMaker
(
is_collective
=
True
)
fleet
.
init
(
role
)
train_prog
,
startup_prog
=
fluid
.
Program
(),
fluid
.
Program
()
avg_cost
,
strategy
=
self
.
net
(
train_prog
,
startup_prog
,
dtype
=
'float16'
)
optimizer
=
paddle
.
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.01
)
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
strategy
)
optimizer
.
minimize
(
avg_cost
)
ops
=
[
op
.
type
for
op
in
avg_cost
.
block
.
ops
]
self
.
assertNotIn
(
'cast'
,
ops
)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
浏览文件 @
d0c9f7e8
...
@@ -56,7 +56,10 @@ class TestDygraphGroupNormv2(unittest.TestCase):
...
@@ -56,7 +56,10 @@ class TestDygraphGroupNormv2(unittest.TestCase):
x
=
np
.
random
.
randn
(
*
shape
).
astype
(
"float32"
)
x
=
np
.
random
.
randn
(
*
shape
).
astype
(
"float32"
)
y1
=
compute_v1
(
x
)
y1
=
compute_v1
(
x
)
y2
=
compute_v2
(
x
)
y2
=
compute_v2
(
x
)
self
.
assertTrue
(
np
.
allclose
(
y1
,
y2
))
result
=
np
.
allclose
(
y1
,
y2
)
if
not
result
:
print
(
"y1:"
,
y1
,
"
\t
y2:"
,
y2
)
self
.
assertTrue
(
result
)
test_weight_bias_false
()
test_weight_bias_false
()
def
test_static
(
self
):
def
test_static
(
self
):
...
...
python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
浏览文件 @
d0c9f7e8
...
@@ -25,6 +25,9 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
...
@@ -25,6 +25,9 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from
paddle.fluid.dygraph.base
import
to_variable
from
paddle.fluid.dygraph.base
import
to_variable
from
test_imperative_base
import
new_program_scope
from
test_imperative_base
import
new_program_scope
if
fluid
.
is_compiled_with_cuda
():
fluid
.
set_flags
({
'FLAGS_cudnn_deterministic'
:
True
})
batch_size
=
8
batch_size
=
8
train_parameters
=
{
train_parameters
=
{
"input_size"
:
[
3
,
224
,
224
],
"input_size"
:
[
3
,
224
,
224
],
...
@@ -340,7 +343,9 @@ class TestImperativeResneXt(unittest.TestCase):
...
@@ -340,7 +343,9 @@ class TestImperativeResneXt(unittest.TestCase):
label
.
stop_gradient
=
True
label
.
stop_gradient
=
True
out
=
se_resnext
(
img
)
out
=
se_resnext
(
img
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
out
,
label
=
label
)
softmax_out
=
fluid
.
layers
.
softmax
(
out
,
use_cudnn
=
False
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
softmax_out
,
label
=
label
)
avg_loss
=
fluid
.
layers
.
mean
(
x
=
loss
)
avg_loss
=
fluid
.
layers
.
mean
(
x
=
loss
)
dy_out
=
avg_loss
.
numpy
()
dy_out
=
avg_loss
.
numpy
()
...
@@ -386,7 +391,8 @@ class TestImperativeResneXt(unittest.TestCase):
...
@@ -386,7 +391,8 @@ class TestImperativeResneXt(unittest.TestCase):
name
=
'pixel'
,
shape
=
[
3
,
224
,
224
],
dtype
=
'float32'
)
name
=
'pixel'
,
shape
=
[
3
,
224
,
224
],
dtype
=
'float32'
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
out
=
se_resnext
(
img
)
out
=
se_resnext
(
img
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
out
,
label
=
label
)
softmax_out
=
fluid
.
layers
.
softmax
(
out
,
use_cudnn
=
False
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
softmax_out
,
label
=
label
)
avg_loss
=
fluid
.
layers
.
mean
(
x
=
loss
)
avg_loss
=
fluid
.
layers
.
mean
(
x
=
loss
)
optimizer
.
minimize
(
avg_loss
)
optimizer
.
minimize
(
avg_loss
)
...
@@ -443,7 +449,9 @@ class TestImperativeResneXt(unittest.TestCase):
...
@@ -443,7 +449,9 @@ class TestImperativeResneXt(unittest.TestCase):
static_grad_value
[
static_grad_name_list
[
static_grad_value
[
static_grad_name_list
[
i
-
grad_start_pos
]]
=
out
[
i
]
i
-
grad_start_pos
]]
=
out
[
i
]
self
.
assertTrue
(
np
.
allclose
(
static_out
,
dy_out
))
self
.
assertTrue
(
np
.
allclose
(
static_out
,
dy_out
),
"
\n
static_out: {}
\n
dy_out: {}"
.
format
(
static_out
,
dy_out
))
self
.
assertEqual
(
len
(
dy_param_init_value
),
len
(
static_param_init_value
))
self
.
assertEqual
(
len
(
dy_param_init_value
),
len
(
static_param_init_value
))
...
@@ -455,16 +463,23 @@ class TestImperativeResneXt(unittest.TestCase):
...
@@ -455,16 +463,23 @@ class TestImperativeResneXt(unittest.TestCase):
self
.
assertEqual
(
len
(
dy_grad_value
),
len
(
static_grad_value
))
self
.
assertEqual
(
len
(
dy_grad_value
),
len
(
static_grad_value
))
for
key
,
value
in
six
.
iteritems
(
static_grad_value
):
for
key
,
value
in
six
.
iteritems
(
static_grad_value
):
self
.
assertTrue
(
np
.
allclose
(
value
,
dy_grad_value
[
key
]))
self
.
assertTrue
(
np
.
allclose
(
value
,
dy_grad_value
[
key
]),
"
\n
static_grad_value: {}
\n
dy_grad_value: {}"
.
format
(
value
,
dy_grad_value
[
key
]))
self
.
assertTrue
(
np
.
isfinite
(
value
.
all
()))
self
.
assertTrue
(
np
.
isfinite
(
value
.
all
()))
self
.
assertFalse
(
np
.
isnan
(
value
.
any
()))
self
.
assertFalse
(
np
.
isnan
(
value
.
any
()))
self
.
assertEqual
(
len
(
dy_param_value
),
len
(
static_param_value
))
self
.
assertEqual
(
len
(
dy_param_value
),
len
(
static_param_value
))
for
key
,
value
in
six
.
iteritems
(
static_param_value
):
for
key
,
value
in
six
.
iteritems
(
static_param_value
):
self
.
assertTrue
(
np
.
allclose
(
value
,
dy_param_value
[
key
]))
self
.
assertTrue
(
np
.
allclose
(
value
,
dy_param_value
[
key
]),
"
\n
static_param_value: {}
\n
dy_param_value: {}"
.
format
(
value
,
dy_param_value
[
key
]))
self
.
assertTrue
(
np
.
isfinite
(
value
.
all
()))
self
.
assertTrue
(
np
.
isfinite
(
value
.
all
()))
self
.
assertFalse
(
np
.
isnan
(
value
.
any
()))
self
.
assertFalse
(
np
.
isnan
(
value
.
any
()))
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
paddle
.
enable_static
()
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
浏览文件 @
d0c9f7e8
...
@@ -65,15 +65,21 @@ class TestMatMulV2Op(OpTest):
...
@@ -65,15 +65,21 @@ class TestMatMulV2Op(OpTest):
self
.
y_shape
=
(
100
,
)
self
.
y_shape
=
(
100
,
)
self
.
trans_x
=
False
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
trans_y
=
False
def
init_kernel_type
(
self
):
self
.
dtype
=
"float64"
self
.
dtype
=
"float64"
def
setUp
(
self
):
def
setUp
(
self
):
self
.
init_kernel_type
()
self
.
config
()
self
.
config
()
self
.
op_type
=
"matmul_v2"
self
.
op_type
=
"matmul_v2"
x
=
np
.
random
.
random
(
self
.
x_shape
).
astype
(
self
.
dtype
)
x
=
np
.
random
.
random
(
self
.
x_shape
).
astype
(
self
.
dtype
)
y
=
np
.
random
.
random
(
self
.
y_shape
).
astype
(
self
.
dtype
)
y
=
np
.
random
.
random
(
self
.
y_shape
).
astype
(
self
.
dtype
)
# -0.1 ~ 0.1
x
=
-
0.1
+
0.2
*
x
y
=
-
0.1
+
0.2
*
y
result
=
reference_matmul
(
x
,
y
,
self
.
trans_x
,
self
.
trans_y
)
result
=
reference_matmul
(
x
,
y
,
self
.
trans_x
,
self
.
trans_y
)
result
=
result
.
astype
(
self
.
dtype
)
self
.
inputs
=
{
self
.
inputs
=
{
'X'
:
x
,
'X'
:
x
,
'Y'
:
y
,
'Y'
:
y
,
...
@@ -98,7 +104,6 @@ class TestMatMuklOp2(TestMatMulV2Op):
...
@@ -98,7 +104,6 @@ class TestMatMuklOp2(TestMatMulV2Op):
self
.
y_shape
=
(
1
,
3
,
2
,
100
)
self
.
y_shape
=
(
1
,
3
,
2
,
100
)
self
.
trans_x
=
False
self
.
trans_x
=
False
self
.
trans_y
=
True
self
.
trans_y
=
True
self
.
dtype
=
"float64"
class
TestMatMuklOp3
(
TestMatMulV2Op
):
class
TestMatMuklOp3
(
TestMatMulV2Op
):
...
@@ -111,7 +116,6 @@ class TestMatMuklOp3(TestMatMulV2Op):
...
@@ -111,7 +116,6 @@ class TestMatMuklOp3(TestMatMulV2Op):
self
.
y_shape
=
(
1
,
1
,
100
,
2
)
self
.
y_shape
=
(
1
,
1
,
100
,
2
)
self
.
trans_x
=
False
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp4
(
TestMatMulV2Op
):
class
TestMatMuklOp4
(
TestMatMulV2Op
):
...
@@ -124,7 +128,6 @@ class TestMatMuklOp4(TestMatMulV2Op):
...
@@ -124,7 +128,6 @@ class TestMatMuklOp4(TestMatMulV2Op):
self
.
y_shape
=
(
1
,
2
,
100
,
2
)
self
.
y_shape
=
(
1
,
2
,
100
,
2
)
self
.
trans_x
=
False
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp5
(
TestMatMulV2Op
):
class
TestMatMuklOp5
(
TestMatMulV2Op
):
...
@@ -133,11 +136,10 @@ class TestMatMuklOp5(TestMatMulV2Op):
...
@@ -133,11 +136,10 @@ class TestMatMuklOp5(TestMatMulV2Op):
"""
"""
def
config
(
self
):
def
config
(
self
):
self
.
x_shape
=
(
1
,
1
,
100
,
2
)
self
.
x_shape
=
(
1
,
1
,
100
,
1
)
self
.
y_shape
=
(
100
,
)
self
.
y_shape
=
(
100
,
)
self
.
trans_x
=
True
self
.
trans_x
=
True
self
.
trans_y
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp6
(
TestMatMulV2Op
):
class
TestMatMuklOp6
(
TestMatMulV2Op
):
...
@@ -150,7 +152,6 @@ class TestMatMuklOp6(TestMatMulV2Op):
...
@@ -150,7 +152,6 @@ class TestMatMuklOp6(TestMatMulV2Op):
self
.
y_shape
=
(
100
,
)
self
.
y_shape
=
(
100
,
)
self
.
trans_x
=
True
self
.
trans_x
=
True
self
.
trans_y
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp7
(
TestMatMulV2Op
):
class
TestMatMuklOp7
(
TestMatMulV2Op
):
...
@@ -163,7 +164,6 @@ class TestMatMuklOp7(TestMatMulV2Op):
...
@@ -163,7 +164,6 @@ class TestMatMuklOp7(TestMatMulV2Op):
self
.
y_shape
=
(
100
,
)
self
.
y_shape
=
(
100
,
)
self
.
trans_x
=
False
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp8
(
TestMatMulV2Op
):
class
TestMatMuklOp8
(
TestMatMulV2Op
):
...
@@ -176,7 +176,6 @@ class TestMatMuklOp8(TestMatMulV2Op):
...
@@ -176,7 +176,6 @@ class TestMatMuklOp8(TestMatMulV2Op):
self
.
y_shape
=
(
1
,
1
,
100
,
2
)
self
.
y_shape
=
(
1
,
1
,
100
,
2
)
self
.
trans_x
=
False
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp9
(
TestMatMulV2Op
):
class
TestMatMuklOp9
(
TestMatMulV2Op
):
...
@@ -189,7 +188,6 @@ class TestMatMuklOp9(TestMatMulV2Op):
...
@@ -189,7 +188,6 @@ class TestMatMuklOp9(TestMatMulV2Op):
self
.
y_shape
=
(
2
,
1
,
2
,
100
)
self
.
y_shape
=
(
2
,
1
,
2
,
100
)
self
.
trans_x
=
False
self
.
trans_x
=
False
self
.
trans_y
=
True
self
.
trans_y
=
True
self
.
dtype
=
"float64"
class
TestMatMuklOp10
(
TestMatMulV2Op
):
class
TestMatMuklOp10
(
TestMatMulV2Op
):
...
@@ -198,11 +196,10 @@ class TestMatMuklOp10(TestMatMulV2Op):
...
@@ -198,11 +196,10 @@ class TestMatMuklOp10(TestMatMulV2Op):
"""
"""
def
config
(
self
):
def
config
(
self
):
self
.
x_shape
=
(
1
,
1
,
2
,
100
)
self
.
x_shape
=
(
1
,
1
,
2
5
,
4
)
self
.
y_shape
=
(
1
,
2
,
100
,
2
)
self
.
y_shape
=
(
1
,
2
,
4
,
25
)
self
.
trans_x
=
False
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp11
(
TestMatMulV2Op
):
class
TestMatMuklOp11
(
TestMatMulV2Op
):
...
@@ -215,7 +212,6 @@ class TestMatMuklOp11(TestMatMulV2Op):
...
@@ -215,7 +212,6 @@ class TestMatMuklOp11(TestMatMulV2Op):
self
.
y_shape
=
(
1
,
1
,
100
,
2
)
self
.
y_shape
=
(
1
,
1
,
100
,
2
)
self
.
trans_x
=
False
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp12
(
TestMatMulV2Op
):
class
TestMatMuklOp12
(
TestMatMulV2Op
):
...
@@ -224,11 +220,10 @@ class TestMatMuklOp12(TestMatMulV2Op):
...
@@ -224,11 +220,10 @@ class TestMatMuklOp12(TestMatMulV2Op):
"""
"""
def
config
(
self
):
def
config
(
self
):
self
.
x_shape
=
(
2
,
1
,
100
,
2
)
self
.
x_shape
=
(
2
,
1
,
4
,
25
)
self
.
y_shape
=
(
1
,
1
,
100
,
2
)
self
.
y_shape
=
(
1
,
1
,
4
,
25
)
self
.
trans_x
=
True
self
.
trans_x
=
True
self
.
trans_y
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp13
(
TestMatMulV2Op
):
class
TestMatMuklOp13
(
TestMatMulV2Op
):
...
@@ -237,11 +232,10 @@ class TestMatMuklOp13(TestMatMulV2Op):
...
@@ -237,11 +232,10 @@ class TestMatMuklOp13(TestMatMulV2Op):
"""
"""
def
config
(
self
):
def
config
(
self
):
self
.
x_shape
=
(
2
,
2
,
100
,
2
)
self
.
x_shape
=
(
2
,
2
,
2
,
50
)
self
.
y_shape
=
(
2
,
2
,
100
,
2
)
self
.
y_shape
=
(
2
,
2
,
2
,
50
)
self
.
trans_x
=
True
self
.
trans_x
=
True
self
.
trans_y
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp14
(
TestMatMulV2Op
):
class
TestMatMuklOp14
(
TestMatMulV2Op
):
...
@@ -254,7 +248,6 @@ class TestMatMuklOp14(TestMatMulV2Op):
...
@@ -254,7 +248,6 @@ class TestMatMuklOp14(TestMatMulV2Op):
self
.
y_shape
=
(
1
,
2
,
2
,
100
,
2
)
self
.
y_shape
=
(
1
,
2
,
2
,
100
,
2
)
self
.
trans_x
=
True
self
.
trans_x
=
True
self
.
trans_y
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp15
(
TestMatMulV2Op
):
class
TestMatMuklOp15
(
TestMatMulV2Op
):
...
@@ -267,7 +260,6 @@ class TestMatMuklOp15(TestMatMulV2Op):
...
@@ -267,7 +260,6 @@ class TestMatMuklOp15(TestMatMulV2Op):
self
.
y_shape
=
(
1
,
2
,
2
,
100
,
1
)
self
.
y_shape
=
(
1
,
2
,
2
,
100
,
1
)
self
.
trans_x
=
False
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp16
(
TestMatMulV2Op
):
class
TestMatMuklOp16
(
TestMatMulV2Op
):
...
@@ -277,10 +269,9 @@ class TestMatMuklOp16(TestMatMulV2Op):
...
@@ -277,10 +269,9 @@ class TestMatMuklOp16(TestMatMulV2Op):
def
config
(
self
):
def
config
(
self
):
self
.
x_shape
=
(
100
)
self
.
x_shape
=
(
100
)
self
.
y_shape
=
(
1
,
2
,
2
,
100
,
1
)
self
.
y_shape
=
(
1
,
2
,
2
,
100
,
2
)
self
.
trans_x
=
False
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp17
(
TestMatMulV2Op
):
class
TestMatMuklOp17
(
TestMatMulV2Op
):
...
@@ -293,7 +284,54 @@ class TestMatMuklOp17(TestMatMulV2Op):
...
@@ -293,7 +284,54 @@ class TestMatMuklOp17(TestMatMulV2Op):
self
.
y_shape
=
(
100
)
self
.
y_shape
=
(
100
)
self
.
trans_x
=
False
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
#--------------------test matmul fp16--------------------
def
create_test_fp16_class
(
parent
,
atol
=
0.001
,
max_relative_error
=
1.0
):
@
unittest
.
skipIf
(
not
core
.
is_compiled_with_cuda
(),
"core is not compiled with CUDA"
)
class
TestMatMulOpFp16Case
(
parent
):
def
init_kernel_type
(
self
):
self
.
dtype
=
np
.
float16
def
test_check_output
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_output_with_place
(
place
,
atol
=
atol
)
def
test_check_grad
(
self
):
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
self
.
check_grad_with_place
(
place
,
[
'X'
,
'Y'
],
'Out'
,
max_relative_error
=
max_relative_error
)
cls_name
=
"{0}_{1}"
.
format
(
parent
.
__name__
,
"Fp16"
)
TestMatMulOpFp16Case
.
__name__
=
cls_name
globals
()[
cls_name
]
=
TestMatMulOpFp16Case
create_test_fp16_class
(
TestMatMulV2Op
)
create_test_fp16_class
(
TestMatMuklOp2
)
create_test_fp16_class
(
TestMatMuklOp3
)
create_test_fp16_class
(
TestMatMuklOp4
)
create_test_fp16_class
(
TestMatMuklOp5
)
create_test_fp16_class
(
TestMatMuklOp6
)
create_test_fp16_class
(
TestMatMuklOp7
)
create_test_fp16_class
(
TestMatMuklOp8
)
create_test_fp16_class
(
TestMatMuklOp9
)
create_test_fp16_class
(
TestMatMuklOp10
)
create_test_fp16_class
(
TestMatMuklOp11
)
create_test_fp16_class
(
TestMatMuklOp12
)
create_test_fp16_class
(
TestMatMuklOp13
)
create_test_fp16_class
(
TestMatMuklOp14
)
create_test_fp16_class
(
TestMatMuklOp15
)
create_test_fp16_class
(
TestMatMuklOp16
)
create_test_fp16_class
(
TestMatMuklOp17
)
class
TestMatMulV2API
(
unittest
.
TestCase
):
class
TestMatMulV2API
(
unittest
.
TestCase
):
...
@@ -331,6 +369,17 @@ class TestMatMulV2API(unittest.TestCase):
...
@@ -331,6 +369,17 @@ class TestMatMulV2API(unittest.TestCase):
y
=
paddle
.
to_tensor
(
input_y
)
y
=
paddle
.
to_tensor
(
input_y
)
result
=
paddle
.
matmul
(
x
,
y
)
result
=
paddle
.
matmul
(
x
,
y
)
def
test_dygraph_fp16
(
self
):
if
core
.
is_compiled_with_cuda
():
place
=
core
.
CUDAPlace
(
0
)
if
core
.
is_float16_supported
(
place
):
with
fluid
.
dygraph
.
guard
(
place
):
input_x
=
np
.
random
.
random
([
4
,
3
]).
astype
(
"float16"
)
input_y
=
np
.
random
.
random
([
3
,
4
]).
astype
(
"float16"
)
x
=
paddle
.
to_tensor
(
input_x
)
y
=
paddle
.
to_tensor
(
input_y
)
result
=
paddle
.
matmul
(
x
,
y
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
浏览文件 @
d0c9f7e8
...
@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase):
...
@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase):
dataset
=
RandomDataset
(
SAMPLE_NUM
,
CLASS_NUM
)
dataset
=
RandomDataset
(
SAMPLE_NUM
,
CLASS_NUM
)
dataloader
=
DataLoader
(
dataloader
=
DataLoader
(
dataset
,
dataset
,
places
=
places
,
num_workers
=
num_workers
,
num_workers
=
num_workers
,
batch_size
=
BATCH_SIZE
,
batch_size
=
BATCH_SIZE
,
drop_last
=
True
)
drop_last
=
True
)
...
...
python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
浏览文件 @
d0c9f7e8
...
@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase):
...
@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase):
dataset
=
RandomDataset
(
SAMPLE_NUM
,
CLASS_NUM
)
dataset
=
RandomDataset
(
SAMPLE_NUM
,
CLASS_NUM
)
dataloader
=
DataLoader
(
dataloader
=
DataLoader
(
dataset
,
dataset
,
places
=
places
,
num_workers
=
num_workers
,
num_workers
=
num_workers
,
batch_size
=
BATCH_SIZE
,
batch_size
=
BATCH_SIZE
,
drop_last
=
True
)
drop_last
=
True
)
...
...
python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
浏览文件 @
d0c9f7e8
...
@@ -130,5 +130,41 @@ class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck):
...
@@ -130,5 +130,41 @@ class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck):
self
.
shape
=
[
2
,
2
,
3
,
4
,
5
]
self
.
shape
=
[
2
,
2
,
3
,
4
,
5
]
class
TestBatchNormDoubleGradCheckCase5
(
TestBatchNormDoubleGradCheck
):
@
prog_scope
()
def
func
(
self
,
place
):
prog
=
fluid
.
Program
()
with
fluid
.
program_guard
(
prog
):
np
.
random
.
seed
()
dtype
=
"float32"
eps
=
0.005
atol
=
2e-4
chn
=
self
.
shape
[
1
]
if
self
.
data_layout
==
'NCHW'
else
self
.
shape
[
-
1
]
x
=
layers
.
create_parameter
(
dtype
=
dtype
,
shape
=
self
.
shape
,
name
=
'x'
)
z
=
fluid
.
layers
.
batch_norm
(
input
=
x
,
data_layout
=
self
.
data_layout
,
use_global_stats
=
self
.
use_global_stats
)
x_arr
=
np
.
random
.
uniform
(
-
1
,
1
,
self
.
shape
).
astype
(
dtype
)
w
,
b
=
prog
.
global_block
().
all_parameters
()[
1
:
3
]
w_arr
=
np
.
ones
(
chn
).
astype
(
dtype
)
b_arr
=
np
.
zeros
(
chn
).
astype
(
dtype
)
gradient_checker
.
double_grad_check
(
[
x
,
w
,
b
],
z
,
x_init
=
[
x_arr
,
w_arr
,
b_arr
],
atol
=
atol
,
place
=
place
,
eps
=
eps
)
class
TestBatchNormDoubleGradCheckCase6
(
TestBatchNormDoubleGradCheckCase5
):
def
init_test
(
self
):
self
.
data_layout
=
'NCHW'
self
.
use_global_stats
=
True
self
.
shape
=
[
2
,
3
,
4
,
5
]
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_paddle_save_load.py
浏览文件 @
d0c9f7e8
...
@@ -29,19 +29,23 @@ IMAGE_SIZE = 784
...
@@ -29,19 +29,23 @@ IMAGE_SIZE = 784
CLASS_NUM
=
10
CLASS_NUM
=
10
# define a random dataset
def
random_batch_reader
():
class
RandomDataset
(
paddle
.
io
.
Dataset
):
def
_get_random_inputs_and_labels
():
def
__init__
(
self
,
num_samples
):
self
.
num_samples
=
num_samples
def
__getitem__
(
self
,
idx
):
np
.
random
.
seed
(
SEED
)
np
.
random
.
seed
(
SEED
)
image
=
np
.
random
.
random
([
IMAGE_SIZE
]).
astype
(
'float32'
)
image
=
np
.
random
.
random
([
BATCH_SIZE
,
IMAGE_SIZE
]).
astype
(
'float32'
)
label
=
np
.
random
.
randint
(
0
,
CLASS_NUM
-
1
,
(
1
,
)).
astype
(
'int64'
)
label
=
np
.
random
.
randint
(
0
,
CLASS_NUM
-
1
,
(
BATCH_SIZE
,
1
,
)).
astype
(
'int64'
)
return
image
,
label
return
image
,
label
def
__len__
(
self
):
def
__reader__
():
return
self
.
num_samples
for
_
in
range
(
BATCH_NUM
):
batch_image
,
batch_label
=
_get_random_inputs_and_labels
()
batch_image
=
paddle
.
to_tensor
(
batch_image
)
batch_label
=
paddle
.
to_tensor
(
batch_label
)
yield
batch_image
,
batch_label
return
__reader__
class
LinearNet
(
nn
.
Layer
):
class
LinearNet
(
nn
.
Layer
):
...
@@ -66,8 +70,7 @@ def train(layer, loader, loss_fn, opt):
...
@@ -66,8 +70,7 @@ def train(layer, loader, loss_fn, opt):
class
TestSaveLoad
(
unittest
.
TestCase
):
class
TestSaveLoad
(
unittest
.
TestCase
):
def
setUp
(
self
):
def
setUp
(
self
):
# enable dygraph mode
# enable dygraph mode
self
.
place
=
paddle
.
CPUPlace
()
paddle
.
disable_static
()
paddle
.
disable_static
(
self
.
place
)
# config seed
# config seed
paddle
.
manual_seed
(
SEED
)
paddle
.
manual_seed
(
SEED
)
...
@@ -81,14 +84,8 @@ class TestSaveLoad(unittest.TestCase):
...
@@ -81,14 +84,8 @@ class TestSaveLoad(unittest.TestCase):
adam
=
opt
.
Adam
(
learning_rate
=
0.001
,
parameters
=
layer
.
parameters
())
adam
=
opt
.
Adam
(
learning_rate
=
0.001
,
parameters
=
layer
.
parameters
())
# create data loader
# create data loader
dataset
=
RandomDataset
(
BATCH_NUM
*
BATCH_SIZE
)
# TODO: using new DataLoader cause unknown Timeout on windows, replace it
loader
=
paddle
.
io
.
DataLoader
(
loader
=
random_batch_reader
()
dataset
,
places
=
self
.
place
,
batch_size
=
BATCH_SIZE
,
shuffle
=
True
,
drop_last
=
True
,
num_workers
=
2
)
# train
# train
train
(
layer
,
loader
,
loss_fn
,
adam
)
train
(
layer
,
loader
,
loss_fn
,
adam
)
...
@@ -103,8 +100,8 @@ class TestSaveLoad(unittest.TestCase):
...
@@ -103,8 +100,8 @@ class TestSaveLoad(unittest.TestCase):
layer
,
opt
=
self
.
build_and_train_model
()
layer
,
opt
=
self
.
build_and_train_model
()
# save
# save
layer_save_path
=
"linear.pdparams"
layer_save_path
=
"
test_paddle_save_load.
linear.pdparams"
opt_save_path
=
"linear.pdopt"
opt_save_path
=
"
test_paddle_save_load.
linear.pdopt"
layer_state_dict
=
layer
.
state_dict
()
layer_state_dict
=
layer
.
state_dict
()
opt_state_dict
=
opt
.
state_dict
()
opt_state_dict
=
opt
.
state_dict
()
...
@@ -120,7 +117,7 @@ class TestSaveLoad(unittest.TestCase):
...
@@ -120,7 +117,7 @@ class TestSaveLoad(unittest.TestCase):
# test save load in static mode
# test save load in static mode
paddle
.
enable_static
()
paddle
.
enable_static
()
static_save_path
=
"static_mode_test/linear.pdparams"
static_save_path
=
"static_mode_test/
test_paddle_save_load.
linear.pdparams"
paddle
.
save
(
layer_state_dict
,
static_save_path
)
paddle
.
save
(
layer_state_dict
,
static_save_path
)
load_static_state_dict
=
paddle
.
load
(
static_save_path
)
load_static_state_dict
=
paddle
.
load
(
static_save_path
)
self
.
check_load_state_dict
(
layer_state_dict
,
load_static_state_dict
)
self
.
check_load_state_dict
(
layer_state_dict
,
load_static_state_dict
)
...
@@ -133,15 +130,15 @@ class TestSaveLoad(unittest.TestCase):
...
@@ -133,15 +130,15 @@ class TestSaveLoad(unittest.TestCase):
# 2. test save path format error
# 2. test save path format error
with
self
.
assertRaises
(
ValueError
):
with
self
.
assertRaises
(
ValueError
):
paddle
.
save
(
layer_state_dict
,
"linear.model/"
)
paddle
.
save
(
layer_state_dict
,
"
test_paddle_save_load.
linear.model/"
)
# 3. test load path not exist error
# 3. test load path not exist error
with
self
.
assertRaises
(
ValueError
):
with
self
.
assertRaises
(
ValueError
):
paddle
.
load
(
"linear.params"
)
paddle
.
load
(
"
test_paddle_save_load.
linear.params"
)
# 4. test load old save path error
# 4. test load old save path error
with
self
.
assertRaises
(
ValueError
):
with
self
.
assertRaises
(
ValueError
):
paddle
.
load
(
"linear"
)
paddle
.
load
(
"
test_paddle_save_load.
linear"
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
python/paddle/tensor/linalg.py
浏览文件 @
d0c9f7e8
...
@@ -156,8 +156,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
...
@@ -156,8 +156,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
def
__check_input
(
x
,
y
):
def
__check_input
(
x
,
y
):
var_names
=
{
'x'
:
x
,
'y'
:
y
}
var_names
=
{
'x'
:
x
,
'y'
:
y
}
for
name
,
val
in
var_names
.
items
():
for
name
,
val
in
var_names
.
items
():
check_variable_and_dtype
(
val
,
name
,
[
'float32'
,
'float64'
],
check_variable_and_dtype
(
'matmul'
)
val
,
name
,
[
'float16'
,
'float32'
,
'float64'
],
'matmul'
)
__check_input
(
x
,
y
)
__check_input
(
x
,
y
)
...
...
tools/check_file_diff_approvals.sh
浏览文件 @
d0c9f7e8
...
@@ -286,7 +286,7 @@ fi
...
@@ -286,7 +286,7 @@ fi
# Get the list of PR authors with unresolved unit test issues
# Get the list of PR authors with unresolved unit test issues
pip
install
PyGithub
pip
install
PyGithub
# For getting PR related data
# For getting PR related data
wget https://
sys-p0.bj
.bcebos.com/blk/block.txt
--no-check-certificate
wget https://
paddle-ci.gz
.bcebos.com/blk/block.txt
--no-check-certificate
wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt
--no-check-certificate
wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt
--no-check-certificate
HASUTFIXED
=
`
python
${
PADDLE_ROOT
}
/tools/check_ut.py |
grep
"has unit-test to be fixed"
||
true
`
HASUTFIXED
=
`
python
${
PADDLE_ROOT
}
/tools/check_ut.py |
grep
"has unit-test to be fixed"
||
true
`
if
[
"
${
HASUTFIXED
}
"
!=
""
]
;
then
if
[
"
${
HASUTFIXED
}
"
!=
""
]
;
then
...
...
tools/dockerfile/Dockerfile.centos
浏览文件 @
d0c9f7e8
...
@@ -80,9 +80,7 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
...
@@ -80,9 +80,7 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
make -j8 && make install && \
make -j8 && make install && \
ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
# gcc4.8 TRT
# Downgrade gcc&&g++
RUN mkdir -p /opt/compiler && cd /opt/compiler && \
<install_gcc>
wget -q https://paddle-ci.gz.bcebos.com/gcc-4.8.2.tar.gz && \
tar xf gcc-4.8.2.tar.gz && rm -f gcc-4.8.2.tar.gz
CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
tools/dockerfile/ci_dockerfile.sh
浏览文件 @
d0c9f7e8
...
@@ -21,7 +21,7 @@ function make_ubuntu_dockerfile(){
...
@@ -21,7 +21,7 @@ function make_ubuntu_dockerfile(){
function
make_centos_dockerfile
(){
function
make_centos_dockerfile
(){
dockerfile_name
=
"Dockerfile.cuda9_cudnn7_gcc48_py35_centos6"
dockerfile_name
=
"Dockerfile.cuda9_cudnn7_gcc48_py35_centos6"
sed
's/<baseimg>/
9.0
-cudnn7-devel-centos6/g'
Dockerfile.centos
>
${
dockerfile_name
}
sed
's/<baseimg>/
10.2
-cudnn7-devel-centos6/g'
Dockerfile.centos
>
${
dockerfile_name
}
sed
-i
's#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g'
${
dockerfile_name
}
sed
-i
's#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g'
${
dockerfile_name
}
dockerfile_line
=
`
wc
-l
${
dockerfile_name
}
|awk
'{print $1}'
`
dockerfile_line
=
`
wc
-l
${
dockerfile_name
}
|awk
'{print $1}'
`
sed
-i
"
${
dockerfile_line
}
i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so &&
\
sed
-i
"
${
dockerfile_line
}
i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so &&
\
...
@@ -29,6 +29,15 @@ function make_centos_dockerfile(){
...
@@ -29,6 +29,15 @@ function make_centos_dockerfile(){
rm -rf /usr/include/NvInfer*"
${
dockerfile_name
}
rm -rf /usr/include/NvInfer*"
${
dockerfile_name
}
sed
-i
"
${
dockerfile_line
}
i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz &&
\
sed
-i
"
${
dockerfile_line
}
i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz &&
\
tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/"
${
dockerfile_name
}
tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/"
${
dockerfile_name
}
sed
-i
's#<install_gcc>#WORKDIR /usr/bin \
COPY tools/dockerfile/build_scripts /build_scripts \
RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \
RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \
RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \
RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \
ENV PATH=/usr/local/gcc-8.2/bin:$PATH #g'
${
dockerfile_name
}
}
}
...
...
tools/sampcd_processor.py
浏览文件 @
d0c9f7e8
...
@@ -534,13 +534,6 @@ def get_incrementapi():
...
@@ -534,13 +534,6 @@ def get_incrementapi():
f
.
write
(
'
\n
'
)
f
.
write
(
'
\n
'
)
# only white on CPU
gpu_not_white
=
[
"deformable_conv"
,
"cuda_places"
,
"CUDAPinnedPlace"
,
"CUDAPlace"
,
"cuda_profiler"
,
'DGCMomentumOptimizer'
]
def
get_wlist
():
def
get_wlist
():
'''
'''
this function will get the white list of API.
this function will get the white list of API.
...
@@ -552,17 +545,25 @@ def get_wlist():
...
@@ -552,17 +545,25 @@ def get_wlist():
'''
'''
wlist
=
[]
wlist
=
[]
wlist_file
=
[]
wlist_file
=
[]
# only white on CPU
gpu_not_white
=
[]
with
open
(
"wlist.json"
,
'r'
)
as
load_f
:
with
open
(
"wlist.json"
,
'r'
)
as
load_f
:
load_dict
=
json
.
load
(
load_f
)
load_dict
=
json
.
load
(
load_f
)
for
key
in
load_dict
:
for
key
in
load_dict
:
if
key
==
'wlist_file'
:
if
key
==
'wlist_dir'
:
wlist_file
=
wlist_file
+
load_dict
[
key
]
for
item
in
load_dict
[
key
]:
wlist_file
.
append
(
item
[
"name"
])
elif
key
==
"gpu_not_white"
:
gpu_not_white
=
load_dict
[
key
]
elif
key
==
"wlist_api"
:
for
item
in
load_dict
[
key
]:
wlist
.
append
(
item
[
"name"
])
else
:
else
:
wlist
=
wlist
+
load_dict
[
key
]
wlist
=
wlist
+
load_dict
[
key
]
return
wlist
,
wlist_file
return
wlist
,
wlist_file
,
gpu_not_white
wlist
,
wlist_file
=
get_wlist
()
wlist
,
wlist_file
,
gpu_not_white
=
get_wlist
()
if
len
(
sys
.
argv
)
<
2
:
if
len
(
sys
.
argv
)
<
2
:
print
(
"Error: inadequate number of arguments"
)
print
(
"Error: inadequate number of arguments"
)
...
...
tools/wlist.json
浏览文件 @
d0c9f7e8
{
{
"wlist_file"
:
[
"wlist_dir"
:[
"../python/paddle/fluid/contrib"
,
{
"../python/paddle/verison.py"
,
"name"
:
"../python/paddle/fluid/contrib"
,
"../python/paddle/fluid/core_avx.py"
,
"annotation"
:
""
"../python/paddle/distributed"
},
{
"name"
:
"../python/paddle/verison.py"
,
"annotation"
:
""
},
{
"name"
:
"../python/paddle/fluid/core_avx.py"
,
"annotation"
:
""
},
{
"name"
:
"../python/paddle/distributed"
,
"annotation"
:
""
}
],
],
"wlist_inneed"
:[
"wlist_api"
:[
{
"name"
:
"xxxxx"
,
"annotation"
:
"not a real api, just for example"
}
],
"wlist_temp_api"
:[
"append_LARS"
,
"append_LARS"
,
"BuildStrategy.debug_graphviz_path"
,
"BuildStrategy.debug_graphviz_path"
,
"BuildStrategy.enable_sequential_execution"
,
"BuildStrategy.enable_sequential_execution"
,
...
@@ -63,9 +81,7 @@
...
@@ -63,9 +81,7 @@
"cuda_places"
,
"cuda_places"
,
"CUDAPinnedPlace"
,
"CUDAPinnedPlace"
,
"CUDAPlace"
,
"CUDAPlace"
,
"Program.parse_from_string"
"Program.parse_from_string"
,
],
"wlist_nosample"
:[
"Compressor"
,
"Compressor"
,
"Compressor.config"
,
"Compressor.config"
,
"Compressor.run"
,
"Compressor.run"
,
...
@@ -159,13 +175,9 @@
...
@@ -159,13 +175,9 @@
"RNN"
,
"RNN"
,
"BiRNN"
,
"BiRNN"
,
"RNNCellBase"
,
"RNNCellBase"
,
"RNNCellBase.get_initial_states"
"RNNCellBase.get_initial_states"
,
],
"wlist_no_op_pass"
:[
"gelu"
,
"gelu"
,
"erf"
"erf"
,
],
"wlist_ci_nopass"
:[
"DecodeHelper"
,
"DecodeHelper"
,
"DecodeHelper.initialize"
,
"DecodeHelper.initialize"
,
"DecodeHelper.sample"
,
"DecodeHelper.sample"
,
...
@@ -188,9 +200,7 @@
...
@@ -188,9 +200,7 @@
"SampleEmbeddingHelper"
,
"SampleEmbeddingHelper"
,
"BasicDecoder"
,
"BasicDecoder"
,
"lstm"
,
"lstm"
,
"partial_sum"
"partial_sum"
,
],
"wlist_nopass"
:[
"StateCell"
,
"StateCell"
,
"StateCell.compute_state"
,
"StateCell.compute_state"
,
"TrainingDecoder"
,
"TrainingDecoder"
,
...
@@ -242,9 +252,7 @@
...
@@ -242,9 +252,7 @@
"GroupNorm"
,
"GroupNorm"
,
"SpectralNorm"
,
"SpectralNorm"
,
"TreeConv"
,
"TreeConv"
,
"prroi_pool"
"prroi_pool"
,
],
"wlist_temp"
:[
"to_tensor"
,
"to_tensor"
,
"ChunkEvaluator"
,
"ChunkEvaluator"
,
"EditDistance"
,
"EditDistance"
,
...
@@ -322,9 +330,7 @@
...
@@ -322,9 +330,7 @@
"Conv2DTranspose"
,
"Conv2DTranspose"
,
"QueueDataset.local_shuffle"
,
"QueueDataset.local_shuffle"
,
"save_persistables@dygraph/checkpoint.py"
,
"save_persistables@dygraph/checkpoint.py"
,
"load_persistables@dygraph/checkpoint.py"
"load_persistables@dygraph/checkpoint.py"
,
],
"wlist_ignore"
:[
"elementwise_pow"
,
"elementwise_pow"
,
"WeightedAverage.reset"
,
"WeightedAverage.reset"
,
"ChunkEvaluator.eval"
,
"ChunkEvaluator.eval"
,
...
@@ -401,5 +407,13 @@
...
@@ -401,5 +407,13 @@
"LinearChainCRF.forward"
,
"LinearChainCRF.forward"
,
"CRFDecoding.forward"
,
"CRFDecoding.forward"
,
"SequenceTagging.forward"
"SequenceTagging.forward"
],
"gpu_not_white"
:[
"deformable_conv"
,
"cuda_places"
,
"CUDAPinnedPlace"
,
"CUDAPlace"
,
"cuda_profiler"
,
"DGCMomentumOptimizer"
]
]
}
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录