Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
d0c9f7e8
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
d0c9f7e8
编写于
9月 25, 2020
作者:
S
smallv0221
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into yxp0925
Merge branch 'develop'.
上级
a35a2ee9
09f19532
变更
91
隐藏空白更改
内联
并排
Showing
91 changed file
with
2345 addition
and
520 deletion
+2345
-520
cmake/generic.cmake
cmake/generic.cmake
+1
-0
cmake/third_party.cmake
cmake/third_party.cmake
+5
-4
paddle/fluid/framework/device_worker.h
paddle/fluid/framework/device_worker.h
+4
-1
paddle/fluid/framework/distributed_strategy.proto
paddle/fluid/framework/distributed_strategy.proto
+1
-0
paddle/fluid/framework/fleet/fleet_wrapper.cc
paddle/fluid/framework/fleet/fleet_wrapper.cc
+50
-0
paddle/fluid/framework/fleet/fleet_wrapper.h
paddle/fluid/framework/fleet/fleet_wrapper.h
+8
-0
paddle/fluid/framework/fleet/heter_wrapper.cc
paddle/fluid/framework/fleet/heter_wrapper.cc
+48
-11
paddle/fluid/framework/heterxpu_trainer.cc
paddle/fluid/framework/heterxpu_trainer.cc
+119
-8
paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
...d/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
+21
-0
paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
.../framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
+8
-0
paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc
...framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc
+8
-0
paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
+7
-0
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+13
-0
paddle/fluid/framework/pull_dense_worker.cc
paddle/fluid/framework/pull_dense_worker.cc
+17
-3
paddle/fluid/framework/trainer.h
paddle/fluid/framework/trainer.h
+15
-2
paddle/fluid/framework/trainer_factory.cc
paddle/fluid/framework/trainer_factory.cc
+2
-1
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+19
-3
paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc
.../tests/api/analyzer_seq_pool1_compare_determine_tester.cc
+40
-0
paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc
.../inference/tests/api/analyzer_seq_pool1_compare_tester.cc
+39
-0
paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
...s/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
+46
-0
paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc
...erence/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc
+48
-0
paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc
.../inference/tests/api/analyzer_seq_pool1_profile_tester.cc
+42
-0
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h
...id/inference/tests/api/analyzer_seq_pool1_tester_helper.h
+7
-72
paddle/fluid/inference/tests/api/analyzer_transformer_compare_tester.cc
...nference/tests/api/analyzer_transformer_compare_tester.cc
+44
-0
paddle/fluid/inference/tests/api/analyzer_transformer_fuse_tester.cc
...d/inference/tests/api/analyzer_transformer_fuse_tester.cc
+36
-0
paddle/fluid/inference/tests/api/analyzer_transformer_profile_tester.cc
...nference/tests/api/analyzer_transformer_profile_tester.cc
+45
-0
paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h
.../inference/tests/api/analyzer_transformer_tester_helper.h
+8
-53
paddle/fluid/memory/allocation/retry_allocator_test.cc
paddle/fluid/memory/allocation/retry_allocator_test.cc
+3
-1
paddle/fluid/operators/activation_op.cc
paddle/fluid/operators/activation_op.cc
+39
-3
paddle/fluid/operators/activation_op.cu
paddle/fluid/operators/activation_op.cu
+13
-1
paddle/fluid/operators/activation_op.h
paddle/fluid/operators/activation_op.h
+21
-0
paddle/fluid/operators/batch_norm_op.cc
paddle/fluid/operators/batch_norm_op.cc
+34
-21
paddle/fluid/operators/benchmark/op_tester.cc
paddle/fluid/operators/benchmark/op_tester.cc
+20
-12
paddle/fluid/operators/benchmark/op_tester_config.cc
paddle/fluid/operators/benchmark/op_tester_config.cc
+15
-5
paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
+4
-2
paddle/fluid/operators/instance_norm_op.cc
paddle/fluid/operators/instance_norm_op.cc
+3
-3
paddle/fluid/operators/jit/benchmark.cc
paddle/fluid/operators/jit/benchmark.cc
+9
-3
paddle/fluid/operators/jit/gen/embseqpool.cc
paddle/fluid/operators/jit/gen/embseqpool.cc
+25
-5
paddle/fluid/operators/jit/gen/matmul.cc
paddle/fluid/operators/jit/gen/matmul.cc
+20
-4
paddle/fluid/operators/jit/gen/matmul.h
paddle/fluid/operators/jit/gen/matmul.h
+4
-1
paddle/fluid/operators/jit/gen/seqpool.cc
paddle/fluid/operators/jit/gen/seqpool.cc
+8
-2
paddle/fluid/operators/jit/gen/seqpool.h
paddle/fluid/operators/jit/gen/seqpool.h
+7
-2
paddle/fluid/operators/jit/gen/sgd.cc
paddle/fluid/operators/jit/gen/sgd.cc
+18
-3
paddle/fluid/operators/jit/gen/vbroadcast.cc
paddle/fluid/operators/jit/gen/vbroadcast.cc
+5
-1
paddle/fluid/operators/jit/gen_base.cc
paddle/fluid/operators/jit/gen_base.cc
+8
-3
paddle/fluid/operators/jit/helper.cc
paddle/fluid/operators/jit/helper.cc
+17
-6
paddle/fluid/operators/jit/helper.h
paddle/fluid/operators/jit/helper.h
+18
-7
paddle/fluid/operators/jit/more/mix/mix.cc
paddle/fluid/operators/jit/more/mix/mix.cc
+2
-1
paddle/fluid/operators/jit/more/mkl/mkl.h
paddle/fluid/operators/jit/more/mkl/mkl.h
+51
-10
paddle/fluid/operators/jit/refer/refer.h
paddle/fluid/operators/jit/refer/refer.h
+42
-9
paddle/fluid/operators/jit/test.cc
paddle/fluid/operators/jit/test.cc
+9
-2
paddle/fluid/operators/math/blas_impl.cu.h
paddle/fluid/operators/math/blas_impl.cu.h
+29
-0
paddle/fluid/operators/matmul_v2_op.cu
paddle/fluid/operators/matmul_v2_op.cu
+6
-4
paddle/fluid/operators/matmul_v2_op.h
paddle/fluid/operators/matmul_v2_op.h
+31
-24
paddle/fluid/operators/norm_utils.cu.h
paddle/fluid/operators/norm_utils.cu.h
+58
-17
paddle/fluid/operators/utils.h
paddle/fluid/operators/utils.h
+12
-6
paddle/fluid/platform/cuda_primitives.h
paddle/fluid/platform/cuda_primitives.h
+38
-0
paddle/scripts/installation_validate.py
paddle/scripts/installation_validate.py
+1
-0
paddle/scripts/paddle_build.bat
paddle/scripts/paddle_build.bat
+10
-7
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+0
-5
python/paddle/distributed/fleet/base/distributed_strategy.py
python/paddle/distributed/fleet/base/distributed_strategy.py
+23
-0
python/paddle/distributed/fleet/meta_optimizers/__init__.py
python/paddle/distributed/fleet/meta_optimizers/__init__.py
+1
-0
python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
...ributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
+146
-0
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+1
-1
python/paddle/fluid/reader.py
python/paddle/fluid/reader.py
+45
-44
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+2
-3
python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
...paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
+63
-0
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
...sts/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
+106
-0
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
...nference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
+92
-0
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
...ference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
+81
-0
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
...ttests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
+73
-0
python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py
...ts/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py
+5
-17
python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
...n/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+24
-0
python/paddle/fluid/tests/unittests/test_activation_op.py
python/paddle/fluid/tests/unittests/test_activation_op.py
+53
-0
python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
...d/tests/unittests/test_buffer_shared_memory_reuse_pass.py
+11
-3
python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py
...e/fluid/tests/unittests/test_dist_mnist_fp16_allreduce.py
+15
-22
python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
.../fluid/tests/unittests/test_fleet_distributed_strategy.py
+10
-0
python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py
...sts/unittests/test_fleet_fp16_allreduce_meta_optimizer.py
+91
-0
python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
+4
-1
python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
...addle/fluid/tests/unittests/test_imperative_se_resnext.py
+20
-5
python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
+74
-25
python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
...d/tests/unittests/test_multiprocess_dataloader_dynamic.py
+0
-1
python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
.../test_multiprocess_dataloader_iterable_dataset_dynamic.py
+0
-1
python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
+36
-0
python/paddle/fluid/tests/unittests/test_paddle_save_load.py
python/paddle/fluid/tests/unittests/test_paddle_save_load.py
+23
-26
python/paddle/tensor/linalg.py
python/paddle/tensor/linalg.py
+2
-2
tools/check_file_diff_approvals.sh
tools/check_file_diff_approvals.sh
+1
-1
tools/dockerfile/Dockerfile.centos
tools/dockerfile/Dockerfile.centos
+2
-4
tools/dockerfile/ci_dockerfile.sh
tools/dockerfile/ci_dockerfile.sh
+10
-1
tools/sampcd_processor.py
tools/sampcd_processor.py
+12
-11
tools/wlist.json
tools/wlist.json
+38
-24
未找到文件。
cmake/generic.cmake
浏览文件 @
d0c9f7e8
...
...
@@ -721,6 +721,7 @@ function(proto_library TARGET_NAME)
set
(
proto_hdrs
)
paddle_protobuf_generate_cpp
(
proto_srcs proto_hdrs
${
proto_library_SRCS
}
)
cc_library
(
${
TARGET_NAME
}
SRCS
${
proto_srcs
}
DEPS
${
proto_library_DEPS
}
protobuf
)
add_dependencies
(
extern_xxhash
${
TARGET_NAME
}
)
endfunction
()
function
(
py_proto_compile TARGET_NAME
)
...
...
cmake/third_party.cmake
浏览文件 @
d0c9f7e8
...
...
@@ -39,6 +39,7 @@ set(third_party_deps)
# REPOSITORY ${TARGET_REPOSITORY}
# TAG ${TARGET_TAG}
# DIR ${TARGET_SOURCE_DIR})
FUNCTION
(
cache_third_party TARGET
)
SET
(
options
""
)
SET
(
oneValueArgs URL REPOSITORY TAG DIR
)
...
...
@@ -269,6 +270,10 @@ if(WITH_PSLIB)
endif
()
endif
(
WITH_PSLIB
)
if
(
NOT WIN32 AND NOT APPLE
)
include
(
external/gloo
)
list
(
APPEND third_party_deps extern_gloo
)
endif
()
if
(
WITH_BOX_PS
)
include
(
external/box_ps
)
...
...
@@ -276,10 +281,6 @@ if(WITH_BOX_PS)
endif
(
WITH_BOX_PS
)
if
(
WITH_DISTRIBUTE
)
if
(
WITH_GLOO
)
include
(
external/gloo
)
list
(
APPEND third_party_deps extern_gloo
)
endif
()
if
(
WITH_GRPC
)
list
(
APPEND third_party_deps extern_grpc
)
...
...
paddle/fluid/framework/device_worker.h
浏览文件 @
d0c9f7e8
...
...
@@ -74,7 +74,9 @@ class PullDenseWorker {
virtual
void
Initialize
(
const
TrainerDesc
&
param
);
#ifdef PADDLE_WITH_CUDA
void
AddStream
(
const
cudaStream_t
stream
)
{
copy_streams_
.
push_back
(
stream
);
}
#endif
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
void
AddPlace
(
const
paddle
::
platform
::
Place
place
)
{
places_
.
push_back
(
place
);
}
...
...
@@ -135,9 +137,9 @@ class PullDenseWorker {
#ifdef PADDLE_WITH_CUDA
std
::
vector
<
cudaStream_t
>
copy_streams_
;
#endif
std
::
vector
<
paddle
::
platform
::
Place
>
places_
;
std
::
vector
<
Scope
*>
thread_scopes_
;
#endif
};
// should incorporate different type of device
...
...
@@ -161,6 +163,7 @@ class DeviceWorker {
virtual
void
SetDataFeed
(
DataFeed
*
data_feed
);
virtual
void
SetWorkerNum
(
int
num
)
{}
virtual
void
CacheProgram
(
const
ProgramDesc
&
main_program
)
{}
virtual
void
GetXpuOpIndex
()
{}
virtual
void
SetNeedDumpField
(
bool
need_dump_field
)
{
need_dump_field_
=
need_dump_field
;
}
...
...
paddle/fluid/framework/distributed_strategy.proto
浏览文件 @
d0c9f7e8
...
...
@@ -127,6 +127,7 @@ message DistributedStrategy {
optional
int32
conv_workspace_size_limit
=
22
[
default
=
4000
];
optional
bool
cudnn_batchnorm_spatial_persistent
=
23
[
default
=
true
];
optional
bool
adaptive_localsgd
=
24
[
default
=
false
];
optional
bool
fp16_allreduce
=
25
[
default
=
false
];
optional
RecomputeConfig
recompute_configs
=
101
;
optional
AMPConfig
amp_configs
=
102
;
...
...
paddle/fluid/framework/fleet/fleet_wrapper.cc
浏览文件 @
d0c9f7e8
...
...
@@ -745,7 +745,57 @@ void FleetWrapper::PushDenseVarsAsync(
push_sparse_status
->
push_back
(
std
::
move
(
status
));
}
}
#endif
#ifdef PADDLE_WITH_XPU
void
FleetWrapper
::
PushDenseVarsAsync
(
const
Scope
&
scope
,
const
uint64_t
table_id
,
const
std
::
vector
<
std
::
string
>&
var_names
,
std
::
vector
<::
std
::
future
<
int32_t
>>*
push_sparse_status
,
float
scale_datanorm
,
int
batch_size
,
const
paddle
::
platform
::
Place
&
place
)
{
#ifdef PADDLE_WITH_PSLIB
std
::
vector
<
paddle
::
ps
::
Region
>
regions
;
for
(
auto
&
t
:
var_names
)
{
Variable
*
var
=
scope
.
FindVar
(
t
);
LoDTensor
*
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
int
count
=
tensor
->
numel
();
float
*
g_data
=
tensor
->
data
<
float
>
();
Variable
*
pin_var
=
scope
.
FindVar
(
t
+
"pin"
);
LoDTensor
*
pin_tensor
=
pin_var
->
GetMutable
<
LoDTensor
>
();
float
*
pin_g
=
pin_tensor
->
mutable_data
<
float
>
(
tensor
->
dims
(),
platform
::
CPUPlace
());
memory
::
Copy
(
platform
::
CPUPlace
(),
pin_g
,
BOOST_GET_CONST
(
platform
::
XPUPlace
,
place
),
g_data
,
sizeof
(
float
)
*
count
);
float
*
g
=
pin_g
;
if
(
scale_datanorm
>=
0
)
{
if
(
t
.
find
(
".batch_size@GRAD"
)
!=
std
::
string
::
npos
||
t
.
find
(
".batch_sum@GRAD"
)
!=
std
::
string
::
npos
)
{
Eigen
::
Map
<
Eigen
::
MatrixXf
>
mat
(
g
,
1
,
count
);
float
scale
=
1.0
/
batch_size
;
mat
*=
scale
;
}
else
if
(
t
.
find
(
".batch_square_sum@GRAD"
)
!=
std
::
string
::
npos
)
{
VLOG
(
3
)
<<
"epsilon: "
<<
scale_datanorm
;
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
g
[
i
]
=
(
g
[
i
]
-
batch_size
*
scale_datanorm
)
/
batch_size
+
batch_size
*
scale_datanorm
;
}
}
}
paddle
::
ps
::
Region
reg
(
g
,
count
);
regions
.
emplace_back
(
std
::
move
(
reg
));
}
auto
status
=
pslib_ptr_
->
_worker_ptr
->
push_dense
(
regions
.
data
(),
regions
.
size
(),
table_id
);
if
(
push_sparse_status
)
{
push_sparse_status
->
push_back
(
std
::
move
(
status
));
}
#endif
}
#endif
void
FleetWrapper
::
PushDenseVarsAsync
(
const
Scope
&
scope
,
const
uint64_t
table_id
,
...
...
paddle/fluid/framework/fleet/fleet_wrapper.h
浏览文件 @
d0c9f7e8
...
...
@@ -160,6 +160,14 @@ class FleetWrapper {
float
scale_datanorm
,
int
batch_size
,
const
paddle
::
platform
::
Place
&
place
,
cudaStream_t
stream
,
cudaEvent_t
event
);
#endif
#ifdef PADDLE_WITH_XPU
void
PushDenseVarsAsync
(
const
Scope
&
scope
,
const
uint64_t
table_id
,
const
std
::
vector
<
std
::
string
>&
var_names
,
std
::
vector
<::
std
::
future
<
int32_t
>>*
push_sparse_status
,
float
scale_datanorm
,
int
batch_size
,
const
paddle
::
platform
::
Place
&
place
);
#endif
void
PushDenseVarsAsync
(
const
Scope
&
scope
,
const
uint64_t
table_id
,
...
...
paddle/fluid/framework/fleet/heter_wrapper.cc
浏览文件 @
d0c9f7e8
...
...
@@ -113,30 +113,66 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
if
(
platform
::
is_cpu_place
(
tensor
->
place
()))
{
memcpy
(
data_ptr
,
tensor
->
data
<
void
>
(),
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()));
#ifdef PADDLE_WITH_CUDA
}
else
{
#ifdef PADDLE_WITH_CUDA
memory
::
Copy
(
platform
::
CPUPlace
(),
data_ptr
,
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
tensor
->
place
()),
tensor
->
data
<
void
>
(),
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()),
nullptr
);
}
#else
}
#endif
#ifdef PADDLE_WITH_XPU
memory
::
Copy
(
platform
::
CPUPlace
(),
data_ptr
,
BOOST_GET_CONST
(
platform
::
XPUPlace
,
tensor
->
place
()),
tensor
->
data
<
void
>
(),
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()));
#endif
}
}
// void HeterWrapper::DeSerializeToTensor(Scope* scope,
// const HeterRequest* request) {
#ifdef PADDLE_WITH_CUDA
void
HeterWrapper
::
DeSerializeToTensor
(
Scope
*
scope
,
const
VariableMessage
&
req_var
,
platform
::
Place
place
,
cudaStream_t
stream
)
{
// const VariableMessage& req_var = request->vars();
auto
*
var
=
scope
->
FindVar
(
req_var
.
varname
());
auto
*
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
std
::
vector
<
int
>
vec_dim
;
for
(
auto
&
x
:
req_var
.
dims
())
{
vec_dim
.
push_back
(
x
);
}
tensor
->
Resize
(
make_ddim
(
vec_dim
));
LoD
lod
;
for
(
int
i
=
0
;
i
<
req_var
.
lod_level
();
++
i
)
{
framework
::
Vector
<
size_t
>
v
;
for
(
int
j
=
0
;
j
<
req_var
.
lod
(
i
).
lod_data_size
();
++
j
)
{
v
.
push_back
(
req_var
.
lod
(
i
).
lod_data
(
j
));
}
lod
.
push_back
(
v
);
}
tensor
->
set_lod
(
lod
);
void
*
tensor_data
=
tensor
->
mutable_data
(
place
,
ToVarType
(
req_var
.
data_type
()));
#ifdef PADDLE_WITH_CUDA
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
),
tensor_data
,
platform
::
CPUPlace
(),
req_var
.
data
().
data
(),
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()),
stream
);
#else
memcpy
(
tensor_data
,
req_var
.
data
().
data
(),
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()));
#endif
}
#endif
// void HeterWrapper::DeSerializeToTensor(Scope* scope,
// const HeterRequest* request) {
void
HeterWrapper
::
DeSerializeToTensor
(
Scope
*
scope
,
const
VariableMessage
&
req_var
,
platform
::
Place
place
)
{
#endif
// const VariableMessage& req_var = request->vars();
auto
*
var
=
scope
->
FindVar
(
req_var
.
varname
());
auto
*
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
...
...
@@ -160,10 +196,10 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
void
*
tensor_data
=
tensor
->
mutable_data
(
place
,
ToVarType
(
req_var
.
data_type
()));
#ifdef PADDLE_WITH_
CUDA
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
CUDA
Place
,
place
),
tensor_data
,
#ifdef PADDLE_WITH_
XPU
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
XPU
Place
,
place
),
tensor_data
,
platform
::
CPUPlace
(),
req_var
.
data
().
data
(),
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
())
,
stream
);
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()));
#else
memcpy
(
tensor_data
,
req_var
.
data
().
data
(),
tensor
->
numel
()
*
SizeOfType
(
tensor
->
type
()));
...
...
@@ -184,7 +220,8 @@ framework::proto::VarType::Type HeterWrapper::ToVarType(
case
VariableMessage
::
BOOL
:
return
framework
::
proto
::
VarType
::
BOOL
;
// NOLINT
default:
VLOG
(
0
)
<<
"Not support type "
<<
type
;
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"ToVarType:Unsupported type %d"
,
type
));
}
}
...
...
paddle/fluid/framework/heterxpu_trainer.cc
浏览文件 @
d0c9f7e8
...
...
@@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
#include <cstdlib>
#include <ctime>
#include <string>
#include <vector>
#include "io/fs.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/framework/trainer.h"
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
namespace
paddle
{
namespace
framework
{
...
...
@@ -34,6 +46,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
int
place_num
=
trainer_desc
.
worker_places_size
();
for
(
int
i
=
0
;
i
<
place_num
;
++
i
)
{
int
num
=
trainer_desc
.
worker_places
(
i
);
#ifdef PADDLE_WITH_CUDA
platform
::
CUDAPlace
place
=
platform
::
CUDAPlace
(
num
);
platform
::
CUDADeviceGuard
guard
(
place
.
device
);
cudaStream_t
stream
;
...
...
@@ -44,6 +57,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
event
,
cudaEventDisableTiming
));
events_
.
push_back
(
event
);
#endif
#ifdef PADDLE_WITH_XPU
platform
::
XPUPlace
place
=
platform
::
XPUPlace
(
num
);
places_
.
push_back
(
place
);
#endif
}
// thread_num_ = trainer_desc.thread_num();
// SetDataset(dataset);
...
...
@@ -95,11 +113,17 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
void
HeterXpuTrainer
::
CreateThreadParam
(
const
ProgramDesc
&
program
,
int
num
)
{
auto
place
=
places_
[
num
];
Scope
*
scope
=
place_scopes_
[
num
];
#ifdef PADDLE_WITH_CUDA
auto
stream
=
copy_streams_
[
num
];
auto
event
=
events_
[
num
];
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
).
device
;
platform
::
CUDADeviceGuard
guard
(
dev_id
);
#endif
#ifdef PADDLE_WITH_XPU
xpu_set_device
(
BOOST_GET_CONST
(
platform
::
XPUPlace
,
place
).
device
);
#endif
auto
&
block
=
program
.
Block
(
0
);
for
(
auto
&
var
:
block
.
AllVars
())
{
if
(
var
->
Persistable
())
{
...
...
@@ -116,13 +140,28 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place, stream); \
} \
} while (0)
#define HeterMemcpyXpuFunc(cpp_type, proto_type) \
do { \
if (root_tensor->type() == proto_type) { \
HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place); \
} \
} while (0)
#ifdef PADDLE_WITH_CUDA
_ForEachDataType_
(
HeterMemcpyFunc
);
#endif
#ifdef PADDLE_WITH_XPU
_ForEachDataType_
(
HeterMemcpyXpuFunc
);
#endif
}
}
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventRecord
(
event
,
stream
));
cudaEventSynchronize
(
event
);
#endif
}
#ifdef PADDLE_WITH_CUDA
template
<
typename
T
>
void
HeterXpuTrainer
::
HeterMemCpy
(
LoDTensor
*
thread_tensor
,
LoDTensor
*
root_tensor
,
...
...
@@ -141,6 +180,27 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
root_ptr
,
sizeof
(
T
)
*
root_tensor
->
numel
(),
stream
);
}
}
#endif
#ifdef PADDLE_WITH_XPU
template
<
typename
T
>
void
HeterXpuTrainer
::
HeterMemCpy
(
LoDTensor
*
thread_tensor
,
LoDTensor
*
root_tensor
,
const
paddle
::
platform
::
Place
&
thread_place
)
{
T
*
thread_ptr
=
thread_tensor
->
mutable_data
<
T
>
(
root_tensor
->
dims
(),
thread_place
);
T
*
root_ptr
=
root_tensor
->
data
<
T
>
();
if
(
platform
::
is_cpu_place
(
root_tensor
->
place
()))
{
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
XPUPlace
,
thread_place
),
thread_ptr
,
platform
::
CPUPlace
(),
root_ptr
,
sizeof
(
T
)
*
root_tensor
->
numel
());
}
else
{
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
XPUPlace
,
thread_place
),
thread_ptr
,
BOOST_GET_CONST
(
platform
::
XPUPlace
,
root_tensor
->
place
()),
root_ptr
,
sizeof
(
T
)
*
root_tensor
->
numel
());
}
}
#endif
void
HeterXpuTrainer
::
DumpWork
(
int
tid
)
{}
...
...
@@ -171,13 +231,16 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
CreateThreadParam
(
main_program
,
i
);
pull_dense_worker_
->
AddThreadScope
(
scope
);
pull_dense_worker_
->
AddPlace
(
places_
[
i
]);
#ifdef PADDLE_WITH_CUDA
pull_dense_worker_
->
AddStream
(
copy_streams_
[
i
]);
#endif
}
pull_dense_worker_
->
Start
();
#ifdef PADDLE_WITH_CUDA
for
(
auto
&
stream
:
copy_streams_
)
{
cudaStreamSynchronize
(
stream
);
}
#endif
op_names_
.
clear
();
for
(
auto
&
op_desc
:
block
.
AllOps
())
{
std
::
unique_ptr
<
OperatorBase
>
local_op
=
OpRegistry
::
CreateOp
(
*
op_desc
);
...
...
@@ -220,10 +283,12 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
OperatorBase
*
local_op_ptr
=
local_op
.
release
();
(
context
->
ops_
).
push_back
(
local_op_ptr
);
}
#ifdef PADDLE_WITH_CUDA
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
).
device
;
platform
::
CUDADeviceGuard
guard
(
dev_id
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
context
->
event_
,
cudaEventDisableTiming
));
#endif
object_pool_
.
Push
(
context
);
}
}
...
...
@@ -267,12 +332,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
} \
} while (0)
_ForEachDataType_
(
MergeCallback
);
if
(
platform
::
is_gpu_place
(
thread_tensor
->
place
()))
{
if
(
!
platform
::
is_cpu_place
(
thread_tensor
->
place
()))
{
#ifdef PADDLE_WITH_CUDA
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
thread_tensor
->
place
()).
device
;
platform
::
CUDADeviceGuard
guard
(
dev_id
);
cudaMemset
(
thread_tensor
->
data
<
void
>
(),
0
,
thread_tensor
->
numel
()
*
SizeOfType
(
thread_tensor
->
type
()));
#endif
#ifdef PADDLE_WITH_XPU
auto
place
=
thread_tensor
->
place
();
xpu_set_device
(
BOOST_GET_CONST
(
platform
::
XPUPlace
,
place
).
device
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
DeviceContext
*
dev_ctx
=
pool
.
Get
(
place
);
const
platform
::
XPUDeviceContext
*
xpu_ctx
=
reinterpret_cast
<
const
platform
::
XPUDeviceContext
*>
(
dev_ctx
);
xpu
::
memset
(
xpu_ctx
->
x_context
(),
thread_tensor
->
data
<
void
>
(),
0
,
thread_tensor
->
numel
()
*
SizeOfType
(
thread_tensor
->
type
()));
#endif
}
else
{
memset
(
thread_tensor
->
data
<
void
>
(),
0
,
thread_tensor
->
numel
()
*
SizeOfType
(
thread_tensor
->
type
()));
...
...
@@ -281,12 +359,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
auto
*
merge_var
=
response
->
add_vars
();
heter_ptr_
->
SerializeToReq
(
need_merge_var_names_
[
i
],
root_scope_
,
merge_var
);
if
(
platform
::
is_gpu_place
(
root_tensor
->
place
()))
{
if
(
!
platform
::
is_cpu_place
(
root_tensor
->
place
()))
{
#ifdef PADDLE_WITH_CUDA
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
root_tensor
->
place
()).
device
;
platform
::
CUDADeviceGuard
guard
(
dev_id
);
cudaMemset
(
root_tensor
->
data
<
void
>
(),
0
,
root_tensor
->
numel
()
*
SizeOfType
(
root_tensor
->
type
()));
#endif
#ifdef PADDLE_WITH_XPU
auto
place
=
root_tensor
->
place
();
xpu_set_device
(
BOOST_GET_CONST
(
platform
::
XPUPlace
,
place
).
device
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
DeviceContext
*
dev_ctx
=
pool
.
Get
(
place
);
const
platform
::
XPUDeviceContext
*
xpu_ctx
=
reinterpret_cast
<
const
platform
::
XPUDeviceContext
*>
(
dev_ctx
);
xpu
::
memset
(
xpu_ctx
->
x_context
(),
root_tensor
->
data
<
void
>
(),
0
,
root_tensor
->
numel
()
*
SizeOfType
(
root_tensor
->
type
()));
#endif
}
else
{
memset
(
root_tensor
->
data
<
void
>
(),
0
,
root_tensor
->
numel
()
*
SizeOfType
(
root_tensor
->
type
()));
...
...
@@ -346,11 +437,12 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
OperatorBase
*
local_op_ptr
=
local_op
.
release
();
(
context
->
ops_
).
push_back
(
local_op_ptr
);
}
#ifdef PADDLE_WITH_CUDA
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place
).
device
;
platform
::
CUDADeviceGuard
guard
(
dev_id
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventCreateWithFlags
(
&
context
->
event_
,
cudaEventDisableTiming
));
#endif
}
context
->
Reset
();
...
...
@@ -359,15 +451,22 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
auto
deserial_timer
=
std
::
make_shared
<
paddle
::
ps
::
CostTimer
>
(
"xpu_service_deserial"
);
for
(
int
i
=
0
;
i
<
request
->
vars_size
();
++
i
)
{
#ifdef PADDLE_WITH_CUDA
heter_ptr_
->
DeSerializeToTensor
(
context
->
scope_
,
request
->
vars
(
i
),
place
,
copy_streams_
[
context
->
place_num_
]);
#endif
#ifdef PADDLE_WITH_XPU
heter_ptr_
->
DeSerializeToTensor
(
context
->
scope_
,
request
->
vars
(
i
),
place
);
#endif
}
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_CUDA_SUCCESS
(
cudaEventRecord
(
context
->
event_
,
copy_streams_
[
context
->
place_num_
]));
while
(
cudaEventQuery
(
context
->
event_
)
!=
cudaSuccess
)
{
VLOG
(
3
)
<<
"wait for kernel"
;
bthread_yield
();
}
#endif
}
{
...
...
@@ -378,6 +477,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
op
->
Run
(
*
(
context
->
scope_
),
place
);
}
}
#ifdef PADDLE_WITH_CUDA
auto
*
dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
...
...
@@ -391,6 +491,10 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
bthread_yield
();
}
}
#endif
#ifdef PADDLE_WITH_XPU
xpu_wait
();
#endif
for
(
int
i
=
0
;
i
<
trainer_desc_
.
xpu_send_list_size
();
++
i
)
{
const
std
::
string
&
varname
=
trainer_desc_
.
xpu_send_list
(
i
);
...
...
@@ -407,11 +511,19 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
++
i
)
{
uint64_t
tid
=
static_cast
<
uint64_t
>
(
param_
.
program_config
(
0
).
push_dense_table_id
(
i
));
#ifdef PADDLE_WITH_CUDA
fleet_ptr_
->
PushDenseVarsAsync
(
*
(
context
->
scope_
),
tid
,
dense_grad_names_
[
tid
],
&
(
context
->
push_dense_status_
),
scale_datanorm_
,
request
->
cur_batch
(),
places_
[
context
->
place_num_
],
copy_streams_
[
context
->
place_num_
],
context
->
event_
);
#endif
#ifdef PADDLE_WITH_XPU
fleet_ptr_
->
PushDenseVarsAsync
(
*
(
context
->
scope_
),
tid
,
dense_grad_names_
[
tid
],
&
(
context
->
push_dense_status_
),
scale_datanorm_
,
request
->
cur_batch
(),
places_
[
context
->
place_num_
]);
#endif
}
for
(
int
i
=
0
;
i
<
param_
.
program_config
(
0
).
push_dense_table_id_size
();
++
i
)
{
...
...
@@ -453,7 +565,6 @@ void HeterXpuTrainer::Finalize() {
pull_dense_worker_
->
Stop
();
root_scope_
->
DropKids
();
}
}
// namespace framework
}
// namespace paddle
#endif
paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
浏览文件 @
d0c9f7e8
...
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h"
#include <vector>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
...
...
@@ -103,12 +104,32 @@ REGISTER_PASS(conv_activation_mkldnn_fuse_pass,
REGISTER_PASS
(
conv_relu_mkldnn_fuse_pass
,
paddle
::
framework
::
ir
::
ConvActivationFusePass
);
REGISTER_PASS_CAPABILITY
(
conv_relu_mkldnn_fuse_pass
)
.
AddCombination
(
paddle
::
framework
::
compatible
::
OpVersionComparatorCombination
()
.
EQ
(
"conv2d"
,
0
)
.
EQ
(
"relu"
,
0
));
REGISTER_PASS
(
conv_leaky_relu_mkldnn_fuse_pass
,
paddle
::
framework
::
ir
::
Conv2DLeakyReLUFusePass
);
REGISTER_PASS_CAPABILITY
(
conv_leaky_relu_mkldnn_fuse_pass
)
.
AddCombination
(
paddle
::
framework
::
compatible
::
OpVersionComparatorCombination
()
.
EQ
(
"conv2d"
,
0
)
.
LE
(
"leaky_relu"
,
1
));
REGISTER_PASS
(
conv_relu6_mkldnn_fuse_pass
,
paddle
::
framework
::
ir
::
Conv2DReLU6FusePass
);
REGISTER_PASS_CAPABILITY
(
conv_relu6_mkldnn_fuse_pass
)
.
AddCombination
(
paddle
::
framework
::
compatible
::
OpVersionComparatorCombination
()
.
EQ
(
"conv2d"
,
0
)
.
EQ
(
"relu6"
,
0
));
REGISTER_PASS
(
conv_swish_mkldnn_fuse_pass
,
paddle
::
framework
::
ir
::
Conv2DSwishFusePass
);
REGISTER_PASS_CAPABILITY
(
conv_swish_mkldnn_fuse_pass
)
.
AddCombination
(
paddle
::
framework
::
compatible
::
OpVersionComparatorCombination
()
.
EQ
(
"conv2d"
,
0
)
.
EQ
(
"swish"
,
0
));
paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
浏览文件 @
d0c9f7e8
...
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h"
#include <vector>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
...
...
@@ -123,3 +124,10 @@ void ConvConcatReLUFusePass::ApplyImpl(ir::Graph* graph) const {
// Registers ConvConcatReLUFusePass under the name
// conv_concat_relu_mkldnn_fuse_pass, then attaches an operator-version
// combination covering conv2d, concat and relu (all at 0).
// NOTE(review): EQ(op, 0) presumably pins the op to version 0 -- confirm
// against op_version_registry.h.
REGISTER_PASS(conv_concat_relu_mkldnn_fuse_pass,
              paddle::framework::ir::ConvConcatReLUFusePass);
REGISTER_PASS_CAPABILITY(conv_concat_relu_mkldnn_fuse_pass)
    .AddCombination(
        paddle::framework::compatible::OpVersionComparatorCombination()
            .EQ("conv2d", 0)
            .EQ("concat", 0)
            .EQ("relu", 0));
paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc
浏览文件 @
d0c9f7e8
...
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h"
#include <paddle/fluid/string/pretty_log.h>
#include <vector>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
...
...
@@ -98,3 +99,10 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const {
// Registers MatmulTransposeReshapeMKLDNNPass under the name
// matmul_transpose_reshape_fuse_pass, then attaches an operator-version
// combination covering matmul, transpose and reshape (all at 0).
// NOTE(review): EQ(op, 0) presumably pins the op to version 0 -- confirm
// against op_version_registry.h.
REGISTER_PASS(matmul_transpose_reshape_fuse_pass,
              paddle::framework::ir::MatmulTransposeReshapeMKLDNNPass);
REGISTER_PASS_CAPABILITY(matmul_transpose_reshape_fuse_pass)
    .AddCombination(
        paddle::framework::compatible::OpVersionComparatorCombination()
            .EQ("matmul", 0)
            .EQ("transpose", 0)
            .EQ("reshape", 0));
paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
浏览文件 @
d0c9f7e8
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/string/pretty_log.h"
namespace
paddle
{
...
...
@@ -90,3 +91,9 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const {
// Registers ScaleMatmulFusePass under the name scale_matmul_fuse_pass, then
// attaches an operator-version combination covering scale and matmul (both
// at 0).
// NOTE(review): EQ(op, 0) presumably pins the op to version 0 -- confirm
// against op_version_registry.h.
REGISTER_PASS(scale_matmul_fuse_pass,
              paddle::framework::ir::ScaleMatmulFusePass);
REGISTER_PASS_CAPABILITY(scale_matmul_fuse_pass)
    .AddCombination(
        paddle::framework::compatible::OpVersionComparatorCombination()
            .EQ("scale", 0)
            .EQ("matmul", 0));
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
浏览文件 @
d0c9f7e8
...
...
@@ -17,6 +17,7 @@
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace
paddle
{
namespace
framework
{
...
...
@@ -255,3 +256,15 @@ void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const {
// Registers SeqConcatFcFusePass under the name seq_concat_fc_fuse_pass, then
// attaches an operator-version combination listing every op name below at 0.
// The last entry, fusion_seqexpand_concat_fc, is presumably the fused op this
// pass emits -- TODO confirm against the pass implementation.
// NOTE(review): EQ(op, 0) presumably pins the op to version 0 -- confirm
// against op_version_registry.h.
REGISTER_PASS(seq_concat_fc_fuse_pass,
              paddle::framework::ir::SeqConcatFcFusePass);
REGISTER_PASS_CAPABILITY(seq_concat_fc_fuse_pass)
    .AddCombination(
        paddle::framework::compatible::OpVersionComparatorCombination()
            .EQ("sequence_expand", 0)
            .EQ("concat", 0)
            .EQ("mul", 0)
            .EQ("elementwise_add", 0)
            .EQ("sigmoid", 0)
            .EQ("tanh", 0)
            .EQ("relu", 0)
            .EQ("identity", 0)
            .EQ("fusion_seqexpand_concat_fc", 0));
paddle/fluid/framework/pull_dense_worker.cc
浏览文件 @
d0c9f7e8
...
...
@@ -62,13 +62,15 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
fleet_ptr_
=
FleetWrapper
::
GetInstance
();
#ifdef PADDLE_WITH_CUDA
copy_streams_
.
clear
();
#endif
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
places_
.
clear
();
thread_scopes_
.
clear
();
#endif
}
void
PullDenseWorker
::
CreatePinVar
()
{
#if
def PADDLE_WITH_CUDA
#if
(defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_PSLIB)
// for (auto& v : dense_value_names_) {
// for (auto& name : v.second) {
for
(
int
i
=
0
;
i
<
dwp_param_
.
program_config
(
0
).
pull_dense_table_id_size
();
...
...
@@ -83,8 +85,13 @@ void PullDenseWorker::CreatePinVar() {
auto
*
ptr
=
root_scope_
->
Var
(
name
+
"pin"
);
InitializeVariable
(
ptr
,
proto
::
VarType
::
LOD_TENSOR
);
LoDTensor
*
pin_tensor
=
ptr
->
GetMutable
<
LoDTensor
>
();
#ifdef PADDLE_WITH_CUDA
pin_tensor
->
mutable_data
<
float
>
(
tensor
->
dims
(),
platform
::
CUDAPinnedPlace
());
#endif
#ifdef PADDLE_WITH_XPU
pin_tensor
->
mutable_data
<
float
>
(
tensor
->
dims
(),
platform
::
CPUPlace
());
#endif
}
}
#endif
...
...
@@ -107,7 +114,7 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
exit
(
-
1
);
}
status_vec
->
resize
(
0
);
#if
def PADDLE_WITH_CUDA
#if
(defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
// for (auto& v : dense_value_names_) {
...
...
@@ -125,9 +132,16 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
Variable
*
var
=
thread_scopes_
[
i
]
->
FindVar
(
name
);
LoDTensor
*
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
float
*
w
=
tensor
->
data
<
float
>
();
#ifdef PADDLE_WITH_CUDA
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
places_
[
i
]),
w
,
platform
::
CUDAPinnedPlace
(),
pin_w
,
sizeof
(
float
)
*
tensor
->
numel
(),
copy_streams_
[
i
]);
#endif
#ifdef PADDLE_WITH_XPU
memory
::
Copy
(
BOOST_GET_CONST
(
platform
::
XPUPlace
,
places_
[
i
]),
w
,
platform
::
CPUPlace
(),
pin_w
,
sizeof
(
float
)
*
tensor
->
numel
());
#endif
}
}
}
...
...
@@ -148,7 +162,7 @@ void PullDenseWorker::PullDense(bool force_update) {
uint64_t
tid
=
static_cast
<
uint64_t
>
(
dwp_param_
.
program_config
(
0
).
pull_dense_table_id
(
i
));
if
(
force_update
||
CheckUpdateParam
(
tid
))
{
#if
def PADDLE_WITH_CUDA
#if
(defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
VLOG
(
3
)
<<
"pull dense "
<<
force_update
<<
" "
<<
tid
;
fleet_ptr_
->
PullDenseVarsAsync
(
*
root_scope_
,
tid
,
dense_value_names_
[
tid
],
&
pull_dense_status_
,
false
);
...
...
paddle/fluid/framework/trainer.h
浏览文件 @
d0c9f7e8
...
...
@@ -138,7 +138,8 @@ class DistMultiTrainer : public MultiTrainer {
std
::
shared_ptr
<
paddle
::
framework
::
PullDenseWorker
>
pull_dense_worker_
;
};
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
class
HeterServiceContext
{
public:
HeterServiceContext
()
{}
...
...
@@ -151,7 +152,9 @@ class HeterServiceContext {
void
Reset
()
{
push_dense_status_
.
clear
();
}
int
place_num_
;
Scope
*
scope_
{
nullptr
};
#ifdef PADDLE_WITH_CUDA
cudaEvent_t
event_
;
#endif
std
::
vector
<
OperatorBase
*>
ops_
;
std
::
vector
<::
std
::
future
<
int32_t
>>
push_dense_status_
;
};
...
...
@@ -178,10 +181,18 @@ class HeterXpuTrainer : public TrainerBase {
virtual
void
CacheProgram
(
const
ProgramDesc
&
main_program
)
{
new
(
&
program_
)
ProgramDesc
(
main_program
);
}
virtual
std
::
string
GetDumpPath
(
int
tid
)
{
return
""
;
}
virtual
void
InitDumpEnv
()
{}
template
<
typename
T
>
#ifdef PADDLE_WITH_CUDA
void
HeterMemCpy
(
LoDTensor
*
tensor
,
LoDTensor
*
root_tensor
,
const
paddle
::
platform
::
Place
&
thread_place
,
cudaStream_t
stream
);
#endif
#ifdef PADDLE_WITH_XPU
void
HeterMemCpy
(
LoDTensor
*
thread_tensor
,
LoDTensor
*
root_tensor
,
const
paddle
::
platform
::
Place
&
thread_place
);
#endif
void
CreateThreadParam
(
const
ProgramDesc
&
program
,
int
num
);
template
<
typename
T
>
void
MergeToRootScope
(
LoDTensor
*
root_tensor
,
LoDTensor
*
thread_tensor
);
...
...
@@ -207,9 +218,11 @@ class HeterXpuTrainer : public TrainerBase {
std
::
vector
<
std
::
string
>
op_names_
;
std
::
vector
<
Scope
*>
place_scopes_
;
BtObjectPool
<
HeterServiceContext
>
object_pool_
;
std
::
vector
<
cudaStream_t
>
copy_streams_
;
std
::
vector
<
platform
::
Place
>
places_
;
#ifdef PADDLE_WITH_CUDA
std
::
vector
<
cudaStream_t
>
copy_streams_
;
std
::
vector
<
cudaEvent_t
>
events_
;
#endif
};
#endif
...
...
paddle/fluid/framework/trainer_factory.cc
浏览文件 @
d0c9f7e8
...
...
@@ -63,7 +63,8 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
REGISTER_TRAINER_CLASS
(
MultiTrainer
);
REGISTER_TRAINER_CLASS
(
DistMultiTrainer
);
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS
(
HeterXpuTrainer
);
#endif
#if defined(PADDLE_WITH_NCCL)
...
...
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
d0c9f7e8
...
...
@@ -132,9 +132,17 @@ if(NOT APPLE AND WITH_MKLML)
# seq_pool1
set
(
SEQ_POOL1_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/seq_pool"
)
download_model_and_data
(
${
SEQ_POOL1_INSTALL_DIR
}
"seq_pool1_model_.tar.gz"
"seq_pool1_data.txt.tar.gz"
)
inference_analysis_api_test
(
test_analyzer_seq_pool1
${
SEQ_POOL1_INSTALL_DIR
}
analyzer_seq_pool1_tester.cc
)
inference_analysis_api_test
(
test_analyzer_seq_pool1_compare_determine
${
SEQ_POOL1_INSTALL_DIR
}
analyzer_seq_pool1_compare_determine_tester.cc
)
inference_analysis_api_test
(
test_analyzer_seq_pool1
${
SEQ_POOL1_INSTALL_DIR
}
analyzer_seq_pool1_compare_tester.cc
)
inference_analysis_api_test
(
test_analyzer_seq_pool1_fuse_compare_zero_copy
${
SEQ_POOL1_INSTALL_DIR
}
analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
)
inference_analysis_api_test
(
test_analyzer_seq_pool1_fuse_statis
${
SEQ_POOL1_INSTALL_DIR
}
analyzer_seq_pool1_fuse_statis_tester.cc
)
inference_analysis_api_test
(
test_analyzer_seq_pool1_profile
${
SEQ_POOL1_INSTALL_DIR
}
analyzer_seq_pool1_profile_tester.cc
)
if
(
NOT WIN32
)
set_tests_properties
(
test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150
)
set_tests_properties
(
test_analyzer_seq_pool1_compare_determine PROPERTIES TIMEOUT 120
)
set_tests_properties
(
test_analyzer_seq_pool1 PROPERTIES TIMEOUT 120
)
set_tests_properties
(
test_analyzer_seq_pool1_fuse_compare_zero_copy PROPERTIES TIMEOUT 120
)
set_tests_properties
(
test_analyzer_seq_pool1_fuse_statis PROPERTIES TIMEOUT 120
)
set_tests_properties
(
test_analyzer_seq_pool1_profile PROPERTIES TIMEOUT 120
)
endif
()
else
()
# TODO: fix this test on MACOS and OPENBLAS, the reason is that
...
...
@@ -215,7 +223,15 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
# transformer, the dataset only works on batch_size=8 now
set
(
TRANSFORMER_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/transformer"
)
download_model_and_data
(
${
TRANSFORMER_INSTALL_DIR
}
"temp/transformer_model.tar.gz"
"temp/transformer_data.txt.tar.gz"
)
inference_analysis_test
(
test_analyzer_transformer SRCS analyzer_transformer_tester.cc
inference_analysis_test
(
test_analyzer_transformer SRCS analyzer_transformer_compare_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
TRANSFORMER_INSTALL_DIR
}
/model --infer_data=
${
TRANSFORMER_INSTALL_DIR
}
/data.txt --batch_size=8
--cpu_num_threads=
${
CPU_NUM_THREADS_ON_CI
}
)
inference_analysis_test
(
test_analyzer_transformer_fuse SRCS analyzer_transformer_fuse_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
TRANSFORMER_INSTALL_DIR
}
/model --infer_data=
${
TRANSFORMER_INSTALL_DIR
}
/data.txt --batch_size=8
--cpu_num_threads=
${
CPU_NUM_THREADS_ON_CI
}
)
inference_analysis_test
(
test_analyzer_transformer_profile SRCS analyzer_transformer_profile_tester.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
TRANSFORMER_INSTALL_DIR
}
/model --infer_data=
${
TRANSFORMER_INSTALL_DIR
}
/data.txt --batch_size=8
--cpu_num_threads=
${
CPU_NUM_THREADS_ON_CI
}
)
...
...
paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_determine_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace seq_pool1_tester {

// Runs the seq_pool1 model through CompareDeterministic, using the
// configuration and inputs prepared by the shared tester helpers.
TEST(Analyzer_seq_pool1_compare_determine, compare_determine) {
  AnalysisConfig analysis_config;
  SetConfig(&analysis_config);

  std::vector<std::vector<PaddleTensor>> batches;
  SetInput(&batches);

  // The comparison helper takes the configuration through the generic
  // PaddlePredictor::Config pointer type.
  const auto *generic_config =
      reinterpret_cast<const PaddlePredictor::Config *>(&analysis_config);
  CompareDeterministic(generic_config, batches);
}

}  // namespace seq_pool1_tester
}  // namespace analysis
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/tests/api/analyzer_seq_pool1_compare_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace seq_pool1_tester {

// Runs the seq_pool1 model through CompareNativeAndAnalysis, using the
// configuration and inputs prepared by the shared tester helpers.
TEST(Analyzer_seq_pool1_compare, compare) {
  AnalysisConfig analysis_config;
  SetConfig(&analysis_config);

  std::vector<std::vector<PaddleTensor>> batches;
  SetInput(&batches);

  // The comparison helper takes the configuration through the generic
  // PaddlePredictor::Config pointer type.
  const auto *generic_config =
      reinterpret_cast<const PaddlePredictor::Config *>(&analysis_config);
  CompareNativeAndAnalysis(generic_config, batches);
}

}  // namespace seq_pool1_tester
}  // namespace analysis
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace seq_pool1_tester {

// Compares the AnalysisConfig result against AnalysisConfig + ZeroCopy.
// Two configurations are built the same way and handed to
// CompareAnalysisAndZeroCopy along with the output variable to fetch.
TEST(Analyzer_seq_pool1_compare_zero_copy, compare_zero_copy) {
  AnalysisConfig analysis_config;
  SetConfig(&analysis_config);

  AnalysisConfig zero_copy_config;
  SetConfig(&zero_copy_config);

  std::vector<std::vector<PaddleTensor>> batches;
  SetInput(&batches);

  // Single fetch target: the output variable named by the helper header.
  std::vector<std::string> fetch_names{std::string(out_var_name)};

  auto *generic_config =
      reinterpret_cast<PaddlePredictor::Config *>(&analysis_config);
  auto *generic_zero_copy_config =
      reinterpret_cast<PaddlePredictor::Config *>(&zero_copy_config);
  CompareAnalysisAndZeroCopy(generic_config, generic_zero_copy_config, batches,
                             fetch_names);
}

}  // namespace seq_pool1_tester
}  // namespace analysis
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace seq_pool1_tester {

// Verifies the fusion statistics reported for the seq_pool1 model: the four
// expected fuse entries must be present with the expected counts, and the
// resulting graph must contain 171 ops.
TEST(Analyzer_seq_pool1_fuse_statis, fuse_statis) {
  AnalysisConfig analysis_config;
  SetConfig(&analysis_config);

  auto predictor = CreatePaddlePredictor<AnalysisConfig>(analysis_config);

  // num_ops is filled in through the out-parameter of GetFuseStatis.
  int num_ops;
  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);

  // Presence checks come first (fatal), so the .at() lookups below are safe.
  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
  ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse"));
  ASSERT_TRUE(fuse_statis.count("squared_mat_sub_fuse"));
  ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse"));

  // Expected number of times each fusion fired.
  ASSERT_EQ(fuse_statis.at("fc_fuse"), 10);
  EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2);
  EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2);
  EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2);

  LOG(INFO) << "num_ops: " << num_ops;
  EXPECT_EQ(num_ops, 171);
}

}  // namespace seq_pool1_tester
}  // namespace analysis
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/tests/api/analyzer_seq_pool1_profile_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
namespace
seq_pool1_tester
{
void
profile
(
bool
use_mkldnn
=
false
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
,
use_mkldnn
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
}
TEST
(
Analyzer_seq_pool1_profile
,
profile
)
{
profile
();
}
}
// namespace seq_pool1_tester
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester
.cc
→
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester
_helper.h
浏览文件 @
d0c9f7e8
...
...
@@ -11,15 +11,20 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
namespace
seq_pool1_tester
{
// diff: similarity_norm.tmp_0, for speed: fc_4.tmp_1
static
const
char
out_var_name
[]
=
"reduce_sum_0.tmp_0"
;
...
...
@@ -164,77 +169,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
cfg
->
pass_builder
()
->
InsertPass
(
2
,
"seqpool_concat_fuse_pass"
);
}
void
profile
(
bool
use_mkldnn
=
false
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
,
use_mkldnn
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
outputs
;
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
}
TEST
(
Analyzer_seq_pool1
,
profile
)
{
profile
();
}
// Compare result of NativeConfig and AnalysisConfig
TEST
(
Analyzer_seq_pool1
,
compare
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
// Compare Deterministic result
TEST
(
Analyzer_seq_pool1
,
compare_determine
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
CompareDeterministic
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
// Check the fuse status
TEST
(
Analyzer_seq_pool1
,
fuse_statis
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
int
num_ops
;
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
fuse_statis
=
GetFuseStatis
(
predictor
.
get
(),
&
num_ops
);
ASSERT_TRUE
(
fuse_statis
.
count
(
"fc_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"seqpool_concat_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"squared_mat_sub_fuse"
));
ASSERT_TRUE
(
fuse_statis
.
count
(
"repeated_fc_relu_fuse"
));
ASSERT_EQ
(
fuse_statis
.
at
(
"fc_fuse"
),
10
);
EXPECT_EQ
(
fuse_statis
.
at
(
"seqpool_concat_fuse"
),
2
);
EXPECT_EQ
(
fuse_statis
.
at
(
"squared_mat_sub_fuse"
),
2
);
EXPECT_EQ
(
fuse_statis
.
at
(
"repeated_fc_relu_fuse"
),
2
);
LOG
(
INFO
)
<<
"num_ops: "
<<
num_ops
;
EXPECT_EQ
(
num_ops
,
171
);
}
// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
TEST
(
Analyzer_seq_pool1
,
compare_zero_copy
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
AnalysisConfig
cfg1
;
SetConfig
(
&
cfg1
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
std
::
vector
<
std
::
string
>
outputs_name
;
outputs_name
.
emplace_back
(
out_var_name
);
CompareAnalysisAndZeroCopy
(
reinterpret_cast
<
PaddlePredictor
::
Config
*>
(
&
cfg
),
reinterpret_cast
<
PaddlePredictor
::
Config
*>
(
&
cfg1
),
input_slots_all
,
outputs_name
);
}
}
// namespace seq_pool1_tester
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_transformer_compare_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
namespace
transformer_tester
{
void
compare
(
bool
use_mkldnn
=
false
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
if
(
use_mkldnn
)
{
cfg
.
EnableMKLDNN
();
cfg
.
pass_builder
()
->
AppendPass
(
"fc_mkldnn_pass"
);
}
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
TEST
(
Analyzer_Transformer
,
compare
)
{
compare
();
}
#ifdef PADDLE_WITH_MKLDNN
TEST
(
Analyzer_Transformer
,
compare_mkldnn
)
{
compare
(
true
/* use_mkldnn */
);
}
#endif
}
// namespace transformer_tester
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_transformer_fuse_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace transformer_tester {

// Builds an analysis predictor for the transformer model and queries its
// fusion statistics. The test makes no assertions on the result; it only
// exercises predictor creation and the GetFuseStatis call.
TEST(Analyzer_Transformer, fuse_statis) {
  AnalysisConfig analysis_config;
  SetConfig(&analysis_config);

  auto predictor = CreatePaddlePredictor<AnalysisConfig>(analysis_config);
  auto *analysis_predictor = static_cast<AnalysisPredictor *>(predictor.get());

  // num_ops is filled in through the out-parameter of GetFuseStatis.
  int num_ops;
  auto fuse_statis = GetFuseStatis(analysis_predictor, &num_ops);
}

}  // namespace transformer_tester
}  // namespace analysis
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/tests/api/analyzer_transformer_profile_tester.cc
0 → 100644
浏览文件 @
d0c9f7e8
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
namespace
transformer_tester
{
void
profile
(
bool
use_mkldnn
=
false
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
outputs
;
if
(
use_mkldnn
)
{
cfg
.
EnableMKLDNN
();
cfg
.
pass_builder
()
->
AppendPass
(
"fc_mkldnn_pass"
);
}
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
}
TEST
(
Analyzer_Transformer
,
profile
)
{
profile
();
}
#ifdef PADDLE_WITH_MKLDNN
TEST
(
Analyzer_Transformer
,
profile_mkldnn
)
{
profile
(
true
);
}
#endif
}
// namespace transformer_tester
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/analyzer_transformer_tester
.cc
→
paddle/fluid/inference/tests/api/analyzer_transformer_tester
_helper.h
浏览文件 @
d0c9f7e8
...
...
@@ -11,11 +11,16 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
analysis
{
namespace
transformer_tester
{
struct
DataRecord
{
std
::
vector
<
std
::
vector
<
int64_t
>>
src_word
,
src_pos
,
trg_word
,
init_idx
;
...
...
@@ -182,57 +187,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
}
}
// Easy for profiling independently.
void
profile
(
bool
use_mkldnn
=
false
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
outputs
;
if
(
use_mkldnn
)
{
cfg
.
EnableMKLDNN
();
cfg
.
pass_builder
()
->
AppendPass
(
"fc_mkldnn_pass"
);
}
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
TestPrediction
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
,
&
outputs
,
FLAGS_num_threads
);
}
TEST
(
Analyzer_Transformer
,
profile
)
{
profile
();
}
#ifdef PADDLE_WITH_MKLDNN
TEST
(
Analyzer_Transformer
,
profile_mkldnn
)
{
profile
(
true
);
}
#endif
// Check the fuse status
TEST
(
Analyzer_Transformer
,
fuse_statis
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
int
num_ops
;
auto
predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
cfg
);
auto
fuse_statis
=
GetFuseStatis
(
static_cast
<
AnalysisPredictor
*>
(
predictor
.
get
()),
&
num_ops
);
}
// Compare result of NativeConfig and AnalysisConfig
void
compare
(
bool
use_mkldnn
=
false
)
{
AnalysisConfig
cfg
;
SetConfig
(
&
cfg
);
if
(
use_mkldnn
)
{
cfg
.
EnableMKLDNN
();
cfg
.
pass_builder
()
->
AppendPass
(
"fc_mkldnn_pass"
);
}
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInput
(
&
input_slots_all
);
CompareNativeAndAnalysis
(
reinterpret_cast
<
const
PaddlePredictor
::
Config
*>
(
&
cfg
),
input_slots_all
);
}
TEST
(
Analyzer_Transformer
,
compare
)
{
compare
();
}
#ifdef PADDLE_WITH_MKLDNN
TEST
(
Analyzer_Transformer
,
compare_mkldnn
)
{
compare
(
true
/* use_mkldnn */
);
}
#endif
}
// namespace transformer_tester
}
// namespace analysis
}
// namespace inference
}
// namespace paddle
paddle/fluid/memory/allocation/retry_allocator_test.cc
浏览文件 @
d0c9f7e8
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include <algorithm>
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
...
...
@@ -20,6 +21,7 @@
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
...
...
@@ -45,7 +47,7 @@ TEST(RetryAllocator, RetryAllocator) {
size_t
thread_num
=
4
;
size_t
sleep_time
=
40
;
size_t
extra_time
=
1
0
;
size_t
extra_time
=
2
0
;
// Reserve to perform more tests in the future
std
::
vector
<
std
::
shared_ptr
<
Allocator
>>
allocators
;
...
...
paddle/fluid/operators/activation_op.cc
浏览文件 @
d0c9f7e8
...
...
@@ -763,10 +763,28 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
}
};
//
// AbsGrad: dx = dy if x >= 0 else -dy
// AbsDoubleGrad: ddy = ddx if x >= 0 else -ddx
// Grad-op maker that describes the "abs_grad_grad" operator.
// T is the op representation passed through to SingleGradOpMaker<T>; this
// file instantiates it with framework::OpDesc and imperative::OpBase.
template <typename T>
class AbsDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
 public:
  // Inherit the base-class constructors unchanged.
  using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;

 protected:
  // Fills in `op` as an abs_grad_grad op: inputs "X" and "DDX", output
  // "DDOut", with the attribute map copied from this maker's Attrs().
  void Apply(GradOpPtr<T> op) const override {
    op->SetType("abs_grad_grad");
    // input1: x
    op->SetInput("X", this->Input("X"));
    // input2: ddx
    op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
    op->SetAttrMap(this->Attrs());
    // output: ddy
    op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
  }
};
// ReluGrad: dx = dy if y >= 0 else 0
// ReluGradGrad: ddy = ddx if y >= 0 else 0
//
template
<
typename
T
>
class
ReluDoubleGradMaker
:
public
::
paddle
::
framework
::
SingleGradOpMaker
<
T
>
{
public:
...
...
@@ -1214,7 +1232,13 @@ REGISTER_OPERATOR(
std
::
conditional
<
ops
::
CanInplaceAct
<
ops
::
AbsGradFunctor
<
float
>>
(),
ops
::
ActFwdInplaceInferer
,
void
>::
type
);
REGISTER_OPERATOR
(
abs_grad
,
ops
::
ActivationOpGrad
,
ops
::
ActivationGradOpInplaceInferer
);
ops
::
ActivationGradOpInplaceInferer
,
ops
::
AbsDoubleGradMaker
<
paddle
::
framework
::
OpDesc
>
,
ops
::
AbsDoubleGradMaker
<
paddle
::
imperative
::
OpBase
>
);
// Registers the abs_grad_grad operator. Its op class is
// ActivationOpDoubleGrad parameterized by the forward dependencies reported
// by AbsGradGradFunctor<float>::FwdDeps(), together with the double-grad
// in-place inferer.
REGISTER_OPERATOR(
    abs_grad_grad,
    ops::ActivationOpDoubleGrad<ops::AbsGradGradFunctor<float>::FwdDeps()>,
    ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL
(
abs
,
ops
::
ActivationKernel
<
paddle
::
platform
::
CPUDeviceContext
,
...
...
@@ -1234,6 +1258,18 @@ REGISTER_OP_CPU_KERNEL(
ops
::
AbsGradFunctor
<
int
>>
,
ops
::
ActivationGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
ops
::
AbsGradFunctor
<
int64_t
>>
);
// Register the CPU kernels of abs_grad_grad, one ActivationDoubleGradKernel
// instantiation per element type: float, double, float16, int and int64_t.
REGISTER_OP_CPU_KERNEL(
    abs_grad_grad,
    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
                                    ops::AbsGradGradFunctor<float>>,
    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
                                    ops::AbsGradGradFunctor<double>>,
    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
                                    ops::AbsGradGradFunctor<plat::float16>>,
    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
                                    ops::AbsGradGradFunctor<int>>,
    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
                                    ops::AbsGradGradFunctor<int64_t>>);
/* ========================================================================== */
/* ========================== register checkpoint ===========================*/
...
...
paddle/fluid/operators/activation_op.cu
浏览文件 @
d0c9f7e8
...
...
@@ -160,7 +160,7 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
ExpGradFunctor
<
plat
::
float16
>>
);
/* ========================================================================== */
/* ========================== abs register ============================ */
REGISTER_OP_CUDA_KERNEL
(
abs
,
ops
::
ActivationKernel
<
plat
::
CUDADeviceContext
,
ops
::
AbsFunctor
<
float
>>
,
...
...
@@ -180,4 +180,16 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
AbsGradFunctor
<
int64_t
>>
,
ops
::
ActivationGradKernel
<
plat
::
CUDADeviceContext
,
ops
::
AbsGradFunctor
<
plat
::
float16
>>
);
// Register the CUDA kernels of abs_grad_grad, one ActivationDoubleGradKernel
// instantiation per element type: float, double, float16, int and int64_t.
// The device context is spelled through the `plat` alias in every entry
// (the original mixed `paddle::platform::` and `plat::` in one argument list),
// matching the other kernel registrations in this file.
REGISTER_OP_CUDA_KERNEL(
    abs_grad_grad,
    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
                                    ops::AbsGradGradFunctor<float>>,
    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
                                    ops::AbsGradGradFunctor<double>>,
    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
                                    ops::AbsGradGradFunctor<plat::float16>>,
    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
                                    ops::AbsGradGradFunctor<int>>,
    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
                                    ops::AbsGradGradFunctor<int64_t>>);
/* ========================================================================== */
paddle/fluid/operators/activation_op.h
浏览文件 @
d0c9f7e8
...
...
@@ -1430,6 +1430,27 @@ class ActivationDoubleGradKernel
}
};
// Second-order gradient functor of abs: DDOut = DDX * sign(X), element-wise
// over the flattened tensors.
template <typename T>
struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
  // Computes DDOut from X and DDX on device `dev`.
  //
  // X and ddX are both passed through GET_DATA_SAFELY (DDX first, then X)
  // before the output pointer is examined. When ddOut is null nothing is
  // written. Out, dOut and dX are part of the signature but are neither read
  // nor written here.
  template <typename Device>
  void operator()(const Device& dev, const framework::Tensor* X,
                  const framework::Tensor* Out, const framework::Tensor* ddX,
                  framework::Tensor* ddOut, framework::Tensor* dOut,
                  framework::Tensor* dX) const {
    auto* eigen_dev = dev.eigen_device();
    auto ddx_vec = framework::EigenVector<T>::Flatten(
        GET_DATA_SAFELY(ddX, "Input", "DDX", "AbsGradGrad"));
    auto x_vec = framework::EigenVector<T>::Flatten(
        GET_DATA_SAFELY(X, "Input", "X", "AbsGradGrad"));

    // No output requested: nothing left to do.
    if (!ddOut) {
      return;
    }

    auto ddout_vec = framework::EigenVector<T>::Flatten(
        GET_DATA_SAFELY(ddOut, "Output", "DDOut", "AbsGradGrad"));
    ddout_vec.device(*eigen_dev) = ddx_vec * x_vec.sign();
  }

  // This functor only depends on the forward input X.
  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
};
template
<
typename
T
>
struct
ReluGradGradFunctor
:
public
BaseActivationFunctor
<
T
>
{
template
<
typename
Device
>
...
...
paddle/fluid/operators/batch_norm_op.cc
浏览文件 @
d0c9f7e8
...
...
@@ -839,6 +839,7 @@ void BatchNormDoubleGradMaker<T>::Apply(GradOpPtr<T> op) const {
op
->
SetInput
(
"SavedMean"
,
this
->
Input
(
"SavedMean"
));
op
->
SetInput
(
"SavedVariance"
,
this
->
Input
(
"SavedVariance"
));
if
(
BOOST_GET_CONST
(
bool
,
this
->
GetAttr
(
"use_global_stats"
)))
{
op
->
SetInput
(
"Mean"
,
this
->
Input
(
"Mean"
));
op
->
SetInput
(
"Variance"
,
this
->
Input
(
"Variance"
));
}
op
->
SetInput
(
"DDX"
,
this
->
OutputGrad
(
framework
::
GradVarName
(
"X"
)));
...
...
@@ -868,14 +869,19 @@ void BatchNormDoubleGradOp::InferShape(
"BatchNormDoubleGrad"
);
}
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"DDX"
),
"Input"
,
"DDX"
,
"BatchNormDoubleGrad"
);
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"DY"
),
"Input"
,
"DY"
,
"BatchNormDoubleGrad"
);
// check output
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"DX"
),
"Output"
,
"DX"
,
"BatchNormDoubleGrad"
);
const
auto
x_dims
=
ctx
->
GetInputDim
(
"X"
);
const
int
C
=
x_dims
[
1
];
const
DataLayout
data_layout
=
framework
::
StringToDataLayout
(
ctx
->
Attrs
().
Get
<
std
::
string
>
(
"data_layout"
));
const
int
C
=
((
this
->
IsMKLDNNType
()
==
true
)
||
(
data_layout
==
DataLayout
::
kNCHW
)
?
x_dims
[
1
]
:
x_dims
[
x_dims
.
size
()
-
1
]);
if
(
ctx
->
HasOutput
(
"DX"
))
{
ctx
->
SetOutputDim
(
"DX"
,
x_dims
);
}
...
...
@@ -957,7 +963,9 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
Tensor
inv_var_tensor
;
if
(
use_global_stats
)
{
const
auto
*
running_mean
=
ctx
.
Input
<
Tensor
>
(
"Mean"
);
const
auto
*
running_variance
=
ctx
.
Input
<
Tensor
>
(
"Variance"
);
mean_data
=
running_mean
->
data
<
T
>
();
inv_var_tensor
.
Resize
({
C
});
T
*
running_inv_var_data
=
inv_var_tensor
.
mutable_data
<
T
>
(
ctx
.
GetPlace
());
...
...
@@ -1077,12 +1085,12 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
// (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
// np.sum(dy,
// axis=(n,h,w)) * (x - mean) *
// (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var -
// inv_var
// *
// np.mean(dy, axis=(n,h,w)) -
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// axis=(n,h,w)))
if
(
ddX
)
{
dx_arr
+=
...
...
@@ -1176,7 +1184,8 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
C
,
sample_size
);
ddy_arr
.
setZero
();
if
(
use_global_stats
)
{
// math: ddy = r * ddx * inv_var + ddbias +
// ddscale * (x - mean) * inv_var
if
(
ddX
)
{
ddy_arr
=
scale_tile_data
*
ddx_arr
*
inv_var_tile_data
;
}
...
...
@@ -1196,25 +1205,29 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
.
replicate
(
1
,
sample_size
)
/
sample_size
);
}
if
(
ddScale
&&
ddBias
)
{
ConstEigenVectorArrayMap
<
T
>
ddscale_arr
(
ddScale
->
data
<
T
>
(),
C
);
Tensor
ddscale_tile
;
ddscale_tile
.
Resize
({
C
,
sample_size
});
EigenArrayMap
<
T
>
ddscale_tile_data
(
ddscale_tile
.
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
C
,
sample_size
);
ddscale_tile_data
=
ddscale_arr
.
replicate
(
1
,
sample_size
);
}
if
(
ddScale
)
{
ConstEigenVectorArrayMap
<
T
>
ddscale_arr
(
ddScale
->
data
<
T
>
(),
C
);
Tensor
ddscale_tile
;
ddscale_tile
.
Resize
({
C
,
sample_size
});
EigenArrayMap
<
T
>
ddscale_tile_data
(
ddscale_tile
.
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
C
,
sample_size
);
ddscale_tile_data
=
ddscale_arr
.
replicate
(
1
,
sample_size
);
ddy_arr
+=
x_sub_mean_mul_invstd_arr
*
ddscale_tile_data
;
}
ConstEigenVectorArrayMap
<
T
>
ddbias_arr
(
ddBias
->
data
<
T
>
(),
C
);
Tensor
ddbias_tile
;
ddbias_tile
.
Resize
({
C
,
sample_size
});
EigenArrayMap
<
T
>
ddbias_tile_data
(
ddbias_tile
.
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
C
,
sample_size
);
ddbias_tile_data
=
ddbias_arr
.
replicate
(
1
,
sample_size
);
if
(
ddBias
)
{
ConstEigenVectorArrayMap
<
T
>
ddbias_arr
(
ddBias
->
data
<
T
>
(),
C
);
Tensor
ddbias_tile
;
ddbias_tile
.
Resize
({
C
,
sample_size
});
EigenArrayMap
<
T
>
ddbias_tile_data
(
ddbias_tile
.
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
C
,
sample_size
);
ddbias_tile_data
=
ddbias_arr
.
replicate
(
1
,
sample_size
);
ddy_arr
+=
x_sub_mean_mul_invstd_arr
*
ddscale_tile_data
;
ddy_arr
+=
ddbias_tile_data
;
}
ddy_arr
+=
ddbias_tile_data
;
}
if
(
data_layout
==
DataLayout
::
kNCHW
)
{
VLOG
(
3
)
<<
"Transform batchnorm output from NHWC to NCHW"
;
TransToChannelFirst
<
paddle
::
platform
::
CPUDeviceContext
,
T
>
(
...
...
paddle/fluid/operators/benchmark/op_tester.cc
浏览文件 @
d0c9f7e8
...
...
@@ -47,8 +47,8 @@ void OpTester::Init(const OpTesterConfig &config) {
CreateInputVarDesc
();
CreateOutputVarDesc
();
}
else
{
PADDLE_THROW
(
platform
::
errors
::
NotFound
(
"Operator '%s' is not registered."
,
config_
.
op_type
));
PADDLE_THROW
(
platform
::
errors
::
NotFound
(
"Operator '%s' is not registered in OpTester."
,
config_
.
op_type
));
}
if
(
config_
.
device_id
>=
0
)
{
...
...
@@ -81,7 +81,8 @@ void OpTester::Run() {
platform
::
EnableProfiler
(
platform
::
ProfilerState
::
kAll
);
platform
::
SetDeviceId
(
config_
.
device_id
);
#else
PADDLE_THROW
(
"'CUDAPlace' is not supported in CPU only device."
);
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"'CUDAPlace' is not supported in CPU only device."
));
#endif
}
...
...
@@ -162,7 +163,8 @@ framework::proto::VarType::Type OpTester::TransToVarType(std::string str) {
}
else
if
(
str
==
"fp64"
)
{
return
framework
::
proto
::
VarType
::
FP64
;
}
else
{
PADDLE_THROW
(
"Unsupported dtype %s."
,
str
.
c_str
());
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupported dtype %s in OpTester."
,
str
.
c_str
()));
}
}
...
...
@@ -233,8 +235,8 @@ void OpTester::CreateOpDesc() {
case
framework
::
proto
::
AttrType
::
INTS
:
case
framework
::
proto
::
AttrType
::
FLOATS
:
case
framework
::
proto
::
AttrType
::
STRINGS
:
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Not supported STRINGS type
yet."
));
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupported STRINGS type in OpTester
yet."
));
break
;
case
framework
::
proto
::
AttrType
::
LONG
:
{
int64_t
value
=
StringTo
<
int64_t
>
(
value_str
);
...
...
@@ -242,7 +244,8 @@ void OpTester::CreateOpDesc() {
}
break
;
case
framework
::
proto
::
AttrType
::
LONGS
:
default:
PADDLE_THROW
(
"Unsupport attr type %d"
,
type
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupport attr type %d in OpTester."
,
type
));
}
}
}
...
...
@@ -299,7 +302,8 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor,
}
is
.
close
();
}
else
{
PADDLE_THROW
(
"Unsupported initializer %s."
,
initializer
.
c_str
());
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupported initializer %s in OpTester."
,
initializer
.
c_str
()));
}
if
(
!
platform
::
is_cpu_place
(
place_
))
{
...
...
@@ -351,7 +355,8 @@ void OpTester::CreateVariables(framework::Scope *scope) {
static_cast
<
double
>
(
1.0
),
item
.
second
.
initializer
,
item
.
second
.
filename
);
}
else
{
PADDLE_THROW
(
"Unsupported dtype %d."
,
data_type
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupported dtype %d in OpTester."
,
data_type
));
}
VLOG
(
3
)
<<
"Set lod for tensor "
<<
var_name
;
...
...
@@ -473,7 +478,8 @@ std::string OpTester::DebugString() {
<<
"
\n
"
;
}
break
;
default:
PADDLE_THROW
(
"Unsupport attr type %d"
,
attr_type
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupport attr type %d in OpTester."
,
attr_type
));
}
ss
<<
GenSpaces
(
--
count
)
<<
"}
\n
"
;
}
...
...
@@ -484,8 +490,10 @@ std::string OpTester::DebugString() {
TEST
(
op_tester
,
base
)
{
if
(
!
FLAGS_op_config_list
.
empty
())
{
std
::
ifstream
fin
(
FLAGS_op_config_list
,
std
::
ios
::
in
|
std
::
ios
::
binary
);
PADDLE_ENFORCE
(
static_cast
<
bool
>
(
fin
),
"Cannot open file %s"
,
FLAGS_op_config_list
.
c_str
());
PADDLE_ENFORCE_EQ
(
static_cast
<
bool
>
(
fin
),
true
,
platform
::
errors
::
InvalidArgument
(
"OpTester cannot open file %s"
,
FLAGS_op_config_list
.
c_str
()));
std
::
vector
<
OpTesterConfig
>
op_configs
;
while
(
!
fin
.
eof
())
{
VLOG
(
4
)
<<
"Reading config "
<<
op_configs
.
size
()
<<
"..."
;
...
...
paddle/fluid/operators/benchmark/op_tester_config.cc
浏览文件 @
d0c9f7e8
...
...
@@ -78,7 +78,8 @@ void OpInputConfig::ParseDType(std::istream& is) {
}
else
if
(
dtype_str
==
"fp64"
||
dtype_str
==
"double"
)
{
dtype
=
"fp64"
;
}
else
{
PADDLE_THROW
(
"Unsupported dtype %s"
,
dtype_str
.
c_str
());
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupported dtype %s in OpInputConfig."
,
dtype_str
.
c_str
()));
}
VLOG
(
4
)
<<
"dtype of input "
<<
name
<<
" is: "
<<
dtype
;
}
...
...
@@ -91,7 +92,9 @@ void OpInputConfig::ParseInitializer(std::istream& is) {
const
std
::
vector
<
std
::
string
>
supported_initializers
=
{
"random"
,
"natural"
,
"zeros"
,
"file"
};
if
(
!
Has
(
supported_initializers
,
initializer_str
))
{
PADDLE_THROW
(
"Unsupported initializer %s"
,
initializer_str
.
c_str
());
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Unsupported initializer %s in OpInputConfig."
,
initializer_str
.
c_str
()));
}
initializer
=
initializer_str
;
...
...
@@ -126,7 +129,12 @@ void OpInputConfig::ParseLoD(std::istream& is) {
}
}
EraseEndSep
(
&
lod_str
);
PADDLE_ENFORCE_GE
(
lod_str
.
length
(),
4U
);
PADDLE_ENFORCE_GE
(
lod_str
.
length
(),
4U
,
platform
::
errors
::
InvalidArgument
(
"The length of lod string should be "
"equal to or larger than 4. But length of lod string is %zu."
,
lod_str
.
length
()));
VLOG
(
4
)
<<
"lod: "
<<
lod_str
<<
", length: "
<<
lod_str
.
length
();
// Parse the lod_str
...
...
@@ -153,8 +161,10 @@ void OpInputConfig::ParseLoD(std::istream& is) {
// Builds an OpTesterConfig from the file at `filename`.
//
// The file is opened in binary mode and handed to Init(). If it cannot be
// opened, a single InvalidArgument error naming the file is raised. The
// legacy raw-string PADDLE_ENFORCE that used to run first, and would have
// masked this typed error, is removed.
OpTesterConfig::OpTesterConfig(const std::string& filename) {
  std::ifstream fin(filename, std::ios::in | std::ios::binary);
  PADDLE_ENFORCE_EQ(static_cast<bool>(fin), true,
                    platform::errors::InvalidArgument(
                        "OpTesterConfig cannot open file %s.",
                        filename.c_str()));
  Init(fin);
}
...
...
paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
浏览文件 @
d0c9f7e8
...
...
@@ -54,8 +54,10 @@ class CSyncCommStreamOp : public framework::OperatorBase {
class
CSyncCommStreamOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
AddInput
(
"X"
,
"(Tensor) Dependency of the variable need to sync"
);
AddOutput
(
"Out"
,
"(Tensor) Dependency of the variable need to sync"
);
AddInput
(
"X"
,
"(Tensor) Dependency of the variable need to sync"
)
.
AsDuplicable
();
AddOutput
(
"Out"
,
"(Tensor) Dependency of the variable need to sync"
)
.
AsDuplicable
();
AddAttr
<
int
>
(
"ring_id"
,
"(int default 0) ring id."
).
SetDefault
(
0
);
AddComment
(
R"DOC(
CSyncCommStream Operator
...
...
paddle/fluid/operators/instance_norm_op.cc
浏览文件 @
d0c9f7e8
...
...
@@ -520,11 +520,11 @@ class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T>
// (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW *
// np.sum(dy,
// axis=(h,w)) * (x - mean) *
// (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var -
// inv_var *
// np.mean(dy, axis=(h,w)) -
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// axis=(h,w)))
Tensor
x_sub_mean_mul_invstd
;
x_sub_mean_mul_invstd
.
Resize
({
sample_size
,
NxC
});
...
...
paddle/fluid/operators/jit/benchmark.cc
浏览文件 @
d0c9f7e8
...
...
@@ -136,7 +136,6 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) {
}
using
Tensor
=
paddle
::
framework
::
Tensor
;
template
<
typename
KernelTuple
,
typename
PlaceType
>
void
BenchKernelXYZN
()
{
using
T
=
typename
KernelTuple
::
data_type
;
...
...
@@ -320,8 +319,15 @@ void BenchKernelSgd() {
const
T
lr
=
0.1
;
auto
UnDuplicatedRandomVec
=
[](
int
n
,
const
int64_t
lower
,
const
int64_t
upper
)
->
std
::
vector
<
int64_t
>
{
PADDLE_ENFORCE_LE
(
static_cast
<
size_t
>
(
upper
-
lower
),
n
-
1
);
PADDLE_ENFORCE_GT
(
n
,
0
);
PADDLE_ENFORCE_LE
(
static_cast
<
size_t
>
(
upper
-
lower
),
n
-
1
,
paddle
::
platform
::
errors
::
InvalidArgument
(
"The range of Sgd (upper - lower) should be equal to or lower "
"than n-1 (Sgd size -1). But upper - lower is %d and n-1 is %d."
,
static_cast
<
size_t
>
(
upper
-
lower
),
(
n
-
1
)));
PADDLE_ENFORCE_GT
(
n
,
0
,
paddle
::
platform
::
errors
::
InvalidArgument
(
"The Sgd size should be larger than 0. But the n is %d."
,
n
));
std
::
vector
<
int64_t
>
all
,
out
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
all
.
push_back
(
i
);
...
...
paddle/fluid/operators/jit/gen/embseqpool.cc
浏览文件 @
d0c9f7e8
...
...
@@ -132,11 +132,31 @@ class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
}
// Checks the EmbSeqPool attributes and creates the JIT code object for them.
//
// table_height, table_width, index_height, index_width and out_width must
// each be strictly greater than 0; otherwise an InvalidArgument error that
// reports the offending value is raised. Each attribute is checked exactly
// once: the earlier message-less duplicates fired first and hid these
// descriptive errors, so they are removed.
std::unique_ptr<GenBase> CreateJitCode(
    const emb_seq_pool_attr_t& attr) const override {
  PADDLE_ENFORCE_GT(attr.table_height, 0,
                    platform::errors::InvalidArgument(
                        "The attribute table_height of EmbSeqPool should "
                        "be larger than 0. But it is %d.",
                        attr.table_height));
  PADDLE_ENFORCE_GT(attr.table_width, 0,
                    platform::errors::InvalidArgument(
                        "The attribute table_width of EmbSeqPool should "
                        "be larger than 0. But it is %d.",
                        attr.table_width));
  PADDLE_ENFORCE_GT(attr.index_height, 0,
                    platform::errors::InvalidArgument(
                        "The attribute index_height of EmbSeqPool should "
                        "be larger than 0. But it is %d.",
                        attr.index_height));
  PADDLE_ENFORCE_GT(attr.index_width, 0,
                    platform::errors::InvalidArgument(
                        "The attribute index_width of EmbSeqPool should "
                        "be larger than 0. But it is %d.",
                        attr.index_width));
  PADDLE_ENFORCE_GT(attr.out_width, 0,
                    platform::errors::InvalidArgument(
                        "The attribute out_width of EmbSeqPool should be "
                        "larger than 0. But it is %d.",
                        attr.out_width));
  return make_unique<EmbSeqPoolJitCode>(attr, CodeSize(attr));
}
};
...
...
paddle/fluid/operators/jit/gen/matmul.cc
浏览文件 @
d0c9f7e8
...
...
@@ -29,7 +29,11 @@ void MatMulJitCode::genCode() {
preCode
();
int
block
,
rest
;
const
auto
groups
=
packed_groups
(
n_
,
k_
,
&
block
,
&
rest
);
PADDLE_ENFORCE_GT
(
groups
.
front
(),
0
);
PADDLE_ENFORCE_GT
(
groups
.
front
(),
0
,
platform
::
errors
::
InvalidArgument
(
"The number of rest registers should "
"be larger than 0. But it is %d."
,
groups
.
front
()));
const
int
block_len
=
sizeof
(
float
)
*
block
;
const
int
x_reg_idx
=
(
block
==
ZMM_FLOAT_BLOCK
?
32
:
16
)
-
1
;
...
...
@@ -118,9 +122,21 @@ class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
}
// Checks the MatMul attributes and creates the JIT code object for them.
//
// m, n and k must each be strictly greater than 0; otherwise an
// InvalidArgument error that reports the offending value is raised. Each
// attribute is checked exactly once: the earlier message-less duplicates
// fired first and hid these descriptive errors, so they are removed.
std::unique_ptr<GenBase> CreateJitCode(
    const matmul_attr_t& attr) const override {
  PADDLE_ENFORCE_GT(
      attr.m, 0,
      platform::errors::InvalidArgument(
          "The attribute m (first matrix's row) of MatMul should "
          "be larger than 0. But it is %d.",
          attr.m));
  PADDLE_ENFORCE_GT(
      attr.n, 0,
      platform::errors::InvalidArgument(
          "The attribute n (first matrix's col) of MatMul should "
          "be larger than 0. But it is %d.",
          attr.n));
  PADDLE_ENFORCE_GT(
      attr.k, 0,
      platform::errors::InvalidArgument(
          "The attribute k (second matrix's col) of MatMul should "
          "be larger than 0. But it is %d.",
          attr.k));
  return make_unique<MatMulJitCode>(attr, CodeSize(attr));
}
};
...
...
paddle/fluid/operators/jit/gen/matmul.h
浏览文件 @
d0c9f7e8
...
...
@@ -33,7 +33,10 @@ class MatMulJitCode : public JitCode {
size_t
code_size
=
256
*
1024
,
void
*
code_ptr
=
nullptr
)
:
JitCode
(
code_size
,
code_ptr
),
m_
(
attr
.
m
),
n_
(
attr
.
n
),
k_
(
attr
.
k
)
{
PADDLE_ENFORCE_EQ
(
m_
,
1
,
"Only support m==1 yet"
);
PADDLE_ENFORCE_EQ
(
m_
,
1
,
platform
::
errors
::
Unimplemented
(
"Jitcode of matmul only support m==1 (first "
"matrix's row) now. But m is %d."
,
m_
));
this
->
genCode
();
}
...
...
paddle/fluid/operators/jit/gen/seqpool.cc
浏览文件 @
d0c9f7e8
...
...
@@ -70,8 +70,14 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
}
// Checks the SeqPool attributes and creates the JIT code object for them.
//
// attr.w and attr.h must each be strictly greater than 0; otherwise an
// InvalidArgument error that reports the offending value is raised. Each
// attribute is checked exactly once: the earlier message-less duplicates
// fired first and hid these descriptive errors, so they are removed.
std::unique_ptr<GenBase> CreateJitCode(
    const seq_pool_attr_t& attr) const override {
  PADDLE_ENFORCE_GT(attr.w, 0,
                    platform::errors::InvalidArgument(
                        "The attribute width of SeqPool should "
                        "be larger than 0. But it is %d.",
                        attr.w));
  PADDLE_ENFORCE_GT(attr.h, 0,
                    platform::errors::InvalidArgument(
                        "The attribute height of SeqPool should "
                        "be larger than 0. But it is %d.",
                        attr.h));
  return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
}
};
...
...
paddle/fluid/operators/jit/gen/seqpool.h
浏览文件 @
d0c9f7e8
...
...
@@ -127,8 +127,13 @@ class SeqPoolJitCode : public JitCode {
vmovss
(
xmm_t
(
reg_idx
+
max_num_regs
),
ptr
[
reg_ptr_src_i
]);
reg_idx
++
;
}
PADDLE_ENFORCE_EQ
(
reg_idx
,
rest_used_num_regs
,
"All heights should use same regs"
);
PADDLE_ENFORCE_EQ
(
reg_idx
,
rest_used_num_regs
,
platform
::
errors
::
InvalidArgument
(
"All heights of SeqPool should use the same number of registers."
"It equals to the numbr of rest registers. But use %d registers "
"and the numbr of rest registers is %d."
,
reg_idx
,
rest_used_num_regs
));
for
(
int
i
=
0
;
i
<
reg_idx
;
++
i
)
{
vaddps
(
xmm_t
(
i
),
xmm_t
(
i
),
xmm_t
(
i
+
max_num_regs
));
}
...
...
paddle/fluid/operators/jit/gen/sgd.cc
浏览文件 @
d0c9f7e8
...
...
@@ -116,9 +116,24 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> {
// Returns the code size passed to SgdJitCode by CreateJitCode. The value is
// the constant 96 + 32 * 8 and does not depend on `attr`.
// NOTE(review): presumably a byte count with 96 as fixed overhead — confirm.
size_t CodeSize(const sgd_attr_t& attr) const override { return 96 + 32 * 8; }
// Checks the Sgd attributes and creates the JIT code object for them.
//
// Requirements, each reported through an InvalidArgument error that includes
// the offending values:
//   - param_width == grad_width
//   - selected_rows_size <= grad_height
//   - selected_rows_size >= 0
// Each condition is checked exactly once: the earlier message-less
// duplicates fired first and hid these descriptive errors, so they are
// removed.
std::unique_ptr<GenBase> CreateJitCode(
    const sgd_attr_t& attr) const override {
  PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width,
                    platform::errors::InvalidArgument(
                        "The attribute param_width of Sgd should be "
                        "equal to the attribute grad_width. But param_width "
                        "is %d and grad_width is %d.",
                        attr.param_width, attr.grad_width));
  PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height,
                    platform::errors::InvalidArgument(
                        "The attribute selected_rows_size of Sgd should be "
                        "equal to or less than the attribute grad_height. "
                        "But selected_rows_size is %d and grad_height is %d.",
                        attr.selected_rows_size, attr.grad_height));
  PADDLE_ENFORCE_GE(
      attr.selected_rows_size, 0,
      platform::errors::InvalidArgument(
          "The attribute selected_rows_size of Sgd should be "
          "equal to or larger than 0. But selected_rows_size is %d.",
          attr.selected_rows_size));
  return make_unique<SgdJitCode>(attr, CodeSize(attr));
}
};
...
...
paddle/fluid/operators/jit/gen/vbroadcast.cc
浏览文件 @
d0c9f7e8
...
...
@@ -76,7 +76,11 @@ class VBroadcastCreator : public JitCodeCreator<int64_t> {
return
96
+
(
w
/
YMM_FLOAT_BLOCK
)
*
16
*
8
;
}
// Checks the VBroadcast width and creates the JIT code object for it.
//
// `w` must be strictly greater than 0; otherwise an InvalidArgument error
// that reports the value is raised. The width is checked exactly once: the
// earlier message-less duplicate fired first and hid this descriptive error,
// so it is removed.
std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
  PADDLE_ENFORCE_GT(
      w, 0,
      platform::errors::InvalidArgument(
          "The width of VBroadcast should be larger than 0. But w is %d.",
          w));
  return make_unique<VBroadcastJitCode>(w, CodeSize(w));
}
};
...
...
paddle/fluid/operators/jit/gen_base.cc
浏览文件 @
d0c9f7e8
...
...
@@ -49,9 +49,14 @@ void GenBase::dumpCode(const unsigned char* code) const {
// Allocates `size` bytes for a GenBase object with 32-byte alignment using
// posix_memalign.
//
// Raises an InvalidArgument error if posix_memalign returns non-zero or if
// the resulting pointer is null. The allocation is performed exactly once:
// the previous text called posix_memalign twice into the same pointer, which
// leaked the first allocation.
void* GenBase::operator new(size_t size) {
  // Start from null so the pointer check below is well defined.
  void* ptr = nullptr;
  constexpr size_t alignment = 32ul;
  PADDLE_ENFORCE_EQ(
      posix_memalign(&ptr, alignment, size), 0,
      platform::errors::InvalidArgument(
          "Jitcode generator (GenBase) allocate %ld memory error!", size));
  PADDLE_ENFORCE_NOT_NULL(ptr, platform::errors::InvalidArgument(
                                   "Fail to allocate jitcode generator "
                                   "(GenBase) CPU memory: size = %d .",
                                   size));
  return ptr;
}
...
...
paddle/fluid/operators/jit/helper.cc
浏览文件 @
d0c9f7e8
...
...
@@ -66,7 +66,8 @@ const char* to_string(KernelType kt) {
ONE_CASE
(
kEmbSeqPool
);
ONE_CASE
(
kSgd
);
default:
PADDLE_THROW
(
"Not support type: %d, or forget to add it."
,
kt
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"JIT kernel do not support type: %d."
,
kt
));
return
"NOT JITKernel"
;
}
return
nullptr
;
...
...
@@ -79,7 +80,8 @@ const char* to_string(SeqPoolType tp) {
ONE_CASE
(
kAvg
);
ONE_CASE
(
kSqrt
);
default:
PADDLE_THROW
(
"Not support type: %d, or forget to add it."
,
tp
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"SeqPool JIT kernel do not support type: %d."
,
tp
));
return
"NOT PoolType"
;
}
return
nullptr
;
...
...
@@ -100,7 +102,8 @@ KernelType to_kerneltype(const std::string& act) {
}
else
if
(
lower
==
"tanh"
||
lower
==
"vtanh"
)
{
return
kVTanh
;
}
PADDLE_THROW
(
"Not support type: %s, or forget to add this case"
,
act
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Act JIT kernel do not support type: %s."
,
act
));
return
kNone
;
}
...
...
@@ -109,12 +112,19 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
int
block
,
rest
;
const
auto
groups
=
packed_groups
(
n
,
k
,
&
block
,
&
rest
);
std
::
for_each
(
groups
.
begin
(),
groups
.
end
(),
[
&
](
int
i
)
{
PADDLE_ENFORCE_GT
(
i
,
0
,
"each element of groups should be larger than 0."
);
PADDLE_ENFORCE_GT
(
i
,
0
,
platform
::
errors
::
InvalidArgument
(
"Each element of groups should be larger than "
"0. However the element: %d doesn't satify."
,
i
));
});
int
sum
=
std
::
accumulate
(
groups
.
begin
(),
groups
.
end
(),
0
);
std
::
memset
(
dst
,
0
,
k
*
sum
*
block
*
sizeof
(
float
));
PADDLE_ENFORCE_GE
(
sum
*
block
,
n
,
"The packed n should be equal to or larger than n"
);
platform
::
errors
::
InvalidArgument
(
"The packed n (sum * block) should be equal to or "
"larger than n (matmul row size). "
"However, the packed n is %d and n is %d."
,
sum
*
block
,
n
));
const
int
block_len
=
sizeof
(
float
)
*
block
;
int
n_offset
=
0
;
...
...
@@ -136,7 +146,8 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
// Overload of pack_weights selected for every element type other than float.
// Packing is not implemented for those types, so this always raises an
// Unimplemented error. The legacy raw-string PADDLE_THROW that preceded the
// typed one, and made it unreachable, is removed.
template <typename T>
typename std::enable_if<!std::is_same<T, float>::value>::type pack_weights(
    const T* src, T* dst, int n, int k) {
  PADDLE_THROW(platform::errors::Unimplemented(
      "Only supports pack weights with float type."));
}
}
// namespace jit
...
...
paddle/fluid/operators/jit/helper.h
浏览文件 @
d0c9f7e8
...
...
@@ -85,8 +85,10 @@ inline const Kernel* GetReferKernel() {
auto
&
ref_pool
=
ReferKernelPool
::
Instance
().
AllKernels
();
KernelKey
kkey
(
KernelTuple
::
kernel_type
,
platform
::
CPUPlace
());
auto
ref_iter
=
ref_pool
.
find
(
kkey
);
PADDLE_ENFORCE
(
ref_iter
!=
ref_pool
.
end
(),
"Every Kernel should have reference function."
);
PADDLE_ENFORCE_NE
(
ref_iter
,
ref_pool
.
end
(),
platform
::
errors
::
PreconditionNotMet
(
"Every Refer Kernel of jitcode should have reference function."
));
auto
&
ref_impls
=
ref_iter
->
second
;
for
(
auto
&
impl
:
ref_impls
)
{
auto
i
=
dynamic_cast
<
const
ReferKernel
<
KernelTuple
>*>
(
impl
.
get
());
...
...
@@ -101,7 +103,9 @@ template <typename KernelTuple>
// Returns the function held by the reference kernel of KernelTuple.
//
// The kernel returned by GetReferKernel<KernelTuple>() is cast to
// ReferKernel<KernelTuple>; a failed cast raises an InvalidArgument error.
// The pointer is checked exactly once: the legacy raw-string PADDLE_ENFORCE
// that used to run first, and masked the typed error, is removed.
inline typename KernelTuple::func_type GetReferFunc() {
  auto ker = GetReferKernel<KernelTuple>();
  auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
  PADDLE_ENFORCE_NOT_NULL(p, platform::errors::InvalidArgument(
                                 "Get the reference code of kernel in CPU "
                                 "failed. The Refer kernel should exsit."));
  return p->GetFunc();
}
...
...
@@ -132,7 +136,9 @@ std::vector<const Kernel*> GetAllCandidateKernels(
// The last implementation should be reference function on CPUPlace.
auto
ref
=
GetReferKernel
<
KernelTuple
>
();
PADDLE_ENFORCE
(
ref
!=
nullptr
,
"Refer Kernel can not be empty."
);
PADDLE_ENFORCE_NOT_NULL
(
ref
,
platform
::
errors
::
InvalidArgument
(
"Get all candicate kernel in CPU failed. "
"The Refer Kernel can not be empty."
));
res
.
emplace_back
(
ref
);
return
res
;
}
...
...
@@ -147,11 +153,14 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
std
::
string
name
=
k
->
ImplType
();
if
(
name
==
"JitCode"
)
{
auto
i
=
dynamic_cast
<
const
GenBase
*>
(
k
);
PADDLE_ENFORCE
(
i
,
"jitcode kernel cast can not fail."
);
PADDLE_ENFORCE_NOT_NULL
(
i
,
platform
::
errors
::
InvalidArgument
(
"Generate jitcode kernel (GenBase) failed."
));
res
.
emplace_back
(
std
::
make_pair
(
name
,
i
->
template
getCode
<
Func
>()));
}
else
{
auto
i
=
dynamic_cast
<
const
KernelMore
<
KernelTuple
>*>
(
k
);
PADDLE_ENFORCE
(
i
,
"kernel cast can not fail."
);
PADDLE_ENFORCE_NOT_NULL
(
i
,
platform
::
errors
::
InvalidArgument
(
"Kernel cast (KernelMore) failed."
));
res
.
emplace_back
(
std
::
make_pair
(
name
,
i
->
GetFunc
()));
}
}
...
...
@@ -173,7 +182,9 @@ template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
typename
KernelTuple
::
func_type
GetDefaultBestFunc
(
const
typename
KernelTuple
::
attr_type
&
attr
)
{
auto
funcs
=
GetAllCandidateFuncs
<
KernelTuple
,
PlaceType
>
(
attr
);
PADDLE_ENFORCE_GE
(
funcs
.
size
(),
1UL
);
PADDLE_ENFORCE_GE
(
funcs
.
size
(),
1UL
,
platform
::
errors
::
InvalidArgument
(
"The candicate jit kernel is at least one in CPU."
));
// Here could do some runtime benchmark of this attr and return the best one.
// But yet just get the first one as the default best one,
// which is searched in order and tuned by offline.
...
...
paddle/fluid/operators/jit/more/mix/mix.cc
浏览文件 @
d0c9f7e8
...
...
@@ -95,7 +95,8 @@ void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT
}
else
if
(
type
==
kVIdentity
)
{
return
KernelFuncs
<
VIdentityTuple
<
T
>
,
CPUPlace
>::
Cache
().
At
(
d
);
}
PADDLE_THROW
(
"Not support type: %s"
,
type
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Act JIT kernel do not support type: %s"
,
type
));
return
nullptr
;
}
...
...
paddle/fluid/operators/jit/more/mkl/mkl.h
浏览文件 @
d0c9f7e8
...
...
@@ -103,11 +103,24 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
template
<
typename
T
>
void
EmbSeqPool
(
const
T
*
table
,
const
int64_t
*
idx
,
T
*
out
,
const
emb_seq_pool_attr_t
*
attr
)
{
PADDLE_ENFORCE_EQ
(
attr
->
table_width
*
attr
->
index_width
,
attr
->
out_width
);
PADDLE_ENFORCE_EQ
(
attr
->
table_width
*
attr
->
index_width
,
attr
->
out_width
,
platform
::
errors
::
InvalidArgument
(
"The attribute table_width * index_width of EmbSeqPool should "
"be equal to out_width. But table_width * index_width is %d, "
"out_width is %d."
,
attr
->
table_width
*
attr
->
index_width
,
attr
->
out_width
));
auto
check_idx_value_valid
=
[
&
](
int64_t
i
)
{
PADDLE_ENFORCE_LT
(
idx
[
i
],
attr
->
table_height
,
"idx value: %d, i: %d"
,
idx
[
i
],
i
);
PADDLE_ENFORCE_GE
(
idx
[
i
],
0
,
"idx value: %d, i: %d"
,
idx
[
i
],
i
);
PADDLE_ENFORCE_LT
(
idx
[
i
],
attr
->
table_height
,
platform
::
errors
::
InvalidArgument
(
"The idx shoud be lower than the attribute table_height of "
"EmbSeqPool. But %dth of idx is %d and table_height is %d."
,
i
,
idx
[
i
],
attr
->
table_height
));
PADDLE_ENFORCE_GE
(
idx
[
i
],
0
,
platform
::
errors
::
InvalidArgument
(
"The idx shoud be equal to or larger than "
"the 0. But %dth of idx is %d."
,
i
,
idx
[
i
]));
};
for
(
int64_t
w
=
0
;
w
!=
attr
->
index_width
;
++
w
)
{
...
...
@@ -168,22 +181,50 @@ void Softmax(const T* x, T* y, int n, int bs, int remain = 1) {
template
<
typename
T
>
void
Sgd
(
const
T
*
lr
,
const
T
*
param
,
const
T
*
grad
,
const
int64_t
*
rows
,
T
*
out
,
const
sgd_attr_t
*
attr
)
{
PADDLE_ENFORCE_EQ
(
attr
->
param_width
,
attr
->
grad_width
);
PADDLE_ENFORCE_LE
(
attr
->
selected_rows_size
,
attr
->
grad_height
);
PADDLE_ENFORCE_EQ
(
attr
->
param_width
,
attr
->
grad_width
,
platform
::
errors
::
InvalidArgument
(
"The attribute param_width of Sgd should be "
"equal to the attribute grad_width. But param_width "
"is %d and grad_width is %d."
,
attr
->
param_width
,
attr
->
grad_width
));
PADDLE_ENFORCE_LE
(
attr
->
selected_rows_size
,
attr
->
grad_height
,
platform
::
errors
::
InvalidArgument
(
"The attribute selected_rows_size of Sgd should be "
"equal to or less than the attribute grad_height. "
"But selected_rows_size is %d and grad_height is %d."
,
attr
->
selected_rows_size
,
attr
->
grad_height
));
T
scalar
=
-
lr
[
0
];
int
width
=
attr
->
grad_width
;
if
(
out
==
param
)
{
for
(
int64_t
i
=
0
;
i
<
attr
->
selected_rows_size
;
++
i
)
{
auto
h_idx
=
rows
[
i
];
PADDLE_ENFORCE_LT
(
h_idx
,
attr
->
param_height
);
PADDLE_ENFORCE_GE
(
h_idx
,
0
);
PADDLE_ENFORCE_LT
(
h_idx
,
attr
->
param_height
,
platform
::
errors
::
InvalidArgument
(
"The rows of Sgd should be "
"less than the attribute. But %dth of rows "
"is %d and grad_width is %d."
,
i
,
h_idx
,
attr
->
param_height
));
PADDLE_ENFORCE_GE
(
h_idx
,
0
,
platform
::
errors
::
InvalidArgument
(
"The rows of Sgd should be "
"larger than 0. But %dth of rows "
"is %d."
,
i
,
h_idx
));
VAXPY
(
scalar
,
grad
+
i
*
width
,
out
+
h_idx
*
width
,
width
);
}
}
else
{
for
(
int64_t
i
=
0
;
i
<
attr
->
selected_rows_size
;
++
i
)
{
auto
h_idx
=
rows
[
i
];
PADDLE_ENFORCE_LT
(
h_idx
,
attr
->
param_height
);
PADDLE_ENFORCE_GE
(
h_idx
,
0
);
PADDLE_ENFORCE_LT
(
h_idx
,
attr
->
param_height
,
platform
::
errors
::
InvalidArgument
(
"The rows of Sgd should be "
"less than the attribute. But %dth of rows "
"is %d and grad_width is %d."
,
i
,
h_idx
,
attr
->
param_height
));
PADDLE_ENFORCE_GE
(
h_idx
,
0
,
platform
::
errors
::
InvalidArgument
(
"The rows of Sgd should be "
"larger than 0. But %dth of rows "
"is %d."
,
i
,
h_idx
));
VScal
(
&
scalar
,
grad
+
i
*
width
,
out
+
h_idx
*
width
,
width
);
VAdd
(
param
+
h_idx
*
width
,
out
+
h_idx
*
width
,
out
+
h_idx
*
width
,
width
);
...
...
paddle/fluid/operators/jit/refer/refer.h
浏览文件 @
d0c9f7e8
...
...
@@ -147,7 +147,8 @@ void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT
}
else
if
(
type
==
kVIdentity
)
{
return
VIdentity
<
T
>
;
}
PADDLE_THROW
(
"Not support type: %s"
,
type
);
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Act JIT kernel do not support type: %s."
,
type
));
return
nullptr
;
}
...
...
@@ -465,12 +466,25 @@ void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) {
template
<
typename
T
>
void
EmbSeqPool
(
const
T
*
table
,
const
int64_t
*
idx
,
T
*
out
,
const
emb_seq_pool_attr_t
*
attr
)
{
PADDLE_ENFORCE_EQ
(
attr
->
table_width
*
attr
->
index_width
,
attr
->
out_width
);
PADDLE_ENFORCE_EQ
(
attr
->
table_width
*
attr
->
index_width
,
attr
->
out_width
,
platform
::
errors
::
InvalidArgument
(
"The attribute table_width * index_width of EmbSeqPool should "
"be equal to out_width. But table_width * index_width is %d and "
"out_width is %d."
,
attr
->
table_width
*
attr
->
index_width
,
attr
->
out_width
));
auto
check_idx_value_valid
=
[
&
](
int64_t
i
)
{
PADDLE_ENFORCE_LT
(
idx
[
i
],
attr
->
table_height
,
"idx value: %d, i: %d"
,
idx
[
i
],
i
);
PADDLE_ENFORCE_GE
(
idx
[
i
],
0
,
"idx value: %d, i: %d"
,
idx
[
i
],
i
);
PADDLE_ENFORCE_LT
(
idx
[
i
],
attr
->
table_height
,
platform
::
errors
::
InvalidArgument
(
"The idx shoud be lower than the attribute table_height of "
"EmbSeqPool. But %dth of idx is %d and table_height is %d."
,
i
,
idx
[
i
],
attr
->
table_height
));
PADDLE_ENFORCE_GE
(
idx
[
i
],
0
,
platform
::
errors
::
InvalidArgument
(
"The idx shoud be equal to or larger than "
"the 0. But %dth of idx is %d."
,
i
,
idx
[
i
]));
};
for
(
int64_t
w
=
0
;
w
!=
attr
->
index_width
;
++
w
)
{
...
...
@@ -505,12 +519,31 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out,
template
<
typename
T
>
void
Sgd
(
const
T
*
lr
,
const
T
*
param
,
const
T
*
grad
,
const
int64_t
*
rows
,
T
*
out
,
const
sgd_attr_t
*
attr
)
{
PADDLE_ENFORCE_EQ
(
attr
->
param_width
,
attr
->
grad_width
);
PADDLE_ENFORCE_LE
(
attr
->
selected_rows_size
,
attr
->
grad_height
);
PADDLE_ENFORCE_EQ
(
attr
->
param_width
,
attr
->
grad_width
,
platform
::
errors
::
InvalidArgument
(
"The attribute param_width of Sgd should be "
"equal to the attribute grad_width. But param_width "
"is %d and grad_width is %d."
,
attr
->
param_width
,
attr
->
grad_width
));
PADDLE_ENFORCE_LE
(
attr
->
selected_rows_size
,
attr
->
grad_height
,
platform
::
errors
::
InvalidArgument
(
"The attribute selected_rows_size of Sgd should be "
"equal to or less than the attribute grad_height. "
"But selected_rows_size is %d and grad_height is %d."
,
attr
->
selected_rows_size
,
attr
->
grad_height
));
for
(
int64_t
i
=
0
;
i
<
attr
->
selected_rows_size
;
++
i
)
{
auto
h_idx
=
rows
[
i
];
PADDLE_ENFORCE_LT
(
h_idx
,
attr
->
param_height
);
PADDLE_ENFORCE_GE
(
h_idx
,
0
);
PADDLE_ENFORCE_LT
(
h_idx
,
attr
->
param_height
,
platform
::
errors
::
InvalidArgument
(
"The rows of Sgd should be "
"less than the attribute. But %dth of rows "
"is %d and grad_width is %d."
,
i
,
h_idx
,
attr
->
param_height
));
PADDLE_ENFORCE_GE
(
h_idx
,
0
,
platform
::
errors
::
InvalidArgument
(
"The rows of Sgd should be "
"larger than 0. But %dth of rows "
"is %d."
,
i
,
h_idx
));
for
(
int64_t
j
=
0
;
j
<
attr
->
grad_width
;
++
j
)
{
out
[
h_idx
*
attr
->
grad_width
+
j
]
=
param
[
h_idx
*
attr
->
grad_width
+
j
]
-
...
...
paddle/fluid/operators/jit/test.cc
浏览文件 @
d0c9f7e8
...
...
@@ -850,8 +850,15 @@ void TestKernelSgd() {
const
T
lr
=
0.1
;
auto
UnDuplicatedRandomVec
=
[](
int
n
,
const
int64_t
lower
,
const
int64_t
upper
)
->
std
::
vector
<
int64_t
>
{
PADDLE_ENFORCE_LE
(
static_cast
<
size_t
>
(
upper
-
lower
),
n
-
1
);
PADDLE_ENFORCE_GT
(
n
,
0
);
PADDLE_ENFORCE_LE
(
static_cast
<
size_t
>
(
upper
-
lower
),
n
-
1
,
paddle
::
platform
::
errors
::
InvalidArgument
(
"The range of Sgd (upper - lower) should be lower "
"than n-1 (Sgd size -1). But the upper - lower is %d "
"and n-1 is %d."
,
static_cast
<
size_t
>
(
upper
-
lower
),
n
-
1
));
PADDLE_ENFORCE_GT
(
n
,
0
,
paddle
::
platform
::
errors
::
InvalidArgument
(
"The Sgd size should be larger than 0. But the n is %d."
,
n
));
std
::
vector
<
int64_t
>
all
,
out
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
all
.
push_back
(
i
);
...
...
paddle/fluid/operators/math/blas_impl.cu.h
浏览文件 @
d0c9f7e8
...
...
@@ -420,6 +420,22 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
});
}
// float16 specialization of GEMV for the CUDA device context.
//
// Because cublas doesn't support half gemv, the product is issued through the
// float16 GEMM of this same Blas object, with one of the size arguments fixed
// to 1. Both GEMM calls use CblasNoTrans for both operands; what changes with
// `trans_a` is the order in which A and B are handed to GEMM and the size
// arguments.
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMV(
    bool trans_a, int M, int N, platform::float16 alpha,
    const platform::float16 *A, const platform::float16 *B,
    platform::float16 beta, platform::float16 *C) const {
  if (!trans_a) {
    // Non-transposed case: A is the left operand, sizes are (M, 1, N).
    this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, M, 1, N,
                                           alpha, A, B, beta, C);
    return;
  }

  // Transposed case: operands are swapped (B on the left), sizes are
  // (1, N, M).
  this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, 1, N, M,
                                         alpha, B, A, beta, C);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CUDADeviceContext
>::
BatchedGEMM
(
...
...
@@ -479,6 +495,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
}
}
// float16 specialization of the pointer-array BatchedGEMM for the CUDA device
// context.
//
// A, B and C are arrays of `batchCount` matrix pointers. The batch is
// processed as `batchCount` independent float16 GEMM calls issued one after
// another, all sharing the same transpose flags, sizes, alpha and beta.
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
    platform::float16 alpha, const platform::float16 **A,
    const platform::float16 **B, platform::float16 beta,
    platform::float16 **C, int batchCount) const {
  for (int batch = 0; batch < batchCount; ++batch) {
    const platform::float16 *lhs = A[batch];
    const platform::float16 *rhs = B[batch];
    platform::float16 *dst = C[batch];
    this->template GEMM<platform::float16>(transA, transB, M, N, K, alpha, lhs,
                                           rhs, beta, dst);
  }
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CUDADeviceContext
>::
TRSM
(
CBLAS_SIDE
side
,
CBLAS_UPLO
uplo
,
...
...
paddle/fluid/operators/matmul_v2_op.cu
浏览文件 @
d0c9f7e8
...
...
@@ -17,10 +17,12 @@ limitations under the License. */
namespace
ops
=
paddle
::
operators
;
namespace
plf
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
matmul_v2
,
ops
::
MatMulV2Kernel
<
plf
::
CUDADeviceContext
,
float
>
,
ops
::
MatMulV2Kernel
<
plf
::
CUDADeviceContext
,
double
>
);
REGISTER_OP_CUDA_KERNEL
(
matmul_v2
,
ops
::
MatMulV2Kernel
<
plf
::
CUDADeviceContext
,
float
>
,
ops
::
MatMulV2Kernel
<
plf
::
CUDADeviceContext
,
double
>
,
ops
::
MatMulV2Kernel
<
plf
::
CUDADeviceContext
,
plf
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
matmul_v2_grad
,
ops
::
MatMulV2GradKernel
<
plf
::
CUDADeviceContext
,
float
>
,
ops
::
MatMulV2GradKernel
<
plf
::
CUDADeviceContext
,
double
>
);
ops
::
MatMulV2GradKernel
<
plf
::
CUDADeviceContext
,
double
>
,
ops
::
MatMulV2GradKernel
<
plf
::
CUDADeviceContext
,
plf
::
float16
>
);
paddle/fluid/operators/matmul_v2_op.h
浏览文件 @
d0c9f7e8
...
...
@@ -163,17 +163,20 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
if
(
trans_y
)
{
const
int
M
=
Y
->
numel
()
/
N
;
VLOG
(
3
)
<<
"MatMul's case 2"
;
blas
.
GEMV
(
false
,
M
,
N
,
1.
,
y_data
,
x_data
,
0.
,
Out
->
data
<
T
>
());
blas
.
GEMV
(
false
,
M
,
N
,
static_cast
<
T
>
(
1
),
y_data
,
x_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
());
}
else
{
const
int
M
=
y_dims
[
y_ndim
-
1
];
const
int
batch_size
=
Y
->
numel
()
/
(
M
*
N
);
if
(
batch_size
==
1
)
{
VLOG
(
3
)
<<
"MatMul's case 3"
;
blas
.
GEMV
(
true
,
N
,
M
,
1.
,
y_data
,
x_data
,
0.
,
Out
->
data
<
T
>
());
blas
.
GEMV
(
true
,
N
,
M
,
static_cast
<
T
>
(
1
),
y_data
,
x_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
());
}
else
{
VLOG
(
3
)
<<
"MatMul's case 4"
;
blas
.
BatchedGEMM
(
CblasTrans
,
CblasNoTrans
,
M
,
1
,
N
,
1.0
f
,
y_data
,
x_data
,
0
,
Out
->
data
<
T
>
(),
batch_size
,
M
*
N
,
0
);
blas
.
BatchedGEMM
(
CblasTrans
,
CblasNoTrans
,
M
,
1
,
N
,
static_cast
<
T
>
(
1
),
y_data
,
x_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
(),
batch_size
,
M
*
N
,
0
);
}
}
return
;
...
...
@@ -205,16 +208,19 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
const
int
batch_size
=
X
->
numel
()
/
(
M
*
N
);
if
(
batch_size
==
1
)
{
VLOG
(
3
)
<<
"MatMul's case 5"
;
blas
.
GEMV
(
true
,
N
,
M
,
1.0
f
,
x_data
,
y_data
,
0.0
f
,
Out
->
data
<
T
>
());
blas
.
GEMV
(
true
,
N
,
M
,
static_cast
<
T
>
(
1
),
x_data
,
y_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
());
}
else
{
VLOG
(
3
)
<<
"MatMul's case 6"
;
blas
.
BatchedGEMM
(
CblasTrans
,
CblasNoTrans
,
M
,
1
,
N
,
1.0
f
,
x_data
,
y_data
,
0
,
Out
->
data
<
T
>
(),
batch_size
,
M
*
N
,
0
);
blas
.
BatchedGEMM
(
CblasTrans
,
CblasNoTrans
,
M
,
1
,
N
,
static_cast
<
T
>
(
1
),
x_data
,
y_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
(),
batch_size
,
M
*
N
,
0
);
}
}
else
{
const
int
M
=
X
->
numel
()
/
N
;
VLOG
(
3
)
<<
"MatMul's case 7"
;
blas
.
GEMV
(
false
,
M
,
N
,
1.0
f
,
x_data
,
y_data
,
0.0
f
,
Out
->
data
<
T
>
());
blas
.
GEMV
(
false
,
M
,
N
,
static_cast
<
T
>
(
1
),
x_data
,
y_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
());
}
return
;
}
...
...
@@ -263,37 +269,38 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
if
(
x_batch_size
==
1
&&
y_batch_size
==
1
)
{
VLOG
(
3
)
<<
"MatMul's case 8"
;
blas
.
GEMM
(
trans_x
?
CblasTrans
:
CblasNoTrans
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
1.0
f
,
x_data
,
y_data
,
0.0
f
,
Out
->
data
<
T
>
());
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
static_cast
<
T
>
(
1
)
,
x_data
,
y_data
,
static_cast
<
T
>
(
0
)
,
Out
->
data
<
T
>
());
}
else
if
(
x_batch_size
==
1
)
{
if
(
M
==
1
&&
trans_y
)
{
VLOG
(
3
)
<<
"MatMul's case 9"
;
blas
.
GEMV
(
false
,
y_batch_size
*
N
,
K
,
1.0
f
,
y_data
,
x_data
,
0.0
f
,
Out
->
data
<
T
>
());
blas
.
GEMV
(
false
,
y_batch_size
*
N
,
K
,
static_cast
<
T
>
(
1
),
y_data
,
x_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
());
}
else
{
VLOG
(
3
)
<<
"MatMul's case 10"
;
blas
.
BatchedGEMM
(
trans_x
?
CblasTrans
:
CblasNoTrans
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
1.0
f
,
x_data
,
y_data
,
0
,
Out
->
data
<
T
>
(),
out_batch_size
,
0
,
K
*
N
);
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
static_cast
<
T
>
(
1
),
x_data
,
y_data
,
static_cast
<
T
>
(
0
)
,
Out
->
data
<
T
>
(),
out_batch_size
,
0
,
K
*
N
);
}
}
else
if
(
y_batch_size
==
1
)
{
if
(
!
trans_x
)
{
VLOG
(
3
)
<<
"MatMul's case 11"
;
blas
.
GEMM
(
CblasNoTrans
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
x_batch_size
*
M
,
N
,
K
,
1.0
f
,
x_data
,
y_data
,
0.0
f
,
Out
->
data
<
T
>
());
x_batch_size
*
M
,
N
,
K
,
static_cast
<
T
>
(
1
),
x_data
,
y_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
());
}
else
{
VLOG
(
3
)
<<
"MatMul's case 12"
;
blas
.
BatchedGEMM
(
CblasTrans
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
1.0
f
,
x_data
,
y_data
,
0
,
Out
->
data
<
T
>
(),
out_batch_size
,
M
*
K
,
0
);
static_cast
<
T
>
(
1
),
x_data
,
y_data
,
static_cast
<
T
>
(
0
)
,
Out
->
data
<
T
>
(),
out_batch_size
,
M
*
K
,
0
);
}
}
else
if
(
!
is_broadcast_dims
)
{
VLOG
(
3
)
<<
"MatMul's case 13"
;
blas
.
BatchedGEMM
(
trans_x
?
CblasTrans
:
CblasNoTrans
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
1.0
f
,
x_data
,
y_data
,
0
,
Out
->
data
<
T
>
(),
out_batch_size
,
M
*
K
,
K
*
N
);
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
static_cast
<
T
>
(
1
),
x_data
,
y_data
,
static_cast
<
T
>
(
0
),
Out
->
data
<
T
>
(),
out_batch_size
,
M
*
K
,
K
*
N
);
}
else
{
// in the case, can't use stridedgemm
std
::
vector
<
const
T
*>
x_ptr
(
out_batch_size
);
...
...
@@ -314,9 +321,9 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
}
VLOG
(
3
)
<<
"MatMul's case 14"
;
blas
.
BatchedGEMM
(
trans_x
?
CblasTrans
:
CblasNoTrans
,
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
1.0
f
,
x_ptr
.
data
(),
y_ptr
.
data
(),
0.0
f
,
out
_ptr
.
data
(),
out_batch_size
);
trans_y
?
CblasTrans
:
CblasNoTrans
,
M
,
N
,
K
,
static_cast
<
T
>
(
1
),
x_ptr
.
data
(),
y
_ptr
.
data
(),
static_cast
<
T
>
(
0
),
out_ptr
.
data
(),
out_batch_size
);
}
}
...
...
paddle/fluid/operators/norm_utils.cu.h
浏览文件 @
d0c9f7e8
...
...
@@ -40,12 +40,12 @@ using DataLayout = framework::DataLayout;
// (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
// np.sum(dy,
// axis=(n,h,w)) * (x - mean) *
// (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var -
// (np.mean(ddx, axis=(n,h,w)) - ddx)
)
+ ddr * (dy * inv_var -
// inv_var
// *
// np.mean(dy, axis=(n,h,w)) -
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// axis=(n,h,w)))
)
// axis=(n,h,w)))
template
<
typename
T
,
int
BlockDim
,
framework
::
DataLayout
layout
>
__global__
void
DoubleGradComputeDX
(
const
T
*
x
,
const
T
*
mean
,
...
...
@@ -138,7 +138,7 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean,
?
(
j
/
sample_size
*
C
+
i
)
*
sample_size
+
j
%
sample_size
:
j
*
outer_size
+
i
;
dx
[
index
]
+=
(
dy
[
index
]
*
var_val
-
dy_sum_val
/
inner_size
*
var_val
-
(
x
[
index
]
-
mean_val
)
*
var_val
*
(
x
[
index
]
-
mean_val
)
*
var_val
*
var_val
*
dy_mul_x_sub_mean_sum_val
*
var_val
/
inner_size
)
*
ddscale
[
i
];
}
...
...
@@ -326,19 +326,57 @@ __global__ void DoubleGradComputeDScaleWithGlobal(
}
// math: dx = ddscale * dy * inv_var
// math: ddy = scale * ddx * inv_var
template
<
typename
T
,
framework
::
DataLayout
layout
>
__global__
void
DoubleGradComputeDataWithGlobal
(
const
T
*
dy
,
const
T
*
scale
,
const
T
*
variance
,
const
double
epsilon
,
const
int
C
,
const
int
sample_size
,
const
int
num
,
T
*
dx
)
{
__global__
void
DoubleGradComputeDXWithGlobal
(
const
T
*
dy
,
const
T
*
ddscale
,
const
T
*
variance
,
const
double
epsilon
,
const
int
C
,
const
int
sample_size
,
const
int
num
,
T
*
dx
)
{
int
gid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
stride
=
blockDim
.
x
*
gridDim
.
x
;
if
(
scale
!=
nullptr
)
{
if
(
dd
scale
!=
nullptr
)
{
for
(
int
i
=
gid
;
i
<
num
;
i
+=
stride
)
{
const
int
c
=
layout
==
framework
::
DataLayout
::
kNCHW
?
i
/
sample_size
%
C
:
i
%
C
;
T
inv_var
=
1.0
/
sqrt
(
variance
[
c
]
+
epsilon
);
dx
[
i
]
=
dy
[
i
]
*
scale
[
c
]
*
inv_var
;
dx
[
i
]
=
dy
[
i
]
*
ddscale
[
c
]
*
inv_var
;
}
}
}
// math: ddy = scale * ddx * inv_var + ddbias +
//             ddscale * (x - mean) * inv_var
//
// Accumulates the three terms above into `ddy` (ddy is added to, not
// overwritten). A term is skipped when its driving pointer (`ddx`, `ddscale`
// or `ddbias`) is nullptr; if all three are nullptr, `ddy` is left untouched.
//
// Parameters:
//   ddx, x, ddy          - per-element arrays of length `num`.
//   scale, mean, variance,
//   ddbias, ddscale      - per-channel arrays indexed by the channel `c`.
//   epsilon              - added to the variance before the square root.
//   C, sample_size       - used to derive the channel of element i:
//                          i / sample_size % C for kNCHW, i % C otherwise.
//
// Every element index i is visited by exactly one thread (i starts at the
// global thread id and advances by the total thread count), and no shared
// memory is used. The three terms are therefore accumulated in a single pass
// without any __syncthreads(): there is nothing for the threads of a block to
// exchange, and one pass reads and writes ddy[i] once instead of three times.
// The terms are added in the same order as in the formula evaluation of the
// previous three-pass version (ddx term, ddscale term, ddbias term).
template <typename T, framework::DataLayout layout>
__global__ void DoubleGradComputeDDYWithGlobal(
    const T *ddx, const T *scale, const T *mean, const T *variance, const T *x,
    const T *ddbias, const T *ddscale, const double epsilon, const int C,
    const int sample_size, const int num, T *ddy) {
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;

  const bool has_ddx = ddx != nullptr;
  const bool has_ddscale = ddscale != nullptr;
  const bool has_ddbias = ddbias != nullptr;

  // Nothing to accumulate: do not touch ddy at all.
  if (!has_ddx && !has_ddscale && !has_ddbias) {
    return;
  }

  for (int i = gid; i < num; i += stride) {
    const int c =
        layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;

    T acc = ddy[i];

    // variance is only read when a term actually needs inv_var.
    if (has_ddx || has_ddscale) {
      T inv_var = 1.0 / sqrt(variance[c] + epsilon);
      if (has_ddx) {
        acc += ddx[i] * scale[c] * inv_var;
      }
      if (has_ddscale) {
        acc += (x[i] - mean[c]) * inv_var * ddscale[c];
      }
    }

    if (has_ddbias) {
      acc += ddbias[c];
    }

    ddy[i] = acc;
  }
}
...
...
@@ -383,8 +421,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
const
T
*
mean_data
,
*
variance_data
;
if
(
use_global_stats
)
{
const
auto
*
running_mean
=
ctx
.
Input
<
Tensor
>
(
"Mean"
);
const
auto
*
running_var
=
ctx
.
Input
<
Tensor
>
(
"Variance"
);
const
auto
*
running_mean_data
=
running_mean
->
template
data
<
T
>();
const
auto
*
running_var_data
=
running_var
->
template
data
<
T
>();
mean_data
=
running_mean_data
;
variance_data
=
running_var_data
;
}
else
{
const
T
*
smean_data
=
Saved_mean
->
data
<
T
>
();
...
...
@@ -398,12 +439,12 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
set_constant
(
dev_ctx
,
dX
,
static_cast
<
T
>
(
0
));
if
(
use_global_stats
)
{
if
(
data_layout
==
DataLayout
::
kNHWC
)
{
DoubleGradComputeD
ata
WithGlobal
<
DoubleGradComputeD
X
WithGlobal
<
T
,
DataLayout
::
kNHWC
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
dy_data
,
ddscale_data
,
variance_data
,
epsilon
,
C
,
sample_size
,
num
,
dx_data
);
}
else
{
DoubleGradComputeD
ata
WithGlobal
<
DoubleGradComputeD
X
WithGlobal
<
T
,
DataLayout
::
kNCHW
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
dy_data
,
ddscale_data
,
variance_data
,
epsilon
,
C
,
sample_size
,
num
,
dx_data
);
...
...
@@ -456,15 +497,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
set_constant
(
dev_ctx
,
ddY
,
static_cast
<
T
>
(
0
));
if
(
use_global_stats
)
{
if
(
data_layout
==
DataLayout
::
kNHWC
)
{
DoubleGradComputeD
ata
WithGlobal
<
DoubleGradComputeD
DY
WithGlobal
<
T
,
DataLayout
::
kNHWC
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
ddx_data
,
scale_data
,
variance_data
,
epsilon
,
C
,
sample_size
,
num
,
ddy_data
);
ddx_data
,
scale_data
,
mean_data
,
variance_data
,
x_data
,
ddbias_data
,
dd
scale_data
,
epsilon
,
C
,
sample_size
,
num
,
dd
y_data
);
}
else
{
DoubleGradComputeD
ata
WithGlobal
<
DoubleGradComputeD
DY
WithGlobal
<
T
,
DataLayout
::
kNCHW
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
ddx_data
,
scale_data
,
variance_data
,
epsilon
,
C
,
sample_size
,
num
,
ddy_data
);
ddx_data
,
scale_data
,
mean_data
,
variance_data
,
x_data
,
ddbias_data
,
dd
scale_data
,
epsilon
,
C
,
sample_size
,
num
,
dd
y_data
);
}
}
else
{
if
(
data_layout
==
DataLayout
::
kNHWC
)
{
...
...
paddle/fluid/operators/utils.h
浏览文件 @
d0c9f7e8
...
...
@@ -41,7 +41,9 @@ inline std::vector<T> GetDataFromTensor(const framework::Tensor* x) {
// NOTE: Converting int64 to int32 may cause data overflow.
vec_new_data
=
std
::
vector
<
T
>
(
data
,
data
+
x
->
numel
());
}
else
{
PADDLE_THROW
(
"The dtype of Tensor must be int32 or int64."
);
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"The dtype of Tensor must be int32 or int64, but received: %s"
,
x
->
type
()));
}
return
vec_new_data
;
}
...
...
@@ -53,10 +55,11 @@ inline std::vector<T> GetDataFromTensorList(
for
(
size_t
i
=
0
;
i
<
list_tensor
.
size
();
++
i
)
{
auto
tensor
=
list_tensor
[
i
];
PADDLE_ENFORCE_EQ
(
tensor
->
dims
(),
framework
::
make_ddim
({
1
}),
"ShapeError: The shape of Tensor in list must be [1]. "
"But received the shape "
"is [%s]"
,
tensor
->
dims
());
platform
::
errors
::
InvalidArgument
(
"The shape of Tensor in list must be [1]. "
"But received its shape "
"is [%s]"
,
tensor
->
dims
()));
if
(
tensor
->
type
()
==
framework
::
proto
::
VarType
::
INT32
)
{
if
(
platform
::
is_gpu_place
(
tensor
->
place
()))
{
...
...
@@ -76,7 +79,10 @@ inline std::vector<T> GetDataFromTensorList(
vec_new_data
.
push_back
(
static_cast
<
T
>
(
*
tensor
->
data
<
int64_t
>
()));
}
}
else
{
PADDLE_THROW
(
"The dtype of Tensor in list must be int32 or int64."
);
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"The dtype of Tensor in list must be int32 or int64, but received: "
"%s"
,
tensor
->
type
()));
}
}
return
vec_new_data
;
...
...
paddle/fluid/platform/cuda_primitives.h
浏览文件 @
d0c9f7e8
...
...
@@ -134,7 +134,26 @@ USE_CUDA_ATOMIC(Max, int);
USE_CUDA_ATOMIC
(
Max
,
unsigned
int
);
// CUDA API uses unsigned long long int, we cannot use uint64_t here.
// This is because unsigned long long int is not necessarily uint64_t.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
USE_CUDA_ATOMIC
(
Max
,
unsigned
long
long
int
);
// NOLINT
#else
CUDA_ATOMIC_WRAPPER
(
Max
,
unsigned
long
long
int
)
{
if
(
*
address
>=
val
)
{
return
;
}
unsigned
long
long
int
old
=
*
address
,
assumed
;
do
{
assumed
=
old
;
if
(
assumed
>=
val
)
{
break
;
}
old
=
atomicCAS
(
address
,
assumed
,
val
);
}
while
(
assumed
!=
old
);
}
#endif
CUDA_ATOMIC_WRAPPER
(
Max
,
int64_t
)
{
// Here, we check long long int must be int64_t.
...
...
@@ -187,7 +206,26 @@ USE_CUDA_ATOMIC(Min, int);
USE_CUDA_ATOMIC
(
Min
,
unsigned
int
);
// CUDA API uses unsigned long long int, we cannot use uint64_t here.
// This is because unsigned long long int is not necessarily uint64_t.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
USE_CUDA_ATOMIC
(
Min
,
unsigned
long
long
int
);
// NOLINT
#else
CUDA_ATOMIC_WRAPPER
(
Min
,
unsigned
long
long
int
)
{
if
(
*
address
<=
val
)
{
return
;
}
unsigned
long
long
int
old
=
*
address
,
assumed
;
do
{
assumed
=
old
;
if
(
assumed
<=
val
)
{
break
;
}
old
=
atomicCAS
(
address
,
assumed
,
val
);
}
while
(
assumed
!=
old
);
}
#endif
CUDA_ATOMIC_WRAPPER
(
Min
,
int64_t
)
{
// Here, we check long long int must be int64_t.
...
...
paddle/scripts/installation_validate.py
浏览文件 @
d0c9f7e8
...
...
@@ -15,4 +15,5 @@
import
paddle.fluid
as
fluid
import
paddle
as
pd
fluid
.
install_check
.
run_check
()
print
(
pd
.
__version__
)
paddle/scripts/paddle_build.bat
浏览文件 @
d0c9f7e8
...
...
@@ -40,6 +40,7 @@ if not defined WITH_TPCACHE set WITH_TPCACHE=ON
rem -------set cache build work directory-----------
rmdir
build
\python
/s/q
if
"
%WITH_CACHE%
"
==
"OFF"
(
rmdir
build
/s/q
goto
:mkbuild
...
...
@@ -48,10 +49,10 @@ if "%WITH_CACHE%"=="OFF" (
for
/F
%%
#
in
(
'wmic os get localdatetime
^|
findstr 20'
)
do
set
datetime
=
%%
#
set
day_now
=
%datetime
:
~
6
,
2
%
set
day_before
=-
1
set
/p
day_before
=<
day
.txt
set
/p
day_before
=<
%work_dir%
\..\
day.txt
if
%day_now%
NEQ
%day_before%
(
echo
%day_now%
>
day
.txt
type
day
.txt
echo
%day_now%
>
%work_dir%
\..\
day.txt
type
%work_dir%
\..\
day.txt
rmdir
build
/s/q
)
git
diff
origin
/develop --stat --name-only
|
findstr
"cmake CMakeLists.txt paddle_build.bat"
...
...
@@ -208,7 +209,7 @@ echo Build third_party the %build_times% time:
msbuild
/m /p
:Configuration
=
Release
/verbosity
:quiet
third_party
.vcxproj
if
%ERRORLEVEL%
NEQ
0
(
set
/a
build_times
=
%build_times%
+
1
if
%build_times%
GTR
3
(
if
%build_times%
GTR
2
(
exit
/b
7
)
else
(
echo
Build
third_party
failed
,
will
retry
!
...
...
@@ -223,7 +224,7 @@ echo Build Paddle the %build_times% time:
msbuild
/m
:
%PARALLEL_PROJECT_COUNT%
/p
:TrackFileAccess
=
false
/p
:CLToolExe
=
clcache
.exe
/p
:CLToolPath
=
%PYTHON_ROOT%
\Scripts
/p
:Configuration
=
Release
/verbosity
:minimal
paddle
.sln
if
%ERRORLEVEL%
NEQ
0
(
set
/a
build_times
=
%build_times%
+
1
if
%build_times%
GTR
2
(
if
%build_times%
GTR
1
(
exit
/b
7
)
else
(
echo
Build
Paddle
failed
,
will
retry
!
...
...
@@ -301,6 +302,7 @@ goto:eof
call
paddle_winci
\Scripts\deactivate.bat
2
>
NUL
for
/F
%%
#
in
(
'wmic os get localdatetime
^|
findstr 20'
)
do
set
end
=
%%
#
set
end
=
%end
:
~
4
,
10
%
call
:timestamp
"
%start%
"
"
%end%
"
"1 card TestCases Total"
call
:timestamp
"
%start%
"
"
%end%
"
"TestCases Total"
echo
Running
unit
tests
failed
,
will
exit
!
exit
/b
8
...
...
@@ -313,6 +315,7 @@ echo ========================================
for
/F
%%
#
in
(
'wmic os get localdatetime
^|
findstr 20'
)
do
set
end
=
%%
#
set
end
=
%end
:
~
4
,
10
%
call
:timestamp
"
%start%
"
"
%end%
"
"1 card TestCases Total"
call
:timestamp
"
%start%
"
"
%end%
"
"TestCases Total"
cd
%work_dir%
\paddle\fluid\inference\api\demo_ci
...
...
@@ -345,6 +348,8 @@ echo ============================================ >> check_change_of_unitte
echo
EOF
>>
check_change_of_unittest
.sh
echo
spec_path
=
$
(
pwd
)
/UNITTEST
_PR.spec>>
check_change_of_unittest
.sh
echo
ctest
-N
^|
awk
-F
':'
'{print $2}'
^|
sed
'/
^^
$/d'
^|
sed
'$d'
^>
$
{
spec_path
}>>
check_change_of_unittest
.sh
echo
num
=
$
(
awk
'END{print NR}'
$
{
spec_path
})>>
check_change_of_unittest
.sh
echo
echo
"Windows 1 card TestCases count is $num"
>>
check_change_of_unittest
.sh
echo
UPSTREAM_URL
=
'https://github.com/PaddlePaddle/Paddle'
>>
check_change_of_unittest
.sh
echo
origin_upstream_url
=
`git remote -v
^|
awk '{print $1, $2}'
^|
uniq
^|
grep upstream
^|
awk '{print $2}'`
>>
check_change_of_unittest
.sh
echo
if
[
"$origin_upstream_url"
==
""
];
then
>>
check_change_of_unittest
.sh
...
...
@@ -455,8 +460,6 @@ taskkill /f /im cvtres.exe 2>NUL
taskkill
/f /im
rc
.exe
2
>
NUL
wmic
process
where
name
=
"op_function_generator.exe"
call
terminate
2
>
NUL
taskkill
/f /im
python
.exe
2
>
NUL
call
paddle_winci
\Scripts\deactivate.bat
2
>
NUL
del
%PADDLE_WHL_FILE_WIN%
taskkill
/f /im
python
.exe
2
>
NUL
echo
Windows
CI
run
successfully
!
exit
/b
0
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
d0c9f7e8
...
...
@@ -988,11 +988,6 @@ set +x
fi
read
testcase
<<<
$(
echo
"
$line
"
|grep
-oEi
"
\w
+$"
)
if
python
$PADDLE_ROOT
/tools/is_ut_disabled.py
$testcase
;
then
echo
$testcase
" is disabled."
continue
fi
if
[[
"
$is_nightly
"
!=
""
]]
&&
[
${
NIGHTLY_MODE
:-
OFF
}
==
"OFF"
]
;
then
echo
$testcase
" will only run at night."
continue
...
...
python/paddle/distributed/fleet/base/distributed_strategy.py
浏览文件 @
d0c9f7e8
...
...
@@ -845,6 +845,29 @@ class DistributedStrategy(object):
check_configs_key
(
self
.
strategy
.
dgc_configs
,
configs
,
"dgc_configs"
)
assign_configs_value
(
self
.
strategy
.
dgc_configs
,
configs
)
@property
def fp16_allreduce(self):
    """
    Whether gradients are allreduced in fp16 during training.

    Default Value: False

    Examples:
      .. code-block:: python

        import paddle.distributed.fleet as fleet
        strategy = fleet.DistributedStrategy()
        strategy.fp16_allreduce = True # by default this is false

    """
    return self.strategy.fp16_allreduce

@fp16_allreduce.setter
@is_strict_auto
def fp16_allreduce(self, flag):
    # Only a real bool is stored; anything else is rejected.
    if isinstance(flag, bool):
        self.strategy.fp16_allreduce = flag
        return
    raise TypeError('fp16_allreduce must be value of bool type')
@
property
def
gradient_merge
(
self
):
"""
...
...
python/paddle/distributed/fleet/meta_optimizers/__init__.py
浏览文件 @
d0c9f7e8
...
...
@@ -23,3 +23,4 @@ from .lars_optimizer import LarsOptimizer
from
.parameter_server_graph_optimizer
import
ParameterServerGraphOptimizer
from
.dgc_optimizer
import
DGCOptimizer
from
.lamb_optimizer
import
LambOptimizer
from
.fp16_allreduce_optimizer
import
FP16AllReduceOptimizer
python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
0 → 100755
浏览文件 @
d0c9f7e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
paddle.fluid
import
core
,
framework
,
unique_name
from
.meta_optimizer_base
import
MetaOptimizerBase
class FP16AllReduceOptimizer(MetaOptimizerBase):
    """Meta optimizer that routes fp32 gradients through fp16 casts.

    ``apply_optimize`` first rewrites the ``(param, grad)`` list with
    ``fp16_compression`` (fp32 -> fp16 cast, then fp16 -> fp32 cast) and then
    delegates to the wrapped optimizer's ``apply_optimize``.
    """

    def __init__(self, optimizer):
        super(FP16AllReduceOptimizer, self).__init__(optimizer)
        self.inner_opt = optimizer
        # we do not allow meta optimizer to be inner optimizer currently
        self.meta_optimizers_white_list = [
            "LarsOptimizer",
            "LambOptimizer",
            "RecomputeOptimizer",
            "LocalSGDOptimizer",
            "GradientMergeOptimizer",
            "GraphExecutionOptimizer",
            "AdaptiveLocalSGDOptimizer",
        ]
        self.meta_optimizers_black_list = ["DGCOptimizer"]

    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                        user_defined_strategy):
        # Pure pass-through to the base class.
        super(FP16AllReduceOptimizer, self)._set_basic_info(
            loss, role_maker, user_defined_optimizer, user_defined_strategy)

    def _can_apply(self):
        """Return True only for collective training with fp16_allreduce on."""
        if (self.role_maker._is_collective and
                self.user_defined_strategy.fp16_allreduce):
            return True
        return False

    def _disable_strategy(self, dist_strategy):
        dist_strategy.fp16_allreduce = False

    def _enable_strategy(self, dist_strategy, context=None):
        dist_strategy.fp16_allreduce = True

    @staticmethod
    def fp16_compression(param_and_grads):
        """
        Compress fp32 gradients to fp16 during allreduce.

        Args:
            param_and_grads: iterable of ``(param, grad)`` pairs.

        Returns:
            A list of ``(param, grad)`` pairs in the same order. A pair is
            returned unchanged when its grad is None, is not fp32, or when
            the param name is not listed in the op_role_var attribute of the
            op that produced the grad. Otherwise the grad is replaced by a
            new fp32 variable obtained by casting the grad to fp16 and back.
        """
        op_maker = core.op_proto_and_checker_maker
        role_var_key = op_maker.kOpRoleVarAttrName()
        fp32 = core.VarDesc.VarType.FP32
        fp16 = core.VarDesc.VarType.FP16

        # First pass: cast grad from fp32 -> fp16 before allreduce.
        staged = []  # (param, grad, is_cast)
        for param, grad in param_and_grads:
            if grad is None or grad.dtype != fp32:
                staged.append((param, grad, False))
                continue

            producer = grad.op
            block = grad.block
            role_vars = producer.all_attrs()[role_var_key]
            if param.name not in role_vars:
                staged.append((param, grad, False))
                continue

            # Remove (param, grad) from op_role_var of the producing op; the
            # pair is re-registered on the cast op below with the fp16 grad.
            role_vars.remove(param.name)
            role_vars.remove(grad.name)
            if len(role_vars) > 1:
                producer._set_attr(role_var_key, role_vars)
            else:
                producer._remove_attr(role_var_key)

            fp16_grad = block.create_var(
                name=unique_name.generate(grad.name + ".cast_fp16"),
                dtype=fp16,
                persistable=False,
                stop_gradient=True)

            with block.program._backward_role_guard():
                cast_op = block.append_op(
                    type="cast",
                    inputs={"X": grad},
                    outputs={"Out": fp16_grad},
                    attrs={"in_dtype": fp32,
                           "out_dtype": fp16},
                    stop_gradient=True)

                cast_op._set_attr(op_maker.kOpRoleAttrName(),
                                  op_maker.OpRole.Backward)
                cast_op._set_attr(role_var_key,
                                  [param.name, fp16_grad.name])
                fp16_grad.op = cast_op

            staged.append((param, fp16_grad, True))

        # Second pass: cast grad from fp16 -> fp32 after allreduce.
        # NOTE: fp16 compression is deliberately split into two loops. If
        # the two casts are emitted in a single loop, fuse allreduce goes
        # wrong. This looks like a problem in the fuse allreduce pass and
        # should be fixed there in the future.
        result = []
        for param, grad, was_cast in staged:
            if not was_cast:
                result.append((param, grad))
                continue

            block = grad.block
            fp32_grad = block.create_var(
                name=unique_name.generate(grad.name + ".cast_fp32"),
                dtype=fp32,
                persistable=False,
                stop_gradient=True)

            with block.program._optimized_guard(
                [param, grad]), framework.name_scope('fp16_allreduce'):
                block.append_op(
                    type="cast",
                    inputs={"X": grad},
                    outputs={"Out": fp32_grad},
                    attrs={"in_dtype": fp16,
                           "out_dtype": fp32},
                    stop_gradient=True)

            result.append((param, fp32_grad))

        return result

    def apply_optimize(self, loss, startup_program, params_grads):
        """Run fp16_compression on the grads, then the inner optimizer."""
        return self.inner_opt.apply_optimize(
            loss,
            startup_program=startup_program,
            params_grads=self.fp16_compression(params_grads))
python/paddle/fluid/executor.py
浏览文件 @
d0c9f7e8
...
...
@@ -1355,7 +1355,7 @@ class Executor(object):
if
not
program
.
_fleet_opt
is
None
:
if
program
.
_fleet_opt
.
get
(
"worker_class"
,
""
)
==
"HeterCpuWorker"
:
is_heter
=
1
if
program
.
_fleet_opt
(
"trainer"
,
""
)
==
"HeterXpuTrainer"
:
if
program
.
_fleet_opt
.
get
(
"trainer"
,
""
)
==
"HeterXpuTrainer"
:
is_heter
=
1
if
scope
is
None
:
scope
=
global_scope
()
...
...
python/paddle/fluid/reader.py
浏览文件 @
d0c9f7e8
...
...
@@ -167,10 +167,10 @@ class DataLoader(object):
The variables should be created by :code:`fluid.data()`.
:attr:`feed_list` must be set if :attr:`return_list` is
False. Default None.
places(list(Place)|tuple(Place)
): a list of Place, to put data
onto, :attr:`places` must be set in both static graph and
dynamic graph mode, in dynamic graph mode, place number must
be 1
. Default None.
places(list(Place)|tuple(Place)
|optional): a list of Place on which
to put the data. :attr:`places` can be None; if
:attr:`places` is None, the default place (CPUPlace or CUDAPlace(0))
will be used
. Default None.
return_list (bool): whether the return value on each device is
presented as a list. If :attr:`return_list=False`, the return
value on each device would be a dict of str -> LoDTensor, where
...
...
@@ -222,6 +222,8 @@ class DataLoader(object):
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.io import Dataset, BatchSampler, DataLoader
...
...
@@ -247,11 +249,48 @@ class DataLoader(object):
def __len__(self):
return self.num_samples
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
# get places
places = fluid.cuda_places() if USE_GPU else fluid.cpu_places()
# --------------------- dygraph mode --------------------
class SimpleNet(fluid.dygraph.Layer):
def __init__(self):
super(SimpleNet, self).__init__()
self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax')
def forward(self, image, label=None):
return self.fc(image)
with fluid.dygraph.guard(places[0]):
simple_net = SimpleNet()
opt = fluid.optimizer.SGD(learning_rate=1e-3,
parameter_list=simple_net.parameters())
loader = DataLoader(dataset,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
for e in range(EPOCH_NUM):
for i, (image, label) in enumerate(loader()):
out = simple_net(image)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.reduce_mean(loss)
avg_loss.backward()
opt.minimize(avg_loss)
simple_net.clear_gradients()
print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
# -------------------------------------------------------
# -------------------- static graph ---------------------
paddle.enable_static()
def simple_net(image, label):
fc_tmp = fluid.layers.fc(image, size=CLASS_NUM, act='softmax')
cross_entropy = fluid.layers.softmax_with_cross_entropy(image, label)
...
...
@@ -270,11 +309,8 @@ class DataLoader(object):
prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
loader = DataLoader(dataset,
feed_list=[image, label],
places=places,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
...
...
@@ -287,39 +323,6 @@ class DataLoader(object):
# -------------------------------------------------------
# --------------------- dygraph mode --------------------
class SimpleNet(fluid.dygraph.Layer):
def __init__(self):
super(SimpleNet, self).__init__()
self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax')
def forward(self, image, label=None):
return self.fc(image)
with fluid.dygraph.guard(places[0]):
simple_net = SimpleNet()
opt = fluid.optimizer.SGD(learning_rate=1e-3,
parameter_list=simple_net.parameters())
loader = DataLoader(dataset,
places=places[0],
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
for e in range(EPOCH_NUM):
for i, (image, label) in enumerate(loader()):
out = simple_net(image)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.reduce_mean(loss)
avg_loss.backward()
opt.minimize(avg_loss)
simple_net.clear_gradients()
print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
# -------------------------------------------------------
.. note::
For reading iterable dataset with multiprocess Dataloader,
...
...
@@ -356,11 +359,9 @@ class DataLoader(object):
"feed_list should be set when return_list=False"
self
.
feed_list
=
feed_list
assert
places
is
not
None
,
"places cannot be None"
if
places
is
None
:
places
=
_current_expected_place
()
self
.
places
=
_convert_places
(
places
)
if
in_dygraph_mode
():
assert
len
(
self
.
places
)
==
1
,
\
"Number of places must be 1 in dygraph mode"
assert
num_workers
>=
0
,
"num_workers should be a non-negative value"
if
num_workers
>
0
and
(
sys
.
platform
==
'darwin'
or
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
d0c9f7e8
...
...
@@ -45,6 +45,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_fp16_allreduce_meta_optimizer
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_private_function
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base
)
...
...
@@ -334,9 +335,6 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op)
# disable this unittest temporarily
list
(
REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception
)
list
(
REMOVE_ITEM TEST_OPS test_sampling_id_op
)
list
(
REMOVE_ITEM TEST_OPS test_paddle_save_load
)
if
(
APPLE OR WIN32
)
list
(
REMOVE_ITEM TEST_OPS test_dataset
)
...
...
@@ -458,6 +456,7 @@ if(WITH_DISTRIBUTE)
py_test_modules
(
test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_private_function MODULES test_fleet_private_function ENVS
${
dist_ENVS
}
)
py_test_modules
(
test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS
${
dist_ENVS
}
)
...
...
python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py
0 → 100644
浏览文件 @
d0c9f7e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
paddle
import
paddle.fluid
as
fluid
from
paddle.distributed.fleet.meta_optimizers
import
FP16AllReduceOptimizer
as
FP16AllReduce
from
test_dist_base
import
TestDistRunnerBase
,
runtime_main
from
dist_mnist
import
cnn_model
DTYPE
=
"float32"
paddle
.
dataset
.
mnist
.
fetch
()
# Fix seed for test
fluid
.
default_startup_program
().
random_seed
=
1
fluid
.
default_main_program
().
random_seed
=
1
class TestDistMnist2x2(TestDistRunnerBase):
    """Distributed MNIST runner whose optimizer is wrapped in FP16AllReduce."""

    def get_model(self, batch_size=2):
        """Build the MNIST CNN training program.

        Args:
            batch_size: batch size used for both readers.

        Returns:
            The tuple ``(inference_program, avg_cost, train_reader,
            test_reader, batch_acc, predict)``.
        """
        # Input placeholders.
        images = fluid.layers.data(
            name='pixel', shape=[1, 28, 28], dtype=DTYPE)
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')

        # Forward pass and averaged loss.
        predict = cnn_model(images)
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(x=cost)

        # Accuracy evaluator.
        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
        batch_acc = fluid.layers.accuracy(
            input=predict, label=label, total=batch_size_tensor)

        # Clone the main program before minimize() is called on it.
        inference_program = fluid.default_main_program().clone()

        # Momentum optimizer wrapped by the fp16 allreduce meta optimizer.
        momentum = fluid.optimizer.MomentumOptimizer(
            learning_rate=0.001, momentum=0.9)
        wrapped_opt = FP16AllReduce(momentum)

        # Both readers are built from paddle.dataset.mnist.test().
        train_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)

        wrapped_opt.minimize(avg_cost)
        return (inference_program, avg_cost, train_reader, test_reader,
                batch_acc, predict)
if
__name__
==
"__main__"
:
runtime_main
(
TestDistMnist2x2
)
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
0 → 100644
浏览文件 @
d0c9f7e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
inference_pass_test
import
InferencePassTest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.core
import
AnalysisConfig
from
paddle.fluid.core
import
PassVersionChecker
class ConvActivationMkldnnFusePassTest(InferencePassTest):
    """conv2d + activation inference test with ``enable_mkldnn`` set.

    Subclasses override ``set_params`` to pick the conv configuration, the
    activation and the pass name checked by ``test_pass_compatible``.
    """

    def setUp(self):
        """Build a single conv2d layer from the values set by ``set_params``."""
        self.set_params()
        with fluid.program_guard(self.main_program, self.startup_program):
            data = fluid.data(
                name="data", shape=[-1, 3, 100, 100], dtype="float32")
            conv_kwargs = dict(
                num_filters=self.conv_num_filters,
                filter_size=self.conv_filter_size,
                bias_attr=self.conv_bias_attr,
                act=self.act)
            conv_out = fluid.layers.conv2d(data, **conv_kwargs)

        input_batch = np.random.random((1, 3, 100, 100)).astype("float32")
        self.feeds = {"data": input_batch}
        self.fetch_list = [conv_out]
        self.enable_mkldnn = True

    def set_params(self):
        """Default case: 3 filters of size 3, no bias, ReLU."""
        self.conv_num_filters = self.conv_filter_size = 3
        self.conv_bias_attr = False
        self.act = "relu"
        self.pass_name = 'conv_relu_mkldnn_fuse_pass'

    def test_check_output(self):
        """Run the output check with ``use_gpu`` set to False."""
        self.check_output_with_option(False)

    def test_pass_compatible(self):
        """Assert that PassVersionChecker reports ``pass_name`` compatible."""
        compatible = PassVersionChecker.IsCompatible(self.pass_name)
        self.assertTrue(compatible)
class ConvActivationMkldnnFusePassTest_1(ConvActivationMkldnnFusePassTest):
    """Variant: 5 filters of size 5, with bias, ReLU activation."""

    def set_params(self):
        """Set the conv configuration, activation and pass name for this case."""
        self.conv_num_filters = self.conv_filter_size = 5
        self.conv_bias_attr = True
        self.act, self.pass_name = "relu", 'conv_relu_mkldnn_fuse_pass'
class ConvActivationMkldnnFusePassTest_2(ConvActivationMkldnnFusePassTest):
    """Variant: 3 filters of size 3, no bias, leaky ReLU activation."""

    def set_params(self):
        """Set the conv configuration, activation and pass name for this case."""
        self.conv_num_filters = self.conv_filter_size = 3
        self.conv_bias_attr = False
        self.act, self.pass_name = ("leaky_relu",
                                    'conv_leaky_relu_mkldnn_fuse_pass')
class ConvActivationMkldnnFusePassTest_3(ConvActivationMkldnnFusePassTest):
    """Variant: 5 filters of size 5, with bias, leaky ReLU activation."""

    def set_params(self):
        """Set the conv configuration, activation and pass name for this case."""
        self.conv_num_filters = self.conv_filter_size = 5
        self.conv_bias_attr = True
        self.act, self.pass_name = ("leaky_relu",
                                    'conv_leaky_relu_mkldnn_fuse_pass')
class ConvActivationMkldnnFusePassTest_4(ConvActivationMkldnnFusePassTest):
    """Variant: 3 filters of size 3, no bias, ReLU6 activation."""

    def set_params(self):
        """Set the conv configuration, activation and pass name for this case."""
        self.conv_num_filters = self.conv_filter_size = 3
        self.conv_bias_attr = False
        self.act, self.pass_name = "relu6", 'conv_relu6_mkldnn_fuse_pass'
class ConvActivationMkldnnFusePassTest_5(ConvActivationMkldnnFusePassTest):
    """Variant: 5 filters of size 5, with bias, swish activation.

    This class was previously also named ``ConvActivationMkldnnFusePassTest_4``.
    The second definition rebound that module-level name, so the relu6 case
    defined under the same name was shadowed and never collected by unittest.
    The distinct ``_5`` suffix lets both cases run.
    """

    def set_params(self):
        """Set the conv configuration, activation and pass name for this case."""
        self.conv_num_filters = 5
        self.conv_filter_size = 5
        self.conv_bias_attr = True
        self.act = "swish"
        self.pass_name = 'conv_swish_mkldnn_fuse_pass'
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py
0 → 100644
浏览文件 @
d0c9f7e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
inference_pass_test
import
InferencePassTest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.core
import
AnalysisConfig
from
paddle.fluid.core
import
PassVersionChecker
class ConvConcatReluMkldnnFusePassTest_0(InferencePassTest):
    """Two conv2d branches -> concat -> relu, with ``enable_mkldnn`` set.

    Subclasses override ``set_params`` to change either conv branch, the
    concat axis, or the pass name checked by ``test_pass_compatible``.
    """

    def setUp(self):
        """Build the two-branch network from the values set by ``set_params``."""
        self.set_params()
        input_shape = [-1, 3, 100, 100]
        with fluid.program_guard(self.main_program, self.startup_program):
            data_1 = fluid.data(
                name="data_1", shape=input_shape, dtype="float32")
            data_2 = fluid.data(
                name="data_2", shape=input_shape, dtype="float32")
            conv_1 = fluid.layers.conv2d(
                data_1,
                num_filters=self.conv1_num_filters,
                filter_size=self.conv1_filter_size,
                padding=self.conv1_padding,
                bias_attr=self.conv1_bias_attr)
            conv_2 = fluid.layers.conv2d(
                data_2,
                num_filters=self.conv2_num_filters,
                filter_size=self.conv2_filter_size,
                padding=self.conv2_padding,
                bias_attr=self.conv2_bias_attr)
            merged = fluid.layers.concat(
                [conv_1, conv_2], axis=self.concat_axis)
            out = fluid.layers.relu(merged)

        # Sample data_1 before data_2 so the random draws keep their order.
        first_input = np.random.random((1, 3, 100, 100)).astype("float32")
        second_input = np.random.random((1, 3, 100, 100)).astype("float32")
        self.feeds = {"data_1": first_input, "data_2": second_input}
        self.fetch_list = [out]
        self.enable_mkldnn = True

    def set_params(self):
        """Default case: two identical 3x3 convs, no bias, concat on axis 0."""
        self.conv1_num_filters = self.conv1_filter_size = 3
        self.conv1_padding = 0
        self.conv1_bias_attr = False
        self.conv2_num_filters = self.conv2_filter_size = 3
        self.conv2_padding = 0
        self.conv2_bias_attr = False
        self.concat_axis = 0
        self.pass_name = "conv_concat_relu_mkldnn_fuse_pass"

    def test_check_output(self):
        """Run the output check with ``use_gpu`` set to False."""
        self.check_output_with_option(False)

    def test_pass_compatible(self):
        """Assert that PassVersionChecker reports ``pass_name`` compatible."""
        compatible = PassVersionChecker.IsCompatible(self.pass_name)
        self.assertTrue(compatible)
class ConvConcatReluMkldnnFusePassTest_1(ConvConcatReluMkldnnFusePassTest_0):
    """Variant: differing conv branches, concatenated on axis 1."""

    def set_params(self):
        """First conv: 3x3, no padding, no bias. Second: 5x5, padding 1, bias."""
        self.conv1_num_filters = self.conv1_filter_size = 3
        self.conv1_padding, self.conv1_bias_attr = 0, False
        self.conv2_num_filters = self.conv2_filter_size = 5
        self.conv2_padding, self.conv2_bias_attr = 1, True
        self.concat_axis = 1
        self.pass_name = "conv_concat_relu_mkldnn_fuse_pass"
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py
0 → 100644
浏览文件 @
d0c9f7e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
inference_pass_test
import
InferencePassTest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.core
import
AnalysisConfig
from
paddle.fluid.core
import
PassVersionChecker
class MatmulTransposeReshapeMkldnnFusePassTest(InferencePassTest):
    """matmul -> transpose -> reshape inference test with ``enable_mkldnn`` set.

    Subclasses override ``set_params`` to change shapes, transpose flags,
    the permutation, the reshape target and the feed data.
    """

    def setUp(self):
        """Build the three-op network from the values set by ``set_params``."""
        self.set_params()
        with fluid.program_guard(self.main_program, self.startup_program):
            data = fluid.data(
                name="data", shape=self.data_shape, dtype="float32")
            weight = fluid.layers.create_parameter(
                shape=self.weight_shape, dtype="float32")
            product = fluid.layers.matmul(
                data,
                weight,
                transpose_x=self.transpose_x,
                transpose_y=self.transpose_y)
            permuted = fluid.layers.transpose(product, self.tranpose_perm)
            reshaped = fluid.layers.reshape(
                permuted, shape=self.reshape_shape)

        self.fetch_list = [reshaped]
        self.enable_mkldnn = True

    def set_params(self):
        """Default case: (., 3, 100, 110) x (1, 3, 110, 100), no transposes."""
        self.data_shape = [-1, 3, 100, 110]
        self.weight_shape = [1, 3, 110, 100]
        sample = np.random.random((1, 3, 100, 110)).astype("float32")
        self.feeds = {"data": sample}
        self.transpose_x = self.transpose_y = False
        self.tranpose_perm = [0, 2, 1, 3]
        self.reshape_shape = [3, 100, 100]
        self.pass_name = 'matmul_transpose_reshape_fuse_pass'

    def test_check_output(self):
        """Run the output check with ``use_gpu`` set to False."""
        self.check_output_with_option(False)

    def test_pass_compatible(self):
        """Assert that PassVersionChecker reports ``pass_name`` compatible."""
        compatible = PassVersionChecker.IsCompatible(self.pass_name)
        self.assertTrue(compatible)
class MatmulTransposeReshapeMkldnnFusePassTest_1(
        MatmulTransposeReshapeMkldnnFusePassTest):
    """Variant: square operands with both matmul inputs transposed."""

    def set_params(self):
        """Set shapes, feed data, transpose flags and the reshape target."""
        self.data_shape = [-1, 3, 100, 100]
        self.weight_shape = [1, 3, 100, 100]
        sample = np.random.random((1, 3, 100, 100)).astype("float32")
        self.feeds = {"data": sample}
        self.transpose_x = self.transpose_y = True
        self.tranpose_perm = [0, 2, 1, 3]
        self.reshape_shape = [6, 50, 100]
        self.pass_name = 'matmul_transpose_reshape_fuse_pass'
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py
0 → 100644
浏览文件 @
d0c9f7e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
from
inference_pass_test
import
InferencePassTest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.core
import
AnalysisConfig
from
paddle.fluid.core
import
PassVersionChecker
class ScaleMatmulMkldnnFusePassTest(InferencePassTest):
    """scale -> matmul inference test with ``enable_mkldnn`` set.

    Subclasses override ``set_params`` to change the scale factor, the
    matmul transpose flags and the feed data.
    """

    def setUp(self):
        """Build the scale + matmul network from the ``set_params`` values."""
        self.set_params()
        tensor_shape = [1, 3, 100, 100]
        with fluid.program_guard(self.main_program, self.startup_program):
            data = fluid.data(
                name="data", shape=tensor_shape, dtype="float32")
            weight = fluid.layers.create_parameter(
                shape=tensor_shape, dtype="float32")
            scaled = fluid.layers.scale(data, scale=self.scale_scale)
            product = fluid.layers.matmul(
                scaled,
                weight,
                transpose_x=self.transpose_x,
                transpose_y=self.transpose_y)

        self.fetch_list = [product]
        self.enable_mkldnn = True

    def set_params(self):
        """Default case: scale factor 2.0, no transposes."""
        sample = np.random.random((1, 3, 100, 100)).astype("float32")
        self.feeds = {"data": sample}
        self.scale_scale = 2.0
        self.transpose_x = self.transpose_y = False
        self.pass_name = "scale_matmul_fuse_pass"

    def test_check_output(self):
        """Run the output check with ``use_gpu`` set to False."""
        self.check_output_with_option(False)

    def test_pass_compatible(self):
        """Assert that PassVersionChecker reports ``pass_name`` compatible."""
        compatible = PassVersionChecker.IsCompatible(self.pass_name)
        self.assertTrue(compatible)
class ScaleMatmulMkldnnFusePassTest_1(ScaleMatmulMkldnnFusePassTest):
    """Variant: scale factor 5.0 with both matmul inputs transposed."""

    def set_params(self):
        """Set the feed data, scale factor, transpose flags and pass name."""
        sample = np.random.random((1, 3, 100, 100)).astype("float32")
        self.feeds = {"data": sample}
        self.scale_scale = 5.0
        self.transpose_x = self.transpose_y = True
        self.pass_name = "scale_matmul_fuse_pass"
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/ir/inference/test_
mkldnn_conv_relu
_fuse_pass.py
→
python/paddle/fluid/tests/unittests/ir/inference/test_
seq_concat_fc
_fuse_pass.py
浏览文件 @
d0c9f7e8
...
...
@@ -20,25 +20,13 @@ from inference_pass_test import InferencePassTest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.core
import
AnalysisConfig
from
paddle.fluid.core
import
PassVersionChecker
class
ConvBnFusePassMKLDNNTest
(
InferencePassTest
):
def
setUp
(
self
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
data
=
fluid
.
data
(
name
=
"data"
,
shape
=
[
-
1
,
3
,
100
,
100
],
dtype
=
"float32"
)
conv_out
=
fluid
.
layers
.
conv2d
(
data
,
num_filters
=
3
,
filter_size
=
3
,
bias_attr
=
False
,
act
=
"relu"
)
self
.
feeds
=
{
"data"
:
np
.
random
.
random
((
1
,
3
,
100
,
100
)).
astype
(
"float32"
)
}
self
.
fetch_list
=
[
conv_out
]
self
.
enable_mkldnn
=
True
def
test_check_output
(
self
):
use_gpu
=
False
self
.
check_output_with_option
(
use_gpu
)
class SeqConcatFCFusePassTest(InferencePassTest):
    """Version-compatibility check for ``seq_concat_fc_fuse_pass``."""

    def test_compatible(self):
        """Assert that PassVersionChecker reports the pass as compatible."""
        compatible = PassVersionChecker.IsCompatible('seq_concat_fc_fuse_pass')
        self.assertTrue(compatible)
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
浏览文件 @
d0c9f7e8
...
...
@@ -147,5 +147,29 @@ class TestSquareDoubleGradCheck(unittest.TestCase):
self
.
func
(
p
)
class TestAbsDoubleGradCheck(unittest.TestCase):
    """Double-gradient check for ``layers.abs``."""

    @prog_scope()
    def func(self, place):
        """Build ``y = abs(x)`` and run the double-grad check on ``place``.

        Args:
            place: the place passed through to
                ``gradient_checker.double_grad_check``.
        """
        # The shape of the input variable must be fully specified; it must
        # not include -1.
        shape = [2, 3, 7, 9]
        eps = 1e-6
        dtype = np.float64

        x = layers.data('x', shape, False, dtype)
        x.persistable = True
        y = layers.abs(x)
        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
        # abs is not differentiable at 0, so a finite-difference gradient is
        # unreliable for samples that lie next to 0. Move such samples to a
        # value safely away from the kink (0.005 is a conservative margin).
        x_arr[np.abs(x_arr) < 0.005] = 0.02

        gradient_checker.double_grad_check(
            [x], y, x_init=x_arr, place=place, eps=eps)

    def test_grad(self):
        """Run the check on CPU, and on CUDA device 0 when compiled with CUDA."""
        places = [fluid.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(fluid.CUDAPlace(0))
        for p in places:
            self.func(p)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_activation_op.py
浏览文件 @
d0c9f7e8
...
...
@@ -28,6 +28,7 @@ from paddle.fluid import compiler, Program, program_guard
class
TestSqrtOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
(),
Program
()):
# The input type of sqrt op must be Variable or numpy.ndarray.
in1
=
1
...
...
@@ -44,6 +45,7 @@ class TestSqrtOpError(unittest.TestCase):
class
TestActivation
(
OpTest
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"exp"
self
.
init_dtype
()
self
.
init_kernel_type
()
...
...
@@ -71,6 +73,7 @@ class TestActivation(OpTest):
class
TestParameter
(
object
):
def
test_out_name
(
self
):
paddle
.
enable_static
()
with
fluid
.
program_guard
(
fluid
.
Program
()):
np_x
=
np
.
array
([
0.1
])
data
=
fluid
.
layers
.
data
(
name
=
"X"
,
shape
=
[
1
])
...
...
@@ -92,6 +95,7 @@ class TestParameter(object):
class
TestSigmoid
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"sigmoid"
self
.
init_dtype
()
...
...
@@ -112,6 +116,7 @@ class TestSigmoid(TestActivation):
class
TestLogSigmoid
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"logsigmoid"
self
.
init_dtype
()
...
...
@@ -180,6 +185,7 @@ class TestLogSigmoidAPI(unittest.TestCase):
class
TestTanh
(
TestActivation
,
TestParameter
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"tanh"
self
.
init_dtype
()
x
=
np
.
random
.
uniform
(
0.1
,
1
,
[
11
,
17
]).
astype
(
self
.
dtype
)
...
...
@@ -255,6 +261,7 @@ class TestTanhAPI(unittest.TestCase):
class
TestAtan
(
TestActivation
,
TestParameter
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"atan"
self
.
init_dtype
()
...
...
@@ -291,6 +298,7 @@ class TestAtan(TestActivation, TestParameter):
class
TestSinh
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"sinh"
self
.
init_dtype
()
...
...
@@ -349,6 +357,7 @@ class TestSinh(TestActivation):
class
TestSinhOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
sinh
,
1
)
...
...
@@ -362,6 +371,7 @@ class TestSinhOpError(unittest.TestCase):
class
TestCosh
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"cosh"
self
.
init_dtype
()
...
...
@@ -420,6 +430,7 @@ class TestCosh(TestActivation):
class
TestCoshOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
cosh
,
1
)
...
...
@@ -438,6 +449,7 @@ def ref_tanhshrink(x):
class
TestTanhshrink
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"tanh_shrink"
self
.
init_dtype
()
...
...
@@ -512,6 +524,7 @@ def ref_hardshrink(x, threshold):
class
TestHardShrink
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"hard_shrink"
self
.
init_dtype
()
...
...
@@ -541,6 +554,7 @@ class TestHardShrink_threshold_negative(TestHardShrink):
class
TestHardShrinkAPI
(
unittest
.
TestCase
):
# test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
x_np
=
np
.
random
.
uniform
(
-
1
,
1
,
[
10
,
12
]).
astype
(
'float32'
)
self
.
place
=
paddle
.
CUDAPlace
(
0
)
if
core
.
is_compiled_with_cuda
()
\
else
paddle
.
CPUPlace
()
...
...
@@ -662,6 +676,7 @@ def ref_softshrink(x, threshold=0.5):
class
TestSoftshrink
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"softshrink"
self
.
init_dtype
()
...
...
@@ -736,6 +751,7 @@ class TestSoftshrinkAPI(unittest.TestCase):
class
TestSqrt
(
TestActivation
,
TestParameter
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"sqrt"
self
.
init_dtype
()
...
...
@@ -753,6 +769,7 @@ class TestSqrt(TestActivation, TestParameter):
class
TestRsqrt
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"rsqrt"
self
.
init_dtype
()
...
...
@@ -770,6 +787,7 @@ class TestRsqrt(TestActivation):
class
TestAbs
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"abs"
self
.
init_dtype
()
...
...
@@ -792,6 +810,7 @@ class TestAbs(TestActivation):
class
TestCeil
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"ceil"
self
.
init_dtype
()
...
...
@@ -808,6 +827,7 @@ class TestCeil(TestActivation):
class
TestFloor
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"floor"
self
.
init_dtype
()
...
...
@@ -826,6 +846,7 @@ class TestFloor(TestActivation):
class
TestCos
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"cos"
self
.
init_dtype
()
...
...
@@ -843,6 +864,7 @@ class TestCos(TestActivation):
class
TestAcos
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"acos"
self
.
init_dtype
()
...
...
@@ -860,6 +882,7 @@ class TestAcos(TestActivation):
class
TestSin
(
TestActivation
,
TestParameter
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"sin"
self
.
init_dtype
()
...
...
@@ -877,6 +900,7 @@ class TestSin(TestActivation, TestParameter):
class
TestAsin
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"asin"
self
.
init_dtype
()
...
...
@@ -894,6 +918,7 @@ class TestAsin(TestActivation):
class
TestRound
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"round"
self
.
init_dtype
()
...
...
@@ -909,6 +934,7 @@ class TestRound(TestActivation):
class
TestRelu
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"relu"
self
.
init_dtype
()
...
...
@@ -979,6 +1005,7 @@ class TestLeakyRelu(TestActivation):
return
0.02
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"leaky_relu"
self
.
init_dtype
()
alpha
=
self
.
get_alpha
()
...
...
@@ -1084,6 +1111,7 @@ def gelu(x, approximate):
class
TestGeluApproximate
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"gelu"
self
.
init_dtype
()
approximate
=
True
...
...
@@ -1102,6 +1130,7 @@ class TestGeluApproximate(TestActivation):
class
TestGelu
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"gelu"
self
.
init_dtype
()
approximate
=
False
...
...
@@ -1169,6 +1198,7 @@ class TestGELUAPI(unittest.TestCase):
class
TestBRelu
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"brelu"
self
.
init_dtype
()
...
...
@@ -1194,6 +1224,7 @@ class TestBRelu(TestActivation):
class
TestBReluOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
brelu
,
1
)
...
...
@@ -1215,6 +1246,7 @@ def ref_relu6(x, threshold=6.0):
class
TestRelu6
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"relu6"
self
.
init_dtype
()
...
...
@@ -1286,6 +1318,7 @@ class TestRelu6API(unittest.TestCase):
class
TestHardSwish
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
'hard_swish'
self
.
init_dtype
()
...
...
@@ -1310,6 +1343,7 @@ class TestHardSwish(TestActivation):
class
TestHardSwishOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
hard_swish
,
1
)
...
...
@@ -1323,6 +1357,7 @@ class TestHardSwishOpError(unittest.TestCase):
class
TestSoftRelu
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"soft_relu"
self
.
init_dtype
()
...
...
@@ -1348,6 +1383,7 @@ class TestSoftRelu(TestActivation):
class
TestSoftReluOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
soft_relu
,
1
)
...
...
@@ -1366,6 +1402,7 @@ def elu(x, alpha):
class
TestELU
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"elu"
self
.
init_dtype
()
...
...
@@ -1435,6 +1472,7 @@ class TestELUAPI(unittest.TestCase):
class
TestReciprocal
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"reciprocal"
self
.
init_dtype
()
...
...
@@ -1452,6 +1490,7 @@ class TestReciprocal(TestActivation):
class
TestLog
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"log"
self
.
init_dtype
()
...
...
@@ -1478,6 +1517,7 @@ class TestLog(TestActivation):
class
TestLog1p
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"log1p"
self
.
init_dtype
()
...
...
@@ -1522,6 +1562,7 @@ class TestLog1p(TestActivation):
class
TestSquare
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"square"
self
.
init_dtype
()
...
...
@@ -1539,6 +1580,7 @@ class TestSquare(TestActivation):
class
TestPow
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"pow"
self
.
init_dtype
()
...
...
@@ -1557,6 +1599,7 @@ class TestPow(TestActivation):
class
TestPow_factor_tensor
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"pow"
self
.
init_dtype
()
...
...
@@ -1633,6 +1676,7 @@ class TestPow_factor_tensor(TestActivation):
class
TestSTanh
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"stanh"
self
.
init_dtype
()
...
...
@@ -1653,6 +1697,7 @@ class TestSTanh(TestActivation):
class
TestSTanhOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
stanh
,
1
)
...
...
@@ -1673,6 +1718,7 @@ def ref_softplus(x, beta=1, threshold=20):
class
TestSoftplus
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"softplus"
self
.
init_dtype
()
...
...
@@ -1751,6 +1797,7 @@ def ref_softsign(x):
class
TestSoftsign
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"softsign"
self
.
init_dtype
()
...
...
@@ -1818,6 +1865,7 @@ class TestSoftsignAPI(unittest.TestCase):
class
TestThresholdedRelu
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"thresholded_relu"
self
.
init_dtype
()
...
...
@@ -1841,6 +1889,7 @@ class TestThresholdedRelu(TestActivation):
class
TestThresholdedReluOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
thresholded_relu
,
1
)
...
...
@@ -1854,6 +1903,7 @@ class TestThresholdedReluOpError(unittest.TestCase):
class
TestHardSigmoid
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"hard_sigmoid"
self
.
init_dtype
()
...
...
@@ -1883,6 +1933,7 @@ class TestHardSigmoid(TestActivation):
class
TestHardSigmoidOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
hard_sigmoid
,
1
)
...
...
@@ -1896,6 +1947,7 @@ class TestHardSigmoidOpError(unittest.TestCase):
class
TestSwish
(
TestActivation
):
def
setUp
(
self
):
paddle
.
enable_static
()
self
.
op_type
=
"swish"
self
.
init_dtype
()
...
...
@@ -1915,6 +1967,7 @@ class TestSwish(TestActivation):
class
TestSwishOpError
(
unittest
.
TestCase
):
def
test_errors
(
self
):
paddle
.
enable_static
()
with
program_guard
(
Program
()):
# The input type must be Variable.
self
.
assertRaises
(
TypeError
,
fluid
.
layers
.
swish
,
1
)
...
...
python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
浏览文件 @
d0c9f7e8
...
...
@@ -115,8 +115,15 @@ class InplaceTestBase(unittest.TestCase):
fetch_val2
,
=
exe
.
run
(
compiled_prog
,
feed
=
feed_dict
,
fetch_list
=
[
fetch_var
])
self
.
assertTrue
(
np
.
array_equal
(
fetch_val1
,
fetch_val2
))
# NOTE(zhiqiu): Temporarily changed from array_equal to allclose.
# The root cause is that fuse_all_reduce and fuse_all_optimizer_ops may
# result in a diff because of the instruction set on the virtual machine.
# And the related unit tests: test_fuse_all_reduce_pass and test_fuse_optimizer_pass use "almostEqual" in their checks.
# There are also some related issues:
# https://github.com/PaddlePaddle/Paddle/issues/21270
# https://github.com/PaddlePaddle/Paddle/issues/21046
# https://github.com/PaddlePaddle/Paddle/issues/21045
self
.
assertTrue
(
np
.
allclose
(
fetch_val1
,
fetch_val2
))
def
check_multi_card_fetch_var
(
self
):
if
self
.
is_invalid_test
():
...
...
@@ -160,7 +167,8 @@ class InplaceTestBase(unittest.TestCase):
fetch_vals
.
append
(
fetch_val
)
for
item
in
fetch_vals
:
self
.
assertTrue
(
np
.
array_equal
(
fetch_vals
[
0
],
item
))
# same as above
self
.
assertTrue
(
np
.
allclose
(
fetch_vals
[
0
],
item
))
class
CUDAInplaceTest
(
InplaceTestBase
):
...
...
tools/is_ut_disabled
.py
→
python/paddle/fluid/tests/unittests/test_dist_mnist_fp16_allreduce
.py
浏览文件 @
d0c9f7e8
...
...
@@ -11,30 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Check whether ut is disabled. """
import
os
import
sys
from
__future__
import
print_function
import
unittest
from
test_dist_base
import
TestDistBase
def
check_ut
():
""" Get disabled unit tests. """
disable_ut_file
=
'disable_ut'
cmd
=
'wget -q --no-check-certificate https://sys-p0.bj.bcebos.com/prec/{}'
.
format
(
disable_ut_file
)
os
.
system
(
cmd
)
with
open
(
disable_ut_file
)
as
utfile
:
for
u
in
utfile
:
if
u
.
rstrip
(
'
\r\n
'
)
==
sys
.
argv
[
1
]:
exit
(
0
)
exit
(
1
)
class TestDistMnist2x2FP16AllReduce(TestDistBase):
    """Distributed MNIST test case driven by ``dist_mnist_fp16_allreduce.py``."""

    def _setup_config(self):
        """Set the configuration flags for this test case."""
        self._sync_mode = True
        self._use_reduce = False
        self._nccl2_mode = True

    def test_dist_train(self):
        """Run the distributed check; it is a no-op on builds without CUDA."""
        import paddle.fluid as fluid

        if not fluid.core.is_compiled_with_cuda():
            return
        self.check_with_place("dist_mnist_fp16_allreduce.py", delta=1e-5)
if
__name__
==
'__main__'
:
if
len
(
sys
.
argv
)
!=
2
:
exit
(
1
)
try
:
check_ut
()
except
Exception
as
e
:
print
(
e
)
exit
(
1
)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
浏览文件 @
d0c9f7e8
...
...
@@ -102,6 +102,16 @@ class TestStrategyConfig(unittest.TestCase):
strategy
.
dgc
=
"True"
self
.
assertEqual
(
strategy
.
dgc
,
False
)
def
test_fp16_allreduce
(
self
):
strategy
=
paddle
.
distributed
.
fleet
.
DistributedStrategy
()
strategy
.
fp16_allreduce
=
True
self
.
assertEqual
(
strategy
.
fp16_allreduce
,
True
)
strategy
.
fp16_allreduce
=
False
self
.
assertEqual
(
strategy
.
fp16_allreduce
,
False
)
with
self
.
assertRaises
(
TypeError
):
strategy
.
fp16_allreduce
=
"True"
self
.
assertEqual
(
strategy
.
fp16_allreduce
,
False
)
def
test_sync_nccl_allreduce
(
self
):
strategy
=
paddle
.
distributed
.
fleet
.
DistributedStrategy
()
strategy
.
sync_nccl_allreduce
=
True
...
...
python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py
0 → 100644
浏览文件 @
d0c9f7e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle.distributed.fleet
as
fleet
import
paddle.distributed.fleet.base.role_maker
as
role_maker
import
unittest
import
paddle
import
paddle.fluid
as
fluid
import
os
paddle
.
enable_static
()
class TestFleetFP16CompressOptimizer(unittest.TestCase):
    """Checks which ops the fleet ``fp16_allreduce`` strategy adds to a program."""

    def setUp(self):
        # Single-trainer environment variables; presumably consumed by
        # PaddleCloudRoleMaker in the tests below -- TODO confirm.
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"

    def net(self, main_prog, startup_prog, dtype='float32'):
        """Build a small FC classifier inside the given programs.

        Args:
            main_prog: program that receives the network ops.
            startup_prog: program that receives the initialisation ops.
            dtype: dtype of the "x" input data layer.

        Returns:
            A ``(avg_cost, strategy)`` tuple: the mean cross-entropy loss and
            a ``DistributedStrategy`` with ``fp16_allreduce`` set to True.
        """
        nn = paddle.fluid.layers
        with fluid.program_guard(main_prog, startup_prog):
            features = nn.data(name="x", shape=[32], dtype=dtype)
            labels = nn.data(name="y", shape=[1], dtype='int64')

            hidden_1 = nn.fc(input=features, size=64, act='tanh')
            hidden_2 = nn.fc(input=hidden_1, size=64, act='tanh')
            prediction = nn.fc(input=[hidden_2], size=2, act='softmax')
            cost = nn.cross_entropy(input=prediction, label=labels)
            avg_cost = nn.mean(x=cost)

            strategy = paddle.distributed.fleet.DistributedStrategy()
            strategy.fp16_allreduce = True
        return avg_cost, strategy

    def _minimize_with_fleet(self, dtype='float32'):
        """Initialise fleet, build the net and minimise it with fleet-wrapped SGD.

        Returns the loss variable so callers can inspect the ops of its block.
        """
        fleet.init(role_maker.PaddleCloudRoleMaker(is_collective=True))
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        avg_cost, strategy = self.net(train_prog, startup_prog, dtype=dtype)

        sgd = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        dist_sgd = fleet.distributed_optimizer(sgd, strategy=strategy)
        dist_sgd.minimize(avg_cost)
        return avg_cost

    def test_fp16_allreduce_optimizer(self):
        """A float32 net gets 12 cast ops, each writing a 'cast_fp16' output."""
        avg_cost = self._minimize_with_fleet()
        block_ops = avg_cost.block.ops

        op_types = [op.type for op in block_ops]
        cast_outputs = [
            op.output('Out')[0] for op in block_ops if op.type == 'cast'
        ]

        self.assertIn('cast', op_types)
        # 6 + 6, cast_fp16 + cast_fp32
        self.assertEqual(op_types.count('cast'), 12)

        for out_name in cast_outputs:
            self.assertIn('cast_fp16', out_name)

    def test_fp16_allreduce_not_apply_fp16_net(self):
        """A float16 input net gets no cast ops."""
        avg_cost = self._minimize_with_fleet(dtype='float16')
        op_types = [op.type for op in avg_cost.block.ops]
        self.assertNotIn('cast', op_types)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
浏览文件 @
d0c9f7e8
...
...
@@ -56,7 +56,10 @@ class TestDygraphGroupNormv2(unittest.TestCase):
x
=
np
.
random
.
randn
(
*
shape
).
astype
(
"float32"
)
y1
=
compute_v1
(
x
)
y2
=
compute_v2
(
x
)
self
.
assertTrue
(
np
.
allclose
(
y1
,
y2
))
result
=
np
.
allclose
(
y1
,
y2
)
if
not
result
:
print
(
"y1:"
,
y1
,
"
\t
y2:"
,
y2
)
self
.
assertTrue
(
result
)
test_weight_bias_false
()
def
test_static
(
self
):
...
...
python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
浏览文件 @
d0c9f7e8
...
...
@@ -25,6 +25,9 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from
paddle.fluid.dygraph.base
import
to_variable
from
test_imperative_base
import
new_program_scope
if
fluid
.
is_compiled_with_cuda
():
fluid
.
set_flags
({
'FLAGS_cudnn_deterministic'
:
True
})
batch_size
=
8
train_parameters
=
{
"input_size"
:
[
3
,
224
,
224
],
...
...
@@ -340,7 +343,9 @@ class TestImperativeResneXt(unittest.TestCase):
label
.
stop_gradient
=
True
out
=
se_resnext
(
img
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
out
,
label
=
label
)
softmax_out
=
fluid
.
layers
.
softmax
(
out
,
use_cudnn
=
False
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
softmax_out
,
label
=
label
)
avg_loss
=
fluid
.
layers
.
mean
(
x
=
loss
)
dy_out
=
avg_loss
.
numpy
()
...
...
@@ -386,7 +391,8 @@ class TestImperativeResneXt(unittest.TestCase):
name
=
'pixel'
,
shape
=
[
3
,
224
,
224
],
dtype
=
'float32'
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
out
=
se_resnext
(
img
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
out
,
label
=
label
)
softmax_out
=
fluid
.
layers
.
softmax
(
out
,
use_cudnn
=
False
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
softmax_out
,
label
=
label
)
avg_loss
=
fluid
.
layers
.
mean
(
x
=
loss
)
optimizer
.
minimize
(
avg_loss
)
...
...
@@ -443,7 +449,9 @@ class TestImperativeResneXt(unittest.TestCase):
static_grad_value
[
static_grad_name_list
[
i
-
grad_start_pos
]]
=
out
[
i
]
self
.
assertTrue
(
np
.
allclose
(
static_out
,
dy_out
))
self
.
assertTrue
(
np
.
allclose
(
static_out
,
dy_out
),
"
\n
static_out: {}
\n
dy_out: {}"
.
format
(
static_out
,
dy_out
))
self
.
assertEqual
(
len
(
dy_param_init_value
),
len
(
static_param_init_value
))
...
...
@@ -455,16 +463,23 @@ class TestImperativeResneXt(unittest.TestCase):
self
.
assertEqual
(
len
(
dy_grad_value
),
len
(
static_grad_value
))
for
key
,
value
in
six
.
iteritems
(
static_grad_value
):
self
.
assertTrue
(
np
.
allclose
(
value
,
dy_grad_value
[
key
]))
self
.
assertTrue
(
np
.
allclose
(
value
,
dy_grad_value
[
key
]),
"
\n
static_grad_value: {}
\n
dy_grad_value: {}"
.
format
(
value
,
dy_grad_value
[
key
]))
self
.
assertTrue
(
np
.
isfinite
(
value
.
all
()))
self
.
assertFalse
(
np
.
isnan
(
value
.
any
()))
self
.
assertEqual
(
len
(
dy_param_value
),
len
(
static_param_value
))
for
key
,
value
in
six
.
iteritems
(
static_param_value
):
self
.
assertTrue
(
np
.
allclose
(
value
,
dy_param_value
[
key
]))
self
.
assertTrue
(
np
.
allclose
(
value
,
dy_param_value
[
key
]),
"
\n
static_param_value: {}
\n
dy_param_value: {}"
.
format
(
value
,
dy_param_value
[
key
]))
self
.
assertTrue
(
np
.
isfinite
(
value
.
all
()))
self
.
assertFalse
(
np
.
isnan
(
value
.
any
()))
if
__name__
==
'__main__'
:
paddle
.
enable_static
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
浏览文件 @
d0c9f7e8
...
...
@@ -65,15 +65,21 @@ class TestMatMulV2Op(OpTest):
self
.
y_shape
=
(
100
,
)
self
.
trans_x
=
False
self
.
trans_y
=
False
def
init_kernel_type
(
self
):
self
.
dtype
=
"float64"
def
setUp
(
self
):
self
.
init_kernel_type
()
self
.
config
()
self
.
op_type
=
"matmul_v2"
x
=
np
.
random
.
random
(
self
.
x_shape
).
astype
(
self
.
dtype
)
y
=
np
.
random
.
random
(
self
.
y_shape
).
astype
(
self
.
dtype
)
# -0.1 ~ 0.1
x
=
-
0.1
+
0.2
*
x
y
=
-
0.1
+
0.2
*
y
result
=
reference_matmul
(
x
,
y
,
self
.
trans_x
,
self
.
trans_y
)
result
=
result
.
astype
(
self
.
dtype
)
self
.
inputs
=
{
'X'
:
x
,
'Y'
:
y
,
...
...
@@ -98,7 +104,6 @@ class TestMatMuklOp2(TestMatMulV2Op):
self
.
y_shape
=
(
1
,
3
,
2
,
100
)
self
.
trans_x
=
False
self
.
trans_y
=
True
self
.
dtype
=
"float64"
class
TestMatMuklOp3
(
TestMatMulV2Op
):
...
...
@@ -111,7 +116,6 @@ class TestMatMuklOp3(TestMatMulV2Op):
self
.
y_shape
=
(
1
,
1
,
100
,
2
)
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp4
(
TestMatMulV2Op
):
...
...
@@ -124,7 +128,6 @@ class TestMatMuklOp4(TestMatMulV2Op):
self
.
y_shape
=
(
1
,
2
,
100
,
2
)
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp5
(
TestMatMulV2Op
):
...
...
@@ -133,11 +136,10 @@ class TestMatMuklOp5(TestMatMulV2Op):
"""
def
config
(
self
):
self
.
x_shape
=
(
1
,
1
,
100
,
2
)
self
.
x_shape
=
(
1
,
1
,
100
,
1
)
self
.
y_shape
=
(
100
,
)
self
.
trans_x
=
True
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp6
(
TestMatMulV2Op
):
...
...
@@ -150,7 +152,6 @@ class TestMatMuklOp6(TestMatMulV2Op):
self
.
y_shape
=
(
100
,
)
self
.
trans_x
=
True
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp7
(
TestMatMulV2Op
):
...
...
@@ -163,7 +164,6 @@ class TestMatMuklOp7(TestMatMulV2Op):
self
.
y_shape
=
(
100
,
)
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp8
(
TestMatMulV2Op
):
...
...
@@ -176,7 +176,6 @@ class TestMatMuklOp8(TestMatMulV2Op):
self
.
y_shape
=
(
1
,
1
,
100
,
2
)
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp9
(
TestMatMulV2Op
):
...
...
@@ -189,7 +188,6 @@ class TestMatMuklOp9(TestMatMulV2Op):
self
.
y_shape
=
(
2
,
1
,
2
,
100
)
self
.
trans_x
=
False
self
.
trans_y
=
True
self
.
dtype
=
"float64"
class
TestMatMuklOp10
(
TestMatMulV2Op
):
...
...
@@ -198,11 +196,10 @@ class TestMatMuklOp10(TestMatMulV2Op):
"""
def
config
(
self
):
self
.
x_shape
=
(
1
,
1
,
2
,
100
)
self
.
y_shape
=
(
1
,
2
,
100
,
2
)
self
.
x_shape
=
(
1
,
1
,
2
5
,
4
)
self
.
y_shape
=
(
1
,
2
,
4
,
25
)
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp11
(
TestMatMulV2Op
):
...
...
@@ -215,7 +212,6 @@ class TestMatMuklOp11(TestMatMulV2Op):
self
.
y_shape
=
(
1
,
1
,
100
,
2
)
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp12
(
TestMatMulV2Op
):
...
...
@@ -224,11 +220,10 @@ class TestMatMuklOp12(TestMatMulV2Op):
"""
def
config
(
self
):
self
.
x_shape
=
(
2
,
1
,
100
,
2
)
self
.
y_shape
=
(
1
,
1
,
100
,
2
)
self
.
x_shape
=
(
2
,
1
,
4
,
25
)
self
.
y_shape
=
(
1
,
1
,
4
,
25
)
self
.
trans_x
=
True
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp13
(
TestMatMulV2Op
):
...
...
@@ -237,11 +232,10 @@ class TestMatMuklOp13(TestMatMulV2Op):
"""
def
config
(
self
):
self
.
x_shape
=
(
2
,
2
,
100
,
2
)
self
.
y_shape
=
(
2
,
2
,
100
,
2
)
self
.
x_shape
=
(
2
,
2
,
2
,
50
)
self
.
y_shape
=
(
2
,
2
,
2
,
50
)
self
.
trans_x
=
True
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp14
(
TestMatMulV2Op
):
...
...
@@ -254,7 +248,6 @@ class TestMatMuklOp14(TestMatMulV2Op):
self
.
y_shape
=
(
1
,
2
,
2
,
100
,
2
)
self
.
trans_x
=
True
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp15
(
TestMatMulV2Op
):
...
...
@@ -267,7 +260,6 @@ class TestMatMuklOp15(TestMatMulV2Op):
self
.
y_shape
=
(
1
,
2
,
2
,
100
,
1
)
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp16
(
TestMatMulV2Op
):
...
...
@@ -277,10 +269,9 @@ class TestMatMuklOp16(TestMatMulV2Op):
def
config
(
self
):
self
.
x_shape
=
(
100
)
self
.
y_shape
=
(
1
,
2
,
2
,
100
,
1
)
self
.
y_shape
=
(
1
,
2
,
2
,
100
,
2
)
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
class
TestMatMuklOp17
(
TestMatMulV2Op
):
...
...
@@ -293,7 +284,54 @@ class TestMatMuklOp17(TestMatMulV2Op):
self
.
y_shape
=
(
100
)
self
.
trans_x
=
False
self
.
trans_y
=
False
self
.
dtype
=
"float64"
#--------------------test matmul fp16--------------------
def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0):
    """Register a float16 variant of a matmul_v2 test class in module globals.

    Args:
        parent: test class to derive from; the subclass overrides its
            ``init_kernel_type`` so that ``dtype`` becomes ``np.float16``.
        atol: tolerance forwarded to ``check_output_with_place``.
        max_relative_error: tolerance forwarded to ``check_grad_with_place``.

    The generated class is named ``"<parent name>_Fp16"`` and is skipped when
    core is not compiled with CUDA.
    """

    @unittest.skipIf(not core.is_compiled_with_cuda(),
                     "core is not compiled with CUDA")
    class TestMatMulOpFp16Case(parent):
        def init_kernel_type(self):
            self.dtype = np.float16

        def test_check_output(self):
            if not core.is_compiled_with_cuda():
                return
            gpu = core.CUDAPlace(0)
            if not core.is_float16_supported(gpu):
                return
            self.check_output_with_place(gpu, atol=atol)

        def test_check_grad(self):
            gpu = core.CUDAPlace(0)
            if not core.is_float16_supported(gpu):
                return
            self.check_grad_with_place(
                gpu, ['X', 'Y'], 'Out', max_relative_error=max_relative_error)

    # Publish the class under a unique name in the module namespace.
    generated_name = "{0}_{1}".format(parent.__name__, "Fp16")
    TestMatMulOpFp16Case.__name__ = generated_name
    globals()[generated_name] = TestMatMulOpFp16Case
create_test_fp16_class
(
TestMatMulV2Op
)
create_test_fp16_class
(
TestMatMuklOp2
)
create_test_fp16_class
(
TestMatMuklOp3
)
create_test_fp16_class
(
TestMatMuklOp4
)
create_test_fp16_class
(
TestMatMuklOp5
)
create_test_fp16_class
(
TestMatMuklOp6
)
create_test_fp16_class
(
TestMatMuklOp7
)
create_test_fp16_class
(
TestMatMuklOp8
)
create_test_fp16_class
(
TestMatMuklOp9
)
create_test_fp16_class
(
TestMatMuklOp10
)
create_test_fp16_class
(
TestMatMuklOp11
)
create_test_fp16_class
(
TestMatMuklOp12
)
create_test_fp16_class
(
TestMatMuklOp13
)
create_test_fp16_class
(
TestMatMuklOp14
)
create_test_fp16_class
(
TestMatMuklOp15
)
create_test_fp16_class
(
TestMatMuklOp16
)
create_test_fp16_class
(
TestMatMuklOp17
)
class
TestMatMulV2API
(
unittest
.
TestCase
):
...
...
@@ -331,6 +369,17 @@ class TestMatMulV2API(unittest.TestCase):
y
=
paddle
.
to_tensor
(
input_y
)
result
=
paddle
.
matmul
(
x
,
y
)
def test_dygraph_fp16(self):
    """Run ``paddle.matmul`` on float16 inputs in dygraph mode on GPU 0.

    Returns early when CUDA is not compiled in or float16 is unsupported on
    the device. The result is not compared against a reference.
    """
    if not core.is_compiled_with_cuda():
        return
    place = core.CUDAPlace(0)
    if not core.is_float16_supported(place):
        return
    with fluid.dygraph.guard(place):
        lhs = np.random.random([4, 3]).astype("float16")
        rhs = np.random.random([3, 4]).astype("float16")
        result = paddle.matmul(paddle.to_tensor(lhs), paddle.to_tensor(rhs))
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
浏览文件 @
d0c9f7e8
...
...
@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase):
dataset
=
RandomDataset
(
SAMPLE_NUM
,
CLASS_NUM
)
dataloader
=
DataLoader
(
dataset
,
places
=
places
,
num_workers
=
num_workers
,
batch_size
=
BATCH_SIZE
,
drop_last
=
True
)
...
...
python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py
浏览文件 @
d0c9f7e8
...
...
@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase):
dataset
=
RandomDataset
(
SAMPLE_NUM
,
CLASS_NUM
)
dataloader
=
DataLoader
(
dataset
,
places
=
places
,
num_workers
=
num_workers
,
batch_size
=
BATCH_SIZE
,
drop_last
=
True
)
...
...
python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
浏览文件 @
d0c9f7e8
...
...
@@ -130,5 +130,41 @@ class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck):
self
.
shape
=
[
2
,
2
,
3
,
4
,
5
]
class TestBatchNormDoubleGradCheckCase5(TestBatchNormDoubleGradCheck):
    """Double-gradient check of batch_norm with respect to x, scale and bias."""

    @prog_scope()
    def func(self, place):
        """Build a batch_norm program and run ``double_grad_check`` on ``place``."""
        prog = fluid.Program()
        with fluid.program_guard(prog):
            np.random.seed()
            dtype = "float32"
            eps = 0.005
            atol = 2e-4
            # Channel count: axis 1 for NCHW, otherwise the last axis.
            chn = self.shape[1] if self.data_layout == 'NCHW' else self.shape[-1]
            x = layers.create_parameter(dtype=dtype, shape=self.shape, name='x')
            z = fluid.layers.batch_norm(
                input=x,
                data_layout=self.data_layout,
                use_global_stats=self.use_global_stats)
            x_arr = np.random.uniform(-1, 1, self.shape).astype(dtype)
            # Parameters 1 and 2 of the block are presumably the scale and
            # bias created by batch_norm (index 0 being x) -- TODO confirm.
            w, b = prog.global_block().all_parameters()[1:3]
            w_arr = np.ones(chn).astype(dtype)
            b_arr = np.zeros(chn).astype(dtype)
            gradient_checker.double_grad_check(
                [x, w, b],
                z,
                x_init=[x_arr, w_arr, b_arr],
                atol=atol,
                place=place,
                eps=eps)
class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5):
    """Case5 check on a 4-D NCHW input with global statistics enabled."""

    def init_test(self):
        """Set the shape, layout and statistics mode for this case."""
        self.shape = [2, 3, 4, 5]
        self.use_global_stats = True
        self.data_layout = 'NCHW'
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_paddle_save_load.py
浏览文件 @
d0c9f7e8
...
...
@@ -29,19 +29,23 @@ IMAGE_SIZE = 784
CLASS_NUM
=
10
# define a random dataset
class
RandomDataset
(
paddle
.
io
.
Dataset
):
def
__init__
(
self
,
num_samples
):
self
.
num_samples
=
num_samples
def
__getitem__
(
self
,
idx
):
def
random_batch_reader
():
def
_get_random_inputs_and_labels
():
np
.
random
.
seed
(
SEED
)
image
=
np
.
random
.
random
([
IMAGE_SIZE
]).
astype
(
'float32'
)
label
=
np
.
random
.
randint
(
0
,
CLASS_NUM
-
1
,
(
1
,
)).
astype
(
'int64'
)
image
=
np
.
random
.
random
([
BATCH_SIZE
,
IMAGE_SIZE
]).
astype
(
'float32'
)
label
=
np
.
random
.
randint
(
0
,
CLASS_NUM
-
1
,
(
BATCH_SIZE
,
1
,
)).
astype
(
'int64'
)
return
image
,
label
def
__len__
(
self
):
return
self
.
num_samples
def
__reader__
():
for
_
in
range
(
BATCH_NUM
):
batch_image
,
batch_label
=
_get_random_inputs_and_labels
()
batch_image
=
paddle
.
to_tensor
(
batch_image
)
batch_label
=
paddle
.
to_tensor
(
batch_label
)
yield
batch_image
,
batch_label
return
__reader__
class
LinearNet
(
nn
.
Layer
):
...
...
@@ -66,8 +70,7 @@ def train(layer, loader, loss_fn, opt):
class
TestSaveLoad
(
unittest
.
TestCase
):
def
setUp
(
self
):
# enable dygraph mode
self
.
place
=
paddle
.
CPUPlace
()
paddle
.
disable_static
(
self
.
place
)
paddle
.
disable_static
()
# config seed
paddle
.
manual_seed
(
SEED
)
...
...
@@ -81,14 +84,8 @@ class TestSaveLoad(unittest.TestCase):
adam
=
opt
.
Adam
(
learning_rate
=
0.001
,
parameters
=
layer
.
parameters
())
# create data loader
dataset
=
RandomDataset
(
BATCH_NUM
*
BATCH_SIZE
)
loader
=
paddle
.
io
.
DataLoader
(
dataset
,
places
=
self
.
place
,
batch_size
=
BATCH_SIZE
,
shuffle
=
True
,
drop_last
=
True
,
num_workers
=
2
)
# TODO: using new DataLoader cause unknown Timeout on windows, replace it
loader
=
random_batch_reader
()
# train
train
(
layer
,
loader
,
loss_fn
,
adam
)
...
...
@@ -103,8 +100,8 @@ class TestSaveLoad(unittest.TestCase):
layer
,
opt
=
self
.
build_and_train_model
()
# save
layer_save_path
=
"linear.pdparams"
opt_save_path
=
"linear.pdopt"
layer_save_path
=
"
test_paddle_save_load.
linear.pdparams"
opt_save_path
=
"
test_paddle_save_load.
linear.pdopt"
layer_state_dict
=
layer
.
state_dict
()
opt_state_dict
=
opt
.
state_dict
()
...
...
@@ -120,7 +117,7 @@ class TestSaveLoad(unittest.TestCase):
# test save load in static mode
paddle
.
enable_static
()
static_save_path
=
"static_mode_test/linear.pdparams"
static_save_path
=
"static_mode_test/
test_paddle_save_load.
linear.pdparams"
paddle
.
save
(
layer_state_dict
,
static_save_path
)
load_static_state_dict
=
paddle
.
load
(
static_save_path
)
self
.
check_load_state_dict
(
layer_state_dict
,
load_static_state_dict
)
...
...
@@ -133,15 +130,15 @@ class TestSaveLoad(unittest.TestCase):
# 2. test save path format error
with
self
.
assertRaises
(
ValueError
):
paddle
.
save
(
layer_state_dict
,
"linear.model/"
)
paddle
.
save
(
layer_state_dict
,
"
test_paddle_save_load.
linear.model/"
)
# 3. test load path not exist error
with
self
.
assertRaises
(
ValueError
):
paddle
.
load
(
"linear.params"
)
paddle
.
load
(
"
test_paddle_save_load.
linear.params"
)
# 4. test load old save path error
with
self
.
assertRaises
(
ValueError
):
paddle
.
load
(
"linear"
)
paddle
.
load
(
"
test_paddle_save_load.
linear"
)
if
__name__
==
'__main__'
:
...
...
python/paddle/tensor/linalg.py
浏览文件 @
d0c9f7e8
...
...
@@ -156,8 +156,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
def
__check_input
(
x
,
y
):
var_names
=
{
'x'
:
x
,
'y'
:
y
}
for
name
,
val
in
var_names
.
items
():
check_variable_and_dtype
(
val
,
name
,
[
'float32'
,
'float64'
],
'matmul'
)
check_variable_and_dtype
(
val
,
name
,
[
'float16'
,
'float32'
,
'float64'
],
'matmul'
)
__check_input
(
x
,
y
)
...
...
tools/check_file_diff_approvals.sh
浏览文件 @
d0c9f7e8
...
...
@@ -286,7 +286,7 @@ fi
# Get the list of PR authors with unresolved unit test issues
pip
install
PyGithub
# For getting PR related data
wget https://
sys-p0.bj
.bcebos.com/blk/block.txt
--no-check-certificate
wget https://
paddle-ci.gz
.bcebos.com/blk/block.txt
--no-check-certificate
wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt
--no-check-certificate
HASUTFIXED
=
`
python
${
PADDLE_ROOT
}
/tools/check_ut.py |
grep
"has unit-test to be fixed"
||
true
`
if
[
"
${
HASUTFIXED
}
"
!=
""
]
;
then
...
...
tools/dockerfile/Dockerfile.centos
浏览文件 @
d0c9f7e8
...
...
@@ -80,9 +80,7 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
make -j8 && make install && \
ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
# gcc4.8 TRT
RUN mkdir -p /opt/compiler && cd /opt/compiler && \
wget -q https://paddle-ci.gz.bcebos.com/gcc-4.8.2.tar.gz && \
tar xf gcc-4.8.2.tar.gz && rm -f gcc-4.8.2.tar.gz
# Downgrade gcc&&g++
<install_gcc>
CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
tools/dockerfile/ci_dockerfile.sh
浏览文件 @
d0c9f7e8
...
...
@@ -21,7 +21,7 @@ function make_ubuntu_dockerfile(){
function
make_centos_dockerfile
(){
dockerfile_name
=
"Dockerfile.cuda9_cudnn7_gcc48_py35_centos6"
sed
's/<baseimg>/
9.0
-cudnn7-devel-centos6/g'
Dockerfile.centos
>
${
dockerfile_name
}
sed
's/<baseimg>/
10.2
-cudnn7-devel-centos6/g'
Dockerfile.centos
>
${
dockerfile_name
}
sed
-i
's#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g'
${
dockerfile_name
}
dockerfile_line
=
`
wc
-l
${
dockerfile_name
}
|awk
'{print $1}'
`
sed
-i
"
${
dockerfile_line
}
i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so &&
\
...
...
@@ -29,6 +29,15 @@ function make_centos_dockerfile(){
rm -rf /usr/include/NvInfer*"
${
dockerfile_name
}
sed
-i
"
${
dockerfile_line
}
i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz &&
\
tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/"
${
dockerfile_name
}
sed
-i
's#<install_gcc>#WORKDIR /usr/bin \
COPY tools/dockerfile/build_scripts /build_scripts \
RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \
RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \
RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \
RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \
ENV PATH=/usr/local/gcc-8.2/bin:$PATH #g'
${
dockerfile_name
}
}
...
...
tools/sampcd_processor.py
浏览文件 @
d0c9f7e8
...
...
@@ -534,13 +534,6 @@ def get_incrementapi():
f
.
write
(
'
\n
'
)
# only white on CPU
gpu_not_white
=
[
"deformable_conv"
,
"cuda_places"
,
"CUDAPinnedPlace"
,
"CUDAPlace"
,
"cuda_profiler"
,
'DGCMomentumOptimizer'
]
def
get_wlist
():
'''
this function will get the white list of API.
...
...
@@ -552,17 +545,25 @@ def get_wlist():
'''
wlist
=
[]
wlist_file
=
[]
# only white on CPU
gpu_not_white
=
[]
with
open
(
"wlist.json"
,
'r'
)
as
load_f
:
load_dict
=
json
.
load
(
load_f
)
for
key
in
load_dict
:
if
key
==
'wlist_file'
:
wlist_file
=
wlist_file
+
load_dict
[
key
]
if
key
==
'wlist_dir'
:
for
item
in
load_dict
[
key
]:
wlist_file
.
append
(
item
[
"name"
])
elif
key
==
"gpu_not_white"
:
gpu_not_white
=
load_dict
[
key
]
elif
key
==
"wlist_api"
:
for
item
in
load_dict
[
key
]:
wlist
.
append
(
item
[
"name"
])
else
:
wlist
=
wlist
+
load_dict
[
key
]
return
wlist
,
wlist_file
return
wlist
,
wlist_file
,
gpu_not_white
wlist
,
wlist_file
=
get_wlist
()
wlist
,
wlist_file
,
gpu_not_white
=
get_wlist
()
if
len
(
sys
.
argv
)
<
2
:
print
(
"Error: inadequate number of arguments"
)
...
...
tools/wlist.json
浏览文件 @
d0c9f7e8
{
"wlist_file"
:
[
"../python/paddle/fluid/contrib"
,
"../python/paddle/verison.py"
,
"../python/paddle/fluid/core_avx.py"
,
"../python/paddle/distributed"
"wlist_dir"
:[
{
"name"
:
"../python/paddle/fluid/contrib"
,
"annotation"
:
""
},
{
"name"
:
"../python/paddle/verison.py"
,
"annotation"
:
""
},
{
"name"
:
"../python/paddle/fluid/core_avx.py"
,
"annotation"
:
""
},
{
"name"
:
"../python/paddle/distributed"
,
"annotation"
:
""
}
],
"wlist_inneed"
:[
"wlist_api"
:[
{
"name"
:
"xxxxx"
,
"annotation"
:
"not a real api, just for example"
}
],
"wlist_temp_api"
:[
"append_LARS"
,
"BuildStrategy.debug_graphviz_path"
,
"BuildStrategy.enable_sequential_execution"
,
...
...
@@ -63,9 +81,7 @@
"cuda_places"
,
"CUDAPinnedPlace"
,
"CUDAPlace"
,
"Program.parse_from_string"
],
"wlist_nosample"
:[
"Program.parse_from_string"
,
"Compressor"
,
"Compressor.config"
,
"Compressor.run"
,
...
...
@@ -159,13 +175,9 @@
"RNN"
,
"BiRNN"
,
"RNNCellBase"
,
"RNNCellBase.get_initial_states"
],
"wlist_no_op_pass"
:[
"RNNCellBase.get_initial_states"
,
"gelu"
,
"erf"
],
"wlist_ci_nopass"
:[
"erf"
,
"DecodeHelper"
,
"DecodeHelper.initialize"
,
"DecodeHelper.sample"
,
...
...
@@ -188,9 +200,7 @@
"SampleEmbeddingHelper"
,
"BasicDecoder"
,
"lstm"
,
"partial_sum"
],
"wlist_nopass"
:[
"partial_sum"
,
"StateCell"
,
"StateCell.compute_state"
,
"TrainingDecoder"
,
...
...
@@ -242,9 +252,7 @@
"GroupNorm"
,
"SpectralNorm"
,
"TreeConv"
,
"prroi_pool"
],
"wlist_temp"
:[
"prroi_pool"
,
"to_tensor"
,
"ChunkEvaluator"
,
"EditDistance"
,
...
...
@@ -322,9 +330,7 @@
"Conv2DTranspose"
,
"QueueDataset.local_shuffle"
,
"save_persistables@dygraph/checkpoint.py"
,
"load_persistables@dygraph/checkpoint.py"
],
"wlist_ignore"
:[
"load_persistables@dygraph/checkpoint.py"
,
"elementwise_pow"
,
"WeightedAverage.reset"
,
"ChunkEvaluator.eval"
,
...
...
@@ -401,5 +407,13 @@
"LinearChainCRF.forward"
,
"CRFDecoding.forward"
,
"SequenceTagging.forward"
],
"gpu_not_white"
:[
"deformable_conv"
,
"cuda_places"
,
"CUDAPinnedPlace"
,
"CUDAPlace"
,
"cuda_profiler"
,
"DGCMomentumOptimizer"
]
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录