Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
1731b32d
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
1731b32d
编写于
8月 05, 2020
作者:
Z
zlsh80826
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into trt_stack_op, test=develop
上级
b750d6ed
4adac0e3
变更
50
隐藏空白更改
内联
并排
Showing
50 changed file
with
2308 addition
and
359 deletion
+2308
-359
cmake/cudnn.cmake
cmake/cudnn.cmake
+11
-7
paddle/fluid/imperative/partial_grad_engine.cc
paddle/fluid/imperative/partial_grad_engine.cc
+8
-6
paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
...fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+3
-19
paddle/fluid/inference/tensorrt/engine.h
paddle/fluid/inference/tensorrt/engine.h
+17
-3
paddle/fluid/inference/tensorrt/helper.h
paddle/fluid/inference/tensorrt/helper.h
+3
-0
paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
...inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+15
-39
paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
.../inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
+122
-15
paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu
paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu
+0
-3
paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h
paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h
+59
-19
paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
.../fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
+0
-4
paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h
...e/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h
+80
-11
paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
...uid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
+2
-6
paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
...luid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
+86
-24
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+15
-2
paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
+6
-0
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+19
-0
paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
...nce/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
+146
-0
paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
...fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
+1
-1
paddle/fluid/operators/clip_op.h
paddle/fluid/operators/clip_op.h
+26
-9
paddle/fluid/operators/controlflow/compare_op.cc
paddle/fluid/operators/controlflow/compare_op.cc
+19
-8
paddle/fluid/operators/conv_cudnn_helper.h
paddle/fluid/operators/conv_cudnn_helper.h
+3
-38
paddle/fluid/operators/elementwise/elementwise_op_function.h
paddle/fluid/operators/elementwise/elementwise_op_function.h
+44
-0
paddle/fluid/operators/fused/conv_fusion_op.cu
paddle/fluid/operators/fused/conv_fusion_op.cu
+9
-3
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+11
-4
paddle/fluid/operators/math/bert_encoder_functor.cu
paddle/fluid/operators/math/bert_encoder_functor.cu
+1
-0
paddle/fluid/platform/dynload/cudnn.h
paddle/fluid/platform/dynload/cudnn.h
+15
-4
paddle/fluid/platform/dynload/tensorrt.cc
paddle/fluid/platform/dynload/tensorrt.cc
+45
-1
paddle/fluid/platform/dynload/tensorrt.h
paddle/fluid/platform/dynload/tensorrt.h
+5
-3
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+1
-0
python/paddle/fleet/base/fleet_base.py
python/paddle/fleet/base/fleet_base.py
+15
-4
python/paddle/fleet/base/runtime_factory.py
python/paddle/fleet/base/runtime_factory.py
+3
-5
python/paddle/fleet/base/util_factory.py
python/paddle/fleet/base/util_factory.py
+3
-4
python/paddle/fleet/cloud_utils.py
python/paddle/fleet/cloud_utils.py
+85
-0
python/paddle/fleet/launch.py
python/paddle/fleet/launch.py
+319
-0
python/paddle/fleet/launch_utils.py
python/paddle/fleet/launch_utils.py
+421
-0
python/paddle/fleet/runtime/runtime_base.py
python/paddle/fleet/runtime/runtime_base.py
+2
-5
python/paddle/fluid/dygraph/layers.py
python/paddle/fluid/dygraph/layers.py
+39
-0
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+2
-0
python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
...ddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
+87
-6
python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
...addle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
+19
-1
python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
...addle/fluid/tests/unittests/dygraph_to_static/test_lac.py
+15
-0
python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
...luid/tests/unittests/dygraph_to_static/test_mobile_net.py
+79
-6
python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
...le/fluid/tests/unittests/dygraph_to_static/test_resnet.py
+142
-81
python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
...fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
+66
-0
python/paddle/fluid/tests/unittests/test_compare_op.py
python/paddle/fluid/tests/unittests/test_compare_op.py
+24
-8
python/paddle/fluid/tests/unittests/test_fleet_launch.sh
python/paddle/fluid/tests/unittests/test_fleet_launch.sh
+101
-0
python/paddle/fluid/tests/unittests/test_fleet_util.py
python/paddle/fluid/tests/unittests/test_fleet_util.py
+4
-2
python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
...ddle/fluid/tests/unittests/test_imperative_double_grad.py
+15
-8
python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py
...ddle/fluid/tests/unittests/test_imperative_layer_apply.py
+90
-0
python/setup.py.in
python/setup.py.in
+5
-0
未找到文件。
cmake/cudnn.cmake
浏览文件 @
1731b32d
...
...
@@ -60,9 +60,8 @@ else()
set
(
CUDNN_FOUND OFF
)
endif
()
if
(
CUDNN_FOUND
)
file
(
READ
${
CUDNN_INCLUDE_DIR
}
/cudnn.h CUDNN_VERSION_FILE_CONTENTS
)
macro
(
find_cudnn_version cudnn_header_file
)
file
(
READ
${
cudnn_header_file
}
CUDNN_VERSION_FILE_CONTENTS
)
get_filename_component
(
CUDNN_LIB_PATH
${
CUDNN_LIBRARY
}
DIRECTORY
)
string
(
REGEX MATCH
"define CUDNN_VERSION +([0-9]+)"
...
...
@@ -93,10 +92,15 @@ if(CUDNN_FOUND)
math
(
EXPR CUDNN_VERSION
"
${
CUDNN_MAJOR_VERSION
}
* 1000 +
${
CUDNN_MINOR_VERSION
}
* 100 +
${
CUDNN_PATCHLEVEL_VERSION
}
"
)
message
(
STATUS
"Current cuDNN header is
${
cudnn_header_file
}
"
"Current cuDNN version is v
${
CUDNN_MAJOR_VERSION
}
.
${
CUDNN_MINOR_VERSION
}
. "
)
endif
()
message
(
STATUS
"Current cuDNN header is
${
CUDNN_INCLUDE_DIR
}
/cudnn.h. "
"Current cuDNN version is v
${
CUDNN_MAJOR_VERSION
}
.
${
CUDNN_MINOR_VERSION
}
. "
)
endif
()
endmacro
()
if
(
CUDNN_FOUND
)
find_cudnn_version
(
${
CUDNN_INCLUDE_DIR
}
/cudnn.h
)
if
(
NOT CUDNN_MAJOR_VERSION
)
find_cudnn_version
(
${
CUDNN_INCLUDE_DIR
}
/cudnn_version.h
)
endif
()
endif
()
paddle/fluid/imperative/partial_grad_engine.cc
浏览文件 @
1731b32d
...
...
@@ -885,12 +885,14 @@ void PartialGradTask::RunEachOp(OpBase *op) {
if
(
create_graph_
)
{
auto
double_grad_node
=
CreateGradOpNode
(
op
->
InnerOp
(),
tmp_ins
,
tmp_outs
,
op
->
Attrs
(),
op
->
place
());
if
(
double_grad_node
)
{
VLOG
(
10
)
<<
"Create "
<<
double_grad_node
->
size
()
<<
" double grad op(s) for "
<<
op
->
Type
()
<<
", pending ops: "
<<
GradPendingOpTypes
(
*
double_grad_node
);
double_grad_nodes_
.
emplace_back
(
std
::
move
(
double_grad_node
));
}
PADDLE_ENFORCE_NOT_NULL
(
double_grad_node
,
platform
::
errors
::
NotFound
(
"The Op %s doesn't have any grad op."
,
op
->
Type
()));
VLOG
(
10
)
<<
"Create "
<<
double_grad_node
->
size
()
<<
" double grad op(s) for "
<<
op
->
Type
()
<<
", pending ops: "
<<
GradPendingOpTypes
(
*
double_grad_node
);
double_grad_nodes_
.
emplace_back
(
std
::
move
(
double_grad_node
));
}
VLOG
(
10
)
<<
"There are "
<<
grads_to_accumulate_
.
size
()
<<
" to sum gradient"
;
...
...
paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
浏览文件 @
1731b32d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -83,23 +80,10 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
nvinfer1
::
ILayer
*
layer
=
nullptr
;
if
(
engine_
->
with_dynamic_shape
())
{
auto
use_fp16
=
engine_
->
WithFp16
();
plugin
::
DynamicPluginTensorRT
*
plugin
=
nullptr
;
if
(
use_fp16
)
{
#ifdef SUPPORTS_CUDA_FP16
plugin
=
new
plugin
::
EmbEltwiseLayernormPluginDynamic
<
half
>
(
input_embs
,
bias
,
scale
,
emb_sizes
,
bias_size
,
scale_size
,
hidden
,
eps
);
#else
plugin
=
new
plugin
::
EmbEltwiseLayernormPluginDynamic
<
float
>
(
input_embs
,
bias
,
scale
,
emb_sizes
,
bias_size
,
scale_size
,
hidden
,
eps
);
#endif
}
else
{
plugin
=
new
plugin
::
EmbEltwiseLayernormPluginDynamic
<
float
>
(
input_embs
,
bias
,
scale
,
emb_sizes
,
bias_size
,
scale_size
,
hidden
,
eps
);
}
plugin
=
new
plugin
::
EmbEltwiseLayernormPluginDynamic
<
float
>
(
input_embs
,
bias
,
scale
,
emb_sizes
,
bias_size
,
scale_size
,
hidden
,
eps
);
layer
=
engine_
->
AddPluginV2
(
input_ids
.
data
(),
input_num
,
plugin
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
Fatal
(
...
...
paddle/fluid/inference/tensorrt/engine.h
浏览文件 @
1731b32d
...
...
@@ -200,9 +200,23 @@ class TensorRTEngine {
void
Deserialize
(
const
std
::
string
&
engine_serialized_data
)
{
freshDeviceId
();
infer_ptr
<
nvinfer1
::
IRuntime
>
runtime
(
createInferRuntime
(
&
logger_
));
infer_engine_
.
reset
(
runtime
->
deserializeCudaEngine
(
engine_serialized_data
.
c_str
(),
engine_serialized_data
.
size
(),
&
inference
::
Singleton
<
plugin
::
PluginFactoryTensorRT
>::
Global
()));
if
(
with_dynamic_shape_
)
{
#if IS_TRT_VERSION_GE(6000)
infer_engine_
.
reset
(
runtime
->
deserializeCudaEngine
(
engine_serialized_data
.
c_str
(),
engine_serialized_data
.
size
(),
nullptr
));
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"To enable dynamic shape support, the TensorRT version should be "
"greater than 6.0.0"
));
#endif
}
else
{
infer_engine_
.
reset
(
runtime
->
deserializeCudaEngine
(
engine_serialized_data
.
c_str
(),
engine_serialized_data
.
size
(),
&
inference
::
Singleton
<
plugin
::
PluginFactoryTensorRT
>::
Global
()));
}
PADDLE_ENFORCE
(
infer_engine_
!=
nullptr
,
"build cuda engine failed when deserialize engine info.!"
);
}
...
...
paddle/fluid/inference/tensorrt/helper.h
浏览文件 @
1731b32d
...
...
@@ -56,6 +56,9 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
return
static_cast
<
nvinfer1
::
IRuntime
*>
(
dy
::
createInferRuntime_INTERNAL
(
logger
,
NV_TENSORRT_VERSION
));
}
static
nvinfer1
::
IPluginRegistry
*
getPluginRegistry
()
{
return
static_cast
<
nvinfer1
::
IPluginRegistry
*>
(
dy
::
getPluginRegistry
());
}
// A logger for create TensorRT infer builder.
class
NaiveLogger
:
public
nvinfer1
::
ILogger
{
...
...
paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
浏览文件 @
1731b32d
...
...
@@ -33,53 +33,29 @@ namespace plugin {
template
<
typename
T
>
int
EmbEltwiseLayernormPluginDynamic
<
T
>::
initialize
()
{
int
nb_emb
=
embs_
.
size
();
std
::
vector
<
void
*>
ptr_vector
(
nb_emb
);
std
::
vector
<
std
::
vector
<
half
>>
emb_fp16
(
nb_emb
);
if
(
sizeof
(
T
)
==
sizeof
(
float
))
{
// FP32
for
(
int
i
=
0
;
i
<
nb_emb
;
++
i
)
{
ptr_vector
[
i
]
=
embs_
[
i
];
}
}
else
{
// FP16
for
(
int
i
=
0
;
i
<
nb_emb
;
++
i
)
{
auto
emb_size
=
emb_sizes_
[
i
];
auto
&
tmp
=
emb_fp16
[
i
];
tmp
.
resize
(
emb_size
);
for
(
int
j
=
0
;
j
<
emb_size
;
++
j
)
{
tmp
[
j
]
=
static_cast
<
half
>
(
embs_
[
i
][
j
]);
}
ptr_vector
[
i
]
=
tmp
.
data
();
}
}
embs_gpu_
.
resize
(
embs_
.
size
());
for
(
int
i
=
0
;
i
<
embs_
.
size
();
i
++
)
{
cudaMalloc
(
&
embs_gpu_
[
i
],
sizeof
(
T
)
*
emb_sizes_
[
i
]);
cudaMemcpy
(
embs_gpu_
[
i
],
ptr_vector
[
i
],
emb_sizes_
[
i
]
*
sizeof
(
T
),
cudaMemcpyHostToDevice
);
if
(
embs_
[
i
])
{
cudaMalloc
(
&
embs_gpu_
[
i
],
sizeof
(
float
)
*
emb_sizes_
[
i
]);
cudaMemcpy
(
embs_gpu_
[
i
],
embs_
[
i
],
emb_sizes_
[
i
]
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
}
}
cudaMalloc
(
&
bias_gpu_
,
sizeof
(
float
)
*
bias_size_
);
cudaMemcpy
(
bias_gpu_
,
bias_
,
bias_size_
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
cudaMalloc
(
&
scale_gpu_
,
sizeof
(
float
)
*
scale_size_
);
cudaMemcpy
(
scale_gpu_
,
scale_
,
scale_size_
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
return
0
;
}
if
(
bias_
)
{
cudaMalloc
(
&
bias_gpu_
,
sizeof
(
float
)
*
bias_size_
);
cudaMemcpy
(
bias_gpu_
,
bias_
,
bias_size_
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
}
if
(
scale_
)
{
cudaMalloc
(
&
scale_gpu_
,
sizeof
(
float
)
*
scale_size_
);
cudaMemcpy
(
scale_gpu_
,
scale_
,
scale_size_
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
}
template
<
typename
T
>
size_t
EmbEltwiseLayernormPluginDynamic
<
T
>::
getSerializationSize
()
const
{
return
0
;
}
template
<
typename
T
>
void
EmbEltwiseLayernormPluginDynamic
<
T
>::
serialize
(
void
*
buffer
)
const
{}
template
<
typename
T
>
nvinfer1
::
DimsExprs
EmbEltwiseLayernormPluginDynamic
<
T
>::
getOutputDimensions
(
int
output_index
,
const
nvinfer1
::
DimsExprs
*
inputs
,
int
nb_inputs
,
...
...
paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
浏览文件 @
1731b32d
...
...
@@ -44,8 +44,42 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
hidden_size_
(
hidden_size
),
eps_
(
eps
)
{}
EmbEltwiseLayernormPluginDynamic
(
void
const
*
serialData
,
size_t
serialLength
)
{}
EmbEltwiseLayernormPluginDynamic
(
void
const
*
serial_data
,
size_t
serial_length
)
{
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
emb_sizes_
);
embs_gpu_
.
resize
(
emb_sizes_
.
size
());
embs_
.
resize
(
emb_sizes_
.
size
());
for
(
size_t
i
=
0
;
i
<
emb_sizes_
.
size
();
i
++
)
{
cudaMalloc
(
&
embs_gpu_
[
i
],
sizeof
(
float
)
*
emb_sizes_
[
i
]);
cudaMemcpy
(
embs_gpu_
[
i
],
serial_data
,
emb_sizes_
[
i
]
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
reinterpret_cast
<
char
const
*&>
(
serial_data
)
+=
emb_sizes_
[
i
]
*
sizeof
(
float
);
serial_length
-=
emb_sizes_
[
i
]
*
sizeof
(
float
);
embs_
[
i
]
=
nullptr
;
}
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
bias_size_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
scale_size_
);
cudaMalloc
(
&
bias_gpu_
,
sizeof
(
float
)
*
bias_size_
);
cudaMemcpy
(
bias_gpu_
,
serial_data
,
bias_size_
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
bias_
=
nullptr
;
reinterpret_cast
<
char
const
*&>
(
serial_data
)
+=
bias_size_
*
sizeof
(
float
);
serial_length
-=
bias_size_
*
sizeof
(
float
);
cudaMalloc
(
&
scale_gpu_
,
sizeof
(
float
)
*
scale_size_
);
cudaMemcpy
(
scale_gpu_
,
serial_data
,
scale_size_
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
scale_
=
nullptr
;
reinterpret_cast
<
char
const
*&>
(
serial_data
)
+=
scale_size_
*
sizeof
(
float
);
serial_length
-=
scale_size_
*
sizeof
(
float
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
hidden_size_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
eps_
);
}
nvinfer1
::
IPluginV2DynamicExt
*
clone
()
const
override
{
return
new
EmbEltwiseLayernormPluginDynamic
(
embs_
,
bias_
,
scale_
,
emb_sizes_
,
bias_size_
,
scale_size_
,
hidden_size_
,
...
...
@@ -58,36 +92,66 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
int
getNbOutputs
()
const
override
{
return
1
;
}
int
initialize
()
override
;
size_t
getSerializationSize
()
const
override
;
void
serialize
(
void
*
buffer
)
const
override
;
size_t
getSerializationSize
()
const
override
{
int
sum_num
=
0
;
sum_num
+=
SerializedSize
(
emb_sizes_
);
for
(
size_t
i
=
0
;
i
<
emb_sizes_
.
size
();
i
++
)
{
sum_num
+=
emb_sizes_
[
i
]
*
sizeof
(
float
);
}
sum_num
+=
SerializedSize
(
bias_size_
);
sum_num
+=
SerializedSize
(
scale_size_
);
sum_num
+=
(
bias_size_
+
scale_size_
)
*
sizeof
(
float
);
sum_num
+=
SerializedSize
(
hidden_size_
);
sum_num
+=
SerializedSize
(
eps_
);
// sum_num += SerializedSize(with_fp16_);
return
sum_num
;
}
void
serialize
(
void
*
buffer
)
const
override
{
// SerializeValue(&buffer, with_fp16_);
SerializeValue
(
&
buffer
,
emb_sizes_
);
for
(
size_t
i
=
0
;
i
<
emb_sizes_
.
size
();
i
++
)
{
SerializeCudaPointer
(
&
buffer
,
embs_gpu_
[
i
],
emb_sizes_
[
i
]);
}
SerializeValue
(
&
buffer
,
bias_size_
);
SerializeValue
(
&
buffer
,
scale_size_
);
SerializeCudaPointer
(
&
buffer
,
bias_gpu_
,
bias_size_
);
SerializeCudaPointer
(
&
buffer
,
scale_gpu_
,
scale_size_
);
SerializeValue
(
&
buffer
,
hidden_size_
);
SerializeValue
(
&
buffer
,
eps_
);
}
nvinfer1
::
DimsExprs
getOutputDimensions
(
int
output_index
,
const
nvinfer1
::
DimsExprs
*
inputs
,
int
nb_inputs
,
nvinfer1
::
IExprBuilder
&
expr_builder
)
override
;
bool
supportsFormatCombination
(
int
pos
,
const
nvinfer1
::
PluginTensorDesc
*
in
O
ut
,
int
nb
Inputs
,
int
nbO
utputs
)
override
;
const
nvinfer1
::
PluginTensorDesc
*
in
_o
ut
,
int
nb
_inputs
,
int
nb_o
utputs
)
override
;
void
configurePlugin
(
const
nvinfer1
::
DynamicPluginTensorDesc
*
in
,
int
nb
I
nputs
,
int
nb
_i
nputs
,
const
nvinfer1
::
DynamicPluginTensorDesc
*
out
,
int
nb
O
utputs
)
override
{}
int
nb
_o
utputs
)
override
{}
size_t
getWorkspaceSize
(
const
nvinfer1
::
PluginTensorDesc
*
inputs
,
int
nb
I
nputs
,
int
nb
_i
nputs
,
const
nvinfer1
::
PluginTensorDesc
*
outputs
,
int
nb
O
utputs
)
const
override
{
int
nb
_o
utputs
)
const
override
{
return
0
;
}
int
enqueue
(
const
nvinfer1
::
PluginTensorDesc
*
input
D
esc
,
const
nvinfer1
::
PluginTensorDesc
*
output
D
esc
,
int
enqueue
(
const
nvinfer1
::
PluginTensorDesc
*
input
_d
esc
,
const
nvinfer1
::
PluginTensorDesc
*
output
_d
esc
,
const
void
*
const
*
inputs
,
void
*
const
*
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
override
;
nvinfer1
::
DataType
getOutputDataType
(
int
index
,
const
nvinfer1
::
DataType
*
input
T
ypes
,
int
nb
I
nputs
)
const
override
;
const
nvinfer1
::
DataType
*
input
_t
ypes
,
int
nb
_i
nputs
)
const
override
;
void
destroy
()
override
{
delete
this
;
}
...
...
@@ -99,7 +163,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
// data on devices
float
*
bias_gpu_
;
float
*
scale_gpu_
;
std
::
vector
<
T
*>
embs_gpu_
;
std
::
vector
<
float
*>
embs_gpu_
;
std
::
vector
<
int
>
emb_sizes_
;
int
bias_size_
;
...
...
@@ -107,6 +171,49 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
int
hidden_size_
;
float
eps_
;
};
class
EmbEltwiseLayernormPluginV2Creator
:
public
nvinfer1
::
IPluginCreator
{
public:
EmbEltwiseLayernormPluginV2Creator
()
{}
const
char
*
getPluginName
()
const
override
{
return
"fused_embedding_eltwise_layernorm_plugin"
;
}
const
char
*
getPluginVersion
()
const
override
{
return
"1"
;
}
const
nvinfer1
::
PluginFieldCollection
*
getFieldNames
()
override
{
return
&
field_collection_
;
}
nvinfer1
::
IPluginV2
*
createPlugin
(
const
char
*
name
,
const
nvinfer1
::
PluginFieldCollection
*
fc
)
override
{
return
nullptr
;
}
nvinfer1
::
IPluginV2
*
deserializePlugin
(
const
char
*
name
,
const
void
*
serial_data
,
size_t
serial_length
)
override
{
return
new
EmbEltwiseLayernormPluginDynamic
<
float
>
(
serial_data
,
serial_length
);
}
void
setPluginNamespace
(
const
char
*
lib_namespace
)
override
{
plugin_namespace_
=
lib_namespace
;
}
const
char
*
getPluginNamespace
()
const
override
{
return
plugin_namespace_
.
c_str
();
}
private:
std
::
string
plugin_namespace_
;
std
::
string
plugin_name_
;
nvinfer1
::
PluginFieldCollection
field_collection_
;
std
::
vector
<
nvinfer1
::
PluginField
>
plugin_attributes_
;
};
REGISTER_TRT_PLUGIN_V2
(
EmbEltwiseLayernormPluginV2Creator
);
#endif
}
// namespace plugin
}
// namespace tensorrt
...
...
paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu
浏览文件 @
1731b32d
...
...
@@ -132,9 +132,6 @@ int GeluPlugin::enqueue(int batch_size, const void* const* inputs,
// Dynamic Plugin below.
#if IS_TRT_VERSION_GE(6000)
size_t
GeluPluginDynamic
::
getSerializationSize
()
const
{
return
0
;
}
void
GeluPluginDynamic
::
serialize
(
void
*
buffer
)
const
{}
nvinfer1
::
DimsExprs
GeluPluginDynamic
::
getOutputDimensions
(
int
output_index
,
const
nvinfer1
::
DimsExprs
*
inputs
,
int
nb_inputs
,
...
...
paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h
浏览文件 @
1731b32d
...
...
@@ -30,8 +30,8 @@ class GeluPlugin : public PluginTensorRT {
// It was used for tensorrt deserialization.
// It should not be called by users.
GeluPlugin
(
void
const
*
serial
Data
,
size_t
serialL
ength
)
{
deserializeBase
(
serial
Data
,
serialL
ength
);
GeluPlugin
(
void
const
*
serial
_data
,
size_t
serial_l
ength
)
{
deserializeBase
(
serial
_data
,
serial_l
ength
);
}
~
GeluPlugin
()
{}
...
...
@@ -43,8 +43,8 @@ class GeluPlugin : public PluginTensorRT {
bool
supportsFormat
(
nvinfer1
::
DataType
type
,
nvinfer1
::
PluginFormat
format
)
const
override
;
nvinfer1
::
Dims
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
inputs
,
int
nb
InputD
ims
)
override
;
int
enqueue
(
int
batch
S
ize
,
const
void
*
const
*
inputs
,
void
**
outputs
,
int
nb
_input_d
ims
)
override
;
int
enqueue
(
int
batch
_s
ize
,
const
void
*
const
*
inputs
,
void
**
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
override
;
protected:
...
...
@@ -64,7 +64,7 @@ class GeluPlugin : public PluginTensorRT {
class
GeluPluginDynamic
:
public
DynamicPluginTensorRT
{
public:
GeluPluginDynamic
()
{}
GeluPluginDynamic
(
void
const
*
serial
Data
,
size_t
serialL
ength
)
{}
GeluPluginDynamic
(
void
const
*
serial
_data
,
size_t
serial_l
ength
)
{}
~
GeluPluginDynamic
()
{}
nvinfer1
::
IPluginV2DynamicExt
*
clone
()
const
override
{
...
...
@@ -75,39 +75,79 @@ class GeluPluginDynamic : public DynamicPluginTensorRT {
int
getNbOutputs
()
const
override
{
return
1
;
}
int
initialize
()
override
{
return
0
;
}
size_t
getSerializationSize
()
const
override
;
void
serialize
(
void
*
buffer
)
const
override
;
size_t
getSerializationSize
()
const
override
{
return
0
;
}
void
serialize
(
void
*
buffer
)
const
override
{}
nvinfer1
::
DimsExprs
getOutputDimensions
(
int
output
Index
,
const
nvinfer1
::
DimsExprs
*
inputs
,
int
nbI
nputs
,
nvinfer1
::
IExprBuilder
&
expr
B
uilder
)
override
;
int
output
_index
,
const
nvinfer1
::
DimsExprs
*
inputs
,
int
nb_i
nputs
,
nvinfer1
::
IExprBuilder
&
expr
_b
uilder
)
override
;
bool
supportsFormatCombination
(
int
pos
,
const
nvinfer1
::
PluginTensorDesc
*
in
O
ut
,
int
nb
Inputs
,
int
nbO
utputs
)
override
;
const
nvinfer1
::
PluginTensorDesc
*
in
_o
ut
,
int
nb
_inputs
,
int
nb_o
utputs
)
override
;
void
configurePlugin
(
const
nvinfer1
::
DynamicPluginTensorDesc
*
in
,
int
nb
I
nputs
,
int
nb
_i
nputs
,
const
nvinfer1
::
DynamicPluginTensorDesc
*
out
,
int
nb
O
utputs
)
override
{}
int
nb
_o
utputs
)
override
{}
size_t
getWorkspaceSize
(
const
nvinfer1
::
PluginTensorDesc
*
inputs
,
int
nb
I
nputs
,
int
nb
_i
nputs
,
const
nvinfer1
::
PluginTensorDesc
*
outputs
,
int
nb
O
utputs
)
const
override
{
int
nb
_o
utputs
)
const
override
{
return
0
;
}
int
enqueue
(
const
nvinfer1
::
PluginTensorDesc
*
input
D
esc
,
const
nvinfer1
::
PluginTensorDesc
*
output
D
esc
,
int
enqueue
(
const
nvinfer1
::
PluginTensorDesc
*
input
_d
esc
,
const
nvinfer1
::
PluginTensorDesc
*
output
_d
esc
,
const
void
*
const
*
inputs
,
void
*
const
*
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
override
;
nvinfer1
::
DataType
getOutputDataType
(
int
index
,
const
nvinfer1
::
DataType
*
input
T
ypes
,
int
nb
I
nputs
)
const
override
;
const
nvinfer1
::
DataType
*
input
_t
ypes
,
int
nb
_i
nputs
)
const
override
;
void
destroy
()
override
{
delete
this
;
}
};
class
GeluPluginV2Creator
:
public
nvinfer1
::
IPluginCreator
{
public:
GeluPluginV2Creator
()
{}
const
char
*
getPluginName
()
const
override
{
return
"gelu_plugin"
;
}
const
char
*
getPluginVersion
()
const
override
{
return
"1"
;
}
const
nvinfer1
::
PluginFieldCollection
*
getFieldNames
()
override
{
return
&
field_collection_
;
}
nvinfer1
::
IPluginV2
*
createPlugin
(
const
char
*
name
,
const
nvinfer1
::
PluginFieldCollection
*
fc
)
override
{
return
nullptr
;
}
nvinfer1
::
IPluginV2
*
deserializePlugin
(
const
char
*
name
,
const
void
*
serial_data
,
size_t
serial_length
)
override
{
auto
plugin
=
new
GeluPluginDynamic
(
serial_data
,
serial_length
);
return
plugin
;
}
void
setPluginNamespace
(
const
char
*
lib_namespace
)
override
{
plugin_namespace_
=
lib_namespace
;
}
const
char
*
getPluginNamespace
()
const
override
{
return
plugin_namespace_
.
c_str
();
}
private:
std
::
string
plugin_namespace_
;
std
::
string
plugin_name_
;
nvinfer1
::
PluginFieldCollection
field_collection_
{
0
,
nullptr
};
std
::
vector
<
nvinfer1
::
PluginField
>
plugin_attributes_
;
};
REGISTER_TRT_PLUGIN_V2
(
GeluPluginV2Creator
);
#endif
}
// namespace plugin
...
...
paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
浏览文件 @
1731b32d
...
...
@@ -152,10 +152,6 @@ inline void TransposeQKV(const int batch, const int seq_len,
int
QkvToContextPluginDynamic
::
initialize
()
{
return
0
;
}
size_t
QkvToContextPluginDynamic
::
getSerializationSize
()
const
{
return
0
;
}
void
QkvToContextPluginDynamic
::
serialize
(
void
*
buffer
)
const
{}
nvinfer1
::
DimsExprs
QkvToContextPluginDynamic
::
getOutputDimensions
(
int
output_index
,
const
nvinfer1
::
DimsExprs
*
inputs
,
int
nb_inputs
,
nvinfer1
::
IExprBuilder
&
expr_builder
)
{
...
...
paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h
浏览文件 @
1731b32d
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
...
...
@@ -37,7 +51,13 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
scale_
(
scale
),
ban_fp16_
(
ban_fp16
)
{}
QkvToContextPluginDynamic
(
void
const
*
serialData
,
size_t
serialLength
)
{}
QkvToContextPluginDynamic
(
void
const
*
serial_data
,
size_t
serial_length
)
{
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
hidden_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
head_number_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
head_size_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
scale_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
ban_fp16_
);
}
nvinfer1
::
IPluginV2DynamicExt
*
clone
()
const
override
{
return
new
QkvToContextPluginDynamic
(
hidden_
,
head_number_
,
head_size_
,
scale_
,
ban_fp16_
);
...
...
@@ -47,26 +67,36 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
int
getNbOutputs
()
const
override
{
return
1
;
}
int
initialize
()
override
;
size_t
getSerializationSize
()
const
override
;
void
serialize
(
void
*
buffer
)
const
override
;
size_t
getSerializationSize
()
const
override
{
return
SerializedSize
(
hidden_
)
+
SerializedSize
(
head_number_
)
+
SerializedSize
(
head_size_
)
+
SerializedSize
(
scale_
)
+
SerializedSize
(
ban_fp16_
);
}
void
serialize
(
void
*
buffer
)
const
override
{
SerializeValue
(
&
buffer
,
hidden_
);
SerializeValue
(
&
buffer
,
head_number_
);
SerializeValue
(
&
buffer
,
head_size_
);
SerializeValue
(
&
buffer
,
scale_
);
SerializeValue
(
&
buffer
,
ban_fp16_
);
}
nvinfer1
::
DimsExprs
getOutputDimensions
(
int
output_index
,
const
nvinfer1
::
DimsExprs
*
inputs
,
int
nb_inputs
,
nvinfer1
::
IExprBuilder
&
expr_builder
)
override
;
bool
supportsFormatCombination
(
int
pos
,
const
nvinfer1
::
PluginTensorDesc
*
in
O
ut
,
int
nb
Inputs
,
int
nbO
utputs
)
override
;
const
nvinfer1
::
PluginTensorDesc
*
in
_o
ut
,
int
nb
_inputs
,
int
nb_o
utputs
)
override
;
void
configurePlugin
(
const
nvinfer1
::
DynamicPluginTensorDesc
*
in
,
int
nb
I
nputs
,
int
nb
_i
nputs
,
const
nvinfer1
::
DynamicPluginTensorDesc
*
out
,
int
nb
O
utputs
)
override
{}
int
nb
_o
utputs
)
override
{}
size_t
getWorkspaceSize
(
const
nvinfer1
::
PluginTensorDesc
*
inputs
,
int
nb
I
nputs
,
int
nb
_i
nputs
,
const
nvinfer1
::
PluginTensorDesc
*
outputs
,
int
nb
O
utputs
)
const
override
{
int
nb
_o
utputs
)
const
override
{
return
0
;
}
...
...
@@ -75,8 +105,8 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
const
void
*
const
*
inputs
,
void
*
const
*
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
override
;
nvinfer1
::
DataType
getOutputDataType
(
int
index
,
const
nvinfer1
::
DataType
*
input
T
ypes
,
int
nb
I
nputs
)
const
override
;
const
nvinfer1
::
DataType
*
input
_t
ypes
,
int
nb
_i
nputs
)
const
override
;
void
destroy
()
override
{
delete
this
;
}
...
...
@@ -87,6 +117,45 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
float
scale_
;
bool
ban_fp16_
;
};
class
QkvToContextPluginV2Creator
:
public
nvinfer1
::
IPluginCreator
{
public:
QkvToContextPluginV2Creator
()
{}
const
char
*
getPluginName
()
const
override
{
return
"qkv_to_context_plugin"
;
}
const
char
*
getPluginVersion
()
const
override
{
return
"1"
;
}
const
nvinfer1
::
PluginFieldCollection
*
getFieldNames
()
override
{
return
&
field_collection_
;
}
nvinfer1
::
IPluginV2
*
createPlugin
(
const
char
*
name
,
const
nvinfer1
::
PluginFieldCollection
*
fc
)
override
{
return
nullptr
;
}
nvinfer1
::
IPluginV2
*
deserializePlugin
(
const
char
*
name
,
const
void
*
serial_data
,
size_t
serial_length
)
override
{
auto
plugin
=
new
QkvToContextPluginDynamic
(
serial_data
,
serial_length
);
return
plugin
;
}
void
setPluginNamespace
(
const
char
*
lib_namespace
)
override
{
plugin_namespace_
=
lib_namespace
;
}
const
char
*
getPluginNamespace
()
const
override
{
return
plugin_namespace_
.
c_str
();
}
private:
std
::
string
plugin_namespace_
;
std
::
string
plugin_name_
;
nvinfer1
::
PluginFieldCollection
field_collection_
;
std
::
vector
<
nvinfer1
::
PluginField
>
plugin_attributes_
;
};
REGISTER_TRT_PLUGIN_V2
(
QkvToContextPluginV2Creator
);
#endif
}
// namespace plugin
...
...
paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
浏览文件 @
1731b32d
...
...
@@ -32,18 +32,14 @@ namespace plugin {
int
SkipLayerNormPluginDynamic
::
initialize
()
{
cudaMalloc
(
&
bias_gpu_
,
sizeof
(
float
)
*
bias_size_
);
cudaMemcpy
(
bias_gpu_
,
bias_
,
bias_size_
*
sizeof
(
float
),
cudaMemcpy
(
bias_gpu_
,
bias_
.
data
()
,
bias_size_
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
cudaMalloc
(
&
scale_gpu_
,
sizeof
(
float
)
*
scale_size_
);
cudaMemcpy
(
scale_gpu_
,
scale_
,
scale_size_
*
sizeof
(
float
),
cudaMemcpy
(
scale_gpu_
,
scale_
.
data
()
,
scale_size_
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
return
0
;
}
size_t
SkipLayerNormPluginDynamic
::
getSerializationSize
()
const
{
return
0
;
}
void
SkipLayerNormPluginDynamic
::
serialize
(
void
*
buffer
)
const
{}
nvinfer1
::
DimsExprs
SkipLayerNormPluginDynamic
::
getOutputDimensions
(
int
output_index
,
const
nvinfer1
::
DimsExprs
*
inputs
,
int
nb_inputs
,
nvinfer1
::
IExprBuilder
&
expr_builder
)
{
...
...
paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h
浏览文件 @
1731b32d
...
...
@@ -29,61 +29,84 @@ namespace plugin {
#if IS_TRT_VERSION_GE(6000)
class
SkipLayerNormPluginDynamic
:
public
DynamicPluginTensorRT
{
public:
explicit
SkipLayerNormPluginDynamic
(
float
*
bias
,
float
*
scale
,
int
bias_size
,
int
scale_size
,
const
float
eps
,
bool
ban_fp16
)
:
bias_
(
bias
),
scale_
(
scale
),
bias_size_
(
bias_size
),
explicit
SkipLayerNormPluginDynamic
(
const
float
*
bias
,
const
float
*
scale
,
int
bias_size
,
int
scale_size
,
const
float
eps
,
bool
ban_fp16
)
:
bias_size_
(
bias_size
),
scale_size_
(
scale_size
),
eps_
(
eps
),
ban_fp16_
(
ban_fp16
)
{}
SkipLayerNormPluginDynamic
(
void
const
*
serialData
,
size_t
serialLength
)
{}
ban_fp16_
(
ban_fp16
)
{
bias_
.
resize
(
bias_size
);
scale_
.
resize
(
scale_size
);
std
::
copy
(
bias
,
bias
+
bias_size
,
bias_
.
data
());
std
::
copy
(
scale
,
scale
+
scale_size
,
scale_
.
data
());
}
SkipLayerNormPluginDynamic
(
void
const
*
serial_data
,
size_t
serial_length
)
{
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
bias_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
scale_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
bias_size_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
scale_size_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
eps_
);
DeserializeValue
(
&
serial_data
,
&
serial_length
,
&
ban_fp16_
);
}
nvinfer1
::
IPluginV2DynamicExt
*
clone
()
const
override
{
return
new
SkipLayerNormPluginDynamic
(
bias_
,
scale_
,
bias_size_
,
scale_size_
,
eps_
,
ban_fp16_
);
return
new
SkipLayerNormPluginDynamic
(
bias_
.
data
(),
scale_
.
data
(),
bias_size_
,
scale_size_
,
eps_
,
ban_fp16_
);
}
const
char
*
getPluginType
()
const
override
{
return
"skip_layernorm_plugin"
;
}
int
getNbOutputs
()
const
override
{
return
1
;
}
int
initialize
()
override
;
size_t
getSerializationSize
()
const
override
;
void
serialize
(
void
*
buffer
)
const
override
;
size_t
getSerializationSize
()
const
override
{
size_t
ser_size
=
SerializedSize
(
bias_
)
+
SerializedSize
(
scale_
)
+
SerializedSize
(
bias_size_
)
+
SerializedSize
(
scale_size_
)
+
SerializedSize
(
eps_
)
+
SerializedSize
(
eps_
);
return
ser_size
;
}
void
serialize
(
void
*
buffer
)
const
override
{
SerializeValue
(
&
buffer
,
bias_
);
SerializeValue
(
&
buffer
,
scale_
);
SerializeValue
(
&
buffer
,
bias_size_
);
SerializeValue
(
&
buffer
,
scale_size_
);
SerializeValue
(
&
buffer
,
eps_
);
SerializeValue
(
&
buffer
,
ban_fp16_
);
}
nvinfer1
::
DimsExprs
getOutputDimensions
(
int
output_index
,
const
nvinfer1
::
DimsExprs
*
inputs
,
int
nb_inputs
,
nvinfer1
::
IExprBuilder
&
expr_builder
)
override
;
bool
supportsFormatCombination
(
int
pos
,
const
nvinfer1
::
PluginTensorDesc
*
in
O
ut
,
int
nb
Inputs
,
int
nbO
utputs
)
override
;
const
nvinfer1
::
PluginTensorDesc
*
in
_o
ut
,
int
nb
_inputs
,
int
nb_o
utputs
)
override
;
void
configurePlugin
(
const
nvinfer1
::
DynamicPluginTensorDesc
*
in
,
int
nb
I
nputs
,
int
nb
_i
nputs
,
const
nvinfer1
::
DynamicPluginTensorDesc
*
out
,
int
nb
O
utputs
)
override
{}
int
nb
_o
utputs
)
override
{}
size_t
getWorkspaceSize
(
const
nvinfer1
::
PluginTensorDesc
*
inputs
,
int
nb
I
nputs
,
int
nb
_i
nputs
,
const
nvinfer1
::
PluginTensorDesc
*
outputs
,
int
nb
O
utputs
)
const
override
{
int
nb
_o
utputs
)
const
override
{
return
0
;
}
int
enqueue
(
const
nvinfer1
::
PluginTensorDesc
*
input
D
esc
,
const
nvinfer1
::
PluginTensorDesc
*
output
D
esc
,
int
enqueue
(
const
nvinfer1
::
PluginTensorDesc
*
input
_d
esc
,
const
nvinfer1
::
PluginTensorDesc
*
output
_d
esc
,
const
void
*
const
*
inputs
,
void
*
const
*
outputs
,
void
*
workspace
,
cudaStream_t
stream
)
override
;
nvinfer1
::
DataType
getOutputDataType
(
int
index
,
const
nvinfer1
::
DataType
*
input
T
ypes
,
int
nb
I
nputs
)
const
override
;
const
nvinfer1
::
DataType
*
input
_t
ypes
,
int
nb
_i
nputs
)
const
override
;
void
destroy
()
override
{
delete
this
;
}
private:
float
*
bias_
;
float
*
scale_
;
std
::
vector
<
float
>
bias_
;
std
::
vector
<
float
>
scale_
;
float
*
bias_gpu_
;
float
*
scale_gpu_
;
...
...
@@ -94,6 +117,45 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
float
eps_
;
bool
ban_fp16_
;
};
class
SkipLayerNormPluginV2Creator
:
public
nvinfer1
::
IPluginCreator
{
public:
SkipLayerNormPluginV2Creator
()
{}
const
char
*
getPluginName
()
const
override
{
return
"skip_layernorm_plugin"
;
}
const
char
*
getPluginVersion
()
const
override
{
return
"1"
;
}
const
nvinfer1
::
PluginFieldCollection
*
getFieldNames
()
override
{
return
&
field_collection_
;
}
nvinfer1
::
IPluginV2
*
createPlugin
(
const
char
*
name
,
const
nvinfer1
::
PluginFieldCollection
*
fc
)
override
{
return
nullptr
;
}
nvinfer1
::
IPluginV2
*
deserializePlugin
(
const
char
*
name
,
const
void
*
serial_data
,
size_t
serial_length
)
override
{
auto
plugin
=
new
SkipLayerNormPluginDynamic
(
serial_data
,
serial_length
);
return
plugin
;
}
void
setPluginNamespace
(
const
char
*
lib_namespace
)
override
{
plugin_namespace_
=
lib_namespace
;
}
const
char
*
getPluginNamespace
()
const
override
{
return
plugin_namespace_
.
c_str
();
}
private:
std
::
string
plugin_namespace_
;
std
::
string
plugin_name_
;
nvinfer1
::
PluginFieldCollection
field_collection_
;
std
::
vector
<
nvinfer1
::
PluginField
>
plugin_attributes_
;
};
REGISTER_TRT_PLUGIN_V2
(
SkipLayerNormPluginV2Creator
);
#endif
}
// namespace plugin
...
...
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
浏览文件 @
1731b32d
...
...
@@ -175,11 +175,24 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt {
void
serializeBase
(
void
*&
buffer
)
const
;
// NOLINT
private:
std
::
string
name_space_
{
"paddle_trt"
}
;
std
::
string
plugin_base_
{
"plugin_dynamic"
}
;
std
::
string
name_space_
;
std
::
string
plugin_base_
;
};
#endif
template
<
typename
T
>
class
TrtPluginRegistrarV2
{
public:
TrtPluginRegistrarV2
()
{
getPluginRegistry
()
->
registerCreator
(
creator
,
""
);
}
private:
T
creator
;
};
#define REGISTER_TRT_PLUGIN_V2(name) \
static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2<name> \
plugin_registrar_##name {}
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
...
...
paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
浏览文件 @
1731b32d
...
...
@@ -128,6 +128,12 @@ inline void DeserializeValue(void const** buffer, size_t* buffer_size,
return
details
::
Serializer
<
T
>::
Deserialize
(
buffer
,
buffer_size
,
value
);
}
template
<
typename
T
>
inline
void
SerializeCudaPointer
(
void
**
buffer
,
T
*
value
,
int
size
)
{
cudaMemcpy
((
*
buffer
),
value
,
size
*
sizeof
(
T
),
cudaMemcpyDeviceToHost
);
reinterpret_cast
<
char
*&>
(
*
buffer
)
+=
size
*
sizeof
(
T
);
}
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
...
...
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
1731b32d
...
...
@@ -433,6 +433,25 @@ if(WITH_GPU AND TENSORRT_FOUND)
inference_analysis_test
(
test_trt_dynamic_shape_ernie SRCS trt_dynamic_shape_ernie_test.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
TEST_TRT_ERNIE_MODEL
}
/ernie_model_4
)
set
(
TEST_TRT_ERNIE_UNSER_MODEL
"
${
TRT_MODEL_INSTALL_DIR
}
/ernie_test/ernie_model_4_unserialized/"
)
if
(
NOT EXISTS
${
TEST_TRT_ERNIE_UNSER_MODEL
}
)
inference_download_and_uncompress
(
${
TEST_TRT_ERNIE_MODEL
}
${
INFERENCE_URL
}
/tensorrt_test
"ernie_model_4_unserialized.tgz"
)
endif
()
inference_analysis_test
(
test_trt_dynamic_shape_ernie_serialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
TEST_TRT_ERNIE_MODEL
}
/ernie_model_4_unserialized
)
set
(
TEST_TRT_ERNIE_SER_MODEL
"
${
TRT_MODEL_INSTALL_DIR
}
/ernie_test/ernie_model_4_serialized/"
)
if
(
NOT EXISTS
${
TEST_TRT_ERNIE_SER_MODEL
}
)
inference_download_and_uncompress
(
${
TEST_TRT_ERNIE_MODEL
}
${
INFERENCE_URL
}
/tensorrt_test
"ernie_model_4_serialized.tgz"
)
endif
()
inference_analysis_test
(
test_trt_dynamic_shape_ernie_deserialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
EXTRA_DEPS
${
INFERENCE_EXTRA_DEPS
}
ARGS --infer_model=
${
TEST_TRT_ERNIE_MODEL
}
/ernie_model_4_serialized
)
endif
()
set
(
LITE_MODEL_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/lite"
)
...
...
paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
0 → 100644
浏览文件 @
1731b32d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
namespace
paddle
{
namespace
inference
{
void
run
(
const
AnalysisConfig
&
config
,
std
::
vector
<
float
>*
out_data
)
{
auto
predictor
=
CreatePaddlePredictor
(
config
);
auto
input_names
=
predictor
->
GetInputNames
();
int
run_batch
=
1
;
const
int
run_seq_len
=
128
;
std
::
vector
<
int64_t
>
tmp_input
;
std
::
vector
<
float
>
tmp_four_input
;
tmp_input
.
reserve
(
run_batch
*
run_seq_len
);
tmp_four_input
.
reserve
(
run_batch
*
run_seq_len
);
int64_t
i0
[
run_seq_len
]
=
{
1
,
3558
,
4
,
75
,
491
,
89
,
340
,
313
,
93
,
4
,
255
,
10
,
75
,
321
,
4095
,
1902
,
4
,
134
,
49
,
75
,
311
,
14
,
44
,
178
,
543
,
15
,
12043
,
2
,
75
,
201
,
340
,
9
,
14
,
44
,
486
,
218
,
1140
,
279
,
12043
,
2
};
int64_t
i1
[
run_seq_len
]
=
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
};
int64_t
i2
[
run_seq_len
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
,
32
,
33
,
34
,
35
,
36
,
37
,
38
,
39
};
float
i3
[
run_seq_len
]
=
{
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
};
// first input
auto
input_t
=
predictor
->
GetInputTensor
(
input_names
[
0
]);
input_t
->
Reshape
({
run_batch
,
run_seq_len
,
1
});
input_t
->
copy_from_cpu
(
i0
);
// second input
auto
input_t2
=
predictor
->
GetInputTensor
(
input_names
[
1
]);
input_t2
->
Reshape
({
run_batch
,
run_seq_len
,
1
});
input_t2
->
copy_from_cpu
(
i1
);
// third input.
auto
input_t3
=
predictor
->
GetInputTensor
(
input_names
[
2
]);
input_t3
->
Reshape
({
run_batch
,
run_seq_len
,
1
});
input_t3
->
copy_from_cpu
(
i2
);
auto
input_t4
=
predictor
->
GetInputTensor
(
input_names
[
3
]);
input_t4
->
Reshape
({
run_batch
,
run_seq_len
,
1
});
input_t4
->
copy_from_cpu
(
i3
);
ASSERT_TRUE
(
predictor
->
ZeroCopyRun
());
auto
output_names
=
predictor
->
GetOutputNames
();
auto
output_t
=
predictor
->
GetOutputTensor
(
output_names
[
0
]);
std
::
vector
<
int
>
output_shape
=
output_t
->
shape
();
int
out_num
=
std
::
accumulate
(
output_shape
.
begin
(),
output_shape
.
end
(),
1
,
std
::
multiplies
<
int
>
());
out_data
->
resize
(
out_num
);
output_t
->
copy_to_cpu
(
out_data
->
data
());
}
void
trt_ernie
(
bool
with_fp16
,
std
::
vector
<
float
>
result
)
{
AnalysisConfig
config
;
std
::
string
model_dir
=
FLAGS_infer_model
;
SetConfig
(
&
config
,
model_dir
,
true
/* use_gpu */
);
config
.
SwitchUseFeedFetchOps
(
false
);
int
head_number
=
12
;
int
batch
=
1
;
int
min_seq_len
=
1
;
int
max_seq_len
=
128
;
int
opt_seq_len
=
128
;
std
::
vector
<
int
>
min_shape
=
{
batch
,
min_seq_len
,
1
};
std
::
vector
<
int
>
max_shape
=
{
batch
,
max_seq_len
,
1
};
std
::
vector
<
int
>
opt_shape
=
{
batch
,
opt_seq_len
,
1
};
// Set the input's min, max, opt shape
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
min_input_shape
=
{
{
"read_file_0.tmp_0"
,
min_shape
},
{
"read_file_0.tmp_1"
,
min_shape
},
{
"read_file_0.tmp_2"
,
min_shape
},
{
"stack_0.tmp_0"
,
{
batch
,
head_number
,
min_seq_len
,
min_seq_len
}}};
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
max_input_shape
=
{
{
"read_file_0.tmp_0"
,
max_shape
},
{
"read_file_0.tmp_1"
,
max_shape
},
{
"read_file_0.tmp_2"
,
max_shape
},
{
"stack_0.tmp_0"
,
{
batch
,
head_number
,
max_seq_len
,
max_seq_len
}}};
std
::
map
<
std
::
string
,
std
::
vector
<
int
>>
opt_input_shape
=
{
{
"read_file_0.tmp_0"
,
opt_shape
},
{
"read_file_0.tmp_1"
,
opt_shape
},
{
"read_file_0.tmp_2"
,
opt_shape
},
{
"stack_0.tmp_0"
,
{
batch
,
head_number
,
opt_seq_len
,
opt_seq_len
}}};
auto
precision
=
AnalysisConfig
::
Precision
::
kFloat32
;
if
(
with_fp16
)
{
precision
=
AnalysisConfig
::
Precision
::
kHalf
;
}
config
.
EnableTensorRtEngine
(
1
<<
30
,
1
,
5
,
precision
,
true
,
false
);
config
.
SetTRTDynamicShapeInfo
(
min_input_shape
,
max_input_shape
,
opt_input_shape
);
std
::
vector
<
float
>
out_data
;
run
(
config
,
&
out_data
);
for
(
size_t
i
=
0
;
i
<
out_data
.
size
();
i
++
)
{
EXPECT_NEAR
(
result
[
i
],
out_data
[
i
],
1e-6
);
}
}
TEST
(
AnalysisPredictor
,
no_fp16
)
{
std
::
vector
<
float
>
result
=
{
0.597841
,
0.219972
,
0.182187
};
trt_ernie
(
false
,
result
);
}
TEST
(
AnalysisPredictor
,
fp16
)
{
#ifdef SUPPORTS_CUDA_FP16
std
::
vector
<
float
>
result
=
{
0.598336
,
0.219558
,
0.182106
};
trt_ernie
(
true
,
result
);
#endif
}
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
浏览文件 @
1731b32d
...
...
@@ -120,7 +120,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
if
(
with_fp16
)
{
precision
=
AnalysisConfig
::
Precision
::
kHalf
;
}
config
.
EnableTensorRtEngine
(
1
<<
30
,
1
,
5
,
precision
,
false
,
tru
e
);
config
.
EnableTensorRtEngine
(
1
<<
30
,
1
,
5
,
precision
,
false
,
fals
e
);
config
.
SetTRTDynamicShapeInfo
(
min_input_shape
,
max_input_shape
,
opt_input_shape
);
std
::
vector
<
float
>
out_data
;
...
...
paddle/fluid/operators/clip_op.h
浏览文件 @
1731b32d
...
...
@@ -25,17 +25,23 @@ namespace operators {
using
framework
::
Tensor
;
using
platform
::
Transform
;
#ifdef __NVCC__
template
<
typename
T
,
typename
UnaryOperation
>
__global__
void
ClipCudaKernel
(
const
T
*
input
,
T
*
out
,
int
num
,
UnaryOperation
op
)
{
int
idx
=
threadIdx
.
x
+
blockDim
.
x
*
blockIdx
.
x
;
if
(
idx
<
num
)
{
out
[
idx
]
=
op
(
input
[
idx
]);
}
}
#endif
template
<
typename
T
>
class
ClipFunctor
{
public:
explicit
ClipFunctor
(
const
T
min
,
const
T
max
)
:
min_
(
min
),
max_
(
max
)
{}
HOSTDEVICE
T
operator
()(
const
T
&
x
)
const
{
if
(
x
<
min_
)
return
min_
;
else
if
(
x
>
max_
)
return
max_
;
else
return
x
;
return
x
<
min_
?
min_
:
x
>
max_
?
max_
:
x
;
}
private:
...
...
@@ -97,9 +103,20 @@ class ClipKernel : public framework::OpKernel<T> {
T
*
out_data
=
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
const
T
*
x_data
=
x
->
data
<
T
>
();
int64_t
numel
=
x
->
numel
();
Transform
<
DeviceContext
>
trans
;
trans
(
context
.
template
device_context
<
DeviceContext
>(),
x_data
,
x_data
+
numel
,
out_data
,
ClipFunctor
<
T
>
(
min
,
max
));
if
(
platform
::
is_gpu_place
(
context
.
GetPlace
()))
{
#ifdef __NVCC__
int
threads
=
256
;
int
blocks
=
(
numel
+
threads
-
1
)
/
threads
;
ClipCudaKernel
<
T
,
ClipFunctor
<
T
>><<<
blocks
,
threads
,
0
,
context
.
template
device_context
<
platform
::
CUDADeviceContext
>()
.
stream
()
>>>
(
x_data
,
out_data
,
numel
,
ClipFunctor
<
T
>
(
min
,
max
));
#endif
}
else
{
Transform
<
DeviceContext
>
trans
;
trans
(
context
.
template
device_context
<
DeviceContext
>(),
x_data
,
x_data
+
numel
,
out_data
,
ClipFunctor
<
T
>
(
min
,
max
));
}
}
else
if
(
x_var
->
IsType
<
framework
::
SelectedRows
>
())
{
auto
*
x
=
context
.
Input
<
framework
::
SelectedRows
>
(
"X"
);
auto
*
out
=
context
.
Output
<
framework
::
SelectedRows
>
(
"Out"
);
...
...
paddle/fluid/operators/controlflow/compare_op.cc
浏览文件 @
1731b32d
...
...
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/controlflow/compare_op.h"
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -85,14 +88,22 @@ class CompareOp : public framework::OperatorWithKernel {
auto
dim_x
=
context
->
GetInputDim
(
"X"
);
auto
dim_y
=
context
->
GetInputDim
(
"Y"
);
PADDLE_ENFORCE_GE
(
dim_x
.
size
(),
dim_y
.
size
(),
platform
::
errors
::
InvalidArgument
(
"The size of dim_y should not be greater than "
"dim_x's, but received dim_y: %d > dim_x: %d.
\n
"
,
dim_y
.
size
(),
dim_x
.
size
()));
context
->
SetOutputDim
(
"Out"
,
context
->
GetInputDim
(
"X"
));
context
->
ShareLoD
(
"X"
,
"Out"
);
if
(
context
->
GetInputDim
(
"X"
)
==
context
->
GetInputDim
(
"Y"
))
{
context
->
ShareDim
(
"X"
,
/*->*/
"Out"
);
context
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
else
{
int
max_dim
=
std
::
max
(
dim_x
.
size
(),
dim_y
.
size
());
int
axis
=
std
::
abs
(
dim_x
.
size
()
-
dim_y
.
size
());
std
::
vector
<
int
>
x_dims_array
(
max_dim
);
std
::
vector
<
int
>
y_dims_array
(
max_dim
);
std
::
vector
<
int
>
out_dims_array
(
max_dim
);
GetBroadcastDimsArrays
(
dim_x
,
dim_y
,
x_dims_array
.
data
(),
y_dims_array
.
data
(),
out_dims_array
.
data
(),
max_dim
,
axis
);
context
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
out_dims_array
));
// to do
context
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
}
framework
::
OpKernelType
GetExpectedKernelType
(
...
...
paddle/fluid/operators/conv_cudnn_helper.h
浏览文件 @
1731b32d
...
...
@@ -162,19 +162,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
workspace_size
=
GetWorkspaceSize
(
args
,
algo
);
if
(
workspace_size
>
workspace_size_limit
)
{
has_got_workspace_size
=
false
;
VLOG
(
1
)
<<
"Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<<
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
}
if
(
!
has_got_workspace_size
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
workspace_size_limit
=
workspace_size
;
}
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
...
...
@@ -303,19 +291,8 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
#endif
workspace_size
=
GetWorkspaceSize
(
args
,
algo
);
if
(
workspace_size
>
workspace_size_limit
)
{
workspace_size_limit
=
workspace_size
;
has_got_workspace_size
=
false
;
VLOG
(
1
)
<<
"Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<<
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
}
if
(
!
has_got_workspace_size
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
}
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
...
...
@@ -432,19 +409,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
algo
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
workspace_size
=
GetWorkspaceSize
(
args
,
algo
);
if
(
workspace_size
>
workspace_size_limit
)
{
has_got_workspace_size
=
false
;
VLOG
(
1
)
<<
"Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<<
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
}
if
(
!
has_got_workspace_size
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
workspace_size
=
workspace_size_limit
;
}
#else
PADDLE_ENFORCE_CUDA_SUCCESS
(
...
...
paddle/fluid/operators/elementwise/elementwise_op_function.h
浏览文件 @
1731b32d
...
...
@@ -197,6 +197,40 @@ void CommonForwardBroadcastCPU(const framework::Tensor *x,
}
#ifdef __NVCC__
template
<
typename
Functor
,
typename
T
,
typename
OutType
>
__global__
void
ElementwiseKernel
(
const
T
*
x
,
const
T
*
y
,
OutType
*
out
,
int
pre
,
int
n
,
int
post
,
int
total
,
Functor
func
)
{
int
tid
=
threadIdx
.
x
+
blockDim
.
x
*
blockIdx
.
x
;
int
idx
=
tid
/
post
%
n
;
if
(
tid
<
total
)
{
out
[
tid
]
=
func
(
x
[
tid
],
y
[
idx
]);
}
}
template
<
typename
Functor
,
typename
T
,
typename
OutType
>
void
ComputeElementwiseCUDA
(
const
framework
::
Tensor
*
x
,
const
framework
::
Tensor
*
y
,
framework
::
Tensor
*
z
,
int
pre
,
int
n
,
int
post
,
const
platform
::
CUDADeviceContext
&
ctx
,
Functor
func
,
const
bool
is_xsize_larger
=
true
)
{
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
y_data
=
y
->
data
<
T
>
();
OutType
*
out_data
=
z
->
mutable_data
<
OutType
>
(
ctx
.
GetPlace
());
int
numel
=
pre
*
n
*
post
;
int
threads
=
256
;
int
blocks
=
(
numel
+
threads
-
1
)
/
threads
;
if
(
is_xsize_larger
)
{
ElementwiseKernel
<
Functor
,
T
,
OutType
><<<
blocks
,
threads
,
0
,
ctx
.
stream
()
>>>
(
x_data
,
y_data
,
out_data
,
pre
,
n
,
post
,
numel
,
func
);
}
else
{
ElementwiseKernel
<
Functor
,
T
,
OutType
><<<
blocks
,
threads
,
0
,
ctx
.
stream
()
>>>
(
y_data
,
x_data
,
out_data
,
pre
,
n
,
post
,
numel
,
func
);
}
}
template
<
typename
Functor
,
typename
T
,
typename
OutType
=
T
>
__global__
void
CommonForwardBroadcastCUDAKernel
(
const
int
*
x_strides_array
,
const
int
*
y_strides_array
,
...
...
@@ -1908,6 +1942,16 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx,
ctx
,
x
,
y
,
z
,
x_dims
,
y_dims
,
func
,
axis
,
is_xsize_larger
);
return
;
}
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
#ifdef __NVCC__
ComputeElementwiseCUDA
<
Functor
,
T
,
OutType
>
(
x
,
y
,
z
,
pre
,
n
,
post
,
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>(),
func
,
is_xsize_larger
);
#endif
return
;
}
if
(
post
==
1
)
{
functor
.
RunRowWise
(
n
,
pre
);
return
;
...
...
paddle/fluid/operators/fused/conv_fusion_op.cu
浏览文件 @
1731b32d
...
...
@@ -204,11 +204,17 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
auto
x_dims
=
framework
::
vectorize
(
transformed_input
.
dims
());
auto
f_dims
=
framework
::
vectorize
(
filter
->
dims
());
if
(
!
exhaustive_search
)
{
int
perf_count
;
int
best_algo_idx
=
0
;
size_t
tmp_size
=
0
;
std
::
unique_ptr
<
cudnnConvolutionFwdAlgoPerf_t
[]
>
perf_results
(
new
cudnnConvolutionFwdAlgoPerf_t
[
kNUM_CUDNN_FWD_ALGS
]);
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
_v7
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
cudnn_output_desc
,
kNUM_CUDNN_FWD_ALGS
,
&
perf_count
,
perf_results
.
get
()));
algo
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
VLOG
(
3
)
<<
"cuDNN forward algo "
<<
algo
;
}
else
{
std
::
function
<
cudnnConvolutionFwdAlgo_t
()
>
search_func
=
...
...
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
浏览文件 @
1731b32d
...
...
@@ -179,16 +179,23 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnSetTensorNdDescriptor
(
out_desc
[
i
],
cudnn_dtype
,
4
,
out_dims
[
i
].
data
(),
out_strides
[
i
].
data
()));
int
perf_count
;
int
best_algo_idx
=
0
;
size_t
tmp_size
=
0
;
std
::
unique_ptr
<
cudnnConvolutionFwdAlgoPerf_t
[]
>
perf_results
(
new
cudnnConvolutionFwdAlgoPerf_t
[
kNUM_CUDNN_FWD_ALGS
]);
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
_v7
(
handle
,
in_desc
[
i
],
filter_desc
[
i
],
conv_desc
[
i
],
out_desc
[
i
],
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
[
i
]))
;
size_t
tmp_size
=
0
;
kNUM_CUDNN_FWD_ALGS
,
&
perf_count
,
perf_results
.
get
()));
algo
[
i
]
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
handle
,
in_desc
[
i
],
filter_desc
[
i
],
conv_desc
[
i
],
out_desc
[
i
],
algo
[
i
],
&
tmp_size
));
workspace_size_in_bytes
=
std
::
max
(
workspace_size_in_bytes
,
tmp_size
);
}
cudnnActivationDescriptor_t
cudnn_act_desc
=
...
...
paddle/fluid/operators/math/bert_encoder_functor.cu
浏览文件 @
1731b32d
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <cuda_runtime.h>
#include <algorithm>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h"
...
...
paddle/fluid/platform/dynload/cudnn.h
浏览文件 @
1731b32d
...
...
@@ -54,7 +54,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnSetTensorNdDescriptor); \
__macro(cudnnGetTensorNdDescriptor); \
__macro(cudnnGetConvolutionNdForwardOutputDim); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnCreateTensorDescriptor); \
__macro(cudnnDestroyTensorDescriptor); \
__macro(cudnnCreateFilterDescriptor); \
...
...
@@ -102,7 +101,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnDropoutGetStatesSize); \
__macro(cudnnSetDropoutDescriptor); \
__macro(cudnnCreateRNNDescriptor); \
__macro(cudnnSetRNNDescriptor); \
__macro(cudnnGetRNNParamsSize); \
__macro(cudnnGetRNNWorkspaceSize); \
__macro(cudnnGetRNNTrainingReserveSize); \
...
...
@@ -126,12 +124,19 @@ CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 3000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \
__macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
__macro(cudnnGetConvolutionBackwardDataAlgorithm); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm); \
__macro(cudnnGetConvolutionBackwardDataWorkspaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R3
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
#endif
// APIs available after R3:
#if CUDNN_VERSION >= 3000 && CUDNN_VERSION < 8000
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(__macro) \
__macro(cudnnGetConvolutionBackwardFilterAlgorithm); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnSetRNNDescriptor);
CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
#endif
// APIs available after R4:
#if CUDNN_VERSION >= 4007
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \
...
...
@@ -183,6 +188,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
#endif
#if CUDNN_VERSION >= 8000
#define CUDNN_DNN_ROUTINE_EACH_R8(__macro) __macro(cudnnSetRNNDescriptor_v8);
CUDNN_DNN_ROUTINE_EACH_R8
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
#endif
}
// namespace dynload
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/dynload/tensorrt.cc
浏览文件 @
1731b32d
...
...
@@ -13,18 +13,62 @@
limitations under the License. */
#include "paddle/fluid/platform/dynload/tensorrt.h"
#include <string>
namespace
paddle
{
namespace
platform
{
namespace
dynload
{
std
::
once_flag
tensorrt_dso_flag
;
void
*
tensorrt_dso_handle
;
void
*
tensorrt_dso_handle
;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
TENSORRT_RAND_ROUTINE_EACH
(
DEFINE_WRAP
);
void
*
GetTensorRtHandle
()
{
#if defined(__APPLE__) || defined(__OSX__)
std
::
string
dso_name
=
"libnvinfer.dylib"
;
#elif defined(_WIN32)
std
::
string
dso_name
=
"nvinfer.dll"
;
#else
std
::
string
dso_name
=
"libnvinfer.so"
;
#endif
#if !defined(_WIN32)
int
dynload_flags
=
RTLD_LAZY
|
RTLD_LOCAL
;
#else
int
dynload_flags
=
0
;
#endif // !_WIN32
void
*
dso_handle
=
dlopen
(
dso_name
.
c_str
(),
dynload_flags
);
if
(
nullptr
==
dso_handle
)
{
auto
error_msg
=
"TensorRT dynamic library (%s) that Paddle depends on is not "
"configured correctly. (error code is %s)
\n
"
" Suggestions:
\n
"
" 1. Check if TensorRT "
"is installed correctly and its version is matched with paddlepaddle "
"you installed.
\n
"
" 2. Configure TensorRT dynamic library environment variables as "
"follows:
\n
"
" - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`
\n
"
" - Windows: set PATH by `set PATH=XXX;%PATH%`
\n
"
" - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` "
"[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is "
"impossible unless System Integrity Protection (SIP) is disabled.]"
;
#if !defined(_WIN32)
auto
errorno
=
dlerror
();
#else
auto
errorno
=
GetLastError
();
#endif // !_WIN32
std
::
cerr
<<
string
::
Sprintf
(
error_msg
,
dso_name
,
errorno
);
}
return
dso_handle
;
}
}
// namespace dynload
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/dynload/tensorrt.h
浏览文件 @
1731b32d
...
...
@@ -27,6 +27,8 @@ namespace paddle {
namespace
platform
{
namespace
dynload
{
void
*
GetTensorRtHandle
();
extern
std
::
once_flag
tensorrt_dso_flag
;
extern
void
*
tensorrt_dso_handle
;
...
...
@@ -36,8 +38,7 @@ extern void* tensorrt_dso_handle;
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using tensorrt_func = decltype(&::__name); \
std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = \
paddle::platform::dynload::GetTensorRtDsoHandle(); \
tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \
PADDLE_ENFORCE_NOT_NULL(tensorrt_dso_handle, \
platform::errors::Unavailable( \
"Load tensorrt %s failed", #__name)); \
...
...
@@ -53,7 +54,8 @@ extern void* tensorrt_dso_handle;
#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
__macro(createInferBuilder_INTERNAL); \
__macro(createInferRuntime_INTERNAL);
__macro(createInferRuntime_INTERNAL); \
__macro(getPluginRegistry);
TENSORRT_RAND_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP
)
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
1731b32d
...
...
@@ -958,6 +958,7 @@ function parallel_test() {
ut_total_startTime_s
=
`
date
+%s
`
mkdir
-p
${
PADDLE_ROOT
}
/build
cd
${
PADDLE_ROOT
}
/build
pip
install
${
PADDLE_ROOT
}
/build/python/dist/
*
whl
if
[
"
$WITH_GPU
"
==
"ON"
]
;
then
parallel_test_base_gpu
else
...
...
python/paddle/fleet/base/fleet_base.py
浏览文件 @
1731b32d
...
...
@@ -279,8 +279,11 @@ class Fleet(object):
# for more examples, please reference https://github.com/PaddlePaddle/Fleet
"""
context
=
{}
# cache original feed forward program
self
.
origin_main_program
=
loss
.
block
.
program
context
[
"origin_main_program"
]
=
self
.
origin_main_program
context
[
"loss"
]
=
loss
if
startup_program
==
None
:
self
.
origin_startup_program
=
\
paddle
.
default_startup_program
().
clone
(
for_test
=
False
)
...
...
@@ -288,6 +291,8 @@ class Fleet(object):
else
:
self
.
origin_startup_program
=
\
startup_program
.
clone
(
for_test
=
False
)
context
[
"origin_startup_program"
]
=
startup_program
context
[
"role_maker"
]
=
self
.
_role_maker
# compile time
distributed_optimizer_list
=
\
...
...
@@ -317,6 +322,9 @@ class Fleet(object):
valid_strategy
=
self
.
strategy_compiler
.
_get_valid_strategy
(
self
.
user_defined_strategy
,
can_not_apply_optimizer_list
)
context
[
"valid_strategy"
]
=
valid_strategy
self
.
valid_strategy
=
valid_strategy
optimize_ops
=
[]
...
...
@@ -334,6 +342,8 @@ class Fleet(object):
parameter_list
=
parameter_list
,
no_grad_set
=
no_grad_set
)
context
[
"program_optimize_ops"
]
=
optimize_ops
context
[
"program_params_grads"
]
=
params_grads
if
graph_optimizer
:
optimize_ops
,
params_grads
=
graph_optimizer
.
minimize
(
loss
,
...
...
@@ -344,12 +354,13 @@ class Fleet(object):
# if a graph optimizer takes effect, mostly
# optimizers_ops and params_grads are None
# i.e. users can not modify current computation graph anymore
context
[
"graph_optimize_ops"
]
=
optimize_ops
context
[
"graph_optimize_grads"
]
=
params_grads
if
self
.
_runtime_handle
is
None
:
self
.
_runtime_handle
=
RuntimeFactory
().
_create_runtime
(
valid_strategy
,
self
.
_role_maker
,
optimize_ops
,
params_grads
)
self
.
_runtime_handle
=
RuntimeFactory
().
_create_runtime
(
context
)
if
self
.
_util
is
None
:
self
.
_util
=
UtilFactory
().
_create_util
(
valid_strategy
,
self
.
_role_maker
,
optimize_ops
,
params_grads
)
self
.
_util
=
UtilFactory
().
_create_util
(
context
)
return
optimize_ops
,
params_grads
python/paddle/fleet/base/runtime_factory.py
浏览文件 @
1731b32d
...
...
@@ -18,10 +18,8 @@ class RuntimeFactory(object):
def
__init__
(
self
):
pass
def
_create_runtime
(
self
,
final_dist_strategy
,
role_maker
,
opt_ops
,
params_grads
):
if
role_maker
.
_is_collective
:
def
_create_runtime
(
self
,
context
):
if
context
[
"role_maker"
].
_is_collective
:
collective_runtime
=
CollectiveRuntime
()
collective_runtime
.
_set_basic_info
(
final_dist_strategy
,
role_maker
,
opt_ops
,
params_grads
)
collective_runtime
.
_set_basic_info
(
context
)
return
collective_runtime
python/paddle/fleet/base/util_factory.py
浏览文件 @
1731b32d
...
...
@@ -20,11 +20,10 @@ __all__ = ['UtilBase']
class
UtilFactory
(
object
):
def
_create_util
(
self
,
dist_strategy
,
role_maker
,
optimize_ops
,
params_grads
):
def
_create_util
(
self
,
context
):
util
=
UtilBase
()
util
.
_set_strategy
(
dist_strategy
)
util
.
_set_role_maker
(
role_maker
)
util
.
_set_strategy
(
context
[
"valid_strategy"
]
)
util
.
_set_role_maker
(
context
[
"role_maker"
]
)
return
util
...
...
python/paddle/fleet/cloud_utils.py
0 → 100644
浏览文件 @
1731b32d
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
paddle
from
paddle.fleet.launch_utils
import
get_cluster
,
logger
def
get_cloud_cluster
(
args_node_ips
,
selected_gpus
,
args_port
=
6170
):
"""
args_node_ips, args_node_ip:string
"""
#you can automatically get ip info while using paddlecloud multi nodes mode.
node_ips
=
os
.
getenv
(
"PADDLE_TRAINERS"
)
assert
node_ips
is
not
None
,
"PADDLE_TRAINERS should not be None"
node_ip
=
os
.
getenv
(
"POD_IP"
)
assert
node_ip
is
not
None
,
"POD_IP should not be None"
node_rank
=
os
.
getenv
(
"PADDLE_TRAINER_ID"
)
assert
node_rank
is
not
None
,
"PADDLE_TRAINER_ID should not be None"
node_ips
=
node_ips
.
split
(
","
)
num_nodes
=
len
(
node_ips
)
node_rank
=
int
(
node_rank
)
if
args_node_ips
!=
"127.0.0.1"
and
args_node_ips
!=
","
.
join
(
node_ips
):
logger
.
warning
(
"Please NOTE: When using paddlecloud, cluster_node_ips is
\
automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).
\
Your input cluster_node_ips: {} doesn't equals to IPs: {} from
\
paddlecloud environment."
.
format
(
args_node_ips
,
node_ips
))
started_port
=
args_port
print
(
"num_nodes:"
,
num_nodes
)
if
num_nodes
>
1
:
try
:
paddle_port
=
int
(
os
.
getenv
(
"PADDLE_PORT"
,
""
))
paddle_port_num
=
int
(
os
.
getenv
(
"TRAINER_PORTS_NUM"
,
""
))
if
paddle_port_num
>=
len
(
selected_gpus
)
and
paddle_port
!=
args_port
:
logger
.
warning
(
"Use Cloud specified port:{}."
.
format
(
paddle_port
))
started_port
=
paddle_port
except
Exception
as
e
:
print
(
e
)
pass
if
started_port
is
None
:
started_port
=
6170
logger
.
debug
(
"parsed from args:node_ips:{}
\
node_ip:{} node_rank:{} started_port:{}"
.
format
(
node_ips
,
node_ip
,
node_rank
,
started_port
))
ports
=
[
x
for
x
in
range
(
started_port
,
started_port
+
len
(
selected_gpus
))]
cluster
,
pod
=
get_cluster
(
node_ips
,
node_ip
,
ports
,
selected_gpus
)
return
cluster
,
cluster
.
pods
[
node_rank
]
def
use_paddlecloud
():
node_ips
=
os
.
getenv
(
"PADDLE_TRAINERS"
)
node_ip
=
os
.
getenv
(
"POD_IP"
)
node_rank
=
os
.
getenv
(
"PADDLE_TRAINER_ID"
)
if
node_ips
is
None
or
node_ip
is
None
or
node_rank
is
None
:
return
False
else
:
return
True
def
get_trainers_num
():
return
int
(
os
.
getenv
(
"PADDLE_TRAINERS_NUM"
,
"1"
))
python/paddle/fleet/launch.py
0 → 100644
浏览文件 @
1731b32d
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
paddle.distributed.launch is a module that spawns multiple distributed
process on each training node for gpu training and cpu training.
Usage:
In both of single node training or multiple node training, this module
launch a process on each of the given gpu card or cpu machine.
GPU training:
1. for single node training with all visible gpu cards:
fleetrun your_training_py (arg1 arg2 and all others)
2. for single node training with [0,4) cards
fleetrun --gpus="0,1,2,3" your_training_py (arg1 arg2 and all others)
3. for multiple node training such as two node:192.168.0.16, 192.168.0.17
on 192.168.0.16:
fleetrun --ips="192.168.0.16,192.168.0.17" --node_ip=192.168.0.16
\
your_training_py (arg1 arg2 and all others)
on 192.168.0.17:
fleetrun --ips="192.168.0.16,192.168.0.17"
\
--node_ip=192.168.0.17
\
your_training_py (arg1 arg2 and all others)
CPU training:
1. for single node training with multi servers and workers:
fleetrun --server_num=1 --worker_num=4 your_training_py (arg1 arg2 and all others)
2. for multiple node training such as two node:192.168.0.16, 192.168.0.17
\
with 2 servers and 4 workers.
on 192.168.0.16:
fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171"
\
--workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175"
\
your_training_py (arg1 arg2 and all others)
on 192.168.0.17:
fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171"
\
--workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175"
\
your_training_py (arg1 arg2 and all others)
"""
from
__future__
import
print_function
import
sys
from
sys
import
version
import
subprocess
import
os
import
time
import
six
import
copy
from
argparse
import
ArgumentParser
,
REMAINDER
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fleet.launch_utils
import
*
import
paddle.fleet.cloud_utils
as
cloud_utils
def
_print_arguments
(
args
):
print
(
"----------- Configuration Arguments -----------"
)
for
arg
,
value
in
sorted
(
six
.
iteritems
(
vars
(
args
))):
print
(
"%s: %s"
%
(
arg
,
value
))
print
(
"------------------------------------------------"
)
def
_parse_args
():
"""
Helper function parsing the command line options
@retval ArgumentParser
"""
parser
=
ArgumentParser
(
description
=
'''start paddle training using multi-process mode.
see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
'''
)
#Optional arguments for the launch helper
parser
.
add_argument
(
"--ips"
,
type
=
str
,
default
=
"127.0.0.1"
,
help
=
"Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17.."
)
parser
.
add_argument
(
"--gpus"
,
type
=
str
,
default
=
None
,
help
=
"It's for gpu training and the training process will run on the gpus,"
"each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training."
)
parser
.
add_argument
(
"--servers"
,
type
=
str
,
default
=
""
,
help
=
"User defined servers ip:port"
)
parser
.
add_argument
(
"--workers"
,
type
=
str
,
default
=
""
,
help
=
"User defined workers ip:port"
)
parser
.
add_argument
(
"--worker_num"
,
type
=
int
,
default
=
2
,
help
=
"number of workers"
)
parser
.
add_argument
(
"--server_num"
,
type
=
int
,
default
=
2
,
help
=
"number of servers"
)
parser
.
add_argument
(
"--log_dir"
,
type
=
str
,
help
=
"The path for each process's log.If it's not set, the log will printed to default pipe."
)
#positional
parser
.
add_argument
(
"training_script"
,
type
=
str
,
help
=
"The full path to the single GPU training "
"program/script to be launched in parallel, "
"followed by all the arguments for the "
"training script"
)
#rest from the training program
parser
.
add_argument
(
'training_script_args'
,
nargs
=
REMAINDER
)
return
parser
.
parse_args
()
def
get_cluster_from_args
(
args
,
gpus
):
node_ips
=
[
x
.
strip
()
for
x
in
args
.
ips
.
split
(
','
)]
if
len
(
node_ips
)
==
1
:
node_ip
=
node_ips
[
0
]
else
:
_
,
node_ip
=
get_host_name_ip
()
# node_ip = args.node_ip
assert
node_ip
in
node_ips
,
"Can't find your local ip {%s} in node_ips:{%s}"
\
%
(
node_ip
,
node_ips
)
node_rank
=
node_ips
.
index
(
node_ip
)
logger
.
debug
(
"parsed from args:node_ips:{} node_ip:{} node_rank:{}"
.
format
(
node_ips
,
node_ip
,
node_rank
))
free_ports
=
None
if
not
cloud_utils
.
use_paddlecloud
()
and
len
(
node_ips
)
<=
1
and
os
.
environ
.
get
(
'FLAGS_START_PORT'
)
is
None
:
free_ports
=
find_free_ports
(
len
(
gpus
))
if
free_ports
is
not
None
:
free_ports
=
list
(
free_ports
)
else
:
start_port
=
6070
if
os
.
environ
.
get
(
'FLAGS_START_PORT'
)
is
not
None
:
start_port
=
os
.
environ
.
get
(
'FLAGS_START_PORT'
)
free_ports
=
[
x
for
x
in
range
(
start_port
,
start_port
+
len
(
gpus
))]
return
get_cluster
(
node_ips
,
node_ip
,
free_ports
,
gpus
)
def
get_gpus
(
gpus
):
if
gpus
is
None
:
gpus_num
=
fluid
.
core
.
get_cuda_device_count
()
gpus
=
[
str
(
x
)
for
x
in
range
(
0
,
gpus_num
)]
else
:
cuda_visible_devices
=
os
.
getenv
(
"CUDA_VISIBLE_DEVICES"
)
if
cuda_visible_devices
is
None
or
cuda_visible_devices
==
""
:
gpus
=
[
x
.
strip
()
for
x
in
gpus
.
split
(
','
)]
else
:
# change gpus into relative values
# e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.gpus=4,5,6,7;
# therefore gpus=0,1,2,3
cuda_visible_devices_list
=
cuda_visible_devices
.
split
(
','
)
for
x
in
gpus
.
split
(
','
):
assert
x
in
cuda_visible_devices_list
,
"Can't find "
\
"your gpus %s in CUDA_VISIBLE_DEVICES[%s]."
\
%
(
x
,
cuda_visible_devices
)
gpus
=
[
cuda_visible_devices_list
.
index
(
x
.
strip
())
for
x
in
gpus
.
split
(
','
)
]
return
gpus
def
launch_collective
(
args
):
# parse arguments, used for cloud-single-machine and local
gpus
=
get_gpus
(
args
.
gpus
)
trainers_num
=
cloud_utils
.
get_trainers_num
()
logger
.
debug
(
"parsed from args trainerss_num:{} gpus:{}"
.
format
(
trainers_num
,
gpus
))
cluster
=
None
pod
=
None
if
cloud_utils
.
use_paddlecloud
()
and
trainers_num
!=
1
:
cluster
,
pod
=
cloud_utils
.
get_cloud_cluster
(
args
.
ips
,
gpus
)
logger
.
info
(
"get cluster from cloud:{}"
.
format
(
cluster
))
else
:
# trainers_num = 1 or not use paddlecloud ips="a,b"
cluster
,
pod
=
get_cluster_from_args
(
args
,
gpus
)
logger
.
info
(
"get cluster from args:{}"
.
format
(
cluster
))
procs
=
start_local_trainers
(
cluster
,
pod
,
training_script
=
args
.
training_script
,
training_script_args
=
args
.
training_script_args
,
log_dir
=
args
.
log_dir
)
while
True
:
alive
=
watch_local_trainers
(
procs
,
cluster
.
trainers_nranks
())
if
not
alive
:
logger
.
info
(
"Local procs complete, POD info:{}"
.
format
(
pod
))
break
time
.
sleep
(
3
)
def
launch_ps
(
args
):
worker_num
=
args
.
worker_num
server_num
=
args
.
server_num
start_port
=
6170
if
os
.
environ
.
get
(
'FLAGS_START_PORT'
)
is
not
None
:
start_port
=
os
.
environ
.
get
(
'FLAGS_START_PORT'
)
default_env
=
os
.
environ
.
copy
()
current_env
=
copy
.
copy
(
default_env
)
current_env
.
pop
(
"http_proxy"
,
None
)
current_env
.
pop
(
"https_proxy"
,
None
)
procs
=
[]
cmds
=
[]
log_fns
=
[]
ports
=
range
(
start_port
,
start_port
+
server_num
,
1
)
default_endpoints
=
","
.
join
([
"127.0.0.1:"
+
str
(
x
)
for
x
in
ports
])
user_endpoints
=
""
if
args
.
servers
==
""
:
user_endpoints
=
default_endpoints
else
:
user_endpoints
=
args
.
servers
user_endpoints_ips
=
[
x
.
split
(
":"
)[
0
]
for
x
in
user_endpoints
.
split
(
","
)]
user_endpoints_port
=
[
x
.
split
(
":"
)[
1
]
for
x
in
user_endpoints
.
split
(
","
)]
for
i
in
range
(
server_num
):
current_env
.
update
({
"PADDLE_PSERVERS_IP_PORT_LIST"
:
user_endpoints
,
"PADDLE_PORT"
:
user_endpoints_port
[
i
],
"TRAINING_ROLE"
:
"PSERVER"
,
"PADDLE_TRAINERS_NUM"
:
str
(
worker_num
),
"POD_IP"
:
user_endpoints_ips
[
i
]
})
cmd
=
[
sys
.
executable
,
"-u"
,
args
.
training_script
]
+
args
.
training_script_args
cmds
.
append
(
cmd
)
if
args
.
log_dir
is
not
None
:
os
.
system
(
"mkdir -p {}"
.
format
(
args
.
log_dir
))
fn
=
open
(
"%s/serverlog.%d"
%
(
args
.
log_dir
,
i
),
"w"
)
log_fns
.
append
(
fn
)
proc
=
subprocess
.
Popen
(
cmd
,
env
=
current_env
,
stdout
=
fn
,
stderr
=
fn
)
else
:
proc
=
subprocess
.
Popen
(
cmd
,
env
=
current_env
)
procs
.
append
(
proc
)
for
i
in
range
(
worker_num
):
current_env
.
update
({
"PADDLE_PSERVERS_IP_PORT_LIST"
:
user_endpoints
,
"PADDLE_TRAINERS_NUM"
:
str
(
worker_num
),
"TRAINING_ROLE"
:
"TRAINER"
,
"PADDLE_TRAINER_ID"
:
str
(
i
)
})
cmd
=
[
sys
.
executable
,
"-u"
,
args
.
training_script
]
+
args
.
training_script_args
cmds
.
append
(
cmd
)
if
args
.
log_dir
is
not
None
:
os
.
system
(
"mkdir -p {}"
.
format
(
args
.
log_dir
))
fn
=
open
(
"%s/workerlog.%d"
%
(
args
.
log_dir
,
i
),
"w"
)
log_fns
.
append
(
fn
)
proc
=
subprocess
.
Popen
(
cmd
,
env
=
current_env
,
stdout
=
fn
,
stderr
=
fn
)
else
:
proc
=
subprocess
.
Popen
(
cmd
,
env
=
current_env
)
procs
.
append
(
proc
)
# only wait worker to finish here
for
i
,
proc
in
enumerate
(
procs
):
if
i
<
server_num
:
continue
procs
[
i
].
wait
()
if
len
(
log_fns
)
>
0
:
log_fns
[
i
].
close
()
print
(
"all workers exit, going to finish parameter server"
,
file
=
sys
.
stderr
)
for
i
in
range
(
server_num
):
if
len
(
log_fns
)
>
0
:
log_fns
[
i
].
close
()
procs
[
i
].
terminate
()
print
(
"all parameter server are killed"
,
file
=
sys
.
stderr
)
def
launch
():
args
=
_parse_args
()
logger
=
get_logger
()
_print_arguments
(
args
)
ps_args
=
[
'--worker_num'
,
'--server_num'
,
'--servers'
,
'--workers'
]
collective_args
=
[
'--ips'
,
'--gpus'
]
has_ps_args
=
[
ps_arg
for
ps_arg
in
ps_args
if
ps_arg
in
" "
.
join
(
sys
.
argv
[
1
:
-
1
])
]
has_collective_args
=
[
co_arg
for
co_arg
in
collective_args
if
co_arg
in
" "
.
join
(
sys
.
argv
[
1
:
-
1
])
]
if
len
(
has_ps_args
)
>
0
or
fluid
.
core
.
get_cuda_device_count
()
==
0
:
logger
.
info
(
"Run cpu parameter-sever mode."
)
launch_ps
(
args
)
elif
len
(
has_collective_args
)
>
0
:
logger
.
info
(
"Run gpu collective mode."
)
launch_collective
(
args
)
else
:
logger
.
warning
(
"Not found distinct args. Default use gpu collective mode"
)
launch_collective
(
args
)
if
__name__
==
"__main__"
:
launch
()
python/paddle/fleet/launch_utils.py
0 → 100644
浏览文件 @
1731b32d
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
functools
import
logging
import
socket
import
time
import
os
import
signal
import
copy
import
sys
import
subprocess
from
contextlib
import
closing
import
socket
logger
=
logging
.
getLogger
(
"root"
)
logger
.
propagate
=
False
class
Cluster
(
object
):
def
__init__
(
self
,
hdfs
):
self
.
job_server
=
None
self
.
pods
=
[]
self
.
hdfs
=
None
self
.
job_stage_flag
=
None
def
__str__
(
self
):
return
"job_server:{} pods:{} job_stage_flag:{} hdfs:{}"
.
format
(
self
.
job_server
,
[
str
(
pod
)
for
pod
in
self
.
pods
],
self
.
job_stage_flag
,
self
.
hdfs
)
def
__eq__
(
self
,
cluster
):
if
len
(
self
.
pods
)
!=
len
(
cluster
.
pods
):
return
False
for
a
,
b
in
zip
(
self
.
pods
,
cluster
.
pods
):
if
a
!=
b
:
return
False
if
self
.
job_stage_flag
!=
cluster
.
job_stage_flag
:
return
False
return
True
def
__ne__
(
self
,
cluster
):
return
not
self
.
__eq__
(
cluster
)
def
update_pods
(
cluster
):
self
.
pods
=
copy
.
copy
(
cluster
.
pods
)
def
trainers_nranks
(
self
):
return
len
(
self
.
trainers_endpoints
())
def
pods_nranks
(
self
):
return
len
(
self
.
pods
)
def
trainers_endpoints
(
self
):
r
=
[]
for
pod
in
self
.
pods
:
for
t
in
pod
.
trainers
:
r
.
append
(
t
.
endpoint
)
return
r
def
pods_endpoints
(
self
):
r
=
[]
for
pod
in
self
.
pods
:
ep
=
"{}:{}"
.
format
(
pod
.
addr
,
pod
.
port
)
assert
pod
.
port
!=
None
and
pod
.
addr
!=
None
,
"{} not a valid endpoint"
.
format
(
ep
)
r
.
append
(
ep
)
return
r
def
get_pod_by_id
(
self
,
pod_id
):
for
pod
in
self
.
pods
:
if
str
(
pod_id
)
==
str
(
pod
.
id
):
return
pod
return
None
class
JobServer
(
object
):
def
__init__
(
self
):
self
.
endpoint
=
None
def
__str__
(
self
):
return
"{}"
.
format
(
self
.
endpoint
)
def
__eq__
(
self
,
j
):
return
self
.
endpint
==
j
.
endpoint
def
__ne__
(
self
,
j
):
return
not
self
==
j
class
Trainer
(
object
):
def
__init__
(
self
):
self
.
gpus
=
[]
self
.
endpoint
=
None
self
.
rank
=
None
def
__str__
(
self
):
return
"gpu:{} endpoint:{} rank:{}"
.
format
(
self
.
gpus
,
self
.
endpoint
,
self
.
rank
)
def
__eq__
(
self
,
t
):
if
len
(
self
.
gpus
)
!=
len
(
t
.
gpus
):
return
False
if
self
.
endpoint
!=
t
.
endpoint
or
\
self
.
rank
!=
t
.
rank
:
return
False
for
a
,
b
in
zip
(
self
.
gpus
,
t
.
gpus
):
if
a
!=
b
:
return
False
return
True
def
__ne__
(
self
,
t
):
return
not
self
==
t
def
rank
(
self
):
return
self
.
rank
class
Pod
(
object
):
def
__init__
(
self
):
self
.
rank
=
None
self
.
id
=
None
self
.
addr
=
None
self
.
port
=
None
self
.
trainers
=
[]
self
.
gpus
=
[]
def
__str__
(
self
):
return
"rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{}"
.
format
(
self
.
rank
,
self
.
id
,
self
.
addr
,
self
.
port
,
self
.
gpus
,
[
str
(
t
)
for
t
in
self
.
trainers
])
def
__eq__
(
self
,
pod
):
if
self
.
rank
!=
pod
.
rank
or
\
self
.
id
!=
pod
.
id
or
\
self
.
addr
!=
pod
.
addr
or
\
self
.
port
!=
pod
.
port
:
logger
.
debug
(
"pod {} != pod"
.
format
(
self
,
pod
))
return
False
if
len
(
self
.
trainers
)
!=
len
(
pod
.
trainers
):
logger
.
debug
(
"trainers {} != {}"
.
format
(
self
.
trainers
,
pod
.
trainers
))
return
False
for
i
in
range
(
len
(
self
.
trainers
)):
if
self
.
trainers
[
i
]
!=
pod
.
trainers
[
i
]:
logger
.
debug
(
"trainer {} != {}"
.
format
(
self
.
trainers
[
i
],
pod
.
trainers
[
i
]))
return
False
return
True
def
__ne__
(
self
,
pod
):
return
not
self
==
pod
def
parse_response
(
self
,
res_pods
):
pass
def
rank
(
self
):
return
self
.
rank
def
get_visible_gpus
(
self
):
r
=
""
for
g
in
self
.
gpus
:
r
+=
"{},"
.
format
(
g
)
assert
r
!=
""
,
"this pod {} can't see any gpus"
.
format
(
self
)
r
=
r
[:
-
1
]
return
r
def
get_logger
(
log_level
=
20
,
name
=
"root"
):
logger
=
logging
.
getLogger
(
name
)
logger
.
setLevel
(
log_level
)
log_handler
=
logging
.
StreamHandler
()
log_format
=
logging
.
Formatter
(
'%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
)
log_handler
.
setFormatter
(
log_format
)
logger
.
addHandler
(
log_handler
)
return
logger
def
get_cluster
(
node_ips
,
node_ip
,
paddle_ports
,
selected_gpus
):
assert
type
(
paddle_ports
)
is
list
,
"paddle_ports must be list"
cluster
=
Cluster
(
hdfs
=
None
)
trainer_rank
=
0
for
node_rank
,
ip
in
enumerate
(
node_ips
):
pod
=
Pod
()
pod
.
rank
=
node_rank
pod
.
addr
=
ip
for
i
in
range
(
len
(
selected_gpus
)):
trainer
=
Trainer
()
trainer
.
gpus
.
append
(
selected_gpus
[
i
])
trainer
.
endpoint
=
"%s:%d"
%
(
ip
,
paddle_ports
[
i
])
trainer
.
rank
=
trainer_rank
trainer_rank
+=
1
pod
.
trainers
.
append
(
trainer
)
cluster
.
pods
.
append
(
pod
)
pod_rank
=
node_ips
.
index
(
node_ip
)
return
cluster
,
cluster
.
pods
[
pod_rank
]
def
terminate_local_procs
(
procs
):
for
p
in
procs
:
if
p
.
proc
.
poll
()
is
None
:
p
.
proc
.
terminate
()
p
.
log_fn
.
close
()
logger
.
debug
(
"terminate process id:{}"
.
format
(
p
.
proc
.
pid
))
#wait all process terminiated
time
.
sleep
(
3
)
for
step
in
range
(
0
,
50
):
alive
=
False
for
p
in
procs
:
if
p
.
proc
.
poll
()
is
None
:
# not termniate
os
.
kill
(
p
.
proc
.
pid
,
signal
.
SIGKILL
)
alive
=
True
if
not
alive
:
logger
.
info
(
"terminate all the procs"
)
return
time
.
sleep
(
3
)
logger
.
fatal
(
"can't kill all process and exit"
)
exit
(
1
)
def
get_host_name_ip
():
try
:
host_name
=
socket
.
gethostname
()
host_ip
=
socket
.
gethostbyname
(
host_name
)
return
host_name
,
host_ip
except
:
return
None
def
add_arguments
(
argname
,
type
,
default
,
help
,
argparser
,
**
kwargs
):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type
=
distutils
.
util
.
strtobool
if
type
==
bool
else
type
argparser
.
add_argument
(
"--"
+
argname
,
default
=
default
,
type
=
type
,
help
=
help
+
' Default: %(default)s.'
,
**
kwargs
)
def
find_free_ports
(
num
):
def
__free_port
():
with
closing
(
socket
.
socket
(
socket
.
AF_INET
,
socket
.
SOCK_STREAM
))
as
s
:
s
.
bind
((
''
,
0
))
return
s
.
getsockname
()[
1
]
port_set
=
set
()
step
=
0
while
True
:
port
=
__free_port
()
if
port
not
in
port_set
:
port_set
.
add
(
port
)
if
len
(
port_set
)
>=
num
:
return
port_set
step
+=
1
if
step
>
100
:
print
(
"can't find avilable port and use the specified static port now!"
)
return
None
return
None
class
TrainerProc
(
object
):
def
__init__
(
self
):
self
.
proc
=
None
self
.
log_fn
=
None
self
.
log_offset
=
None
self
.
rank
=
None
self
.
local_rank
=
None
self
.
cmd
=
None
def
start_local_trainers
(
cluster
,
pod
,
training_script
,
training_script_args
,
log_dir
=
None
):
current_env
=
copy
.
copy
(
os
.
environ
.
copy
())
#paddle broadcast ncclUniqueId use socket, and
#proxy maybe make trainers unreachable, so delete them.
#if we set them to "", grpc will log error message "bad uri"
#so just delete them.
current_env
.
pop
(
"http_proxy"
,
None
)
current_env
.
pop
(
"https_proxy"
,
None
)
procs
=
[]
for
idx
,
t
in
enumerate
(
pod
.
trainers
):
proc_env
=
{
"FLAGS_selected_gpus"
:
"%s"
%
","
.
join
([
str
(
g
)
for
g
in
t
.
gpus
]),
"PADDLE_TRAINER_ID"
:
"%d"
%
t
.
rank
,
"PADDLE_CURRENT_ENDPOINT"
:
"%s"
%
t
.
endpoint
,
"PADDLE_TRAINERS_NUM"
:
"%d"
%
cluster
.
trainers_nranks
(),
"PADDLE_TRAINER_ENDPOINTS"
:
","
.
join
(
cluster
.
trainers_endpoints
())
}
current_env
.
update
(
proc_env
)
logger
.
debug
(
"trainer proc env:{}"
.
format
(
current_env
))
cmd
=
[
sys
.
executable
,
"-u"
,
training_script
]
+
training_script_args
logger
.
info
(
"start trainer proc:{} env:{}"
.
format
(
cmd
,
proc_env
))
fn
=
None
if
log_dir
is
not
None
:
os
.
system
(
"mkdir -p {}"
.
format
(
log_dir
))
fn
=
open
(
"%s/workerlog.%d"
%
(
log_dir
,
idx
),
"a"
)
proc
=
subprocess
.
Popen
(
cmd
,
env
=
current_env
,
stdout
=
fn
,
stderr
=
fn
)
else
:
proc
=
subprocess
.
Popen
(
cmd
,
env
=
current_env
)
tp
=
TrainerProc
()
tp
.
proc
=
proc
tp
.
rank
=
t
.
rank
tp
.
local_rank
=
idx
tp
.
log_fn
=
fn
tp
.
log_offset
=
fn
.
tell
()
if
fn
else
None
tp
.
cmd
=
cmd
procs
.
append
(
tp
)
return
procs
def
pull_worker_log
(
tp
):
if
tp
.
log_fn
:
with
open
(
tp
.
log_fn
.
name
,
'r'
)
as
fin
:
fin
.
seek
(
tp
.
log_offset
,
0
)
for
line
in
fin
:
try
:
sys
.
stdout
.
write
(
line
)
except
UnicodeEncodeError
:
sys
.
stdout
.
write
(
'UnicodeEncodeError occurs at this line. '
'Please refer to the original log file "%s"
\n
'
%
tp
.
log_fn
.
name
)
tp
.
log_offset
=
fin
.
tell
()
def
watch_local_trainers
(
procs
,
nranks
):
try
:
error
=
False
error_rank
=
[]
# wait all process finish or one error
alive
=
False
for
p
in
procs
:
if
p
.
log_fn
and
p
.
local_rank
==
0
:
pull_worker_log
(
p
)
ret
=
p
.
proc
.
poll
()
if
ret
is
None
:
alive
=
True
elif
ret
!=
0
:
error
=
True
error_rank
.
append
(
p
.
rank
)
if
error
:
terminate_local_procs
(
procs
)
exit
(
1
)
except
KeyboardInterrupt
:
logger
.
warning
(
"KeyboardInterrupt, exit"
)
terminate_local_procs
(
procs
)
raise
except
SystemExit
:
logger
.
error
(
"ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log."
.
format
(
nranks
,
error_rank
))
terminate_local_procs
(
procs
)
raise
except
:
logger
.
error
(
"ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log."
.
format
(
nranks
,
error_rank
))
terminate_local_procs
(
procs
)
raise
return
alive
python/paddle/fleet/runtime/runtime_base.py
浏览文件 @
1731b32d
...
...
@@ -19,11 +19,8 @@ class RuntimeBase(object):
def
__init__
(
self
):
pass
def
_set_basic_info
(
self
,
loss
,
role_maker
,
optimizer
,
strategy
):
self
.
loss
=
loss
self
.
role_maker
=
role_maker
self
.
optimizer
=
optimizer
self
.
strategy
=
strategy
def
_set_basic_info
(
self
,
context
):
self
.
context
=
context
def
_run_worker
(
self
):
pass
...
...
python/paddle/fluid/dygraph/layers.py
浏览文件 @
1731b32d
...
...
@@ -129,6 +129,45 @@ class Layer(core.Layer):
for
layer
in
self
.
sublayers
():
layer
.
eval
()
def
apply
(
self
,
fn
):
"""
Applies ``fn`` recursively to every sublayer (as returned by ``.sublayers()``)
as well as self. Typical use includes initializing the parameters of a model.
Parameters:
fn (function): a function to be applied to each sublayer
Returns:
Layer: self
Example::
.. code-block:: python
import paddle
import paddle.nn as nn
paddle.enable_imperative()
net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
def init_weights(layer):
if type(layer) == nn.Linear:
print('before init weight:', layer.weight.numpy())
new_weight = paddle.fill_constant(layer.weight.shape, layer.weight.dtype, value=0.9)
layer.weight.set_value(new_weight)
print('after init weight:', layer.weight.numpy())
net.apply(init_weights)
print(net.state_dict())
"""
for
layer
in
self
.
sublayers
():
layer
.
apply
(
fn
)
fn
(
self
)
return
self
def
full_name
(
self
):
"""Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
1731b32d
...
...
@@ -27,6 +27,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_communicator_async)
list
(
APPEND MIXED_DIST_TEST_OPS test_communicator_geo
)
list
(
APPEND MIXED_DIST_TEST_OPS test_communicator_half_async
)
list
(
APPEND MIXED_DIST_TEST_OPS test_communicator_sync
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_launch
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_api_input
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_checkpoint
)
list
(
APPEND MIXED_DIST_TEST_OPS test_collective_optimizer
)
...
...
@@ -399,6 +400,7 @@ if(WITH_DISTRIBUTE)
py_test_modules
(
test_fleet_checkpoint MODULES test_fleet_checkpoint
)
endif
()
bash_test_modules
(
test_launch_ps MODULES test_launch_ps.sh ENVS PADDLE_BINARY_DIR=
${
PADDLE_BINARY_DIR
}
)
bash_test_modules
(
test_fleet_launch MODULES test_fleet_launch.sh ENVS PADDLE_BINARY_DIR=
${
PADDLE_BINARY_DIR
}
)
set
(
dist_ut_port 20001
)
foreach
(
TEST_OP
${
DIST_TEST_OPS
}
)
...
...
python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py
浏览文件 @
1731b32d
...
...
@@ -18,6 +18,7 @@ import unittest
import
numpy
as
np
import
paddle.fluid
as
fluid
from
paddle.fluid.dygraph.dygraph_to_static
import
ProgramTranslator
from
paddle.fluid.dygraph.io
import
VARIABLE_FILENAME
from
bert_dygraph_model
import
PretrainModelLayer
from
bert_utils
import
get_bert_config
,
get_feed_data_reader
...
...
@@ -28,9 +29,11 @@ place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace(
SEED
=
2020
STEP_NUM
=
10
PRINT_STEP
=
2
MODEL_SAVE_PATH
=
"./bert.inference.model"
DY_STATE_DICT_SAVE_PATH
=
"./bert.dygraph"
def
train
(
bert_config
,
data_reader
):
def
train
(
bert_config
,
data_reader
,
to_static
):
with
fluid
.
dygraph
.
guard
(
place
):
fluid
.
default_main_program
().
random_seed
=
SEED
fluid
.
default_startup_program
().
random_seed
=
SEED
...
...
@@ -79,18 +82,74 @@ def train(bert_config, data_reader):
step_idx
+=
1
if
step_idx
==
STEP_NUM
:
if
to_static
:
fluid
.
dygraph
.
jit
.
save
(
bert
,
MODEL_SAVE_PATH
)
else
:
fluid
.
dygraph
.
save_dygraph
(
bert
.
state_dict
(),
DY_STATE_DICT_SAVE_PATH
)
break
return
loss
,
ppl
def
train_dygraph
(
bert_config
,
data_reader
):
program_translator
.
enable
(
False
)
return
train
(
bert_config
,
data_reader
)
return
train
(
bert_config
,
data_reader
,
False
)
def
train_static
(
bert_config
,
data_reader
):
program_translator
.
enable
(
True
)
return
train
(
bert_config
,
data_reader
)
return
train
(
bert_config
,
data_reader
,
True
)
def
predict_static
(
data
):
exe
=
fluid
.
Executor
(
place
)
# load inference model
[
inference_program
,
feed_target_names
,
fetch_targets
]
=
fluid
.
io
.
load_inference_model
(
MODEL_SAVE_PATH
,
executor
=
exe
,
params_filename
=
VARIABLE_FILENAME
)
pred_res
=
exe
.
run
(
inference_program
,
feed
=
dict
(
zip
(
feed_target_names
,
data
)),
fetch_list
=
fetch_targets
)
return
pred_res
def
predict_dygraph
(
bert_config
,
data
):
program_translator
.
enable
(
False
)
with
fluid
.
dygraph
.
guard
(
place
):
bert
=
PretrainModelLayer
(
config
=
bert_config
,
weight_sharing
=
False
,
use_fp16
=
False
)
model_dict
,
_
=
fluid
.
dygraph
.
load_dygraph
(
DY_STATE_DICT_SAVE_PATH
)
bert
.
set_dict
(
model_dict
)
bert
.
eval
()
input_vars
=
[
fluid
.
dygraph
.
to_variable
(
x
)
for
x
in
data
]
src_ids
,
pos_ids
,
sent_ids
,
input_mask
,
mask_label
,
mask_pos
,
labels
=
input_vars
pred_res
=
bert
(
src_ids
=
src_ids
,
position_ids
=
pos_ids
,
sentence_ids
=
sent_ids
,
input_mask
=
input_mask
,
mask_label
=
mask_label
,
mask_pos
=
mask_pos
,
labels
=
labels
)
pred_res
=
[
var
.
numpy
()
for
var
in
pred_res
]
return
pred_res
def
predict_dygraph_jit
(
data
):
with
fluid
.
dygraph
.
guard
(
place
):
bert
=
fluid
.
dygraph
.
jit
.
load
(
MODEL_SAVE_PATH
)
bert
.
eval
()
src_ids
,
pos_ids
,
sent_ids
,
input_mask
,
mask_label
,
mask_pos
,
labels
=
data
pred_res
=
bert
(
src_ids
,
pos_ids
,
sent_ids
,
input_mask
,
mask_label
,
mask_pos
,
labels
)
pred_res
=
[
var
.
numpy
()
for
var
in
pred_res
]
return
pred_res
class
TestBert
(
unittest
.
TestCase
):
...
...
@@ -104,14 +163,36 @@ class TestBert(unittest.TestCase):
dygraph_loss
,
dygraph_ppl
=
train_dygraph
(
self
.
bert_config
,
self
.
data_reader
)
self
.
assertTrue
(
np
.
allclose
(
static_loss
,
static
_loss
),
msg
=
"static_loss: {}
\n
static
_loss: {}"
.
format
(
static_loss
,
dygraph_loss
))
np
.
allclose
(
static_loss
,
dygraph
_loss
),
msg
=
"static_loss: {}
\n
dygraph
_loss: {}"
.
format
(
static_loss
,
dygraph_loss
))
self
.
assertTrue
(
np
.
allclose
(
static_ppl
,
dygraph_ppl
),
msg
=
"static_ppl: {}
\n
dygraph_ppl: {}"
.
format
(
static_ppl
,
dygraph_ppl
))
self
.
verify_predict
()
def
verify_predict
(
self
):
for
data
in
self
.
data_reader
.
data_generator
()():
dygraph_pred_res
=
predict_dygraph
(
self
.
bert_config
,
data
)
static_pred_res
=
predict_static
(
data
)
dygraph_jit_pred_res
=
predict_dygraph_jit
(
data
)
for
dy_res
,
st_res
,
dy_jit_res
in
zip
(
dygraph_pred_res
,
static_pred_res
,
dygraph_jit_pred_res
):
self
.
assertTrue
(
np
.
allclose
(
st_res
,
dy_res
),
"dygraph_res: {},
\n
static_res: {}"
.
format
(
dy_res
[
~
np
.
isclose
(
st_res
,
dy_res
)],
st_res
[
~
np
.
isclose
(
st_res
,
dy_res
)]))
self
.
assertTrue
(
np
.
allclose
(
st_res
,
dy_jit_res
),
"dygraph_jit_res: {},
\n
static_res: {}"
.
format
(
dy_jit_res
[
~
np
.
isclose
(
st_res
,
dy_jit_res
)],
st_res
[
~
np
.
isclose
(
st_res
,
dy_jit_res
)]))
break
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
浏览文件 @
1731b32d
...
...
@@ -692,13 +692,20 @@ class TestTrain(unittest.TestCase):
video_data
=
np
.
array
([
item
[
0
]
for
item
in
data
]).
astype
(
DATATYPE
)
static_pred_res
=
self
.
predict_static
(
video_data
)
dygraph_pred_res
=
self
.
predict_dygraph
(
video_data
)
dygraph_jit_pred_res
=
self
.
predict_dygraph_jit
(
video_data
)
for
dy_res
,
st_res
in
zip
(
dygraph_pred_res
,
static_pred_res
):
for
dy_res
,
st_res
,
dy_jit_res
in
zip
(
dygraph_pred_res
,
static_pred_res
,
dygraph_jit_pred_res
):
self
.
assertTrue
(
np
.
allclose
(
st_res
,
dy_res
),
"dygraph_res: {},
\n
static_res: {}"
.
format
(
dy_res
[
~
np
.
isclose
(
st_res
,
dy_res
)],
st_res
[
~
np
.
isclose
(
st_res
,
dy_res
)]))
self
.
assertTrue
(
np
.
allclose
(
st_res
,
dy_jit_res
),
"dygraph_jit_res: {},
\n
static_res: {}"
.
format
(
dy_jit_res
[
~
np
.
isclose
(
st_res
,
dy_jit_res
)],
st_res
[
~
np
.
isclose
(
st_res
,
dy_jit_res
)]))
break
def
predict_dygraph
(
self
,
data
):
...
...
@@ -731,6 +738,17 @@ class TestTrain(unittest.TestCase):
return
pred_res
def
predict_dygraph_jit
(
self
,
data
):
with
fluid
.
dygraph
.
guard
(
self
.
place
):
bmn
=
fluid
.
dygraph
.
jit
.
load
(
self
.
args
.
infer_dir
)
bmn
.
eval
()
x
=
to_variable
(
data
)
pred_res
=
bmn
(
x
)
pred_res
=
[
var
.
numpy
()
for
var
in
pred_res
]
return
pred_res
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
浏览文件 @
1731b32d
...
...
@@ -535,9 +535,14 @@ class TestLACModel(unittest.TestCase):
batch
=
[
np
.
vstack
(
var
)
for
var
in
zip
(
*
batch
)]
dy_pre
=
self
.
predict_dygraph
(
batch
)
st_pre
=
self
.
predict_static
(
batch
)
dy_jit_pre
=
self
.
predict_dygraph_jit
(
batch
)
self
.
assertTrue
(
np
.
allclose
(
dy_pre
,
st_pre
),
msg
=
"dy_pre:
\n
{}
\n
, st_pre:
\n
{}."
.
format
(
dy_pre
,
st_pre
))
self
.
assertTrue
(
np
.
allclose
(
dy_jit_pre
,
st_pre
),
msg
=
"dy_jit_pre:
\n
{}
\n
, st_pre:
\n
{}."
.
format
(
dy_jit_pre
,
st_pre
))
def
predict_dygraph
(
self
,
batch
):
words
,
targets
,
length
=
batch
...
...
@@ -576,6 +581,16 @@ class TestLACModel(unittest.TestCase):
fetch_list
=
fetch_targets
)
return
pred_res
[
0
]
def
predict_dygraph_jit
(
self
,
batch
):
words
,
targets
,
length
=
batch
with
fluid
.
dygraph
.
guard
(
self
.
place
):
model
=
fluid
.
dygraph
.
jit
.
load
(
self
.
args
.
model_save_dir
)
model
.
eval
()
pred_res
=
model
(
to_variable
(
words
),
to_variable
(
length
))
return
pred_res
.
numpy
()
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
浏览文件 @
1731b32d
...
...
@@ -19,6 +19,7 @@ from paddle.fluid.initializer import MSRA
from
paddle.fluid.param_attr
import
ParamAttr
from
paddle.fluid.dygraph.nn
import
Conv2D
,
Pool2D
,
BatchNorm
,
Linear
from
paddle.fluid.dygraph
import
declarative
,
ProgramTranslator
from
paddle.fluid.dygraph.io
import
VARIABLE_FILENAME
import
unittest
...
...
@@ -433,14 +434,15 @@ class Args(object):
class_dim
=
50
print_step
=
1
train_step
=
10
place
=
fluid
.
CUDAPlace
(
0
)
if
fluid
.
is_compiled_with_cuda
(
)
else
fluid
.
CPUPlace
()
model_save_path
=
model
+
".inference.model"
dy_state_dict_save_path
=
model
+
".dygraph"
def
train_mobilenet
(
args
,
to_static
):
program_translator
.
enable
(
to_static
)
place
=
fluid
.
CUDAPlace
(
0
)
if
fluid
.
is_compiled_with_cuda
(
)
else
fluid
.
CPUPlace
()
with
fluid
.
dygraph
.
guard
(
place
):
with
fluid
.
dygraph
.
guard
(
args
.
place
):
np
.
random
.
seed
(
SEED
)
fluid
.
default_startup_program
().
random_seed
=
SEED
...
...
@@ -461,7 +463,7 @@ def train_mobilenet(args, to_static):
# 3. reader
train_reader
=
fake_data_reader
(
args
.
batch_size
,
args
.
class_dim
)
train_data_loader
=
fluid
.
io
.
DataLoader
.
from_generator
(
capacity
=
16
)
train_data_loader
.
set_sample_list_generator
(
train_reader
,
place
)
train_data_loader
.
set_sample_list_generator
(
train_reader
)
# 4. train loop
loss_data
=
[]
...
...
@@ -498,17 +500,64 @@ def train_mobilenet(args, to_static):
batch_id
+=
1
t_last
=
time
.
time
()
if
batch_id
>
args
.
train_step
:
if
to_static
:
fluid
.
dygraph
.
jit
.
save
(
net
,
args
.
model_save_path
)
else
:
fluid
.
dygraph
.
save_dygraph
(
net
.
state_dict
(),
args
.
dy_state_dict_save_path
)
break
return
np
.
array
(
loss_data
)
def
predict_static
(
args
,
data
):
exe
=
fluid
.
Executor
(
args
.
place
)
# load inference model
[
inference_program
,
feed_target_names
,
fetch_targets
]
=
fluid
.
io
.
load_inference_model
(
args
.
model_save_path
,
executor
=
exe
,
params_filename
=
VARIABLE_FILENAME
)
pred_res
=
exe
.
run
(
inference_program
,
feed
=
{
feed_target_names
[
0
]:
data
},
fetch_list
=
fetch_targets
)
return
pred_res
[
0
]
def
predict_dygraph
(
args
,
data
):
program_translator
.
enable
(
False
)
with
fluid
.
dygraph
.
guard
(
args
.
place
):
if
args
.
model
==
"MobileNetV1"
:
model
=
MobileNetV1
(
class_dim
=
args
.
class_dim
,
scale
=
1.0
)
elif
args
.
model
==
"MobileNetV2"
:
model
=
MobileNetV2
(
class_dim
=
args
.
class_dim
,
scale
=
1.0
)
# load dygraph trained parameters
model_dict
,
_
=
fluid
.
load_dygraph
(
args
.
dy_state_dict_save_path
)
model
.
set_dict
(
model_dict
)
model
.
eval
()
pred_res
=
model
(
fluid
.
dygraph
.
to_variable
(
data
))
return
pred_res
.
numpy
()
def
predict_dygraph_jit
(
args
,
data
):
with
fluid
.
dygraph
.
guard
(
args
.
place
):
model
=
fluid
.
dygraph
.
jit
.
load
(
args
.
model_save_path
)
model
.
eval
()
pred_res
=
model
(
data
)
return
pred_res
.
numpy
()
class
TestMobileNet
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
args
=
Args
()
def
train
(
self
,
model_name
,
to_static
):
self
.
args
.
model
=
model_name
self
.
args
.
model_save_path
=
model_name
+
".inference.model"
self
.
args
.
dy_state_dict_save_path
=
model_name
+
".dygraph"
out
=
train_mobilenet
(
self
.
args
,
to_static
)
return
out
...
...
@@ -519,12 +568,36 @@ class TestMobileNet(unittest.TestCase):
np
.
allclose
(
dy_out
,
st_out
),
msg
=
"dy_out: {}, st_out: {}"
.
format
(
dy_out
,
st_out
))
def
test_mobileNet
(
self
):
def
assert_same_predict
(
self
,
model_name
):
self
.
args
.
model
=
model_name
self
.
args
.
model_save_path
=
model_name
+
".inference.model"
self
.
args
.
dy_state_dict_save_path
=
model_name
+
".dygraph"
local_random
=
np
.
random
.
RandomState
(
SEED
)
image
=
local_random
.
random_sample
([
1
,
3
,
224
,
224
]).
astype
(
'float32'
)
dy_pre
=
predict_dygraph
(
self
.
args
,
image
)
st_pre
=
predict_static
(
self
.
args
,
image
)
dy_jit_pre
=
predict_dygraph_jit
(
self
.
args
,
image
)
self
.
assertTrue
(
np
.
allclose
(
dy_pre
,
st_pre
),
msg
=
"dy_pre:
\n
{}
\n
, st_pre:
\n
{}."
.
format
(
dy_pre
,
st_pre
))
self
.
assertTrue
(
np
.
allclose
(
dy_jit_pre
,
st_pre
),
msg
=
"dy_jit_pre:
\n
{}
\n
, st_pre:
\n
{}."
.
format
(
dy_jit_pre
,
st_pre
))
def
test_mobile_net
(
self
):
# MobileNet-V1
self
.
assert_same_loss
(
"MobileNetV1"
)
# MobileNet-V2
self
.
assert_same_loss
(
"MobileNetV2"
)
self
.
verify_predict
()
def
verify_predict
(
self
):
# MobileNet-V1
self
.
assert_same_predict
(
"MobileNetV1"
)
# MobileNet-V2
self
.
assert_same_predict
(
"MobileNetV2"
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
浏览文件 @
1731b32d
...
...
@@ -22,39 +22,33 @@ import numpy as np
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid.dygraph
.jit
import
dygraph_to_static_func
from
paddle.fluid.dygraph
import
declarative
,
ProgramTranslator
from
paddle.fluid.dygraph.nn
import
BatchNorm
,
Conv2D
,
Linear
,
Pool2D
from
paddle.fluid.dygraph.io
import
VARIABLE_FILENAME
SEED
=
2020
IMAGENET1000
=
1281167
base_lr
=
0.1
base_lr
=
0.
00
1
momentum_rate
=
0.9
l2_decay
=
1e-4
batch_size
=
8
epoch_num
=
1
place
=
fluid
.
CUDAPlace
(
0
)
if
fluid
.
is_compiled_with_cuda
()
\
else
fluid
.
CPUPlace
()
MODEL_SAVE_PATH
=
"./resnet.inference.model"
DY_STATE_DICT_SAVE_PATH
=
"./resnet.dygraph"
program_translator
=
ProgramTranslator
()
if
fluid
.
is_compiled_with_cuda
():
fluid
.
set_flags
({
'FLAGS_cudnn_deterministic'
:
True
})
def
optimizer_setting
(
parameter_list
=
None
):
total_images
=
IMAGENET1000
step
=
int
(
math
.
ceil
(
float
(
total_images
)
/
batch_size
))
epochs
=
[
30
,
60
,
90
]
bd
=
[
step
*
e
for
e
in
epochs
]
lr
=
[
base_lr
*
(
0.1
**
i
)
for
i
in
range
(
len
(
bd
)
+
1
)]
if
fluid
.
in_dygraph_mode
():
optimizer
=
fluid
.
optimizer
.
Momentum
(
learning_rate
=
fluid
.
layers
.
piecewise_decay
(
boundaries
=
bd
,
values
=
lr
),
momentum
=
momentum_rate
,
regularization
=
fluid
.
regularizer
.
L2Decay
(
l2_decay
),
parameter_list
=
parameter_list
)
else
:
optimizer
=
fluid
.
optimizer
.
Momentum
(
learning_rate
=
fluid
.
layers
.
piecewise_decay
(
boundaries
=
bd
,
values
=
lr
),
momentum
=
momentum_rate
,
regularization
=
fluid
.
regularizer
.
L2Decay
(
l2_decay
))
optimizer
=
fluid
.
optimizer
.
Momentum
(
learning_rate
=
base_lr
,
momentum
=
momentum_rate
,
regularization
=
fluid
.
regularizer
.
L2Decay
(
l2_decay
),
parameter_list
=
parameter_list
)
return
optimizer
...
...
@@ -189,8 +183,8 @@ class ResNet(fluid.dygraph.Layer):
param_attr
=
fluid
.
param_attr
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Uniform
(
-
stdv
,
stdv
)))
@
d
ygraph_to_static_func
def
forward
(
self
,
inputs
,
label
):
@
d
eclarative
def
forward
(
self
,
inputs
):
y
=
self
.
conv
(
inputs
)
y
=
self
.
pool2d_max
(
y
)
for
bottleneck_block
in
self
.
bottleneck_block_list
:
...
...
@@ -199,77 +193,144 @@ class ResNet(fluid.dygraph.Layer):
y
=
fluid
.
layers
.
reshape
(
y
,
shape
=
[
-
1
,
self
.
pool2d_avg_output
])
pred
=
self
.
out
(
y
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
pred
,
label
=
label
)
avg_loss_
=
fluid
.
layers
.
mean
(
x
=
loss
)
acc_top1_
=
fluid
.
layers
.
accuracy
(
input
=
pred
,
label
=
label
,
k
=
1
)
acc_top5_
=
fluid
.
layers
.
accuracy
(
input
=
pred
,
label
=
label
,
k
=
5
)
return
pred
return
pred
,
avg_loss_
,
acc_top1_
,
acc_top5_
def
reader_decorator
(
reader
):
def
__reader__
():
for
item
in
reader
():
img
=
np
.
array
(
item
[
0
]).
astype
(
'float32'
).
reshape
(
3
,
224
,
224
)
label
=
np
.
array
(
item
[
1
]).
astype
(
'int64'
).
reshape
(
1
)
yield
img
,
label
return
__reader__
def
train_resnet_in_static_mode
():
def
train
(
to_static
):
"""
Tests model decorated by `dygraph_to_static_output` in static mode. For users, the model is defined in dygraph mode and trained in static mode.
"""
with
fluid
.
dygraph
.
guard
(
place
):
np
.
random
.
seed
(
SEED
)
fluid
.
default_startup_program
().
random_seed
=
SEED
fluid
.
default_main_program
().
random_seed
=
SEED
train_reader
=
paddle
.
batch
(
reader_decorator
(
paddle
.
dataset
.
flowers
.
train
(
use_xmap
=
False
)),
batch_size
=
batch_size
,
drop_last
=
True
)
data_loader
=
fluid
.
io
.
DataLoader
.
from_generator
(
capacity
=
5
,
iterable
=
True
)
data_loader
.
set_sample_list_generator
(
train_reader
)
resnet
=
ResNet
()
optimizer
=
optimizer_setting
(
parameter_list
=
resnet
.
parameters
())
for
epoch
in
range
(
epoch_num
):
total_loss
=
0.0
total_acc1
=
0.0
total_acc5
=
0.0
total_sample
=
0
for
batch_id
,
data
in
enumerate
(
data_loader
()):
start_time
=
time
.
time
()
img
,
label
=
data
pred
=
resnet
(
img
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
pred
,
label
=
label
)
avg_loss
=
fluid
.
layers
.
mean
(
x
=
loss
)
acc_top1
=
fluid
.
layers
.
accuracy
(
input
=
pred
,
label
=
label
,
k
=
1
)
acc_top5
=
fluid
.
layers
.
accuracy
(
input
=
pred
,
label
=
label
,
k
=
5
)
avg_loss
.
backward
()
optimizer
.
minimize
(
avg_loss
)
resnet
.
clear_gradients
()
total_loss
+=
avg_loss
total_acc1
+=
acc_top1
total_acc5
+=
acc_top5
total_sample
+=
1
end_time
=
time
.
time
()
if
batch_id
%
2
==
0
:
print
(
"epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f"
%
\
(
epoch
,
batch_id
,
total_loss
.
numpy
()
/
total_sample
,
\
total_acc1
.
numpy
()
/
total_sample
,
total_acc5
.
numpy
()
/
total_sample
,
end_time
-
start_time
))
if
batch_id
==
10
:
if
to_static
:
fluid
.
dygraph
.
jit
.
save
(
resnet
,
MODEL_SAVE_PATH
)
else
:
fluid
.
dygraph
.
save_dygraph
(
resnet
.
state_dict
(),
DY_STATE_DICT_SAVE_PATH
)
# avoid dataloader throw abort signaal
data_loader
.
_reset
()
break
return
total_loss
.
numpy
()
def
predict_dygraph
(
data
):
program_translator
.
enable
(
False
)
with
fluid
.
dygraph
.
guard
(
place
):
resnet
=
ResNet
()
model_dict
,
_
=
fluid
.
dygraph
.
load_dygraph
(
DY_STATE_DICT_SAVE_PATH
)
resnet
.
set_dict
(
model_dict
)
resnet
.
eval
()
pred_res
=
resnet
(
fluid
.
dygraph
.
to_variable
(
data
))
return
pred_res
.
numpy
()
def
predict_static
(
data
):
exe
=
fluid
.
Executor
(
place
)
startup_prog
=
fluid
.
Program
()
main_prog
=
fluid
.
Program
()
[
inference_program
,
feed_target_names
,
fetch_targets
]
=
fluid
.
io
.
load_inference_model
(
MODEL_SAVE_PATH
,
executor
=
exe
,
params_filename
=
VARIABLE_FILENAME
)
with
fluid
.
program_guard
(
main_prog
,
startup_prog
):
pred_res
=
exe
.
run
(
inference_program
,
feed
=
{
feed_target_names
[
0
]:
data
},
fetch_list
=
fetch_targets
)
img
=
fluid
.
data
(
name
=
"img"
,
shape
=
[
None
,
3
,
224
,
224
],
dtype
=
"float32"
)
label
=
fluid
.
data
(
name
=
"label"
,
shape
=
[
None
,
1
],
dtype
=
"int64"
)
label
.
stop_gradient
=
True
resnet
=
ResNet
()
pred
,
avg_loss_
,
acc_top1_
,
acc_top5_
=
resnet
(
img
,
label
)
optimizer
=
optimizer_setting
(
parameter_list
=
resnet
.
parameters
())
optimizer
.
minimize
(
avg_loss_
)
exe
.
run
(
startup_prog
)
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
flowers
.
train
(
use_xmap
=
False
),
batch_size
=
batch_size
)
for
epoch
in
range
(
epoch_num
):
total_loss
=
0.0
total_acc1
=
0.0
total_acc5
=
0.0
total_sample
=
0
for
batch_id
,
data
in
enumerate
(
train_reader
()):
start_time
=
time
.
time
()
dy_x_data
=
np
.
array
(
[
x
[
0
].
reshape
(
3
,
224
,
224
)
for
x
in
data
]).
astype
(
'float32'
)
if
len
(
np
.
array
([
x
[
1
]
for
x
in
data
]).
astype
(
'int64'
))
!=
batch_size
:
continue
y_data
=
np
.
array
([
x
[
1
]
for
x
in
data
]).
astype
(
'int64'
).
reshape
(
-
1
,
1
)
avg_loss
,
acc_top1
,
acc_top5
=
exe
.
run
(
main_prog
,
feed
=
{
"img"
:
dy_x_data
,
"label"
:
y_data
},
fetch_list
=
[
avg_loss_
,
acc_top1_
,
acc_top5_
])
total_loss
+=
avg_loss
total_acc1
+=
acc_top1
total_acc5
+=
acc_top5
total_sample
+=
1
end_time
=
time
.
time
()
if
batch_id
%
2
==
0
:
print
(
"epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f"
%
\
(
epoch
,
batch_id
,
total_loss
/
total_sample
,
\
total_acc1
/
total_sample
,
total_acc5
/
total_sample
,
end_time
-
start_time
))
if
batch_id
==
10
:
break
return
pred_res
[
0
]
def
predict_dygraph_jit
(
data
):
with
fluid
.
dygraph
.
guard
(
place
):
resnet
=
fluid
.
dygraph
.
jit
.
load
(
MODEL_SAVE_PATH
)
resnet
.
eval
()
pred_res
=
resnet
(
data
)
return
pred_res
.
numpy
()
class
TestResnet
(
unittest
.
TestCase
):
def
test_in_static_mode
(
self
):
train_resnet_in_static_mode
()
def
train
(
self
,
to_static
):
program_translator
.
enable
(
to_static
)
return
train
(
to_static
)
def
verify_predict
(
self
):
image
=
np
.
random
.
random
([
1
,
3
,
224
,
224
]).
astype
(
'float32'
)
dy_pre
=
predict_dygraph
(
image
)
st_pre
=
predict_static
(
image
)
dy_jit_pre
=
predict_dygraph_jit
(
image
)
self
.
assertTrue
(
np
.
allclose
(
dy_pre
,
st_pre
),
msg
=
"dy_pre:
\n
{}
\n
, st_pre:
\n
{}."
.
format
(
dy_pre
,
st_pre
))
self
.
assertTrue
(
np
.
allclose
(
dy_jit_pre
,
st_pre
),
msg
=
"dy_jit_pre:
\n
{}
\n
, st_pre:
\n
{}."
.
format
(
dy_jit_pre
,
st_pre
))
def
test_resnet
(
self
):
static_loss
=
self
.
train
(
to_static
=
True
)
dygraph_loss
=
self
.
train
(
to_static
=
False
)
self
.
assertTrue
(
np
.
allclose
(
static_loss
,
dygraph_loss
),
msg
=
"static_loss: {}
\n
dygraph_loss: {}"
.
format
(
static_loss
,
dygraph_loss
))
self
.
verify_predict
()
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
浏览文件 @
1731b32d
...
...
@@ -24,6 +24,7 @@ from paddle.fluid.dygraph.base import to_variable
from
paddle.fluid.dygraph.nn
import
BatchNorm
,
Conv2D
,
Linear
,
Pool2D
from
paddle.fluid.dygraph
import
declarative
from
paddle.fluid.dygraph
import
ProgramTranslator
from
paddle.fluid.dygraph.io
import
VARIABLE_FILENAME
SEED
=
2020
np
.
random
.
seed
(
SEED
)
...
...
@@ -32,6 +33,8 @@ BATCH_SIZE = 8
EPOCH_NUM
=
1
PRINT_STEP
=
2
STEP_NUM
=
10
MODEL_SAVE_PATH
=
"./se_resnet.inference.model"
DY_STATE_DICT_SAVE_PATH
=
"./se_resnet.dygraph"
place
=
fluid
.
CUDAPlace
(
0
)
if
fluid
.
is_compiled_with_cuda
()
\
else
fluid
.
CPUPlace
()
...
...
@@ -377,11 +380,60 @@ def train(train_reader, to_static):
step_idx
+=
1
if
step_idx
==
STEP_NUM
:
if
to_static
:
configs
=
fluid
.
dygraph
.
jit
.
SaveLoadConfig
()
configs
.
output_spec
=
[
pred
]
fluid
.
dygraph
.
jit
.
save
(
se_resnext
,
MODEL_SAVE_PATH
,
[
img
],
configs
)
else
:
fluid
.
dygraph
.
save_dygraph
(
se_resnext
.
state_dict
(),
DY_STATE_DICT_SAVE_PATH
)
break
return
pred
.
numpy
(),
avg_loss
.
numpy
(),
acc_top1
.
numpy
(),
acc_top5
.
numpy
(
)
def
predict_dygraph
(
data
):
program_translator
=
ProgramTranslator
()
program_translator
.
enable
(
False
)
with
fluid
.
dygraph
.
guard
(
place
):
se_resnext
=
SeResNeXt
()
model_dict
,
_
=
fluid
.
dygraph
.
load_dygraph
(
DY_STATE_DICT_SAVE_PATH
)
se_resnext
.
set_dict
(
model_dict
)
se_resnext
.
eval
()
label
=
np
.
random
.
random
([
1
,
1
]).
astype
(
"int64"
)
img
=
fluid
.
dygraph
.
to_variable
(
data
)
label
=
fluid
.
dygraph
.
to_variable
(
label
)
pred_res
,
_
,
_
,
_
=
se_resnext
(
img
,
label
)
return
pred_res
.
numpy
()
def
predict_static
(
data
):
exe
=
fluid
.
Executor
(
place
)
[
inference_program
,
feed_target_names
,
fetch_targets
]
=
fluid
.
io
.
load_inference_model
(
MODEL_SAVE_PATH
,
executor
=
exe
,
params_filename
=
VARIABLE_FILENAME
)
pred_res
=
exe
.
run
(
inference_program
,
feed
=
{
feed_target_names
[
0
]:
data
},
fetch_list
=
fetch_targets
)
return
pred_res
[
0
]
def
predict_dygraph_jit
(
data
):
with
fluid
.
dygraph
.
guard
(
place
):
se_resnext
=
fluid
.
dygraph
.
jit
.
load
(
MODEL_SAVE_PATH
)
se_resnext
.
eval
()
pred_res
=
se_resnext
(
data
)
return
pred_res
.
numpy
()
class
TestSeResnet
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
train_reader
=
paddle
.
batch
(
...
...
@@ -390,6 +442,18 @@ class TestSeResnet(unittest.TestCase):
batch_size
=
BATCH_SIZE
,
drop_last
=
True
)
def
verify_predict
(
self
):
image
=
np
.
random
.
random
([
1
,
3
,
224
,
224
]).
astype
(
'float32'
)
dy_pre
=
predict_dygraph
(
image
)
st_pre
=
predict_static
(
image
)
dy_jit_pre
=
predict_dygraph_jit
(
image
)
self
.
assertTrue
(
np
.
allclose
(
dy_pre
,
st_pre
),
msg
=
"dy_pre:
\n
{}
\n
, st_pre:
\n
{}."
.
format
(
dy_pre
,
st_pre
))
self
.
assertTrue
(
np
.
allclose
(
dy_jit_pre
,
st_pre
),
msg
=
"dy_jit_pre:
\n
{}
\n
, st_pre:
\n
{}."
.
format
(
dy_jit_pre
,
st_pre
))
def
test_check_result
(
self
):
pred_1
,
loss_1
,
acc1_1
,
acc5_1
=
train
(
self
.
train_reader
,
to_static
=
False
)
...
...
@@ -409,6 +473,8 @@ class TestSeResnet(unittest.TestCase):
np
.
allclose
(
acc5_1
,
acc5_2
),
msg
=
"static acc5: {}
\n
dygraph acc5: {}"
.
format
(
acc5_1
,
acc5_2
))
self
.
verify_predict
()
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_compare_op.py
浏览文件 @
1731b32d
...
...
@@ -72,25 +72,40 @@ def create_paddle_case(op_type, callback):
class
PaddleCls
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
op_type
=
op_type
self
.
input_x
=
np
.
array
([
1
,
2
,
3
,
4
])
self
.
input_y
=
np
.
array
([
1
,
3
,
2
,
4
])
self
.
input_x
=
np
.
array
([
1
,
2
,
3
,
4
])
.
astype
(
np
.
int64
)
self
.
input_y
=
np
.
array
([
1
,
3
,
2
,
4
])
.
astype
(
np
.
int64
)
self
.
real_result
=
callback
(
self
.
input_x
,
self
.
input_y
)
self
.
place
=
fluid
.
CPUPlace
()
if
core
.
is_compiled_with_cuda
():
self
.
place
=
paddle
.
CUDAPlace
(
0
)
def
test_api
(
self
):
with
program_guard
(
Program
(),
Program
()):
x
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
4
],
dtype
=
'int64'
)
y
=
fluid
.
layers
.
data
(
name
=
'y'
,
shape
=
[
4
],
dtype
=
'int64'
)
x
=
fluid
.
data
(
name
=
'x'
,
shape
=
[
4
],
dtype
=
'int64'
)
y
=
fluid
.
data
(
name
=
'y'
,
shape
=
[
4
],
dtype
=
'int64'
)
op
=
eval
(
"paddle.%s"
%
(
self
.
op_type
))
out
=
op
(
x
,
y
)
place
=
fluid
.
CPUPlace
()
if
core
.
is_compiled_with_cuda
():
place
=
paddle
.
CUDAPlace
(
0
)
exe
=
fluid
.
Executor
(
place
)
exe
=
fluid
.
Executor
(
self
.
place
)
res
,
=
exe
.
run
(
feed
=
{
"x"
:
self
.
input_x
,
"y"
:
self
.
input_y
},
fetch_list
=
[
out
])
self
.
assertEqual
((
res
==
self
.
real_result
).
all
(),
True
)
def
test_broadcast_api_1
(
self
):
with
program_guard
(
Program
(),
Program
()):
x
=
paddle
.
nn
.
data
(
name
=
'x'
,
shape
=
[
1
,
2
,
1
,
3
],
dtype
=
'int32'
)
y
=
paddle
.
nn
.
data
(
name
=
'y'
,
shape
=
[
1
,
2
,
3
],
dtype
=
'int32'
)
op
=
eval
(
"paddle.%s"
%
(
self
.
op_type
))
out
=
op
(
x
,
y
)
exe
=
paddle
.
Executor
(
self
.
place
)
input_x
=
np
.
arange
(
1
,
7
).
reshape
((
1
,
2
,
1
,
3
)).
astype
(
np
.
int32
)
input_y
=
np
.
arange
(
0
,
6
).
reshape
((
1
,
2
,
3
)).
astype
(
np
.
int32
)
real_result
=
callback
(
input_x
,
input_y
)
res
,
=
exe
.
run
(
feed
=
{
"x"
:
input_x
,
"y"
:
input_y
},
fetch_list
=
[
out
])
self
.
assertEqual
((
res
==
real_result
).
all
(),
True
)
def
test_attr_name
(
self
):
with
program_guard
(
Program
(),
Program
()):
x
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
4
],
dtype
=
'int32'
)
...
...
@@ -104,6 +119,7 @@ def create_paddle_case(op_type, callback):
globals
()[
cls_name
]
=
PaddleCls
create_paddle_case
(
'less_than'
,
lambda
_a
,
_b
:
_a
<
_b
)
create_paddle_case
(
'less_equal'
,
lambda
_a
,
_b
:
_a
<=
_b
)
create_paddle_case
(
'greater_than'
,
lambda
_a
,
_b
:
_a
>
_b
)
create_paddle_case
(
'greater_equal'
,
lambda
_a
,
_b
:
_a
>=
_b
)
...
...
python/paddle/fluid/tests/unittests/test_fleet_launch.sh
0 → 100644
浏览文件 @
1731b32d
#!/bin/bash
set
-e
function
test_launch_ps
(){
fleetrun
--server_num
=
2
--worker_num
=
2 fleet_ps_training.py 2> ut.elog
if
grep
-q
"server are killed"
ut.elog
;
then
echo
"test pserver launch succeed"
else
echo
"test pserver launch failed"
exit
-1
fi
}
if
[[
${
WITH_GPU
}
==
"OFF"
]]
;
then
test_launch_ps
exit
0
fi
test_launch_ps
# use default values
fleetrun multi_process.py
# use paddlecloud
echo
"begin test use paddlecloud"
cluster_node_ips
=
"127.0.0.1,127.0.0.2"
export
PADDLE_TRAINERS_NUM
=
2
export
POD_IP
=
127.0.0.1
export
PADDLE_TRAINERS
=
127.0.0.1,127.0.0.2
export
PADDLE_TRAINER_ID
=
0
export
PADDLE_PORT
=
35019
export
TRAINER_PORTS_NUM
=
2
distributed_args
=
"--ips=
${
cluster_node_ips
}
--gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES
=
0,1 fleetrun
${
distributed_args
}
multi_process.py
str1
=
"selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
str2
=
"selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
file_0
=
"multi_process.check_0.log"
file_1
=
"multi_process.check_1.log"
echo
"paddlecloud params test"
if
grep
-q
"
$str1
"
"
$file_0
"
;
then
echo
"find trainer 0"
else
echo
"not find trainer 0"
exit
-1
fi
if
grep
-q
"
$str2
"
"
$file_1
"
;
then
echo
"find trainer 1"
else
echo
"not find trainer 1"
exit
-1
fi
# test async poll process
if
[
-f
$file_0
]
;
then
rm
$file_0
fi
if
[
-f
$file_1
]
;
then
rm
$file_1
fi
unset
PADDLE_PORT
unset
TRAINER_PORTS_NUM
echo
""
echo
"paddle.distributed.launch async poll process test"
if
!
CUDA_VISIBLE_DEVICES
=
0,1 fleetrun
${
distributed_args
}
multi_process.py abort
;
then
echo
"train abort as planned"
fi
abort_str1
=
"abort>>> selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
if
grep
-q
"
$abort_str1
"
"
$file_0
"
;
then
echo
"trainer 0 abort as planned"
else
echo
"trainer 0 not abort as planned"
exit
-1
fi
if
[
!
-f
$file_1
]
;
then
echo
"trainer 1 terminate as planned"
else
echo
"trainer 1 not terminate as planned"
exit
-1
fi
#test for random ports
file_0_0
=
"test_launch_filelock_0_0.log"
file_1_0
=
"test_launch_filelock_1_0.log"
rm
-rf
$file_0_0
$file_0_1
distributed_args
=
"--gpus=0,1 --log_dir=testlog"
export
PADDLE_LAUNCH_LOG
=
"test_launch_filelock_0"
CUDA_VISIBLE_DEVICES
=
0,1 fleetrun
${
distributed_args
}
find_ports.py
str_0
=
"worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"
python/paddle/fluid/tests/unittests/test_fleet_util.py
浏览文件 @
1731b32d
...
...
@@ -33,8 +33,10 @@ class TestFleetUtil(unittest.TestCase):
role_maker
=
None
# should be fleet.PaddleCloudRoleMaker()
optimize_ops
=
[]
params_grads
=
[]
util
=
factory
.
_create_util
(
strategy
,
role_maker
,
optimize_ops
,
params_grads
)
context
=
{}
context
[
"role_maker"
]
=
role_maker
context
[
"valid_strategy"
]
=
strategy
util
=
factory
.
_create_util
(
context
)
self
.
assertEqual
(
util
.
role_maker
,
None
)
def
test_get_util
(
self
):
...
...
python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
浏览文件 @
1731b32d
...
...
@@ -298,16 +298,15 @@ class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad):
class
TestDygraphDoubleGradVisitedUniq
(
TestCase
):
def
test_compare
(
self
):
value
=
np
.
random
.
uniform
(
-
0.5
,
0.5
,
100
).
reshape
(
10
,
2
,
5
).
astype
(
"float32"
)
value
=
np
.
random
.
uniform
(
-
1
,
1
,
[
10
,
3
,
32
,
32
]).
astype
(
'float32'
)
def
model_f
(
input
):
linear
=
fluid
.
dygraph
.
Linear
(
5
,
3
,
bias_attr
=
False
)
conv2d
=
fluid
.
dygraph
.
Conv2D
(
3
,
2
,
3
)
for
i
in
range
(
10
):
if
i
==
0
:
out
=
linear
(
input
)
out
=
conv2d
(
input
)
else
:
out
=
out
+
linear
(
input
)
out
=
out
+
conv2d
(
input
)
return
out
backward_strategy
=
fluid
.
dygraph
.
BackwardStrategy
()
...
...
@@ -319,8 +318,14 @@ class TestDygraphDoubleGradVisitedUniq(TestCase):
out
=
model_f
(
a
)
dx
=
fluid
.
dygraph
.
grad
(
outputs
=
[
out
],
inputs
=
[
a
],
create_graph
=
True
,
retain_graph
=
True
,
\
only_inputs
=
True
,
allow_unused
=
False
,
backward_strategy
=
backward_strategy
)
dx
=
fluid
.
dygraph
.
grad
(
outputs
=
[
out
],
inputs
=
[
a
],
create_graph
=
True
,
retain_graph
=
True
,
only_inputs
=
True
,
allow_unused
=
False
,
backward_strategy
=
backward_strategy
)
grad_1
=
dx
[
0
].
numpy
()
...
...
@@ -334,7 +339,9 @@ class TestDygraphDoubleGradVisitedUniq(TestCase):
grad_2
=
a
.
gradient
()
self
.
assertTrue
(
np
.
array_equal
(
grad_1
,
grad_2
))
self
.
assertTrue
(
np
.
allclose
(
grad_1
,
grad_2
,
rtol
=
1.e-5
,
atol
=
1.e-8
,
equal_nan
=
True
))
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py
0 → 100644
浏览文件 @
1731b32d
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
paddle
import
paddle.nn
as
nn
import
paddle.fluid
as
fluid
import
numpy
as
np
class
LeNetDygraph
(
fluid
.
dygraph
.
Layer
):
def
__init__
(
self
,
num_classes
=
10
,
classifier_activation
=
'softmax'
):
super
(
LeNetDygraph
,
self
).
__init__
()
self
.
num_classes
=
num_classes
self
.
features
=
nn
.
Sequential
(
nn
.
Conv2D
(
1
,
6
,
3
,
stride
=
1
,
padding
=
1
),
nn
.
ReLU
(),
nn
.
Pool2D
(
2
,
'max'
,
2
),
nn
.
Conv2D
(
6
,
16
,
5
,
stride
=
1
,
padding
=
0
),
nn
.
ReLU
(),
nn
.
Pool2D
(
2
,
'max'
,
2
))
if
num_classes
>
0
:
self
.
fc
=
nn
.
Sequential
(
nn
.
Linear
(
400
,
120
),
nn
.
Linear
(
120
,
84
),
nn
.
Linear
(
84
,
10
,
act
=
classifier_activation
))
def
forward
(
self
,
inputs
):
x
=
self
.
features
(
inputs
)
if
self
.
num_classes
>
0
:
x
=
fluid
.
layers
.
flatten
(
x
,
1
)
x
=
self
.
fc
(
x
)
return
x
def
init_weights
(
layer
):
if
type
(
layer
)
==
nn
.
Linear
:
new_weight
=
paddle
.
fill_constant
(
layer
.
weight
.
shape
,
layer
.
weight
.
dtype
,
value
=
0.9
)
layer
.
weight
.
set_value
(
new_weight
)
new_bias
=
paddle
.
fill_constant
(
layer
.
bias
.
shape
,
layer
.
bias
.
dtype
,
value
=-
0.1
)
layer
.
bias
.
set_value
(
new_bias
)
elif
type
(
layer
)
==
nn
.
Conv2D
:
new_weight
=
paddle
.
fill_constant
(
layer
.
weight
.
shape
,
layer
.
weight
.
dtype
,
value
=
0.7
)
layer
.
weight
.
set_value
(
new_weight
)
new_bias
=
paddle
.
fill_constant
(
layer
.
bias
.
shape
,
layer
.
bias
.
dtype
,
value
=-
0.2
)
layer
.
bias
.
set_value
(
new_bias
)
class
TestLayerApply
(
unittest
.
TestCase
):
def
test_apply_init_weight
(
self
):
with
fluid
.
dygraph
.
guard
():
net
=
LeNetDygraph
()
net
.
apply
(
init_weights
)
for
layer
in
net
.
sublayers
():
if
type
(
layer
)
==
nn
.
Linear
:
np
.
testing
.
assert_allclose
(
layer
.
weight
.
numpy
(),
0.9
)
np
.
testing
.
assert_allclose
(
layer
.
bias
.
numpy
(),
-
0.1
)
elif
type
(
layer
)
==
nn
.
Conv2D
:
np
.
testing
.
assert_allclose
(
layer
.
weight
.
numpy
(),
0.7
)
np
.
testing
.
assert_allclose
(
layer
.
bias
.
numpy
(),
-
0.2
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/setup.py.in
浏览文件 @
1731b32d
...
...
@@ -475,6 +475,11 @@ with redirect_stdout():
cmdclass={
'install_headers': InstallHeaders,
'install': InstallCommand,
},
entry_points={
'console_scripts': [
'fleetrun = paddle.fleet.launch:launch'
]
}
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录