Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
5b50307b
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5b50307b
编写于
6月 15, 2018
作者:
Q
qiaolongfei
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into update-api-reference-1
上级
6ace04f6
5ea039b3
变更
16
显示空白变更内容
内联
并排
Showing
16 changed file
with
280 addition
and
124 deletion
+280
-124
paddle/fluid/inference/tensorrt/convert/op_converter.h
paddle/fluid/inference/tensorrt/convert/op_converter.h
+2
-1
paddle/fluid/inference/tensorrt/engine.h
paddle/fluid/inference/tensorrt/engine.h
+23
-9
paddle/fluid/operators/activation_op.cc
paddle/fluid/operators/activation_op.cc
+8
-9
paddle/fluid/operators/listen_and_serv_op.cc
paddle/fluid/operators/listen_and_serv_op.cc
+2
-1
paddle/fluid/operators/mean_op.cc
paddle/fluid/operators/mean_op.cc
+3
-5
paddle/fluid/operators/tensorrt_engine_op.cc
paddle/fluid/operators/tensorrt_engine_op.cc
+18
-10
paddle/fluid/operators/tensorrt_engine_op.h
paddle/fluid/operators/tensorrt_engine_op.h
+17
-16
paddle/fluid/operators/tensorrt_engine_op_test.cc
paddle/fluid/operators/tensorrt_engine_op_test.cc
+98
-1
python/paddle/fluid/layers/control_flow.py
python/paddle/fluid/layers/control_flow.py
+1
-1
python/paddle/fluid/layers/io.py
python/paddle/fluid/layers/io.py
+25
-27
python/paddle/fluid/layers/layer_function_generator.py
python/paddle/fluid/layers/layer_function_generator.py
+19
-10
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+17
-13
python/paddle/fluid/layers/tensor.py
python/paddle/fluid/layers/tensor.py
+1
-0
python/paddle/fluid/tests/unittests/test_dist_train.py
python/paddle/fluid/tests/unittests/test_dist_train.py
+23
-6
python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
...n/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+12
-9
python/paddle/fluid/transpiler/memory_optimization_transpiler.py
...paddle/fluid/transpiler/memory_optimization_transpiler.py
+11
-6
未找到文件。
paddle/fluid/inference/tensorrt/convert/op_converter.h
浏览文件 @
5b50307b
...
@@ -64,7 +64,8 @@ class OpConverter {
...
@@ -64,7 +64,8 @@ class OpConverter {
(
*
it
)(
op
,
scope
,
test_mode
);
(
*
it
)(
op
,
scope
,
test_mode
);
}
}
// convert fluid block to tensorrt network
// Convert a fluid block to a TensorRT network. NOTE: it only converts operators;
// the INetwork's inputs and outputs should be specified in some other module.
void
ConvertBlock
(
const
framework
::
proto
::
BlockDesc
&
block
,
void
ConvertBlock
(
const
framework
::
proto
::
BlockDesc
&
block
,
const
std
::
unordered_set
<
std
::
string
>&
parameters
,
const
std
::
unordered_set
<
std
::
string
>&
parameters
,
const
framework
::
Scope
&
scope
,
TensorRTEngine
*
engine
)
{
const
framework
::
Scope
&
scope
,
TensorRTEngine
*
engine
)
{
...
...
paddle/fluid/inference/tensorrt/engine.h
浏览文件 @
5b50307b
...
@@ -51,11 +51,12 @@ class TensorRTEngine : public EngineBase {
...
@@ -51,11 +51,12 @@ class TensorRTEngine : public EngineBase {
nvinfer1
::
Weights
w_
;
nvinfer1
::
Weights
w_
;
};
};
TensorRTEngine
(
int
max_batch
,
int
max_workspace
,
cudaStream_t
*
stream
,
TensorRTEngine
(
int
max_batch
,
int
max_workspace
,
cudaStream_t
*
stream
=
nullptr
,
nvinfer1
::
ILogger
&
logger
=
NaiveLogger
::
Global
())
nvinfer1
::
ILogger
&
logger
=
NaiveLogger
::
Global
())
:
max_batch_
(
max_batch
),
:
max_batch_
(
max_batch
),
max_workspace_
(
max_workspace
),
max_workspace_
(
max_workspace
),
stream_
(
stream
),
stream_
(
stream
?
stream
:
&
default_stream_
),
logger_
(
logger
)
{}
logger_
(
logger
)
{}
virtual
~
TensorRTEngine
();
virtual
~
TensorRTEngine
();
...
@@ -121,6 +122,8 @@ class TensorRTEngine : public EngineBase {
...
@@ -121,6 +122,8 @@ class TensorRTEngine : public EngineBase {
// the max memory size the engine uses
// the max memory size the engine uses
int
max_workspace_
;
int
max_workspace_
;
cudaStream_t
*
stream_
;
cudaStream_t
*
stream_
;
// If stream_ is not set from outside, hold its own stream.
cudaStream_t
default_stream_
;
nvinfer1
::
ILogger
&
logger_
;
nvinfer1
::
ILogger
&
logger_
;
std
::
vector
<
Buffer
>
buffers_
;
std
::
vector
<
Buffer
>
buffers_
;
...
@@ -165,20 +168,31 @@ class TensorRTEngine : public EngineBase {
...
@@ -165,20 +168,31 @@ class TensorRTEngine : public EngineBase {
*/
*/
class
TRT_EngineManager
{
class
TRT_EngineManager
{
public:
public:
TensorRTEngine
*
Create
(
int
max_batch
,
int
max_workspace
,
bool
HasEngine
(
const
std
::
string
&
name
)
const
{
cudaStream_t
*
stream
)
{
return
engines_
.
count
(
name
)
!=
0
;
engines_
.
emplace_back
(
new
TensorRTEngine
(
max_batch
,
max_workspace
,
stream
));
}
return
engines_
.
back
().
get
();
// Get an engine called `name`.
TensorRTEngine
*
Get
(
const
std
::
string
&
name
)
const
{
return
engines_
.
at
(
name
).
get
();
}
// Create or get an engine called `name`
TensorRTEngine
*
Create
(
int
max_batch
,
int
max_workspace
,
cudaStream_t
*
stream
,
const
std
::
string
&
name
)
{
auto
*
p
=
new
TensorRTEngine
(
max_batch
,
max_workspace
,
stream
);
engines_
[
name
].
reset
(
p
);
return
p
;
}
}
void
DeleteALl
()
{
void
DeleteALl
()
{
for
(
auto
&
ptr
:
engines_
)
{
for
(
auto
&
item
:
engines_
)
{
ptr
.
reset
(
nullptr
);
item
.
second
.
reset
(
nullptr
);
}
}
}
}
private:
private:
std
::
vector
<
std
::
unique_ptr
<
TensorRTEngine
>>
engines_
;
std
::
unordered_map
<
std
::
string
,
std
::
unique_ptr
<
TensorRTEngine
>>
engines_
;
};
};
}
// namespace tensorrt
}
// namespace tensorrt
...
...
paddle/fluid/operators/activation_op.cc
浏览文件 @
5b50307b
...
@@ -252,15 +252,14 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -252,15 +252,14 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput
(
"Out"
,
"Output of Softshrink operator"
);
AddOutput
(
"Out"
,
"Output of Softshrink operator"
);
AddAttr
<
float
>
(
"lambda"
,
"non-negative offset"
).
SetDefault
(
0.5
f
);
AddAttr
<
float
>
(
"lambda"
,
"non-negative offset"
).
SetDefault
(
0.5
f
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
Softshrink Activation Operator.
:strong:`Softshrink Activation Operator`
$$
.. math::
out = \begin{cases}
out = \begin{cases}
x - \lambda, \text{if } x > \lambda \\
x - \lambda, \text{if } x > \lambda \\
x + \lambda, \text{if } x < -\lambda \\
x + \lambda, \text{if } x < -\lambda \\
0, \text{otherwise}
0, \text{otherwise}
\end{cases}
\end{cases}
$$
)DOC"
);
)DOC"
);
}
}
...
...
paddle/fluid/operators/listen_and_serv_op.cc
浏览文件 @
5b50307b
...
@@ -348,7 +348,8 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -348,7 +348,8 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
};
};
void
SignalHandler
::
StopAndExit
(
int
signal_num
)
{
void
SignalHandler
::
StopAndExit
(
int
signal_num
)
{
VLOG
(
3
)
<<
"Catch interrupt signal: "
<<
signal_num
<<
", program will exit"
;
// Do not use VLOG here, because the device used for printing may already be released.
// exit() will release internally allocated resources.
exit
(
0
);
exit
(
0
);
}
}
...
...
paddle/fluid/operators/mean_op.cc
浏览文件 @
5b50307b
...
@@ -33,12 +33,10 @@ class MeanOp : public framework::OperatorWithKernel {
...
@@ -33,12 +33,10 @@ class MeanOp : public framework::OperatorWithKernel {
class
MeanOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
class
MeanOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
public:
void
Make
()
override
{
void
Make
()
override
{
AddInput
(
"X"
,
"The input of mean op"
);
AddInput
(
"X"
,
"
(Tensor)
The input of mean op"
);
AddOutput
(
"Out"
,
"The output of mean op"
).
Reuse
(
"X"
);
AddOutput
(
"Out"
,
"
(Tensor)
The output of mean op"
).
Reuse
(
"X"
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
Mean Operator.
Mean Operator calculates the mean of all elements in X.
Out is a scalar which is the mean of all elements in X.
)DOC"
);
)DOC"
);
}
}
...
...
paddle/fluid/operators/tensorrt_engine_op.cc
浏览文件 @
5b50307b
...
@@ -66,17 +66,25 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
...
@@ -66,17 +66,25 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
}
// namespace
}
// namespace
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
void
paddle
::
operators
::
TensorRTEngineKernel
<
DeviceContext
,
T
>::
Prepare
(
void
TensorRTEngineKernel
<
DeviceContext
,
T
>::
Prepare
(
const
framework
::
ExecutionContext
&
context
)
const
{
const
framework
::
ExecutionContext
&
context
)
const
{
VLOG
(
4
)
<<
"Prepare engine"
;
VLOG
(
4
)
<<
"Prepare engine"
;
// Get the ProgramDesc and pass to convert.
// Get the ProgramDesc and pass to convert.
framework
::
proto
::
BlockDesc
block_desc
;
framework
::
proto
::
BlockDesc
block_desc
;
block_desc
.
ParseFromString
(
context
.
Attr
<
std
::
string
>
(
"subgraph"
));
block_desc
.
ParseFromString
(
context
.
Attr
<
std
::
string
>
(
"subgraph"
));
max_batch_
=
context
.
Attr
<
int
>
(
"max_batch"
);
int
max_batch
=
context
.
Attr
<
int
>
(
"max_batch"
);
auto
max_workspace
=
context
.
Attr
<
int
>
(
"max_workspace"
);
auto
max_workspace
=
context
.
Attr
<
int
>
(
"max_workspace"
);
engine_
=
Singleton
<
TRT_EngineManager
>::
Global
().
Create
(
auto
params
=
context
.
Attr
<
std
::
vector
<
std
::
string
>>
(
"parameters"
);
max_batch_
,
max_workspace
,
&
stream_
);
std
::
unordered_set
<
std
::
string
>
parameters
;
engine_
->
InitNetwork
();
for
(
const
auto
&
param
:
params
)
{
parameters
.
insert
(
param
);
}
// TODO(Superjomn) replace this with a different stream
auto
*
engine
=
Singleton
<
TRT_EngineManager
>::
Global
().
Create
(
max_batch
,
max_workspace
,
nullptr
/*engine hold its own stream*/
,
context
.
Attr
<
std
::
string
>
(
"engine_uniq_key"
));
engine
->
InitNetwork
();
framework
::
BlockDesc
block
(
nullptr
/*programdesc*/
,
&
block_desc
);
framework
::
BlockDesc
block
(
nullptr
/*programdesc*/
,
&
block_desc
);
// Add inputs
// Add inputs
...
@@ -87,24 +95,23 @@ void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
...
@@ -87,24 +95,23 @@ void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
PADDLE_ENFORCE_EQ
(
var
->
GetType
(),
FluidDT
::
VarType_Type_LOD_TENSOR
,
PADDLE_ENFORCE_EQ
(
var
->
GetType
(),
FluidDT
::
VarType_Type_LOD_TENSOR
,
"TensorRT engine only takes LoDTensor as input"
);
"TensorRT engine only takes LoDTensor as input"
);
auto
shape
=
var
->
GetShape
();
auto
shape
=
var
->
GetShape
();
engine
_
->
DeclareInput
(
engine
->
DeclareInput
(
input
,
FluidDataType2TRT
(
input
,
FluidDataType2TRT
(
var
->
Proto
()
->
type
().
lod_tensor
().
tensor
().
data_type
()),
var
->
Proto
()
->
type
().
lod_tensor
().
tensor
().
data_type
()),
Vec2TRT_Dims
(
var
->
GetShape
()));
Vec2TRT_Dims
(
var
->
GetShape
()));
}
}
// TODO(Superjomn) parameters should be passed in after being analyzed from outside.
inference
::
Singleton
<
inference
::
tensorrt
::
OpConverter
>::
Global
().
ConvertBlock
(
inference
::
Singleton
<
inference
::
tensorrt
::
OpConverter
>::
Global
().
ConvertBlock
(
block_desc
,
{},
context
.
scope
(),
engine_
);
block_desc
,
parameters
,
context
.
scope
(),
engine
);
// Add outputs
// Add outputs
VLOG
(
4
)
<<
"declare outputs"
;
VLOG
(
4
)
<<
"declare outputs"
;
for
(
auto
&
output
:
context
.
Outputs
(
"Ys"
))
{
for
(
auto
&
output
:
context
.
Outputs
(
"Ys"
))
{
VLOG
(
4
)
<<
"declare output "
<<
output
;
VLOG
(
4
)
<<
"declare output "
<<
output
;
engine
_
->
DeclareOutput
(
output
);
engine
->
DeclareOutput
(
output
);
}
}
engine
_
->
FreezeNetwork
();
engine
->
FreezeNetwork
();
}
}
class
TensorRTEngineOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
class
TensorRTEngineOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
...
@@ -113,6 +120,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -113,6 +120,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput
(
"Xs"
,
"A list of inputs."
).
AsDuplicable
();
AddInput
(
"Xs"
,
"A list of inputs."
).
AsDuplicable
();
AddOutput
(
"Ys"
,
"A list of outputs"
).
AsDuplicable
();
AddOutput
(
"Ys"
,
"A list of outputs"
).
AsDuplicable
();
AddAttr
<
std
::
string
>
(
"subgraph"
,
"the subgraph."
);
AddAttr
<
std
::
string
>
(
"subgraph"
,
"the subgraph."
);
AddAttr
<
std
::
string
>
(
"engine_uniq_key"
,
"unique key for the TRT engine."
);
AddAttr
<
int
>
(
"max_batch"
,
"the maximum batch size."
);
AddAttr
<
int
>
(
"max_batch"
,
"the maximum batch size."
);
AddAttr
<
int
>
(
"max_workspace"
,
"the maximum batch size."
);
AddAttr
<
int
>
(
"max_workspace"
,
"the maximum batch size."
);
AddComment
(
"TensorRT engine operator."
);
AddComment
(
"TensorRT engine operator."
);
...
...
paddle/fluid/operators/tensorrt_engine_op.h
浏览文件 @
5b50307b
...
@@ -19,10 +19,14 @@
...
@@ -19,10 +19,14 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
using
inference
::
Singleton
;
using
inference
::
tensorrt
::
TRT_EngineManager
;
class
TensorRTEngineOp
:
public
framework
::
OperatorWithKernel
{
class
TensorRTEngineOp
:
public
framework
::
OperatorWithKernel
{
public:
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
...
@@ -47,16 +51,18 @@ template <typename DeviceContext, typename T>
...
@@ -47,16 +51,18 @@ template <typename DeviceContext, typename T>
class
TensorRTEngineKernel
:
public
framework
::
OpKernel
<
T
>
{
class
TensorRTEngineKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
if
(
!
engine_
)
{
auto
engine_name
=
context
.
Attr
<
std
::
string
>
(
"engine_uniq_key"
);
if
(
!
Singleton
<
TRT_EngineManager
>::
Global
().
HasEngine
(
engine_name
))
{
Prepare
(
context
);
Prepare
(
context
);
}
}
auto
*
engine
=
Singleton
<
TRT_EngineManager
>::
Global
().
Get
(
engine_name
);
auto
input_names
=
context
.
op
().
Inputs
(
"Xs"
);
auto
input_names
=
context
.
op
().
Inputs
(
"Xs"
);
PADDLE_ENFORCE
(
!
input_names
.
empty
(),
"should pass more than one inputs"
);
PADDLE_ENFORCE
(
!
input_names
.
empty
(),
"should pass more than one inputs"
);
// Try to determine a batch_size
// Try to determine a batch_size
auto
&
tensor0
=
inference
::
analysis
::
GetFromScope
<
framework
::
LoDTensor
>
(
auto
&
tensor0
=
inference
::
analysis
::
GetFromScope
<
framework
::
LoDTensor
>
(
context
.
scope
(),
input_names
.
front
());
context
.
scope
(),
input_names
.
front
());
int
batch_size
=
tensor0
.
dims
()[
0
];
int
batch_size
=
tensor0
.
dims
()[
0
];
PADDLE_ENFORCE_LE
(
batch_size
,
max_batch_
);
PADDLE_ENFORCE_LE
(
batch_size
,
context
.
Attr
<
int
>
(
"max_batch"
)
);
// Convert input tensor from fluid to engine.
// Convert input tensor from fluid to engine.
for
(
const
auto
&
x
:
context
.
Inputs
(
"Xs"
))
{
for
(
const
auto
&
x
:
context
.
Inputs
(
"Xs"
))
{
...
@@ -64,20 +70,20 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
...
@@ -64,20 +70,20 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
auto
&
t
=
inference
::
analysis
::
GetFromScope
<
framework
::
LoDTensor
>
(
auto
&
t
=
inference
::
analysis
::
GetFromScope
<
framework
::
LoDTensor
>
(
context
.
scope
(),
x
);
context
.
scope
(),
x
);
if
(
platform
::
is_cpu_place
(
t
.
place
()))
{
if
(
platform
::
is_cpu_place
(
t
.
place
()))
{
engine
_
->
SetInputFromCPU
(
x
,
static_cast
<
const
void
*>
(
t
.
data
<
void
>
()),
engine
->
SetInputFromCPU
(
x
,
static_cast
<
const
void
*>
(
t
.
data
<
void
>
()),
t
.
memory_size
());
t
.
memory_size
());
}
else
{
}
else
{
engine
_
->
SetInputFromGPU
(
x
,
static_cast
<
const
void
*>
(
t
.
data
<
void
>
()),
engine
->
SetInputFromGPU
(
x
,
static_cast
<
const
void
*>
(
t
.
data
<
void
>
()),
t
.
memory_size
());
t
.
memory_size
());
}
}
}
}
// Execute the engine.
// Execute the engine.
PADDLE_ENFORCE_GT
(
batch_size
,
0
);
PADDLE_ENFORCE_GT
(
batch_size
,
0
);
engine
_
->
Execute
(
batch_size
);
engine
->
Execute
(
batch_size
);
// Convert output tensor from engine to fluid
// Convert output tensor from engine to fluid
for
(
const
auto
&
y
:
context
.
Outputs
(
"Ys"
))
{
for
(
const
auto
&
y
:
context
.
Outputs
(
"Ys"
))
{
// convert output and copy to fluid.
// convert output and copy to fluid.
nvinfer1
::
ITensor
*
trt_t
=
engine
_
->
GetITensor
(
y
);
nvinfer1
::
ITensor
*
trt_t
=
engine
->
GetITensor
(
y
);
auto
dims
=
trt_t
->
getDimensions
();
auto
dims
=
trt_t
->
getDimensions
();
// Use the output ITensor's dims to reshape the Fluid Tensor.
// Use the output ITensor's dims to reshape the Fluid Tensor.
std
::
vector
<
int
>
ddim
(
dims
.
d
,
dims
.
d
+
dims
.
nbDims
);
std
::
vector
<
int
>
ddim
(
dims
.
d
,
dims
.
d
+
dims
.
nbDims
);
...
@@ -89,27 +95,22 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
...
@@ -89,27 +95,22 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
auto
size
=
inference
::
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
);
auto
size
=
inference
::
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
);
if
(
platform
::
is_cpu_place
(
fluid_t
->
place
()))
{
if
(
platform
::
is_cpu_place
(
fluid_t
->
place
()))
{
// TODO(Superjomn) change this float to dtype size.
// TODO(Superjomn) change this float to dtype size.
engine
_
->
GetOutputInCPU
(
engine
->
GetOutputInCPU
(
y
,
fluid_t
->
mutable_data
<
float
>
(
platform
::
CPUPlace
()),
y
,
fluid_t
->
mutable_data
<
float
>
(
platform
::
CPUPlace
()),
size
*
sizeof
(
float
));
size
*
sizeof
(
float
));
}
else
{
}
else
{
engine
_
->
GetOutputInGPU
(
engine
->
GetOutputInGPU
(
y
,
fluid_t
->
mutable_data
<
float
>
(
platform
::
CUDAPlace
()),
y
,
fluid_t
->
mutable_data
<
float
>
(
platform
::
CUDAPlace
()),
size
*
sizeof
(
float
));
size
*
sizeof
(
float
));
}
}
}
}
cudaStreamSynchronize
(
stream_
);
cudaStreamSynchronize
(
*
engine
->
stream
()
);
}
}
protected:
protected:
// Build the engine.
// Build the engine.
void
Prepare
(
const
framework
::
ExecutionContext
&
context
)
const
;
void
Prepare
(
const
framework
::
ExecutionContext
&
context
)
const
;
private:
mutable
cudaStream_t
stream_
;
mutable
inference
::
tensorrt
::
TensorRTEngine
*
engine_
{
nullptr
};
mutable
int
max_batch_
{
0
};
};
};
}
// namespace operators
}
// namespace operators
...
...
paddle/fluid/operators/tensorrt_engine_op_test.cc
浏览文件 @
5b50307b
...
@@ -79,6 +79,17 @@ void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
...
@@ -79,6 +79,17 @@ void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
attr
->
set_type
(
paddle
::
framework
::
proto
::
AttrType
::
LONG
);
attr
->
set_type
(
paddle
::
framework
::
proto
::
AttrType
::
LONG
);
attr
->
set_l
(
data
);
attr
->
set_l
(
data
);
}
}
template
<
>
void
SetAttr
<
std
::
vector
<
std
::
string
>>
(
framework
::
proto
::
OpDesc
*
op
,
const
std
::
string
&
name
,
const
std
::
vector
<
std
::
string
>&
data
)
{
auto
*
attr
=
op
->
add_attrs
();
attr
->
set_name
(
name
);
attr
->
set_type
(
paddle
::
framework
::
proto
::
AttrType
::
STRINGS
);
for
(
const
auto
&
s
:
data
)
{
attr
->
add_strings
(
s
.
c_str
());
}
}
}
// namespace
}
// namespace
...
@@ -123,11 +134,15 @@ TEST(TensorRTEngineOp, manual) {
...
@@ -123,11 +134,15 @@ TEST(TensorRTEngineOp, manual) {
engine_op_desc
.
SetOutput
(
"Ys"
,
std
::
vector
<
std
::
string
>
({
"z0"
}));
engine_op_desc
.
SetOutput
(
"Ys"
,
std
::
vector
<
std
::
string
>
({
"z0"
}));
SetAttr
<
std
::
string
>
(
engine_op_desc
.
Proto
(),
"subgraph"
,
SetAttr
<
std
::
string
>
(
engine_op_desc
.
Proto
(),
"subgraph"
,
block_
->
SerializeAsString
());
block_
->
SerializeAsString
());
SetAttr
<
int
>
(
engine_op_desc
.
Proto
(),
"max_batch"
,
3
0
);
SetAttr
<
int
>
(
engine_op_desc
.
Proto
(),
"max_batch"
,
10
0
);
SetAttr
<
int
>
(
engine_op_desc
.
Proto
(),
"max_workspace"
,
1
<<
10
);
SetAttr
<
int
>
(
engine_op_desc
.
Proto
(),
"max_workspace"
,
1
<<
10
);
SetAttr
<
std
::
string
>
(
engine_op_desc
.
Proto
(),
"engine_uniq_key"
,
"a_engine"
);
SetAttr
<
std
::
vector
<
std
::
string
>>
(
engine_op_desc
.
Proto
(),
"parameters"
,
std
::
vector
<
std
::
string
>
({}));
LOG
(
INFO
)
<<
"create engine op"
;
LOG
(
INFO
)
<<
"create engine op"
;
auto
engine_op
=
framework
::
OpRegistry
::
CreateOp
(
*
engine_op_desc
.
Proto
());
auto
engine_op
=
framework
::
OpRegistry
::
CreateOp
(
*
engine_op_desc
.
Proto
());
LOG
(
INFO
)
<<
"engine_op "
<<
engine_op
.
get
();
framework
::
Scope
scope
;
framework
::
Scope
scope
;
platform
::
CPUPlace
place
;
platform
::
CPUPlace
place
;
...
@@ -145,6 +160,88 @@ TEST(TensorRTEngineOp, manual) {
...
@@ -145,6 +160,88 @@ TEST(TensorRTEngineOp, manual) {
engine_op
->
Run
(
scope
,
place
);
engine_op
->
Run
(
scope
,
place
);
}
}
void
Execute
(
int
batch_size
,
int
input_dim
,
int
output_dim
,
int
nlayers
=
1
)
{
framework
::
ProgramDesc
program
;
framework
::
Scope
scope
;
platform
::
CPUPlace
place
;
platform
::
CPUDeviceContext
ctx
(
place
);
auto
*
block_
=
program
.
Proto
()
->
add_blocks
();
block_
->
set_idx
(
0
);
block_
->
set_parent_idx
(
-
1
);
using
shape_t
=
std
::
vector
<
int64_t
>
;
LOG
(
INFO
)
<<
"create block desc"
;
framework
::
BlockDesc
block_desc
(
&
program
,
block_
);
auto
AddFCLayer
=
[
&
](
const
std
::
string
&
x_name
,
const
std
::
string
&
y_name
,
const
std
::
string
&
z_name
,
bool
x_created
,
const
shape_t
&
x_shape
,
const
shape_t
&
y_shape
,
const
shape_t
&
z_shape
)
{
LOG
(
INFO
)
<<
"create fc op"
;
auto
*
fc
=
block_desc
.
AppendOp
();
fc
->
SetType
(
"mul"
);
fc
->
SetInput
(
"X"
,
std
::
vector
<
std
::
string
>
({
x_name
}));
fc
->
SetInput
(
"Y"
,
std
::
vector
<
std
::
string
>
({
y_name
}));
fc
->
SetOutput
(
"Out"
,
std
::
vector
<
std
::
string
>
({
z_name
}));
// Set inputs' variable shape in BlockDesc
if
(
!
x_created
)
{
AddTensorToBlockDesc
(
block_
,
x_name
,
std
::
vector
<
int64_t
>
({
batch_size
,
input_dim
,
1
,
1
}));
}
AddTensorToBlockDesc
(
block_
,
y_name
,
std
::
vector
<
int64_t
>
({
input_dim
,
output_dim
}));
AddTensorToBlockDesc
(
block_
,
z_name
,
std
::
vector
<
int64_t
>
({
batch_size
,
output_dim
}));
// Prepare variables.
if
(
!
x_created
)
{
CreateCPUTensor
(
&
scope
,
x_name
,
std
::
vector
<
int64_t
>
(
x_shape
));
}
CreateCPUTensor
(
&
scope
,
y_name
,
std
::
vector
<
int64_t
>
(
y_shape
));
CreateCPUTensor
(
&
scope
,
z_name
,
std
::
vector
<
int64_t
>
(
z_shape
));
// It is weird, but the op proto needs to be copied manually.
*
block_
->
add_ops
()
=
*
fc
->
Proto
();
};
// Test with 4 layer FC
AddFCLayer
(
"x0"
,
"y0"
,
"z0"
,
false
,
{
batch_size
,
input_dim
},
{
input_dim
,
output_dim
},
{
batch_size
,
output_dim
});
AddFCLayer
(
"z0"
,
"y1"
,
"z1"
,
true
,
{},
{
output_dim
,
output_dim
},
{
batch_size
,
output_dim
});
AddFCLayer
(
"z1"
,
"y2"
,
"z2"
,
true
,
{},
{
output_dim
,
output_dim
},
{
batch_size
,
output_dim
});
AddFCLayer
(
"z2"
,
"y3"
,
"z3"
,
true
,
{},
{
output_dim
,
output_dim
},
{
batch_size
,
output_dim
});
LOG
(
INFO
)
<<
"create tensorrt desc"
;
framework
::
OpDesc
engine_op_desc
(
nullptr
);
engine_op_desc
.
SetType
(
"tensorrt_engine"
);
engine_op_desc
.
SetInput
(
"Xs"
,
std
::
vector
<
std
::
string
>
({
"x0"
}));
engine_op_desc
.
SetOutput
(
"Ys"
,
std
::
vector
<
std
::
string
>
({
"z3"
}));
SetAttr
<
std
::
string
>
(
engine_op_desc
.
Proto
(),
"subgraph"
,
block_
->
SerializeAsString
());
SetAttr
<
int
>
(
engine_op_desc
.
Proto
(),
"max_batch"
,
batch_size
);
SetAttr
<
int
>
(
engine_op_desc
.
Proto
(),
"max_workspace"
,
2
<<
10
);
SetAttr
<
std
::
vector
<
std
::
string
>>
(
engine_op_desc
.
Proto
(),
"parameters"
,
std
::
vector
<
std
::
string
>
({
"y0"
,
"y1"
,
"y2"
,
"y3"
}));
SetAttr
<
std
::
string
>
(
engine_op_desc
.
Proto
(),
"engine_uniq_key"
,
"b_engine"
);
auto
engine_op
=
framework
::
OpRegistry
::
CreateOp
(
*
engine_op_desc
.
Proto
());
// Execute them.
engine_op
->
Run
(
scope
,
place
);
}
// Test with a larger FC layer.
TEST
(
TensorRTEngineOp
,
fc
)
{
Execute
(
40
,
256
,
256
);
}
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
...
...
python/paddle/fluid/layers/control_flow.py
浏览文件 @
5b50307b
python/paddle/fluid/layers/io.py
浏览文件 @
5b50307b
...
@@ -22,9 +22,9 @@ from ..executor import global_scope
...
@@ -22,9 +22,9 @@ from ..executor import global_scope
from
layer_function_generator
import
generate_layer_fn
,
templatedoc
from
layer_function_generator
import
generate_layer_fn
,
templatedoc
__all__
=
[
__all__
=
[
'data'
,
'BlockGuardServ'
,
'ListenAndServ'
,
'Send'
,
'
open_recordio_file
'
,
'data'
,
'BlockGuardServ'
,
'ListenAndServ'
,
'Send'
,
'
Recv
'
,
'open_
files'
,
'read_file'
,
'shuffle'
,
'batch'
,
'double_buffer
'
,
'open_
recordio_file'
,
'open_files'
,
'read_file'
,
'shuffle'
,
'batch
'
,
'random_data_generator'
,
'Preprocessor'
,
'load'
'
double_buffer'
,
'
random_data_generator'
,
'Preprocessor'
,
'load'
]
]
...
@@ -177,18 +177,17 @@ class ListenAndServ(object):
...
@@ -177,18 +177,17 @@ class ListenAndServ(object):
})
})
def
Send
(
endpoints
,
send_vars
,
get_vars
=
Non
e
):
def
Send
(
endpoints
,
send_vars
,
sync
=
Tru
e
):
"""
"""
Send layer
Send variables to the server side, and get vars from server
side when server have finished running server side program.
Args:
Args:
endpoints: comma separated IP:PORT pairs in the order
endpoints
(str)
: comma separated IP:PORT pairs in the order
of send_vars to send
of send_vars to send
send_vars
: vars to send
send_vars
(list): variables to send to server
get_vars: vars to get from server after send completes.
sync (bool): whether to wait for the request to finish
Send variables to the server side, and get vars from the server
side when the server has finished running the server-side program.
"""
"""
assert
(
type
(
send_vars
)
==
list
)
assert
(
type
(
send_vars
)
==
list
)
...
@@ -196,40 +195,33 @@ def Send(endpoints, send_vars, get_vars=None):
...
@@ -196,40 +195,33 @@ def Send(endpoints, send_vars, get_vars=None):
endpoints
=
list
(
set
(
epmap
))
endpoints
=
list
(
set
(
epmap
))
helper
=
LayerHelper
(
"Send"
,
**
locals
())
helper
=
LayerHelper
(
"Send"
,
**
locals
())
if
not
get_vars
:
get_vars
=
[]
for
s
in
send_vars
:
v
=
helper
.
create_tmp_variable
(
dtype
=
s
.
dtype
,
stop_gradient
=
True
)
get_vars
.
append
(
v
)
rpc_op_role_name
=
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
()
rpc_op_role_name
=
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
()
helper
.
append_op
(
helper
.
append_op
(
type
=
"send"
,
type
=
"send"
,
inputs
=
{
"X"
:
send_vars
},
inputs
=
{
"X"
:
send_vars
},
outputs
=
{
"Out"
:
get_vars
},
attrs
=
{
attrs
=
{
"endpoints"
:
endpoints
,
"endpoints"
:
endpoints
,
"epmap"
:
epmap
,
"epmap"
:
epmap
,
rpc_op_role_name
:
core
.
op_proto_and_checker_maker
.
OpRole
.
RPC
rpc_op_role_name
:
core
.
op_proto_and_checker_maker
.
OpRole
.
RPC
})
})
if
sync
:
return
get_vars
helper
.
append_op
(
type
=
"send_barrier"
,
attrs
=
{
"endpoints"
:
endpoints
})
def
Recv
(
endpoints
,
get_vars
):
def
Recv
(
endpoints
,
get_vars
,
sync
=
True
):
"""
"""
Rec
v layer
Rec
eive variables from server side
Args:
Args:
endpoints: comma separated IP:PORT pairs in the order
endpoints
(str)
: comma separated IP:PORT pairs in the order
of send_vars to send
of send_vars to send
send_vars: vars to send
get_vars (list): vars to get from server after send completes.
get_vars: vars to get from server after send completes.
sync (bool): whether to wait for the request to finish
Send variables to the server side, and get vars from server
Returns:
side when server have finished running server side program.
list: list of received variables
"""
"""
assert
(
type
(
send_vars
)
==
list
)
assert
(
type
(
get_vars
)
==
list
)
assert
(
type
(
get_vars
)
==
list
)
epmap
=
endpoints
.
split
(
","
)
epmap
=
endpoints
.
split
(
","
)
...
@@ -242,6 +234,9 @@ def Recv(endpoints, get_vars):
...
@@ -242,6 +234,9 @@ def Recv(endpoints, get_vars):
outputs
=
{
"Out"
:
get_vars
},
outputs
=
{
"Out"
:
get_vars
},
attrs
=
{
"endpoints"
:
endpoints
,
attrs
=
{
"endpoints"
:
endpoints
,
"epmap"
:
epmap
})
"epmap"
:
epmap
})
if
sync
:
helper
.
append_op
(
type
=
"fetch_barrier"
,
attrs
=
{
"endpoints"
:
endpoints
})
return
get_vars
def
monkey_patch_reader_methods
(
reader
):
def
monkey_patch_reader_methods
(
reader
):
...
@@ -541,6 +536,9 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
...
@@ -541,6 +536,9 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
def
shuffle
(
reader
,
buffer_size
):
def
shuffle
(
reader
,
buffer_size
):
"""
Shuffle the reader.
"""
return
__create_unshared_decorated_reader__
(
return
__create_unshared_decorated_reader__
(
'create_shuffle_reader'
,
reader
,
{
'buffer_size'
:
int
(
buffer_size
)})
'create_shuffle_reader'
,
reader
,
{
'buffer_size'
:
int
(
buffer_size
)})
...
...
python/paddle/fluid/layers/layer_function_generator.py
浏览文件 @
5b50307b
...
@@ -44,6 +44,11 @@ def _type_to_str_(tp):
...
@@ -44,6 +44,11 @@ def _type_to_str_(tp):
return
framework_pb2
.
AttrType
.
Name
(
tp
)
return
framework_pb2
.
AttrType
.
Name
(
tp
)
_two_dollar_pattern_
=
re
.
compile
(
r
"\$\$([^\$]+)\$\$"
)
_single_dollar_pattern_
=
re
.
compile
(
r
"\$([^\$]+)\$"
)
_two_bang_pattern_
=
re
.
compile
(
r
"!!([^!]+)!!"
)
def
_generate_doc_string_
(
op_proto
):
def
_generate_doc_string_
(
op_proto
):
"""
"""
Generate docstring by OpProto
Generate docstring by OpProto
...
@@ -55,22 +60,26 @@ def _generate_doc_string_(op_proto):
...
@@ -55,22 +60,26 @@ def _generate_doc_string_(op_proto):
str: the document string
str: the document string
"""
"""
def
escape_math
(
text
):
return
_two_bang_pattern_
.
sub
(
r
'$$\1$$'
,
_single_dollar_pattern_
.
sub
(
r
':math:`\1`'
,
_two_dollar_pattern_
.
sub
(
r
"!!\1!!"
,
text
)))
if
not
isinstance
(
op_proto
,
framework_pb2
.
OpProto
):
if
not
isinstance
(
op_proto
,
framework_pb2
.
OpProto
):
raise
TypeError
(
"OpProto should be `framework_pb2.OpProto`"
)
raise
TypeError
(
"OpProto should be `framework_pb2.OpProto`"
)
buf
=
cStringIO
.
StringIO
()
buf
=
cStringIO
.
StringIO
()
buf
.
write
(
op_proto
.
comment
)
buf
.
write
(
escape_math
(
op_proto
.
comment
)
)
buf
.
write
(
'
\n
Args:
\n
'
)
buf
.
write
(
'
\n
Args:
\n
'
)
for
each_input
in
op_proto
.
inputs
:
for
each_input
in
op_proto
.
inputs
:
line_begin
=
' {0}: '
.
format
(
_convert_
(
each_input
.
name
))
line_begin
=
' {0}: '
.
format
(
_convert_
(
each_input
.
name
))
buf
.
write
(
line_begin
)
buf
.
write
(
line_begin
)
buf
.
write
(
each_input
.
comment
)
buf
.
write
(
escape_math
(
each_input
.
comment
))
buf
.
write
(
'
\n
'
)
if
each_input
.
duplicable
:
buf
.
write
(
' '
*
len
(
line_begin
))
buf
.
write
(
" Duplicatable."
)
buf
.
write
(
'Duplicable: '
)
if
each_input
.
dispensable
:
buf
.
write
(
str
(
each_input
.
duplicable
))
buf
.
write
(
" Optional."
)
buf
.
write
(
' Optional: '
)
buf
.
write
(
str
(
each_input
.
dispensable
))
buf
.
write
(
'
\n
'
)
buf
.
write
(
'
\n
'
)
skip_attrs
=
OpProtoHolder
.
generated_op_attr_names
()
skip_attrs
=
OpProtoHolder
.
generated_op_attr_names
()
...
@@ -83,7 +92,7 @@ def _generate_doc_string_(op_proto):
...
@@ -83,7 +92,7 @@ def _generate_doc_string_(op_proto):
buf
.
write
(
' ('
)
buf
.
write
(
' ('
)
buf
.
write
(
_type_to_str_
(
each_attr
.
type
))
buf
.
write
(
_type_to_str_
(
each_attr
.
type
))
buf
.
write
(
'): '
)
buf
.
write
(
'): '
)
buf
.
write
(
e
ach_attr
.
comment
)
buf
.
write
(
e
scape_math
(
each_attr
.
comment
)
)
buf
.
write
(
'
\n
'
)
buf
.
write
(
'
\n
'
)
if
len
(
op_proto
.
outputs
)
!=
0
:
if
len
(
op_proto
.
outputs
)
!=
0
:
...
@@ -92,7 +101,7 @@ def _generate_doc_string_(op_proto):
...
@@ -92,7 +101,7 @@ def _generate_doc_string_(op_proto):
for
each_opt
in
op_proto
.
outputs
:
for
each_opt
in
op_proto
.
outputs
:
if
not
each_opt
.
intermediate
:
if
not
each_opt
.
intermediate
:
break
break
buf
.
write
(
e
ach_opt
.
comment
)
buf
.
write
(
e
scape_math
(
each_opt
.
comment
)
)
return
buf
.
getvalue
()
return
buf
.
getvalue
()
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
5b50307b
...
@@ -225,11 +225,11 @@ def embedding(input,
...
@@ -225,11 +225,11 @@ def embedding(input,
have two elements which indicate the size of the dictionary of
have two elements which indicate the size of the dictionary of
embeddings and the size of each embedding vector respectively.
embeddings and the size of each embedding vector respectively.
is_sparse(bool): The flag indicating whether to use sparse update.
is_sparse(bool): The flag indicating whether to use sparse update.
is_distributed
(bool): Whether to run lookup table from remote parameter server.
is_distributed(bool): Whether to run lookup table from remote parameter server.
padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
Otherwise the given :attr:`padding_idx` indicates padding the output
Otherwise the given :attr:`padding_idx` indicates padding the output
with zeros whenever lookup encounters it in :attr:`input`. If
with zeros whenever lookup encounters it in :attr:`input`. If
:math:`padding_idx < 0`, the
padding_idx
to use in lookup is
:math:`padding_idx < 0`, the
:attr:`padding_idx`
to use in lookup is
:math:`size[0] + dim`.
:math:`size[0] + dim`.
param_attr(ParamAttr): Parameters for this layer
param_attr(ParamAttr): Parameters for this layer
dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc
dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc
...
@@ -1235,14 +1235,17 @@ def conv2d(input,
...
@@ -1235,14 +1235,17 @@ def conv2d(input,
act
=
None
,
act
=
None
,
name
=
None
):
name
=
None
):
"""
"""
**Convolution2D Layer**
The convolution2D layer calculates the output based on the input, filter
The convolution2D layer calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. Input
(Input)
and
and strides, paddings, dilations, groups parameters. Input and
Output
(Output) are in NCHW format. W
here N is batch size, C is the number of
Output
are in NCHW format, w
here N is batch size, C is the number of
channels, H is the height of the feature, and W is the width of the feature.
channels, H is the height of the feature, and W is the width of the feature.
For details of the convolution layer, please refer to UFLDL's `convolution,
Filter is in MCHW format, where M is the number of output image channels,
<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
C is the number of input image channels, H is the height of the filter,
and W is the width of the filter. If the groups is greater than 1,
C will equal the number of input image channels divided by the groups.
Please refer to UFLDL's `convolution
<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
for more details.
If bias attribution and activation type are provided, bias is added to the
If bias attribution and activation type are provided, bias is added to the
output of the convolution, and the corresponding activation function is
output of the convolution, and the corresponding activation function is
applied to the final result.
applied to the final result.
...
@@ -1253,15 +1256,14 @@ def conv2d(input,
...
@@ -1253,15 +1256,14 @@ def conv2d(input,
Out = \sigma (W \\ast X + b)
Out = \sigma (W \\ast X + b)
In the above equation
:
Where
:
* :math:`X`: Input value, a tensor with NCHW format.
* :math:`X`: Input value, a tensor with NCHW format.
* :math:`W`: Filter value, a tensor with MCHW format.
* :math:`W`: Filter value, a tensor with MCHW format.
* :math:`\\ast`: Convolution operation.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
different.
Example:
Example:
...
@@ -1272,6 +1274,7 @@ def conv2d(input,
...
@@ -1272,6 +1274,7 @@ def conv2d(input,
Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
- Output:
- Output:
Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
Where
Where
...
@@ -1306,7 +1309,8 @@ def conv2d(input,
...
@@ -1306,7 +1309,8 @@ def conv2d(input,
bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True
library is installed. Default: True
use_mkldnn (bool): Use mkldnn kernels or not.
use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
with mkldnn library. Default: False
act (str): Activation type. Default: None
act (str): Activation type. Default: None
name (str|None): A name for this layer(optional). If set None, the layer
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
will be named automatically.
...
...
python/paddle/fluid/layers/tensor.py
浏览文件 @
5b50307b
...
@@ -219,6 +219,7 @@ def assign(input, output):
...
@@ -219,6 +219,7 @@ def assign(input, output):
Examples:
Examples:
.. code-block:: python
.. code-block:: python
out = fluid.layers.create_tensor(dtype='float32')
out = fluid.layers.create_tensor(dtype='float32')
hidden = fluid.layers.fc(input=data, size=10)
hidden = fluid.layers.fc(input=data, size=10)
fluid.layers.assign(hidden, out)
fluid.layers.assign(hidden, out)
...
...
python/paddle/fluid/tests/unittests/test_dist_train.py
浏览文件 @
5b50307b
...
@@ -16,6 +16,7 @@ import os
...
@@ -16,6 +16,7 @@ import os
import
time
import
time
import
unittest
import
unittest
from
multiprocessing
import
Process
from
multiprocessing
import
Process
import
signal
import
numpy
import
numpy
...
@@ -24,9 +25,6 @@ import paddle.fluid.layers as layers
...
@@ -24,9 +25,6 @@ import paddle.fluid.layers as layers
class
TestSendOp
(
unittest
.
TestCase
):
class
TestSendOp
(
unittest
.
TestCase
):
@
unittest
.
skip
(
"This test is buggy. We cannot use time.sleep to sync processes, the connection may fail in unittest."
)
def
test_send
(
self
):
def
test_send
(
self
):
# Run init_serv in a thread
# Run init_serv in a thread
place
=
fluid
.
CPUPlace
()
place
=
fluid
.
CPUPlace
()
...
@@ -35,7 +33,9 @@ class TestSendOp(unittest.TestCase):
...
@@ -35,7 +33,9 @@ class TestSendOp(unittest.TestCase):
p
.
daemon
=
True
p
.
daemon
=
True
p
.
start
()
p
.
start
()
time
.
sleep
(
10
)
self
.
ps_timeout
=
5
self
.
_wait_ps_ready
(
p
.
pid
)
with
open
(
"/tmp/paddle.%d.port"
%
p
.
pid
,
"r"
)
as
fn
:
with
open
(
"/tmp/paddle.%d.port"
%
p
.
pid
,
"r"
)
as
fn
:
selected_port
=
int
(
fn
.
readlines
()[
0
])
selected_port
=
int
(
fn
.
readlines
()[
0
])
self
.
init_client
(
place
,
selected_port
)
self
.
init_client
(
place
,
selected_port
)
...
@@ -44,9 +44,23 @@ class TestSendOp(unittest.TestCase):
...
@@ -44,9 +44,23 @@ class TestSendOp(unittest.TestCase):
self
.
assertTrue
(
numpy
.
allclose
(
self
.
local_out
,
self
.
dist_out
))
self
.
assertTrue
(
numpy
.
allclose
(
self
.
local_out
,
self
.
dist_out
))
# FIXME(typhoonzero): find a way to gracefully shutdown the server.
# FIXME(typhoonzero): find a way to gracefully shutdown the server.
os
.
system
(
"kill -9 %d"
%
p
.
pid
)
os
.
kill
(
p
.
pid
,
signal
.
SIGKILL
)
p
.
join
()
p
.
join
()
def
_wait_ps_ready
(
self
,
pid
):
start_left_time
=
self
.
ps_timeout
sleep_time
=
0.5
while
True
:
assert
start_left_time
>=
0
,
"wait ps ready failed"
time
.
sleep
(
sleep_time
)
try
:
# listen_and_serv_op touches a file containing the listening port
# in the /tmp directory once it is ready to process all RPC calls.
os
.
stat
(
"/tmp/paddle.%d.port"
%
pid
)
return
except
os
.
error
:
start_left_time
-=
sleep_time
def
init_serv
(
self
,
place
):
def
init_serv
(
self
,
place
):
main
=
fluid
.
Program
()
main
=
fluid
.
Program
()
...
@@ -84,7 +98,10 @@ class TestSendOp(unittest.TestCase):
...
@@ -84,7 +98,10 @@ class TestSendOp(unittest.TestCase):
dtype
=
"float32"
,
dtype
=
"float32"
,
persistable
=
False
,
persistable
=
False
,
shape
=
[
32
,
32
])
shape
=
[
32
,
32
])
o
=
layers
.
Send
(
"127.0.0.1:%d"
%
port
,
[
x
],
[
get_var
])
fluid
.
initializer
.
Constant
(
value
=
2.3
)(
get_var
,
main
.
global_block
())
layers
.
Send
(
"127.0.0.1:%d"
%
port
,
[
x
])
o
=
layers
.
Recv
(
"127.0.0.1:%d"
%
port
,
[
get_var
])
exe
=
fluid
.
Executor
(
place
)
exe
=
fluid
.
Executor
(
place
)
self
.
dist_out
=
exe
.
run
(
main
,
fetch_list
=
o
)
# o is a list
self
.
dist_out
=
exe
.
run
(
main
,
fetch_list
=
o
)
# o is a list
...
...
python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
浏览文件 @
5b50307b
...
@@ -57,17 +57,18 @@ class TestListenAndServOp(OpTest):
...
@@ -57,17 +57,18 @@ class TestListenAndServOp(OpTest):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
ps_timeout
=
5
self
.
ps_timeout
=
5
self
.
ip
=
"127.0.0.1"
self
.
ip
=
"127.0.0.1"
self
.
port
=
"
6173
"
self
.
port
=
"
0
"
self
.
trainers
=
1
self
.
trainers
=
1
self
.
trainer_id
=
1
self
.
trainer_id
=
0
def
_start_pserver
(
self
,
use_cuda
,
sync_mode
):
def
_start_pserver
(
self
,
use_cuda
,
sync_mode
):
p
=
Process
(
p
=
Process
(
target
=
run_pserver
,
target
=
run_pserver
,
args
=
(
use_cuda
,
sync_mode
,
self
.
ip
,
self
.
port
,
self
.
trainers
,
args
=
(
use_cuda
,
sync_mode
,
self
.
ip
,
self
.
port
,
self
.
trainers
,
self
.
trainer_id
))
self
.
trainer_id
))
p
.
daemon
=
True
p
.
start
()
p
.
start
()
return
p
.
pid
return
p
def
_wait_ps_ready
(
self
,
pid
):
def
_wait_ps_ready
(
self
,
pid
):
start_left_time
=
self
.
ps_timeout
start_left_time
=
self
.
ps_timeout
...
@@ -89,18 +90,20 @@ class TestListenAndServOp(OpTest):
...
@@ -89,18 +90,20 @@ class TestListenAndServOp(OpTest):
def
test_handle_signal_in_serv_op
(
self
):
def
test_handle_signal_in_serv_op
(
self
):
# run pserver on CPU in sync mode
# run pserver on CPU in sync mode
p
id
=
self
.
_start_pserver
(
False
,
True
)
p
1
=
self
.
_start_pserver
(
False
,
True
)
self
.
_wait_ps_ready
(
pid
)
self
.
_wait_ps_ready
(
p
1
.
p
id
)
# raise SIGTERM to pserver
# raise SIGTERM to pserver
os
.
kill
(
pid
,
signal
.
SIGTERM
)
os
.
kill
(
p1
.
pid
,
signal
.
SIGKILL
)
p1
.
join
()
# run pserver on CPU in async mode
# run pserver on CPU in async mode
p
id
=
self
.
_start_pserver
(
False
,
False
)
p
2
=
self
.
_start_pserver
(
False
,
False
)
self
.
_wait_ps_ready
(
pid
)
self
.
_wait_ps_ready
(
p
2
.
p
id
)
# raise SIGTERM to pserver
# raise SIGTERM to pserver
os
.
kill
(
pid
,
signal
.
SIGTERM
)
os
.
kill
(
p2
.
pid
,
signal
.
SIGKILL
)
p2
.
join
()
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/transpiler/memory_optimization_transpiler.py
浏览文件 @
5b50307b
...
@@ -157,9 +157,11 @@ class ControlFlowGraph(object):
...
@@ -157,9 +157,11 @@ class ControlFlowGraph(object):
if
op
.
type
()
==
"fill_constant"
and
op
.
attr
(
"force_cpu"
)
==
True
:
if
op
.
type
()
==
"fill_constant"
and
op
.
attr
(
"force_cpu"
)
==
True
:
self
.
_skip_opt
.
update
(
op
.
output_arg_names
())
self
.
_skip_opt
.
update
(
op
.
output_arg_names
())
def
release_memory
(
self
):
def
release_memory
(
self
,
skip_opt_set
=
None
):
self
.
_dataflow_analyze
()
self
.
_dataflow_analyze
()
self
.
_update_skip_opt_set
()
self
.
_update_skip_opt_set
()
if
skip_opt_set
:
self
.
_skip_opt
.
update
(
skip_opt_set
)
fwd_id
=
0
fwd_id
=
0
bwd_id
=
0
bwd_id
=
0
for
i
in
range
(
self
.
op_size
):
for
i
in
range
(
self
.
op_size
):
...
@@ -183,7 +185,7 @@ class ControlFlowGraph(object):
...
@@ -183,7 +185,7 @@ class ControlFlowGraph(object):
else
:
else
:
bwd_id
+=
1
bwd_id
+=
1
def
memory_optimize
(
self
,
level
=
0
):
def
memory_optimize
(
self
,
skip_opt_set
=
None
,
level
=
0
):
def
compare_shape
(
x_shape
,
cache_shape
,
opt_level
):
def
compare_shape
(
x_shape
,
cache_shape
,
opt_level
):
if
opt_level
==
0
:
if
opt_level
==
0
:
return
x_shape
==
cache_shape
return
x_shape
==
cache_shape
...
@@ -200,6 +202,9 @@ class ControlFlowGraph(object):
...
@@ -200,6 +202,9 @@ class ControlFlowGraph(object):
self
.
_dataflow_analyze
()
self
.
_dataflow_analyze
()
self
.
_update_skip_opt_set
()
self
.
_update_skip_opt_set
()
# update skip set to meet users' demand
if
skip_opt_set
:
self
.
_skip_opt
.
update
(
skip_opt_set
)
self
.
pool
=
[]
self
.
pool
=
[]
for
i
in
range
(
self
.
op_size
):
for
i
in
range
(
self
.
op_size
):
op
=
self
.
_ops
[
i
]
op
=
self
.
_ops
[
i
]
...
@@ -358,7 +363,7 @@ def _get_cfgs(input_program):
...
@@ -358,7 +363,7 @@ def _get_cfgs(input_program):
return
cfgs
return
cfgs
def
memory_optimize
(
input_program
,
print_log
=
False
,
level
=
0
):
def
memory_optimize
(
input_program
,
skip_opt_set
=
None
,
print_log
=
False
,
level
=
0
):
"""Optimize memory by reusing var memory.
"""Optimize memory by reusing var memory.
Note: it does not support subblocks nested in subblocks.
Note: it does not support subblocks nested in subblocks.
...
@@ -374,10 +379,10 @@ def memory_optimize(input_program, print_log=False, level=0):
...
@@ -374,10 +379,10 @@ def memory_optimize(input_program, print_log=False, level=0):
PRINT_LOG
=
print_log
PRINT_LOG
=
print_log
cfgs
=
_get_cfgs
(
input_program
)
cfgs
=
_get_cfgs
(
input_program
)
for
cfg
in
cfgs
:
for
cfg
in
cfgs
:
cfg
.
memory_optimize
(
level
)
cfg
.
memory_optimize
(
skip_opt_set
=
skip_opt_set
,
level
=
level
)
def
release_memory
(
input_program
):
def
release_memory
(
input_program
,
skip_opt_set
=
None
):
cfgs
=
_get_cfgs
(
input_program
)
cfgs
=
_get_cfgs
(
input_program
)
for
cfg
in
cfgs
:
for
cfg
in
cfgs
:
cfg
.
release_memory
()
cfg
.
release_memory
(
skip_opt_set
=
skip_opt_set
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录