Commit 706f3839

Authored Jun 19, 2018 by yuyang18

    Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-none-layers-api-doc

Parents: fe5de04b, 9988f8ec
Showing 21 changed files with 855 additions and 236 deletions (+855 −236)
cmake/external/mkldnn.cmake                                +2    -1
paddle/fluid/framework/executor.cc                         +3    -2
paddle/fluid/operators/activation_mkldnn_op.cc             +196  -120
paddle/fluid/operators/activation_op.cc                    +16   -13
paddle/fluid/operators/concat_op.h                         +26   -15
paddle/fluid/operators/detection_map_op.cc                 +6    -6
paddle/fluid/operators/math/concat.cc                      +15   -10
paddle/fluid/operators/math/concat.cu                      +25   -16
paddle/fluid/operators/math/concat.h                       +2    -1
paddle/fluid/platform/cpu_info.cc                          +15   -9
paddle/testing/paddle_gtest_main.cc                        +3    -2
python/paddle/fluid/__init__.py                            +1    -1
python/paddle/fluid/framework.py                           +7    -1
python/paddle/fluid/layers/detection.py                    +43   -2
python/paddle/fluid/layers/nn.py                           +99   -0
python/paddle/fluid/optimizer.py                           +115  -7
python/paddle/fluid/profiler.py                            +112  -5
python/paddle/fluid/tests/unittests/test_layers.py         +9    -0
python/paddle/fluid/tests/unittests/test_optimizer.py      +66   -0
python/paddle/fluid/transpiler/distribute_transpiler.py    +53   -5
python/paddle/fluid/transpiler/inference_transpiler.py     +41   -20
cmake/external/mkldnn.cmake

@@ -45,7 +45,8 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
 ELSE()
   MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
 ENDIF()
-SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-unused-result")
+SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result")
+SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
 SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
 SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
 ExternalProject_Add(
paddle/fluid/framework/executor.cc

@@ -295,13 +295,14 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
     const ProgramDesc& program, int block_id) {
-  auto* ctx = new ExecutorPrepareContext(program, block_id);
+  std::unique_ptr<ExecutorPrepareContext> ctx(
+      new ExecutorPrepareContext(program, block_id));
   PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
   auto& block = program.Block(block_id);
   for (auto& op_desc : block.AllOps()) {
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
   }
-  return std::unique_ptr<ExecutorPrepareContext>(ctx);
+  return ctx;
 }

 std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
paddle/fluid/operators/activation_mkldnn_op.cc

@@ -12,16 +12,20 @@
    See the License for the specific language governing permissions and
    limitations under the License. */

 #include "mkldnn.hpp"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"

 namespace paddle {
 namespace operators {

-using paddle::framework::Tensor;
-using paddle::platform::MKLDNNDeviceContext;
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::stream;
+using platform::GetMKLDNNFormat;
+using platform::MKLDNNDeviceContext;
+using platform::to_void_cast;

 namespace {
 std::string gethash(const mkldnn::memory::dims &operand_dims,

@@ -35,188 +39,260 @@ std::string gethash(const mkldnn::memory::dims &operand_dims,
   };
   return dim2str(operand_dims) + std::to_string(algorithm);
 }
 }  // namespace

+template <typename Functor>
+class MKLDNNActivationKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *x = ctx.Input<Tensor>("X");
+    PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                       x->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input x tensor");
+
+    Functor functor;
+    auto attrs = functor.GetAttrs();
+    for (auto &attr : attrs) {
+      *attr.second = ctx.Attr<float>(attr.first);
+    }
+    functor(ctx);
+  }
+};

-template <typename T, typename ExecContext>
-void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
-                     const T alpha = 0, const T beta = 0) {
+template <typename Functor>
+class MKLDNNActivationGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
+                       diff_y->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input OutGrad tensor");
+
+    Functor functor;
+    auto attrs = functor.GetAttrs();
+    for (auto &attr : attrs) {
+      *attr.second = ctx.Attr<float>(attr.first);
+    }
+    functor(ctx);
+  }
+};
+
+template <typename T>
+void eltwise_forward(const framework::ExecutionContext &ctx,
+                     mkldnn::algorithm algorithm, const T alpha = 0,
+                     const T beta = 0) {
   PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                  "It must use CPUPlace.");
   auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
   const auto &mkldnn_engine = dev_ctx.GetEngine();

-  // get buffers
-  const auto *src = ctx.template Input<Tensor>("X");
-  const auto *src_data = src->template data<T>();
+  const auto *x = ctx.Input<Tensor>("X");
+  auto *y = ctx.Output<Tensor>("Out");

-  auto *dst = ctx.template Output<Tensor>("Out");
-  T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
+  const T *x_data = x->data<T>();
+  T *y_data = y->mutable_data<T>(ctx.GetPlace());

-  // get memory dim
-  PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4,
+  PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4,
                  "Input dim must be with 2 or 4");
-  std::vector<int> src_tz = framework::vectorize2int(src->dims());
+
+  std::vector<int> src_tz = framework::vectorize2int(x->dims());
+
+  auto src_format =
+      src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format();

   const std::string key = gethash(src_tz, algorithm);
   const std::string key_src_data =
       key + ctx.op().Output("Out") + "@eltwise_fwd_src_data";
-  const std::string key_src_mem = key + "@eltwise_fwd_src_mem";
-  const std::string key_dst_mem = key + "@eltwise_fwd_dst_mem";
-  const std::string key_fwd = key + "@eltwise_fwd";
+  const std::string key_src_layout =
+      key + ctx.op().Output("Out") + "@eltwise_fwd_src_layout";
+  const std::string key_with_layout = key + std::to_string(src_format);
+  const std::string key_src_mem = key_with_layout + "@eltwise_fwd_src_mem";
+  const std::string key_dst_mem = key_with_layout + "@eltwise_fwd_dst_mem";
+  const std::string key_fwd = key_with_layout + "@eltwise_fwd";
+  const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd";
+
+  // save input data and layout to be referred in backward path
+  auto p_src_data = std::make_shared<const T *>(x_data);
+  dev_ctx.SetBlob(key_src_data, p_src_data);
+  auto p_src_layout = std::make_shared<memory::format>(src_format);
+  dev_ctx.SetBlob(key_src_layout, p_src_layout);

   auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>(
       dev_ctx.GetBlob(key_fwd));

-  // save input data to be referred in backward path
-  auto p_src_data = std::make_shared<const T *>(src_data);
-  dev_ctx.SetBlob(key_src_data, p_src_data);
+  std::shared_ptr<memory> dst_memory;

   if (p_fwd == nullptr) {
-    // create memory description
-    auto data_md = src_tz.size() == 2
-                       ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nc)
-                       : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nchw);
-    // create memory primitives
-    auto p_src_mem = std::make_shared<mkldnn::memory>(mkldnn::memory(
-        {data_md, mkldnn_engine}, platform::to_void_cast(src_data)));
-    dev_ctx.SetBlob(key_src_mem, p_src_mem);
-
-    auto p_dst_mem = std::make_shared<mkldnn::memory>(mkldnn::memory(
-        {data_md, mkldnn_engine}, platform::to_void_cast(dst_data)));
-    dev_ctx.SetBlob(key_dst_mem, p_dst_mem);
-
-    auto fwd_desc = mkldnn::eltwise_forward::desc(
-        mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
-    auto p_fwd_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
-        fwd_desc, mkldnn_engine);
-    const std::string key_fwd_pd = key + "eltwise_fwd_pd";
-    dev_ctx.SetBlob(key_fwd_pd, p_fwd_pd);
-    p_fwd = std::make_shared<mkldnn::eltwise_forward>(
-        *p_fwd_pd, *(p_src_mem.get()), *(p_dst_mem.get()));
+    // create mkldnn memory for input X
+    auto src_md = platform::MKLDNNMemDesc(
+        src_tz, platform::MKLDNNGetDataType<T>(), src_format);
+    auto src_memory = std::shared_ptr<memory>(
+        new memory({src_md, mkldnn_engine}, to_void_cast(x_data)));
+    // save src_memory to be referred in backward path
+    dev_ctx.SetBlob(key_src_mem, src_memory);
+
+    // create primitive descriptor for activation forward and save it
+    auto forward_desc = mkldnn::eltwise_forward::desc(
+        mkldnn::prop_kind::forward_training, algorithm,
+        src_memory->get_primitive_desc().desc(), alpha, beta);
+    auto forward_pd =
+        std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
+            forward_desc, mkldnn_engine);
+
+    // save prim desc into global device context to be referred in backward path
+    dev_ctx.SetBlob(key_fwd_pd, forward_pd);
+
+    // create mkldnn memory for output y
+    dst_memory =
+        std::make_shared<memory>(forward_pd->dst_primitive_desc(), y_data);
+    dev_ctx.SetBlob(key_dst_mem, dst_memory);
+
+    // create activation primitive
+    p_fwd = std::make_shared<mkldnn::eltwise_forward>(*forward_pd, *src_memory,
+                                                      *dst_memory);
     dev_ctx.SetBlob(key_fwd, p_fwd);
   } else {
     // primitives already exist
-    auto p_src_mem =
+    auto src_memory =
         std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
-    PADDLE_ENFORCE(p_src_mem != nullptr,
-                   "Fail to find eltwise p_src_mem in device context.");
-    auto p_dst_mem =
+    PADDLE_ENFORCE(src_memory != nullptr,
+                   "Fail to find eltwise src_memory in device context.");
+    dst_memory =
         std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_dst_mem));
-    PADDLE_ENFORCE(p_dst_mem != nullptr,
-                   "Fail to find eltwise p_src_mem in device context.");
+    PADDLE_ENFORCE(dst_memory != nullptr,
+                   "Fail to find eltwise dst_memory in device context.");

-    p_src_mem->set_data_handle(platform::to_void_reinterpret_cast(src_data));
-    p_dst_mem->set_data_handle(dst_data);
+    src_memory->set_data_handle(platform::to_void_cast(x_data));
+    dst_memory->set_data_handle(y_data);
   }

   // push primitive to stream and wait until it's executed
-  std::vector<mkldnn::primitive> pipeline = {*(p_fwd.get())};
-  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+  std::vector<primitive> pipeline;
+  pipeline.push_back(*p_fwd);
+  stream(stream::kind::eager).submit(pipeline).wait();
+
+  y->set_layout(DataLayout::kMKLDNN);
+  y->set_format(GetMKLDNNFormat(*dst_memory));
 }

-template <typename T, typename ExecContext>
-void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
-                  const T alpha = 0, const T beta = 0) {
+template <typename T>
+void eltwise_grad(const framework::ExecutionContext &ctx,
+                  mkldnn::algorithm algorithm, const T alpha = 0,
+                  const T beta = 0) {
   auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
   const auto &mkldnn_engine = dev_ctx.GetEngine();

-  // get buffers
-  const auto *out = ctx.template Input<Tensor>("Out");
-  auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
-  const auto *diff_dst = dout->template data<T>();
+  const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
+  auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));

-  auto *dx =
-      ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
-  const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());
+  const T *diff_y_data = diff_y->data<T>();
+  T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());

-  // get memory dim
-  std::vector<int> src_tz = framework::vectorize2int(out->dims());
+  std::vector<int> diff_dst_tz = framework::vectorize2int(diff_y->dims());

-  const std::string key = gethash(src_tz, algorithm);
-  const std::string key_diff_src_mem = key + "@eltwise_diff_src_mem";
-  const std::string key_diff_dst_mem = key + "@eltwise_diff_dst_mem";
-  const std::string key_grad = key + "@eltwise_grad";
+  auto diff_y_format =
+      diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format();
+
+  const std::string key = gethash(diff_dst_tz, algorithm);
+  const std::string key_src_data =
+      key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
+  const std::string key_src_layout =
+      key + ctx.op().Input("Out") + "@eltwise_fwd_src_layout";
+  const auto p_src_layout = std::static_pointer_cast<memory::format>(
+      dev_ctx.GetBlob(key_src_layout));
+  const std::string key_src_mem =
+      key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem";
+  const std::string key_fwd_pd =
+      key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd";
+  const std::string key_with_layouts =
+      key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format);
+  const std::string key_diff_src_mem =
+      key_with_layouts + "@eltwise_diff_src_mem";
+  const std::string key_diff_dst_mem =
+      key_with_layouts + "@eltwise_diff_dst_mem";
+  const std::string key_grad = key_with_layouts + "@eltwise_grad";

   const auto p_src_data =
       std::static_pointer_cast<T *>(dev_ctx.GetBlob(key_src_data));

-  const std::string key_src_mem = key + "@eltwise_fwd_src_mem";
-  auto p_src_mem =
+  auto src_memory =
       std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
-  p_src_mem->set_data_handle(*p_src_data.get());
+  PADDLE_ENFORCE(src_memory != nullptr,
+                 "Fail to find src_memory in device context");
+  src_memory->set_data_handle(*p_src_data.get());
+
+  std::shared_ptr<memory> diff_src_memory;

-  auto p_grad = std::static_pointer_cast<mkldnn::eltwise_forward::primitive>(
+  auto p_grad = std::static_pointer_cast<mkldnn::eltwise_backward>(
       dev_ctx.GetBlob(key_grad));

   if (p_grad == nullptr) {
-    // create memory description
-    auto data_md = src_tz.size() == 2
-                       ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nc)
-                       : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nchw);
-    // create memory primitives
-    std::shared_ptr<void> p_diff_src_mem =
-        std::make_shared<mkldnn::memory>(mkldnn::memory(
-            {data_md, mkldnn_engine}, platform::to_void_cast(diff_src)));
-    dev_ctx.SetBlob(key_diff_src_mem, p_diff_src_mem);
-    std::shared_ptr<void> p_diff_dst_mem =
-        std::make_shared<mkldnn::memory>(mkldnn::memory(
-            {data_md, mkldnn_engine}, platform::to_void_cast(diff_dst)));
-    dev_ctx.SetBlob(key_diff_dst_mem, p_diff_dst_mem);
-
-    auto bwd_desc = mkldnn::eltwise_backward::desc(algorithm, data_md, data_md,
-                                                   alpha, beta);
-
-    const std::string key_fwd_pd = key + "eltwise_fwd_pd";
-    auto *p_fwd_pd = static_cast<mkldnn::eltwise_forward::primitive_desc *>(
-        dev_ctx.GetBlob(key_fwd_pd).get());
-
-    auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
-        bwd_desc, mkldnn_engine, *p_fwd_pd);
-
+    // create mkldnn memory for input diff_y
+    auto diff_dst_md = platform::MKLDNNMemDesc(
+        diff_dst_tz, platform::MKLDNNGetDataType<T>(), diff_y_format);
+    auto diff_dst_memory = std::shared_ptr<memory>(
+        new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data)));
+    dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory);
+
+    // retrieve eltwise primitive desc from device context
+    auto forward_pd =
+        std::static_pointer_cast<mkldnn::eltwise_forward::primitive_desc>(
+            dev_ctx.GetBlob(key_fwd_pd));
+    PADDLE_ENFORCE(forward_pd != nullptr,
+                   "Fail to find eltwise_fwd_pd in device context");
+
+    // ceate primitive descriptor for activation backward
+    auto backward_desc = mkldnn::eltwise_backward::desc(
+        algorithm, diff_dst_memory->get_primitive_desc().desc(),
+        src_memory->get_primitive_desc().desc(), alpha, beta);
+    auto backward_pd = mkldnn::eltwise_backward::primitive_desc(
+        backward_desc, mkldnn_engine, *forward_pd);
+
+    // create mkldnn memory for output diff_src
+    diff_src_memory = std::make_shared<memory>(
+        backward_pd.diff_src_primitive_desc(), diff_x_data);
+    dev_ctx.SetBlob(key_diff_src_mem, diff_src_memory);
+
+    // create activation backward primitive
     p_grad = std::make_shared<mkldnn::eltwise_backward>(
-        eltwise_bwd_prim_desc, *static_cast<mkldnn::memory *>(p_src_mem.get()),
-        *(static_cast<mkldnn::memory *>(p_diff_dst_mem.get())),
-        *(static_cast<mkldnn::memory *>(p_diff_src_mem.get())));
+        backward_pd, *src_memory, *diff_dst_memory, *diff_src_memory);
     dev_ctx.SetBlob(key_grad, p_grad);
   } else {
     // primitives already exist
-    auto p_diff_src_mem = std::static_pointer_cast<mkldnn::memory>(
+    diff_src_memory = std::static_pointer_cast<mkldnn::memory>(
         dev_ctx.GetBlob(key_diff_src_mem));
-    auto p_diff_dst_mem = std::static_pointer_cast<mkldnn::memory>(
+    auto diff_dst_memory = std::static_pointer_cast<mkldnn::memory>(
         dev_ctx.GetBlob(key_diff_dst_mem));
-    p_diff_src_mem->set_data_handle(
-        platform::to_void_reinterpret_cast(diff_src));
-    p_diff_dst_mem->set_data_handle(
-        platform::to_void_reinterpret_cast(diff_dst));
+    diff_src_memory->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_x_data));
+    diff_dst_memory->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_y_data));
   }

   // push primitive to stream and wait until it's executed
-  std::vector<mkldnn::primitive> pipeline = {*(p_grad.get())};
-  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+  std::vector<primitive> pipeline;
+  pipeline.push_back(*p_grad);
+  stream(stream::kind::eager).submit(pipeline).wait();
+
+  diff_x->set_layout(DataLayout::kMKLDNN);
+  diff_x->set_format(GetMKLDNNFormat(*diff_src_memory));
 }
 }  // anonymous namespace

 template <typename T, mkldnn::algorithm algorithm>
 struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
-  template <typename ExecContext>
-  void operator()(const ExecContext &ctx) const {
+  void operator()(const framework::ExecutionContext &ctx) const {
     eltwise_forward<T>(ctx, algorithm);
   }
 };

 template <typename T, mkldnn::algorithm algorithm>
 struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
-  template <typename ExecContext>
-  void operator()(const ExecContext &ctx) const {
+  void operator()(const framework::ExecutionContext &ctx) const {
     eltwise_grad<T>(ctx, algorithm);
   }
 };
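The essential change in this file is that the cached MKLDNN blobs are now keyed by input memory layout as well as by shape and algorithm (the new key_with_layout / key_with_layouts strings), so a cached primitive is never reused with data in a different format. As a rough illustration of that keying idea only, here is a toy Python sketch; the dict, function, and string format below are illustrative stand-ins, not the device-context API:

    # Toy cache keyed roughly the way the new code builds its blob keys:
    # gethash(dims, algorithm) plus a layout suffix.
    blob_cache = {}

    def get_or_create(dims, algorithm, layout, build):
        key = "{}-{}@{}".format("x".join(map(str, dims)), algorithm, layout)
        if key not in blob_cache:
            blob_cache[key] = build()
        return blob_cache[key]

    # Two calls with the same shape but different layouts no longer collide.
    get_or_create([8, 16, 32, 32], "relu", "nchw", lambda: "primitive#1")
    get_or_create([8, 16, 32, 32], "relu", "nChw8c", lambda: "primitive#2")
    print(len(blob_cache))  # 2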
paddle/fluid/operators/activation_op.cc

@@ -19,18 +19,20 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

+using paddle::framework::Tensor;
+
 #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)                   \
   class OP_NAME##OpMaker                                                    \
       : public ::paddle::framework::OpProtoAndCheckerMaker {                \
    public:                                                                  \
     void Make() override {                                                  \
       AddInput("X", "Input of " #OP_NAME " operator");                      \
       AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X");       \
       AddAttr<bool>("use_mkldnn",                                           \
-                    "(default false) Only used in mkldnn kernel")           \
+                    "(bool, default false) Only used in mkldnn kernel")     \
           .SetDefault(false);                                               \
-      AddComment(OP_COMMENT);                                               \
+      AddComment(#OP_COMMENT);                                              \
     }                                                                       \
   }

 #define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE)             \

@@ -58,7 +60,6 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
                                       const framework::OperatorWithKernel& oper,
                                       const std::string& name) {
   framework::LibraryType library{framework::LibraryType::kPlain};
-  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
 #ifdef PADDLE_WITH_MKLDNN
   auto it = oper.Attrs().find("use_mkldnn");

@@ -82,6 +83,7 @@ class ActivationOp : public framework::OperatorWithKernel {
     ctx->ShareLoD("X", /*->*/ "Out");
   }

+ protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return GetKernelType(ctx, *this, "X");

@@ -96,6 +98,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
   }

+ protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return GetKernelType(ctx, *this, "Out");
paddle/fluid/operators/concat_op.h

@@ -60,34 +60,45 @@ template <typename DeviceContext, typename T>
 class ConcatGradKernel : public framework::OpKernel<T> {
  public:
  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto out_var_names = ctx.Outputs(framework::GradVarName("X"));
     auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
     int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));

+    // get output tensor that the name is not kEmptyVarName
+    std::vector<framework::Tensor*> outputs;
+    for (size_t j = 0; j < outs.size(); ++j) {
+      if (out_var_names[j] != framework::kEmptyVarName) {
+        outs[j]->mutable_data<T>(ctx.GetPlace());
+        outputs.push_back(outs[j]);
+      } else {
+        outputs.push_back(nullptr);
+      }
+    }
+
     // Sometimes direct copies will be faster, this maybe need deeply analysis.
     if (axis == 0 && outs.size() < 10) {
       size_t input_offset = 0;
-      auto in_stride = framework::stride_numel(in->dims());
+      const auto in_stride = framework::stride_numel(out_grad->dims());

-      for (auto& out : outs) {
-        out->mutable_data<T>(ctx.GetPlace());
-        auto out_stride = framework::stride_numel(out->dims());
-        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
-                                    out_stride, in->data<T>() + input_offset,
-                                    in_stride, out_stride[axis]);
+      for (size_t i = 0; i < outs.size(); ++i) {
+        auto out_stride = framework::stride_numel(ins[i]->dims());
+        auto* out = outputs[i];
+        if (out != nullptr) {
+          StridedNumelCopyWithAxis<T>(
+              ctx.device_context(), axis, out->data<T>(), out_stride,
+              out_grad->data<T>() + input_offset, in_stride, out_stride[axis]);
+        }
         input_offset += out_stride[axis];
       }
     } else {
-      std::vector<framework::Tensor> outputs(outs.size());
-      for (size_t j = 0; j < outs.size(); ++j) {
-        outs[j]->mutable_data<T>(ctx.GetPlace());
-        outputs[j] = *outs[j];
-      }
-
       auto& dev_ctx = ctx.template device_context<DeviceContext>();
       paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
           concat_grad_functor;
-      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), &outputs);
+      concat_grad_functor(dev_ctx, *out_grad, ins, static_cast<int>(axis),
+                          &outputs);
     }
   }
 };
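The gist of the new ConcatGradKernel (and of the ConcatGradFunctor signature change below) is that the gradient of concat is just the output gradient split back along `axis` into pieces shaped like the original inputs, and pieces for inputs whose gradient variable is empty are simply skipped. A rough NumPy sketch of that behaviour (function and argument names here are illustrative, not the operator's API):

    import numpy as np

    def concat_grad(out_grad, ins, axis, need_grad):
        """Split out_grad along `axis` into pieces shaped like `ins`;
        return None for inputs whose gradient is not requested."""
        sizes = [x.shape[axis] for x in ins]
        pieces = np.split(out_grad, np.cumsum(sizes)[:-1], axis=axis)
        return [p if need else None for p, need in zip(pieces, need_grad)]

    a, b = np.ones((2, 3)), np.ones((2, 2))
    out_grad = np.arange(10.0).reshape(2, 5)
    grads = concat_grad(out_grad, [a, b], axis=1, need_grad=[True, False])
    print(grads[0].shape, grads[1])  # (2, 3) None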
paddle/fluid/operators/detection_map_op.cc

@@ -175,12 +175,12 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Detection mAP evaluate operator.
 The general steps are as follows. First, calculate the true positive and
-false positive according to the input of detection and labels, then
-calculate the mAP evaluate value.
-Supporting '11 point' and 'integral' mAP algorithm. Please get more information
-from the following articles:
-https://sanchom.wordpress.com/tag/average-precision/
-https://arxiv.org/abs/1512.02325
+false positive according to the input of detection and labels, then
+calculate the mAP evaluate value.
+Supporting '11 point' and 'integral' mAP algorithm. Please get more information
+from the following articles:
+https://sanchom.wordpress.com/tag/average-precision/
+https://arxiv.org/abs/1512.02325

 )DOC");
   }
paddle/fluid/operators/math/concat.cc

@@ -70,35 +70,40 @@ template <typename T>
 class ConcatGradFunctor<platform::CPUDeviceContext, T> {
  public:
  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>* outputs) {
+                  const framework::Tensor& input,
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs) {
     // TODO(zcd): Add input data validity checking
-    int num = outputs->size();
+    size_t num = outputs->size();
     int input_rows = 1;
-    auto dim_0 = outputs->at(0).dims();
+    auto dim_0 = ref_inputs[0]->dims();
     for (int i = 0; i < axis; ++i) {
       input_rows *= dim_0[i];
     }
+
     int input_cols = 0;

     std::vector<int64_t> output_cols(outputs->size());
-    for (int i = 0; i < num; ++i) {
-      int t_cols = outputs->at(i).numel() / input_rows;
+    for (size_t i = 0; i < num; ++i) {
+      int t_cols = ref_inputs[i]->numel() / input_rows;
       input_cols += t_cols;
       output_cols[i] = t_cols;
     }
     auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());

     // computation
-    for (int k = 0; k < input_rows; ++k) {
+    for (size_t k = 0; k < input_rows; ++k) {
       const T* src_ptr = input.data<T>() + k * input_cols;
       int col_idx = 0;
       for (int j = 0; j < num; ++j) {
         int col_len = output_cols[j];
-        T* dst_ptr = outputs->at(j).data<T>() + k * col_len;
-        memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
-                     sizeof(T) * col_len);
+        auto* out_tensor = outputs->at(j);
+        if (out_tensor != nullptr) {
+          T* dst_ptr = out_tensor->data<T>() + k * col_len;
+          memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
+                       sizeof(T) * col_len);
+        }
         col_idx += col_len;
       }
     }
paddle/fluid/operators/math/concat.cu

@@ -102,10 +102,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
     int local_col = tid_x - curr_offset;
     int segment_width = curr_col_offset - curr_offset;
     T* output_ptr = outputs_data[curr_segment];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-      output_ptr[tid_y * segment_width + local_col] =
-          input_data[tid_y * in_col + tid_x];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * segment_width + local_col] =
+            input_data[tid_y * in_col + tid_x];
+    }
   }
 }

@@ -118,10 +120,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
     int split = tid_x / fixed_out_col;
     int in_offset = tid_x - split * fixed_out_col;
     T* output_ptr = outputs_data[split];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-      output_ptr[tid_y * fixed_out_col + in_offset] =
-          input_data[tid_y * in_col + tid_x];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * fixed_out_col + in_offset] =
+            input_data[tid_y * in_col + tid_x];
+    }
   }
 }

@@ -203,17 +207,18 @@ template <typename T>
 class ConcatGradFunctor<platform::CUDADeviceContext, T> {
  public:
  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>* outputs) {
+                  const framework::Tensor& input,
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs) {
     // TODO(zcd): Add input data validity checking
     int o_num = outputs->size();
     int out_row = 1;
-    auto dim_0 = outputs->at(0).dims();
+    auto dim_0 = ref_inputs[0]->dims();
     for (int i = 0; i < axis; ++i) {
       out_row *= dim_0[i];
     }

-    int out_col = outputs->at(0).numel() / out_row;
+    int out0_col = ref_inputs[0]->numel() / out_row;
     int in_col = 0, in_row = out_row;
     bool sameShape = true;

@@ -223,13 +228,17 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
-      int t_col = outputs->at(i).numel() / out_row;
+      int t_col = outputs->at(i)->numel() / out_row;
       if (sameShape) {
-        if (t_col != out_col) sameShape = false;
+        if (t_col != out0_col) sameShape = false;
       }
       in_col += t_col;
       outputs_cols[i + 1] = in_col;
-      outputs_ptr[i] = outputs->at(i).data<T>();
+      if (outputs->at(i) != nullptr) {
+        outputs_ptr[i] = outputs->at(i)->data<T>();
+      } else {
+        outputs_ptr[i] = nullptr;
+      }
     }

     T** dev_out_gpu_data =

@@ -255,7 +264,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
     if (sameShape) {
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), in_row, in_col, out_col, dev_out_gpu_data);
+          input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
     } else {
       const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
paddle/fluid/operators/math/concat.h

@@ -57,7 +57,8 @@ template <typename DeviceContext, typename T>
 class ConcatGradFunctor {
  public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const int axis, std::vector<framework::Tensor>* outputs);
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs);
 };

 }  // namespace math
paddle/fluid/platform/cpu_info.cc

@@ -28,9 +28,15 @@ DEFINE_double(fraction_of_cpu_memory_to_use, 1,
               "Default use 100% of CPU memory for PaddlePaddle,"
               "reserve the rest for page tables, etc");

-DEFINE_uint64(initial_cpu_memory_in_mb, 500,
-              "Default initial 500MB of CPU memory for PaddlePaddle, in MD unit.");
+DEFINE_uint64(initial_cpu_memory_in_mb,
+#ifdef PADDLE_WITH_MKLDNN
+              /* Aligned with mozga-intel, MKLDNN need at least 5000 MB
+               * to obtain the best performance*/
+              5000,
+#else
+              500,
+#endif
+              "Initial CPU memory for PaddlePaddle, in MD unit.");

 DEFINE_double(fraction_of_cuda_pinned_memory_to_use, 0.5,

@@ -59,10 +65,7 @@ inline size_t CpuTotalPhysicalMemory() {
 size_t CpuMaxAllocSize() {
   // For distributed systems, it requires configuring and limiting
   // the fraction of memory to use.
-  return std::min(
-      static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use *
-                          CpuTotalPhysicalMemory()),
-      static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
+  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
 }

 size_t CpuMinChunkSize() {

@@ -71,8 +74,11 @@ size_t CpuMinChunkSize() {
 }

 size_t CpuMaxChunkSize() {
-  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory.
-  return CpuMaxAllocSize() / 32;
+  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
+  // or the initial_cpu_memory_in_mb.
+  return std::min(
+      static_cast<size_t>(CpuMaxAllocSize() / 32),
+      static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
 }

 size_t CUDAPinnedMaxAllocSize() {
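A quick worked example of the new CpuMaxChunkSize() bound, as a small Python sketch; the 8 GB physical-memory figure and the flag values below are assumed for illustration, not taken from the diff:

    # Mirror of the arithmetic in cpu_info.cc above, with assumed inputs.
    fraction_of_cpu_memory_to_use = 1.0
    initial_cpu_memory_in_mb = 500          # 5000 when built with MKLDNN
    total_physical = 8 * 1024 ** 3          # assume an 8 GB machine

    cpu_max_alloc = int(fraction_of_cpu_memory_to_use * total_physical)
    cpu_max_chunk = min(cpu_max_alloc // 32,             # ~3% of usable memory
                        initial_cpu_memory_in_mb << 20)  # flag value in bytes
    print(cpu_max_chunk)  # 268435456 -> min(256 MB, 500 MB) = 256 MB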
paddle/testing/paddle_gtest_main.cc

@@ -30,8 +30,9 @@ int main(int argc, char** argv) {
   new_argv.push_back(
       strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
 #else
-  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn"));
-  new_argv.push_back(strdup("--undefok=use_mkldnn"));
+  new_argv.push_back(strdup(
+      "--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_mb"));
+  new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
 #endif
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();
python/paddle/fluid/__init__.py

@@ -117,7 +117,7 @@ def __bootstrap__():
     read_env_flags = [
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
-        'eager_delete_scope', 'use_mkldnn'
+        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb'
     ]
     if core.is_compiled_with_cuda():
         read_env_flags += [
python/paddle/fluid/framework.py

@@ -644,7 +644,13 @@ class Operator(object):
     def set_attr(self, name, val):
         self.attrs[name] = val
-        self.desc.set_attr(name, val)
+        if isinstance(val, Block):
+            self.desc.set_block_attr(name, val.desc)
+        elif isinstance(val, core.BlockDesc) or \
+                isinstance(val, core.ProgramDesc):
+            self.desc.set_serialized_attr(name, val.serialize_to_string())
+        else:
+            self.desc.set_attr(name, val)

     @property
     def attr_names(self):
python/paddle/fluid/layers/detection.py

@@ -16,7 +16,7 @@ All layers just related to the detection neural network.
 """

 from layer_function_generator import generate_layer_fn
-from layer_function_generator import autodoc
+from layer_function_generator import autodoc, templatedoc
 from ..layer_helper import LayerHelper
 import tensor
 import nn

@@ -155,7 +155,7 @@ def detection_output(loc,
     return nmsed_outs


-@autodoc()
+@templatedoc()
 def detection_map(detect_res,
                   label,
                   class_num,

@@ -166,6 +166,47 @@ def detection_map(detect_res,
                   input_states=None,
                   out_states=None,
                   ap_version='integral'):
+    """
+    ${comment}
+
+    Args:
+        detect_res: ${detect_res_comment}
+        label: ${label_comment}
+        class_num: ${class_num_comment}
+        background_label: ${background_label_comment}
+        overlap_threshold: ${overlap_threshold_comment}
+        evaluate_difficult: ${evaluate_difficult_comment}
+        has_state: ${has_state_comment}
+        input_states: If not None, It contains 3 elements:
+            1. pos_count ${pos_count_comment}.
+            2. true_pos ${true_pos_comment}.
+            3. false_pos ${false_pos_comment}.
+        out_states: If not None, it contains 3 elements.
+            1. accum_pos_count ${accum_pos_count_comment}.
+            2. accum_true_pos ${accum_true_pos_comment}.
+            3. accum_false_pos ${accum_false_pos_comment}.
+        ap_version: ${ap_type_comment}
+
+    Returns:
+        ${map_comment}
+
+    Examples:
+        .. code-block:: python
+
+            detect_res = fluid.layers.data(
+                name='detect_res',
+                shape=[10, 6],
+                append_batch_size=False,
+                dtype='float32')
+            label = fluid.layers.data(
+                name='label',
+                shape=[10, 6],
+                append_batch_size=False,
+                dtype='float32')
+
+            map_out = fluid.layers.detection_map(detect_res, label, 21)
+    """
     helper = LayerHelper("detection_map", **locals())

     def __create_var(type):
python/paddle/fluid/layers/nn.py

@@ -93,6 +93,7 @@ __all__ = [
     'mean_iou',
     'relu',
     'log',
+    'crop',
 ]

@@ -5003,3 +5004,101 @@ def mean_iou(input, label, num_classes):
         },
         attrs={"num_classes": num_classes})
     return out_mean_iou, out_wrong, out_correct
+
+
+def crop(x, shape=None, offsets=None, name=None):
+    """
+    Crop input into output, as specified by offsets and shape.
+
+    .. code-block:: text
+
+        * Case 1:
+            Given
+                X = [[0, 1, 2, 0, 0]
+                     [0, 3, 4, 0, 0]
+                     [0, 0, 0, 0, 0]],
+            and
+                shape = [2, 2],
+                offsets = [0, 1],
+            output is:
+                Out = [[1, 2],
+                       [3, 4]].
+        * Case 2:
+            Given
+                X = [[0, 1, 2, 5, 0]
+                     [0, 3, 4, 6, 0]
+                     [0, 0, 0, 0, 0]],
+            and shape is tensor
+                shape = [[0, 0, 0]
+                         [0, 0, 0]]
+            and
+                offsets = [0, 1],
+            output is:
+                Out = [[1, 2, 5],
+                       [3, 4, 6]].
+
+    Args:
+        x (Variable): The input tensor variable.
+        shape (Variable|list/tuple of integer): The output shape is specified
+            by `shape`, which can a Variable or a list/tupe of integer.
+            If a tensor Variable, it's rank must be the same as `x`. This way
+            is suitable for the case that the output shape may be changed each
+            iteration. If a list/tupe of integer, it's length must be the same
+            as the rank of `x`
+        offsets (Variable|list/tuple of integer|None): Specifies the copping
+            offsets at each dimension. It can be a Variable or or a list/tupe
+            of integer. If a tensor Variable, it's rank must be the same as `x`.
+            This way is suitable for the case that the offsets may be changed
+            each iteration. If a list/tupe of integer, it's length must be the
+            same as the rank of `x`. If None, the offsets are 0 at each
+            dimension.
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: The cropped tensor variable.
+
+    Raises:
+        ValueError: If shape is not a list, tuple or Variable.
+
+    Examples:
+
+        .. code-block:: python
+
+            x = fluid.layers.data(name="x", shape=[3, 5], dtype="float32")
+            y = fluid.layers.data(name="y", shape=[2, 3], dtype="float32")
+            crop = fluid.layers.crop(x, shape=y)
+
+            # or
+            z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32")
+            crop = fluid.layers.crop(z, shape=[2, 3])
+
+    """
+    helper = LayerHelper('crop', **locals())
+
+    if not (isinstance(shape, list) or isinstance(shape, tuple) or \
+            isinstance(shape, Variable)):
+        raise ValueError("The shape should be a list, tuple or Variable.")
+
+    if offsets is None:
+        offsets = [0] * len(x.shape)
+
+    out = helper.create_tmp_variable(x.dtype)
+    ipts = {'X': x}
+    attrs = {}
+    if isinstance(shape, Variable):
+        ipts['Y'] = shape
+    else:
+        attrs['shape'] = shape
+    if isinstance(offsets, Variable):
+        ipts['Offsets'] = offsets
+    else:
+        attrs['offsets'] = offsets
+
+    helper.append_op(
+        type='crop',
+        inputs=ipts,
+        outputs={'Out': out},
+        attrs=None if len(attrs) == 0 else attrs)
+    return out
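To make the crop semantics in the new docstring concrete, here is a minimal NumPy sketch of Case 1; this is an illustration of the documented behaviour only, the actual crop kernel is implemented in C++:

    import numpy as np

    x = np.array([[0, 1, 2, 0, 0],
                  [0, 3, 4, 0, 0],
                  [0, 0, 0, 0, 0]])
    shape, offsets = [2, 2], [0, 1]

    # Take a shape-sized window starting at the given offsets.
    out = x[offsets[0]:offsets[0] + shape[0],
            offsets[1]:offsets[1] + shape[1]]
    print(out)  # [[1 2]
                #  [3 4]]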
python/paddle/fluid/optimizer.py

@@ -26,10 +26,10 @@ from clip import append_gradient_clip_ops, error_clip_callback
 from contextlib import contextmanager

 __all__ = [
-    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
+    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
     'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
-    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'AdadeltaOptimizer',
-    'RMSPropOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer'
+    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
+    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer', 'RMSPropOptimizer'
 ]

@@ -794,7 +794,7 @@ class AdadeltaOptimizer(Optimizer):
         E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2

     Args:
-        learning_rate(float): global leraning rate
+        learning_rate(float): global learning rate
         rho(float): rho in equation
         epsilon(float): epsilon in equation

@@ -895,9 +895,9 @@ class RMSPropOptimizer(Optimizer):
     Args:
-        learning_rate(float): global leraning rate.
-        rho(float): rho is :math:`\\rho` in equation, set 0.95 by default.
-        epsilon(float): :math:`\\epsilon` in equation is smoothing term to
+        learning_rate(float): global learning rate.
+        rho(float): rho is :math:`\\rho` in equation, set 0.95 by default.
+        epsilon(float): :math:`\\epsilon` in equation is smoothing term to
             avoid division by zero, set 1e-6 by default.
         momentum(float): :math:`\\beta` in equation is the momentum term,
             set 0.0 by default.

@@ -976,6 +976,113 @@ class RMSPropOptimizer(Optimizer):
         return rmsprop_op


+class FtrlOptimizer(Optimizer):
+    """
+    FTRL (Follow The Regularized Leader) Optimizer.
+
+    The paper that proposed Follow The Regularized Leader (FTRL):
+    (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
+
+    .. math::
+
+        &new\_accum = squared\_accum + grad^2
+
+        &if (lr\_power == -0.5):
+
+        &\quad linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param}
+
+        &else:
+
+        &\quad linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - accum^{-lr\_power}}{learning\_rate * param}
+
+        &x = l1 * sign(linear\_accum) - linear\_accum
+
+        &if (lr\_power == -0.5):
+
+        &\quad y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2)
+
+        &\quad pre\_shrink = \\frac{x}{y}
+
+        &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)
+
+        &else:
+
+        &\quad y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2)
+
+        &\quad pre\_shrink = \\frac{x}{y}
+
+        &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)
+
+        &squared\_accum += grad^2
+
+    Args:
+        learning_rate (float|Variable): global learning rate.
+        l1 (float):
+        l2 (float):
+        lr_power (float):
+
+    Raises:
+        ValueError: If learning_rate, rho, epsilon, momentum are None.
+
+    Examples:
+        .. code-block:: python
+
+              optimizer = fluid.optimizer.Ftrl(0.0001)
+              _, params_grads = optimizer.minimize(cost)
+    """
+
+    _squared_acc_str = "squared"
+    _linear_acc_str = "linear"
+
+    def __init__(self, learning_rate, l1=0.0, l2=0.0, lr_power=-0.5, **kwargs):
+        super(FtrlOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+
+        self.type = "ftrl"
+        self._l1 = l1
+        self._l2 = l2
+        self._lr_power = lr_power
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._squared_acc_str, p)
+            self._add_accumulator(self._linear_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        squared_acc = self._get_accumulator(self._squared_acc_str,
+                                            param_and_grad[0])
+        linear_acc = self._get_accumulator(self._linear_acc_str,
+                                           param_and_grad[0])
+        ftrl_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "SquaredAccumulator": squared_acc,
+                "LinearAccumulator": linear_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "SquaredAccumOut": squared_acc,
+                "LinearAccumOut": linear_acc
+            },
+            attrs={"l1": self._l1,
+                   "l2": self._l1,
+                   "lr_power": self._lr_power})
+
+        return ftrl_op
+
+
 # We short the class name, since users will use the optimizer with the package
 # name. The sample code:
 #

@@ -992,6 +1099,7 @@ Adamax = AdamaxOptimizer
 DecayedAdagrad = DecayedAdagradOptimizer
 Adadelta = AdadeltaOptimizer
 RMSProp = RMSPropOptimizer
+Ftrl = FtrlOptimizer


 class ModelAverage(Optimizer):
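As a sanity check of the FTRL equations quoted in the new docstring, here is a direct NumPy transcription of a single update step. It is a sketch of the documented math only, not of the C++ ftrl kernel, and it assumes a parameter array with no zero entries so the division in the linear-accumulator term is well defined:

    import numpy as np

    def ftrl_update(param, grad, squared_accum, linear_accum,
                    learning_rate, l1, l2, lr_power=-0.5):
        # new_accum = squared_accum + grad^2
        new_accum = squared_accum + grad ** 2
        if lr_power == -0.5:
            linear_accum = linear_accum + grad - (
                np.sqrt(new_accum) - np.sqrt(squared_accum)) / (
                    learning_rate * param)
            y = np.sqrt(new_accum) / learning_rate + 2.0 * l2
        else:
            linear_accum = linear_accum + grad - (
                new_accum ** (-lr_power) - squared_accum ** (-lr_power)) / (
                    learning_rate * param)
            y = new_accum ** (-lr_power) / learning_rate + 2.0 * l2
        x = l1 * np.sign(linear_accum) - linear_accum
        pre_shrink = x / y
        # the parameter is shrunk to 0 unless |linear_accum| exceeds l1
        param = np.where(np.abs(linear_accum) > l1, pre_shrink, 0.0)
        squared_accum = squared_accum + grad ** 2
        return param, squared_accum, linear_accum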
python/paddle/fluid/profiler.py

@@ -42,6 +42,9 @@ def cuda_profiler(output_file, output_mode=None, config=None):
     counters/options for profiling by `config` argument. The default config
     is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d',
     'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].
+    Then users can use NVIDIA Visual Profiler
+    (https://developer.nvidia.com/nvidia-visual-profiler) tools to load this
+    this output file to visualize results.

     Args:
         output_file (string) : The output file name, the result will be

@@ -50,6 +53,33 @@ def cuda_profiler(output_file, output_mode=None, config=None):
             Comma separated values format. It should be 'kvp' or 'csv'.
         config (list of string) : The profiler options and counters can refer
             to "Compute Command Line Profiler User Guide".
+
+    Raises:
+        ValueError: If `output_mode` is not in ['kvp', 'csv'].
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import paddle.fluid.profiler as profiler
+
+            epoc = 8
+            dshape = [4, 3, 28, 28]
+            data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+            conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+
+            place = fluid.CUDAPlace(0)
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+
+            output_file = 'cuda_profiler.txt'
+            with profiler.cuda_profiler(output_file, 'csv') as nvprof:
+                for i in range(epoc):
+                    input = np.random.random(dshape).astype('float32')
+                    exe.run(fluid.default_main_program(), feed={'data': input})
+            # then use NVIDIA Visual Profiler (nvvp) to load this output file
+            # to visualize results.
     """
     if output_mode is None:
         output_mode = 'csv'

@@ -69,19 +99,52 @@ def cuda_profiler(output_file, output_mode=None, config=None):
 def reset_profiler():
-    """The profiler clear interface.
-    reset_profiler will clear the previous time record.
+    """
+    Clear the previous time record. This interface does not work for
+    `fluid.profiler.cuda_profiler`, it only works for
+    `fluid.profiler.start_profiler`, `fluid.profiler.stop_profiler`,
+    and `fluid.profiler.profiler`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid.profiler as profiler
+            with profiler.profiler(state, 'total', '/tmp/profile'):
+                for iter in range(10):
+                    if iter == 2:
+                        profiler.reset_profiler()
+                    # ...
     """
     core.reset_profiler()


 def start_profiler(state):
-    """Enable the profiler.
+    """
+    Enable the profiler. Uers can use `fluid.profiler.start_profiler` and
+    `fluid.profiler.stop_profiler` to insert the code, except the usage of
+    `fluid.profiler.profiler` interface.
+
+    Args:
+        state (string) : The profiling state, which should be 'CPU', 'GPU'
+            or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
+            GPU as well. 'All' also generates timeline.
+
+    Raises:
+        ValueError: If `state` is not in ['CPU', 'GPU', 'All'].
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid.profiler as profiler
+
+            profiler.start_profiler('GPU')
+            for iter in range(10):
+                if iter == 2:
+                    profiler.reset_profiler()
+                # except each iteration
+            profiler.stop_profiler('total', '/tmp/profile')
     """
     if core.is_profiler_enabled():
         return

@@ -97,7 +160,10 @@ def start_profiler(state):
 def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
-    """Stop the profiler.
+    """
+    Stop the profiler. Uers can use `fluid.profiler.start_profiler` and
+    `fluid.profiler.stop_profiler` to insert the code, except the usage of
+    `fluid.profiler.profiler` interface.

     Args:
         sorted_key (string) : If None, the profiling results will be printed

@@ -111,6 +177,23 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
             The `ave` means sorting by the average execution time.
         profile_path (string) : If state == 'All', it will write a profile
             proto output file.
+
+    Raises:
+        ValueError: If `sorted_key` is not in
+            ['calls', 'total', 'max', 'min', 'ave'].
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid.profiler as profiler
+
+            profiler.start_profiler('GPU')
+            for iter in range(10):
+                if iter == 2:
+                    profiler.reset_profiler()
+                # except each iteration
+            profiler.stop_profiler('total', '/tmp/profile')
     """
     if not core.is_profiler_enabled():
         return

@@ -137,7 +220,12 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
     Different from cuda_profiler, this profiler can be used to profile both CPU
     and GPU program. By defalut, it records the CPU and GPU operator kernels,
     if you want to profile other program, you can refer the profiling tutorial
-    to add more records.
+    to add more records in C++ code.
+
+    If the state == 'All', a profile proto file will be written to
+    `profile_path`. This file records timeline information during the execution.
+    Then users can visualize this file to see the timeline, please refer
+    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md

     Args:
         state (string) : The profiling state, which should be 'CPU' or 'GPU',

@@ -156,6 +244,25 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
             The `ave` means sorting by the average execution time.
         profile_path (string) : If state == 'All', it will write a profile
             proto output file.
+
+    Raises:
+        ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is
+            not in ['calls', 'total', 'max', 'min', 'ave'].
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid.profiler as profiler
+
+            with profiler.profiler('All', 'total', '/tmp/profile') as prof:
+                for pass_id in range(pass_num):
+                    for batch_id, data in enumerate(train_reader()):
+                        exe.run(fluid.default_main_program(),
+                                feed=feeder.feed(data),
+                                fetch_list=[],
+                                use_program_cache=True)
+                        # ...
     """
     start_profiler(state)
     yield
python/paddle/fluid/tests/unittests/test_layers.py

@@ -401,6 +401,15 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(output)
         print(str(program))

+    def test_maxout(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 5], dtype="float32")
+            y = layers.data(name='y', shape=[2, 3], dtype="float32")
+            output = layers.crop(x, shape=y)
+            self.assertIsNotNone(output)
+        print(str(program))
+

 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/test_optimizer.py

@@ -434,5 +434,71 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)


+class TestFtrlOptimizer(unittest.TestCase):
+    class MockFtrl(optimizer.FtrlOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_squared_str(self):
+            return self._squared_acc_str
+
+        def get_linear_str(self):
+            return self._linear_acc_str
+
+    def test_ftrl_optimizer(self):
+        init_program = framework.Program()
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="mul.x",
+            optimize_attr={'learning_rate': 1.1})
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        learning_rate = 0.01
+        ftrl_optimizer = self.MockFtrl(
+            learning_rate=learning_rate, l1=0.0, l2=0.0, lr_power=-0.5)
+        params_grads = append_backward(mean_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
+        opts = ftrl_optimizer.create_optimization_pass(params_grads, mul_out,
+                                                       init_program)
+        self.assertEqual(len(opts), 3)
+        self.assertEqual([op.type for op in opts],
+                         ["fill_constant", "elementwise_mul", "ftrl"])
+
+        # Check accumulators
+        accumulators = ftrl_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 2)
+        self.assertTrue(ftrl_optimizer.get_squared_str() in accumulators)
+        self.assertTrue(ftrl_optimizer.get_linear_str() in accumulators)
+        squared_acc = accumulators[ftrl_optimizer.get_squared_str()]
+        linear_acc = accumulators[ftrl_optimizer.get_linear_str()]
+        self.assertEqual(len(squared_acc), 1)
+        self.assertEqual(len(linear_acc), 1)
+        self.assertTrue(mul_x.name in squared_acc)
+        self.assertTrue(mul_x.name in linear_acc)
+
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 3)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+
+
 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/transpiler/distribute_transpiler.py

@@ -24,7 +24,7 @@ Steps to transpile trainer:
 1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
 2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
 3. modify trainer program add split_op to each grad variable.
-4. append send_op to send splited variables to server and
+4. append send_op to send splited variables to server and
 5. add recv_op to fetch params(splited blocks or origin param) from server.
 6. append concat_op to merge splited blocks to update local weights.

@@ -44,7 +44,7 @@ import numpy as np
 from ps_dispatcher import RoundRobin, HashName, PSDispatcher
 from .. import core, framework
 from ..framework import Program, default_main_program, \
-    default_startup_program, \
+    default_startup_program, Block, \
     Variable, Parameter, grad_var_name
 from details import *

@@ -471,7 +471,7 @@ class DistributeTranspiler:
                     self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
                                              self.origin_program, merged_var)
                 else:
-                    self._append_pserver_non_opt_ops(block, op, endpoint)
+                    self._append_pserver_non_opt_ops(block, op)

         def __op_have_grad_input__(op):
             for varname in op.input_arg_names:

@@ -479,13 +479,39 @@ class DistributeTranspiler:
                     return varname
             return ""

+        def __clone_lr_op_sub_block__(op, program, new_block):
+            if not op.has_attr('sub_block'):
+                return
+
+            origin_block_desc = op.attr('sub_block')
+            origin_block = self.origin_program.block(origin_block_desc.id)
+            assert isinstance(origin_block, Block)
+            # we put the new sub block to new block to follow the block
+            # hierarchy of the original blocks
+            new_sub_block = program.create_block(new_block.idx)
+
+            # clone vars
+            for var in origin_block.vars:
+                new_sub_block.clone_variable(var)
+
+            # clone ops
+            for op in origin_block.ops:
+                self._clone_lr_op(program, new_sub_block, op)
+                # clone sub_block of op
+                __clone_lr_op_sub_block__(op, program, new_sub_block)
+
+            # reset the block of op
+            op.set_attr('sub_block', new_sub_block)
+
         # append lr decay ops to the child block if exists
         lr_ops = self._get_lr_ops()
         if len(lr_ops) > 0:
             lr_decay_block = pserver_program.create_block(
                 pserver_program.num_blocks - 1)
             for _, op in enumerate(lr_ops):
-                self._append_pserver_non_opt_ops(lr_decay_block, op, endpoint)
+                self._append_pserver_non_opt_ops(lr_decay_block, op)
+                # append sub blocks to pserver_program in lr_decay_op
+                __clone_lr_op_sub_block__(op, pserver_program, lr_decay_block)

         # append op to the current block
         grad_to_block_id = []

@@ -1116,7 +1142,29 @@ class DistributeTranspiler:
                 break
         return grad_block

-    def _append_pserver_non_opt_ops(self, optimize_block, opt_op, endpoint):
+    def _clone_lr_op(self, program, block, op):
+        inputs = self._get_input_map_from_op(
+            self.origin_program.global_block().vars, op)
+        for key, varlist in inputs.iteritems():
+            if not isinstance(varlist, list):
+                varlist = [varlist]
+            for var in varlist:
+                if var not in program.global_block().vars:
+                    block.clone_variable(var)
+
+        outputs = self._get_output_map_from_op(
+            self.origin_program.global_block().vars, op)
+        for key, varlist in outputs.iteritems():
+            if not isinstance(varlist, list):
+                varlist = [varlist]
+            for var in varlist:
+                if var not in program.global_block().vars:
+                    block.clone_variable(var)
+
+        block.append_op(
+            type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs)
+
+    def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
         program = optimize_block.program
         # Append the ops for parameters that do not need to be optimized/updated
         inputs = self._get_input_map_from_op(
python/paddle/fluid/transpiler/inference_transpiler.py

@@ -19,16 +19,30 @@ from ..executor import global_scope
 class InferenceTranspiler:
+    '''
+    Convert the fluid program to optimized inference program.
+
+    There are several optimizations, only fuse batch normalization is supported now.
+
+    Examples:
+
+    .. code-block:: python
+
+        # As InferenceTranspiler will modify the original program,
+        # please clone before use it.
+        inference_transpiler_program = program.clone()
+        t = fluid.InferenceTranspiler()
+        t.transpile(inference_transpiler_program, place)
+    '''
+
     def transpile(self, program, place, scope=None):
         '''
-        Transpile the program. Support only fuse batch normalization now.
-
-        :param program: program to transpile
-        :type program: Program
-        :param place: inference place
-        :type place: Place
-        :param scope: inference scope
-        :type scope: Scope or None
+        Run the transpiler.
+
+        Args:
+            program (Program): program to transpile
+            place (Place): inference place
+            scope (Scope|None): inference Scope
         '''
         if not isinstance(program, Program):
             raise TypeError("program should be as Program type")

@@ -49,36 +63,43 @@ class InferenceTranspiler:
         can be integrated with them. Doing so will give us a forward acceleration,
         especially in environments like mobile or embedded.

-        For input X:
-        - Conv process:        X = input * W + bias
-        - Batch norm process:  X' = (X - mean) / std
-        - Scale Process:       Y = a * X' + b
+        For input :math:`X`:
+
+        - Conv process:        :math:`X = input * W + bias`
+        - Batch norm process:  :math:`X' = (X - mean) / std`
+        - Scale Process:       :math:`Y = a * X' + b`

         After fuse into one operation:

-        Y = (input * W + bias - mean) / std * a + b
-          = input * a * W / std + ((bias - mean) / std * a + b)
+        .. math::
+
+            Y &= (input * W + bias - mean) / std * a + b \\\\
+              &= input * a * W / std + ((bias - mean) / std * a + b)

         The operator transformation is:
+
         - before:
+
           - conv->batch_norm->any_other_op (bias == 0)
           - conv->elementwise_add->batch_norm->any_other_op (bias != 0)
+
         - after:
+
           - conv->elementwise_add->any_other_op
+
         The transpile stages are:
+
         1. insert elementwise_add op when bias == 0.
         2. fuse the batch_norm's parameters to conv and elementwise_add operators.
         3. remove batch_norm ops which are not used in any other ops.
        4. adjust the input of any_other_op to be the output of elementwise_add operator.
         5. remove unused variables.

-        :param program: program to transpile
-        :type program: Program
-        :param place: inference place
-        :type place: Place
-        :param scope: inference scope
-        :type scope: Scope
+        Args:
+            program (Program): program to transpile
+            place (Place): inference place
+            scope (Scope): inference Scope
         '''
         self.scope = scope
         self.place = place
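The fusion formula in the docstring can be checked numerically: folding batch normalization into the convolution's weight and bias gives the same output. Below is a small NumPy check of that algebra, treating the convolution as a plain matrix multiply for brevity; it illustrates the documented formula only, not the transpiler's implementation:

    import numpy as np

    rng = np.random.RandomState(0)
    inp = rng.randn(4, 3)              # a batch of inputs
    W = rng.randn(3, 5)                # "conv" weight, as a plain matmul
    bias = rng.randn(5)
    mean, std = rng.randn(5), rng.rand(5) + 0.5
    a, b = rng.randn(5), rng.randn(5)  # batch-norm scale and shift

    # Original pipeline: conv -> batch_norm -> scale/shift
    y_ref = (inp.dot(W) + bias - mean) / std * a + b

    # Fused form: new weight a*W/std, new bias (bias - mean)/std*a + b
    y_fused = inp.dot(W * (a / std)) + ((bias - mean) / std * a + b)

    print(np.allclose(y_ref, y_fused))  # True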