Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
35e32a8e
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
1 年多 前同步成功
通知
696
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
35e32a8e
编写于
6月 13, 2018
作者:
G
guosheng
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/paddle
into fix-beam_search
上级
92a7766e
14e83376
变更
12
显示空白变更内容
内联
并排
Showing
12 changed file
with
275 addition
and
255 deletion
+275
-255
doc/fluid/api/detection.rst
doc/fluid/api/detection.rst
+0
-0
doc/fluid/api/gen_doc.sh
doc/fluid/api/gen_doc.sh
+1
-1
doc/fluid/api/io.rst
doc/fluid/api/io.rst
+0
-18
doc/fluid/api/layers.rst
doc/fluid/api/layers.rst
+51
-49
doc/fluid/api/optimizer.rst
doc/fluid/api/optimizer.rst
+0
-7
doc/fluid/api/profiler.rst
doc/fluid/api/profiler.rst
+0
-12
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+1
-1
paddle/fluid/operators/batch_norm_mkldnn_op.cc
paddle/fluid/operators/batch_norm_mkldnn_op.cc
+180
-146
paddle/fluid/operators/batch_norm_op.cc
paddle/fluid/operators/batch_norm_op.cc
+13
-11
paddle/fluid/platform/cpu_info.cc
paddle/fluid/platform/cpu_info.cc
+4
-3
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+1
-1
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+24
-6
未找到文件。
doc/fluid/api/detection.rst
0 → 100644
浏览文件 @
35e32a8e
doc/fluid/api/gen_doc.sh
浏览文件 @
35e32a8e
#!/bin/bash
python gen_doc.py layers
--submodules
control_flow device io nn ops tensor
>
layers.rst
python gen_doc.py layers
--submodules
control_flow device io nn ops tensor
detection
>
layers.rst
for
module
in
data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
do
...
...
doc/fluid/api/io.rst
浏览文件 @
35e32a8e
...
...
@@ -59,21 +59,3 @@ get_inference_program
.. autofunction:: paddle.fluid.io.get_inference_program
:noindex:
save_checkpoint
---------------
.. autofunction:: paddle.fluid.io.save_checkpoint
:noindex:
load_checkpoint
---------------
.. autofunction:: paddle.fluid.io.load_checkpoint
:noindex:
clean_checkpoint
----------------
.. autofunction:: paddle.fluid.io.clean_checkpoint
:noindex:
doc/fluid/api/layers.rst
浏览文件 @
35e32a8e
...
...
@@ -181,12 +181,6 @@ Print
.. autofunction:: paddle.fluid.layers.Print
:noindex:
is_empty
--------
.. autofunction:: paddle.fluid.layers.is_empty
:noindex:
device
======
...
...
@@ -261,19 +255,6 @@ double_buffer
.. autofunction:: paddle.fluid.layers.double_buffer
:noindex:
random_data_generator
---------------------
.. autofunction:: paddle.fluid.layers.random_data_generator
:noindex:
Preprocessor
------------
.. autoclass:: paddle.fluid.layers.Preprocessor
:members:
:noindex:
nn
==
...
...
@@ -613,30 +594,6 @@ roi_pool
.. autofunction:: paddle.fluid.layers.roi_pool
:noindex:
dice_loss
---------
.. autofunction:: paddle.fluid.layers.dice_loss
:noindex:
resize_bilinear
---------------
.. autofunction:: paddle.fluid.layers.resize_bilinear
:noindex:
gather
------
.. autofunction:: paddle.fluid.layers.gather
:noindex:
random_crop
-----------
.. autofunction:: paddle.fluid.layers.random_crop
:noindex:
ops
===
...
...
@@ -784,12 +741,6 @@ sum
.. autofunction:: paddle.fluid.layers.sum
:noindex:
shape
-----
.. autofunction:: paddle.fluid.layers.shape
:noindex:
sigmoid
-------
...
...
@@ -1039,3 +990,54 @@ zeros
.. autofunction:: paddle.fluid.layers.zeros
:noindex:
detection
=========
multi_box_head
--------------
.. autofunction:: paddle.fluid.layers.multi_box_head
:noindex:
bipartite_match
---------------
.. autofunction:: paddle.fluid.layers.bipartite_match
:noindex:
target_assign
-------------
.. autofunction:: paddle.fluid.layers.target_assign
:noindex:
detection_output
----------------
.. autofunction:: paddle.fluid.layers.detection_output
:noindex:
ssd_loss
--------
.. autofunction:: paddle.fluid.layers.ssd_loss
:noindex:
detection_map
-------------
.. autofunction:: paddle.fluid.layers.detection_map
:noindex:
iou_similarity
--------------
.. autofunction:: paddle.fluid.layers.iou_similarity
:noindex:
box_coder
---------
.. autofunction:: paddle.fluid.layers.box_coder
:noindex:
doc/fluid/api/optimizer.rst
浏览文件 @
35e32a8e
...
...
@@ -89,13 +89,6 @@ DecayedAdagradOptimizer
:members:
:noindex:
RMSPropOptimizer
----------------
.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
:members:
:noindex:
Adadelta
--------
...
...
doc/fluid/api/profiler.rst
浏览文件 @
35e32a8e
...
...
@@ -23,15 +23,3 @@ profiler
.. autofunction:: paddle.fluid.profiler.profiler
:noindex:
start_profiler
--------------
.. autofunction:: paddle.fluid.profiler.start_profiler
:noindex:
stop_profiler
-------------
.. autofunction:: paddle.fluid.profiler.stop_profiler
:noindex:
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
35e32a8e
...
...
@@ -84,7 +84,7 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library
(
feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog
)
if
(
WITH_DISTRIBUTE
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc
cares
grpc++_unsecure grpc_unsecure gpr
)
set
(
DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
set_source_files_properties
(
executor.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
else
()
...
...
paddle/fluid/operators/batch_norm_mkldnn_op.cc
浏览文件 @
35e32a8e
...
...
@@ -19,10 +19,17 @@ limitations under the License. */
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
batch_norm_bwd
=
mkldnn
::
batch_normalization_backward
;
using
batch_norm_fwd
=
mkldnn
::
batch_normalization_forward
;
using
framework
::
DataLayout
;
using
framework
::
Tensor
;
using
mkldnn
::
memory
;
using
mkldnn
::
primitive
;
using
mkldnn
::
reorder
;
using
mkldnn
::
stream
;
using
paddle
::
platform
::
MKLDNNDeviceContext
;
using
paddle
::
platform
::
MKLDNNMemDesc
;
using
mkldnn
::
memory
;
using
platform
::
to_void_cast
;
template
<
typename
T
>
using
EigenArrayMap
=
...
...
@@ -64,21 +71,12 @@ void run_batch_norm_op(Args &&... args) {
mkldnn
::
stream
(
mkldnn
::
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
}
template
<
typename
T
>
inline
void
*
cast_const_to_void
(
const
T
*
t
)
{
return
static_cast
<
void
*>
(
const_cast
<
T
*>
(
t
));
}
}
// namespace
template
<
typename
T
>
class
BatchNormMKLDNNOpKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
data_layout_str
=
ctx
.
Attr
<
std
::
string
>
(
"data_layout"
);
auto
data_layout
=
framework
::
StringToDataLayout
(
data_layout_str
);
PADDLE_ENFORCE
(
data_layout
==
framework
::
DataLayout
::
kNCHW
,
"MKLDNN batch normalization handles only NCHW data layout"
);
const
float
epsilon
=
ctx
.
Attr
<
float
>
(
"epsilon"
);
const
float
momentum
=
ctx
.
Attr
<
float
>
(
"momentum"
);
const
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
...
...
@@ -99,41 +97,53 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const
auto
*
scale
=
ctx
.
Input
<
Tensor
>
(
"Scale"
);
const
auto
*
shift
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
y
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
mean_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
variance_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
PADDLE_ENFORCE
(
x
->
layout
()
==
DataLayout
::
kMKLDNN
&&
x
->
format
()
!=
memory
::
format
::
format_undef
,
"Wrong layout/format set for Input x tensor"
);
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
mean_data
=
mean
->
data
<
T
>
();
const
T
*
variance_data
=
variance
->
data
<
T
>
();
T
*
y_data
=
y
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
mean_out_data
=
mean_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
variance_out_data
=
variance_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
batch_mean_data
=
nullptr
;
T
*
batch_variance_data
=
nullptr
;
if
(
!
is_test
)
{
batch_mean
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
batch_variance
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
batch_mean
_data
=
batch_mean
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
batch_variance
_data
=
batch_variance
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
}
auto
propagation
=
is_test
==
true
?
mkldnn
::
prop_kind
::
forward_scoring
:
mkldnn
::
prop_kind
::
forward_training
;
auto
dims
=
paddle
::
framework
::
vectorize2int
(
x
->
dims
());
auto
src_md
=
MKLDNNMemDesc
(
dims
,
memory
::
data_type
::
f32
,
memory
::
format
::
nchw
);
auto
dst_md
=
MKLDNNMemDesc
(
dims
,
memory
::
data_type
::
f32
,
memory
::
format
::
nchw
);
auto
src_pd
=
mkldnn
::
memory
::
primitive_desc
{
src_md
,
mkldnn_engine
};
auto
dst_pd
=
mkldnn
::
memory
::
primitive_desc
{
dst_md
,
mkldnn_engine
};
auto
src
=
mkldnn
::
memory
{
src_pd
,
cast_const_to_void
(
x
->
data
<
T
>
())};
auto
dst
=
mkldnn
::
memory
{
dst_pd
,
y
->
data
<
T
>
()};
auto
src_tz
=
paddle
::
framework
::
vectorize2int
(
x
->
dims
());
auto
scale_tz
=
paddle
::
framework
::
vectorize2int
(
scale
->
dims
());
PADDLE_ENFORCE
(
scale_tz
.
size
()
==
1
,
"Dims of scale tensor is NOT 1"
);
const
unsigned
int
ic
=
scale_tz
[
0
];
unsigned
flags
=
mkldnn
::
use_scale_shift
;
if
(
is_test
)
flags
|=
mkldnn
::
use_global_stats
;
// create mkldnn memory from input x tensor
auto
src_memory
=
memory
({{{
src_tz
},
memory
::
data_type
::
f32
,
x
->
format
()},
mkldnn_engine
},
to_void_cast
(
x_data
));
// create primitive descriptor for batch norm forward
using
bn_fwd_types
=
bn_type_traits
<
mkldnn
::
batch_normalization_forward
>
;
auto
batch_norm_fwd_desc
=
bn_fwd_types
::
op_desc
{
propagation
,
src_md
,
epsilon
,
flags
};
auto
batch_norm_fwd_pd
=
bn_fwd_types
::
op_prim
{
batch_norm_fwd_desc
,
mkldnn_engine
};
auto
batch_norm_fwd_desc
=
bn_fwd_types
::
op_desc
{
propagation
,
src_memory
.
get_primitive_desc
().
desc
(),
epsilon
,
flags
};
std
::
shared_ptr
<
batch_norm_fwd
::
primitive_desc
>
batch_norm_fwd_pd
=
std
::
shared_ptr
<
batch_norm_fwd
::
primitive_desc
>
(
new
batch_norm_fwd
::
primitive_desc
(
batch_norm_fwd_desc
,
mkldnn_engine
));
const
unsigned
int
ic
=
dims
[
1
];
// Save the pd to be used in backward pass
const
std
::
string
key
=
ctx
.
op
().
Output
(
"SavedMean"
);
const
std
::
string
key_batch_norm_fwd_pd
=
key
+
"@bn_fwd_pd"
;
dev_ctx
.
SetBlob
(
key_batch_norm_fwd_pd
,
batch_norm_fwd_pd
);
// MKLDNN requires a single piece of memory for scale and shift/bias data
const
size_t
scaleshift_size
=
2
*
ic
;
...
...
@@ -143,73 +153,58 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
copy_to_weights
(
scale
->
data
<
T
>
(),
scale
->
data
<
T
>
()
+
ic
,
shift
->
data
<
T
>
(),
shift
->
data
<
T
>
()
+
ic
,
&
scaleshift_data
);
auto
scaleshift_memory
=
mkldnn
::
memory
{
batch_norm_fwd_pd
.
weights_primitive_desc
(),
scaleshift_data
.
data
()};
// crate mkldnn memory for weights(scale/shift)
auto
scaleshift_memory
=
memory
(
batch_norm_fwd_pd
->
weights_primitive_desc
(),
scaleshift_data
.
data
());
if
(
is_test
)
{
auto
mean_memory
=
mkldnn
::
memory
{
batch_norm_fwd_pd
.
mean_primitive_desc
(),
cast_const_to_void
(
mean
->
data
<
T
>
())};
// create mkldnn memory for output y tensor
auto
dst_memory
=
memory
(
batch_norm_fwd_pd
->
dst_primitive_desc
(),
y_data
);
if
(
is_test
)
{
// create mkldnn memory for stats (as input)
auto
mean_memory
=
memory
(
batch_norm_fwd_pd
->
mean_primitive_desc
(),
to_void_cast
(
mean_data
));
auto
variance_memory
=
m
kldnn
::
memory
{
batch_norm_fwd_pd
.
variance_primitive_desc
(),
cast_const_to_void
(
variance
->
data
<
T
>
())}
;
m
emory
(
batch_norm_fwd_pd
->
variance_primitive_desc
(),
to_void_cast
(
variance_data
))
;
run_batch_norm_op
<
typename
bn_fwd_types
::
op_type
>
(
batch_norm_fwd_pd
,
src
,
(
const
mkldnn
::
primitive
::
at
&
)
mean_memory
,
*
batch_norm_fwd_pd
,
src_memory
,
(
const
mkldnn
::
primitive
::
at
&
)
mean_memory
,
(
const
mkldnn
::
primitive
::
at
&
)
variance_memory
,
scaleshift_memory
,
dst
);
dst
_memory
);
}
else
{
// create mkldnn memory for stats (as output)
auto
mean_memory
=
mkldnn
::
memory
{
batch_norm_fwd_pd
.
mean_primitive_desc
(),
cast_const_to_void
(
batch_mean
->
data
<
T
>
())};
auto
variance_memory
=
mkldnn
::
memory
{
batch_norm_fwd_pd
.
variance_primitive_desc
(),
cast_const_to_void
(
batch_variance
->
data
<
T
>
())};
memory
(
batch_norm_fwd_pd
->
mean_primitive_desc
(),
batch_mean_data
);
auto
variance_memory
=
memory
(
batch_norm_fwd_pd
->
variance_primitive_desc
(),
batch_variance_data
);
run_batch_norm_op
<
bn_fwd_types
::
op_type
>
(
batch_norm_fwd_pd
,
src
,
scaleshift_memory
,
dst
,
run_batch_norm_op
<
bn_fwd_types
::
op_type
>
(
*
batch_norm_fwd_pd
,
src_memory
,
scaleshift_memory
,
dst
_memory
,
mean_memory
,
variance_memory
);
}
if
(
!
is_test
)
{
const
unsigned
int
in
=
dims
[
0
];
const
unsigned
int
sample_size
=
x
->
numel
()
/
in
/
ic
;
// saved_xx is use just in this batch of data
EigenVectorArrayMap
<
T
>
saved_mean_e
(
batch_mean
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
ic
);
EigenVectorArrayMap
<
T
>
saved_variance_e
(
batch_variance
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
ic
);
saved_mean_e
.
setZero
();
saved_variance_e
.
setZero
();
const
unsigned
int
x_arr_size
=
in
*
ic
;
ConstEigenArrayMap
<
T
>
x_arr
(
x
->
data
<
T
>
(),
sample_size
,
x_arr_size
);
for
(
unsigned
int
nc
=
0
;
nc
<
x_arr_size
;
++
nc
)
{
saved_mean_e
(
nc
%
ic
)
+=
x_arr
.
col
(
nc
).
sum
();
}
saved_mean_e
/=
in
*
sample_size
;
for
(
unsigned
int
nc
=
0
;
nc
<
x_arr_size
;
++
nc
)
{
saved_variance_e
(
nc
%
ic
)
+=
(
x_arr
.
col
(
nc
)
-
saved_mean_e
(
nc
%
ic
)).
matrix
().
squaredNorm
();
}
saved_variance_e
/=
in
*
sample_size
;
ConstEigenVectorArrayMap
<
T
>
mean_arr
{
mean
->
data
<
T
>
(),
ic
};
ConstEigenVectorArrayMap
<
T
>
variance_arr
{
variance
->
data
<
T
>
(),
ic
};
// mkldnn only compute stats for current batch
// so we need compute momentum stats via Eigen lib
EigenVectorArrayMap
<
T
>
batch_mean_e
(
batch_mean_data
,
ic
);
EigenVectorArrayMap
<
T
>
batch_variance_e
(
batch_variance_data
,
ic
);
ConstEigenVectorArrayMap
<
T
>
mean_e
(
mean_data
,
ic
);
ConstEigenVectorArrayMap
<
T
>
variance_e
{
variance_data
,
ic
};
EigenVectorArrayMap
<
T
>
running_mean_arr
(
mean_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
ic
);
EigenVectorArrayMap
<
T
>
running_var_arr
(
variance_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
ic
);
EigenVectorArrayMap
<
T
>
running_mean_e
(
mean_out_data
,
ic
);
EigenVectorArrayMap
<
T
>
running_variance_e
(
variance_out_data
,
ic
);
auto
one_minus_momentum
=
1.
-
momentum
;
running_mean_arr
=
mean_arr
*
momentum
+
saved_mean_e
*
one_minus_momentum
;
running_var_arr
=
variance_arr
*
momentum
+
saved_variance_e
*
one_minus_momentum
;
running_mean_e
=
mean_e
*
momentum
+
batch_mean_e
*
one_minus_momentum
;
running_variance_e
=
variance_e
*
momentum
+
batch_variance_e
*
one_minus_momentum
;
}
y
->
set_layout
(
DataLayout
::
kMKLDNN
);
y
->
set_format
(
(
memory
::
format
)
dst_memory
.
get_primitive_desc
().
desc
().
data
.
format
);
}
};
...
...
@@ -217,11 +212,6 @@ template <typename T>
class
BatchNormMKLDNNGradOpKernel
:
public
paddle
::
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
paddle
::
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
data_layout_str
=
ctx
.
Attr
<
std
::
string
>
(
"data_layout"
);
auto
data_layout
=
framework
::
StringToDataLayout
(
data_layout_str
);
PADDLE_ENFORCE
(
data_layout
==
framework
::
DataLayout
::
kNCHW
,
"MKLDNN batch normalization handles only NCHW data layout"
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
MKLDNNDeviceContext
>();
auto
mkldnn_engine
=
dev_ctx
.
GetEngine
();
...
...
@@ -238,88 +228,132 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
auto
*
diff_scale
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Scale"
));
auto
*
diff_shift
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Bias"
));
diff_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
diff_scale
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
diff_shift
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
PADDLE_ENFORCE
(
diff_y
->
layout
()
==
DataLayout
::
kMKLDNN
&&
diff_y
->
format
()
!=
memory
::
format
::
format_undef
,
"Wrong layout/format set for Input diff_y tensor"
);
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
diff_y_data
=
diff_y
->
data
<
T
>
();
const
T
*
batch_mean_data
=
batch_mean
->
data
<
T
>
();
const
T
*
batch_variance_data
=
batch_variance
->
data
<
T
>
();
const
T
*
scale_data
=
scale
->
data
<
T
>
();
const
T
*
shift_data
=
shift
->
data
<
T
>
();
T
*
diff_x_data
=
diff_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
diff_scale_data
=
diff_scale
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
diff_shift_data
=
diff_shift
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
src_tz
=
paddle
::
framework
::
vectorize2int
(
x
->
dims
());
auto
diff_src_tz
=
src_tz
;
auto
dst_tz
=
src_tz
;
auto
diff_dst_tz
=
dst_tz
;
auto
scale_tz
=
paddle
::
framework
::
vectorize2int
(
scale
->
dims
());
PADDLE_ENFORCE
(
scale_tz
.
size
()
==
1
,
"Dims of scale tensor is NOT 1"
);
const
unsigned
int
ic
=
scale_tz
[
0
];
// Retrieve bn_fwd_pd from device context
const
std
::
string
key
=
ctx
.
op
().
Input
(
"SavedMean"
);
const
std
::
string
key_batch_norm_fwd_pd
=
key
+
"@bn_fwd_pd"
;
auto
batch_norm_fwd_pd
=
std
::
static_pointer_cast
<
batch_norm_fwd
::
primitive_desc
>
(
dev_ctx
.
GetBlob
(
key_batch_norm_fwd_pd
));
PADDLE_ENFORCE
(
batch_norm_fwd_pd
!=
nullptr
,
"Fail to find batch_norm_fwd_pd in device context"
);
auto
dims
=
paddle
::
framework
::
vectorize2int
(
x
->
dims
());
unsigned
flags
=
mkldnn
::
use_scale_shift
|
!
mkldnn
::
use_global_stats
;
using
bn_bwd_types
=
bn_type_traits
<
mkldnn
::
batch_normalization_backward
>
;
auto
src_md
=
MKLDNNMemDesc
(
dims
,
memory
::
data_type
::
f32
,
memory
::
format
::
nchw
);
auto
dst_md
=
MKLDNNMemDesc
(
dims
,
memory
::
data_type
::
f32
,
memory
::
format
::
nchw
);
auto
diff_src_md
=
MKLDNNMemDesc
(
dims
,
memory
::
data_type
::
f32
,
memory
::
format
::
nchw
);
auto
diff_dst_md
=
MKLDNNMemDesc
(
dims
,
memory
::
data_type
::
f32
,
memory
::
format
::
nchw
);
// create mkldnn memory from input diff_y tensor
auto
user_diff_dst_memory
=
memory
({{{
diff_dst_tz
},
memory
::
data_type
::
f32
,
diff_y
->
format
()},
mkldnn_engine
},
to_void_cast
(
diff_y_data
));
using
bn_bwd_types
=
bn_type_traits
<
mkldnn
::
batch_normalization_backward
>
;
using
bn_fwd_types
=
bn_type_traits
<
mkldnn
::
batch_normalization_forward
>
;
// create mkldnn memory from input x tensor
auto
src_memory
=
memory
({{{
src_tz
},
memory
::
data_type
::
f32
,
x
->
format
()},
mkldnn_engine
},
to_void_cast
(
x_data
));
auto
batch_norm_fwd_desc
=
bn_fwd_types
::
op_desc
{
mkldnn
::
prop_kind
::
forward_training
,
src_md
,
epsilon
,
flags
};
auto
batch_norm_fwd_pd
=
bn_fwd_types
::
op_prim
{
batch_norm_fwd_desc
,
mkldnn_engine
};
// for diff_dst, try to use same format as dst in forward pass
auto
diff_dst_pd
=
batch_norm_fwd_pd
.
get
()
->
dst_primitive_desc
();
auto
diff_dst_md
=
diff_dst_pd
.
desc
();
// create primitive descriptor for batch norm backward
unsigned
flags
=
mkldnn
::
use_scale_shift
;
auto
batch_norm_bwd_desc
=
bn_bwd_types
::
op_desc
{
mkldnn
::
prop_kind
::
backward
,
diff_dst_md
,
dst_md
,
epsilon
,
flags
};
mkldnn
::
prop_kind
::
backward
,
diff_dst_md
,
src_memory
.
get_primitive_desc
().
desc
(),
epsilon
,
flags
};
auto
batch_norm_bwd_pd
=
bn_bwd_types
::
op_prim
{
batch_norm_bwd_desc
,
mkldnn_engine
,
batch_norm_fwd_pd
};
auto
src
=
mkldnn
::
memory
{{
src_md
,
mkldnn_engine
},
cast_const_to_void
(
x
->
data
<
T
>
())};
auto
mean
=
mkldnn
::
memory
{
batch_norm_bwd_pd
.
mean_primitive_desc
(),
cast_const_to_void
(
batch_mean
->
data
<
T
>
())};
auto
variance
=
mkldnn
::
memory
{
batch_norm_bwd_pd
.
variance_primitive_desc
(),
cast_const_to_void
(
batch_variance
->
data
<
T
>
())};
auto
diff_dst
=
mkldnn
::
memory
{{
diff_dst_md
,
mkldnn_engine
},
cast_const_to_void
(
diff_y
->
data
<
T
>
())};
batch_norm_bwd_desc
,
mkldnn_engine
,
*
batch_norm_fwd_pd
};
// reorder user_diff_dst if it's not in preferred format
auto
diff_dst_memory
=
user_diff_dst_memory
;
primitive
reorder_diff_dst
;
bool
is_diff_dst_reordered
=
false
;
if
(
diff_dst_pd
!=
user_diff_dst_memory
.
get_primitive_desc
())
{
diff_dst_memory
=
memory
(
diff_dst_pd
);
reorder_diff_dst
=
reorder
(
user_diff_dst_memory
,
diff_dst_memory
);
is_diff_dst_reordered
=
true
;
}
const
unsigned
int
ic
=
dims
[
1
];
// create mkldnn memory for input tensors (src/mean/variance)
auto
mean_memory
=
memory
(
batch_norm_bwd_pd
.
mean_primitive_desc
(),
to_void_cast
(
batch_mean_data
));
auto
variance_memory
=
memory
(
batch_norm_bwd_pd
.
variance_primitive_desc
(),
to_void_cast
(
batch_variance_data
));
// MKLDNN requires a single piece of memory for scale and shift/bias data
const
size_t
scaleshift_size
=
2
*
ic
;
std
::
vector
<
T
>
scaleshift_data
;
scaleshift_data
.
reserve
(
scaleshift_size
);
copy_to_weights
(
scale
->
data
<
T
>
(),
scale
->
data
<
T
>
()
+
ic
,
shift
->
data
<
T
>
()
,
shift
->
data
<
T
>
()
+
ic
,
&
scaleshift_data
);
copy_to_weights
(
scale
_data
,
scale_data
+
ic
,
shift_data
,
shift_data
+
ic
,
&
scaleshift_data
);
auto
scaleshift_memory
=
mkldnn
::
memory
{
batch_norm_bwd_pd
.
weights_primitive_desc
(),
scaleshift_data
.
data
()};
// create mkldnn memory for input tensors (scale/shift)
auto
scaleshift_memory
=
memory
(
batch_norm_bwd_pd
.
weights_primitive_desc
(),
scaleshift_data
.
data
());
// create mkldnn memory for output diff weights (combined scale/shift)
std
::
vector
<
T
>
diff_scaleshift_data
;
diff_scaleshift_data
.
reserve
(
scaleshift_size
);
copy_to_weights
(
diff_scale
->
data
<
T
>
(),
diff_scale
->
data
<
T
>
()
+
ic
,
diff_shift
->
data
<
T
>
(),
diff_shift
->
data
<
T
>
()
+
ic
,
&
diff_scaleshift_data
);
auto
diff_scaleshift_memory
=
mkldnn
::
memory
{
batch_norm_bwd_pd
.
diff_weights_primitive_desc
(),
diff_scaleshift_data
.
data
()};
auto
diff_src
=
mkldnn
::
memory
{{
diff_src_md
,
mkldnn_engine
},
static_cast
<
void
*>
(
diff_x
->
data
<
T
>
())};
run_batch_norm_op
<
bn_bwd_types
::
op_type
>
(
batch_norm_bwd_pd
,
src
,
mean
,
variance
,
diff_dst
,
scaleshift_memory
,
diff_src
,
diff_scaleshift_memory
);
memory
(
batch_norm_bwd_pd
.
diff_weights_primitive_desc
(),
diff_scaleshift_data
.
data
());
// here assume diff_src is in the same format of src
auto
diff_src_memory
=
memory
(
src_memory
.
get_primitive_desc
(),
diff_x_data
);
// finally create batch_norm backward primitive
auto
batch_norm_bwd_prim
=
batch_norm_bwd
(
batch_norm_bwd_pd
,
src_memory
,
mean_memory
,
variance_memory
,
diff_dst_memory
,
scaleshift_memory
,
diff_src_memory
,
diff_scaleshift_memory
);
// execute optional reorder and batch_norm backward primitive
std
::
vector
<
primitive
>
pipeline
;
if
(
is_diff_dst_reordered
)
pipeline
.
push_back
(
reorder_diff_dst
);
pipeline
.
push_back
(
batch_norm_bwd_prim
);
stream
(
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
// copy back diff sacle/shift to output tensors (diff scale/shift)
diff_scaleshift_data
.
resize
(
scaleshift_size
);
auto
it
=
std
::
begin
(
diff_scaleshift_data
);
std
::
copy
(
it
,
std
::
next
(
it
,
ic
),
diff_scale
->
data
<
T
>
()
);
std
::
copy
(
it
,
std
::
next
(
it
,
ic
),
diff_scale
_data
);
std
::
copy
(
std
::
next
(
it
,
ic
),
std
::
end
(
diff_scaleshift_data
),
diff_shift
->
data
<
T
>
());
diff_shift_data
);
// set layout/format of output tensors
diff_x
->
set_layout
(
DataLayout
::
kMKLDNN
);
diff_x
->
set_format
((
memory
::
format
)
diff_src_memory
.
get_primitive_desc
()
.
desc
()
.
data
.
format
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_KERNEL
(
batch_norm
,
MKLDNN
,
paddle
::
platform
::
CPUPlace
,
REGISTER_OP_KERNEL
(
batch_norm
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
ops
::
BatchNormMKLDNNOpKernel
<
float
>
);
REGISTER_OP_KERNEL
(
batch_norm_grad
,
MKLDNN
,
paddle
::
platform
::
CPUPlace
,
REGISTER_OP_KERNEL
(
batch_norm_grad
,
MKLDNN
,
::
paddle
::
platform
::
CPUPlace
,
ops
::
BatchNormMKLDNNGradOpKernel
<
float
>
);
paddle/fluid/operators/batch_norm_op.cc
浏览文件 @
35e32a8e
...
...
@@ -110,19 +110,19 @@ class BatchNormOp : public framework::OperatorWithKernel {
ctx
.
Input
<
Tensor
>
(
"Variance"
)
->
type
()),
"Variance input should be of float type"
);
framework
::
LibraryType
library_
{
framework
::
LibraryType
::
kPlain
};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework
::
LibraryType
library
=
framework
::
LibraryType
::
kPlain
;
framework
::
DataLayout
layout
=
framework
::
DataLayout
::
kAnyLayout
;
#ifdef PADDLE_WITH_MKLDNN
if
(
library
_
==
framework
::
LibraryType
::
kPlain
&&
if
(
library
==
framework
::
LibraryType
::
kPlain
&&
platform
::
CanMKLDNNBeUsed
(
ctx
))
{
library
_
=
framework
::
LibraryType
::
kMKLDNN
;
library
=
framework
::
LibraryType
::
kMKLDNN
;
layout
=
framework
::
DataLayout
::
kMKLDNN
;
}
#endif
return
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
(),
layout
,
library
_
);
library
);
}
};
...
...
@@ -370,19 +370,21 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
PADDLE_THROW
(
"can't find Y@GRAD"
);
}
framework
::
LibraryType
library_
{
framework
::
LibraryType
::
kPlain
};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework
::
DataLayout
layout_
=
framework
::
DataLayout
::
kAnyLayout
;
framework
::
LibraryType
library
=
framework
::
LibraryType
::
kPlain
;
framework
::
DataLayout
layout
=
framework
::
DataLayout
::
kAnyLayout
;
#ifdef PADDLE_WITH_MKLDNN
if
(
library
_
==
framework
::
LibraryType
::
kPlain
&&
if
(
library
==
framework
::
LibraryType
::
kPlain
&&
platform
::
CanMKLDNNBeUsed
(
ctx
))
{
library
_
=
framework
::
LibraryType
::
kMKLDNN
;
layout
_
=
framework
::
DataLayout
::
kMKLDNN
;
library
=
framework
::
LibraryType
::
kMKLDNN
;
layout
=
framework
::
DataLayout
::
kMKLDNN
;
}
#endif
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
()),
ctx
.
GetPlace
(),
layout
_
,
library_
);
layout
,
library
);
}
};
...
...
paddle/fluid/platform/cpu_info.cc
浏览文件 @
35e32a8e
...
...
@@ -59,9 +59,10 @@ inline size_t CpuTotalPhysicalMemory() {
size_t
CpuMaxAllocSize
()
{
// For distributed systems, it requires configuring and limiting
// the fraction of memory to use.
return
std
::
min
(
static_cast
<
size_t
>
(
FLAGS_fraction_of_cpu_memory_to_use
*
return
std
::
min
(
static_cast
<
size_t
>
(
FLAGS_fraction_of_cpu_memory_to_use
*
CpuTotalPhysicalMemory
()),
FLAGS_initial_cpu_memory_in_mb
*
1
<<
20
);
static_cast
<
size_t
>
(
FLAGS_initial_cpu_memory_in_mb
*
1
<<
20
)
);
}
size_t
CpuMinChunkSize
()
{
...
...
python/paddle/fluid/framework.py
浏览文件 @
35e32a8e
...
...
@@ -382,7 +382,7 @@ class Operator(object):
'rnn_memory_helper_grad'
,
'conditional_block'
,
'while'
,
'send'
,
'recv'
,
'listen_and_serv'
,
'parallel_do'
,
'save_combine'
,
'load_combine'
,
'ncclInit'
,
'channel_create'
,
'channel_close'
,
'channel_send'
,
'channel_recv'
,
'select'
'channel_recv'
,
'select'
,
'gen_nccl_id'
}
def
__init__
(
self
,
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
35e32a8e
...
...
@@ -261,9 +261,10 @@ def embedding(input,
return
tmp
# TODO(qijun): expose H0 and C0
def
dynamic_lstm
(
input
,
size
,
h_0
=
None
,
c_0
=
None
,
param_attr
=
None
,
bias_attr
=
None
,
use_peepholes
=
True
,
...
...
@@ -324,6 +325,13 @@ def dynamic_lstm(input,
(T X 4D), where T is the total time steps in this
mini-batch, D is the hidden size.
size(int): 4 * hidden size.
h_0(Variable): The initial hidden state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size and D is the hidden size.
c_0(Variable): The initial cell state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size. `h_0` and `c_0` can be NULL but only at the same time.
param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weights.
...
...
@@ -387,12 +395,20 @@ def dynamic_lstm(input,
cell
=
helper
.
create_tmp_variable
(
dtype
)
batch_gate
=
helper
.
create_tmp_variable
(
dtype
)
batch_cell_pre_act
=
helper
.
create_tmp_variable
(
dtype
)
inputs
=
{
'Input'
:
input
,
'Weight'
:
weight
,
'Bias'
:
bias
}
batch_size
=
input
.
shape
[
0
]
if
h_0
:
assert
h_0
.
shape
==
(
batch_size
,
size
),
\
'The shape of h0 should be (batch_size, %d)'
%
size
inputs
[
'H0'
]
=
h_0
if
c_0
:
assert
c_0
.
shape
==
(
batch_size
,
size
),
\
'The shape of c0 should be (batch_size, %d)'
%
size
inputs
[
'C0'
]
=
c_0
helper
.
append_op
(
type
=
'lstm'
,
inputs
=
{
'Input'
:
input
,
'Weight'
:
weight
,
'Bias'
:
bias
},
inputs
=
inputs
,
outputs
=
{
'Hidden'
:
hidden
,
'Cell'
:
cell
,
...
...
@@ -677,11 +693,13 @@ def dynamic_gru(input,
attr
=
helper
.
param_attr
,
shape
=
[
size
,
3
*
size
],
dtype
=
dtype
)
bias
=
helper
.
create_parameter
(
attr
=
helper
.
bias_attr
,
shape
=
[
1
,
3
*
size
],
dtype
=
dtype
,
is_bias
=
True
)
batch_size
=
input
.
shape
[
0
]
inputs
=
{
'Input'
:
input
,
'Weight'
:
weight
,
'Bias'
:
bias
}
if
h_0
!=
None
:
assert
h_0
.
shape
==
(
size
,
size
),
'The shape of h0 should be(%d, %d)'
%
(
size
,
size
)
inputs
[
'h0'
]
=
h_0
batch_size
,
size
),
'The shape of h0 should be(batch_size, %d)'
%
size
inputs
[
'H0'
]
=
h_0
hidden
=
helper
.
create_tmp_variable
(
dtype
)
batch_gate
=
helper
.
create_tmp_variable
(
dtype
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录