Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
a8c4324d
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
大约 1 年 前同步成功
通知
695
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a8c4324d
编写于
2月 25, 2019
作者:
S
sneaxiy
浏览文件
操作
浏览文件
下载
差异文件
fix hang bug
上级
b17541a9
3ccd8964
变更
59
显示空白变更内容
内联
并排
Showing
59 changed file
with
2038 addition
and
508 deletion
+2038
-508
README.md
README.md
+11
-11
README_cn.md
README_cn.md
+11
-11
paddle/fluid/API.spec
paddle/fluid/API.spec
+2
-1
paddle/fluid/framework/data_layout_transform.cc
paddle/fluid/framework/data_layout_transform.cc
+7
-16
paddle/fluid/framework/data_transform.cc
paddle/fluid/framework/data_transform.cc
+24
-6
paddle/fluid/framework/details/build_strategy.cc
paddle/fluid/framework/details/build_strategy.cc
+3
-0
paddle/fluid/framework/details/multi_devices_graph_pass.cc
paddle/fluid/framework/details/multi_devices_graph_pass.cc
+15
-3
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+16
-2
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+35
-2
paddle/fluid/framework/operator_kernel_configs.h
paddle/fluid/framework/operator_kernel_configs.h
+118
-0
paddle/fluid/framework/tensor.h
paddle/fluid/framework/tensor.h
+33
-8
paddle/fluid/framework/var_type_traits.h
paddle/fluid/framework/var_type_traits.h
+0
-5
paddle/fluid/imperative/layer.cc
paddle/fluid/imperative/layer.cc
+2
-1
paddle/fluid/imperative/layer.h
paddle/fluid/imperative/layer.h
+14
-5
paddle/fluid/imperative/tracer.cc
paddle/fluid/imperative/tracer.cc
+3
-2
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+1
-1
paddle/fluid/operators/beam_search_decode_op.cc
paddle/fluid/operators/beam_search_decode_op.cc
+1
-1
paddle/fluid/operators/beam_search_decode_op.h
paddle/fluid/operators/beam_search_decode_op.h
+1
-1
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+14
-45
paddle/fluid/operators/conv_cudnn_op_cache.h
paddle/fluid/operators/conv_cudnn_op_cache.h
+1
-95
paddle/fluid/operators/conv_fusion_op.cu.cc
paddle/fluid/operators/conv_fusion_op.cu.cc
+9
-24
paddle/fluid/operators/conv_op.cc
paddle/fluid/operators/conv_op.cc
+34
-5
paddle/fluid/operators/detection/yolov3_loss_op.cc
paddle/fluid/operators/detection/yolov3_loss_op.cc
+23
-17
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+1
-0
paddle/fluid/operators/math/sample_prob.cc
paddle/fluid/operators/math/sample_prob.cc
+26
-0
paddle/fluid/operators/math/sample_prob.cu
paddle/fluid/operators/math/sample_prob.cu
+161
-0
paddle/fluid/operators/math/sample_prob.h
paddle/fluid/operators/math/sample_prob.h
+118
-0
paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
+1
-1
paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+23
-22
paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
+6
-2
paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+3
-4
paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+3
-4
paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
+1
-1
paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+22
-3
paddle/fluid/operators/pool_op.cc
paddle/fluid/operators/pool_op.cc
+84
-54
paddle/fluid/operators/sample_logits_op.cc
paddle/fluid/operators/sample_logits_op.cc
+225
-0
paddle/fluid/operators/sample_logits_op.cu
paddle/fluid/operators/sample_logits_op.cu
+257
-0
paddle/fluid/operators/sample_logits_op.h
paddle/fluid/operators/sample_logits_op.h
+245
-0
paddle/fluid/platform/CMakeLists.txt
paddle/fluid/platform/CMakeLists.txt
+3
-3
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+2
-2
paddle/fluid/platform/device_tracer.cc
paddle/fluid/platform/device_tracer.cc
+12
-2
paddle/fluid/platform/device_tracer.h
paddle/fluid/platform/device_tracer.h
+1
-2
paddle/fluid/platform/enforce.h
paddle/fluid/platform/enforce.h
+1
-0
paddle/fluid/platform/event.h
paddle/fluid/platform/event.h
+65
-0
paddle/fluid/platform/mkldnn_reuse.h
paddle/fluid/platform/mkldnn_reuse.h
+45
-38
paddle/fluid/platform/mkldnn_utils.h
paddle/fluid/platform/mkldnn_utils.h
+69
-0
paddle/fluid/platform/profiler.cu
paddle/fluid/platform/profiler.cu
+10
-10
paddle/fluid/platform/profiler.h
paddle/fluid/platform/profiler.h
+5
-46
paddle/fluid/platform/profiler_test.cc
paddle/fluid/platform/profiler_test.cc
+0
-1
paddle/fluid/platform/temporary_allocator_test.cc
paddle/fluid/platform/temporary_allocator_test.cc
+4
-4
paddle/fluid/train/demo/demo_trainer.cc
paddle/fluid/train/demo/demo_trainer.cc
+2
-2
paddle/fluid/train/test_train_recognize_digits.cc
paddle/fluid/train/test_train_recognize_digits.cc
+1
-1
python/paddle/fluid/compiler.py
python/paddle/fluid/compiler.py
+3
-0
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+9
-1
python/paddle/fluid/layers/detection.py
python/paddle/fluid/layers/detection.py
+10
-9
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+208
-17
python/paddle/fluid/layers/ops.py
python/paddle/fluid/layers/ops.py
+22
-7
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+1
-10
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+11
-0
未找到文件。
README.md
浏览文件 @
a8c4324d
...
...
@@ -3,8 +3,8 @@
English |
[
简体中文
](
./README_cn.md
)
[
![Build Status
](
https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop
)
](https://travis-ci.org/PaddlePaddle/Paddle)
[
![Documentation Status
](
https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat
)
](http://paddlepaddle.org/documentation/docs/en/1.
2/getstarted
/index_en.html)
[
![Documentation Status
](
https://img.shields.io/badge/中文文档-最新-brightgreen.svg
)
](http://paddlepaddle.org/documentation/docs/zh/1.
2
/beginners_guide/index.html)
[
![Documentation Status
](
https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat
)
](http://paddlepaddle.org/documentation/docs/en/1.
3/beginners_guide
/index_en.html)
[
![Documentation Status
](
https://img.shields.io/badge/中文文档-最新-brightgreen.svg
)
](http://paddlepaddle.org/documentation/docs/zh/1.
3
/beginners_guide/index.html)
[
![Release
](
https://img.shields.io/github/release/PaddlePaddle/Paddle.svg
)
](https://github.com/PaddlePaddle/Paddle/releases)
[
![License
](
https://img.shields.io/badge/license-Apache%202-blue.svg
)
](LICENSE)
...
...
@@ -18,7 +18,7 @@ learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our
[
release announcement
](
https://github.com/PaddlePaddle/Paddle/releases
)
to track the latest feature of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 1.
2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2
)
### Latest PaddlePaddle Release: [Fluid 1.
3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3
)
### Install Latest Stable Release:
```
# Linux CPU
...
...
@@ -26,9 +26,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.
2
.0.post87
pip install paddlepaddle-gpu==1.
3
.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.
2
.0.post85
pip install paddlepaddle-gpu==1.
3
.0.post85
# For installation on other platform, refer to http://paddlepaddle.org/
```
...
...
@@ -75,26 +75,26 @@ pip install paddlepaddle-gpu==1.2.0.post85
## Installation
It is recommended to read
[
this doc
](
http://paddlepaddle.org/documentation/docs/
zh/1.2/beginners_guide/install/index_c
n.html
)
on our website.
It is recommended to read
[
this doc
](
http://paddlepaddle.org/documentation/docs/
en/1.3/beginners_guide/index_e
n.html
)
on our website.
## Documentation
We provide
[
English
](
http://paddlepaddle.org/documentation/docs/en/1.
2/getstarted
/index_en.html
)
and
[
Chinese
](
http://paddlepaddle.org/documentation/docs/zh/1.
2
/beginners_guide/index.html
)
documentation.
We provide
[
English
](
http://paddlepaddle.org/documentation/docs/en/1.
3/beginners_guide
/index_en.html
)
and
[
Chinese
](
http://paddlepaddle.org/documentation/docs/zh/1.
3
/beginners_guide/index.html
)
documentation.
-
[
Deep Learning 101
](
https://github.com/PaddlePaddle/book
)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
-
[
Distributed Training
](
http://paddlepaddle.org/documentation/docs/
zh/1.2/user_guides/howto/training/cluster_howto
.html
)
-
[
Distributed Training
](
http://paddlepaddle.org/documentation/docs/
en/1.3/user_guides/howto/training/multi_node_en
.html
)
You can run distributed training jobs on MPI clusters.
-
[
Python API
](
http://paddlepaddle.org/documentation/docs/
zh/1.2/api_cn/index_c
n.html
)
-
[
Python API
](
http://paddlepaddle.org/documentation/docs/
en/1.3/api/index_e
n.html
)
Our new API enables much shorter programs.
-
[
How to Contribute
](
http://paddlepaddle.org/documentation/docs/
zh/1.2/advanced_usage/development/contribute_to_paddle/index_c
n.html
)
-
[
How to Contribute
](
http://paddlepaddle.org/documentation/docs/
en/1.3/advanced_usage/development/contribute_to_paddle/index_e
n.html
)
We appreciate your contributions!
...
...
README_cn.md
浏览文件 @
a8c4324d
...
...
@@ -3,8 +3,8 @@
[
English
](
./README.md
)
| 简体中文
[
![Build Status
](
https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop
)
](https://travis-ci.org/PaddlePaddle/Paddle)
[
![Documentation Status
](
https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat
)
](http://paddlepaddle.org/documentation/docs/en/1.
2/getstarted
/index_en.html)
[
![Documentation Status
](
https://img.shields.io/badge/中文文档-最新-brightgreen.svg
)
](http://paddlepaddle.org/documentation/docs/zh/1.
2
/beginners_guide/index.html)
[
![Documentation Status
](
https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat
)
](http://paddlepaddle.org/documentation/docs/en/1.
3/beginners_guide
/index_en.html)
[
![Documentation Status
](
https://img.shields.io/badge/中文文档-最新-brightgreen.svg
)
](http://paddlepaddle.org/documentation/docs/zh/1.
3
/beginners_guide/index.html)
[
![Release
](
https://img.shields.io/github/release/PaddlePaddle/Paddle.svg
)
](https://github.com/PaddlePaddle/Paddle/releases)
[
![License
](
https://img.shields.io/badge/license-Apache%202-blue.svg
)
](LICENSE)
...
...
@@ -16,7 +16,7 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效
跟进PaddlePaddle最新特性请参考我们的
[
版本说明
](
https://github.com/PaddlePaddle/Paddle/releases
)
### PaddlePaddle最新版本: [Fluid 1.
2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2
)
### PaddlePaddle最新版本: [Fluid 1.
3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3
)
### 安装最新稳定版本:
```
# Linux CPU
...
...
@@ -24,9 +24,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.
2
.0.post87
pip install paddlepaddle-gpu==1.
3
.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.
2
.0.post85
pip install paddlepaddle-gpu==1.
3
.0.post85
# 其他平台上的安装指引请参考 http://paddlepaddle.org/
```
...
...
@@ -57,26 +57,26 @@ pip install paddlepaddle-gpu==1.2.0.post85
## 安装
推荐阅读官网上的
[
安装说明
](
http://paddlepaddle.org/documentation/docs/zh/1.
2
/beginners_guide/install/index_cn.html
)
推荐阅读官网上的
[
安装说明
](
http://paddlepaddle.org/documentation/docs/zh/1.
3
/beginners_guide/install/index_cn.html
)
## 文档
我们提供
[
英文
](
http://paddlepaddle.org/documentation/docs/en/1.
2/getstarted
/index_en.html
)
和
[
中文
](
http://paddlepaddle.org/documentation/docs/zh/1.
2
/beginners_guide/index.html
)
文档
我们提供
[
英文
](
http://paddlepaddle.org/documentation/docs/en/1.
3/beginners_guide
/index_en.html
)
和
[
中文
](
http://paddlepaddle.org/documentation/docs/zh/1.
3
/beginners_guide/index.html
)
文档
-
[
深度学习101
](
https://github.com/PaddlePaddle/book
)
或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行
-
[
分布式训练
](
http://paddlepaddle.org/documentation/docs/zh/1.
2/user_guides/howto/training/cluster_howto
.html
)
-
[
分布式训练
](
http://paddlepaddle.org/documentation/docs/zh/1.
3/user_guides/howto/training/multi_node
.html
)
可以在MPI集群上运行分布式训练任务
-
[
Python API
](
http://paddlepaddle.org/documentation/docs/zh/1.
2
/api_cn/index_cn.html
)
-
[
Python API
](
http://paddlepaddle.org/documentation/docs/zh/1.
3
/api_cn/index_cn.html
)
新的API支持代码更少更简洁的程序
-
[
贡献方式
](
http://paddlepaddle.org/documentation/docs/zh/1.
2
/advanced_usage/development/contribute_to_paddle/index_cn.html
)
-
[
贡献方式
](
http://paddlepaddle.org/documentation/docs/zh/1.
3
/advanced_usage/development/contribute_to_paddle/index_cn.html
)
欢迎您的贡献!
...
...
paddle/fluid/API.spec
浏览文件 @
a8c4324d
...
...
@@ -121,6 +121,7 @@ paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
paddle.fluid.layers.sampled_softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
...
...
@@ -303,7 +304,7 @@ paddle.fluid.layers.reciprocal ArgSpec(args=['x', 'name'], varargs=None, keyword
paddle.fluid.layers.square ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.softplus ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.softsign ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(
None, None, None, None
))
paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(
'float32', -1.0, 1.0, 0
))
paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
...
...
paddle/fluid/framework/data_layout_transform.cc
浏览文件 @
a8c4324d
...
...
@@ -134,11 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
out_layout
=
out_layout
==
DataLayout
::
kAnyLayout
?
DataLayout
::
kNCHW
:
out_layout
;
auto
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
dynamic_cast
<
platform
::
MKLDNNDeviceContext
*>
(
pool
.
Get
(
expected_kernel_type
.
place_
));
auto
&
cpu_engine
=
dev_ctx
->
GetEngine
();
std
::
vector
<
int
>
in_tz
=
paddle
::
framework
::
vectorize2int
(
in
.
dims
());
std
::
vector
<
int
>
out_tz
=
in_tz
;
...
...
@@ -147,29 +142,25 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: %s"
,
in
.
type
());
memory
::
data_type
out_type
=
in_type
;
auto
in_format
=
platform
::
MKLDNNFormatForSize
(
in_tz
.
size
(),
in
.
format
());
auto
out_format
=
platform
::
MKLDNNFormatForSize
(
in_tz
.
size
(),
ToMKLDNNFormat
(
out_layout
));
// output tensor has the same dims as input. Reorder don't change dims
out
->
Resize
(
in
.
dims
());
if
(
in_format
!=
out_format
)
{
// tempory mem pd fr out , to make reorder
auto
out_mem_pd
=
paddle
::
platform
::
create_prim_desc_from_dims
(
paddle
::
framework
::
vectorize2int
(
out
->
dims
()),
mkldnn
::
memory
::
format
::
blocked
,
out_type
);
if
(
in
.
get_mkldnn_prim_desc
()
!=
out_mem_pd
)
{
void
*
in_data
=
GetDataFromTensor
(
in
,
in_type
);
auto
out_data
=
out
->
mutable_data
(
expected_kernel_type
.
place_
,
in
.
type
());
auto
in_memory
=
memory
({{{
in_tz
},
in_type
,
in_format
},
cpu_engine
},
in_data
);
auto
out_memory
=
memory
({{{
out_tz
},
out_type
,
out_format
},
cpu_engine
},
out_data
);
auto
in_memory
=
memory
(
in
.
get_mkldnn_prim_desc
(),
in_data
);
auto
out_memory
=
memory
(
out_mem_pd
,
out_data
);
platform
::
Reorder
(
in_memory
,
out_memory
);
}
else
{
out
->
ShareDataWith
(
in
);
}
out
->
set_layout
(
out_layout
);
// reset format since the out tensor will be feed to non-MKLDNN OPkernel
out
->
set_format
(
memory
::
format
::
format_undef
);
#endif
}
...
...
paddle/fluid/framework/data_transform.cc
浏览文件 @
a8c4324d
...
...
@@ -51,13 +51,31 @@ void TransformData(const OpKernelType &expected_kernel_type,
#ifdef PADDLE_WITH_MKLDNN
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur
out
.
ShareDataWith
(
input_tensor
);
// TODO(jczaja): Remove that once all mkldnn ops
// are modified to work with mkldnn_blocked
auto
mkldnn_fmt
=
[
&
](
int
rank
)
{
switch
(
rank
)
{
case
5
:
return
mkldnn
::
memory
::
format
::
ncdhw
;
case
4
:
return
mkldnn
::
memory
::
format
::
nchw
;
case
3
:
return
mkldnn
::
memory
::
format
::
ncw
;
case
2
:
return
mkldnn
::
memory
::
format
::
nc
;
case
1
:
return
mkldnn
::
memory
::
format
::
x
;
default:
return
mkldnn
::
memory
::
format
::
blocked
;
}
};
auto
out_format
=
platform
::
MKLDNNFormatForSize
(
in
.
dims
().
size
(),
ToMKLDNNFormat
(
lin
));
auto
out_mem_pd
=
paddle
::
platform
::
create_prim_desc_from_dims
(
paddle
::
framework
::
vectorize2int
(
out
.
dims
()),
mkldnn_fmt
(
out
.
dims
().
size
()));
out
.
ShareDataWith
(
input_tensor
);
out
.
set_layout
(
DataLayout
::
kMKLDNN
);
out
.
set_format
(
out_format
);
out
.
set_mkldnn_prim_desc
(
out_mem_pd
);
#endif
}
else
{
// Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
...
...
paddle/fluid/framework/details/build_strategy.cc
浏览文件 @
a8c4324d
...
...
@@ -135,12 +135,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
void
AppendMultiDevPass
(
const
BuildStrategy
&
strategy
)
{
ir
::
Pass
*
multi_devices_pass
;
if
(
strategy_
.
is_distribution_
)
{
VLOG
(
3
)
<<
"multi device parameter server mode"
;
multi_devices_pass
=
AppendPass
(
"dist_multi_devices_pass"
).
get
();
}
else
{
if
(
strategy
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kAllReduce
)
{
VLOG
(
3
)
<<
"multi devices collective mode with allreduce"
;
multi_devices_pass
=
AppendPass
(
"allreduce_mode_multi_devices_pass"
).
get
();
}
else
if
(
strategy
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kReduce
)
{
VLOG
(
3
)
<<
"multi deivces collective mode with reduce"
;
multi_devices_pass
=
AppendPass
(
"reduce_mode_multi_devices_pass"
).
get
();
}
else
{
PADDLE_THROW
(
"Unknown reduce strategy."
);
...
...
paddle/fluid/framework/details/multi_devices_graph_pass.cc
浏览文件 @
a8c4324d
...
...
@@ -937,9 +937,21 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
}
void
DistSSAGraphBuilder
::
InsertPostprocessOps
(
ir
::
Graph
*
result
)
const
{
if
(
need_broadcast_var_
||
(
UseGPU
()
&&
strategy_
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kReduce
))
{
// broad cast received parameters when training in parameter server mode.
if
(
need_broadcast_var_
)
{
// There are 4 conditions:
// 1. GPU && Reduce: Reduce gradient then broadcast gradient to other GPUS.
// Need to broadcast received parameters to other GPU.
// 2. GPU && AllReduce: AllReduce all graident to each GPU. Need to
// broadcast received parameters to other GPU.
// 3. CPU && AllReduce: AllReduce all gradient to each thread. Need to
// broadcast received parameters to other scope.
// 4. CPU && Reduce: because all parameters share the same memory, did not
// broadcast received parameters.
if
(
!
UseGPU
()
&&
strategy_
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kReduce
)
{
return
;
}
if
(
strategy_
.
fuse_broadcast_op_
)
{
CreateFusedBroadcastOp
(
result
,
bcast_var_name_set_
);
}
else
{
...
...
paddle/fluid/framework/operator.cc
浏览文件 @
a8c4324d
...
...
@@ -904,6 +904,16 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
this
->
InferShape
(
&
infer_shape_ctx
);
}
std
::
vector
<
KernelConfig
>*
OperatorWithKernel
::
GetKernelConfig
(
const
OpKernelType
&
key
)
const
{
auto
config_iter
=
kernel_configs_map_
.
find
(
key
);
std
::
vector
<
KernelConfig
>*
kernel_configs
=
nullptr
;
if
(
config_iter
!=
kernel_configs_map_
.
end
())
{
kernel_configs
=
&
(
config_iter
->
second
);
}
return
kernel_configs
;
}
void
OperatorWithKernel
::
RunImpl
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
{
RuntimeContext
ctx
(
Inputs
(),
Outputs
(),
scope
);
...
...
@@ -921,7 +931,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
OpKernelMap
&
kernels
=
kernels_iter
->
second
;
auto
expected_kernel_key
=
this
->
GetExpectedKernelType
(
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
,
ctx
));
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
,
ctx
,
nullptr
));
VLOG
(
3
)
<<
"expected_kernel_key:"
<<
expected_kernel_key
;
auto
kernel_iter
=
kernels
.
find
(
expected_kernel_key
);
...
...
@@ -940,6 +950,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
KernelTypeToString
(
expected_kernel_key
));
}
std
::
vector
<
KernelConfig
>*
kernel_configs
=
GetKernelConfig
(
expected_kernel_key
);
// do data transformScope &transfer_scope;
std
::
vector
<
std
::
string
>
transfered_inplace_vars
;
auto
*
transfer_scope
=
...
...
@@ -957,7 +970,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
this
->
InferShape
(
&
infer_shape_ctx
);
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs.
kernel_iter
->
second
(
ExecutionContext
(
*
this
,
exec_scope
,
*
dev_ctx
,
ctx
));
kernel_iter
->
second
(
ExecutionContext
(
*
this
,
exec_scope
,
*
dev_ctx
,
ctx
,
kernel_configs
));
if
(
!
transfered_inplace_vars
.
empty
())
{
// there is inplace variable has been transfered.
...
...
paddle/fluid/framework/operator.h
浏览文件 @
a8c4324d
...
...
@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
...
...
@@ -184,12 +185,30 @@ class OperatorBase {
const
platform
::
Place
&
place
)
const
=
0
;
};
#ifdef PADDLE_WITH_CUDA
using
KernelConfig
=
boost
::
variant
<
std
::
shared_ptr
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
,
std
::
shared_ptr
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
,
std
::
shared_ptr
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>>
;
#else
using
KernelConfig
=
boost
::
variant
<
boost
::
blank
>
;
#endif
using
OpKernelConfigsMap
=
std
::
unordered_map
<
OpKernelType
,
std
::
vector
<
KernelConfig
>
,
OpKernelType
::
Hash
>
;
class
ExecutionContext
{
public:
ExecutionContext
(
const
OperatorBase
&
op
,
const
Scope
&
scope
,
const
platform
::
DeviceContext
&
device_context
,
const
RuntimeContext
&
ctx
)
:
op_
(
op
),
scope_
(
scope
),
device_context_
(
device_context
),
ctx_
(
ctx
)
{}
const
RuntimeContext
&
ctx
,
std
::
vector
<
KernelConfig
>*
configs
)
:
op_
(
op
),
scope_
(
scope
),
device_context_
(
device_context
),
ctx_
(
ctx
),
kernel_configs_
(
configs
)
{}
const
OperatorBase
&
op
()
const
{
return
op_
;
}
...
...
@@ -398,11 +417,20 @@ class ExecutionContext {
return
temp_tensor
;
}
template
<
typename
T
>
T
&
GetKernelConfig
(
int
idx
)
const
{
PADDLE_ENFORCE
(
kernel_configs_
&&
kernel_configs_
->
size
()
>
idx
,
"%s selected kernel doesn't have kernel config %lu <= %d"
,
op_
.
Type
().
c_str
(),
kernel_configs_
->
size
(),
idx
);
return
*
boost
::
get
<
std
::
shared_ptr
<
T
>>
(
kernel_configs_
->
at
(
idx
));
}
private:
const
OperatorBase
&
op_
;
const
Scope
&
scope_
;
const
platform
::
DeviceContext
&
device_context_
;
const
RuntimeContext
&
ctx_
;
mutable
std
::
vector
<
KernelConfig
>*
kernel_configs_
;
};
template
<>
...
...
@@ -483,6 +511,8 @@ class OperatorWithKernel : public OperatorBase {
virtual
OpKernelType
GetExpectedKernelType
(
const
ExecutionContext
&
ctx
)
const
;
std
::
vector
<
KernelConfig
>*
GetKernelConfig
(
const
OpKernelType
&
key
)
const
;
protected:
virtual
OpKernelType
GetKernelTypeForVar
(
const
std
::
string
&
var_name
,
const
Tensor
&
tensor
,
...
...
@@ -508,6 +538,9 @@ class OperatorWithKernel : public OperatorBase {
void
TransferInplaceVarsBack
(
const
Scope
&
scope
,
const
std
::
vector
<
std
::
string
>&
inplace_vars
,
const
Scope
&
exec_scope
)
const
;
protected:
mutable
OpKernelConfigsMap
kernel_configs_map_
;
};
extern
bool
OpSupportGPU
(
const
std
::
string
&
op_type
);
...
...
paddle/fluid/framework/operator_kernel_configs.h
0 → 100644
浏览文件 @
a8c4324d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <unordered_map>
#include <vector>
namespace
paddle
{
namespace
framework
{
// Not thread-safe. Should be owned per-kernel.
template
<
typename
TAlgorithm
>
class
AlgorithmsCache
{
public:
AlgorithmsCache
()
:
search_times_
(
0
)
{
hash_
.
clear
();
}
// Caches the best algorithm for a given
// combination of tensor dimensions & compute data type.
TAlgorithm
GetAlgorithm
(
const
std
::
vector
<
int64_t
>&
dims1
,
const
std
::
vector
<
int64_t
>&
dims2
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
int
algorithmFlags
,
// can set for different data type
std
::
function
<
TAlgorithm
()
>
gen_func
);
TAlgorithm
GetAlgorithm
(
int64_t
area
,
int
search_times
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
);
private:
std
::
unordered_map
<
int64_t
,
TAlgorithm
>
hash_
;
int
search_times_
;
};
template
<
typename
TAlgorithm
>
TAlgorithm
framework
::
AlgorithmsCache
<
TAlgorithm
>::
GetAlgorithm
(
const
std
::
vector
<
int64_t
>&
dims1
,
const
std
::
vector
<
int64_t
>&
dims2
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
)
{
int64_t
seed
=
0
;
// Hash all of the inputs, use to try and look up a previously
// discovered algorithm, or fall back to generating a new one.
std
::
hash
<
int64_t
>
hashFn
;
// do hash like boost
// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
for
(
const
auto
num
:
dims1
)
{
seed
^=
hashFn
(
num
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
);
}
for
(
const
auto
num
:
dims2
)
{
seed
^=
hashFn
(
num
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
1
;
}
for
(
const
auto
num
:
strides
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
2
;
}
for
(
const
auto
num
:
paddings
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
3
;
}
for
(
const
auto
num
:
dilations
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
4
;
}
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
algorithmFlags
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
5
;
if
(
seed
==
0
)
return
gen_func
();
if
(
hash_
.
find
(
seed
)
==
hash_
.
end
())
{
TAlgorithm
value
=
gen_func
();
hash_
[
seed
]
=
value
;
}
return
hash_
[
seed
];
}
template
<
typename
TAlgorithm
>
TAlgorithm
AlgorithmsCache
<
TAlgorithm
>::
GetAlgorithm
(
int64_t
area
,
int
search_times
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
)
{
if
(
hash_
.
find
(
area
)
!=
hash_
.
end
())
{
return
hash_
[
area
];
}
if
(
search_times_
<
search_times
)
{
auto
algo
=
gen_func
();
hash_
[
area
]
=
algo
;
++
search_times_
;
return
algo
;
}
TAlgorithm
algo
;
int64_t
min
=
static_cast
<
uint64_t
>
(
INT_MAX
);
for
(
const
auto
&
m
:
hash_
)
{
if
(
m
.
first
<
min
)
{
min
=
m
.
first
;
algo
=
m
.
second
;
}
}
return
algo
;
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/tensor.h
浏览文件 @
a8c4324d
...
...
@@ -27,6 +27,10 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_utils.h"
#endif
namespace
paddle
{
namespace
framework
{
...
...
@@ -37,10 +41,34 @@ class Tensor {
#ifdef PADDLE_WITH_MKLDNN
public:
inline
mkldnn
::
memory
::
format
format
()
const
{
return
format_
;
}
// TODO(jczaja): This is depracted and will be removed
inline
mkldnn
::
memory
::
format
format
()
const
{
if
(
layout_
==
DataLayout
::
kMKLDNN
)
{
return
static_cast
<
mkldnn
::
memory
::
format
>
(
mem_pd_
.
desc
().
data
.
format
);
}
else
{
return
mkldnn
::
memory
::
format
::
format_undef
;
}
}
inline
void
set_format
(
const
mkldnn
::
memory
::
format
format
)
{
format_
=
format
;
// TODO(jczaja): This is depracted and will be removed
inline
void
set_format
(
const
mkldnn
::
memory
::
format
fmt
,
mkldnn
::
memory
::
data_type
data_type
=
mkldnn
::
memory
::
f32
)
{
mem_pd_
=
paddle
::
platform
::
create_prim_desc_from_format
(
paddle
::
framework
::
vectorize2int
(
dims
()),
fmt
,
data_type
);
layout_
=
DataLayout
::
kMKLDNN
;
}
inline
mkldnn
::
memory
::
primitive_desc
get_mkldnn_prim_desc
()
const
{
return
mem_pd_
;
}
inline
void
set_mkldnn_prim_desc
(
const
mkldnn
::
memory
::
primitive_desc
&
mem_pd
)
{
// Internally MKL-DNN is just copying (increasing reference counter)
// to shared_ptr. So asignment should be quite cheap
mem_pd_
=
mem_pd
;
layout_
=
DataLayout
::
kMKLDNN
;
}
protected:
...
...
@@ -48,12 +76,9 @@ class Tensor {
* @brief the detail format of memory block which have layout as kMKLDNN
*
* @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
* nChw16c, etc. For a MKLDNN memory block, layout will be set as
* DataLayout::kMKLDNN meanwhile detail memory format will be kept in
* this field.
* nChw16c, etc. For a MKLDNN memory block, we store memory descriptor
*/
mkldnn
::
memory
::
format
format_
=
mkldnn
::
memory
::
format
::
format_undef
;
mutable
mkldnn
::
memory
::
primitive_desc
mem_pd_
;
#endif
public:
...
...
paddle/fluid/framework/var_type_traits.h
浏览文件 @
a8c4324d
...
...
@@ -50,8 +50,6 @@ class Scope;
}
// namespace framework
namespace
operators
{
template
<
typename
T
>
class
AlgorithmsCache
;
class
CudnnRNNCache
;
...
...
@@ -144,9 +142,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
#ifndef _WIN32
ncclUniqueId
,
platform
::
Communicator
,
#endif
operators
::
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>
,
operators
::
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>
,
operators
::
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>
,
operators
::
CudnnRNNCache
,
#endif
int
,
float
>
;
...
...
paddle/fluid/imperative/layer.cc
浏览文件 @
a8c4324d
...
...
@@ -249,7 +249,8 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
framework
::
Scope
scope
;
PreparedOp
p
=
PreparedOp
::
Prepare
(
ctx
,
*
op_kernel
,
place_
);
p
.
op
.
RuntimeInferShape
(
scope
,
place_
,
ctx
);
p
.
func
(
framework
::
ExecutionContext
(
p
.
op
,
scope
,
*
p
.
dev_ctx
,
p
.
ctx
));
p
.
func
(
framework
::
ExecutionContext
(
p
.
op
,
scope
,
*
p
.
dev_ctx
,
p
.
ctx
,
nullptr
));
}
}
...
...
paddle/fluid/imperative/layer.h
浏览文件 @
a8c4324d
...
...
@@ -44,8 +44,13 @@ class PreparedOp {
PreparedOp
(
const
framework
::
OperatorBase
&
op
,
const
framework
::
RuntimeContext
&
ctx
,
framework
::
OperatorWithKernel
::
OpKernelFunc
func
,
platform
::
DeviceContext
*
dev_ctx
)
:
op
(
op
),
ctx
(
ctx
),
func
(
func
),
dev_ctx
(
dev_ctx
)
{}
platform
::
DeviceContext
*
dev_ctx
,
std
::
vector
<
framework
::
KernelConfig
>*
kernel_configs
)
:
op
(
op
),
ctx
(
ctx
),
func
(
func
),
dev_ctx
(
dev_ctx
),
kernel_configs
(
kernel_configs
)
{}
static
PreparedOp
Prepare
(
const
framework
::
RuntimeContext
&
ctx
,
const
framework
::
OperatorWithKernel
&
op
,
...
...
@@ -64,8 +69,9 @@ class PreparedOp {
framework
::
OperatorWithKernel
::
OpKernelMap
&
kernels
=
kernels_iter
->
second
;
auto
expected_kernel_key
=
op
.
GetExpectedKernelType
(
framework
::
ExecutionContext
(
op
,
framework
::
Scope
(),
*
dev_ctx
,
ctx
));
auto
expected_kernel_key
=
op
.
GetExpectedKernelType
(
framework
::
ExecutionContext
(
op
,
framework
::
Scope
(),
*
dev_ctx
,
ctx
,
nullptr
));
VLOG
(
3
)
<<
"expected_kernel_key:"
<<
expected_kernel_key
;
auto
kernel_iter
=
kernels
.
find
(
expected_kernel_key
);
...
...
@@ -83,7 +89,9 @@ class PreparedOp {
PADDLE_THROW
(
"op %s does not have kernel for %s"
,
op
.
Type
(),
KernelTypeToString
(
expected_kernel_key
));
}
return
PreparedOp
(
op
,
ctx
,
kernel_iter
->
second
,
dev_ctx
);
std
::
vector
<
framework
::
KernelConfig
>*
kernel_configs
=
op
.
GetKernelConfig
(
expected_kernel_key
);
return
PreparedOp
(
op
,
ctx
,
kernel_iter
->
second
,
dev_ctx
,
kernel_configs
);
}
inline
platform
::
DeviceContext
*
GetDeviceContext
()
const
{
return
dev_ctx
;
}
...
...
@@ -92,6 +100,7 @@ class PreparedOp {
const
framework
::
RuntimeContext
&
ctx
;
framework
::
OperatorWithKernel
::
OpKernelFunc
func
;
platform
::
DeviceContext
*
dev_ctx
;
std
::
vector
<
framework
::
KernelConfig
>*
kernel_configs
;
};
class
OpBase
;
...
...
paddle/fluid/imperative/tracer.cc
浏览文件 @
a8c4324d
...
...
@@ -138,8 +138,9 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
op
->
place_
=
GetExpectedPlace
(
expected_place
,
inputs
);
PreparedOp
prepared_op
=
PreparedOp
::
Prepare
(
ctx
,
*
op_kernel
,
op
->
place_
);
prepared_op
.
op
.
RuntimeInferShape
(
scope
,
op
->
place_
,
ctx
);
prepared_op
.
func
(
framework
::
ExecutionContext
(
prepared_op
.
op
,
scope
,
*
prepared_op
.
dev_ctx
,
prepared_op
.
ctx
));
prepared_op
.
func
(
framework
::
ExecutionContext
(
prepared_op
.
op
,
scope
,
*
prepared_op
.
dev_ctx
,
prepared_op
.
ctx
,
prepared_op
.
kernel_configs
));
if
(
!
stop_gradient
)
{
std
::
unique_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
string
>>
grad_to_var
(
...
...
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
a8c4324d
...
...
@@ -66,7 +66,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
dynload_warpctc
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler
sample_prob
tree2col
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search
)
if
(
WITH_GPU
)
set
(
COMMON_OP_DEPS
${
COMMON_OP_DEPS
}
depthwise_conv prelu
)
...
...
paddle/fluid/operators/beam_search_decode_op.cc
浏览文件 @
a8c4324d
...
...
@@ -123,7 +123,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
framework
::
RuntimeContext
run_ctx
(
Inputs
(),
Outputs
(),
scope
);
framework
::
ExecutionContext
ctx
(
*
this
,
scope
,
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
*
this
,
scope
,
dev_ctx
,
run_ctx
,
nullptr
);
const
LoDTensorArray
*
ids
=
ctx
.
Input
<
LoDTensorArray
>
(
"Ids"
);
const
LoDTensorArray
*
scores
=
ctx
.
Input
<
LoDTensorArray
>
(
"Scores"
);
...
...
paddle/fluid/operators/beam_search_decode_op.h
浏览文件 @
a8c4324d
...
...
@@ -122,7 +122,7 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
auto
cpu_place
=
std
::
unique_ptr
<
paddle
::
platform
::
CPUPlace
>
(
new
paddle
::
platform
::
CPUPlace
());
paddle
::
platform
::
CPUDeviceContext
cpu_ctx
(
*
cpu_place
.
get
()
);
paddle
::
platform
::
CPUDeviceContext
cpu_ctx
(
*
cpu_place
);
framework
::
LoD
lod
;
lod
.
push_back
(
source_level_lod
);
...
...
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
a8c4324d
...
...
@@ -42,6 +42,7 @@ using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using
DataLayout
=
platform
::
DataLayout
;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
using
framework
::
AlgorithmsCache
;
template
<
typename
T
>
class
CUDNNConvOpKernel
:
public
framework
::
OpKernel
<
T
>
{
...
...
@@ -169,18 +170,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
workspace_size_limit
,
&
algo
));
VLOG
(
3
)
<<
"cuDNN forward algo "
<<
algo
;
}
else
if
(
exhaustive_search
&&
(
!
half_float
))
{
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>*
algo_cache
=
nullptr
;
if
(
ctx
.
scope
().
FindVar
(
kCUDNNFwdAlgoCache
))
{
algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
else
{
algo_cache
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>&
algo_cache
=
ctx
.
GetKernelConfig
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
(
0
);
cudnn_workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
platform
::
CUDADeviceContext
>
(
framework
::
make_ddim
(
...
...
@@ -188,7 +179,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
dev_ctx
);
cudnn_workspace_ptr
=
static_cast
<
void
*>
(
cudnn_workspace
.
data
<
int8_t
>
());
algo
=
algo_cache
->
GetAlgorithm
(
algo
=
algo_cache
.
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionFwdAlgoPerf_t
,
kNUM_CUDNN_FWD_ALGS
>
...
...
@@ -382,22 +373,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
if
(
input_grad
)
{
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
exhaustive_search
)
{
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>*
data_algo_cache
;
if
(
ctx
.
scope
().
FindVar
(
kCUDNNBwdDataAlgoCache
))
{
data_algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNBwdDataAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
();
}
else
{
data_algo_cache
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
(
kCUDNNBwdDataAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
();
}
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>&
data_algo_cache
=
ctx
.
GetKernelConfig
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
(
0
);
data_algo
=
data_algo_cache
->
GetAlgorithm
(
data_algo
=
data_algo_cache
.
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionBwdDataAlgoPerf_t
,
...
...
@@ -448,22 +428,11 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
if
(
filter_grad
)
{
T
*
filter_grad_data
=
filter_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
exhaustive_search
)
{
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>*
f_algo_cache
;
if
(
ctx
.
scope
().
FindVar
(
kCUDNNBwdFilterAlgoCache
))
{
f_algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNBwdFilterAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
();
}
else
{
f_algo_cache
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
(
kCUDNNBwdFilterAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
();
}
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>&
f_algo_cache
=
ctx
.
GetKernelConfig
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
(
1
);
filter_algo
=
f_algo_cache
->
GetAlgorithm
(
filter_algo
=
f_algo_cache
.
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionBwdFilterAlgoPerf_t
,
...
...
paddle/fluid/operators/conv_cudnn_op_cache.h
浏览文件 @
a8c4324d
...
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <functional>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/cudnn_helper.h"
DECLARE_uint64
(
conv_workspace_size_limit
);
...
...
@@ -46,100 +47,5 @@ static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
static
constexpr
size_t
kNUM_CUDNN_BWD_DATA_ALGS
=
5
;
#endif
template
<
typename
TAlgorithm
>
class
AlgorithmsCache
{
public:
AlgorithmsCache
()
:
search_times_
(
0
)
{
hash_
.
clear
();
}
// Caches the best algorithm for a given
// combination of tensor dimensions & compute data type.
TAlgorithm
GetAlgorithm
(
const
std
::
vector
<
int64_t
>&
dims1
,
const
std
::
vector
<
int64_t
>&
dims2
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
int
algorithmFlags
,
// can set for different data type
std
::
function
<
TAlgorithm
()
>
gen_func
);
TAlgorithm
GetAlgorithm
(
int64_t
area
,
int
search_times
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
);
private:
std
::
unordered_map
<
int64_t
,
TAlgorithm
>
hash_
;
std
::
mutex
mutex_
;
int
search_times_
;
};
template
<
typename
TAlgorithm
>
TAlgorithm
AlgorithmsCache
<
TAlgorithm
>::
GetAlgorithm
(
const
std
::
vector
<
int64_t
>&
dims1
,
const
std
::
vector
<
int64_t
>&
dims2
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
int64_t
seed
=
0
;
// Hash all of the inputs, use to try and look up a previously
// discovered algorithm, or fall back to generating a new one.
std
::
hash
<
int64_t
>
hashFn
;
// do hash like boost
// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
for
(
const
auto
num
:
dims1
)
{
seed
^=
hashFn
(
num
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
);
}
for
(
const
auto
num
:
dims2
)
{
seed
^=
hashFn
(
num
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
1
;
}
for
(
const
auto
num
:
strides
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
2
;
}
for
(
const
auto
num
:
paddings
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
3
;
}
for
(
const
auto
num
:
dilations
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
4
;
}
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
algorithmFlags
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
5
;
if
(
seed
==
0
)
return
gen_func
();
if
(
hash_
.
find
(
seed
)
==
hash_
.
end
())
{
TAlgorithm
value
=
gen_func
();
hash_
[
seed
]
=
value
;
}
return
hash_
[
seed
];
}
template
<
typename
TAlgorithm
>
TAlgorithm
AlgorithmsCache
<
TAlgorithm
>::
GetAlgorithm
(
int64_t
area
,
int
search_times
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
)
{
if
(
hash_
.
find
(
area
)
!=
hash_
.
end
())
{
return
hash_
[
area
];
}
if
(
search_times_
<
search_times
)
{
auto
algo
=
gen_func
();
hash_
[
area
]
=
algo
;
++
search_times_
;
return
algo
;
}
TAlgorithm
algo
;
int64_t
min
=
static_cast
<
uint64_t
>
(
INT_MAX
);
for
(
const
auto
&
m
:
hash_
)
{
if
(
m
.
first
<
min
)
{
min
=
m
.
first
;
algo
=
m
.
second
;
}
}
return
algo
;
}
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/conv_fusion_op.cu.cc
浏览文件 @
a8c4324d
...
...
@@ -30,6 +30,8 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using
ScopedConvolutionDescriptor
=
platform
::
ScopedConvolutionDescriptor
;
using
ScopedActivationDescriptor
=
platform
::
ScopedActivationDescriptor
;
using
DataLayout
=
platform
::
DataLayout
;
using
framework
::
AlgorithmsCache
;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
...
...
@@ -139,37 +141,20 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
}
return
fwd_perf_stat
[
0
].
algo
;
};
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>*
algo_cache
=
nullptr
;
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>&
algo_cache
=
ctx
.
GetKernelConfig
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
(
0
);
int
search_times
=
ctx
.
Attr
<
int
>
(
"search_times"
);
search_times
=
std
::
max
(
static_cast
<
int
>
(
FLAGS_cudnn_exhaustive_search_times
),
search_times
);
// TODO(dangqingqing): Unify this if-else.
if
(
search_times
>
0
)
{
// The searched algo will be cached by `search_times` times for
// different input dimension. For other dimensions, select the algo
// of closest area.
auto
var_name
=
ctx
.
Inputs
(
"AlgoCache"
)[
0
];
algo_cache
=
ctx
.
scope
()
.
FindVar
(
var_name
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
algo
=
algo_cache
->
GetAlgorithm
(
x_dims
[
2
]
*
x_dims
[
3
],
search_times
,
0
,
algo
=
algo_cache
.
GetAlgorithm
(
x_dims
[
2
]
*
x_dims
[
3
],
search_times
,
0
,
search_func
);
}
else
{
// Cache searched algo in Var(kCUDNNFwdAlgoCache).
// all conv ops use the same kCUDNNFwdAlgoCache variable.
if
(
ctx
.
scope
().
FindVar
(
kCUDNNFwdAlgoCache
))
{
algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
else
{
// TODO(qingqing) remove const_cast
algo_cache
=
const_cast
<
framework
::
Scope
*>
(
ctx
.
scope
().
parent
())
->
Var
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
algo
=
algo_cache
->
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
algo
=
algo_cache
.
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
search_func
);
}
VLOG
(
3
)
<<
"choose algo "
<<
algo
;
...
...
paddle/fluid/operators/conv_op.cc
浏览文件 @
a8c4324d
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
...
...
@@ -109,8 +110,20 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
"float16 can only be used when CUDNN is used"
);
}
return
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
(),
layout
,
auto
type
=
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
(),
layout
,
library
,
customized_type_value
);
#ifdef PADDLE_WITH_CUDA
std
::
vector
<
framework
::
KernelConfig
>&
configs
=
kernel_configs_map_
[
type
];
// TODO(dangqingqing): Currently conv_fusion_op use cudnn but sets use_cudnn
// to false. It should be fixed and then here should only create if library
// is kCUDNN.
if
(
configs
.
empty
())
{
std
::
shared_ptr
<
framework
::
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
p
(
new
framework
::
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>
());
configs
.
push_back
(
p
);
}
#endif
return
type
;
}
void
Conv2DOpMaker
::
Make
()
{
...
...
@@ -410,9 +423,25 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
}
#endif
return
framework
::
OpKernelType
(
ctx
.
Input
<
Tensor
>
(
"Input"
)
->
type
(),
auto
type
=
framework
::
OpKernelType
(
ctx
.
Input
<
Tensor
>
(
"Input"
)
->
type
(),
ctx
.
GetPlace
(),
layout_
,
library_
,
customized_type_value
);
#ifdef PADDLE_WITH_CUDA
if
(
library_
==
framework
::
LibraryType
::
kCUDNN
)
{
std
::
vector
<
framework
::
KernelConfig
>&
configs
=
kernel_configs_map_
[
type
];
if
(
configs
.
empty
())
{
std
::
shared_ptr
<
framework
::
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
p
(
new
framework
::
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>
());
configs
.
push_back
(
p
);
std
::
shared_ptr
<
framework
::
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
p2
(
new
framework
::
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>
());
configs
.
push_back
(
p2
);
}
}
#endif
return
type
;
}
class
Conv2dGradMaker
:
public
framework
::
SingleGradOpDescMaker
{
...
...
paddle/fluid/operators/detection/yolov3_loss_op.cc
浏览文件 @
a8c4324d
...
...
@@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
"The ignore threshold to ignore confidence loss."
)
.
SetDefault
(
0.7
);
AddComment
(
R"DOC(
This operator generate
yolov3 loss by
given predict result and ground
This operator generate
s yolov3 loss based on
given predict result and ground
truth boxes.
The output of previous network is in shape [N, C, H, W], while H and W
should be the same,
specify the grid size, each grid point predict given
number boxes, this given number is specified by anchors, it should be
half anchors length, which following will be represented as S. In the
second dimention(the channel dimention), C should be S * (class_num + 5),
c
lass_num is the box categoriy number of source dataset(such as coco),
s
o in the second dimention, stores 4 box location coordinates x, y, w, h
a
nd
confidence score of the box and class one-hot key of each anchor box.
should be the same,
H and W specify the grid size, each grid point predict
given number boxes, this given number, which following will be represented as S,
is specified by the number of anchors, In the second dimension(the channel
dimension), C should be equal to S * (class_num + 5), class_num is the object
c
ategory number of source dataset(such as 80 in coco dataset), so in the
s
econd(channel) dimension, apart from 4 box location coordinates x, y, w, h,
a
lso includes
confidence score of the box and class one-hot key of each anchor box.
While the 4 location coordinates if $$tx, ty, tw, th$$
, the box predictions
correspnd to
:
Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`
, the box predictions
should be as follows
:
$$
b_x = \sigma(t_x) + c_x
b_y = \sigma(t_y) + c_y
b_x = \\sigma(t_x) + c_x
$$
$$
b_y = \\sigma(t_y) + c_y
$$
$$
b_w = p_w e^{t_w}
$$
$$
b_h = p_h e^{t_h}
$$
While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$
is specified by anchors.
In the equation above, :math:`c_x, c_y` is the left top corner of current grid
and :math:`p_w, p_h`
is specified by anchors.
As for confidence score, it is the logistic regression value of IoU between
anchor boxes and ground truth boxes, the score of the anchor box which has
the max IoU should be 1, and if the anchor box has IoU bigger th
e
n ignore
the max IoU should be 1, and if the anchor box has IoU bigger th
a
n ignore
thresh, the confidence score loss of this anchor box will be ignored.
Therefore, the yolov3 loss consist of three major parts, box location loss,
...
...
@@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
In order to trade off box coordinate losses between big boxes and small
boxes, box coordinate losses will be mutiplied by scale weight, which is
calculated as follow.
calculated as follow
s
.
$$
weight_{box} = 2.0 - t_w * t_h
$$
Final loss will be represented as follow.
Final loss will be represented as follow
s
.
$$
loss = (loss_{xy} + loss_{wh}) * weight_{box}
...
...
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
a8c4324d
...
...
@@ -39,6 +39,7 @@ math_library(cross_entropy)
math_library
(
cos_sim_functor
)
math_library
(
depthwise_conv DEPS cub
)
math_library
(
im2col
)
math_library
(
sample_prob
)
math_library
(
sampler
)
math_library
(
gru_compute DEPS activation_functions math_function
)
...
...
paddle/fluid/operators/math/sample_prob.cc
0 → 100644
浏览文件 @
a8c4324d
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/sample_prob.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
template
class
SampleWithProb
<
platform
::
CPUDeviceContext
,
float
>;
template
class
SampleWithProb
<
platform
::
CPUDeviceContext
,
double
>;
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/sample_prob.cu
0 → 100644
浏览文件 @
a8c4324d
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <thrust/random.h>
#include <thrust/sort.h>
#include <iostream>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/sampler.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
using
Tensor
=
framework
::
Tensor
;
template
<
typename
T
>
__device__
T
gpu_adjust_prob
(
const
T
prob
,
const
int
num_samples
,
const
int
num_tries
)
{
if
(
num_samples
==
num_tries
)
{
return
prob
*
num_samples
;
}
else
{
return
-
expm1
(
num_tries
*
log1p
(
-
prob
));
}
}
class
GPULogUniformSampler
{
public:
__device__
int64_t
Sample
(
float
random
,
const
int
range
,
const
float
log_range
)
const
;
__device__
float
Probability
(
int64_t
value
,
const
float
log_range
)
const
;
};
__device__
int64_t
GPULogUniformSampler
::
Sample
(
float
random
,
const
int
range
,
const
float
log_range
)
const
{
// Got Log Uniform distribution from uniform distribution by
// inverse_transform_sampling method
const
int64_t
value
=
static_cast
<
int64_t
>
(
exp
(
random
*
log_range
))
-
1
;
// Mathematically, value should be <= range_, but might not be due to some
// floating point roundoff, so we mod by range_.
return
value
%
range
;
}
__device__
float
GPULogUniformSampler
::
Probability
(
int64_t
value
,
const
float
log_range
)
const
{
// Given f(x) = 1/[(x+1) * log_range_]
// The value's probability is integral of f(x) from value to (value + 1)
return
(
log
((
value
+
2.0
)
/
(
value
+
1.0
)))
/
log_range
;
}
template
<
typename
T
>
__global__
void
SamplingCondidate
(
const
size_t
n
,
const
int
num_tries
,
const
int
range
,
const
float
log_range
,
const
int
num_true
,
const
std
::
size_t
num_samples
,
const
int64_t
*
label_data
,
int64_t
*
samples_data
,
T
*
probabilities_data
)
{
const
int
num_sampled_classes
=
num_true
+
num_samples
;
int
idx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
step_size
=
0
;
GPULogUniformSampler
sampler
;
for
(;
idx
<
n
;
idx
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
col_idx
=
idx
%
num_sampled_classes
;
int
row_idx
=
idx
/
num_sampled_classes
;
if
(
col_idx
<
num_true
)
{
samples_data
[
idx
]
=
label_data
[
row_idx
*
num_true
+
col_idx
];
}
else
{
samples_data
[
idx
]
=
samples_data
[
col_idx
];
}
probabilities_data
[
idx
]
=
sampler
.
Probability
(
samples_data
[
idx
],
log_range
);
probabilities_data
[
idx
]
=
gpu_adjust_prob
(
probabilities_data
[
idx
],
num_samples
,
num_tries
);
}
}
template
<
typename
T
>
int
UniqSampler
(
const
Sampler
&
sampler
,
const
std
::
size_t
num_samples
,
int64_t
*
samples_data
)
{
// sample num_samles unique samples for an example, note that they are not
// all negative samples
std
::
unordered_set
<
int64_t
>
tmp_samples
;
tmp_samples
.
clear
();
int
num_tries
=
0
;
int
j
=
0
;
while
(
j
<
num_samples
)
{
++
num_tries
;
auto
v
=
sampler
.
Sample
();
auto
insert_ok
=
tmp_samples
.
insert
(
v
).
second
;
if
(
!
insert_ok
)
{
continue
;
}
samples_data
[
j
]
=
v
;
++
j
;
}
return
num_tries
;
}
template
<
typename
T
>
void
GPUSampleWithProb
<
T
>::
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
int
seed
,
const
int
dict_size
,
const
bool
uniq
,
const
std
::
size_t
num_samples
,
const
Tensor
*
L
,
Tensor
*
S
,
Tensor
*
P
)
{
// UNDERSTAND: dimension issues
const
auto
lbl_dim
=
L
->
dims
();
const
int
batch_size
=
lbl_dim
[
0
];
const
int
num_true
=
lbl_dim
[
1
];
const
int
num_sampled_classes
=
num_true
+
num_samples
;
framework
::
DDim
ret_dim
{
batch_size
,
num_sampled_classes
};
// UNDERSTAND: raw data view
const
int64_t
*
label_data
=
L
->
data
<
int64_t
>
();
int64_t
*
samples_data
=
S
->
data
<
int64_t
>
();
T
*
probabilities_data
=
P
->
data
<
T
>
();
int
s_size
=
num_samples
;
framework
::
DDim
s_dim
{
s_size
};
Tensor
s
;
int64_t
*
s_data
=
s
.
mutable_data
<
int64_t
>
(
s_dim
,
platform
::
CPUPlace
());
math
::
LogUniformSampler
sampler
(
dict_size
,
seed
);
int
range
=
dict_size
;
float
log_range
=
log
(
range
+
1
);
int
num_tries
=
UniqSampler
<
T
>
(
sampler
,
num_samples
,
s_data
);
VLOG
(
1
)
<<
"num_tries: "
<<
num_tries
;
PADDLE_ENFORCE
(
cudaMemcpy
(
samples_data
+
num_true
,
s_data
,
sizeof
(
int64_t
)
*
num_samples
,
cudaMemcpyHostToDevice
));
int
threads
=
512
;
const
size_t
size
=
batch_size
*
num_sampled_classes
;
int
grid
=
(
batch_size
*
num_sampled_classes
+
threads
-
1
)
/
threads
;
SamplingCondidate
<
T
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
size
,
num_tries
,
range
,
log_range
,
num_true
,
num_samples
,
label_data
,
samples_data
,
probabilities_data
);
}
template
class
GPUSampleWithProb
<
float
>;
template
class
GPUSampleWithProb
<
double
>;
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/sample_prob.h
0 → 100644
浏览文件 @
a8c4324d
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/sampler.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
using
Tensor
=
framework
::
Tensor
;
/* UNDERSTAND: utility function to adjust probability for unique sampling,
return whatever as it is if not using unique samping */
template
<
typename
T
>
static
T
adjust_prob
(
const
T
prob
,
const
int
num_samples
,
const
int
num_tries
)
{
if
(
num_samples
==
num_tries
)
{
return
prob
*
num_samples
;
}
else
{
return
-
expm1
(
num_tries
*
log1p
(
-
prob
));
}
}
template
<
typename
DeviceContext
,
typename
T
>
class
SampleWithProb
{
public:
void
operator
()(
const
DeviceContext
&
context
,
const
Sampler
&
sampler
,
const
std
::
size_t
num_samples
,
const
Tensor
*
L
,
Tensor
*
S
,
Tensor
*
P
)
{
// UNDERSTAND: dimension issues
const
auto
lbl_dim
=
L
->
dims
();
const
int
batch_size
=
lbl_dim
[
0
];
const
int
num_true
=
lbl_dim
[
1
];
const
int
num_sampled_classes
=
num_true
+
num_samples
;
framework
::
DDim
ret_dim
{
batch_size
,
num_sampled_classes
};
// UNDERSTAND: raw data view
const
int64_t
*
label_data
=
L
->
data
<
int64_t
>
();
int64_t
*
samples_data
=
S
->
mutable_data
<
int64_t
>
(
ret_dim
,
context
.
GetPlace
());
T
*
probabilities_data
=
P
->
mutable_data
<
T
>
(
ret_dim
,
context
.
GetPlace
());
// temp sets for unique sampling
std
::
unordered_set
<
int64_t
>
tmp_samples
;
int
j
=
0
;
// column index
// add true labels, not that efficient
while
(
j
<
num_true
)
{
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
samples_index
=
i
*
num_sampled_classes
+
j
;
auto
v
=
label_data
[
i
*
num_true
+
j
];
samples_data
[
samples_index
]
=
v
;
probabilities_data
[
samples_index
]
=
sampler
.
Probability
(
v
);
}
++
j
;
}
// sample num_samles unique samples for an example, note that they are not
// all negative samples
tmp_samples
.
clear
();
int
num_tries
=
0
;
while
(
j
<
num_sampled_classes
)
{
++
num_tries
;
auto
v
=
sampler
.
Sample
();
auto
insert_ok
=
tmp_samples
.
insert
(
v
).
second
;
if
(
!
insert_ok
)
{
continue
;
}
auto
p
=
sampler
.
Probability
(
v
);
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
samples_index
=
i
*
num_sampled_classes
+
j
;
samples_data
[
samples_index
]
=
v
;
probabilities_data
[
samples_index
]
=
p
;
}
++
j
;
}
// compute Q(y|x), because of unique sampling, probabilities need to be
// adjusted
for
(
int
k
=
0
;
k
<
num_sampled_classes
;
++
k
)
{
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
samples_index
=
i
*
num_sampled_classes
+
k
;
probabilities_data
[
samples_index
]
=
adjust_prob
(
probabilities_data
[
samples_index
],
num_samples
,
num_tries
);
}
}
}
};
#ifdef PADDLE_WITH_CUDA
template
<
typename
T
>
class
GPUSampleWithProb
{
public:
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
int
seed
,
const
int
dict_size
,
const
bool
uniq
,
const
std
::
size_t
num_samples
,
const
Tensor
*
L
,
Tensor
*
S
,
Tensor
*
P
);
};
#endif
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
浏览文件 @
a8c4324d
...
...
@@ -225,7 +225,7 @@ void eltwise_grad(const framework::ExecutionContext &ctx,
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx
.
GetBlob
(
key_src_mem
));
PADDLE_ENFORCE
(
src_memory
!=
nullptr
,
"Fail to find src_memory in device context"
);
src_memory
->
set_data_handle
(
*
p_src_data
.
get
()
);
src_memory
->
set_data_handle
(
*
p_src_data
);
std
::
shared_ptr
<
memory
>
diff_src_memory
;
...
...
paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
浏览文件 @
a8c4324d
...
...
@@ -96,12 +96,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto
*
bias
=
ctx
.
HasInput
(
"Bias"
)
?
ctx
.
Input
<
Tensor
>
(
"Bias"
)
:
nullptr
;
auto
*
output
=
ctx
.
Output
<
Tensor
>
(
"Output"
);
PADDLE_ENFORCE
(
input
->
layout
()
==
DataLayout
::
kMKLDNN
&&
input
->
format
()
!=
memory
::
format
::
format_undef
,
"Wrong layout/format set for Input tensor"
);
PADDLE_ENFORCE
(
filter
->
layout
()
==
DataLayout
::
kMKLDNN
&&
filter
->
format
()
!=
memory
::
format
::
format_undef
,
"Wrong layout/format set for Filter tensor"
);
PADDLE_ENFORCE
(
input
->
layout
()
==
DataLayout
::
kMKLDNN
);
PADDLE_ENFORCE
(
filter
->
layout
()
==
DataLayout
::
kMKLDNN
);
PADDLE_ENFORCE
(
input
->
dims
().
size
()
==
4
||
input
->
dims
().
size
()
==
5
,
"Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"
);
PADDLE_ENFORCE
(
filter
->
dims
().
size
()
==
4
||
filter
->
dims
().
size
()
==
5
,
...
...
@@ -148,14 +144,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std
::
vector
<
primitive
>
pipeline
;
auto
src_format
=
input
->
format
();
// For convolution with groups we need to recreate primitive descriptor
// as Paddle tensor is not having group dims while mkldnn treats
// group as another dimensions
mkldnn
::
memory
::
primitive_desc
user_weights_mpd
=
filter
->
get_mkldnn_prim_desc
();
if
(
g
>
1
)
{
mkldnn
::
memory
::
format
weights_format
=
GetWeightsFormat
(
filter
->
format
(),
g
,
is_conv3d
);
auto
user_src_md
=
platform
::
MKLDNNMemDesc
(
{
src_tz
},
platform
::
MKLDNNGetDataType
<
T
>
(),
src_format
);
auto
user_weights_md
=
platform
::
MKLDNNMemDesc
(
{
weights_tz
},
platform
::
MKLDNNGetDataType
<
T
>
(),
weights_format
);
user_weights_mpd
=
mkldnn
::
memory
::
primitive_desc
(
user_weights_md
,
mkldnn_engine
);
}
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
...
...
@@ -165,7 +166,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto
chosen_memory_format
=
platform
::
data_format_to_memory_format
(
data_format
);
weights_format
=
mkldnn
::
memory
::
format
::
any
;
mkldnn
::
memory
::
format
weights_format
=
mkldnn
::
memory
::
format
::
any
;
// Check the format for user's special output
if
(
chosen_memory_format
!=
mkldnn
::
memory
::
format
::
any
)
{
if
(
is_conv3d
)
{
...
...
@@ -205,10 +206,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
platform
::
ConvMKLDNNHandler
handler
(
conv_pd
,
dev_ctx
,
mkldnn_engine
,
key
);
// create mkldnn memory from input tensors (data/weights)
auto
user_src_memory_p
=
handler
.
AcquireSrcMemory
(
user_src_md
,
to_void_cast
<
T
>
(
input_data
));
auto
user_src_memory_p
=
handler
.
AcquireSrcMemory
(
input
->
get_mkldnn_prim_desc
()
,
to_void_cast
<
T
>
(
input_data
));
auto
user_weights_memory_p
=
handler
.
AcquireWeightsMemory
(
user_weights_md
,
to_void_cast
<
T
>
(
filter_data
));
user_weights_m
p
d
,
to_void_cast
<
T
>
(
filter_data
));
// create reorder primitive if the input format is not the preferred one
auto
src_memory_p
=
...
...
@@ -281,8 +282,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline
.
push_back
(
*
conv_p
);
stream
(
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
output
->
set_layout
(
DataLayout
::
kMKLDNN
);
output
->
set_
format
(
GetMKLDNNFormat
(
*
dst_memory_p
)
);
auto
dst_mpd
=
dst_memory_p
->
get_primitive_desc
(
);
output
->
set_
mkldnn_prim_desc
(
dst_mpd
);
}
void
ComputeINT8
(
const
paddle
::
framework
::
ExecutionContext
&
ctx
)
const
{
const
bool
is_test
=
ctx
.
Attr
<
bool
>
(
"is_test"
);
...
...
@@ -947,8 +948,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// push primitive to stream and wait until it's executed
pipeline
.
push_back
(
*
conv_bwd_weights_p
);
filter_grad
->
set_layout
(
DataLayout
::
kMKLDNN
);
filter_grad
->
set_
format
(
GetMKLDNNFormat
(
*
diff_weights_memory_p
)
);
auto
filter_grad_mpd
=
diff_weights_memory_p
->
get_primitive_desc
(
);
filter_grad
->
set_
mkldnn_prim_desc
(
filter_grad_mpd
);
}
if
(
input_grad
)
{
...
...
paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc
浏览文件 @
a8c4324d
...
...
@@ -42,8 +42,12 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
// The format of output is set as the mkldnn's format
// TODO(@mozga-intel) The format of matrix sets inside the another layers.
tensor
->
set_layout
(
DataLayout
::
kMKLDNN
);
tensor
->
set_format
(
mkldnn
::
memory
::
format
::
oihw
);
// TODO(jczaja): Remove this hack after checking performance on block layout
auto
tensor_mem_pd
=
paddle
::
platform
::
create_prim_desc_from_dims
(
paddle
::
framework
::
vectorize2int
(
tensor
->
dims
()),
mkldnn
::
memory
::
format
::
oihw
);
tensor
->
set_mkldnn_prim_desc
(
tensor_mem_pd
);
}
};
}
// namespace operators
...
...
paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
浏览文件 @
a8c4324d
...
...
@@ -198,7 +198,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
}
// push primitive to stream and wait until it's executed
std
::
vector
<
mkldnn
::
primitive
>
pipeline
{
*
(
pool_p
.
get
())
};
std
::
vector
<
mkldnn
::
primitive
>
pipeline
{
*
pool_p
};
stream
(
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
output
->
set_layout
(
DataLayout
::
kMKLDNN
);
...
...
@@ -367,8 +367,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
dev_ctx
.
SetBlob
(
key_pool_diff_dst_mem_p
,
diff_dst_memory
);
pool_bwd_p
=
std
::
make_shared
<
pooling_backward
>
(
pool_bwd_pd
,
*
(
diff_dst_memory
.
get
()),
*
workspace_memory
,
*
(
diff_src_memory
));
pool_bwd_pd
,
*
diff_dst_memory
,
*
workspace_memory
,
*
diff_src_memory
);
dev_ctx
.
SetBlob
(
key_pool_bwd_p
,
pool_bwd_p
);
}
else
{
...
...
@@ -404,7 +403,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
if
(
is_diff_dst_reordered
)
{
pipeline
.
push_back
(
reorder_diff_dst
);
}
pipeline
.
push_back
(
*
(
pool_bwd_p
.
get
())
);
pipeline
.
push_back
(
*
pool_bwd_p
);
mkldnn
::
stream
(
mkldnn
::
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
in_x_grad
->
set_layout
(
DataLayout
::
kMKLDNN
);
...
...
paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
浏览文件 @
a8c4324d
...
...
@@ -66,8 +66,7 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
"Fail to find softmax primitive in device context"
);
if
(
softmax_p
==
nullptr
)
{
softmax_p
=
std
::
make_shared
<
mkldnn
::
softmax_forward
>
(
*
(
softmax_pd_
.
get
()),
*
(
static_cast
<
mkldnn
::
memory
*>
(
src_memory_p
.
get
())),
*
softmax_pd_
,
*
(
static_cast
<
mkldnn
::
memory
*>
(
src_memory_p
.
get
())),
*
(
static_cast
<
mkldnn
::
memory
*>
(
dst_memory_p
.
get
())));
dev_ctx_
.
SetBlob
(
prim_key
,
softmax_p
);
}
else
{
...
...
@@ -88,8 +87,8 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
"Fail to find softmax backward primitive in device context"
);
if
(
softmax_bwd_p
==
nullptr
)
{
softmax_bwd_p
=
std
::
make_shared
<
mkldnn
::
softmax_backward
>
(
*
softmax_bwd_pd_
,
*
(
dst_memory_p
.
get
()),
*
(
diff_dst_memory_p
.
get
())
,
*
(
diff_src_memory_p
.
get
())
);
*
softmax_bwd_pd_
,
*
dst_memory_p
,
*
diff_dst_memory_p
,
*
diff_src_memory_p
);
dev_ctx_
.
SetBlob
(
prim_key
,
softmax_bwd_p
);
}
else
{
is_reusing_
=
true
;
...
...
paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc
浏览文件 @
a8c4324d
...
...
@@ -160,7 +160,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto
get_selected_row
=
[
&
](
size_t
i
)
->
const
SelectedRows
&
{
if
(
i
==
0
&&
in0
)
{
return
*
in0
.
get
()
;
return
*
in0
;
}
else
{
return
in_vars
[
i
]
->
Get
<
SelectedRows
>
();
}
...
...
paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
浏览文件 @
a8c4324d
...
...
@@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
mkldnn_engine
,
key
);
auto
transpose_src_memory_p
=
handler
.
AcquireSrcMemory
(
input
->
format
(),
platform
::
to_void_cast
<
T
>
(
input_data
));
input
->
get_mkldnn_prim_desc
(),
platform
::
to_void_cast
<
T
>
(
input_data
));
auto
transpose_dst_memory_p
=
handler
.
AcquireDstMemory
(
output
,
ctx
.
GetPlace
());
auto
transpose_p
=
handler
.
AcquireTranspose
(
transpose_dst_memory_p
,
...
...
@@ -61,6 +61,15 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std
::
vector
<
mkldnn
::
primitive
>
pipeline
;
pipeline
.
push_back
(
*
transpose_p
);
mkldnn
::
stream
(
mkldnn
::
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
// Transpose did change logical dimensions of Tensor, but reorder does not.
// Reorder does change only physical layout eg. format , strides
// so we need to create new primitive descriptor with changed logical layout
// so it match output shape
auto
output_mem_pd
=
paddle
::
platform
::
create_prim_desc_from_dims
(
paddle
::
framework
::
vectorize2int
(
output
->
dims
()),
mkldnn
::
memory
::
format
::
blocked
);
output
->
set_mkldnn_prim_desc
(
output_mem_pd
);
}
};
...
...
@@ -102,8 +111,9 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
platform
::
TransposeMKLDNNHandler
handler
(
nchw_tz
,
reversed_axis
,
dev_ctx
,
mkldnn_engine
,
key
);
auto
transpose_src_memory_p
=
handler
.
AcquireSrcMemory
(
out_grad
->
format
(),
platform
::
to_void_cast
<
T
>
(
out_grad_data
));
auto
transpose_src_memory_p
=
handler
.
AcquireSrcMemory
(
out_grad
->
get_mkldnn_prim_desc
(),
platform
::
to_void_cast
<
T
>
(
out_grad_data
));
auto
transpose_dst_memory_p
=
handler
.
AcquireDstMemory
(
x_grad
,
ctx
.
GetPlace
());
auto
transpose_p
=
handler
.
AcquireTranspose
(
transpose_dst_memory_p
,
...
...
@@ -112,6 +122,15 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
std
::
vector
<
mkldnn
::
primitive
>
pipeline
;
pipeline
.
push_back
(
*
transpose_p
);
mkldnn
::
stream
(
mkldnn
::
stream
::
kind
::
eager
).
submit
(
pipeline
).
wait
();
// Transpose did change logical dimensions of Tensor, but reorder does not.
// Reorder does change only physical layout eg. format , strides
// so we need to create new primitive descriptor with changed logical layout
// so it match output shape
auto
x_grad_mem_pd
=
paddle
::
platform
::
create_prim_desc_from_dims
(
paddle
::
framework
::
vectorize2int
(
x_grad
->
dims
()),
mkldnn
::
memory
::
format
::
blocked
);
x_grad
->
set_mkldnn_prim_desc
(
x_grad_mem_pd
);
}
};
...
...
paddle/fluid/operators/pool_op.cc
浏览文件 @
a8c4324d
...
...
@@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() {
"be ignored."
);
// TODO(Chengduo): Add checker.
// (Currently,
// TypedAttrChecker don't support vector type.)
AddAttr
<
bool
>
(
"global_pooling"
,
AddAttr
<
bool
>
(
"global_pooling"
,
"(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, k
size and paddings will be ignored."
)
"If global_pooling = true, kernel
size and paddings will be ignored."
)
.
SetDefault
(
false
);
AddAttr
<
std
::
vector
<
int
>>
(
"strides"
,
"(vector<int>, default {1, 1}), strides(height, "
...
...
@@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() {
"paddings"
,
"(vector<int>, default {0,0}), paddings(height, width) of pooling "
"operator."
"If global_pooling = true, paddings and ksize will be ignored."
)
"If global_pooling = true, paddings and k
ernel
size will be ignored."
)
.
SetDefault
({
0
,
0
});
AddAttr
<
bool
>
(
"exclusive"
,
...
...
@@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() {
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"ceil_mode"
,
"(bool, default false) Wether to use the ceil function to calculate "
"(bool, default false) W
h
ether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used."
)
.
SetDefault
(
false
);
...
...
@@ -262,28 +263,37 @@ Example:
For exclusive = false:
$$
hstart = i * strides[0] - paddings[0]
$$
$$
hend = hstart + ksize[0]
$$
$$
wstart = j * strides[1] - paddings[1]
$$
$$
wend = wstart + ksize[1]
$$
$$
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
$$
For exclusive = true:
$$
hstart = max(0, i * strides[0] - paddings[0])
$$
$$
hend = min(H, hstart + ksize[0])
$$
$$
wstart = max(0, j * strides[1] - paddings[1])
$$
$$
wend = min(W, wstart + ksize[1])
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$
For adaptive = true:
$$
hstart = floor(i * H_{in} / H_{out})
hend = ceil((i + 1) * H_{in} / H_{out})
wstart = floor(j * W_{in} / W_{out})
wend = ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$
)DOC"
);
}
...
...
@@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() {
AddAttr
<
bool
>
(
"global_pooling"
,
"(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, k
size and paddings wille
be ignored."
)
"If global_pooling = true, k
ernel size and paddings will
be ignored."
)
.
SetDefault
(
false
);
AddAttr
<
std
::
vector
<
int
>>
(
"strides"
,
...
...
@@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() {
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"ceil_mode"
,
"(bool, default false) Wether to use the ceil function to calculate "
"(bool, default false) W
h
ether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used."
)
.
SetDefault
(
false
);
...
...
@@ -393,45 +403,65 @@ Example:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
For ceil_mode = false:
$$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
$$
$$
H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[2]} + 1
$$
$$
W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
$$
For ceil_mode = true:
$$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1
$$
$$
H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1
$$
$$
W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
$$
For exclusive = false:
$$
dstart = i * strides[0] - paddings[0]
$$
$$
dend = dstart + ksize[0]
$$
$$
hstart = j * strides[1] - paddings[1]
$$
$$
hend = hstart + ksize[1]
$$
$$
wstart = k * strides[2] - paddings[2]
$$
$$
wend = wstart + ksize[2]
$$
$$
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
$$
For exclusive = true:
$$
dstart = max(0, i * strides[0] - paddings[0])
$$
$$
dend = min(D, dstart + ksize[0])
hstart = max(0, j * strides[1] - paddings[1])
$$
$$
hend = min(H, hstart + ksize[1])
$$
$$
wstart = max(0, k * strides[2] - paddings[2])
$$
$$
wend = min(W, wstart + ksize[2])
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
$$
For adaptive = true:
$$
dstart = floor(i * D_{in} / D_{out})
dend = ceil((i + 1) * D_{in} / D_{out})
hstart = floor(j * H_{in} / H_{out})
hend = ceil((j + 1) * H_{in} / H_{out})
wstart = floor(k * W_{in} / W_{out})
wend = ceil((k + 1) * W_{in} / W_{out})
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
$$
...
...
paddle/fluid/operators/sample_logits_op.cc
0 → 100644
浏览文件 @
a8c4324d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sample_logits_op.h"
#include "paddle/fluid/operators/math/sample_prob.h"
namespace
paddle
{
namespace
operators
{
class
SampleLogitsOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"Logits"
,
"(Tensor, default: Tensor<float>), The unscaled log probabilities "
"which is a 2-D tensor with shape [N x K]. N is the batch_size, "
"and K is the class number."
);
AddInput
(
"Labels"
,
"(Tensor) The ground truth which is a 2-D tensor. Labels is a "
"Tensor<int64> with shape [N x NT], where NT is the number of"
"true labels for each example."
);
AddInput
(
"CustomizedSamples"
,
"(Tensor, default: Tensor<int64_t>), A 2-D tensor with shape [N, "
"NT + S],"
" where N is the batch size, NT is the number of true labels "
"and S is the number of negtive sample for each example."
"The first NT elements of each row should be the same with true "
"labels, "
"followed by S custom negtive samples. This tensor"
"is only used when use_customized_samples is true."
)
.
AsDispensable
();
AddInput
(
"CustomizedProbabilities"
,
"(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]."
"The tensor has the same shape with CustomSamples,"
"and each element represents probability of element in CustomSamples. "
"This "
"tensor is only used when use_customized_samples is true."
)
.
AsDispensable
();
AddOutput
(
"Samples"
,
"(Tensor, default: Tensor<int64_t>), A 2-D tensor with shape [N, "
"NT + S]."
"The outputs value of sampler, including NT true lables and S "
"negetive samples "
"for each example. This will be used in"
"backward calculation."
)
.
AsIntermediate
();
AddOutput
(
"Probabilities"
,
"(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]."
"The probabilites of sampled positive and negtive labels."
)
.
AsIntermediate
();
AddOutput
(
"SampledLogits"
,
"(Tensor, default: Tensor<float>), A 2-D tensor with shape"
"[N, NT + S]. The outputs value of sampled logits, which will be"
"used in backward propagation."
)
.
AsIntermediate
();
AddOutput
(
"SampledLabels"
,
"(Tensor, default: Tensor<int64>), A 2-D tensor. The sampled labels"
"with shape [N, NT]. The tonsor contains hard labels as input to "
" softmax op, that is 0, 1, ..., NT-1 because of the first NT elements"
" of Sampels are positive lables."
);
AddAttr
<
bool
>
(
"use_customized_samples"
,
"An indicator whether to use customized samples with probabilities, if "
"True"
"the operator will use customized samples and customized probabilities"
"otherwise, the operator will generate them by itself."
)
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"uniq"
,
"An indicator whether to sample non-repetitive negtive labels, if True"
"the operator will sample negtive labels without replacement."
"Otherwise, the operator will sample negtive labels with replacement."
)
.
SetDefault
(
true
);
AddAttr
<
bool
>
(
"remove_accidental_hits"
,
"An indicator whether to remove accidental hits when samples hits true"
"labels, the removal is implemented by subtracting the corresponding"
"logits by float_max to subpress their softmax to be zero."
)
.
SetDefault
(
true
);
AddAttr
<
int
>
(
"num_samples"
,
"The number of negative samples."
);
AddAttr
<
int
>
(
"seed"
,
"Random seed for generating samples"
).
SetDefault
(
0
);
AddComment
(
R"DOC(
"""
Computes sampled output training logits and labels suitable for implementing
sampled softmax.
"""
)DOC"
);
}
};
class
SampleLogitsOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Logits"
),
"Input(Logits) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Labels"
),
"Input(Labels) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Samples"
),
"Output(Samples) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Probabilities"
),
"Output(Probabilities) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"SampledLogits"
),
"Output(SampledLogits) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"SampledLabels"
),
"Output(SampledLabels) should be not null."
);
auto
logits_dims
=
ctx
->
GetInputDim
(
"Logits"
);
auto
labels_dims
=
ctx
->
GetInputDim
(
"Labels"
);
PADDLE_ENFORCE_EQ
(
logits_dims
.
size
(),
2UL
,
"The logits of softmax_with_cross_entropy should be a 2-D tensor."
);
PADDLE_ENFORCE_EQ
(
labels_dims
.
size
(),
2UL
,
"The labels should be a 2-D tensor."
);
const
int
num_samples
=
ctx
->
Attrs
().
Get
<
int
>
(
"num_samples"
);
const
int
num_sampled_classes
=
labels_dims
[
1
]
+
num_samples
;
ctx
->
SetOutputDim
(
"Samples"
,
{
logits_dims
[
0
],
num_sampled_classes
});
ctx
->
SetOutputDim
(
"Probabilities"
,
{
logits_dims
[
0
],
num_sampled_classes
});
ctx
->
SetOutputDim
(
"SampledLogits"
,
{
logits_dims
[
0
],
num_sampled_classes
});
ctx
->
SetOutputDim
(
"SampledLabels"
,
{
logits_dims
[
0
],
labels_dims
[
1
]});
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
"Logits"
));
framework
::
OpKernelType
kt
=
framework
::
OpKernelType
(
data_type
,
ctx
.
device_context
());
return
kt
;
}
};
// UNDERSTAND: InferShape for Grad
class
SampleLogitsOpGrad
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Logits"
),
"Input(Logits) should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Labels"
),
"Input(Labels) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Samples"
),
"Input(Samples) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"SampledLogits"
),
"Input(SampledLogits) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"SampledLogits"
)),
"Input(SampledLogits@Grad) should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Logits"
)),
"Output(Logits@Grad) should be not null."
);
auto
logit_dims
=
ctx
->
GetInputDim
(
"Logits"
);
auto
label_dims
=
ctx
->
GetInputDim
(
"Labels"
);
PADDLE_ENFORCE_EQ
(
label_dims
.
size
(),
2UL
,
"The label should be a 2-D tensor."
);
PADDLE_ENFORCE_EQ
(
logit_dims
.
size
(),
2UL
,
"The logits should be a 2-D tensor."
);
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Logits"
),
ctx
->
GetInputDim
(
"Logits"
));
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
data_type
=
framework
::
GetDataTypeOfVar
(
ctx
.
InputVar
(
framework
::
GradVarName
(
"SampledLogits"
)));
framework
::
OpKernelType
kt
=
framework
::
OpKernelType
(
data_type
,
ctx
.
device_context
());
return
kt
;
}
};
// UNDERSTAND: what's the rule for making a GradMaker TODO
class
SampleLogitsGradMaker
:
public
framework
::
SingleGradOpDescMaker
{
public:
using
framework
::
SingleGradOpDescMaker
::
SingleGradOpDescMaker
;
protected:
std
::
unique_ptr
<
framework
::
OpDesc
>
Apply
()
const
override
{
auto
*
grad_op
=
new
framework
::
OpDesc
();
grad_op
->
SetType
(
"sample_logits_grad"
);
grad_op
->
SetInput
(
"Logits"
,
Input
(
"Logits"
));
grad_op
->
SetInput
(
"Labels"
,
Input
(
"Labels"
));
grad_op
->
SetInput
(
"Samples"
,
Output
(
"Samples"
));
grad_op
->
SetInput
(
"SampledLogits"
,
Output
(
"SampledLogits"
));
grad_op
->
SetInput
(
framework
::
GradVarName
(
"SampledLogits"
),
OutputGrad
(
"SampledLogits"
));
grad_op
->
SetOutput
(
framework
::
GradVarName
(
"Logits"
),
InputGrad
(
"Logits"
));
grad_op
->
SetAttrMap
(
Attrs
());
return
std
::
unique_ptr
<
framework
::
OpDesc
>
(
grad_op
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
sample_logits
,
ops
::
SampleLogitsOp
,
ops
::
SampleLogitsOpMaker
,
ops
::
SampleLogitsGradMaker
);
REGISTER_OPERATOR
(
sample_logits_grad
,
ops
::
SampleLogitsOpGrad
);
REGISTER_OP_CPU_KERNEL
(
sample_logits
,
ops
::
SampleLogitsKernel
<
float
>
,
ops
::
SampleLogitsKernel
<
double
>
);
REGISTER_OP_CPU_KERNEL
(
sample_logits_grad
,
ops
::
SampleLogitsGradKernel
<
float
>
,
ops
::
SampleLogitsGradKernel
<
double
>
);
paddle/fluid/operators/sample_logits_op.cu
0 → 100644
浏览文件 @
a8c4324d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/sample_logits_op.h"
namespace
paddle
{
namespace
operators
{
// UNDERSTAND: something like take_along_axis in numpy.
template
<
typename
T
>
__global__
void
GPUTakeAlongD1
(
size_t
size
,
const
int
batch_size
,
const
int
array_slice_size
,
const
int
idx_slice_size
,
const
T
*
p_array
,
const
int64_t
*
p_index
,
T
*
p_value
)
{
const
auto
value_slice_size
=
idx_slice_size
;
int
idx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
step_size
=
blockDim
.
x
*
gridDim
.
x
;
for
(;
idx
<
size
;
idx
+=
step_size
)
{
int
i
=
idx
/
idx_slice_size
;
auto
array_index
=
p_index
[
idx
];
p_value
[
idx
]
=
p_array
[
i
*
array_slice_size
+
array_index
];
}
}
// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate
// indices, scatter is done in += way.
template
<
typename
T
>
__global__
void
GPUPutAlongD1
(
size_t
size
,
const
int
batch_size
,
const
int
array_slice_size
,
const
int
idx_slice_size
,
T
*
p_array
,
const
int64_t
*
p_index
,
const
T
*
p_value
)
{
const
auto
value_slice_size
=
idx_slice_size
;
int
idx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
step_size
=
blockDim
.
x
*
gridDim
.
x
;
// size == batch_size
for
(;
idx
<
size
;
idx
+=
step_size
)
{
int
i
=
idx
;
for
(
int
j
=
0
;
j
<
idx_slice_size
;
++
j
)
{
auto
array_index
=
p_index
[
i
*
idx_slice_size
+
j
];
p_array
[
i
*
array_slice_size
+
array_index
]
+=
p_value
[
i
*
idx_slice_size
+
j
];
}
}
}
// UNDERSTAND: set label as 0,1,...,num_true-1
template
<
typename
T
>
__global__
void
GPUSetLabel
(
size_t
size
,
const
int
num_true
,
int64_t
*
p_array
)
{
int
idx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
step_size
=
blockDim
.
x
*
gridDim
.
x
;
for
(;
idx
<
size
;
idx
+=
step_size
)
{
p_array
[
idx
]
=
idx
%
num_true
;
}
}
// UNDERSTAND: compute accidentdal hits from samples and minus corresponding
// logits by a float max, here 1e20
template
<
typename
T
>
__global__
void
gpu_compute_remove_accidental_hits
(
const
int
size
,
const
int
num_true
,
const
int
idx_slice_size
,
const
int64_t
*
p_index
,
T
*
p_value
)
{
const
auto
value_slice_size
=
idx_slice_size
;
int
idx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
step_size
=
blockDim
.
x
*
gridDim
.
x
;
for
(;
idx
<
size
;
idx
+=
step_size
)
{
int
i
=
idx
/
idx_slice_size
;
if
(
idx
%
idx_slice_size
<
num_true
)
continue
;
for
(
int
j
=
0
;
j
<
num_true
;
++
j
)
{
const
auto
true_idx
=
i
*
idx_slice_size
+
j
;
if
(
p_index
[
true_idx
]
==
p_index
[
idx
])
{
p_value
[
idx
]
-=
1e20
;
break
;
}
}
}
}
template
<
typename
T
>
class
SampleLogitsCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
using
Tensor
=
framework
::
Tensor
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
// get necessary inputs
const
Tensor
*
logits
=
context
.
Input
<
Tensor
>
(
"Logits"
);
const
Tensor
*
labels
=
context
.
Input
<
Tensor
>
(
"Labels"
);
VLOG
(
3
)
<<
"Enter SampleLogitsCUDAKernel"
;
// get necessary outputs
Tensor
*
samples
=
context
.
Output
<
Tensor
>
(
"Samples"
);
Tensor
*
probabilities
=
context
.
Output
<
Tensor
>
(
"Probabilities"
);
Tensor
*
sampled_logits
=
context
.
Output
<
Tensor
>
(
"SampledLogits"
);
Tensor
*
sampled_labels
=
context
.
Output
<
Tensor
>
(
"SampledLabels"
);
// shapes
const
auto
batch_size
=
logits
->
dims
()[
0
];
const
auto
num_classes
=
logits
->
dims
()[
1
];
const
auto
labels_dim
=
labels
->
dims
();
const
auto
num_true
=
labels_dim
[
1
];
const
auto
samples_dim
=
samples
->
dims
();
// attrs
const
auto
num_samples
=
context
.
Attr
<
int
>
(
"num_samples"
);
const
bool
use_customized_samples
=
context
.
Attr
<
bool
>
(
"use_customized_samples"
);
const
bool
uniq
=
context
.
Attr
<
bool
>
(
"uniq"
);
const
bool
remove_accidental_hits
=
context
.
Attr
<
bool
>
(
"remove_accidental_hits"
);
// device contexts
auto
&
dev_ctx
=
context
.
cuda_device_context
();
// UNDERSTAND: allocate memories for temporaries
sampled_logits
->
mutable_data
<
T
>
(
samples_dim
,
context
.
GetPlace
());
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
set_zero
;
set_zero
(
dev_ctx
,
sampled_logits
,
static_cast
<
T
>
(
0
));
auto
sampled_labels_data
=
sampled_labels
->
mutable_data
<
int64_t
>
(
labels_dim
,
context
.
GetPlace
());
int
threads
=
512
;
size_t
size
=
batch_size
*
num_true
;
int
grid
=
(
size
+
threads
-
1
)
/
threads
;
GPUSetLabel
<
T
><<<
grid
,
threads
,
0
,
context
.
cuda_device_context
().
stream
()
>>>
(
size
,
num_true
,
sampled_labels_data
);
if
(
use_customized_samples
)
{
const
Tensor
*
customized_samples
=
context
.
Input
<
Tensor
>
(
"CustomizedSamples"
);
const
Tensor
*
customized_probabilities
=
context
.
Input
<
Tensor
>
(
"CustomizedProbabilities"
);
samples
->
ShareDataWith
(
*
customized_samples
);
probabilities
->
ShareDataWith
(
*
customized_probabilities
);
}
else
{
samples
->
mutable_data
<
int64_t
>
(
context
.
GetPlace
());
probabilities
->
mutable_data
<
T
>
(
samples_dim
,
context
.
GetPlace
());
// UNDERSTAND: sampling
const
auto
seed
=
context
.
Attr
<
int
>
(
"seed"
);
auto
sampler_with_prob
=
math
::
GPUSampleWithProb
<
T
>
();
sampler_with_prob
(
context
.
cuda_device_context
(),
seed
,
num_classes
,
uniq
,
num_samples
,
labels
,
samples
,
probabilities
);
}
// UNDERSTAND: gather sampled logits and remove accidental hits if needed
const
auto
num_take
=
samples
->
dims
()[
1
];
const
auto
array_dims
=
logits
->
dims
();
const
auto
idx_dims
=
samples
->
dims
();
const
T
*
p_array
=
logits
->
data
<
T
>
();
const
int64_t
*
p_index
=
samples
->
data
<
int64_t
>
();
T
*
p_value
=
sampled_logits
->
data
<
T
>
();
// src slice size
const
auto
array_slice_size
=
array_dims
[
1
];
// index slice size
const
auto
idx_slice_size
=
idx_dims
[
1
];
size
=
batch_size
*
num_take
;
grid
=
(
size
+
threads
-
1
)
/
threads
;
GPUTakeAlongD1
<
T
><<<
grid
,
threads
,
0
,
context
.
cuda_device_context
().
stream
()
>>>
(
size
,
batch_size
,
array_slice_size
,
idx_slice_size
,
p_array
,
p_index
,
p_value
);
if
(
remove_accidental_hits
)
{
const
size_t
size
=
batch_size
*
(
num_true
+
num_samples
);
int
grid
=
(
size
+
threads
-
1
)
/
threads
;
gpu_compute_remove_accidental_hits
<
T
><<<
grid
,
threads
,
0
,
context
.
cuda_device_context
().
stream
()
>>>
(
size
,
num_true
,
idx_slice_size
,
p_index
,
p_value
);
}
// subtracted sampled logits with logQ(y|x)
auto
probs
=
EigenMatrix
<
T
>::
From
(
*
probabilities
);
auto
smp_logits
=
EigenMatrix
<
T
>::
From
(
*
sampled_logits
);
smp_logits
.
device
(
*
dev_ctx
.
eigen_device
())
=
(
smp_logits
-
probs
.
log
().
unaryExpr
(
TolerableValue
<
T
>
()))
.
unaryExpr
(
TolerableValue
<
T
>
());
}
};
template
<
typename
T
>
class
SampleLogitsGradCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
using
Tensor
=
framework
::
Tensor
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
logits_grad
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Logits"
));
const
Tensor
*
samples
=
context
.
Input
<
Tensor
>
(
"Samples"
);
const
Tensor
*
sampled_logits_grad
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"SampledLogits"
));
logits_grad
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
&
dev_ctx
=
context
.
cuda_device_context
();
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
set_zero
;
set_zero
(
dev_ctx
,
logits_grad
,
static_cast
<
T
>
(
0
));
// UNDERSTAND: scatter it back to logit_grad
const
auto
batch_size
=
samples
->
dims
()[
0
];
const
auto
num_put
=
samples
->
dims
()[
1
];
const
auto
array_dims
=
logits_grad
->
dims
();
const
auto
idx_dims
=
samples
->
dims
();
T
*
p_array
=
logits_grad
->
data
<
T
>
();
const
int64_t
*
p_index
=
samples
->
data
<
int64_t
>
();
const
T
*
p_value
=
sampled_logits_grad
->
data
<
T
>
();
// src slice size
const
auto
array_slice_size
=
array_dims
[
1
];
// index slice size
const
auto
idx_slice_size
=
idx_dims
[
1
];
int
threads
=
128
;
const
size_t
size
=
batch_size
;
int
grid
=
(
size
+
threads
-
1
)
/
threads
;
GPUPutAlongD1
<
T
><<<
grid
,
threads
,
0
,
context
.
cuda_device_context
().
stream
()
>>>
(
size
,
batch_size
,
array_slice_size
,
idx_slice_size
,
p_array
,
p_index
,
p_value
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
sample_logits
,
ops
::
SampleLogitsCUDAKernel
<
float
>
,
ops
::
SampleLogitsCUDAKernel
<
double
>
);
REGISTER_OP_CUDA_KERNEL
(
sample_logits_grad
,
ops
::
SampleLogitsGradCUDAKernel
<
float
>
,
ops
::
SampleLogitsGradCUDAKernel
<
double
>
);
paddle/fluid/operators/sample_logits_op.h
0 → 100644
浏览文件 @
a8c4324d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
template
<
typename
T
,
int
MajorType
=
Eigen
::
RowMajor
,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenMatrix
=
framework
::
EigenMatrix
<
T
,
MajorType
,
IndexType
>
;
template
<
typename
T
>
struct
TolerableValue
{
HOSTDEVICE
T
operator
()(
const
T
&
x
)
const
{
PADDLE_ASSERT
(
std
::
is_floating_point
<
T
>::
value
);
const
T
kApproInf
=
1e20
;
if
(
x
==
INFINITY
)
return
kApproInf
;
if
(
x
==
-
INFINITY
)
return
-
kApproInf
;
return
x
;
}
};
// UNDERSTAND: something like take_along_axis in numpy.
template
<
typename
T
>
static
void
CPUTakeAlongD1
(
const
platform
::
DeviceContext
&
ctx
,
const
framework
::
Tensor
&
array
,
const
framework
::
Tensor
&
index
,
framework
::
Tensor
*
value
)
{
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()));
// UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
PADDLE_ENFORCE
(
index
.
dims
().
size
()
==
2
&&
array
.
dims
().
size
()
==
2
&&
index
.
dims
()[
0
]
==
array
.
dims
()[
0
]
&&
index
.
dims
()
==
value
->
dims
());
const
auto
batch_size
=
index
.
dims
()[
0
];
const
auto
num_take
=
index
.
dims
()[
1
];
const
auto
array_dims
=
array
.
dims
();
const
auto
idx_dims
=
index
.
dims
();
// UNDERSTAND: no allocations here
const
T
*
p_array
=
array
.
data
<
T
>
();
const
int64_t
*
p_index
=
index
.
data
<
int64_t
>
();
T
*
p_value
=
value
->
data
<
T
>
();
// src slice size
const
auto
array_slice_size
=
array_dims
[
1
];
// index slice size
const
auto
idx_slice_size
=
idx_dims
[
1
];
const
auto
value_slice_size
=
idx_slice_size
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
int
j
=
0
;
j
<
num_take
;
++
j
)
{
auto
array_index
=
p_index
[
i
*
idx_slice_size
+
j
];
p_value
[
i
*
value_slice_size
+
j
]
=
p_array
[
i
*
array_slice_size
+
array_index
];
}
}
}
// UNDERSTAND: something like put_along_axis in numpy but if there is duplicate
// indices, scatter is done in += way.
template
<
typename
T
>
static
void
CPUPutAlongD1
(
const
platform
::
DeviceContext
&
ctx
,
framework
::
Tensor
*
array
,
const
framework
::
Tensor
&
index
,
const
framework
::
Tensor
&
value
)
{
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()));
// UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
PADDLE_ENFORCE
(
index
.
dims
().
size
()
==
2
&&
array
->
dims
().
size
()
==
2
&&
index
.
dims
()[
0
]
==
array
->
dims
()[
0
]
&&
index
.
dims
()
==
value
.
dims
());
const
auto
batch_size
=
index
.
dims
()[
0
];
const
auto
num_put
=
index
.
dims
()[
1
];
auto
array_dims
=
array
->
dims
();
auto
idx_dims
=
index
.
dims
();
// UNDERSTAND: no allocations here
T
*
p_array
=
array
->
data
<
T
>
();
const
int64_t
*
p_index
=
index
.
data
<
int64_t
>
();
const
T
*
p_value
=
value
.
data
<
T
>
();
// slice sizes
const
auto
array_slice_size
=
array_dims
[
1
];
const
auto
idx_slice_size
=
idx_dims
[
1
];
const
auto
value_slice_size
=
idx_slice_size
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
int
j
=
0
;
j
<
num_put
;
++
j
)
{
auto
array_index
=
p_index
[
i
*
idx_slice_size
+
j
];
p_array
[
i
*
array_slice_size
+
array_index
]
+=
p_value
[
i
*
value_slice_size
+
j
];
}
}
}
// UNDERSTAND: compute accidentdal hits from samples and minus corresponding
// logits by a float max, here 1e20
template
<
typename
T
>
static
void
compute_remove_accidental_hits
(
const
platform
::
DeviceContext
&
ctx
,
framework
::
Tensor
*
sampled_logits
,
const
framework
::
Tensor
&
samples
,
const
int
num_true
)
{
const
auto
batch_size
=
sampled_logits
->
dims
()[
0
];
const
auto
num_sampled_classes
=
sampled_logits
->
dims
()[
1
];
T
*
sampled_logits_data
=
sampled_logits
->
data
<
T
>
();
const
auto
samples_data
=
samples
.
data
<
int64_t
>
();
std
::
unordered_set
<
int64_t
>
tmp_true_labels
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
tmp_true_labels
.
clear
();
tmp_true_labels
.
insert
(
samples_data
+
i
*
num_sampled_classes
,
samples_data
+
i
*
num_sampled_classes
+
num_true
);
for
(
int
j
=
num_true
;
j
<
num_sampled_classes
;
++
j
)
{
const
auto
idx
=
i
*
num_sampled_classes
+
j
;
if
(
tmp_true_labels
.
find
(
samples_data
[
idx
])
!=
tmp_true_labels
.
end
())
sampled_logits_data
[
idx
]
-=
1e20
;
}
}
}
template
<
typename
T
>
class
SampleLogitsKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
using
Tensor
=
framework
::
Tensor
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
context
.
GetPlace
()),
"This kernel only runs on CPU."
);
VLOG
(
3
)
<<
"Enter SampleLogitsKernel"
;
// get necessary inputs
const
Tensor
*
logits
=
context
.
Input
<
Tensor
>
(
"Logits"
);
const
Tensor
*
labels
=
context
.
Input
<
Tensor
>
(
"Labels"
);
// get necessary outputs
Tensor
*
samples
=
context
.
Output
<
Tensor
>
(
"Samples"
);
Tensor
*
probabilities
=
context
.
Output
<
Tensor
>
(
"Probabilities"
);
Tensor
*
sampled_logits
=
context
.
Output
<
Tensor
>
(
"SampledLogits"
);
Tensor
*
sampled_labels
=
context
.
Output
<
Tensor
>
(
"SampledLabels"
);
// shapes
const
auto
batch_size
=
logits
->
dims
()[
0
];
const
auto
num_classes
=
logits
->
dims
()[
1
];
const
auto
labels_dim
=
labels
->
dims
();
const
auto
num_true
=
labels_dim
[
1
];
const
auto
samples_dim
=
samples
->
dims
();
// attrs
const
auto
num_samples
=
context
.
Attr
<
int
>
(
"num_samples"
);
const
bool
use_customized_samples
=
context
.
Attr
<
bool
>
(
"use_customized_samples"
);
const
bool
remove_accidental_hits
=
context
.
Attr
<
bool
>
(
"remove_accidental_hits"
);
// device contexts
auto
&
dev_ctx
=
context
.
template
device_context
<
platform
::
CPUDeviceContext
>();
// UNDERSTAND: allocate memories for temporaries
sampled_logits
->
mutable_data
<
T
>
(
samples_dim
,
context
.
GetPlace
());
auto
sampled_labels_data
=
sampled_labels
->
mutable_data
<
int64_t
>
(
labels_dim
,
context
.
GetPlace
());
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
int
j
=
0
;
j
<
num_true
;
++
j
)
{
sampled_labels_data
[
i
*
num_true
+
j
]
=
j
;
}
}
if
(
use_customized_samples
)
{
const
Tensor
*
customized_samples
=
context
.
Input
<
Tensor
>
(
"CustomizedSamples"
);
const
Tensor
*
customized_probabilities
=
context
.
Input
<
Tensor
>
(
"CustomizedProbabilities"
);
samples
->
ShareDataWith
(
*
customized_samples
);
probabilities
->
ShareDataWith
(
*
customized_probabilities
);
}
else
{
samples
->
mutable_data
<
int64_t
>
(
context
.
GetPlace
());
probabilities
->
mutable_data
<
T
>
(
samples_dim
,
context
.
GetPlace
());
// UNDERSTAND: sampling
const
auto
seed
=
context
.
Attr
<
int
>
(
"seed"
);
auto
sampler_with_prob
=
math
::
SampleWithProb
<
platform
::
CPUDeviceContext
,
T
>
();
sampler_with_prob
(
dev_ctx
,
math
::
LogUniformSampler
(
num_classes
,
seed
),
num_samples
,
labels
,
samples
,
probabilities
);
}
// UNDERSTAND: gather sampled logits and remove accidental hits if needed
CPUTakeAlongD1
<
T
>
(
dev_ctx
,
*
logits
,
*
samples
,
sampled_logits
);
if
(
remove_accidental_hits
)
{
compute_remove_accidental_hits
<
T
>
(
dev_ctx
,
sampled_logits
,
*
samples
,
num_true
);
}
// subtracted sampled logits with logQ(y|x)
auto
probs
=
EigenMatrix
<
T
>::
From
(
*
probabilities
);
auto
smp_logits
=
EigenMatrix
<
T
>::
From
(
*
sampled_logits
);
smp_logits
.
device
(
*
dev_ctx
.
eigen_device
())
=
(
smp_logits
-
probs
.
log
().
unaryExpr
(
TolerableValue
<
T
>
()))
.
unaryExpr
(
TolerableValue
<
T
>
());
}
};
template
<
typename
T
>
class
SampleLogitsGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
using
Tensor
=
framework
::
Tensor
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
logits_grad
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Logits"
));
const
Tensor
*
samples
=
context
.
Input
<
Tensor
>
(
"Samples"
);
const
Tensor
*
sampled_logits_grad
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"SampledLogits"
));
logits_grad
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
&
dev_ctx
=
context
.
template
device_context
<
platform
::
CPUDeviceContext
>();
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
T
>
set_zero
;
set_zero
(
dev_ctx
,
logits_grad
,
static_cast
<
T
>
(
0
));
// UNDERSTAND: scatter it back to logit_grad
CPUPutAlongD1
<
T
>
(
dev_ctx
,
logits_grad
,
*
samples
,
*
sampled_logits_grad
);
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/platform/CMakeLists.txt
浏览文件 @
a8c4324d
...
...
@@ -87,11 +87,11 @@ nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
cc_library
(
timer SRCS timer.cc
)
cc_test
(
timer_test SRCS timer_test.cc DEPS timer
)
cc_library
(
device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto
device_context
${
GPU_CTX_DEPS
}
)
cc_library
(
device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto
${
GPU_CTX_DEPS
}
)
if
(
WITH_GPU
)
nv_library
(
profiler SRCS profiler.cc profiler.cu DEPS device_
context device_tracer
)
nv_library
(
profiler SRCS profiler.cc profiler.cu DEPS device_
tracer gpu_info enforce
)
else
()
cc_library
(
profiler SRCS profiler.cc DEPS device_
context device_tracer
)
cc_library
(
profiler SRCS profiler.cc DEPS device_
tracer enforce
)
endif
()
cc_test
(
profiler_test SRCS profiler_test.cc DEPS profiler
)
...
...
paddle/fluid/platform/device_context.cc
浏览文件 @
a8c4324d
...
...
@@ -394,7 +394,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
int
tid
=
platform
::
get_cur_thread_id
();
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
p_mutex_
.
get
()
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
p_mutex_
);
// Find KeyBlob for current thread
auto
map_it
=
pMap
->
find
(
tid
);
...
...
@@ -427,7 +427,7 @@ std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
int
tid
=
platform
::
get_cur_thread_id
();
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
p_mutex_
.
get
()
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
p_mutex_
);
// Find KeyBlob for current thread firstly
auto
map_it
=
pMap
->
find
(
tid
);
...
...
paddle/fluid/platform/device_tracer.cc
浏览文件 @
a8c4324d
...
...
@@ -136,7 +136,7 @@ void EnableActivity() {
CUPTI_CALL
(
dynload
::
cuptiActivityEnable
(
CUPTI_ACTIVITY_KIND_DRIVER
));
CUPTI_CALL
(
dynload
::
cuptiActivityEnable
(
CUPTI_ACTIVITY_KIND_RUNTIME
));
// We don't track these activities for now.
//
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL
(
dynload
::
cuptiActivityEnable
(
CUPTI_ACTIVITY_KIND_MEMSET
));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
...
...
@@ -155,7 +155,7 @@ void DisableActivity() {
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
CUPTI_CALL
(
dynload
::
cuptiActivityDisable
(
CUPTI_ACTIVITY_KIND_DRIVER
));
CUPTI_CALL
(
dynload
::
cuptiActivityDisable
(
CUPTI_ACTIVITY_KIND_RUNTIME
));
//
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL
(
dynload
::
cuptiActivityDisable
(
CUPTI_ACTIVITY_KIND_MEMSET
));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
...
...
@@ -212,6 +212,14 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
memcpy
->
correlationId
,
memcpy
->
bytes
);
break
;
}
case
CUPTI_ACTIVITY_KIND_MEMSET
:
{
auto
*
memset
=
reinterpret_cast
<
const
CUpti_ActivityMemset
*>
(
record
);
tracer
->
AddKernelRecords
(
"MEMSET"
,
memset
->
start
,
memset
->
end
,
memset
->
deviceId
,
memset
->
streamId
,
memset
->
correlationId
);
break
;
}
case
CUPTI_ACTIVITY_KIND_DRIVER
:
{
auto
*
api
=
reinterpret_cast
<
const
CUpti_ActivityAPI
*>
(
record
);
if
(
api
->
start
!=
0
&&
api
->
end
!=
0
)
...
...
@@ -348,6 +356,8 @@ class DeviceTracerImpl : public DeviceTracer {
const
std
::
vector
<
int
>
cbids
{
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020
,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000
#if CUDA_VERSION >= 9000
...
...
paddle/fluid/platform/device_tracer.h
浏览文件 @
a8c4324d
...
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h"
...
...
@@ -32,8 +33,6 @@ inline uint64_t PosixInNsec() {
return
1000
*
(
static_cast
<
uint64_t
>
(
tv
.
tv_sec
)
*
1000000
+
tv
.
tv_usec
);
}
class
Event
;
// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc.
...
...
paddle/fluid/platform/enforce.h
浏览文件 @
a8c4324d
...
...
@@ -34,6 +34,7 @@ limitations under the License. */
#include <type_traits>
#include <utility>
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "glog/logging.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/port.h"
...
...
paddle/fluid/platform/event.h
0 → 100644
浏览文件 @
a8c4324d
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
namespace
paddle
{
namespace
platform
{
enum
EventType
{
kMark
,
kPushRange
,
kPopRange
};
class
Event
{
public:
// The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event
(
EventType
type
,
std
::
string
name
,
uint32_t
thread_id
);
const
EventType
&
type
()
const
;
std
::
string
name
()
const
{
return
name_
;
}
uint32_t
thread_id
()
const
{
return
thread_id_
;
}
#ifdef PADDLE_WITH_CUDA
#ifndef PADDLE_WITH_CUPTI
cudaEvent_t
event
()
const
{
return
event_
;
}
int
device
()
const
{
return
device_
;
}
#endif
#endif
double
CpuElapsedMs
(
const
Event
&
e
)
const
;
double
CudaElapsedMs
(
const
Event
&
e
)
const
;
private:
EventType
type_
;
std
::
string
name_
;
uint32_t
thread_id_
;
int64_t
cpu_ns_
;
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUPTI
int64_t
gpu_ns_
=
0
;
public:
void
AddCudaElapsedTime
(
int64_t
start_ns
,
int64_t
end_ns
)
{
gpu_ns_
+=
end_ns
-
start_ns
;
}
private:
#else
cudaEvent_t
event_
=
nullptr
;
int
device_
=
-
1
;
#endif
#endif
};
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/mkldnn_reuse.h
浏览文件 @
a8c4324d
...
...
@@ -39,6 +39,45 @@ class MKLDNNHandler {
return
this
->
AcquireMemory
(
md
,
ptr
,
"@user_src_mem_p"
);
}
// TODO(jczaja): extract common part and make AcquireMemory
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireSrcMemory
(
const
mkldnn
::
memory
::
primitive_desc
&
mpd
,
void
*
ptr
)
{
auto
local_key
=
key_
+
"@user_src_mem_p"
;
auto
mem_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx_
.
GetBlob
(
local_key
));
PADDLE_ENFORCE
((
mem_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
" find mem primitive in device context"
);
if
(
mem_p
==
nullptr
)
{
mem_p
=
std
::
make_shared
<
mkldnn
::
memory
>
(
mpd
,
ptr
);
dev_ctx_
.
SetBlob
(
local_key
,
mem_p
);
}
else
{
mem_p
->
set_data_handle
(
ptr
);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_
=
true
;
}
return
mem_p
;
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemory
(
const
mkldnn
::
memory
::
primitive_desc
&
mpd
,
void
*
ptr
)
{
auto
local_key
=
key_
+
"@user_weights_mem_p"
;
auto
mem_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx_
.
GetBlob
(
local_key
));
PADDLE_ENFORCE
((
mem_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
" find mem primitive in device context"
);
if
(
mem_p
==
nullptr
)
{
mem_p
=
std
::
make_shared
<
mkldnn
::
memory
>
(
mpd
,
ptr
);
dev_ctx_
.
SetBlob
(
local_key
,
mem_p
);
}
else
{
mem_p
->
set_data_handle
(
ptr
);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_
=
true
;
}
return
mem_p
;
}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireWeightsMemory
(
const
mkldnn
::
memory
::
desc
&
md
,
void
*
ptr
,
user_function
custom_func
=
{})
{
...
...
@@ -273,37 +312,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
mkldnn
::
engine
engine
,
const
std
::
string
&
base_key
)
:
platform
::
MKLDNNHandler
(
dev_ctx
,
engine
,
base_key
),
dims_
(
dims
),
axis_
(
axis
),
logical_axis_
(
dims
.
size
(),
0
)
{}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireSrcMemory
(
const
mkldnn
::
memory
::
format
&
fmt
,
void
*
ptr
)
{
auto
local_key
=
key_
+
"@user_src_mem_p"
;
auto
mem_p
=
std
::
static_pointer_cast
<
mkldnn
::
memory
>
(
dev_ctx_
.
GetBlob
(
local_key
));
PADDLE_ENFORCE
((
mem_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
" find mem primitive in device context"
);
if
(
mem_p
==
nullptr
)
{
// Make memory descriptor using input format, unless it
// cannot be trusted (nchw) then make up memory fmt manually
for
(
size_t
i
=
0
;
i
<
logical_axis_
.
size
();
++
i
)
{
logical_axis_
[
i
]
=
i
;
}
auto
src_md
=
fmt
!=
mkldnn
::
memory
::
format
::
nchw
?
platform
::
MKLDNNMemDesc
(
dims_
,
platform
::
MKLDNNGetDataType
<
float
>
(),
fmt
)
:
Axis2MemoryDesc
(
dims_
,
logical_axis_
);
mem_p
=
std
::
make_shared
<
mkldnn
::
memory
>
(
mkldnn
::
memory
::
primitive_desc
{
src_md
,
engine_
},
ptr
);
dev_ctx_
.
SetBlob
(
local_key
,
mem_p
);
}
else
{
mem_p
->
set_data_handle
(
ptr
);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_
=
true
;
}
return
mem_p
;
}
axis_
(
axis
)
{}
std
::
shared_ptr
<
mkldnn
::
memory
>
AcquireDstMemory
(
framework
::
Tensor
*
output
,
platform
::
Place
place
)
{
...
...
@@ -388,7 +397,6 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
private:
std
::
vector
<
int
>
dims_
;
std
::
vector
<
int
>
axis_
;
std
::
vector
<
int
>
logical_axis_
;
};
template
<
class
forward_t
,
class
backward_data_t
,
class
backward_weights_t
>
...
...
@@ -548,9 +556,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
PADDLE_ENFORCE
((
conv_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find convolution primitive in device context"
);
if
(
conv_p
==
nullptr
)
{
conv_p
=
std
::
make_shared
<
forward_t
>
(
*
conv_pd_
,
*
(
src_memory_p
),
*
(
weights_memory_p
.
get
()),
*
(
dst_memory_p
.
get
()));
conv_p
=
std
::
make_shared
<
forward_t
>
(
*
conv_pd_
,
*
src_memory_p
,
*
weights_memory_p
,
*
dst_memory_p
);
dev_ctx_
.
SetBlob
(
prim_key
,
conv_p
);
}
else
{
...
...
@@ -570,9 +577,9 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
PADDLE_ENFORCE
((
conv_p
!=
nullptr
)
||
(
is_reusing_
==
false
),
"Fail to find convolution primitive in device context"
);
if
(
conv_p
==
nullptr
)
{
conv_p
=
std
::
make_shared
<
forward_t
>
(
*
conv_pd_
,
*
(
src_memory_p
),
*
(
weights_memory_p
.
get
())
,
*
(
bias_memory_p
.
get
()),
*
(
dst_memory_p
.
get
())
);
conv_p
=
std
::
make_shared
<
forward_t
>
(
*
conv_pd_
,
*
src_memory_p
,
*
weights_memory_p
,
*
bias_memory_p
,
*
dst_memory_p
);
dev_ctx_
.
SetBlob
(
prim_key
,
conv_p
);
}
else
{
...
...
paddle/fluid/platform/mkldnn_utils.h
0 → 100644
浏览文件 @
a8c4324d
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mkldnn.h>
#include <string>
namespace
paddle
{
namespace
platform
{
inline
mkldnn
::
memory
::
primitive_desc
create_prim_desc_from_dims
(
const
std
::
vector
<
int
>&
ltz
,
mkldnn
::
memory
::
format
fmt
,
mkldnn
::
memory
::
data_type
data_type
=
mkldnn
::
memory
::
data_type
::
f32
)
{
mkldnn_memory_desc_t
mem_fmt
;
mem_fmt
.
primitive_kind
=
mkldnn_memory
;
mem_fmt
.
ndims
=
ltz
.
size
();
for
(
unsigned
int
i
=
0
;
i
<
ltz
.
size
();
++
i
)
{
mem_fmt
.
dims
[
i
]
=
ltz
[
i
];
// logical dimensions (nchw format,
// regardless physical layout)
}
mem_fmt
.
data_type
=
static_cast
<
mkldnn_data_type_t
>
(
data_type
);
mem_fmt
.
format
=
static_cast
<
mkldnn_memory_format_t
>
(
fmt
);
unsigned
int
total_stride
=
1
;
for
(
int
i
=
ltz
.
size
()
-
1
;
i
>=
0
;
--
i
)
{
mem_fmt
.
layout_desc
.
blocking
.
padding_dims
[
i
]
=
ltz
[
i
];
// logical dimensions (nchw format, regardless physical
// layout)
mem_fmt
.
layout_desc
.
blocking
.
block_dims
[
i
]
=
1
;
mem_fmt
.
layout_desc
.
blocking
.
offset_padding_to_data
[
i
]
=
0
;
// no offset
mem_fmt
.
layout_desc
.
blocking
.
strides
[
0
][
i
]
=
total_stride
;
mem_fmt
.
layout_desc
.
blocking
.
strides
[
1
][
i
]
=
1
;
total_stride
*=
ltz
[
i
];
}
mem_fmt
.
layout_desc
.
blocking
.
offset_padding
=
0
;
// no initial offset
auto
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
place
=
paddle
::
platform
::
CPUPlace
();
auto
*
dev_ctx
=
dynamic_cast
<
platform
::
MKLDNNDeviceContext
*>
(
pool
.
Get
(
place
));
auto
&
cpu_engine
=
dev_ctx
->
GetEngine
();
return
mkldnn
::
memory
::
primitive_desc
(
mem_fmt
,
cpu_engine
);
}
inline
mkldnn
::
memory
::
primitive_desc
create_prim_desc_from_format
(
const
std
::
vector
<
int
>&
ltz
,
const
mkldnn
::
memory
::
format
format
,
const
mkldnn
::
memory
::
data_type
data_type
)
{
auto
md
=
mkldnn
::
memory
::
desc
({
ltz
},
data_type
,
format
);
auto
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
place
=
paddle
::
platform
::
CPUPlace
();
auto
dev_ctx
=
dynamic_cast
<
platform
::
MKLDNNDeviceContext
*>
(
pool
.
Get
(
place
));
PADDLE_ENFORCE_NOT_NULL
(
dev_ctx
,
"Could not get valid device"
);
auto
&
cpu_engine
=
dev_ctx
->
GetEngine
();
return
mkldnn
::
memory
::
primitive_desc
(
md
,
cpu_engine
);
}
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/profiler.cu
浏览文件 @
a8c4324d
...
...
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <cuda.h>
#include "paddle/fluid/platform/profiler.h"
namespace
paddle
{
namespace
platform
{
...
...
@@ -22,26 +21,27 @@ namespace platform {
__global__
void
DummyKernel
(
int
*
a
)
{
a
[
0
]
=
0
;
}
static
void
ForEachDevice
(
std
::
function
<
void
(
int
)
>
func
)
{
auto
original_device
=
GetCurrentDeviceId
();
int
count
=
GetCUDADeviceCount
();
auto
original_device
=
platform
::
GetCurrentDeviceId
();
int
count
=
platform
::
GetCUDADeviceCount
();
for
(
int
i
=
0
;
i
<
count
;
i
++
)
{
SetDeviceId
(
i
);
platform
::
SetDeviceId
(
i
);
func
(
i
);
}
SetDeviceId
(
original_device
);
platform
::
SetDeviceId
(
original_device
);
}
void
DummyKernelAndEvent
()
{
for
(
int
i
=
0
;
i
<
5
;
i
++
)
{
ForEachDevice
([](
int
d
)
{
CUDADeviceContext
*
dev_ctx
=
new
CUDADeviceContext
(
CUDAPlace
(
d
));
platform
::
SetDeviceId
(
d
);
cudaStream_t
stream
;
PADDLE_ENFORCE
(
cudaStreamCreate
(
&
stream
));
Mark
(
"_cuda_startup_"
);
int
*
ptr
;
PADDLE_ENFORCE
(
cudaMalloc
(
&
ptr
,
sizeof
(
int
)));
DummyKernel
<<<
1
,
1
,
0
,
dev_ctx
->
stream
()
>>>
(
ptr
);
dev_ctx
->
Wait
(
);
DummyKernel
<<<
1
,
1
,
0
,
stream
>>>
(
ptr
);
PADDLE_ENFORCE
(
cudaStreamSynchronize
(
stream
)
);
PADDLE_ENFORCE
(
cudaFree
(
ptr
));
delete
dev_ctx
;
});
}
}
...
...
paddle/fluid/platform/profiler.h
浏览文件 @
a8c4324d
...
...
@@ -17,54 +17,13 @@ limitations under the License. */
#include <list>
#include <string>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
namespace
paddle
{
namespace
platform
{
enum
EventType
{
kMark
,
kPushRange
,
kPopRange
};
class
Event
{
public:
// The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event
(
EventType
type
,
std
::
string
name
,
uint32_t
thread_id
);
const
EventType
&
type
()
const
;
std
::
string
name
()
const
{
return
name_
;
}
uint32_t
thread_id
()
const
{
return
thread_id_
;
}
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/event.h"
#ifdef PADDLE_WITH_CUDA
#ifndef PADDLE_WITH_CUPTI
cudaEvent_t
event
()
const
{
return
event_
;
}
int
device
()
const
{
return
device_
;
}
#endif
#include "paddle/fluid/platform/gpu_info.h"
#endif
double
CpuElapsedMs
(
const
Event
&
e
)
const
;
double
CudaElapsedMs
(
const
Event
&
e
)
const
;
private:
EventType
type_
;
std
::
string
name_
;
uint32_t
thread_id_
;
int64_t
cpu_ns_
;
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUPTI
int64_t
gpu_ns_
=
0
;
public:
void
AddCudaElapsedTime
(
int64_t
start_ns
,
int64_t
end_ns
)
{
gpu_ns_
+=
end_ns
-
start_ns
;
}
private:
#else
cudaEvent_t
event_
=
nullptr
;
int
device_
=
-
1
;
#endif
#endif
};
namespace
paddle
{
namespace
platform
{
enum
ProfilerState
{
kDisabled
,
// disabled state
...
...
paddle/fluid/platform/profiler_test.cc
浏览文件 @
a8c4324d
...
...
@@ -33,7 +33,6 @@ TEST(Event, CpuElapsedTime) {
}
TEST
(
RecordEvent
,
RecordEvent
)
{
using
paddle
::
platform
::
DeviceContext
;
using
paddle
::
platform
::
Event
;
using
paddle
::
platform
::
EventType
;
using
paddle
::
platform
::
RecordEvent
;
...
...
paddle/fluid/platform/temporary_allocator_test.cc
浏览文件 @
a8c4324d
...
...
@@ -141,7 +141,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CPUDeviceContext
*>
(
pool
.
Get
(
cpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
,
nullptr
);
int
numel
=
memory_size
/
sizeof
(
float
);
framework
::
Tensor
tensor
=
...
...
@@ -156,7 +156,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
gpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
,
nullptr
);
int
numel
=
memory_size
/
sizeof
(
float
);
framework
::
Tensor
tensor
=
ctx
.
AllocateTmpTensor
<
float
,
platform
::
CUDADeviceContext
>
(
...
...
@@ -179,7 +179,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CPUDeviceContext
*>
(
pool
.
Get
(
cpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
,
nullptr
);
int
numel
=
memory_size
/
sizeof
(
float
);
framework
::
Tensor
out_side_tensor
;
...
...
@@ -200,7 +200,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
*
dev_ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
pool
.
Get
(
gpu_place
));
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
);
framework
::
ExecutionContext
ctx
(
op
,
scope
,
*
dev_ctx
,
run_ctx
,
nullptr
);
size_t
memory_size
=
500
;
int
numel
=
memory_size
/
sizeof
(
float
);
...
...
paddle/fluid/train/demo/demo_trainer.cc
浏览文件 @
a8c4324d
...
...
@@ -73,7 +73,7 @@ int main() {
PADDLE_ENFORCE_NE
(
loss_name
,
""
,
"loss not found"
);
// init all parameters
executor
.
Run
(
*
startup_program
.
get
()
,
&
scope
,
0
);
executor
.
Run
(
*
startup_program
,
&
scope
,
0
);
// prepare data
auto
x_var
=
scope
.
Var
(
"x"
);
...
...
@@ -101,7 +101,7 @@ int main() {
clock_t
t1
=
clock
();
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
executor
.
Run
(
*
train_program
.
get
()
,
&
scope
,
0
,
false
,
true
);
executor
.
Run
(
*
train_program
,
&
scope
,
0
,
false
,
true
);
std
::
cout
<<
"step: "
<<
i
<<
" loss: "
<<
loss_var
->
Get
<
paddle
::
framework
::
LoDTensor
>
().
data
<
float
>
()[
0
]
<<
std
::
endl
;
...
...
paddle/fluid/train/test_train_recognize_digits.cc
浏览文件 @
a8c4324d
...
...
@@ -74,7 +74,7 @@ void Train() {
float
first_loss
=
0.0
;
float
last_loss
=
0.0
;
for
(
int
i
=
0
;
i
<
100
;
++
i
)
{
executor
.
Run
(
*
train_program
.
get
()
,
&
scope
,
0
,
false
,
true
);
executor
.
Run
(
*
train_program
,
&
scope
,
0
,
false
,
true
);
if
(
i
==
0
)
{
first_loss
=
loss_var
->
Get
<
framework
::
LoDTensor
>
().
data
<
float
>
()[
0
];
}
else
if
(
i
==
99
)
{
...
...
python/paddle/fluid/compiler.py
浏览文件 @
a8c4324d
...
...
@@ -20,6 +20,7 @@ from .. import compat as cpt
from
.framework
import
cuda_places
,
cpu_places
from
.
import
core
from
.
import
framework
__all__
=
[
'CompiledProgram'
,
'ExecutionStrategy'
,
'BuildStrategy'
]
...
...
@@ -124,6 +125,8 @@ class CompiledProgram(object):
self
.
_places
=
[
_place_obj
(
p
)
for
p
in
places
]
else
:
self
.
_places
=
None
self
.
_build_strategy
.
is_distribution
=
framework
.
is_pserver_mode
(
self
.
_program
)
return
self
def
with_inference_optimize
(
self
,
config
):
...
...
python/paddle/fluid/framework.py
浏览文件 @
a8c4324d
...
...
@@ -123,6 +123,15 @@ def cuda_pinned_places(device_count=None):
return
[
core
.
cuda_pinned_places
()]
*
device_count
def
is_pserver_mode
(
main_program
):
main
=
main_program
if
main_program
\
else
default_main_program
()
for
op
in
main
.
global_block
().
ops
:
if
op
.
type
in
[
"send"
,
"recv"
]:
return
True
return
False
class
NameScope
(
object
):
def
__init__
(
self
,
name
=
""
,
parent
=
None
):
self
.
_children
=
dict
()
...
...
@@ -759,7 +768,6 @@ class Operator(object):
self
.
_update_desc_attr
(
attr_name
,
attr_val
)
self
.
desc
.
check_attrs
()
if
self
.
_has_kernel
(
type
):
self
.
desc
.
infer_var_type
(
self
.
block
.
desc
)
self
.
desc
.
infer_shape
(
self
.
block
.
desc
)
...
...
python/paddle/fluid/layers/detection.py
浏览文件 @
a8c4324d
...
...
@@ -551,9 +551,10 @@ def yolov3_loss(x,
gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
anchors = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors,
ignore_thresh=0.5, downsample_ratio=32)
anchor_mask = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors,
anchor_mask=anchor_mask, class_num=80,
ignore_thresh=0.7, downsample_ratio=32)
"""
helper
=
LayerHelper
(
'yolov3_loss'
,
**
locals
())
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
a8c4324d
...
...
@@ -87,6 +87,7 @@ __all__ = [
'transpose'
,
'im2sequence'
,
'nce'
,
'sampled_softmax_with_cross_entropy'
,
'hsigmoid'
,
'beam_search'
,
'row_conv'
,
...
...
@@ -2472,7 +2473,7 @@ def pool2d(input,
data = fluid.layers.data(
name='data', shape=[3, 32, 32], dtype='float32')
conv
2d = fluid.layers.pool2d(
pool
2d = fluid.layers.pool2d(
input=data,
pool_size=2,
pool_type='max',
...
...
@@ -2521,6 +2522,7 @@ def pool2d(input,
return
pool_out
@
templatedoc
()
def
pool3d
(
input
,
pool_size
=-
1
,
pool_type
=
"max"
,
...
...
@@ -2532,13 +2534,19 @@ def pool3d(input,
name
=
None
,
exclusive
=
True
):
"""
This function adds the operator for pooling in 3-dimensions, using the
pooling configurations mentioned in input parameters.
${comment}
Args:
input (Variable): ${input_comment}
pool_size (int): ${ksize_comment}
pool_type (str): ${pooling_type_comment}
input (Variable): The input tensor of pooling operator. The format of
input tensor is NCDHW, where N is batch size, C is
the number of channels, D is the depth of the feature,
H is the height of the feature, and W is the width
of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size
is a tuple or list, it must contain three integers,
(pool_size_Depth, pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
pool_type (string): ${pooling_type_comment}
pool_stride (int): stride of the pooling layer.
pool_padding (int): padding size.
global_pooling (bool): ${global_pooling_comment}
...
...
@@ -2551,6 +2559,19 @@ def pool3d(input,
Returns:
Variable: output of pool3d layer.
Examples:
.. code-block:: python
data = fluid.layers.data(
name='data', shape=[3, 32, 32, 32], dtype='float32')
pool3d = fluid.layers.pool3d(
input=data,
pool_size=2,
pool_type='max',
pool_stride=1,
global_pooling=False)
"""
if
pool_type
not
in
[
"max"
,
"avg"
]:
raise
ValueError
(
...
...
@@ -2600,7 +2621,27 @@ def adaptive_pool2d(input,
require_index
=
False
,
name
=
None
):
"""
${comment}
**Adaptive Pool2d Operator**
The adaptive_pool2d operation calculates the output based on the input, pool_size,
pool_type parameters. Input(X) and output(Out) are in NCHW format, where N is batch
size, C is the number of channels, H is the height of the feature, and W is
the width of the feature. Parameters(pool_size) should contain two elements which
represent height and width, respectively. Also the H and W dimensions of output(Out)
is same as Parameter(pool_size).
For average adaptive pool2d:
.. math::
hstart &= floor(i * H_{in} / H_{out})
hend &= ceil((i + 1) * H_{in} / H_{out})
wstart &= floor(j * W_{in} / W_{out})
wend &= ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) &=
\\
frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
Args:
input (Variable): The input tensor of pooling operator. The format of
...
...
@@ -2610,8 +2651,8 @@ def adaptive_pool2d(input,
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width).
pool_type: ${pooling_type_comment}
require_index (bool): If true, the index of max pooling point
along with outputs.
i
t cannot be set in average pooling type.
require_index (bool): If true, the index of max pooling point
will be returned along
with outputs. I
t cannot be set in average pooling type.
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
...
...
@@ -2692,18 +2733,42 @@ def adaptive_pool3d(input,
require_index
=
False
,
name
=
None
):
"""
${comment}
**Adaptive Pool3d Operator**
The adaptive_pool3d operation calculates the output based on the input, pool_size,
pool_type parameters. Input(X) and output(Out) are in NCDHW format, where N is batch
size, C is the number of channels, D is the depth of the feature, H is the height of
the feature, and W is the width of the feature. Parameters(pool_size) should contain
three elements which represent height and width, respectively. Also the D, H and W
dimensions of output(Out) is same as Parameter(pool_size).
For average adaptive pool3d:
.. math::
dstart &= floor(i * D_{in} / D_{out})
dend &= ceil((i + 1) * D_{in} / D_{out})
hstart &= floor(j * H_{in} / H_{out})
hend &= ceil((j + 1) * H_{in} / H_{out})
wstart &= floor(k * W_{in} / W_{out})
wend &= ceil((k + 1) * W_{in} / W_{out})
Output(i ,j, k) &=
\\
frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
Args:
input (Variable): The input tensor of pooling operator. The format of
input tensor is NCHW, where N is batch size, C is
the number of channels,
H is the height of the
feature, and W is the width of the feature.
input tensor is NC
D
HW, where N is batch size, C is
the number of channels,
D is the depth of the feature,
H is the height of the
feature, and W is the width of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain t
wo
integers, (Depth, Height, Width).
it must contain t
hree
integers, (Depth, Height, Width).
pool_type: ${pooling_type_comment}
require_index (bool): If true, the index of max pooling point
along with outputs.
i
t cannot be set in average pooling type.
require_index (bool): If true, the index of max pooling point
will be returned along
with outputs. I
t cannot be set in average pooling type.
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
...
...
@@ -2740,7 +2805,7 @@ def adaptive_pool3d(input,
name='data', shape=[3, 32, 32], dtype='float32')
pool_out, mask = fluid.layers.adaptive_pool3d(
input=data,
pool_size=[3, 3],
pool_size=[3, 3
, 3
],
pool_type='avg')
"""
if
pool_type
not
in
[
"max"
,
"avg"
]:
...
...
@@ -5796,6 +5861,132 @@ def softmax_with_cross_entropy(logits,
return
loss
def
sampled_softmax_with_cross_entropy
(
logits
,
label
,
num_samples
,
num_true
=
1
,
remove_accidental_hits
=
True
,
use_customized_samples
=
False
,
customized_samples
=
None
,
customized_probabilities
=
None
,
seed
=
0
):
"""
**Sampled Softmax With Cross Entropy Operator.**
Cross entropy loss with sampled softmax is used as the output layer for
larger output classes extensively. This operator samples a number of samples
for all examples, and computes the softmax normalized values for each
row of the sampled tensor, after which cross-entropy loss is computed.
Because this operator performs a softmax on logits internally, it expects
unscaled logits. This operator should not be used with the output of
softmax operator since that would produce incorrect results.
For examples with T true labels (T >= 1), we assume that each true label has
a probability of 1/T. For each sample, S samples are generated using a
log uniform distribution. True labels are concatenated with these samples to
form T + S samples for each example. So, assume the shape of logits is
[N x K], the shape for samples is [N x (T+S)]. For each sampled label, a
probability is calculated, which corresponds to the Q(y|x) in
[Jean et al., 2014](http://arxiv.org/abs/1412.2007).
Logits are sampled according to the sampled labels. Then if
remove_accidental_hits is True, if a sample[i, j] accidentally hits true
labels, then the corresponding sampled_logits[i, j] is minus by 1e20 to
make its softmax result close to zero. Then sampled logits are subtracted by
logQ(y|x), these sampled logits and re-indexed labels are used to compute
a softmax with cross entropy.
Args:
logits (Variable): The unscaled log probabilities, which is a 2-D tensor
with shape [N x K]. N is the batch_size, and K is the class number.
label (Variable): The ground truth which is a 2-D tensor. Label is a
Tensor<int64> with shape [N x T], where T is the number of true
labels per example.
num_samples (int): The number for each example, num_samples should be
less than the number of class.
num_true(int): The number of target classes per training example.
remove_accidental_hits (bool): A flag indicating whether to remove
accidental hits when sampling. If True and if a sample[i, j]
accidentally hits true labels, then the corresponding
sampled_logits[i, j] is minus by 1e20 to make its softmax result
close to zero. Default is True.
use_customized_samples (bool): Whether to use custom samples and probabities to sample
logits.
customized_samples (Variable): User defined samples, which is a 2-D tensor
with shape [N, T + S]. S is the num_samples, and T is the number of true
labels per example.
customized_probabilities (Variable): User defined probabilities of samples,
a 2-D tensor which has the same shape with customized_samples.
seed (int): The random seed for generating random number, which is used
in the process of sampling. Default is 0.
Returns:
Variable: Return the cross entropy loss which is a 2-D tensor with shape
[N x 1].
Examples:
.. code-block:: python
logits = fluid.layers.data(name='data', shape=[256], dtype='float32')
label = fluid.layers.data(name='label', shape=[5], dtype='int64')
fc = fluid.layers.fc(input=data, size=100)
out = fluid.layers.sampled_softmax_with_cross_entropy(
logits=fc, label=label, num_samples=25)
"""
helper
=
LayerHelper
(
'sample_logits'
,
**
locals
())
samples
=
helper
.
create_variable_for_type_inference
(
dtype
=
'int64'
)
probabilities
=
helper
.
create_variable_for_type_inference
(
dtype
=
logits
.
dtype
)
sampled_logits
\
=
helper
.
create_variable_for_type_inference
(
dtype
=
logits
.
dtype
)
sampled_label
=
helper
.
create_variable_for_type_inference
(
dtype
=
'int64'
)
sampled_softlabel
=
helper
.
create_variable_for_type_inference
(
dtype
=
logits
.
dtype
)
helper
.
append_op
(
type
=
'sample_logits'
,
inputs
=
{
'Logits'
:
logits
,
'Labels'
:
label
,
'CustomizedSamples'
:
customized_samples
,
'CustomizedProbabilities'
:
customized_probabilities
},
outputs
=
{
'Samples'
:
samples
,
'Probabilities'
:
probabilities
,
'SampledLabels'
:
sampled_label
,
'SampledLogits'
:
sampled_logits
},
attrs
=
{
'use_customized_samples'
:
use_customized_samples
,
'uniq'
:
True
,
'remove_accidental_hits'
:
remove_accidental_hits
,
'num_samples'
:
num_samples
,
'seed'
:
seed
})
loss
=
helper
.
create_variable_for_type_inference
(
dtype
=
logits
.
dtype
)
softmax
=
helper
.
create_variable_for_type_inference
(
dtype
=
logits
.
dtype
)
helper
.
append_op
(
type
=
'one_hot'
,
inputs
=
{
'X'
:
sampled_label
},
attrs
=
{
'depth'
:
num_samples
+
1
},
outputs
=
{
'Out'
:
sampled_softlabel
})
helper
.
append_op
(
type
=
'softmax_with_cross_entropy'
,
inputs
=
{
'Logits'
:
sampled_logits
,
'Label'
:
sampled_softlabel
},
outputs
=
{
'Softmax'
:
softmax
,
'Loss'
:
loss
},
attrs
=
{
'soft_label'
:
True
,
'ignore_index'
:
False
,
'numeric_stable_mode'
:
False
})
return
loss
/
num_true
def
smooth_l1
(
x
,
y
,
inside_weight
=
None
,
outside_weight
=
None
,
sigma
=
None
):
"""
This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`.
...
...
python/paddle/fluid/layers/ops.py
浏览文件 @
a8c4324d
...
...
@@ -60,7 +60,28 @@ __all__ += ["uniform_random"]
_uniform_random_
=
generate_layer_fn
(
'uniform_random'
)
def
uniform_random
(
shape
,
dtype
=
None
,
min
=
None
,
max
=
None
,
seed
=
None
):
def
uniform_random
(
shape
,
dtype
=
'float32'
,
min
=-
1.0
,
max
=
1.0
,
seed
=
0
):
"""
This operator initializes a variable with random values sampled from a
uniform distribution. The random result is in set [min, max].
Args:
shape (list): The shape of output variable.
dtype(np.dtype|core.VarDesc.VarType|str): The type of data, such as
float32, float64 etc. Default: float32.
min (float): Minimum value of uniform random. Default -1.0.
max (float): Maximun value of uniform random. Default 1.0.
seed (int): Random seed used for generating samples. 0 means use a
seed generated by the system. Note that if seed is not 0, this
operator will always generate the same random numbers every time.
Default 0.
Examples:
.. code-block:: python
result = fluid.layers.uniform_random(shape=[32, 784])
"""
locals_var
=
locals
().
keys
()
if
not
isinstance
(
dtype
,
core
.
VarDesc
.
VarType
):
dtype
=
convert_np_dtype_to_dtype_
(
dtype
)
...
...
@@ -72,12 +93,6 @@ def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
return
_uniform_random_
(
**
kwargs
)
uniform_random
.
__doc__
=
_uniform_random_
.
__doc__
+
"""
Examples:
>>> result = fluid.layers.uniform_random(shape=[32, 784])
"""
__all__
+=
[
'hard_shrink'
]
_hard_shrink_
=
generate_layer_fn
(
'hard_shrink'
)
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
a8c4324d
...
...
@@ -29,15 +29,6 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
BuildStrategy
=
core
.
ParallelExecutor
.
BuildStrategy
def
_is_pserver_mode
(
main_program
):
main
=
main_program
if
main_program
\
else
framework
.
default_main_program
()
for
op
in
main
.
global_block
().
ops
:
if
op
.
type
in
[
"send"
,
"recv"
]:
return
True
return
False
class
ParallelExecutor
(
object
):
"""
ParallelExecutor is designed for data parallelism, which focuses on distributing
...
...
@@ -140,7 +131,7 @@ class ParallelExecutor(object):
# FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
# num_trainers is 1, so the current fields of build_strategy doesn't tell if
# it's distributed model.
build_strategy
.
is_distribution
=
_
is_pserver_mode
(
build_strategy
.
is_distribution
=
framework
.
is_pserver_mode
(
main_program
)
or
num_trainers
>
1
# step4: get main_program, scope, local_scopes
...
...
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
a8c4324d
...
...
@@ -374,6 +374,17 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
output
)
print
(
str
(
program
))
def
test_sampled_softmax_with_cross_entropy
(
self
):
program
=
Program
()
with
program_guard
(
program
):
logits
=
layers
.
data
(
name
=
'Logits'
,
shape
=
[
256
],
dtype
=
'float64'
)
label
=
layers
.
data
(
name
=
'Label'
,
shape
=
[
1
],
dtype
=
'int64'
)
num_samples
=
25
output
=
layers
.
sampled_softmax_with_cross_entropy
(
logits
,
label
,
num_samples
)
self
.
assertIsNotNone
(
output
)
print
(
str
(
program
))
@
decorators
.
prog_scope
()
def
test_nce
(
self
):
window_size
=
5
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录