Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Commit c58ce0a6
Author:  qijun
Date:    Mar 07, 2018
Parents: 26922a3b, 84aea8a8

    Merge remote-tracking branch 'baidu/develop' into update_getstarted_doc

Showing 60 changed files with 2,969 additions and 1,215 deletions.
Changed files:

benchmark/cluster/vgg16/vgg16_fluid.py (+20, -15)
cmake/external/snappy.cmake (+1, -0)
paddle/fluid/framework/CMakeLists.txt (+10, -5)
paddle/fluid/framework/channel.h (+38, -11)
paddle/fluid/framework/channel_impl.h (+229, -0)
paddle/fluid/framework/channel_test.cc (+53, -62)
paddle/fluid/framework/data_transform.cc (+1, -0)
paddle/fluid/framework/data_type.h (+9, -1)
paddle/fluid/framework/data_type_transform.cc (+12, -2)
paddle/fluid/framework/data_type_transform.cu (+1, -0)
paddle/fluid/framework/data_type_transform_test.cc (+131, -18)
paddle/fluid/framework/data_type_transform_test.cu (+215, -0)
paddle/fluid/framework/details/buffered_channel.h (+0, -142)
paddle/fluid/framework/details/unbuffered_channel.h (+0, -174)
paddle/fluid/framework/reader.cc (+0, -87)
paddle/fluid/framework/reader.h (+2, -77)
paddle/fluid/framework/tensor_util_test.cc (+42, -16)
paddle/fluid/framework/tensor_util_test.cu (+47, -14)
paddle/fluid/operators/CMakeLists.txt (+31, -8)
paddle/fluid/operators/concat_op.cc (+5, -4)
paddle/fluid/operators/concat_op.h (+38, -46)
paddle/fluid/operators/conv_mkldnn_op.cc (+313, -0)
paddle/fluid/operators/conv_op.cc (+30, -21)
paddle/fluid/operators/create_reader_op.cc (+0, -246)
paddle/fluid/operators/detail/CMakeLists.txt (+3, -1)
paddle/fluid/operators/elementwise_mul_op.h (+8, -75)
paddle/fluid/operators/elementwise_op_function.h (+1, -1)
paddle/fluid/operators/math/CMakeLists.txt (+3, -0)
paddle/fluid/operators/math/concat.cc (+119, -0)
paddle/fluid/operators/math/concat.cu (+280, -0)
paddle/fluid/operators/math/concat.h (+63, -0)
paddle/fluid/operators/math/concat_test.cc (+336, -0)
paddle/fluid/operators/math/math_function.cc (+7, -5)
paddle/fluid/operators/reader/CMakeLists.txt (+5, -0)
paddle/fluid/operators/reader/create_batch_reader_op.cc (+137, -0)
paddle/fluid/operators/reader/create_random_data_generator_op.cc (+110, -0)
paddle/fluid/operators/reader/create_shuffle_reader_op.cc (+97, -0)
paddle/fluid/operators/reader/reader_op_registry.cc (+116, -0)
paddle/fluid/operators/reader/reader_op_registry.h (+75, -0)
paddle/fluid/operators/reshape_op.cc (+3, -0)
paddle/fluid/operators/reshape_op.h (+17, -5)
paddle/fluid/platform/cudnn_helper.h (+14, -0)
paddle/fluid/platform/device_context.cc (+34, -48)
paddle/fluid/platform/device_context.h (+15, -36)
paddle/fluid/platform/float16.h (+52, -4)
paddle/fluid/platform/gpu_info.cc (+20, -0)
paddle/fluid/platform/gpu_info.h (+6, -0)
paddle/fluid/platform/mkldnn_helper.h (+15, -0)
python/paddle/fluid/__init__.py (+1, -0)
python/paddle/fluid/average.py (+61, -0)
python/paddle/fluid/backward.py (+1, -1)
python/paddle/fluid/evaluator.py (+0, -38)
python/paddle/fluid/layers/__init__.py (+3, -0)
python/paddle/fluid/layers/metric.py (+57, -0)
python/paddle/fluid/layers/nn.py (+4, -37)
python/paddle/fluid/nets.py (+8, -4)
python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py (+11, -6)
python/paddle/fluid/tests/unittests/test_conv2d_op.py (+23, -1)
python/paddle/fluid/tests/unittests/test_profiler.py (+8, -4)
python/paddle/fluid/tests/unittests/test_reshape_op.py (+28, -0)
benchmark/cluster/vgg16/vgg16_fluid.py (+20, -15)

@@ -138,13 +138,14 @@ def main():
     avg_cost = fluid.layers.mean(x=cost)

     # Evaluator
-    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    batch_size = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size)

     # inference program
     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):
-        test_target = accuracy.metrics + accuracy.states
-        inference_program = fluid.io.get_inference_program(test_target)
+        inference_program = fluid.io.get_inference_program(batch_acc)

     # Optimization
     optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)

@@ -157,27 +158,30 @@ def main():
     # test
     def test(exe):
-        accuracy.reset(exe)
+        test_pass_acc = fluid.average.WeightedAverage()
         for batch_id, data in enumerate(test_reader()):
             img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                     data)).astype("float32")
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([-1, 1])
-            exe.run(inference_program,
-                    feed={"pixel": img_data,
-                          "label": y_data})
-        return accuracy.eval(exe)
+            outs = exe.run(inference_program,
+                           feed={"pixel": img_data,
+                                 "label": y_data},
+                           fetch_list=[batch_acc, batch_size])
+            test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
+        return test_pass_acc.eval()

     def train_loop(exe, trainer_prog):
         iters = 0
         ts = time.time()
+        train_pass_acc = fluid.average.WeightedAverage()
         for pass_id in range(args.num_passes):
             # train
             start_time = time.time()
             num_samples = 0
-            accuracy.reset(exe)
+            train_pass_acc.reset()
             with profiler.profiler("CPU", 'total') as prof:
                 for batch_id, data in enumerate(train_reader()):
                     ts = time.time()

@@ -187,13 +191,14 @@ def main():
                     y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                     y_data = y_data.reshape([-1, 1])

-                    loss, acc = exe.run(
+                    loss, acc, b_size = exe.run(
                         trainer_prog,
                         feed={"pixel": img_data,
                               "label": y_data},
-                        fetch_list=[avg_cost] + accuracy.metrics)
+                        fetch_list=[avg_cost, batch_acc, batch_size])
                     iters += 1
                     num_samples += len(data)
+                    train_pass_acc.add(value=acc, weight=b_size)
                     print(
                         "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
                         % (pass_id, iters, loss, acc,

@@ -201,7 +206,7 @@ def main():
                 )  # The accuracy is the accumulation of batches, but not the current batch.

             pass_elapsed = time.time() - start_time
-            pass_train_acc = accuracy.eval(exe)
+            pass_train_acc = train_pass_acc.eval()
             pass_test_acc = test(exe)
             print(
                 "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
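For reference (my own gloss, not part of the patch): fluid.average.WeightedAverage accumulates each batch's accuracy weighted by its batch size, so the per-pass accuracy the benchmark now reports is

\[ \text{pass\_acc} = \frac{\sum_i \text{batch\_acc}_i \cdot \text{batch\_size}_i}{\sum_i \text{batch\_size}_i} \]

which is the same overall correct/total ratio the removed fluid.evaluator.Accuracy maintained, but computed from plain fetched tensors (batch_acc, batch_size).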
cmake/external/snappy.cmake (+1, -0)

@@ -39,6 +39,7 @@ ExternalProject_Add(
             -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
             -DBUILD_TESTING=OFF
+            -DSNAPPY_BUILD_TESTS:BOOL=OFF
             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
             ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
paddle/fluid/framework/CMakeLists.txt (+10, -5)

@@ -5,14 +5,14 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 if(WITH_GPU)
   nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
 else()
   cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
 endif()
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 if(WITH_GPU)
   nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
 else()
   cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
 ...

@@ -39,8 +39,13 @@ cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
 nv_test(data_device_transform_test SRCS data_device_transform_test.cu
         DEPS operator op_registry init math_function)

-cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
-cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
+if(WITH_GPU)
+  nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+  nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
+else()
+  cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
+  cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
+endif()

 cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function)
 cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform)
paddle/fluid/framework/channel.h (+38, -11)

@@ -28,24 +28,19 @@ class Channel {
   virtual bool Send(T*) = 0;
   virtual bool Receive(T*) = 0;
   virtual size_t Cap() = 0;
+  virtual void Lock() = 0;
+  virtual void Unlock() = 0;
   virtual void Close() = 0;
   virtual ~Channel() {}
 };

 // Forward declaration of channel implementations.
-namespace details {
-template <typename T>
-class Buffered;
-template <typename T>
-class UnBuffered;
-}  // namespace details
+template <typename T>
+class ChannelImpl;

 template <typename T>
 Channel<T>* MakeChannel(size_t buffer_size) {
-  if (buffer_size > 0) {
-    return new details::Buffered<T>(buffer_size);
-  }
-  return new details::UnBuffered<T>();
+  return new ChannelImpl<T>(buffer_size);
 }

 template <typename T>

@@ -89,6 +84,19 @@ class ChannelHolder {
     if (IsInitialized()) holder_->Close();
   }

+  size_t Cap() {
+    if (IsInitialized()) return holder_->Cap();
+    return -1;
+  }
+
+  void Lock() {
+    if (IsInitialized()) holder_->Lock();
+  }
+
+  void Unlock() {
+    if (IsInitialized()) holder_->Unlock();
+  }
+
   inline bool IsInitialized() const { return holder_ != nullptr; }

   inline const std::type_index Type() {

@@ -106,6 +114,9 @@ class ChannelHolder {
     virtual const std::type_index Type() const = 0;
     virtual void* Ptr() const = 0;
     virtual void Close() = 0;
+    virtual void Lock() = 0;
+    virtual void Unlock() = 0;
+    virtual size_t Cap() = 0;
   };

   template <typename T>

@@ -115,11 +126,28 @@ class ChannelHolder {
     }

     virtual const std::type_index Type() const { return type_; }
     virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
     virtual void Close() {
       if (channel_) channel_->Close();
     }

+    virtual size_t Cap() {
+      if (channel_)
+        return channel_->Cap();
+      else
+        return -1;
+    }
+
+    virtual void Lock() {
+      if (channel_) channel_->Lock();
+    }
+
+    virtual void Unlock() {
+      if (channel_) channel_->Unlock();
+    }
+
     std::unique_ptr<Channel<T>> channel_;
     const std::type_index type_;
   };

@@ -131,5 +159,4 @@ class ChannelHolder {
 }  // namespace framework
 }  // namespace paddle

-#include "paddle/fluid/framework/details/buffered_channel.h"
-#include "paddle/fluid/framework/details/unbuffered_channel.h"
+#include "paddle/fluid/framework/channel_impl.h"
paddle/fluid/framework/channel_impl.h (new file, +229)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <stddef.h>  // for size_t
#include <atomic>
#include <condition_variable>
#include <deque>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {

template <typename T>
class ChannelImpl : public paddle::framework::Channel<T> {
  friend Channel<T> *paddle::framework::MakeChannel<T>(size_t);
  friend void paddle::framework::CloseChannel<T>(Channel<T> *);

 public:
  virtual bool Send(T *);
  virtual bool Receive(T *);
  virtual size_t Cap() { return cap_; }
  virtual void Lock();
  virtual void Unlock();
  virtual void Close();

  ChannelImpl(size_t);
  virtual ~ChannelImpl();

 private:
  struct QueueMessage {
    T *data;
    std::condition_variable_any cond;
    bool chan_closed = false;
    bool completed = false;

    QueueMessage(T *item) : data(item) {}

    void Wait(std::unique_lock<std::recursive_mutex> &lock) {
      cond.wait(lock, [this]() { return completed; });
    }

    void Notify() {
      completed = true;
      cond.notify_all();
    }
  };

  bool send_return(bool value) {
    send_ctr--;
    destructor_cond_.notify_all();
    return value;
  }

  bool recv_return(bool value) {
    recv_ctr--;
    destructor_cond_.notify_all();
    return value;
  }

  size_t cap_;
  std::recursive_mutex mu_;
  bool closed_;
  std::deque<T> buf_;
  std::deque<std::shared_ptr<QueueMessage>> recvq;
  std::deque<std::shared_ptr<QueueMessage>> sendq;
  std::atomic<unsigned> send_ctr{0};
  std::atomic<unsigned> recv_ctr{0};
  std::condition_variable_any destructor_cond_;
};

template <typename T>
ChannelImpl<T>::ChannelImpl(size_t capacity)
    : cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) {
  PADDLE_ENFORCE_GE(capacity, 0);
}

template <typename T>
bool ChannelImpl<T>::Send(T *item) {
  send_ctr++;
  std::unique_lock<std::recursive_mutex> lock{mu_};

  // If channel is closed, do nothing
  if (closed_) {
    lock.unlock();
    // TODO(abhinavarora) Should panic on closed channel
    return send_return(false);
  }

  // If there is a receiver, directly pass the value we want
  // to send to the receiver, bypassing the channel buffer if any
  if (!recvq.empty()) {
    std::shared_ptr<QueueMessage> m = recvq.front();
    recvq.pop_front();
    // Do the data transfer
    *(m->data) = std::move(*item);
    // Wake up the blocked process and unlock
    m->Notify();
    lock.unlock();
    return send_return(true);
  }

  // Unbuffered channel will always bypass this
  // If buffered channel has space in buffer,
  // write the element to the buffer.
  if (buf_.size() < cap_) {
    // Copy to buffer
    buf_.push_back(std::move(*item));
    // Release lock and return true
    lock.unlock();
    return send_return(true);
  }

  // Block on channel, because some receiver will complete
  // the operation for us
  auto m = std::make_shared<QueueMessage>(item);
  sendq.push_back(m);
  m->Wait(lock);
  // TODO(abhinavarora) Should panic on closed channel
  return send_return(!m->chan_closed);
}

template <typename T>
bool ChannelImpl<T>::Receive(T *item) {
  recv_ctr++;
  std::unique_lock<std::recursive_mutex> lock{mu_};

  // If channel is closed and buffer is empty or
  // channel is unbuffered
  if (closed_ && buf_.empty()) {
    lock.unlock();
    return recv_return(false);
  }

  // If there is a sender, directly receive the value we want
  // from the sender, bypassing the channel buffer if any
  if (!sendq.empty()) {
    std::shared_ptr<QueueMessage> m = sendq.front();
    sendq.pop_front();
    // Do the data transfer
    *item = std::move(*(m->data));
    // Wake up the blocked process and unlock
    m->Notify();
    lock.unlock();
    return recv_return(true);
  }

  // If this is a buffered channel and there are items in buffer
  if (buf_.size() > 0) {
    // Directly read from buffer
    *item = std::move(buf_.front());
    buf_.pop_front();
    // Release lock and return true
    lock.unlock();
    return recv_return(true);
  }

  // No sender available, block on this channel
  // Some receiver will complete the option for us
  auto m = std::make_shared<QueueMessage>(item);
  recvq.push_back(m);
  m->Wait(lock);

  return recv_return(!m->chan_closed);
}

template <typename T>
void ChannelImpl<T>::Lock() {
  mu_.lock();
}

template <typename T>
void ChannelImpl<T>::Unlock() {
  mu_.unlock();
}

template <typename T>
void ChannelImpl<T>::Close() {
  std::unique_lock<std::recursive_mutex> lock{mu_};

  if (closed_) {
    // TODO(abhinavarora): closing an already closed channel should panic
    lock.unlock();
    return;
  }

  closed_ = true;

  // Empty the readers
  while (!recvq.empty()) {
    std::shared_ptr<QueueMessage> m = recvq.front();
    recvq.pop_front();
    m->chan_closed = true;
    m->Notify();
  }

  // Empty the senders
  while (!sendq.empty()) {
    std::shared_ptr<QueueMessage> m = sendq.front();
    sendq.pop_front();
    m->chan_closed = true;
    m->Notify();
  }
}

template <typename T>
ChannelImpl<T>::~ChannelImpl() {
  Close();
  // The destructor must wait for all readers and writers to complete their
  // task. The channel has been closed, so we will not accept new readers and
  // writers.
  std::unique_lock<std::recursive_mutex> lock{mu_};
  destructor_cond_.wait(lock,
                        [this]() { return send_ctr == 0 && recv_ctr == 0; });
}

}  // namespace framework
}  // namespace paddle
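Note (mine, not from the patch): with cap_ == 0 the buffer branch in Send is never reachable, so an unbuffered ChannelImpl degenerates to a rendezvous in which the sender and receiver pair up through sendq/recvq and copy the value directly. A minimal, hypothetical sketch of that behaviour, using only the API shown above:

#include <thread>
#include "paddle/fluid/framework/channel.h"

int main() {
  auto* ch = paddle::framework::MakeChannel<int>(0);  // unbuffered

  std::thread receiver([ch] {
    int v = 0;
    ch->Receive(&v);  // parks on recvq until a sender shows up
  });

  int msg = 42;
  ch->Send(&msg);  // pairs with the queued receiver; no buffer is involved

  receiver.join();
  paddle::framework::CloseChannel(ch);
  delete ch;  // ~ChannelImpl waits until send_ctr == 0 && recv_ctr == 0
}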
paddle/fluid/framework/channel_test.cc (+53, -62)

@@ -23,8 +23,19 @@ using paddle::framework::Channel;
 using paddle::framework::ChannelHolder;
 using paddle::framework::MakeChannel;
 using paddle::framework::CloseChannel;
-using paddle::framework::details::Buffered;
-using paddle::framework::details::UnBuffered;
+
+TEST(Channel, ChannelCapacityTest) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  EXPECT_EQ(ch->Cap(), buffer_size);
+  CloseChannel(ch);
+  delete ch;
+
+  ch = MakeChannel<size_t>(0);
+  EXPECT_EQ(ch->Cap(), 0U);
+  CloseChannel(ch);
+  delete ch;
+}

 void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
   unsigned sum_send = 0;

@@ -35,38 +46,17 @@ void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
     }
   });
   for (int i = 0; i < 5; i++) {
-    int recv;
+    int recv = 999;
     EXPECT_EQ(ch->Receive(&recv), true);
     EXPECT_EQ(recv, i);
   }
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));
   CloseChannel(ch);
   t.join();
   EXPECT_EQ(sum_send, 10U);
   delete ch;
 }

-TEST(Channel, MakeAndClose) {
-  using paddle::framework::details::Buffered;
-  using paddle::framework::details::UnBuffered;
-  {
-    // MakeChannel should return a buffered channel is buffer_size > 0.
-    auto ch = MakeChannel<int>(10);
-    EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
-    EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
-    CloseChannel(ch);
-    delete ch;
-  }
-  {
-    // MakeChannel should return an un-buffered channel is buffer_size = 0.
-    auto ch = MakeChannel<int>(0);
-    EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
-    EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
-    CloseChannel(ch);
-    delete ch;
-  }
-}
-
 TEST(Channel, SufficientBufferSizeDoesntBlock) {
   const size_t buffer_size = 10;
   auto ch = MakeChannel<size_t>(buffer_size);

@@ -166,7 +156,6 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
 TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
   const size_t buffer_size = 10;
   auto ch = MakeChannel<size_t>(buffer_size);
-  size_t sum = 0;
   std::thread t([&]() {
     // Try to write more than buffer size.
     for (size_t i = 0; i < 2 * buffer_size; ++i) {

@@ -174,12 +163,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
         EXPECT_EQ(ch->Send(&i), true);  // should block after 10 iterations
       else
         EXPECT_EQ(ch->Send(&i), false);
-      sum += i;
     }
   });
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
-  EXPECT_EQ(sum, 45U);
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
   CloseChannel(ch);
   t.join();
   delete ch;

@@ -232,10 +218,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
   for (size_t i = 0; i < num_threads; i++) t[i].join();
 }

-void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
-  using paddle::framework::details::Buffered;
-  using paddle::framework::details::UnBuffered;
+void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
   size_t num_threads = 5;
   std::thread t[num_threads];
   bool thread_ended[num_threads];

@@ -253,9 +236,9 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
         },
         &thread_ended[i], &send_success[i]);
   }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait

-  if (dynamic_cast<Buffered<int> *>(ch)) {
+  if (isBuffered) {
     // If ch is Buffered, atleast 4 threads must be blocked.
     int ct = 0;
     for (size_t i = 0; i < num_threads; i++) {

@@ -272,14 +255,14 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
   // This should unblock all senders
   CloseChannel(ch);
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait

   // Verify that all threads got unblocked
   for (size_t i = 0; i < num_threads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }

-  if (dynamic_cast<Buffered<int> *>(ch)) {
+  if (isBuffered) {
     // Verify that only 1 send was successful
     int ct = 0;
     for (size_t i = 0; i < num_threads; i++) {

@@ -304,7 +287,7 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
 // any senders waiting for channel to have write space
 TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
   auto ch = MakeChannel<int>(1);
-  ChannelCloseUnblocksSendersTest(ch);
+  ChannelCloseUnblocksSendersTest(ch, true);
   delete ch;
 }

@@ -320,7 +303,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
 // unblocks any senders waiting for senders
 TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
   auto ch = MakeChannel<int>(0);
-  ChannelCloseUnblocksReceiversTest(ch);
+  ChannelCloseUnblocksSendersTest(ch, false);
   delete ch;
 }

@@ -386,7 +369,7 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
 // This tests that destroying a channel unblocks
 // any senders waiting for channel to have write space
-void ChannelDestroyUnblockSenders(Channel<int> *ch) {
+void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
   size_t num_threads = 5;
   std::thread t[num_threads];
   bool thread_ended[num_threads];

@@ -405,11 +388,9 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch) {
         &thread_ended[i], &send_success[i]);
   }

-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
-  bool is_buffered_channel = false;
-  if (dynamic_cast<Buffered<int> *>(ch)) is_buffered_channel = true;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec

-  if (is_buffered_channel) {
+  if (isBuffered) {
     // If channel is buffered, verify that atleast 4 threads are blocked
     int ct = 0;
     for (size_t i = 0; i < num_threads; i++) {

@@ -432,13 +413,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch) {
     EXPECT_EQ(thread_ended[i], true);
   }

-  // Count number of successfuld sends
+  // Count number of successful sends
   int ct = 0;
   for (size_t i = 0; i < num_threads; i++) {
     if (send_success[i]) ct++;
   }

-  if (is_buffered_channel) {
+  if (isBuffered) {
     // Only 1 send must be successful
     EXPECT_EQ(ct, 1);
   } else {

@@ -495,7 +476,7 @@ TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
 TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) {
   size_t buffer_size = 1;
   auto ch = MakeChannel<int>(buffer_size);
-  ChannelDestroyUnblockSenders(ch);
+  ChannelDestroyUnblockSenders(ch, true);
 }

 // This tests that destroying an unbuffered channel also unblocks

@@ -507,7 +488,20 @@ TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) {
 TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) {
   auto ch = MakeChannel<int>(0);
-  ChannelDestroyUnblockSenders(ch);
+  ChannelDestroyUnblockSenders(ch, false);
+}
+
+TEST(ChannelHolder, ChannelHolderCapacityTest) {
+  const size_t buffer_size = 10;
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(buffer_size);
+  EXPECT_EQ(ch->Cap(), buffer_size);
+  delete ch;
+
+  ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  EXPECT_EQ(ch->Cap(), 0U);
+  delete ch;
 }

 void ChannelHolderSendReceive(ChannelHolder *ch) {

The remaining hunks in this file (@@ -211, -222, -342, -368, -641, -652, -683, -702, -775, -836) are timing-only adjustments: every 100 ms and 500 ms sleep_for in the blocking tests is raised to 200 ms (the trailing "wait ..." comments are updated to match). The hunk at @@ -663,9 +657,6 additionally drops the now-unused paddle::framework::details::Buffered/UnBuffered usings from ChannelHolderCloseUnblocksSendersTest.
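A minimal sketch (my own, using only the ChannelHolder members exercised above: Reset<T>(capacity), the new Cap(), and close()) of how the type-erased holder is driven:

#include "paddle/fluid/framework/channel.h"

using paddle::framework::ChannelHolder;

int main() {
  ChannelHolder holder;
  holder.Reset<int>(10);      // now holds a buffered Channel<int>, capacity 10
  size_t cap = holder.Cap();  // forwards to the underlying channel
  holder.close();             // closes the held channel
  return cap == 10 ? 0 : 1;
}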
paddle/fluid/framework/data_transform.cc (+1, -0)

@@ -42,6 +42,7 @@ void DataTransform(const OpKernelType& expected_kernel_type,
       PassTensorData(&out, &in);
     }

+    // do data type transform
     if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) {
       TransDataType(kernel_type_for_var, expected_kernel_type, in, &out);
       transformed = true;
paddle/fluid/framework/data_type.h (+9, -1)

@@ -16,13 +16,16 @@ limitations under the License. */
 #include <typeindex>
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"

 namespace paddle {
 namespace framework {

 inline proto::VarType::Type ToDataType(std::type_index type) {
   using namespace paddle::framework::proto;
-  if (typeid(float).hash_code() == type.hash_code()) {
+  if (typeid(platform::float16).hash_code() == type.hash_code()) {
+    return proto::VarType::FP16;
+  } else if (typeid(float).hash_code() == type.hash_code()) {
     return proto::VarType::FP32;
   } else if (typeid(double).hash_code() == type.hash_code()) {
     return proto::VarType::FP64;

@@ -40,6 +43,8 @@ inline proto::VarType::Type ToDataType(std::type_index type) {
 inline std::type_index ToTypeIndex(proto::VarType::Type type) {
   using namespace paddle::framework::proto;
   switch (type) {
+    case proto::VarType::FP16:
+      return typeid(platform::float16);
     case proto::VarType::FP32:
       return typeid(float);
     case proto::VarType::FP64:

@@ -59,6 +64,9 @@ template <typename Visitor>
 inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
   using namespace paddle::framework::proto;
   switch (type) {
+    case proto::VarType::FP16:
+      visitor.template operator()<platform::float16>();
+      break;
     case proto::VarType::FP32:
       visitor.template operator()<float>();
       break;
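A small, hypothetical check of the new float16 mapping (gtest-style, written in the same spirit as the framework's own tests; it is not part of the patch):

#include "gtest/gtest.h"
#include "paddle/fluid/framework/data_type.h"

TEST(DataType, Float16RoundTrip) {
  namespace f = paddle::framework;
  namespace p = paddle::platform;
  // typeid(float16) now maps to the FP16 proto enum...
  EXPECT_EQ(f::ToDataType(typeid(p::float16)), f::proto::VarType::FP16);
  // ...and the enum maps back to the same type_index.
  EXPECT_TRUE(f::ToTypeIndex(f::proto::VarType::FP16) == typeid(p::float16));
}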
paddle/fluid/framework/data_type_transform.cc (+12, -2)

@@ -47,9 +47,15 @@ struct CastDataType {
       auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
       trans(*context, in_begin, in_end, out_begin,
             CastDataTypeFunctor<InType, OutType>());
+#ifdef __NVCC__
+    } else if (platform::is_gpu_place(in_.place())) {
+      platform::Transform<platform::CUDADeviceContext> trans;
+      auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
+      trans(*context, in_begin, in_end, out_begin,
+            CastDataTypeFunctor<InType, OutType>());
+#endif
     } else {
-      // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type?
-      PADDLE_THROW("Unsupport CPU <-> GPU!");
+      PADDLE_THROW("Unsupported place!");
     }
   }
 };

@@ -65,6 +71,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
   auto ctx = pool.Get(in.place());

   switch (src_type) {
+    case proto::VarType::FP16:
+      framework::VisitDataType(dst_type,
+                               CastDataType<platform::float16>(in, out, ctx));
+      break;
     case proto::VarType::FP32:
       framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
       break;
paddle/fluid/framework/data_type_transform.cu (new, mode 120000: symbolic link, +1)

data_type_transform.cc
\ No newline at end of file
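Why a symlink is enough here (my reading, not stated in the patch): data_type_transform.cc already guards its GPU branch with __NVCC__ (see the hunk above), so compiling the same source a second time as a .cu file under WITH_GPU lets nvcc enable that branch without duplicating code. A toy, self-contained illustration of the pattern (not Paddle code):

#include <cstdio>

int main() {
#ifdef __NVCC__
  std::puts("compiled by nvcc: the CUDA branch is available");
#else
  std::puts("compiled by a host C++ compiler: CPU-only branch");
#endif
  return 0;
}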
paddle/fluid/framework/data_type_transform_test.cc (+131, -18)

@@ -22,6 +22,21 @@ TEST(DataTypeTransform, CPUTransform) {
   auto place = CPUPlace();

+  auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_int32 = OpKernelType(proto::VarType::INT32, place,
+                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_int64 = OpKernelType(proto::VarType::INT64, place,
+                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_bool = OpKernelType(proto::VarType::BOOL, place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+
+  // data type transform from float32
+  {
     Tensor in;
     Tensor out;

@@ -32,13 +47,6 @@ TEST(DataTypeTransform, CPUTransform) {
       ptr[i] = i / 3;
     }

-  auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_int32 = OpKernelType(proto::VarType::INT32, place,
-                                   DataLayout::kAnyLayout, LibraryType::kPlain);
-
     TransDataType(kernel_fp32, kernel_fp64, in, &out);
     double* out_data_double = out.data<double>();
     for (int i = 0; i < data_number; ++i) {

@@ -50,4 +58,109 @@ TEST(DataTypeTransform, CPUTransform) {
     for (int i = 0; i < data_number; ++i) {
       ASSERT_EQ(out_data_int[i], static_cast<int>(i / 3));
     }
   }

+  // data type transform from/to float16
+  {
+    Tensor in;
+    Tensor out;
+
+    float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), place);
+    int data_number = 2 * 3;
+
+    for (int i = 0; i < data_number; ++i) {
+      ptr[i] = i;
+    }
+
+    // transform from float16 to other data types
+    TransDataType(kernel_fp16, kernel_fp32, in, &out);
+    float* out_data_float = out.data<float>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
+    }
+
+    TransDataType(kernel_fp16, kernel_fp64, in, &out);
+    double* out_data_double = out.data<double>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
+    }
+
+    TransDataType(kernel_fp16, kernel_int32, in, &out);
+    int* out_data_int = out.data<int>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
+    }
+
+    TransDataType(kernel_fp16, kernel_int64, in, &out);
+    int64_t* out_data_int64 = out.data<int64_t>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
+    }
+
+    TransDataType(kernel_fp16, kernel_bool, in, &out);
+    bool* out_data_bool = out.data<bool>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
+    }
+
+    // transform float to float16
+    float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_float[i] = i;
+    }
+
+    TransDataType(kernel_fp32, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
+    }
+
+    // transform double to float16
+    double* in_data_double = in.mutable_data<double>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_double[i] = i;
+    }
+
+    TransDataType(kernel_fp64, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
+    }
+
+    // transform int to float16
+    int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_int[i] = i;
+    }
+
+    TransDataType(kernel_int32, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
+    }
+
+    // transform int64 to float16
+    int64_t* in_data_int64 = in.mutable_data<int64_t>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_int64[i] = i;
+    }
+
+    TransDataType(kernel_int64, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
+    }
+
+    // transform bool to float16
+    bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) {
+      in_data_bool[i] = i;
+    }
+
+    TransDataType(kernel_bool, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i) {
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
+    }
+  }
 }
paddle/fluid/framework/data_type_transform_test.cu
0 → 100644
浏览文件 @
c58ce0a6
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "gtest/gtest.h"
TEST(DataTypeTransform, GPUTransform) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  auto cpu_place = CPUPlace();
  auto gpu_place = CUDAPlace(0);
  CUDADeviceContext context(gpu_place);

  auto kernel_fp16 = OpKernelType(proto::VarType::FP16, gpu_place,
                                  DataLayout::kAnyLayout, LibraryType::kPlain);
  auto kernel_fp32 = OpKernelType(proto::VarType::FP32, gpu_place,
                                  DataLayout::kAnyLayout, LibraryType::kPlain);
  auto kernel_fp64 = OpKernelType(proto::VarType::FP64, gpu_place,
                                  DataLayout::kAnyLayout, LibraryType::kPlain);
  auto kernel_int32 = OpKernelType(proto::VarType::INT32, gpu_place,
                                   DataLayout::kAnyLayout, LibraryType::kPlain);
  auto kernel_int64 = OpKernelType(proto::VarType::INT64, gpu_place,
                                   DataLayout::kAnyLayout, LibraryType::kPlain);
  auto kernel_bool = OpKernelType(proto::VarType::BOOL, gpu_place,
                                  DataLayout::kAnyLayout, LibraryType::kPlain);

  // data type transform from float32
  {
    Tensor in;
    Tensor in_gpu;
    Tensor out_gpu;
    Tensor out;

    float* in_ptr = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
    float arr[6] = {0, 1, 2, 3, 4, 5};
    int data_number = sizeof(arr) / sizeof(arr[0]);
    memcpy(in_ptr, arr, sizeof(arr));
    TensorCopy(in, gpu_place, context, &in_gpu);

    TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();

    double* out_data_double = out.data<double>();
    for (int i = 0; i < data_number; ++i) {
      ASSERT_EQ(out_data_double[i], static_cast<double>(arr[i]));
    }

    TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();

    int* out_data_int = out.data<int>();
    for (int i = 0; i < data_number; ++i) {
      ASSERT_EQ(out_data_int[i], static_cast<int>(arr[i]));
    }
  }

  // data type transform from/to float16
  {
    Tensor in;
    Tensor in_gpu;
    Tensor out_gpu;
    Tensor out;

    float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), cpu_place);
    float16 arr[6] = {float16(0), float16(1), float16(2),
                      float16(3), float16(4), float16(5)};
    int data_number = sizeof(arr) / sizeof(arr[0]);
    memcpy(ptr, arr, sizeof(arr));
    TensorCopy(in, gpu_place, context, &in_gpu);

    // transform from float16 to other data types
    TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();

    float* out_data_float = out.data<float>();
    for (int i = 0; i < data_number; ++i) {
      ASSERT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
    }

    TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();

    double* out_data_double = out.data<double>();
    for (int i = 0; i < data_number; ++i) {
      ASSERT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
    }

    TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();

    int* out_data_int = out.data<int>();
    for (int i = 0; i < data_number; ++i) {
      ASSERT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
    }

    TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();

    int64_t* out_data_int64 = out.data<int64_t>();
    for (int i = 0; i < data_number; ++i) {
      ASSERT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
    }

    TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();

    bool* out_data_bool = out.data<bool>();
    for (int i = 0; i < data_number; ++i) {
      ASSERT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
    }

    // transform float to float16
    float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) {
      in_data_float[i] = i;
    }
    TensorCopy(in, gpu_place, context, &in_gpu);
    TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
    }

    // transform double to float16
    double* in_data_double =
        in.mutable_data<double>(make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) {
      in_data_double[i] = i;
    }
    TensorCopy(in, gpu_place, context, &in_gpu);
    TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
    }

    // transform int to float16
    int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) {
      in_data_int[i] = i;
    }
    TensorCopy(in, gpu_place, context, &in_gpu);
    TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
    }

    // transform int64 to float16
    int64_t* in_data_int64 =
        in.mutable_data<int64_t>(make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) {
      in_data_int64[i] = i;
    }
    TensorCopy(in, gpu_place, context, &in_gpu);
    TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
    }

    // transform bool to float16
    bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) {
      in_data_bool[i] = i;
    }
    TensorCopy(in, gpu_place, context, &in_gpu);
    TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i) {
      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
    }
  }
}
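The cases above check each conversion direction separately. A minimal round-trip sketch (not part of the original test) can make the symmetry explicit; it assumes the same TransDataType/TensorCopy helpers and the kernel_fp32/kernel_fp16 types and CUDA context declared above, and uses small integers so the values survive the float16 narrowing exactly.

// Hypothetical sketch only: fp32 -> fp16 -> fp32 round trip on the GPU,
// reusing cpu_place, gpu_place, context, kernel_fp32 and kernel_fp16 above.
{
  Tensor in, in_gpu, mid_gpu, out_gpu, out;
  float* src = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
  for (int i = 0; i < 6; ++i) src[i] = i;  // exactly representable in fp16
  TensorCopy(in, gpu_place, context, &in_gpu);
  TransDataType(kernel_fp32, kernel_fp16, in_gpu, &mid_gpu);   // narrow
  TransDataType(kernel_fp16, kernel_fp32, mid_gpu, &out_gpu);  // widen back
  TensorCopy(out_gpu, cpu_place, context, &out);
  context.Wait();
  const float* dst = out.data<float>();
  for (int i = 0; i < 6; ++i) {
    ASSERT_EQ(dst[i], src[i]);
  }
}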
paddle/fluid/framework/details/buffered_channel.h
deleted (100644 → 0)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <condition_variable>
#include <deque>
#include <mutex>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace details {

// Four of the properties of Buffered Channel:
// - A send to a full channel blocks temporarily until a receive from the
//   channel or the channel is closed.
// - A receive from an empty channel blocks temporarily until a send to the
//   channel or the channel is closed.
// - A send to a closed channel returns false immediately.
// - A receive from a closed channel returns false immediately.
template <typename T>
class Buffered : public paddle::framework::Channel<T> {
  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
  friend void paddle::framework::CloseChannel<T>(Channel<T>*);

 public:
  virtual bool Send(T*);
  virtual bool Receive(T*);
  virtual size_t Cap() { return cap_; }
  virtual void Close();
  virtual ~Buffered();

 private:
  size_t cap_;
  std::mutex mu_;
  std::condition_variable empty_cond_var_;
  std::condition_variable full_cond_var_;
  std::condition_variable destructor_cond_var_;
  std::deque<T> channel_;
  std::atomic<bool> closed_{false};
  std::atomic<unsigned> send_ctr{0};
  std::atomic<unsigned> recv_ctr{0};

  Buffered(size_t cap) : cap_(cap), closed_(false) {
    PADDLE_ENFORCE_GT(cap, 0);
  }

  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};

template <typename T>
bool Buffered<T>::Send(T* item) {
  bool ret = false;
  if (closed_) {
    return ret;
  }
  send_ctr++;
  std::unique_lock<std::mutex> lock(mu_);
  full_cond_var_.wait(lock,
                      [this]() { return channel_.size() < cap_ || closed_; });
  if (!closed_) {
    channel_.push_back(std::move(*item));
    lock.unlock();
    empty_cond_var_.notify_one();
    ret = true;
  }
  send_ctr--;
  destructor_cond_var_.notify_one();
  return ret;
}

template <typename T>
bool Buffered<T>::Receive(T* item) {
  bool ret = false;
  // Once the channel has been closed and all data has been consumed,
  // just return false. Don't even try acquiring the mutex.
  if (closed_ && channel_.empty()) {
    return false;
  }
  recv_ctr++;
  std::unique_lock<std::mutex> lock(mu_);
  empty_cond_var_.wait(lock,
                       [this]() { return !channel_.empty() || closed_; });
  if (!channel_.empty()) {
    *item = std::move(channel_.front());
    channel_.pop_front();
    full_cond_var_.notify_one();
    ret = true;
  }
  recv_ctr--;
  destructor_cond_var_.notify_one();
  return ret;
}

template <typename T>
void Buffered<T>::Close() {
  if (closed_) {
    return;
  }
  std::unique_lock<std::mutex> lock(mu_);
  closed_ = true;
  NotifyAllParticipants(&lock);
}

template <typename T>
Buffered<T>::~Buffered() {
  std::unique_lock<std::mutex> lock(mu_);
  closed_ = true;
  channel_.clear();
  NotifyAllParticipants(&lock);

  // The destructor must wait for all readers and writers to complete their
  // task. The channel has been closed, so we will not accept new readers and
  // writers.
  lock.lock();
  destructor_cond_var_.wait(
      lock, [this]() { return send_ctr == 0 && recv_ctr == 0; });
}

template <typename T>
void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
  lock->unlock();
  full_cond_var_.notify_all();
  empty_cond_var_.notify_all();
}

}  // namespace details
}  // namespace framework
}  // namespace paddle
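The four properties listed in the header comment are easiest to see from the caller's side. A minimal usage sketch follows, assuming that MakeChannel<T>(cap) with cap > 0 constructs a Buffered<T> and that CloseChannel wraps Close(), as the friend declarations above imply; it is illustrative only, not code from this commit.

// Sketch: basic Send/Receive/Close behavior of a buffered channel.
paddle::framework::Channel<int>* ch = paddle::framework::MakeChannel<int>(4);
int v = 1;
bool sent = ch->Send(&v);         // succeeds immediately while the buffer has room
int r = 0;
bool received = ch->Receive(&r);  // pops the value sent above (r == 1)
paddle::framework::CloseChannel(ch);
sent = ch->Send(&v);              // a send to a closed channel returns false
delete ch;                        // destructor waits for in-flight senders/receivers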
paddle/fluid/framework/details/unbuffered_channel.h
deleted (100644 → 0)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <condition_variable>
#include <mutex>
#include "paddle/fluid/framework/channel.h"
namespace paddle {
namespace framework {
namespace details {

// Four of the properties of UnBuffered Channel:
// - A send to a channel blocks temporarily until a receive from the
//   channel or the channel is closed.
// - A receive from a channel blocks temporarily until a send to the
//   channel or the channel is closed.
// - A send to a closed channel returns false immediately.
// - A receive from a closed channel returns false immediately.
template <typename T>
class UnBuffered : public paddle::framework::Channel<T> {
  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
  friend void paddle::framework::CloseChannel<T>(Channel<T>*);

 public:
  virtual bool Send(T*);
  virtual bool Receive(T*);
  virtual size_t Cap() { return 0; }
  virtual void Close();
  virtual ~UnBuffered();

 private:
  std::mutex mu_ch_;
  // Mutex for readers and writers who are waiting for other reader
  // and writer to complete execution
  std::recursive_mutex mu_read_, mu_write_;
  // reader_found_ is set true when a reader is ready to accept data
  // writer_found_ is set true when a writer is ready to send data
  // A transaction occurs only when both are true
  std::atomic<bool> reader_found_{false}, writer_found_{false};
  std::condition_variable cv_channel_;
  std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_;
  T* item{nullptr};
  std::atomic<bool> closed_{false};
  std::atomic<unsigned> send_ctr{0};
  std::atomic<unsigned> recv_ctr{0};

  UnBuffered() : closed_(false) {}

  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};

// This function implements the concept of how data should
// be sent from a writer to a reader.
template <typename T>
bool UnBuffered<T>::Send(T* data) {
  bool ret = false;
  if (closed_) {
    return ret;
  }
  send_ctr++;
  // Prevent other writers from entering
  std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
  writer_found_ = true;
  std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
  // If writer comes first, it should wait till a reader arrives
  cv_writer_.wait(cv_lock,
                  [this]() { return reader_found_ == true || closed_; });
  cv_reader_.notify_one();
  if (!closed_) {
    std::unique_lock<std::mutex> channel_lock(mu_ch_);
    item = data;
    channel_lock.unlock();
    cv_channel_.notify_one();
    channel_lock.lock();
    cv_channel_.wait(channel_lock,
                     [this]() { return item == nullptr || closed_; });
    ret = true;
  }
  writer_found_ = false;
  send_ctr--;
  cv_destructor_.notify_one();
  return ret;
}

// This function implements the concept of how
// data that was sent by a writer is read from a reader.
template <typename T>
bool UnBuffered<T>::Receive(T* data) {
  bool ret = false;
  // If channel is closed, we don't even want any reader to enter.
  // Unlike a buffered channel, an unbuffered channel does not allow
  // readers to read after closing because there is no buffer to be consumed.
  if (closed_) return ret;
  recv_ctr++;
  // Prevent other readers from entering
  std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
  reader_found_ = true;
  std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
  // If reader comes first, it should wait till a writer arrives
  cv_reader_.wait(cv_lock,
                  [this]() { return writer_found_ == true || closed_; });
  cv_writer_.notify_one();
  if (!closed_) {
    std::unique_lock<std::mutex> lock_ch{mu_ch_};
    // Reader should wait for the writer to first write its data
    cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
    if (!closed_) {
      *data = std::move(*item);
      item = nullptr;
      lock_ch.unlock();
      ret = true;
    }
    cv_channel_.notify_one();
  }
  reader_found_ = false;
  recv_ctr--;
  cv_destructor_.notify_one();
  return ret;
}

// This function implements the sequence of events
// that take place once the channel is closed.
template <typename T>
void UnBuffered<T>::Close() {
  if (closed_) {
    return;
  }
  std::unique_lock<std::mutex> lock(mu_ch_);
  item = nullptr;
  closed_ = true;
  NotifyAllParticipants(&lock);
}

// This function implements the sequence of events
// that are executed once the object of an UnBuffered
// channel is destroyed.
template <typename T>
UnBuffered<T>::~UnBuffered() {
  std::unique_lock<std::mutex> lock(mu_ch_);
  item = nullptr;
  closed_ = true;
  NotifyAllParticipants(&lock);
  lock.lock();
  cv_destructor_.wait(lock,
                      [this]() { return send_ctr == 0 && recv_ctr == 0; });
}

// This function notifies all the readers, writers and
// the channel condition variables.
template <typename T>
void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
  lock->unlock();
  cv_writer_.notify_all();
  cv_channel_.notify_all();
  cv_reader_.notify_all();
}

}  // namespace details
}  // namespace framework
}  // namespace paddle
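Unlike the buffered variant, a Send here only returns true after a Receive has actually taken the item, so one side always blocks until the other arrives (a rendezvous). A two-thread sketch of that hand-off, assuming MakeChannel<T>(0) yields an UnBuffered<T> as the friend declarations suggest and that <thread> is available; this is illustrative, not code from the commit.

// Rendezvous sketch: sender and receiver meet; neither side completes alone.
auto* ch = paddle::framework::MakeChannel<int>(0);
std::thread receiver([ch]() {
  int got = 0;
  bool ok = ch->Receive(&got);  // blocks until the sender below arrives
  // on success, ok == true and got == 42
});
int v = 42;
ch->Send(&v);                   // returns true only after the receiver took v
receiver.join();
paddle::framework::CloseChannel(ch);
delete ch;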
paddle/fluid/framework/reader.cc
...
@@ -25,92 +25,5 @@ DDim ReaderBase::shape(size_t idx) const {
  return shapes_[idx];
}

void ShuffleReader::ReadNext(std::vector<LoDTensor>* out) {
  if (iteration_pos_ >= buffer_.size()) {
    // Reload buffer with new data
    buffer_.clear();
    buffer_.reserve(buffer_size_);
    for (int i = 0; i < buffer_size_; ++i) {
      if (reader_->HasNext()) {
        buffer_.push_back(std::vector<LoDTensor>());
        reader_->ReadNext(&buffer_.back());
      } else {
        break;
      }
    }
    // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
    // optimized.
    std::random_shuffle(buffer_.begin(), buffer_.end());
    iteration_pos_ = 0;
  }
  out->clear();
  if (!buffer_.empty()) {
    std::swap(*out, buffer_[iteration_pos_++]);
  }
  // if buffer_ is empty, the 'out' will return as an empty vector.
}

void BatchReader::ReadNext(std::vector<LoDTensor>* out) {
  buffer_.clear();
  buffer_.reserve(batch_size_);
  for (int i = 0; i < batch_size_; ++i) {
    if (reader_->HasNext()) {
      buffer_.push_back(std::vector<LoDTensor>());
      reader_->ReadNext(&buffer_.back());
    } else {
      break;
    }
  }
  // Concat instances
  out->clear();
  if (buffer_.empty()) {
    // if buffer_ is empty, the 'out' will return as an empty vector.
    return;
  }
  int out_num = buffer_[0].size();
  out->reserve(out_num);
  for (int j = 0; j < out_num; ++j) {
    // Merge shape and check data type
    std::type_index batch_type = buffer_[0][j].type();
    DDim batch_shape = buffer_[0][j].dims();
    for (size_t i = 1; i < buffer_.size(); ++i) {
      std::type_index ins_type = buffer_[i][j].type();
      DDim ins_shape = buffer_[i][j].dims();
      PADDLE_ENFORCE_EQ(batch_type, ins_type);
      PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
                        slice_ddim(ins_shape, 1, ins_shape.size()));
      PADDLE_ENFORCE_GT(ins_shape[0], 0);
      batch_shape[0] += ins_shape[0];
    }

    LoDTensor out_tensor;
    out_tensor.Resize(batch_shape);
    out_tensor.mutable_data(platform::CPUPlace(), batch_type);
    int64_t dst_offset = 0;

    // Merge lod and data
    LoD batch_lod;
    for (size_t i = 0; i < buffer_.size(); ++i) {
      DDim ins_shape = buffer_[i][j].dims();
      LoD ins_lod = buffer_[i][j].lod();
      if (i == 0) {
        batch_lod = ins_lod;
      } else {
        PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
        for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
          auto& lod_level = batch_lod[level_idx];
          for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
            lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
          }
        }
      }
      Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
      TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
      dst_offset += ins_shape[0];
    }
    out_tensor.set_lod(batch_lod);
    out->push_back(out_tensor);
  }
}

}  // namespace framework
}  // namespace paddle
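BatchReader only requires instances to agree on the trailing dimensions (the slice_ddim check), so each instance may contribute a different number of leading rows; the merged tensor's leading dimension is the sum, and every instance is copied at an increasing row offset. A tiny standalone sketch of that offset bookkeeping, with plain containers instead of framework types (sizes chosen only for illustration):

// Sketch: the dst_offset arithmetic behind out_tensor.Slice(dst_offset, ...)
#include <cassert>
#include <vector>

int main() {
  std::vector<int> instance_rows = {2, 3};  // ins_shape[0] of each instance
  int batch_rows = 0;                       // becomes batch_shape[0]
  std::vector<int> dst_offsets;             // where each instance is copied
  for (int rows : instance_rows) {
    dst_offsets.push_back(batch_rows);      // Slice(dst_offset, dst_offset + rows)
    batch_rows += rows;
  }
  assert(batch_rows == 5);                  // merged leading dimension
  assert(dst_offsets[0] == 0 && dst_offsets[1] == 2);
  return 0;
}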
paddle/fluid/framework/reader.h
...
@@ -60,83 +60,8 @@ class DecoratedReader : public ReaderBase {
  ReaderBase* reader_;
};

// file readers

template <typename T>
class RandomDataGenerator : public FileReader {
 public:
  RandomDataGenerator(const std::vector<DDim>& shapes, float min, float max)
      : FileReader(shapes), min_(min), max_(max) {
    PADDLE_ENFORCE_LE(min, max,
                      "'min' shouldn't be greater than 'max'.(%f vs %f)", min,
                      max);
    unsigned int seed = std::random_device()();
    engine_.seed(seed);
    dist_ = std::uniform_real_distribution<float>(min_, max_);
  }

  void ReadNext(std::vector<LoDTensor>* out) override {
    out->clear();
    out->reserve(shapes_.size());
    for (const DDim& shape : shapes_) {
      PADDLE_ENFORCE_GE(
          shape.size(), 2,
          "The rank of reader's output data should be 2 at least.(Now it's %d)",
          shape.size());
      LoDTensor out_tensor;
      out_tensor.Resize(shape);
      T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
      int64_t numel = product(shape);
      for (int64_t i = 0; i < numel; ++i) {
        data[i] = dist_(engine_);
      }
      out->push_back(out_tensor);
    }
  }

  bool HasNext() const override { return true; }

  void ReInit() override { return; }

 private:
  float min_;
  float max_;
  std::minstd_rand engine_;
  std::uniform_real_distribution<float> dist_;
};

// decorated readers

class ShuffleReader : public DecoratedReader {
 public:
  ShuffleReader(ReaderBase* reader, int buffer_size)
      : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
    buffer_.reserve(buffer_size);
  }

  void ReadNext(std::vector<LoDTensor>* out) override;

 private:
  int buffer_size_;
  std::vector<std::vector<LoDTensor>> buffer_;
  size_t iteration_pos_;
};

class BatchReader : public DecoratedReader {
 public:
  BatchReader(ReaderBase* reader, int batch_size)
      : DecoratedReader(reader), batch_size_(batch_size) {
    buffer_.reserve(batch_size_);
  }

  void ReadNext(std::vector<LoDTensor>* out) override;

 private:
  int batch_size_;
  std::vector<std::vector<LoDTensor>> buffer_;
};

// The ReaderHolder is used as readers' unified wrapper,
// making it easier to access different type readers in Variables.
class ReaderHolder {
 public:
  void Reset(ReaderBase* reader) { reader_.reset(reader); }
...
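The classes removed from this header compose as decorators around a file reader: the generator produces raw instances, ShuffleReader buffers and permutes them, and BatchReader stacks them into batches held by a ReaderHolder. A minimal composition sketch using the constructors shown above; ownership handling is simplified and the snippet assumes these classes are still visible in paddle::framework, which is no longer the case after this commit.

// Sketch only: how the removed readers chained together.
std::vector<paddle::framework::DDim> shapes = {
    paddle::framework::make_ddim({32, 784})};
auto* file_reader =
    new paddle::framework::RandomDataGenerator<float>(shapes, 0.0f, 1.0f);
auto* shuffled =
    new paddle::framework::ShuffleReader(file_reader, /*buffer_size=*/100);
auto* batched =
    new paddle::framework::BatchReader(shuffled, /*batch_size=*/32);
paddle::framework::ReaderHolder holder;
holder.Reset(batched);  // Variables access readers through this wrapper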
paddle/fluid/framework/tensor_util_test.cc
...
@@ -235,6 +235,7 @@ TEST(TensorToVector, Tensor) {
TEST(TensorContainsNAN, CPU) {
  using namespace paddle::framework;
  using namespace paddle::platform;
  {
    Tensor src;
    float* buf = src.mutable_data<float>({3}, CPUPlace());
    buf[0] = 0.0;
@@ -243,11 +244,24 @@ TEST(TensorContainsNAN, CPU) {
    ASSERT_TRUE(TensorContainsNAN(src));
    buf[1] = 0.0;
    ASSERT_FALSE(TensorContainsNAN(src));
  }
  {
    Tensor src;
    float16* buf = src.mutable_data<float16>({3}, CPUPlace());
    buf[0] = 0.0;
    buf[1].x = 0x7fff;
    buf[2] = 0.0;
    ASSERT_TRUE(TensorContainsNAN(src));
    buf[1] = 0.0;
    ASSERT_FALSE(TensorContainsNAN(src));
  }
}

TEST(TensorContainsInf, CPU) {
  using namespace paddle::framework;
  using namespace paddle::platform;
  {
    Tensor src;
    double* buf = src.mutable_data<double>({3}, CPUPlace());
    buf[0] = 1.0;
@@ -256,6 +270,18 @@ TEST(TensorContainsInf, CPU) {
    ASSERT_TRUE(TensorContainsInf(src));
    buf[1] = 1.0;
    ASSERT_FALSE(TensorContainsInf(src));
  }
  {
    Tensor src;
    float16* buf = src.mutable_data<float16>({3}, CPUPlace());
    buf[0] = 1.0;
    buf[1].x = 0x7c00;
    buf[2] = 0.0;
    ASSERT_TRUE(TensorContainsInf(src));
    buf[1] = 1.0;
    ASSERT_FALSE(TensorContainsInf(src));
  }
}

TEST(Tensor, FromAndToStream) {
...
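The new float16 cases inject NaN and Inf by raw bit pattern rather than by value. In IEEE 754 half precision the exponent field occupies bits 0x7C00 and the mantissa bits 0x03FF; an all-ones exponent with a zero mantissa is infinity, and with a non-zero mantissa it is NaN, which is why the tests write 0x7c00 for +Inf and 0x7fff (all mantissa bits set) for a NaN. A tiny standalone check of those two patterns, assuming only the standard bit layout (this helper is not part of the Paddle code):

// Sketch: decode the half-precision bit patterns used in the tests above.
#include <cassert>
#include <cstdint>

int main() {
  auto is_inf = [](uint16_t h) {
    return (h & 0x7c00) == 0x7c00 && (h & 0x03ff) == 0;
  };
  auto is_nan = [](uint16_t h) {
    return (h & 0x7c00) == 0x7c00 && (h & 0x03ff) != 0;
  };
  assert(is_inf(0x7c00) && !is_nan(0x7c00));  // exponent all ones, mantissa 0
  assert(is_nan(0x7fff) && !is_inf(0x7fff));  // exponent all ones, mantissa != 0
  return 0;
}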
paddle/fluid/framework/tensor_util_test.cu
...
@@ -25,32 +25,65 @@ static __global__ void FillNAN(float* buf) {
  buf[1] = 0.1;
  buf[2] = NAN;
}

static __global__ void FillInf(float* buf) {
  buf[0] = 0.0;
  buf[1] = INFINITY;
  buf[2] = 0.5;
}

static __global__ void FillNAN(platform::float16* buf) {
  buf[0] = 0.0;
  buf[1] = 0.1;
  buf[2].x = 0x7fff;
}

static __global__ void FillInf(platform::float16* buf) {
  buf[0] = 0.0;
  buf[1].x = 0x7c00;
  buf[2] = 0.5;
}

TEST(TensorContainsNAN, GPU) {
  using namespace paddle::platform;
  CUDAPlace gpu(0);
  auto& pool = DeviceContextPool::Instance();
  auto* cuda_ctx = pool.GetByPlace(gpu);
  {
    Tensor tensor;
    float* buf = tensor.mutable_data<float>({3}, gpu);
    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
    cuda_ctx->Wait();
    ASSERT_TRUE(TensorContainsNAN(tensor));
  }
  {
    Tensor tensor;
    float16* buf = tensor.mutable_data<float16>({3}, gpu);
    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
    cuda_ctx->Wait();
    ASSERT_TRUE(TensorContainsNAN(tensor));
  }
}

TEST(TensorContainsInf, GPU) {
  using namespace paddle::platform;
  CUDAPlace gpu(0);
  auto& pool = DeviceContextPool::Instance();
  auto* cuda_ctx = pool.GetByPlace(gpu);
  {
    Tensor tensor;
    float* buf = tensor.mutable_data<float>({3}, gpu);
    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
    cuda_ctx->Wait();
    ASSERT_TRUE(TensorContainsInf(tensor));
  }
  {
    Tensor tensor;
    float16* buf = tensor.mutable_data<float16>({3}, gpu);
    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
    cuda_ctx->Wait();
    ASSERT_TRUE(TensorContainsInf(tensor));
  }
}

}  // namespace framework
...
paddle/fluid/operators/CMakeLists.txt
file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
list(REMOVE_DUPLICATES GENERAL_OPS)
set(DEPS_OPS "")
set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
@@ -13,6 +15,8 @@ function(op_library TARGET)
  set(cu_cc_srcs)
  set(cudnn_cu_cc_srcs)
  set(CUDNN_FILE)
  set(mkldnn_cc_srcs)
  set(MKLDNN_FILE)
  set(op_common_deps operator op_registry math_function)
  set(options "")
  set(oneValueArgs "")
@@ -36,12 +40,20 @@ function(op_library TARGET)
    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
      list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
    endif()
    if(WITH_MKLDNN)
      string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
        list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
      endif()
    endif()
  else()
    foreach(src ${op_library_SRCS})
      if (${src} MATCHES ".*\\.cu$")
        list(APPEND cu_srcs ${src})
      elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
        list(APPEND cudnn_cu_cc_srcs ${src})
      elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
        list(APPEND mkldnn_cc_srcs ${src})
      elseif(${src} MATCHES ".*\\.cu.cc$")
        list(APPEND cu_cc_srcs ${src})
      elseif(${src} MATCHES ".*\\.cc$")
@@ -62,15 +74,15 @@ function(op_library TARGET)
    set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
  endif()
  if (WITH_GPU)
    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
      ${op_common_deps})
  else()
    cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
      ${op_common_deps})
  endif()

  # Define operators that don't need pybind here.
  foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
    if ("${TARGET}" STREQUAL "${manual_pybind_op}")
      set(pybind_flag 1)
    endif()
@@ -101,7 +113,8 @@ function(op_library TARGET)
  # pybind USE_CPU_ONLY_OP
  list(LENGTH cu_srcs cu_srcs_len)
  list(LENGTH cu_cc_srcs cu_cc_srcs_len)
  list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
  if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
    file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
    set(pybind_flag 1)
  endif()
@@ -112,6 +125,11 @@ function(op_library TARGET)
    file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
  endif()

  # pybind USE_OP_DEVICE_KERNEL for MKLDNN
  if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
    file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
  endif()

  # pybind USE_OP
  if (${pybind_flag} EQUAL 0)
    file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
@@ -128,8 +146,8 @@ else()
  set(DEPS_OPS ${DEPS_OPS} nccl_op)
endif()

if(WITH_DISTRIBUTE)
  add_subdirectory(detail)
  set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  op_library(send_op DEPS ${DISTRIBUTE_DEPS})
@@ -170,7 +188,6 @@ op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
op_library(cos_sim_op DEPS cos_sim_functor)
op_library(parallel_do_op DEPS executor)
if (WITH_GPU)
  op_library(conv_op DEPS vol2col depthwise_conv)
@@ -184,12 +201,18 @@ op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor)
op_library(concat_op DEPS concat_functor)

list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
  op_library(${src})
endforeach()

file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")

add_subdirectory(reader)
foreach(src ${READER_LIBRARY})
  set(OP_LIBRARY ${src} ${OP_LIBRARY})
endforeach()

set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
...
paddle/fluid/operators/concat_op.cc
...
@@ -100,7 +100,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
namespace ops = paddle::operators;
REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
               ops::ConcatOpGrad, false)
REGISTER_OP_CPU_KERNEL(
    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>)
REGISTER_OP_CPU_KERNEL(
    concat_grad,
    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>)
paddle/fluid/operators/concat_op.h
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/operators/strided_memcpy.h"

namespace paddle {
@@ -27,54 +28,30 @@ class ConcatKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto ins = ctx.MultiInput<framework::Tensor>("X");
    framework::Tensor* out = ctx.Output<framework::Tensor>("Out");
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
    auto place = ctx.GetPlace();
    out->mutable_data<T>(place);

    // Sometimes direct copies will be faster; this may need deeper analysis.
    if (axis == 0 && ins.size() < 10) {
      size_t output_offset = 0;
      for (auto* in : ins) {
        auto in_stride = framework::stride_numel(in->dims());
        auto out_stride = framework::stride_numel(out->dims());
        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
                                    out->data<T>() + output_offset, out_stride,
                                    in->data<T>(), in_stride, in_stride[axis]);
        output_offset += in_stride[axis];
      }
    } else {
      std::vector<framework::Tensor> inputs(ins.size());
      for (size_t j = 0; j < ins.size(); ++j) {
        inputs[j] = *ins[j];
      }
      auto& dev_ctx = ctx.template device_context<DeviceContext>();
      paddle::operators::math::ConcatFunctor<DeviceContext, T> concat_functor;
      concat_functor(dev_ctx, inputs, static_cast<int>(axis), out);
    }
  }
};
@@ -86,6 +63,9 @@ class ConcatGradKernel : public framework::OpKernel<T> {
    auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));

    // Sometimes direct copies will be faster; this may need deeper analysis.
    if (axis == 0 && outs.size() < 10) {
      size_t input_offset = 0;
      auto in_stride = framework::stride_numel(in->dims());
@@ -97,6 +77,18 @@ class ConcatGradKernel : public framework::OpKernel<T> {
                                    in_stride, out_stride[axis]);
        input_offset += out_stride[axis];
      }
    } else {
      std::vector<framework::Tensor> outputs(outs.size());
      for (size_t j = 0; j < outs.size(); ++j) {
        outs[j]->mutable_data<T>(ctx.GetPlace());
        outputs[j] = *outs[j];
      }
      auto& dev_ctx = ctx.template device_context<DeviceContext>();
      paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
          concat_grad_functor;
      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), outputs);
    }
  }
};
...
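The fast path above is only taken for axis == 0 because then every input occupies one contiguous block of the output, so concatenation degenerates to a sequence of copies at increasing offsets; for axis >= 1 the copies interleave and the ConcatFunctor kernel is used instead. A plain-C++ sketch of the axis-0 offset arithmetic, with standard containers standing in for Tensors (sizes are illustrative only):

// Sketch: axis-0 concat of {2,3} and {4,3} into {6,3} as two contiguous copies.
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<float> a(2 * 3, 1.0f);  // dims {2, 3}
  std::vector<float> b(4 * 3, 2.0f);  // dims {4, 3}, same trailing dims
  std::vector<float> out(6 * 3);      // concat along axis 0 -> dims {6, 3}
  size_t output_offset = 0;
  for (const auto* in : {&a, &b}) {
    std::copy(in->begin(), in->end(), out.begin() + output_offset);
    output_offset += in->size();      // in_stride[0] equals the input's numel
  }
  assert(output_offset == out.size());
  return 0;
}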
paddle/fluid/operators/conv_mkldnn_op.cc
new file (0 → 100644)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {

using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc;

using mkldnn::memory;  // Note: paddle has also "memory" namespace
using mkldnn::primitive;
using mkldnn::convolution_forward;
using mkldnn::convolution_backward_weights;
using mkldnn::convolution_backward_data;
using mkldnn::convolution_direct;
using mkldnn::prop_kind;
using mkldnn::padding_kind;
using mkldnn::stream;

namespace {
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
                     const memory::desc& dst, const std::vector<int>& strides,
                     const std::vector<int>& paddings,
                     const mkldnn::engine& engine);

convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
    const memory::desc& src, const memory::desc& diff_weights,
    const memory::desc& diff_dst, const std::vector<int>& strides,
    const std::vector<int>& paddings,
    const convolution_forward::primitive_desc& conv_pd,
    const mkldnn::engine& engine);

convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
    const memory::desc& diff_src, const memory::desc& weights,
    const memory::desc& diff_dst, const std::vector<int>& strides,
    const std::vector<int>& paddings,
    const convolution_forward::primitive_desc& conv_pd,
    const mkldnn::engine& engine);
}  // anonymous namespace

template <typename T>
class ConvOpMkldnnKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");

    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();

    auto* input = ctx.Input<Tensor>("Input");
    auto* filter = ctx.Input<Tensor>("Filter");
    auto* output = ctx.Output<Tensor>("Output");

    // Get a unique name from the "argument" name of the "Output" variable.
    // This name will be used as key when saving info into device context
    const std::string key = ctx.op().Output("Output");
    const std::string key_conv_pd = key + "@conv_pd";

    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
    int groups = ctx.Attr<int>("groups");

    // TODO(pzelazko-intel) add support for group convolution and dilation
    PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
    PADDLE_ENFORCE(
        dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
        "dilation in convolution is not implemented yet");

    const T* input_data = input->data<T>();
    const T* filter_data = filter->data<T>();
    // allocate memory for output
    T* output_data = output->mutable_data<T>(ctx.GetPlace());

    PADDLE_ENFORCE(input->dims().size() == 4,
                   "Input must be with 4 dimensions, i.e. NCHW");
    PADDLE_ENFORCE(filter->dims().size() == 4,
                   "Filter must be with 4 dimensions, i.e. OIHW");

    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
    std::vector<int> weights_tz =
        paddle::framework::vectorize2int(filter->dims());
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());

    // TODO(pzelazko-intel): support more formats
    // memory descriptors for convolution src/weight/dst
    auto conv_src_md =
        MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
    auto conv_weights_md =
        MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
    auto conv_dst_md =
        MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);

    // create memory primitives
    auto conv_src_memory =
        memory({conv_src_md, mkldnn_engine}, (void*)input_data);
    auto conv_weights_memory =
        memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);
    auto conv_dst_memory = memory({conv_dst_md, mkldnn_engine}, output_data);

    std::unique_ptr<convolution_forward::primitive_desc> conv_pd =
        ConvFwdPrimitiveDesc(conv_src_md, conv_weights_md, conv_dst_md, strides,
                             paddings, mkldnn_engine);

    // save p_conv_pd into dev_ctx to be referred in backward path
    auto p_conv_pd = conv_pd.get();
    std::shared_ptr<void> conv_pd_value = std::move(conv_pd);
    dev_ctx.SetBlob(key_conv_pd, conv_pd_value);

    // create convolution op primitive
    auto conv_prim = convolution_forward(*p_conv_pd, conv_src_memory,
                                         conv_weights_memory, conv_dst_memory);

    // push op to stream and wait MKLDNN until it's executed
    std::vector<primitive> pipeline{conv_prim};
    stream(stream::kind::eager).submit(pipeline).wait();
  }
};

template <typename T>
class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");

    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();

    const Tensor* input = ctx.Input<Tensor>("Input");
    const Tensor* filter = ctx.Input<Tensor>("Filter");
    const Tensor* output = ctx.Input<Tensor>("Output");
    const Tensor* output_grad =
        ctx.Input<Tensor>(framework::GradVarName("Output"));
    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));

    if (!input_grad && !filter_grad) return;

    // Get a unique name from the "argument" name of the "Output" variable.
    // This name will be used as key when saving info into device context
    const std::string key = ctx.op().Input("Output");
    const std::string key_conv_pd = key + "@conv_pd";

    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");

    const T* input_data = input->data<T>();
    const T* filter_data = filter->data<T>();
    const T* output_grad_data = output_grad->data<T>();
    T* input_grad_data = nullptr;
    T* filter_grad_data = nullptr;

    // allocate memory for gradient of input/filter
    if (input_grad) {
      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
    }
    if (filter_grad) {
      filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
    }

    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
    std::vector<int> weights_tz =
        paddle::framework::vectorize2int(filter->dims());
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());

    // TODO(pzelazko-intel): support more formats
    auto conv_src_md =
        MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
    auto conv_diff_src_md =
        MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
    auto conv_weights_md =
        MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
    auto conv_diff_weights_md =
        MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
    auto conv_diff_dst_md =
        MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);

    // create memory
    auto conv_diff_dst_memory =
        memory({conv_diff_weights_md, mkldnn_engine}, (void*)output_grad_data);

    // Retrieve conv_pd from device context
    std::shared_ptr<void> conv_pd;
    convolution_forward::primitive_desc* p_conv_pd;

    conv_pd = dev_ctx.GetBlob(key_conv_pd);
    PADDLE_ENFORCE(conv_pd != nullptr,
                   "Fail to find conv_pd in device context");
    p_conv_pd =
        static_cast<convolution_forward::primitive_desc*>(conv_pd.get());

    // create backward conv primitive for weights
    if (filter_grad) {
      // create primitive descriptor
      convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
          ConvBwdWeightsPrimitiveDesc(conv_src_md, conv_diff_weights_md,
                                      conv_diff_dst_md, strides, paddings,
                                      *p_conv_pd, mkldnn_engine);

      // create memory
      auto conv_diff_weights_memory = memory(
          {conv_diff_weights_md, mkldnn_engine}, (void*)filter_grad_data);
      auto conv_src_memory =
          memory({conv_src_md, mkldnn_engine}, (void*)input_data);

      // create backward conv primitive for weights
      auto conv_bwd_weights_prim = convolution_backward_weights(
          conv_bwd_weights_pd, conv_src_memory, conv_diff_dst_memory,
          conv_diff_weights_memory);

      // push primitive and execute it
      std::vector<primitive> pipeline{conv_bwd_weights_prim};
      stream(stream::kind::eager).submit(pipeline).wait();
    }

    if (input_grad) {
      // create primitive descriptor
      convolution_backward_data::primitive_desc conv_bwd_data_pd =
          ConvBwdDataPrimitiveDesc(conv_diff_src_md, conv_weights_md,
                                   conv_diff_dst_md, strides, paddings,
                                   *p_conv_pd, mkldnn_engine);

      // create memory
      auto conv_diff_src_memory =
          memory({conv_diff_src_md, mkldnn_engine}, (void*)input_grad_data);
      auto conv_weights_memory =
          memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);

      // create backward conv primitive for data
      auto conv_bwd_data_prim =
          convolution_backward_data(conv_bwd_data_pd, conv_diff_dst_memory,
                                    conv_weights_memory, conv_diff_src_memory);

      // push primitive and execute it
      std::vector<primitive> pipeline{conv_bwd_data_prim};
      stream(stream::kind::eager).submit(pipeline).wait();
    }
  }  // Compute()
};

namespace {
std::unique_ptr<convolution_forward::primitive_desc> ConvFwdPrimitiveDesc(
    const memory::desc& src, const memory::desc& weights,
    const memory::desc& dst, const std::vector<int>& strides,
    const std::vector<int>& paddings, const mkldnn::engine& engine) {
  mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
  mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};

  auto conv_desc = mkldnn::convolution_forward::desc(
      mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, dst,
      stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);

  auto p_conv_pd = new convolution_forward::primitive_desc(conv_desc, engine);

  return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
      p_conv_pd);
}

convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
    const memory::desc& src, const memory::desc& diff_weights,
    const memory::desc& diff_dst, const std::vector<int>& strides,
    const std::vector<int>& paddings,
    const convolution_forward::primitive_desc& conv_pd,
    const mkldnn::engine& engine) {
  auto conv_bwd_weights_desc = convolution_backward_weights::desc(
      convolution_direct, src, diff_weights, diff_dst, strides, paddings,
      paddings, padding_kind::zero);
  return convolution_backward_weights::primitive_desc(conv_bwd_weights_desc,
                                                      engine, conv_pd);
}

convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
    const memory::desc& diff_src, const memory::desc& weights,
    const memory::desc& diff_dst, const std::vector<int>& strides,
    const std::vector<int>& paddings,
    const convolution_forward::primitive_desc& conv_pd,
    const mkldnn::engine& engine) {
  auto conv_bwd_data_desc = convolution_backward_data::desc(
      convolution_direct, diff_src, weights, diff_dst, strides, paddings,
      paddings, padding_kind::zero);
  return convolution_backward_data::primitive_desc(conv_bwd_data_desc, engine,
                                                   conv_pd);
}
}  // anonymous namespace
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::ConvOpMkldnnKernel<float>);

REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::ConvGradOpMkldnnKernel<float>);
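The forward kernel stashes its convolution primitive descriptor in the MKLDNN device context under a key derived from the "Output" variable name, and the backward kernel retrieves it with the same key instead of rebuilding it. A reduced sketch of that SetBlob/GetBlob handshake, with the device context replaced by a plain map; the names here are illustrative and not part of the Paddle API.

// Sketch: key-based publish/lookup of a descriptor between two passes.
#include <cassert>
#include <map>
#include <memory>
#include <string>

struct FakePrimitiveDesc { int id = 7; };  // stand-in for conv primitive_desc

int main() {
  std::map<std::string, std::shared_ptr<void>> blobs;  // "device context"
  const std::string key_conv_pd = std::string("conv2d_0.out") + "@conv_pd";

  // forward pass: create the descriptor and publish it under the key
  blobs[key_conv_pd] = std::make_shared<FakePrimitiveDesc>();

  // backward pass: look it up by the same key and reuse it
  auto blob = blobs.at(key_conv_pd);
  auto* pd = static_cast<FakePrimitiveDesc*>(blob.get());
  assert(pd != nullptr && pd->id == 7);
  return 0;
}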
paddle/fluid/operators/conv_op.cc
...
@@ -13,6 +13,12 @@ See the License for the specific language governing permissions and
...
@@ -13,6 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/conv_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -64,22 +70,21 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
...
@@ -64,22 +70,21 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
framework
::
OpKernelType
ConvOp
::
GetExpectedKernelType
(
framework
::
OpKernelType
ConvOp
::
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
const
framework
::
ExecutionContext
&
ctx
)
const
{
bool
use_cudnn
=
ctx
.
Attr
<
bool
>
(
"use_cudnn"
);
framework
::
LibraryType
library_
{framework::LibraryType::kPlain};
-  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
 #ifdef PADDLE_WITH_CUDA
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  if (platform::CanCUDNNBeUsed(ctx)) {
+    library_ = framework::LibraryType::kCUDNN;
   }
 #endif
-  framework::LibraryType library_;
-  if (use_cudnn) {
-    library_ = framework::LibraryType::kCUDNN;
-  } else {
-    library_ = framework::LibraryType::kPlain;
+#ifdef PADDLE_WITH_MKLDNN
+  if (library_ == framework::LibraryType::kPlain &&
+      platform::CanMKLDNNBeUsed(ctx)) {
+    library_ = framework::LibraryType::kMKLDNN;
   }
+#endif

   std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
   framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
...
@@ -131,6 +136,9 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")
       .SetDefault(false);
+  AddAttr<bool>("use_mkldnn", "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
...
@@ -224,6 +232,9 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")
       .SetDefault(false);
+  AddAttr<bool>("use_mkldnn", "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
...
@@ -284,23 +295,21 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
-  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+  framework::LibraryType library_{framework::LibraryType::kPlain};
 #ifdef PADDLE_WITH_CUDA
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  if (platform::CanCUDNNBeUsed(ctx)) {
+    library_ = framework::LibraryType::kCUDNN;
   }
 #endif
-  framework::LibraryType library_;
-  if (use_cudnn) {
-    library_ = framework::LibraryType::kCUDNN;
-  } else {
-    library_ = framework::LibraryType::kPlain;
+#ifdef PADDLE_WITH_MKLDNN
+  if (library_ == framework::LibraryType::kPlain &&
+      platform::CanMKLDNNBeUsed(ctx)) {
+    library_ = framework::LibraryType::kMKLDNN;
   }
+#endif

   std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
   framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
...
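The kernel-type selection above is a simple priority chain: start from kPlain, upgrade to kCUDNN when cuDNN is usable, otherwise fall back to MKLDNN when it is available. Below is a minimal standalone sketch of that decision logic only; SelectLibrary and the two boolean flags are hypothetical stand-ins for platform::CanCUDNNBeUsed / CanMKLDNNBeUsed, not Paddle API.

    #include <iostream>

    enum class LibraryType { kPlain, kCUDNN, kMKLDNN };

    // Illustrative stand-ins for platform::CanCUDNNBeUsed(ctx) and
    // platform::CanMKLDNNBeUsed(ctx); in Paddle these query the execution context.
    LibraryType SelectLibrary(bool can_use_cudnn, bool can_use_mkldnn) {
      LibraryType library = LibraryType::kPlain;
      if (can_use_cudnn) {
        library = LibraryType::kCUDNN;
      }
      // MKLDNN is only chosen when no GPU library was picked.
      if (library == LibraryType::kPlain && can_use_mkldnn) {
        library = LibraryType::kMKLDNN;
      }
      return library;
    }

    int main() {
      std::cout << static_cast<int>(SelectLibrary(false, true)) << "\n";  // kMKLDNN
      std::cout << static_cast<int>(SelectLibrary(true, true)) << "\n";   // kCUDNN
    }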
paddle/fluid/operators/create_reader_op.cc
deleted (100644 → 0)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
namespace paddle {
namespace operators {

static std::vector<framework::DDim> RestoreShapes(
    const std::vector<int>& shape_concat, const std::vector<int>& ranks) {
  std::vector<framework::DDim> res;
  int offset = 0;
  for (int len : ranks) {
    auto start_it = shape_concat.begin() + offset;
    auto end_it = start_it + len;
    res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
    offset += len;
  }
  return res;
}

// general infershape for file readers
class CreateFileReaderInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "The output file reader should not be null.");
    const auto shape_concat = ctx->Attrs().Get<std::vector<int>>("shape_concat");
    const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
    ctx->SetReaderDims("Out", shapes);

    if (ctx->IsRuntime()) {
      const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
      PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
                        "The number of 'lod_levels'(%d) doesn't match the number "
                        "of 'shapes'(%d).",
                        lod_levels.size(), shapes.size());
      framework::VarDesc* reader =
          boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
      reader->SetLoDLevels(lod_levels);
    }
  }
};

// general infershape for decorated readers
class CreateDecoratedReaderInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
                   "Input(UnderlyingReader) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "The output decorated reader should not be null.");
    ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));

    if (ctx->IsRuntime()) {
      framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
          ctx->GetInputVarPtrs("UnderlyingReader")[0]);
      framework::VarDesc* out_reader =
          boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
      out_reader->SetLoDLevels(in_reader->GetLoDLevels());
    }
  }
};

// general var type inference for file readers
class CreateFileReaderInferVarType : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
                  framework::BlockDesc* block) const override {
    std::string reader_name = op_desc.Output("Out")[0];
    framework::VarDesc* reader = block->FindVarRecursive(reader_name);
    reader->SetType(framework::proto::VarType::READER);
  }
};

// general var type inference for decorated readers
class CreateDecoratedReaderInferVarType : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
                  framework::BlockDesc* block) const override {
    std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
    framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name);
    std::string out_reader_name = op_desc.Output("Out")[0];
    framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name);
    out_reader->SetType(framework::proto::VarType::READER);
    out_reader->SetDataTypes(in_reader->GetDataTypes());
  }
};

template <typename T>
class CreateRandomDataGeneratorOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
    const auto& ranks = Attr<std::vector<int>>("ranks");
    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
                      int(shape_concat.size()),
                      "The accumulate of all ranks should be equal to the "
                      "shape concat's length.");
    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
    out->Reset(new framework::RandomDataGenerator<T>(shapes, Attr<float>("min"),
                                                     Attr<float>("max")));
  }
};

class CreateRandomDataGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(op_proto, op_checker) {
    AddOutput("Out", "(ReaderHolder) The created random reader.");
    AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
    AddAttr<std::vector<int>>(
        "ranks",
        "The ranks of each data."
        "e.g."
        "shape_concat = [2,3,4,5,6]"
        "ranks = [3,2]"
        "It means the reader will generate two data each time,"
        "whose shapes are [2,3,4] and [5,6] respectively.");
    AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
    AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
    AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
    AddComment(R"DOC(
      CreateRandomDataGenerator Operator

      This Op creates a random reader.
      The reader generates random data instead of really reading from files.
      Generated data follow an uniform distribution between 'min' and 'max'.
    )DOC");
  }
};

class CreateShuffleReaderOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                        ->Get<framework::ReaderHolder>();
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
    out->Reset(new framework::ShuffleReader(underlying_reader.Get(),
                                            Attr<int>("buffer_size")));
  }
};

class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(op_proto, op_checker) {
    AddInput("UnderlyingReader",
             "(ReaderHolder) The underlying reader for creating a shuffle reader.");
    AddOutput("Out", "(ReaderHolder) The created shuffle reader.");
    AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
    AddComment(R"DOC(
      CreateShuffleReader Operator

      A shuffle reader takes another reader as its 'underlying reader'
      and yields the underlying reader's outputs in a shuffled order.
    )DOC");
  }
};

class CreateBatchReaderOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                        ->Get<framework::ReaderHolder>();
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
    out->Reset(new framework::BatchReader(underlying_reader.Get(),
                                          Attr<int>("batch_size")));
  }
};

class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(op_proto, op_checker) {
    AddInput("UnderlyingReader",
             "(ReaderHolder) The underlying reader for creating a batch reader.");
    AddOutput("Out", "(ReaderHolder) The created batch reader.");
    AddAttr<int>("batch_size",
                 "How many instances the batch reader yields each time.")
        .GreaterThan(0);
    AddComment(R"DOC(
      CreateBatchReader Operator

      A batch reader takes another reader as its 'underlying reader',
      gathers the underlying reader's outputs and then yields them in batches.
    )DOC");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(create_random_data_generator,
                  ops::CreateRandomDataGeneratorOp<float>,
                  ops::CreateFileReaderInferShape,
                  ops::CreateRandomDataGeneratorOpMaker,
                  paddle::framework::EmptyGradOpMaker,
                  ops::CreateFileReaderInferVarType);
REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp,
                  ops::CreateDecoratedReaderInferShape,
                  ops::CreateShuffleReaderOpMaker,
                  paddle::framework::EmptyGradOpMaker,
                  ops::CreateDecoratedReaderInferVarType);
REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp,
                  ops::CreateDecoratedReaderInferShape,
                  ops::CreateBatchReaderOpMaker,
                  paddle::framework::EmptyGradOpMaker,
                  ops::CreateDecoratedReaderInferVarType);
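RestoreShapes (kept in the relocated reader code as well) splits the flattened 'shape_concat' attribute back into per-tensor shapes using 'ranks'; e.g. shape_concat = [2,3,4,5,6] with ranks = [3,2] yields [2,3,4] and [5,6], exactly the example given in the attribute comment. A standalone sketch of the same slicing logic, with std::vector<std::vector<int>> standing in for the DDim vector:

    #include <iostream>
    #include <vector>

    // Mirrors the slicing done by RestoreShapes: walk shape_concat and cut off
    // 'rank' entries per output shape.
    std::vector<std::vector<int>> RestoreShapes(const std::vector<int>& shape_concat,
                                                const std::vector<int>& ranks) {
      std::vector<std::vector<int>> res;
      int offset = 0;
      for (int len : ranks) {
        res.emplace_back(shape_concat.begin() + offset,
                         shape_concat.begin() + offset + len);
        offset += len;
      }
      return res;
    }

    int main() {
      auto shapes = RestoreShapes({2, 3, 4, 5, 6}, {3, 2});
      for (const auto& s : shapes) {
        for (int d : s) std::cout << d << ' ';
        std::cout << '\n';  // prints "2 3 4" then "5 6"
      }
    }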
paddle/fluid/operators/detail/CMakeLists.txt
-grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
+if(WITH_DISTRIBUTE)
+  grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
+endif()
paddle/fluid/operators/elementwise_mul_op.h
...
@@ -40,80 +40,14 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
 };

 template <typename T>
-struct ElementwiseMulGradFunctor {
-  template <typename Device, typename X, typename Y, typename Z, typename dX,
-            typename dY, typename dZ>
-  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
-    auto x_e = framework::EigenVector<T>::Flatten(*x);
-    auto y_e = framework::EigenVector<T>::Flatten(*y);
-    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
-
-    if (dx) {
-      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-      dx_e.device(d) = dz_e * y_e;
-    }
-
-    if (dy) {
-      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-      dy_e.device(d) = x_e * dz_e;
-    }
-  }
+struct IdentityGrad_DX {
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; }
 };

 template <typename T>
-struct ElementwiseMulBroadCastGradFunctor {
-  template <typename Device, typename X, typename Y, typename Z, typename dX,
-            typename dY, typename dZ, typename Pre, typename N>
-  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
-    auto x_e = framework::EigenVector<T>::Flatten(*x);
-    auto y_e = framework::EigenVector<T>::Flatten(*y);
-    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
-
-    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
-                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
-                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-
-    if (dx) {
-      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-      dx_e.device(d) = dz_e * y_e_bcast;
-    }
-
-    if (dy) {
-      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-      dy_e.device(d) = (x_e * dz_e)
-                           .reshape(Eigen::DSizes<int, 2>(pre, n))
-                           .sum(Eigen::array<int, 1>{{0}});
-    }
-  }
-};
-
-template <typename T>
-struct ElementwiseMulBroadCast2GradFunctor {
-  template <typename Device, typename X, typename Y, typename Z, typename dX,
-            typename dY, typename dZ, typename Pre, typename N, typename Post>
-  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
-                  Post post) {
-    auto x_e = framework::EigenVector<T>::Flatten(*x);
-    auto y_e = framework::EigenVector<T>::Flatten(*y);
-    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
-
-    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
-                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
-                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
-    if (dx) {
-      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
-      dx_e.device(d) = dz_e * y_e_bcast;
-    }
-
-    if (dy) {
-      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
-      dy_e.device(d) = (x_e * dz_e)
-                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
-                           .sum(Eigen::array<int, 2>{{0, 2}});
-    }
-  }
+struct IdentityGrad_DY {
+  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; }
 };

 template <typename DeviceContext, typename T>
 class ElementwiseMulGradKernel : public framework::OpKernel<T> {
  public:
...
@@ -127,12 +61,11 @@ class ElementwiseMulGradKernel : public framework::OpKernel<T> {
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     int axis = ctx.Attr<int>("axis");
-    ElementwiseGradCompute<DeviceContext, T, ElementwiseMulGradFunctor<T>,
-                           ElementwiseMulBroadCastGradFunctor<T>,
-                           ElementwiseMulBroadCast2GradFunctor<T>>(
-        ctx, x, y, out, dout, axis, dx, dy);
+    ElemwiseGradCompute<DeviceContext, T, IdentityGrad_DX<T>,
+                        IdentityGrad_DY<T>>(ctx, *x, *y, *out, *dout, axis, dx,
+                                            dy, IdentityGrad_DX<T>(),
+                                            IdentityGrad_DY<T>());
   }
 };

 }  // namespace operators
 }  // namespace paddle
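The refactor replaces the three Eigen-based ElementwiseMul*GradFunctor variants with two tiny per-element functors, IdentityGrad_DX and IdentityGrad_DY, which ElemwiseGradCompute applies pointwise: dx = dout * y and dy = dout * x. A minimal standalone sketch of that pointwise rule (the helper ElementwiseMulGrad is hypothetical, not Paddle's ElemwiseGradCompute, and broadcasting is ignored):

    #include <cassert>
    #include <vector>

    // dx = dout * y, dy = dout * x, applied element by element.
    template <typename T>
    void ElementwiseMulGrad(const std::vector<T>& x, const std::vector<T>& y,
                            const std::vector<T>& dout,
                            std::vector<T>* dx, std::vector<T>* dy) {
      dx->resize(x.size());
      dy->resize(x.size());
      for (size_t i = 0; i < x.size(); ++i) {
        (*dx)[i] = dout[i] * y[i];
        (*dy)[i] = dout[i] * x[i];
      }
    }

    int main() {
      std::vector<float> x{1, 2, 3}, y{4, 5, 6}, dout{1, 1, 1}, dx, dy;
      ElementwiseMulGrad(x, y, dout, &dx, &dy);
      assert(dx[1] == 5 && dy[2] == 3);
    }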
paddle/fluid/operators/elementwise_op_function.h
...
@@ -301,7 +301,7 @@ struct ElemwiseGradNoBroadcast {
       dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]);
     }
     if (dy_ != nullptr) {
-      dy_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]);
+      dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]);
     }
   }
...
paddle/fluid/operators/math/CMakeLists.txt
...
@@ -20,6 +20,7 @@ if(WITH_GPU)
     nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
     nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
     nv_library(cos_sim_functor SRCS cos_sim_functor.cc cos_sim_functor.cu DEPS device_context)
+    nv_library(concat_functor SRCS concat.cc concat.cu DEPS device_context tensor)
 else()
     cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
     cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
...
@@ -37,6 +38,7 @@ else()
     cc_library(unpooling SRCS unpooling.cc DEPS device_context)
     cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
     cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context)
+    cc_library(concat_functor SRCS concat.cc DEPS device_context tensor)
 endif()
 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
...
@@ -44,3 +46,4 @@ cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selec
 cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
 cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor)
 cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
+cc_test(concat_test SRCS concat_test.cc DEPS concat_functor tensor)
paddle/fluid/operators/math/concat.cc
0 → 100644
/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/concat.h"
namespace paddle {
namespace operators {
namespace math {

/*
 * All tensors' dimension should be the same and the values of
 * each dimension are the same, except the axis dimension.
 */
template <typename T>
class ConcatFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const std::vector<framework::Tensor>& input, const int axis,
                  framework::Tensor* output) {
    // TODO(zcd): Add input data validity checking
    int num = input.size();
    int rows = 1;
    auto dim_0 = input[0].dims();
    for (int i = 0; i < axis; ++i) {
      rows *= dim_0[i];
    }
    int out_rows = rows, out_cols = 0;

    std::vector<int64_t> input_cols(input.size());
    for (int i = 0; i < num; ++i) {
      int t_cols = input[i].numel() / rows;
      out_cols += t_cols;
      input_cols[i] = t_cols;
    }
    auto& cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());

    // computation
    for (int k = 0; k < out_rows; ++k) {
      T* dst_ptr = output->data<T>() + k * out_cols;
      int col_idx = 0;
      for (int j = 0; j < num; ++j) {
        int col_len = input_cols[j];
        const T* src_prt = input[j].data<T>() + k * col_len;
        memory::Copy(cpu_place, dst_ptr + col_idx, cpu_place, src_prt,
                     sizeof(T) * col_len);
        col_idx += col_len;
      }
    }
  }
};

/*
 * All tensors' dimension should be the same and the values of
 * each dimension are the same, except the axis dimension.
 */
template <typename T>
class ConcatGradFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& input, const int axis,
                  std::vector<framework::Tensor>& outputs) {
    // TODO(zcd): Add input data validity checking
    int num = outputs.size();
    int input_rows = 1;
    auto dim_0 = outputs[0].dims();
    for (int i = 0; i < axis; ++i) {
      input_rows *= dim_0[i];
    }
    int input_cols = 0;

    std::vector<int64_t> output_cols(outputs.size());
    for (int i = 0; i < num; ++i) {
      int t_cols = outputs[i].numel() / input_rows;
      input_cols += t_cols;
      output_cols[i] = t_cols;
    }
    auto& cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());

    // computation
    for (int k = 0; k < input_rows; ++k) {
      const T* src_ptr = input.data<T>() + k * input_cols;
      int col_idx = 0;
      for (int j = 0; j < num; ++j) {
        int col_len = output_cols[j];
        T* dst_ptr = outputs[j].data<T>() + k * col_len;
        memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
                     sizeof(T) * col_len);
        col_idx += col_len;
      }
    }
  }
};

template class ConcatFunctor<platform::CPUDeviceContext, int>;
template class ConcatFunctor<platform::CPUDeviceContext, int64_t>;
template class ConcatFunctor<platform::CPUDeviceContext, float>;
template class ConcatFunctor<platform::CPUDeviceContext, double>;

template class ConcatGradFunctor<platform::CPUDeviceContext, int>;
template class ConcatGradFunctor<platform::CPUDeviceContext, int64_t>;
template class ConcatGradFunctor<platform::CPUDeviceContext, float>;
template class ConcatGradFunctor<platform::CPUDeviceContext, double>;

}  // namespace math
}  // namespace operators
}  // namespace paddle
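The CPU ConcatFunctor above flattens every input into a 2-D view: rows = product of the dimensions before 'axis', cols = numel / rows, and then copies each input's row segment side by side into the output row. A standalone sketch of the same row/column bookkeeping over plain std::vector data (illustrative only; the real functor uses memory::Copy on Tensor buffers and handles any number of inputs):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // Concatenate two row-major "matrices" with the same number of rows along
    // the column axis, the way ConcatFunctor lays out its copies.
    template <typename T>
    std::vector<T> ConcatCols(const std::vector<T>& a, const std::vector<T>& b,
                              int rows) {
      const int a_cols = static_cast<int>(a.size()) / rows;
      const int b_cols = static_cast<int>(b.size()) / rows;
      std::vector<T> out(rows * (a_cols + b_cols));
      for (int k = 0; k < rows; ++k) {
        std::copy(a.begin() + k * a_cols, a.begin() + (k + 1) * a_cols,
                  out.begin() + k * (a_cols + b_cols));
        std::copy(b.begin() + k * b_cols, b.begin() + (k + 1) * b_cols,
                  out.begin() + k * (a_cols + b_cols) + a_cols);
      }
      return out;
    }

    int main() {
      // a is 2x2, b is 2x1; the result is 2x3: [1 2 5; 3 4 6]
      auto out = ConcatCols<int>({1, 2, 3, 4}, {5, 6}, 2);
      assert(out[2] == 5 && out[5] == 6);
    }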
paddle/fluid/operators/math/concat.cu
0 → 100644
/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/platform/cuda_helper.h"
namespace paddle {
namespace operators {
namespace math {

template <typename T>
__device__ T upper_bound(const T* first, T count, T val) {
  const T* orig = first;
  const T* it = nullptr;
  T step = 0;
  while (count > 0) {
    it = first;
    step = count / 2;
    it += step;
    if (!(val < *it)) {
      first = ++it;
      count -= step + 1;
    } else {
      count = step;
    }
  }
  return first - orig;
}

template <typename T>
__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
                             const int output_rows, const int output_cols,
                             T* output) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
  int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1;

  int curr_offset = input_cols[segment];
  int curr_segment = segment;
  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
    T curr_col_offset;
    while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) {
      curr_offset = curr_col_offset;
      ++curr_segment;
    }

    int local_col = tid_x - curr_offset;
    int segment_width = curr_col_offset - curr_offset;
    T* input_ptr = inputs[curr_segment];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
      output[tid_y * output_cols + tid_x] =
          input_ptr[tid_y * segment_width + local_col];
  }
}

template <typename T>
__global__ void KernelConcat(T** inputs, const int input_col,
                             const int output_rows, const int output_cols,
                             T* output) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
  double inv_input_col = 1.0 / input_col;
  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
    int split = tid_x * inv_input_col;
    int in_offset = tid_x - split * input_col;
    T* input_ptr = inputs[split];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) {
      output[tid_y * output_cols + tid_x] =
          input_ptr[tid_y * input_col + in_offset];
    }
  }
}

template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row,
                                 const int input_col, const int* output_cols,
                                 int col_size, T** outputs) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
  int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1;
  int curr_offset = output_cols[segment];
  int curr_segment = segment;
  for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
    T curr_col_offset;
    while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) {
      curr_offset = curr_col_offset;
      ++curr_segment;
    }

    int local_col = tid_x - curr_offset;
    int segment_width = curr_col_offset - curr_offset;
    T* output_ptr = outputs[curr_segment];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
      output_ptr[tid_y * segment_width + local_col] =
          input[tid_y * input_col + tid_x];
  }
}

template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row,
                                 const int input_col, const int output_cols,
                                 T** outputs) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
  double inv_input_col = 1.0 / input_col;
  for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
    int split = tid_x * inv_input_col;
    int in_offset = tid_x - split * input_col;
    T* output_ptr = outputs[split];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
      output_ptr[tid_y * output_cols + in_offset] =
          input[tid_y * input_col + tid_x];
  }
}

/*
 * All tensors' dimension should be the same and the values of
 * each dimension are the same, except the axis dimension.
 */
template <typename T>
class ConcatFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const std::vector<framework::Tensor>& input, const int axis,
                  framework::Tensor* output) {
    // TODO(zcd): Add input data validity checking
    int num = input.size();
    int rows = 1;
    auto dim_0 = input[0].dims();
    for (int i = 0; i < axis; ++i) {
      rows *= dim_0[i];
    }
    int cols = input[0].numel() / rows;
    int out_rows = rows, out_cols = 0;

    framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2);
    framework::Vector<int> inputs_cols(num + 1);
    inputs_cols[0] = 0;
    T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());

    bool sameShape = true;
    for (int i = 0; i < num; ++i) {
      int t_cols = input[i].numel() / rows;
      if (sameShape) {
        if (t_cols != cols) sameShape = false;
      }
      out_cols += t_cols;
      inputs_cols[i + 1] = out_cols;
      inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
    }

    T** ins_gpu =
        reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
    const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());

    // computation
    // set the thread block and grid according to CurrentDeviceId
    const int kThreadsPerBlock = 1024;
    int block_cols = kThreadsPerBlock;
    if (out_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
      block_cols = ((out_cols + 31) >> 5) << 5;
    }
    int block_rows = kThreadsPerBlock / block_cols;
    dim3 block_size = dim3(block_cols, block_rows, 1);

    int max_threads = context.GetMaxPhysicalThreadCount();
    int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);

    int grid_cols =
        std::min((out_cols + block_cols - 1) / block_cols, max_blocks);
    int grid_rows =
        std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1));
    dim3 grid_size = dim3(grid_cols, grid_rows, 1);

    if (sameShape) {
      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
          ins_gpu, cols, out_rows, out_cols, output->data<T>());
    } else {
      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
          ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows,
          out_cols, output->data<T>());
    }
  }
};

/*
 * All tensors' dimension should be the same and the values of
 * each dimension are the same, except the axis dimension.
 */
template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input, const int axis,
                  std::vector<framework::Tensor>& outputs) {
    // TODO(zcd): Add input data validity checking
    int num = outputs.size();
    int input_row = 1;
    auto dim_0 = outputs[0].dims();
    for (int i = 0; i < axis; ++i) {
      input_row *= dim_0[i];
    }

    int output_col_0 = outputs[0].numel() / input_row;
    int input_col = 0;
    bool sameShape = true;

    framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2);
    framework::Vector<int> outputs_cols(num + 1);
    outputs_cols[0] = 0;
    T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());

    for (int i = 0; i < num; ++i) {
      int t_col = outputs[i].numel() / input_row;
      if (sameShape) {
        if (t_col != output_col_0) sameShape = false;
      }
      input_col += t_col;
      outputs_cols[i + 1] = input_col;
      outputs_ptr[i] = outputs[i].data<T>();
    }

    T** outs_gpu =
        reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
    const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());

    // computation
    const int kThreadsPerBlock = 1024;
    int block_cols = kThreadsPerBlock;
    if (input_col < kThreadsPerBlock) {  // block_cols is aligned by 32.
      block_cols = ((input_col + 31) >> 5) << 5;
    }
    int block_rows = kThreadsPerBlock / block_cols;
    dim3 block_size = dim3(block_cols, block_rows, 1);

    int max_threads = context.GetMaxPhysicalThreadCount();
    int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);

    int grid_cols =
        std::min((input_col + block_cols - 1) / block_cols, max_blocks);
    int grid_rows =
        std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1));
    dim3 grid_size = dim3(grid_cols, grid_rows, 1);

    if (sameShape) {
      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
          input.data<T>(), input_row, input_col, output_col_0, outs_gpu);
    } else {
      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
          input.data<T>(), input_row, input_col, outs_col_gpu,
          static_cast<int>(outputs_cols.size()), outs_gpu);
    }
  }
};

template class ConcatFunctor<platform::CUDADeviceContext, int>;
template class ConcatFunctor<platform::CUDADeviceContext, int64_t>;
template class ConcatFunctor<platform::CUDADeviceContext, float>;
template class ConcatFunctor<platform::CUDADeviceContext, double>;

template class ConcatGradFunctor<platform::CUDADeviceContext, int>;
template class ConcatGradFunctor<platform::CUDADeviceContext, int64_t>;
template class ConcatGradFunctor<platform::CUDADeviceContext, float>;
template class ConcatGradFunctor<platform::CUDADeviceContext, double>;

}  // namespace math
}  // namespace operators
}  // namespace paddle
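When the inputs have different widths, the CUDA kernels locate the segment an output column belongs to with a hand-rolled upper_bound over the running column offsets (inputs_cols / outputs_cols hold {0, c1, c1+c2, ...}). The same lookup on the host, sketched with std::upper_bound (SegmentOf is a hypothetical helper, shown only to illustrate the indexing):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // input_cols holds running offsets, e.g. {0, 4, 9} for two inputs of width 4 and 5.
    // For an output column index, return the index of the input it falls into.
    int SegmentOf(const std::vector<int>& input_cols, int col) {
      auto it = std::upper_bound(input_cols.begin(), input_cols.end(), col);
      return static_cast<int>(it - input_cols.begin()) - 1;
    }

    int main() {
      std::vector<int> offsets{0, 4, 9};
      assert(SegmentOf(offsets, 3) == 0);  // column 3 comes from input 0
      assert(SegmentOf(offsets, 4) == 1);  // column 4 starts input 1
    }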
paddle/fluid/operators/math/concat.h
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace math {

/*
 * \brief Concatenate the input tensors along the dimension axis.
 *  TODO(zcd): maybe it needs to be more detailed.
 *  Examples:
 *     Input[0] = [[1,2],[3,4]]
 *     Input[1] = [[5,6]]
 *     axis = 0
 *
 *     Output = [[1,2],
 *               [3,4],
 *               [5,6]]
 */
template <typename DeviceContext, typename T>
class ConcatFunctor {
 public:
  void operator()(const DeviceContext& context,
                  const std::vector<framework::Tensor>& input, const int axis,
                  framework::Tensor* output);
};

/*
 * \brief Split the input tensors along the dimension axis into outputs.
 *  TODO(zcd): maybe it needs to be more detailed.
 *  Examples:
 *     Input = [[1,2],
 *              [3,4],
 *              [5,6]]
 *     axis = 0
 *
 *     Output[0] = [[1,2],[3,4]]
 *     Output[1] = [[5,6]]
 */
template <typename DeviceContext, typename T>
class ConcatGradFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const int axis, std::vector<framework::Tensor>& outputs);
};

}  // namespace math
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/math/concat_test.cc
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/concat.h"
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
using namespace paddle::framework;
using namespace paddle::platform;

template <typename DeviceContext, typename Place>
void testConcat() {
  Tensor input_a_cpu;
  Tensor input_b_cpu;
  Tensor out_cpu;
  Tensor input_a;
  Tensor input_b;
  Tensor out;

  DeviceContext* context = new DeviceContext(Place());
  // DeviceContext context(Place());

  /**
   * cast1:
   *    inputs:
   *        t_a.shape: [2, 3, 4]
   *        t_b.shape: [3, 3, 4]
   *    output:
   *        out.shape: [5, 3, 4]
   */
  auto dim_a = make_ddim({2, 3, 4});
  auto dim_b = make_ddim({3, 3, 4});
  auto dim_out = make_ddim({5, 3, 4});

  input_a.mutable_data<int>(dim_a, Place());
  input_b.mutable_data<int>(dim_b, Place());
  out.mutable_data<int>(dim_out, Place());

  if (paddle::platform::is_gpu_place(Place())) {
    input_a_cpu.mutable_data<int>(dim_a, CPUPlace());
    input_b_cpu.mutable_data<int>(dim_b, CPUPlace());
    out_cpu.mutable_data<int>(dim_out, CPUPlace());
  }

  int* a_ptr;
  int* b_ptr;
  if (paddle::platform::is_gpu_place(Place())) {
    a_ptr = input_a_cpu.data<int>();
    b_ptr = input_b_cpu.data<int>();
  } else {
    a_ptr = input_a.data<int>();
    b_ptr = input_b.data<int>();
  }

  for (int i = 0; i < 2 * 3 * 4; ++i) a_ptr[i] = i;
  for (int i = 0; i < 3 * 3 * 4; ++i) b_ptr[i] = i;

  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(input_a_cpu, Place(), *context, &input_a);
    TensorCopy(input_b_cpu, Place(), *context, &input_b);
  }

  std::vector<Tensor> input;
  input.push_back(input_a);
  input.push_back(input_b);

  paddle::operators::math::ConcatFunctor<DeviceContext, int> concat_functor;
  concat_functor(*context, input, 0, &out);

  // check the dim of input_a, input_b
  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);

  int* out_ptr;
  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(out, CPUPlace(), *context, &out_cpu);
    out_ptr = out_cpu.data<int>();
  } else {
    out_ptr = out.data<int>();
  }

  int cols = 2 * 3 * 4;
  int idx_a = 0, idx_b = 0;
  for (int j = 0; j < 5 * 3 * 4; ++j) {
    if (j >= cols) {
      PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b]);
      ++idx_b;
    } else {
      PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a]);
      ++idx_a;
    }
  }
  //
  /**
   * cast2:
   *    inputs:
   *        t_a.shape: [2, 3, 4]
   *        t_b.shape: [2, 4, 4]
   *    output:
   *        out.shape: [2, 7, 4]
   */
  dim_a = make_ddim({2, 3, 4});
  dim_b = make_ddim({2, 4, 4});
  dim_out = make_ddim({2, 7, 4});

  input_a.Resize(dim_a);
  input_b.Resize(dim_b);
  out.Resize(dim_out);
  if (paddle::platform::is_gpu_place(Place())) {
    input_a_cpu.Resize(dim_a);
    input_b_cpu.Resize(dim_b);
    out_cpu.Resize(dim_out);
  }

  if (paddle::platform::is_gpu_place(Place())) {
    a_ptr = input_a_cpu.data<int>();
    b_ptr = input_b_cpu.data<int>();
  } else {
    a_ptr = input_a.data<int>();
    b_ptr = input_b.data<int>();
  }

  for (int i = 0; i < 2 * 3 * 4; ++i) a_ptr[i] = i;
  for (int i = 0; i < 2 * 4 * 4; ++i) b_ptr[i] = i;

  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(input_a_cpu, Place(), *context, &input_a);
    TensorCopy(input_b_cpu, Place(), *context, &input_b);
  }

  input.clear();
  input.push_back(input_a);
  input.push_back(input_b);

  concat_functor(*context, input, 1, &out);

  // check the dim of input_a, input_b
  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);

  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(out, CPUPlace(), *context, &out_cpu);
    out_ptr = out_cpu.data<int>();
  } else {
    out_ptr = out.data<int>();
  }

  cols = 3 * 4;
  idx_a = 0, idx_b = 0;
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 28; ++j) {
      if (j >= cols) {
        PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], b_ptr[idx_b]);
        ++idx_b;
      } else {
        PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], a_ptr[idx_a]);
        ++idx_a;
      }
    }
  }

  /**
   * cast3:
   *    inputs:
   *        t_a.shape: [2, 3, 5]
   *        t_b.shape: [2, 3, 4]
   *    output:
   *        out.shape: [2, 3, 9]
   */
  dim_a = make_ddim({2, 3, 4});
  dim_b = make_ddim({2, 3, 5});
  dim_out = make_ddim({2, 3, 9});

  input_a.Resize(dim_a);
  input_b.Resize(dim_b);
  out.Resize(dim_out);
  if (paddle::platform::is_gpu_place(Place())) {
    input_a_cpu.Resize(dim_a);
    input_b_cpu.Resize(dim_b);
    out_cpu.Resize(dim_out);
  }

  if (paddle::platform::is_gpu_place(Place())) {
    a_ptr = input_a_cpu.data<int>();
    b_ptr = input_b_cpu.data<int>();
  } else {
    a_ptr = input_a.data<int>();
    b_ptr = input_b.data<int>();
  }

  for (int i = 0; i < 2 * 3 * 4; ++i) a_ptr[i] = i;
  for (int i = 0; i < 2 * 3 * 5; ++i) b_ptr[i] = i;

  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(input_a_cpu, Place(), *context, &input_a);
    TensorCopy(input_b_cpu, Place(), *context, &input_b);
  }

  input.clear();
  input.push_back(input_a);
  input.push_back(input_b);

  concat_functor(*context, input, 2, &out);

  // check the dim of input_a, input_b
  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);

  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(out, CPUPlace(), *context, &out_cpu);
    out_ptr = out_cpu.data<int>();
  } else {
    out_ptr = out.data<int>();
  }

  // check the data
  cols = 4;
  idx_a = 0, idx_b = 0;
  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 9; ++j) {
      if (j >= cols) {
        PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], b_ptr[idx_b]);
        ++idx_b;
      } else {
        PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], a_ptr[idx_a]);
        ++idx_a;
      }
    }
  }

  /**
   * cast4:
   *    inputs:
   *        axis = 1
   *        t_a.shape: [2, 3, 4]
   *        t_b.shape: [2, 3, 4]
   *    output:
   *        out.shape: [2, 6, 4]
   */
  dim_a = make_ddim({2, 3, 4});
  dim_b = make_ddim({2, 3, 4});
  dim_out = make_ddim({2, 6, 4});

  input_a.Resize(dim_a);
  input_b.Resize(dim_b);
  out.Resize(dim_out);
  if (paddle::platform::is_gpu_place(Place())) {
    input_a_cpu.Resize(dim_a);
    input_b_cpu.Resize(dim_b);
    out_cpu.Resize(dim_out);
  }

  if (paddle::platform::is_gpu_place(Place())) {
    a_ptr = input_a_cpu.data<int>();
    b_ptr = input_b_cpu.data<int>();
  } else {
    a_ptr = input_a.data<int>();
    b_ptr = input_b.data<int>();
  }

  for (int i = 0; i < 2 * 3 * 4; ++i) a_ptr[i] = i;
  for (int i = 0; i < 2 * 3 * 4; ++i) b_ptr[i] = i;

  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(input_a_cpu, Place(), *context, &input_a);
    TensorCopy(input_b_cpu, Place(), *context, &input_b);
  }

  input.clear();
  input.push_back(input_a);
  input.push_back(input_b);

  concat_functor(*context, input, 1, &out);

  // check the dim of input_a, input_b
  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);

  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(out, CPUPlace(), *context, &out_cpu);
    out_ptr = out_cpu.data<int>();
  } else {
    out_ptr = out.data<int>();
  }

  // check the data
  cols = 12;
  idx_a = 0, idx_b = 0;
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 24; ++j) {
      if (j >= cols) {
        PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], b_ptr[idx_b]);
        ++idx_b;
      } else {
        PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], a_ptr[idx_a]);
        ++idx_a;
      }
    }
  }
}

TEST(math, concat) {
  testConcat<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#ifdef PADDLE_WITH_CUDA
  testConcat<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>();
#endif
}
paddle/fluid/operators/math/math_function.cc
...
@@ -246,6 +246,8 @@ template struct SetConstant<platform::CPUDeviceContext, int64_t>;
 template struct SetConstant<platform::CPUDeviceContext, bool>;

 #define DEFINE_CPU_TRANS(RANK)                                             \
+  template struct Transpose<platform::CPUDeviceContext, platform::float16, \
+                            RANK>;                                         \
   template struct Transpose<platform::CPUDeviceContext, float, RANK>;     \
   template struct Transpose<platform::CPUDeviceContext, double, RANK>;    \
   template struct Transpose<platform::CPUDeviceContext, int, RANK>;       \
...
paddle/fluid/operators/reader/CMakeLists.txt
0 → 100644
cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader)
op_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc DEPS reader_op_registry)
op_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc DEPS reader_op_registry)
op_library(create_batch_reader_op SRCS create_batch_reader_op.cc DEPS reader_op_registry)
set(READER_LIBRARY create_random_data_generator_op create_shuffle_reader_op create_batch_reader_op PARENT_SCOPE)
paddle/fluid/operators/reader/create_batch_reader_op.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {

class BatchReader : public framework::DecoratedReader {
 public:
  BatchReader(ReaderBase* reader, int batch_size)
      : DecoratedReader(reader), batch_size_(batch_size) {
    buffer_.reserve(batch_size_);
  }

  void ReadNext(std::vector<framework::LoDTensor>* out) override;

 private:
  int batch_size_;
  std::vector<std::vector<framework::LoDTensor>> buffer_;
};

class CreateBatchReaderOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                        ->Get<framework::ReaderHolder>();
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
    out->Reset(
        new BatchReader(underlying_reader.Get(), Attr<int>("batch_size")));
  }
};

class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase {
 public:
  CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
      : DecoratedReaderMakerBase(op_proto, op_checker) {
    AddAttr<int>("batch_size",
                 "How many instances the batch reader yields each time.")
        .GreaterThan(0);
    AddComment(R"DOC(
      CreateBatchReader Operator

      A batch reader takes another reader as its 'underlying reader',
      gathers the underlying reader's outputs and then yields them in batches.
    )DOC");
  }
};

void BatchReader::ReadNext(std::vector<framework::LoDTensor>* out) {
  buffer_.clear();
  buffer_.reserve(batch_size_);
  for (int i = 0; i < batch_size_; ++i) {
    if (reader_->HasNext()) {
      buffer_.push_back(std::vector<framework::LoDTensor>());
      reader_->ReadNext(&buffer_.back());
    } else {
      break;
    }
  }
  // Concat instances
  out->clear();
  if (buffer_.empty()) {
    // if buffer_ is empty, the 'out' will return as an empty vector.
    return;
  }
  int out_num = buffer_[0].size();
  out->reserve(out_num);
  for (int j = 0; j < out_num; ++j) {
    // Merge shape and check date type
    std::type_index batch_type = buffer_[0][j].type();
    framework::DDim batch_shape = buffer_[0][j].dims();
    for (size_t i = 1; i < buffer_.size(); ++i) {
      std::type_index ins_type = buffer_[i][j].type();
      framework::DDim ins_shape = buffer_[i][j].dims();
      PADDLE_ENFORCE_EQ(batch_type, ins_type);
      PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
                        slice_ddim(ins_shape, 1, ins_shape.size()));
      PADDLE_ENFORCE_GT(ins_shape[0], 0);
      batch_shape[0] += ins_shape[0];
    }

    framework::LoDTensor out_tensor;
    out_tensor.Resize(batch_shape);
    out_tensor.mutable_data(platform::CPUPlace(), batch_type);
    int64_t dst_offset = 0;

    // Merge lod and data
    framework::LoD batch_lod;
    for (size_t i = 0; i < buffer_.size(); ++i) {
      framework::DDim ins_shape = buffer_[i][j].dims();
      framework::LoD ins_lod = buffer_[i][j].lod();
      if (i == 0) {
        batch_lod = ins_lod;
      } else {
        PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
        for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
          auto& lod_level = batch_lod[level_idx];
          for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
            lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
          }
        }
      }
      auto dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
      TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
      dst_offset += ins_shape[0];
    }
    out_tensor.set_lod(batch_lod);
    out->push_back(out_tensor);
  }
}

}  // namespace reader
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators::reader;
REGISTER_DECORATED_READER_OPERATOR(create_batch_reader,
                                   ops::CreateBatchReaderOp,
                                   ops::CreateBatchReaderOpMaker);
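BatchReader::ReadNext stacks up to batch_size_ instances along dimension 0 and merges their LoD by shifting each instance's offsets by the running total of what has already been appended. A standalone sketch of that intended offset merging for one LoD level follows; AppendLoDLevel is a hypothetical helper, and it captures the shift once before appending (the usual way to merge LoD offsets), which is what the per-level loop above is meant to achieve.

    #include <cassert>
    #include <vector>

    // Append one instance's LoD level (e.g. {0, 2, 5}) onto a batch-level LoD,
    // shifting the new offsets by the total length accumulated so far.
    void AppendLoDLevel(std::vector<size_t>* batch, const std::vector<size_t>& ins) {
      if (batch->empty()) {
        *batch = ins;
        return;
      }
      const size_t shift = batch->back();
      for (size_t k = 1; k < ins.size(); ++k) {
        batch->push_back(ins[k] + shift);
      }
    }

    int main() {
      std::vector<size_t> batch;
      AppendLoDLevel(&batch, {0, 2, 5});
      AppendLoDLevel(&batch, {0, 3});
      assert(batch.back() == 8);  // batch is now {0, 2, 5, 8}
    }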
paddle/fluid/operators/reader/create_random_data_generator_op.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {

template <typename T>
class RandomDataGenerator : public framework::FileReader {
 public:
  RandomDataGenerator(const std::vector<framework::DDim>& shapes, float min,
                      float max)
      : FileReader(shapes), min_(min), max_(max) {
    PADDLE_ENFORCE_LE(
        min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
    unsigned int seed = std::random_device()();
    engine_.seed(seed);
    dist_ = std::uniform_real_distribution<float>(min_, max_);
  }

  void ReadNext(std::vector<framework::LoDTensor>* out) override {
    out->clear();
    out->reserve(shapes_.size());
    for (const framework::DDim& shape : shapes_) {
      PADDLE_ENFORCE_GE(
          shape.size(), 2,
          "The rank of reader's output data should be 2 at least.(Now it's %d)",
          shape.size());
      framework::LoDTensor out_tensor;
      out_tensor.Resize(shape);
      T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
      int64_t numel = framework::product(shape);
      for (int64_t i = 0; i < numel; ++i) {
        data[i] = dist_(engine_);
      }
      out->push_back(out_tensor);
    }
  }

  bool HasNext() const override { return true; }

  void ReInit() override { return; }

 private:
  float min_;
  float max_;
  std::minstd_rand engine_;
  std::uniform_real_distribution<float> dist_;
};

template <typename T>
class CreateRandomDataGeneratorOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
    const auto& ranks = Attr<std::vector<int>>("ranks");
    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
                      int(shape_concat.size()),
                      "The accumulate of all ranks should be equal to the "
                      "shape concat's length.");
    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
    out->Reset(new RandomDataGenerator<T>(shapes, Attr<float>("min"),
                                          Attr<float>("max")));
  }
};

class CreateRandomDataGeneratorOpMaker : public FileReaderMakerBase {
 public:
  CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
      : FileReaderMakerBase(op_proto, op_checker) {
    AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
    AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
    AddComment(R"DOC(
      CreateRandomDataGenerator Operator

      This Op creates a random reader.
      The reader generates random data instead of really reading from files.
      Generated data follow an uniform distribution between 'min' and 'max'.
    )DOC");
  }
};

}  // namespace reader
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators::reader;
REGISTER_FILE_READER_OPERATOR(create_random_data_generator,
                              ops::CreateRandomDataGeneratorOp<float>,
                              ops::CreateRandomDataGeneratorOpMaker);
paddle/fluid/operators/reader/create_shuffle_reader_op.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {

class ShuffleReader : public framework::DecoratedReader {
 public:
  ShuffleReader(ReaderBase* reader, int buffer_size)
      : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
    buffer_.reserve(buffer_size);
  }

  void ReadNext(std::vector<framework::LoDTensor>* out) override;

 private:
  int buffer_size_;
  std::vector<std::vector<framework::LoDTensor>> buffer_;
  size_t iteration_pos_;
};

void ShuffleReader::ReadNext(std::vector<framework::LoDTensor>* out) {
  if (iteration_pos_ >= buffer_.size()) {
    // Reload buffer with new data
    buffer_.clear();
    buffer_.reserve(buffer_size_);
    for (int i = 0; i < buffer_size_; ++i) {
      if (reader_->HasNext()) {
        buffer_.push_back(std::vector<framework::LoDTensor>());
        reader_->ReadNext(&buffer_.back());
      } else {
        break;
      }
    }
    // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
    // optimize.
    std::random_shuffle(buffer_.begin(), buffer_.end());
    iteration_pos_ = 0;
  }
  out->clear();
  if (!buffer_.empty()) {
    std::swap(*out, buffer_[iteration_pos_++]);
  }
  // if buffer_ is empty, the 'out' will return as an empty vector.
}

class CreateShuffleReaderOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                        ->Get<framework::ReaderHolder>();
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
    out->Reset(
        new ShuffleReader(underlying_reader.Get(), Attr<int>("buffer_size")));
  }
};

class CreateShuffleReaderOpMaker : public DecoratedReaderMakerBase {
 public:
  CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
      : DecoratedReaderMakerBase(op_proto, op_checker) {
    AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
    AddComment(R"DOC(
      CreateShuffleReader Operator

      A shuffle reader takes another reader as its 'underlying reader'
      and yields the underlying reader's outputs in a shuffled order.
    )DOC");
  }
};

}  // namespace reader
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators::reader;
REGISTER_DECORATED_READER_OPERATOR(create_shuffle_reader,
                                   ops::CreateShuffleReaderOp,
                                   ops::CreateShuffleReaderOpMaker);
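The shuffle reader's strategy is simply: fill a buffer of up to buffer_size instances from the underlying reader, shuffle it, then serve items until the buffer is exhausted and refill. A minimal standalone sketch of that buffered-shuffle iteration over an in-memory dataset; BufferedShuffle is a hypothetical helper, and std::shuffle is used here since std::random_shuffle (used in the operator above) is deprecated in modern C++.

    #include <algorithm>
    #include <iostream>
    #include <random>
    #include <vector>

    // Serve the elements of 'data' in buffered-shuffle order with the given buffer size.
    template <typename T>
    std::vector<T> BufferedShuffle(const std::vector<T>& data, size_t buffer_size) {
      std::vector<T> out;
      std::mt19937 rng(std::random_device{}());
      for (size_t pos = 0; pos < data.size(); pos += buffer_size) {
        std::vector<T> buffer(data.begin() + pos,
                              data.begin() + std::min(pos + buffer_size, data.size()));
        std::shuffle(buffer.begin(), buffer.end(), rng);
        out.insert(out.end(), buffer.begin(), buffer.end());
      }
      return out;
    }

    int main() {
      for (int v : BufferedShuffle(std::vector<int>{1, 2, 3, 4, 5, 6}, 3)) {
        std::cout << v << ' ';  // e.g. "2 3 1 6 4 5"
      }
      std::cout << '\n';
    }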
paddle/fluid/operators/reader/reader_op_registry.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "reader_op_registry.h"

namespace paddle {
namespace operators {
namespace reader {

std::vector<framework::DDim> RestoreShapes(
    const std::vector<int>& shape_concat, const std::vector<int>& ranks) {
  std::vector<framework::DDim> res;
  int offset = 0;
  for (int len : ranks) {
    auto start_it = shape_concat.begin() + offset;
    auto end_it = start_it + len;
    res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
    offset += len;
  }
  return res;
}

FileReaderMakerBase::FileReaderMakerBase(
    framework::OpProtoAndCheckerMaker::OpProto* op_proto,
    framework::OpAttrChecker* op_checker)
    : OpProtoAndCheckerMaker(op_proto, op_checker) {
  AddOutput("Out", "(ReaderHolder) The created random reader.");
  AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
  AddAttr<std::vector<int>>(
      "ranks",
      "The ranks of each data."
      "e.g."
      "shape_concat = [2,3,4,5,6]"
      "ranks = [3,2]"
      "It means the reader will generate two data each time,"
      "whose shapes are [2,3,4] and [5,6] respectively.");
  AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
}

void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasOutput("Out"),
                 "The output file reader should not be null.");
  const auto shape_concat = ctx->Attrs().Get<std::vector<int>>("shape_concat");
  const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
  std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
  ctx->SetReaderDims("Out", shapes);

  if (ctx->IsRuntime()) {
    const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
    PADDLE_ENFORCE_EQ(
        lod_levels.size(), shapes.size(),
        "The number of 'lod_levels'(%d) doesn't match the number "
        "of 'shapes'(%d).",
        lod_levels.size(), shapes.size());
    framework::VarDesc* reader =
        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
    reader->SetLoDLevels(lod_levels);
  }
}

void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
                                        framework::BlockDesc* block) const {
  std::string reader_name = op_desc.Output("Out")[0];
  framework::VarDesc* reader = block->FindVarRecursive(reader_name);
  reader->SetType(framework::proto::VarType::READER);
}

void DecoratedReaderInferShape::operator()(
    framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
                 "Input(UnderlyingReader) should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Out"),
                 "The output decorated reader should not be null.");
  ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));

  if (ctx->IsRuntime()) {
    framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
        ctx->GetInputVarPtrs("UnderlyingReader")[0]);
    framework::VarDesc* out_reader =
        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
    out_reader->SetLoDLevels(in_reader->GetLoDLevels());
  }
}

void DecoratedReaderInferVarType::operator()(
    const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
  std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
  framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name);
  std::string out_reader_name = op_desc.Output("Out")[0];
  framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name);
  out_reader->SetType(framework::proto::VarType::READER);
  out_reader->SetDataTypes(in_reader->GetDataTypes());
}

DecoratedReaderMakerBase::DecoratedReaderMakerBase(
    framework::OpProtoAndCheckerMaker::OpProto* op_proto,
    framework::OpAttrChecker* op_checker)
    : OpProtoAndCheckerMaker(op_proto, op_checker) {
  AddInput("UnderlyingReader",
           "(ReaderHolder) The underlying reader for creating a batch reader.");
  AddOutput("Out", "(ReaderHolder) The created batch reader.");
}

}  // namespace reader
}  // namespace operators
}  // namespace paddle
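The 'shape_concat' and 'ranks' attributes defined above encode several tensor shapes in one flat integer list; RestoreShapes splits that list back into per-tensor shapes. A minimal Python sketch of the same decoding (restore_shapes is a hypothetical helper, shown only to illustrate the attribute format):

def restore_shapes(shape_concat, ranks):
    # Split the flat shape_concat list into one shape per entry in ranks.
    shapes, offset = [], 0
    for rank in ranks:
        shapes.append(shape_concat[offset:offset + rank])
        offset += rank
    return shapes

# Example from the attribute docstring:
# restore_shapes([2, 3, 4, 5, 6], [3, 2])  ->  [[2, 3, 4], [5, 6]]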
paddle/fluid/operators/reader/reader_op_registry.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"

namespace paddle {
namespace operators {
namespace reader {

extern std::vector<framework::DDim> RestoreShapes(
    const std::vector<int>& shape_concat, const std::vector<int>& ranks);

class FileReaderMakerBase : public framework::OpProtoAndCheckerMaker {
 public:
  FileReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
};

class FileReaderInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override;
};

class FileReaderInferVarType : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
                  framework::BlockDesc* block) const override;
};

// general infershape for decorated reader
class DecoratedReaderInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override;
};

// general var type inference for decorated reader
class DecoratedReaderInferVarType : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
                  framework::BlockDesc* block) const override;
};

class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
 public:
  DecoratedReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
};

}  // namespace reader
}  // namespace operators
}  // namespace paddle

#define REGISTER_FILE_READER_OPERATOR(op_name, ...)                   \
  REGISTER_OPERATOR(op_name, __VA_ARGS__,                             \
                    paddle::operators::reader::FileReaderInferShape,  \
                    paddle::framework::EmptyGradOpMaker,              \
                    paddle::operators::reader::FileReaderInferVarType)

#define REGISTER_DECORATED_READER_OPERATOR(op_name, ...)                   \
  REGISTER_OPERATOR(op_name, __VA_ARGS__,                                  \
                    paddle::operators::reader::DecoratedReaderInferShape,  \
                    paddle::framework::EmptyGradOpMaker,                   \
                    paddle::operators::reader::DecoratedReaderInferVarType)
paddle/fluid/operators/reshape_op.cc
...
@@ -84,6 +84,9 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<std::vector<int>>("shape",
                               "(vector<int>) "
                               "Target shape of reshape operator.");
+    AddAttr<bool>("inplace",
+                  "Change the source tensor's shape without copy memory.")
+        .SetDefault(true);
     AddComment(R"DOC(
 Reshape Operator.
...
paddle/fluid/operators/reshape_op.h
...
@@ -26,10 +26,16 @@ class ReshapeKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* out = ctx.Output<framework::Tensor>("Out");
     auto* in = ctx.Input<framework::Tensor>("X");
+    bool inplace = ctx.Attr<bool>("inplace");
     auto out_dims = out->dims();
-    out->mutable_data<T>(ctx.GetPlace());
-    framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
-    out->Resize(out_dims);
+    if (!inplace) {
+      out->mutable_data<T>(ctx.GetPlace());
+      framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
+      out->Resize(out_dims);
+    } else {
+      out->ShareDataWith(*in);
+      out->Resize(out_dims);
+    }
   }
 };
@@ -40,10 +46,16 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
     auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
     d_x->mutable_data<T>(ctx.GetPlace());
+    bool inplace = ctx.Attr<bool>("inplace");
     auto in_dims = d_x->dims();
-    framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
-    d_x->Resize(in_dims);
+    if (!inplace) {
+      framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
+      d_x->Resize(in_dims);
+    } else {
+      d_x->ShareDataWith(*d_out);
+      d_x->Resize(in_dims);
+    }
   }
 };

 }  // namespace operators
...
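The new 'inplace' attribute switches the reshape kernels between copying the input (TensorCopy followed by Resize) and aliasing the input's buffer (ShareDataWith followed by Resize). A NumPy analogy of the two behaviours (illustrative only, not Paddle code):

import numpy as np

x = np.arange(12, dtype=np.float32)

out_copy = x.copy().reshape(3, 4)   # inplace=False: fresh buffer, like TensorCopy + Resize
out_view = x.reshape(3, 4)          # inplace=True: shares storage, like ShareDataWith + Resize

out_view[0, 0] = -1.0
assert x[0] == -1.0                 # the view writes through to x
assert out_copy[0, 0] == 0.0        # the copy is unaffected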
paddle/fluid/platform/cudnn_helper.h
...
@@ -15,6 +15,8 @@ limitations under the License. */
 #pragma once

 #include <vector>
+
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
@@ -282,5 +284,17 @@ class ScopedPoolingDescriptor {
   DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
 };

+inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
+  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (use_cudnn) {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
+  return use_cudnn;
+}
+
 }  // namespace platform
 }  // namespace paddle
paddle/fluid/platform/device_context.cc
...
@@ -33,9 +33,15 @@ DeviceContextPool::DeviceContextPool(
   PADDLE_ENFORCE_GT(places.size(), 0);
   for (size_t i = 0; i < places.size(); i++) {
     if (platform::is_cpu_place(places[i])) {
+#ifdef PADDLE_WITH_MKLDNN
+      device_contexts_.emplace(places[i],
+                               new platform::MKLDNNDeviceContext(
+                                   boost::get<platform::CPUPlace>(places[i])));
+#else
       device_contexts_.emplace(places[i],
                                new platform::CPUDeviceContext(
                                    boost::get<platform::CPUPlace>(places[i])));
+#endif
     } else if (platform::is_gpu_place(places[i])) {
 #ifdef PADDLE_WITH_CUDA
       device_contexts_.emplace(places[i],
@@ -121,6 +127,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
   SetDeviceId(place_.device);
+  multi_process = GetCUDAMultiProcessors(place_.device);
+  max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
   PADDLE_ENFORCE(cudaStreamCreate(&stream_));
   eigen_stream_.reset(new EigenCudaStreamDevice());
   eigen_stream_->Reinitialize(&stream_, place);
@@ -154,6 +162,10 @@ void CUDADeviceContext::Wait() const {
   PADDLE_ENFORCE(cudaGetLastError());
 }

+int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
+  return multi_process * max_threads_per_mp;
+}
+
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
   return eigen_device_.get();
 }
@@ -170,64 +182,38 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; }
 #ifdef PADDLE_WITH_MKLDNN
 MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
-    : CPUDeviceContext(place), ready_(false) {
-  stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
-  engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
+    : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() {
+  p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>());
 }

-template <typename T>
-void MKLDNNDeviceContext::AddElement(const std::string& op_key,
-                                     const T& value) {
-  if (GetElement<T>(op_key)) {
-    return;
-  }
-  GetElementPool<T>().emplace(op_key, std::move(value));
-}
-
-template <typename T>
-const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const {
-  auto it = GetElementPool<T>().find(op_key);
-  return it == GetElementPool<T>().end() ? nullptr : it->second;
-}
-
-template <>
-const std::unordered_map<const std::string, const MKLDNNMemoryPtr,
-                         std::hash<std::string>>&
-MKLDNNDeviceContext::GetElementPool<MKLDNNMemoryPtr>() const {
-  return memory_pool_;
-}
-
-template <>
-const std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
-                         std::hash<std::string>>&
-MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitivePtr>() const {
-  return primitive_pool_;
-}
-
-template <>
-const std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
-                         std::hash<std::string>>&
-MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitiveDescPtr>() const {
-  return primitive_desc_pool_;
-}
-
-void MKLDNNDeviceContext::Execute(bool block) {
-  if (pipeline_.empty()) {
-    return;
-  }
-  ResetStream();
-  stream_->submit(pipeline_).wait(block);
-  ready_ = false;
-  pipeline_.clear();
-}
-
-void MKLDNNDeviceContext::ResetStream() {
-  if (ready_) {
-    return;
-  }
-  // TODO(TJ): change me when mkldnn have specific method to reset this state
-  stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
-  ready_ = true;
-}
+void MKLDNNDeviceContext::SetBlob(const std::string& name,
+                                  std::shared_ptr<void> data) const {
+  std::unordered_map<std::string, std::shared_ptr<void>>* p;
+  p = p_blobs_.get();
+
+  auto it = p->find(name);
+
+  if (it == p->end()) {
+    (*p)[name] = data;  // create new blob
+  } else {
+    it->second = data;  // set data to existing blob
+  }
+
+  return;
+}
+
+std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
+    const std::string& name) const {
+  std::unordered_map<std::string, std::shared_ptr<void>>* p;
+  p = p_blobs_.get();
+
+  auto it = p->find(name);
+
+  if (it != p->end()) {
+    return it->second;
+  }
+
+  return nullptr;
+}
 #endif
...
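The new GetMaxPhysicalThreadCount() is simply the multiprocessor count times the per-multiprocessor thread limit queried through gpu_info.cc. With hypothetical device numbers:

# Hypothetical device: 56 multiprocessors, 2048 resident threads per SM.
multi_process = 56
max_threads_per_mp = 2048
print(multi_process * max_threads_per_mp)  # 114688 max physical threads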
paddle/fluid/platform/device_context.h
...
@@ -22,7 +22,7 @@ limitations under the License. */
 #endif

 #ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
+#include <mkldnn.hpp>
 #endif
 #include "paddle/fluid/platform/enforce.h"
@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Return place in the device context. */
   Place GetPlace() const override;

+  /*! \brief  Return the max physical thread count in the device context */
+  int GetMaxPhysicalThreadCount() const;
+
   /*! \brief  Return eigen device in the device context. */
   Eigen::GpuDevice* eigen_device() const;
@@ -100,6 +103,9 @@ class CUDADeviceContext : public DeviceContext {
   cudaStream_t stream_;
   cudnnHandle_t cudnn_handle_;
   cublasHandle_t cublas_handle_;
+
+  int multi_process;
+  int max_threads_per_mp;
 };

 template <>
@@ -114,46 +120,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
  public:
   explicit MKLDNNDeviceContext(CPUPlace place);

-  /* \brief  Add new element: memory, primitive or primitive desc */
-  template <typename T>
-  void AddElement(const std::string& op_key, const T& value);
-
-  /* \brief  Get existed element: memory, primitive or primitive desc */
-  template <typename T>
-  const T& GetElement(const std::string& op_key) const;
-
-  /* \brief  Get element pool: memory, primitive or primitive desc pool */
-  template <typename T>
-  const std::unordered_map<const std::string, const T, std::hash<std::string>>&
-  GetElementPool() const;
-
   /* \brief  Get the active engine */
-  const MKLDNNEngine& engine() const { return *engine_; }
-
-  /* \brief  Submit primitive to pipeline */
-  void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); }
-
-  /*! \brief  Execute all submitted primitives in pipeline */
-  void Execute(bool block = true);
+  const mkldnn::engine& GetEngine() const { return engine_; }

- protected:
-  /*! \brief  Reset the stream to prepare next exectue */
-  void ResetStream();
+  // Set data to blob (i.e. name/data pair). Create blob if not existing
+  void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
+
+  // Find a saved blob. Return nullptr if not found
+  std::shared_ptr<void> GetBlob(const std::string& name) const;

  private:
-  std::unordered_map<const std::string, const MKLDNNMemoryPtr,
-                     std::hash<std::string>>
-      memory_pool_;
-  std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
-                     std::hash<std::string>>
-      primitive_pool_;
-  std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
-                     std::hash<std::string>>
-      primitive_desc_pool_;
-  std::vector<MKLDNNPrimitive> pipeline_;
-  MKLDNNStreamPtr stream_;
-  MKLDNNEnginePtr engine_;
-  bool ready_;
+  mkldnn::engine engine_;
+  std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>>
+      p_blobs_;
 };

 #endif
...
paddle/fluid/platform/float16.h
...
@@ -20,10 +20,6 @@ limitations under the License. */
 #include <cuda.h>
 #endif  // PADDLE_WITH_CUDA

-#include "unsupported/Eigen/CXX11/Tensor"
-
-#include "paddle/fluid/platform/hostdevice.h"
-
 #ifdef __GNUC__
 #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
 #else
@@ -64,6 +60,18 @@ limitations under the License. */
 namespace paddle {
 namespace platform {

+// Forward declare float16 for eigen.h
+struct float16;
+
+}  // namespace platform
+}  // namespace paddle
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace platform {
+
 // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
 // and aligned at least on a 2-byte boundary, which leads to efficient
 // memory access of float16 struct and also makes float16 compatible
@@ -729,6 +737,22 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
 }
 #endif

+HOSTDEVICE inline bool(isnan)(const float16& a) {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hisnan(half(a));
+#else
+  return (a.x & 0x7fff) > 0x7c00;
+#endif
+}
+
+HOSTDEVICE inline bool(isinf)(const float16& a) {
+  return (a.x & 0x7fff) == 0x7c00;
+}
+
+HOSTDEVICE inline bool(isfinite)(const float16& a) {
+  return !((isnan)(a)) && !((isinf)(a));
+}
+
 }  // namespace platform
 }  // namespace paddle
@@ -750,3 +774,27 @@ struct is_pod<paddle::platform::float16> {
 };

 }  // namespace std
+
+namespace Eigen {
+namespace numext {
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isnan)(a);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isinf)(a);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isfinite)(a);
+}
+
+}  // namespace numext
+}  // namespace Eigen
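The non-CUDA branches above classify a float16 purely by its bit pattern: after masking off the sign bit with 0x7fff, a value above 0x7c00 means NaN (exponent all ones, non-zero mantissa) and exactly 0x7c00 means infinity. The same checks can be verified against NumPy's half type (a small illustrative sketch, not Paddle code):

import numpy as np

def fp16_isnan(bits):
    return (bits & 0x7fff) > 0x7c00   # exponent all ones, mantissa != 0

def fp16_isinf(bits):
    return (bits & 0x7fff) == 0x7c00  # exponent all ones, mantissa == 0

values = np.array([1.0, np.inf, np.nan], dtype=np.float16)
bits = values.view(np.uint16)                 # raw IEEE 754 binary16 bits
print([fp16_isnan(int(b)) for b in bits])     # [False, False, True]
print([fp16_isinf(int(b)) for b in bits])     # [False, True, False]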
paddle/fluid/platform/gpu_info.cc
...
@@ -33,6 +33,26 @@ int GetCUDADeviceCount() {
   return count;
 }

+int GetCUDAMultiProcessors(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  int count;
+  PADDLE_ENFORCE(
+      cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id),
+      "cudaDeviceGetAttribute failed in "
+      "paddle::platform::GetCUDAMultiProcessors");
+  return count;
+}
+
+int GetCUDAMaxThreadsPerMultiProcessor(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  int count;
+  PADDLE_ENFORCE(cudaDeviceGetAttribute(
+                     &count, cudaDevAttrMaxThreadsPerMultiProcessor, id),
+                 "cudaDeviceGetAttribute failed in "
+                 "paddle::platform::GetCUDAMaxThreadsPerMultiProcessor");
+  return count;
+}
+
 int GetCurrentDeviceId() {
   int device_id;
   PADDLE_ENFORCE(
...
paddle/fluid/platform/gpu_info.h
...
@@ -30,6 +30,12 @@ const std::string kEnvFractionGpuMemoryToUse =
 //! Get the total number of GPU devices in system.
 int GetCUDADeviceCount();

+//! Get the MultiProcessors of the ith GPU.
+int GetCUDAMultiProcessors(int i);
+
+//! Get the MaxThreads of each MultiProcessor of the ith GPU.
+int GetCUDAMaxThreadsPerMultiProcessor(int i);
+
 //! Get the current GPU device id in system.
 int GetCurrentDeviceId();
...
paddle/fluid/platform/mkldnn_helper.h
...
@@ -16,12 +16,15 @@ limitations under the License. */
 #include <mkldnn.hpp>

+#include "paddle/fluid/framework/operator.h"
+
 namespace paddle {
 namespace platform {

 using MKLDNNStream = mkldnn::stream;
 using MKLDNNEngine = mkldnn::engine;
 using MKLDNNMemory = mkldnn::memory;
+using MKLDNNMemoryDescriptor = mkldnn::memory::desc;
 using MKLDNNPrimitive = mkldnn::primitive;
 using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>;
@@ -31,5 +34,17 @@ typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
 typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
 typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;

+inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
+                                          mkldnn::memory::data_type data_type,
+                                          mkldnn::memory::format format) {
+  mkldnn::memory::dims tz = dims;
+  return mkldnn::memory::desc({tz}, data_type, format);
+}
+
+inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
+  bool use_mkldnn = ctx.Attr<bool>("use_mkldnn");
+  return use_mkldnn && platform::is_cpu_place(ctx.GetPlace());
+}
+
 }  // namespace platform
 }  // namespace paddle
python/paddle/fluid/__init__.py
...
@@ -28,6 +28,7 @@ import nets
 import optimizer
 import backward
 import regularizer
+import average
 from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, CUDAPlace
...
python/paddle/fluid/average.py
0 → 100644
#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
"""
    Class of all kinds of Average.

    All Averages are accomplished via Python totally.
    They do not change Paddle's Program, nor do anything to
    modify NN model's configuration. They are completely
    wrappers of Python functions.
"""


def _is_number_(var):
    return isinstance(var, int) or isinstance(var, float) or (isinstance(
        var, np.ndarray) and var.shape == (1, ))


def _is_number_or_matrix_(var):
    return _is_number_(var) or isinstance(var, np.ndarray)


class WeightedAverage(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.numerator = None
        self.denominator = None

    def add(self, value, weight):
        if not _is_number_or_matrix_(value):
            raise ValueError(
                "The 'value' must be a number(int, float) or a numpy ndarray.")
        if not _is_number_(weight):
            raise ValueError("The 'weight' must be a number(int, float).")

        if self.numerator is None or self.denominator is None:
            self.numerator = value * weight
            self.denominator = weight
        else:
            self.numerator += value * weight
            self.denominator += weight

    def eval(self):
        if self.numerator is None or self.denominator is None:
            raise ValueError(
                "There is no data to be averaged in WeightedAverage.")
        return self.numerator / self.denominator
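WeightedAverage simply accumulates value*weight and weight sums, so eval() returns the weight-averaged value over everything added since the last reset(). A minimal usage sketch in the style of the updated tests later in this diff (assumes a build where paddle.fluid exposes the new average module):

import paddle.fluid as fluid

acc_avg = fluid.average.WeightedAverage()
acc_avg.reset()
# Per mini-batch: value is the batch accuracy and weight the batch size,
# e.g. the batch_acc and batch_size tensors fetched in the tests below.
for acc, weight in [(0.50, 32), (0.75, 32), (1.00, 16)]:
    acc_avg.add(value=acc, weight=weight)
print(acc_avg.eval())  # (0.5*32 + 0.75*32 + 1.0*16) / 80 = 0.7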
python/paddle/fluid/backward.py
...
@@ -486,7 +486,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
     params_and_grads = []
     for param in parameters:
         if param not in grad_info_map:
-            raise ValueError("param %s is not in map" % param)
+            continue
         grad_info = grad_info_map[param]
         grad_block = grad_info[1]
         if not grad_block.has_var(grad_info[0]):
...
python/paddle/fluid/evaluator.py
...
@@ -108,44 +108,6 @@ class Evaluator(object):
         return state


-class Accuracy(Evaluator):
-    """
-    Average Accuracy for multiple mini-batches.
-    """
-
-    def __init__(self, input, label, k=1, **kwargs):
-        super(Accuracy, self).__init__("accuracy", **kwargs)
-        main_program = self.helper.main_program
-        if main_program.current_block().idx != 0:
-            raise ValueError("You can only invoke Evaluator in root block")
-
-        self.total = self.create_state(
-            dtype='int64', shape=[1], suffix='total')
-        self.correct = self.create_state(
-            dtype='int64', shape=[1], suffix='correct')
-        total = self.helper.create_tmp_variable(dtype='int')
-        correct = self.helper.create_tmp_variable(dtype='int')
-        acc = layers.accuracy(
-            input=input, label=label, k=k, total=total, correct=correct)
-        total = layers.cast(x=total, dtype='int64')
-        correct = layers.cast(x=correct, dtype='int64')
-        layers.sums(input=[self.total, total], out=self.total)
-        layers.sums(input=[self.correct, correct], out=self.correct)
-
-        self.metrics.append(acc)
-
-    def eval(self, executor, eval_program=None):
-        if eval_program is None:
-            eval_program = Program()
-        block = eval_program.current_block()
-        with program_guard(main_program=eval_program):
-            total = _clone_var_(block, self.total)
-            correct = _clone_var_(block, self.correct)
-            total = layers.cast(total, dtype='float32')
-            correct = layers.cast(correct, dtype='float32')
-            out = layers.elementwise_div(x=correct, y=total)
-        return np.array(executor.run(eval_program, fetch_list=[out])[0])
-
-
 class ChunkEvaluator(Evaluator):
     """
     Accumulate counter numbers output by chunk_eval from mini-batches and
...
python/paddle/fluid/layers/__init__.py
...
@@ -28,6 +28,8 @@ import math_op_patch
 from math_op_patch import *
 import detection
 from detection import *
+import metric
+from metric import *
 from learning_rate_scheduler import *

 __all__ = []
@@ -39,4 +41,5 @@ __all__ += control_flow.__all__
 __all__ += ops.__all__
 __all__ += device.__all__
 __all__ += detection.__all__
+__all__ += metric.__all__
 __all__ += learning_rate_scheduler.__all__
python/paddle/fluid/layers/metric.py
0 → 100644
#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
All layers just related to metric.
"""

from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant
from ..framework import Variable
from ..param_attr import ParamAttr

__all__ = ['accuracy']


def accuracy(input, label, k=1, correct=None, total=None):
    """
    This function computes the accuracy using the input and label.
    The output is the top_k inputs and their indices.
    """
    helper = LayerHelper("accuracy", **locals())
    topk_out = helper.create_tmp_variable(dtype=input.dtype)
    topk_indices = helper.create_tmp_variable(dtype="int64")
    helper.append_op(
        type="top_k",
        inputs={"X": [input]},
        outputs={"Out": [topk_out],
                 "Indices": [topk_indices]},
        attrs={"k": k})
    acc_out = helper.create_tmp_variable(dtype="float32")
    if correct is None:
        correct = helper.create_tmp_variable(dtype="int64")
    if total is None:
        total = helper.create_tmp_variable(dtype="int64")
    helper.append_op(
        type="accuracy",
        inputs={
            "Out": [topk_out],
            "Indices": [topk_indices],
            "Label": [label]
        },
        outputs={
            "Accuracy": [acc_out],
            "Correct": [correct],
            "Total": [total],
        })
    return acc_out
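This moves the accuracy layer out of layers/nn.py (removed below) into its own metric module. A usage sketch consistent with the updated tests later in this diff: passing a created tensor as 'total' lets the caller fetch the batch size and feed it to WeightedAverage as the weight (the surrounding network definition here is only illustrative):

import paddle.fluid as fluid

image = fluid.layers.data(name='x', shape=[784], dtype='float32')
label = fluid.layers.data(name='y', shape=[1], dtype='int64')
predict = fluid.layers.fc(input=image, size=10, act='softmax')

batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size)
# batch_acc holds the top-k accuracy of the current mini-batch; batch_size
# receives the sample count, usable as the weight for fluid.average.WeightedAverage.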
python/paddle/fluid/layers/nn.py
...
@@ -35,7 +35,6 @@ __all__ = [
     'cos_sim',
     'cross_entropy',
     'square_error_cost',
-    'accuracy',
     'chunk_eval',
     'sequence_conv',
     'conv2d',
@@ -1022,40 +1021,6 @@ def square_error_cost(input, label):
     return square_out


-def accuracy(input, label, k=1, correct=None, total=None):
-    """
-    This function computes the accuracy using the input and label.
-    The output is the top_k inputs and their indices.
-    """
-    helper = LayerHelper("accuracy", **locals())
-    topk_out = helper.create_tmp_variable(dtype=input.dtype)
-    topk_indices = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="top_k",
-        inputs={"X": [input]},
-        outputs={"Out": [topk_out],
-                 "Indices": [topk_indices]},
-        attrs={"k": k})
-    acc_out = helper.create_tmp_variable(dtype="float32")
-    if correct is None:
-        correct = helper.create_tmp_variable(dtype="int64")
-    if total is None:
-        total = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="accuracy",
-        inputs={
-            "Out": [topk_out],
-            "Indices": [topk_indices],
-            "Label": [label]
-        },
-        outputs={
-            "Accuracy": [acc_out],
-            "Correct": [correct],
-            "Total": [total],
-        })
-    return acc_out
-
-
 def chunk_eval(input,
                label,
                chunk_scheme,
@@ -1146,6 +1111,7 @@ def conv2d(input,
            param_attr=None,
            bias_attr=None,
            use_cudnn=True,
+           use_mkldnn=False,
            act=None):
     """
     **Convlution2D Layer**
@@ -1287,7 +1253,8 @@ def conv2d(input,
             'strides': stride,
             'paddings': padding,
             'groups': groups,
-            'use_cudnn': use_cudnn
+            'use_cudnn': use_cudnn,
+            'use_mkldnn': use_mkldnn
         })

     pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
@@ -3182,7 +3149,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
             data = fluid.layers.data(name='data', shape=[128], dtype='float32')
             label = fluid.layers.data(name='label', shape=[100], dtype='int64')
             fc = fluid.layers.fc(input=data, size=100)
-            out = fluid.layers.smooth_l1(logits=fc, label=label)
+            out = fluid.layers.smooth_l1(x=fc, y=label)
     """
     helper = LayerHelper('smooth_l1_loss', **locals())
     diff = helper.create_tmp_variable(dtype=x.dtype)
...
python/paddle/fluid/nets.py
...
@@ -29,14 +29,16 @@ def simple_img_conv_pool(input,
                          act,
                          param_attr=None,
                          pool_type='max',
-                         use_cudnn=True):
+                         use_cudnn=True,
+                         use_mkldnn=False):
     conv_out = layers.conv2d(
         input=input,
         num_filters=num_filters,
         filter_size=filter_size,
         param_attr=param_attr,
         act=act,
-        use_cudnn=use_cudnn)
+        use_cudnn=use_cudnn,
+        use_mkldnn=use_mkldnn)

     pool_out = layers.pool2d(
         input=conv_out,
@@ -58,7 +60,8 @@ def img_conv_group(input,
                    conv_batchnorm_drop_rate=0.0,
                    pool_stride=1,
                    pool_type=None,
-                   use_cudnn=True):
+                   use_cudnn=True,
+                   use_mkldnn=False):
     """
     Image Convolution Group, Used for vgg net.
     """
@@ -90,7 +93,8 @@ def img_conv_group(input,
             padding=conv_padding[i],
             param_attr=param_attr[i],
             act=local_conv_act,
-            use_cudnn=use_cudnn)
+            use_cudnn=use_cudnn,
+            use_mkldnn=use_mkldnn)

         if conv_with_batchnorm[i]:
             tmp = layers.batch_norm(input=tmp, act=conv_act)
...
python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
...
@@ -122,7 +122,8 @@ avg_cost = fluid.layers.mean(cost)
 optimizer = fluid.optimizer.Adam(learning_rate=0.001)
 opts = optimizer.minimize(avg_cost)

-accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+batch_size = fluid.layers.create_tensor(dtype='int64')
+batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size)

 fluid.memory_optimize(fluid.default_main_program())
@@ -144,13 +145,17 @@ feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
 exe.run(fluid.default_startup_program())

 i = 0

+accuracy = fluid.average.WeightedAverage()
 for pass_id in range(PASS_NUM):
-    accuracy.reset(exe)
+    accuracy.reset()
     for data in train_reader():
-        loss, acc = exe.run(fluid.default_main_program(),
-                            feed=feeder.feed(data),
-                            fetch_list=[avg_cost] + accuracy.metrics)
-        pass_acc = accuracy.eval(exe)
+        loss, acc, weight = exe.run(
+            fluid.default_main_program(),
+            feed=feeder.feed(data),
+            fetch_list=[avg_cost, batch_acc, batch_size])
+        accuracy.add(value=acc, weight=weight)
+        pass_acc = accuracy.eval()
         print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
             pass_acc))
         # this model is slow, so if we can train two mini batch, we think it works properly.
...
python/paddle/fluid/tests/unittests/test_conv2d_op.py
...
@@ -64,6 +64,7 @@ def conv2d_forward_naive(input, filter, group, conv_param):
 class TestConv2dOp(OpTest):
     def setUp(self):
         self.use_cudnn = False
+        self.use_mkldnn = False
         self.init_op_type()
         self.init_group()
         self.init_dilation()
@@ -85,7 +86,8 @@ class TestConv2dOp(OpTest):
             'paddings': self.pad,
             'groups': self.groups,
             'dilations': self.dilations,
-            'use_cudnn': self.use_cudnn
+            'use_cudnn': self.use_cudnn,
+            'use_mkldnn': self.use_mkldnn
         }
         self.outputs = {'Output': output}
@@ -290,5 +292,25 @@ class TestDepthwiseConv2(TestConv2dOp):
 #     def init_op_type(self):
 #         self.op_type = "conv_cudnn"

+
+#----------------Conv2dMKLDNN----------------
+class TestMKLDNN(TestConv2dOp):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "conv2d"
+
+
+class TestMKLDNNWithPad(TestWithPad):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "conv2d"
+
+
+class TestMKLDNNWithStride(TestWithStride):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "conv2d"
+
+
 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/test_profiler.py
...
@@ -37,7 +37,9 @@ class TestProfiler(unittest.TestCase):
             label = fluid.layers.data(name='y', shape=[1], dtype='int64')
             cost = fluid.layers.cross_entropy(input=predict, label=label)
             avg_cost = fluid.layers.mean(cost)
-            accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+            batch_size = fluid.layers.create_tensor(dtype='int64')
+            batch_acc = fluid.layers.accuracy(
+                input=predict, label=label, total=batch_size)

         optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
         opts = optimizer.minimize(avg_cost, startup_program=startup_program)
@@ -46,7 +48,7 @@ class TestProfiler(unittest.TestCase):
         exe = fluid.Executor(place)
         exe.run(startup_program)

-        accuracy.reset(exe)
+        pass_acc_calculator = fluid.average.WeightedAverage()
         with profiler.profiler(state, 'total', profile_path) as prof:
             for iter in range(10):
                 if iter == 2:
@@ -57,9 +59,11 @@ class TestProfiler(unittest.TestCase):
                 outs = exe.run(main_program,
                                feed={'x': x,
                                      'y': y},
-                               fetch_list=[avg_cost] + accuracy.metrics)
+                               fetch_list=[avg_cost, batch_acc, batch_size])
                 acc = np.array(outs[1])
-                pass_acc = accuracy.eval(exe)
+                b_size = np.array(outs[2])
+                pass_acc_calculator.add(value=acc, weight=b_size)
+                pass_acc = pass_acc_calculator.eval()

     def test_cpu_profiler(self):
         self.net_profiler('CPU')
...
python/paddle/fluid/tests/unittests/test_reshape_op.py
...
@@ -45,5 +45,33 @@ class TestReshapeOpDimInfer(OpTest):
         self.check_grad(["X"], "Out")


+class TestReshapeOpInplace(OpTest):
+    def setUp(self):
+        self.op_type = "reshape"
+        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
+        self.attrs = {'shape': [10 * 20], 'inplace': True}
+        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpDimInferInplace(OpTest):
+    def setUp(self):
+        self.op_type = "reshape"
+        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
+        self.attrs = {'shape': [4, -1, 5], 'inplace': True}
+        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
 if __name__ == '__main__':
     unittest.main()