Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)

Commit c58ce0a6

Merge remote-tracking branch 'baidu/develop' into update_getstarted_doc

Author: qijun
Date: Mar 07, 2018
Parents: 26922a3b, 84aea8a8

Showing 60 changed files with 2,969 additions and 1,215 deletions (+2969 / -1215):
  benchmark/cluster/vgg16/vgg16_fluid.py                              +20   -15
  cmake/external/snappy.cmake                                          +1    -0
  paddle/fluid/framework/CMakeLists.txt                               +10    -5
  paddle/fluid/framework/channel.h                                    +38   -11
  paddle/fluid/framework/channel_impl.h                              +229    -0
  paddle/fluid/framework/channel_test.cc                              +53   -62
  paddle/fluid/framework/data_transform.cc                             +1    -0
  paddle/fluid/framework/data_type.h                                   +9    -1
  paddle/fluid/framework/data_type_transform.cc                       +12    -2
  paddle/fluid/framework/data_type_transform.cu                        +1    -0
  paddle/fluid/framework/data_type_transform_test.cc                 +131   -18
  paddle/fluid/framework/data_type_transform_test.cu                 +215    -0
  paddle/fluid/framework/details/buffered_channel.h                    +0  -142
  paddle/fluid/framework/details/unbuffered_channel.h                  +0  -174
  paddle/fluid/framework/reader.cc                                     +0   -87
  paddle/fluid/framework/reader.h                                      +2   -77
  paddle/fluid/framework/tensor_util_test.cc                          +42   -16
  paddle/fluid/framework/tensor_util_test.cu                          +47   -14
  paddle/fluid/operators/CMakeLists.txt                               +31    -8
  paddle/fluid/operators/concat_op.cc                                  +5    -4
  paddle/fluid/operators/concat_op.h                                  +38   -46
  paddle/fluid/operators/conv_mkldnn_op.cc                           +313    -0
  paddle/fluid/operators/conv_op.cc                                   +30   -21
  paddle/fluid/operators/create_reader_op.cc                           +0  -246
  paddle/fluid/operators/detail/CMakeLists.txt                         +3    -1
  paddle/fluid/operators/elementwise_mul_op.h                          +8   -75
  paddle/fluid/operators/elementwise_op_function.h                     +1    -1
  paddle/fluid/operators/math/CMakeLists.txt                           +3    -0
  paddle/fluid/operators/math/concat.cc                              +119    -0
  paddle/fluid/operators/math/concat.cu                              +280    -0
  paddle/fluid/operators/math/concat.h                                +63    -0
  paddle/fluid/operators/math/concat_test.cc                         +336    -0
  paddle/fluid/operators/math/math_function.cc                         +7    -5
  paddle/fluid/operators/reader/CMakeLists.txt                         +5    -0
  paddle/fluid/operators/reader/create_batch_reader_op.cc            +137    -0
  paddle/fluid/operators/reader/create_random_data_generator_op.cc   +110    -0
  paddle/fluid/operators/reader/create_shuffle_reader_op.cc           +97    -0
  paddle/fluid/operators/reader/reader_op_registry.cc                +116    -0
  paddle/fluid/operators/reader/reader_op_registry.h                  +75    -0
  paddle/fluid/operators/reshape_op.cc                                 +3    -0
  paddle/fluid/operators/reshape_op.h                                 +17    -5
  paddle/fluid/platform/cudnn_helper.h                                +14    -0
  paddle/fluid/platform/device_context.cc                             +34   -48
  paddle/fluid/platform/device_context.h                              +15   -36
  paddle/fluid/platform/float16.h                                     +52    -4
  paddle/fluid/platform/gpu_info.cc                                   +20    -0
  paddle/fluid/platform/gpu_info.h                                     +6    -0
  paddle/fluid/platform/mkldnn_helper.h                               +15    -0
  python/paddle/fluid/__init__.py                                      +1    -0
  python/paddle/fluid/average.py                                      +61    -0
  python/paddle/fluid/backward.py                                      +1    -1
  python/paddle/fluid/evaluator.py                                     +0   -38
  python/paddle/fluid/layers/__init__.py                               +3    -0
  python/paddle/fluid/layers/metric.py                                +57    -0
  python/paddle/fluid/layers/nn.py                                     +4   -37
  python/paddle/fluid/nets.py                                          +8    -4
  python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py  +11  -6
  python/paddle/fluid/tests/unittests/test_conv2d_op.py               +23    -1
  python/paddle/fluid/tests/unittests/test_profiler.py                 +8    -4
  python/paddle/fluid/tests/unittests/test_reshape_op.py              +28    -0
benchmark/cluster/vgg16/vgg16_fluid.py

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

@@ -138,13 +138,14 @@ def main():
     avg_cost = fluid.layers.mean(x=cost)

     # Evaluator
-    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    batch_size = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size)

     # inference program
     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):
-        test_target = accuracy.metrics + accuracy.states
-        inference_program = fluid.io.get_inference_program(test_target)
+        inference_program = fluid.io.get_inference_program(batch_acc)

     # Optimization
     optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)

@@ -157,27 +158,30 @@ def main():
     # test
     def test(exe):
-        accuracy.reset(exe)
+        test_pass_acc = fluid.average.WeightedAverage()
         for batch_id, data in enumerate(test_reader()):
             img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                     data)).astype("float32")
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([-1, 1])
-            exe.run(inference_program,
-                    feed={"pixel": img_data,
-                          "label": y_data})
-        return accuracy.eval(exe)
+            outs = exe.run(inference_program,
+                           feed={"pixel": img_data,
+                                 "label": y_data},
+                           fetch_list=[batch_acc, batch_size])
+            test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
+        return test_pass_acc.eval()

     def train_loop(exe, trainer_prog):
         iters = 0
         ts = time.time()
+        train_pass_acc = fluid.average.WeightedAverage()
         for pass_id in range(args.num_passes):
             # train
             start_time = time.time()
             num_samples = 0
-            accuracy.reset(exe)
+            train_pass_acc.reset()
             with profiler.profiler("CPU", 'total') as prof:
                 for batch_id, data in enumerate(train_reader()):
                     ts = time.time()

@@ -187,13 +191,14 @@ def main():
                     y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                     y_data = y_data.reshape([-1, 1])

-                    loss, acc = exe.run(
+                    loss, acc, b_size = exe.run(
                         trainer_prog,
                         feed={"pixel": img_data,
                               "label": y_data},
-                        fetch_list=[avg_cost] + accuracy.metrics)
+                        fetch_list=[avg_cost, batch_acc, batch_size])
                     iters += 1
                     num_samples += len(data)
+                    train_pass_acc.add(value=acc, weight=b_size)
                     print(
                         "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
                         % (pass_id, iters, loss, acc,

@@ -201,7 +206,7 @@ def main():
                     )
             # The accuracy is the accumulation of batches, but not the current batch.
             pass_elapsed = time.time() - start_time
-            pass_train_acc = accuracy.eval(exe)
+            pass_train_acc = train_pass_acc.eval()
             pass_test_acc = test(exe)
             print(
                 "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
cmake/external/snappy.cmake

@@ -39,6 +39,7 @@ ExternalProject_Add(
                      -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
                      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                      -DBUILD_TESTING=OFF
+                     -DSNAPPY_BUILD_TESTS:BOOL=OFF
                      -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                      ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
paddle/fluid/framework/CMakeLists.txt

@@ -5,14 +5,14 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 if(WITH_GPU)
   nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
 else()
   cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
 endif()
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 if(WITH_GPU)
   nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
 else()
   cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)

@@ -39,8 +39,13 @@ cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
 nv_test(data_device_transform_test SRCS data_device_transform_test.cu
         DEPS operator op_registry init math_function)

-cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
-cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
+if(WITH_GPU)
+  nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+  nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
+else()
+  cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
+  cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
+endif()

 cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function)
 cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform)
paddle/fluid/framework/channel.h

@@ -28,24 +28,19 @@ class Channel {
   virtual bool Send(T*) = 0;
   virtual bool Receive(T*) = 0;
   virtual size_t Cap() = 0;
+  virtual void Lock() = 0;
+  virtual void Unlock() = 0;
   virtual void Close() = 0;
   virtual ~Channel() {}
 };

 // Forward declaration of channel implementations.
-namespace details {
-template <typename T>
-class Buffered;
-template <typename T>
-class UnBuffered;
-}  // namespace details
+class ChannelImpl;

 template <typename T>
 Channel<T>* MakeChannel(size_t buffer_size) {
-  if (buffer_size > 0) {
-    return new details::Buffered<T>(buffer_size);
-  }
-  return new details::UnBuffered<T>();
+  return new ChannelImpl<T>(buffer_size);
 }

 template <typename T>

@@ -89,6 +84,19 @@ class ChannelHolder {
     if (IsInitialized()) holder_->Close();
   }

+  size_t Cap() {
+    if (IsInitialized()) return holder_->Cap();
+    return -1;
+  }
+
+  void Lock() {
+    if (IsInitialized()) holder_->Lock();
+  }
+
+  void Unlock() {
+    if (IsInitialized()) holder_->Unlock();
+  }
+
   inline bool IsInitialized() const { return holder_ != nullptr; }

   inline const std::type_index Type() {

@@ -106,6 +114,9 @@ class ChannelHolder {
   virtual const std::type_index Type() const = 0;
   virtual void* Ptr() const = 0;
   virtual void Close() = 0;
+  virtual void Lock() = 0;
+  virtual void Unlock() = 0;
+  virtual size_t Cap() = 0;
 };

 template <typename T>

@@ -115,11 +126,28 @@ class ChannelHolder {
   }

   virtual const std::type_index Type() const { return type_; }
   virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
   virtual void Close() {
     if (channel_) channel_->Close();
   }

+  virtual size_t Cap() {
+    if (channel_)
+      return channel_->Cap();
+    else
+      return -1;
+  }
+
+  virtual void Lock() {
+    if (channel_) channel_->Lock();
+  }
+
+  virtual void Unlock() {
+    if (channel_) channel_->Unlock();
+  }
+
   std::unique_ptr<Channel<T>> channel_;
   const std::type_index type_;
 };

@@ -131,5 +159,4 @@ class ChannelHolder {
 }  // namespace framework
 }  // namespace paddle

-#include "paddle/fluid/framework/details/buffered_channel.h"
-#include "paddle/fluid/framework/details/unbuffered_channel.h"
+#include "paddle/fluid/framework/channel_impl.h"
paddle/fluid/framework/channel_impl.h  (new file, mode 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   Licensed under the Apache License, Version 2.0 (the "License"). */

#pragma once
#include <stddef.h>  // for size_t
#include <atomic>
#include <condition_variable>
#include <deque>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {

template <typename T>
class ChannelImpl : public paddle::framework::Channel<T> {
  friend Channel<T> *paddle::framework::MakeChannel<T>(size_t);
  friend void paddle::framework::CloseChannel<T>(Channel<T> *);

 public:
  virtual bool Send(T *);
  virtual bool Receive(T *);
  virtual size_t Cap() { return cap_; }
  virtual void Lock();
  virtual void Unlock();
  virtual void Close();

  ChannelImpl(size_t);
  virtual ~ChannelImpl();

 private:
  struct QueueMessage {
    T *data;
    std::condition_variable_any cond;
    bool chan_closed = false;
    bool completed = false;

    QueueMessage(T *item) : data(item) {}

    void Wait(std::unique_lock<std::recursive_mutex> &lock) {
      cond.wait(lock, [this]() { return completed; });
    }

    void Notify() {
      completed = true;
      cond.notify_all();
    }
  };

  bool send_return(bool value) {
    send_ctr--;
    destructor_cond_.notify_all();
    return value;
  }

  bool recv_return(bool value) {
    recv_ctr--;
    destructor_cond_.notify_all();
    return value;
  }

  size_t cap_;
  std::recursive_mutex mu_;
  bool closed_;
  std::deque<T> buf_;
  std::deque<std::shared_ptr<QueueMessage>> recvq;
  std::deque<std::shared_ptr<QueueMessage>> sendq;
  std::atomic<unsigned> send_ctr{0};
  std::atomic<unsigned> recv_ctr{0};
  std::condition_variable_any destructor_cond_;
};

template <typename T>
ChannelImpl<T>::ChannelImpl(size_t capacity)
    : cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) {
  PADDLE_ENFORCE_GE(capacity, 0);
}

template <typename T>
bool ChannelImpl<T>::Send(T *item) {
  send_ctr++;
  std::unique_lock<std::recursive_mutex> lock{mu_};

  // If channel is closed, do nothing
  if (closed_) {
    lock.unlock();
    // TODO(abhinavarora) Should panic on closed channel
    return send_return(false);
  }

  // If there is a receiver, directly pass the value we want
  // to send to the receiver, bypassing the channel buffer if any
  if (!recvq.empty()) {
    std::shared_ptr<QueueMessage> m = recvq.front();
    recvq.pop_front();
    // Do the data transfer
    *(m->data) = std::move(*item);
    // Wake up the blocked process and unlock
    m->Notify();
    lock.unlock();
    return send_return(true);
  }

  // Unbuffered channel will always bypass this
  // If buffered channel has space in buffer,
  // write the element to the buffer.
  if (buf_.size() < cap_) {
    // Copy to buffer
    buf_.push_back(std::move(*item));
    // Release lock and return true
    lock.unlock();
    return send_return(true);
  }

  // Block on channel, because some receiver will complete
  // the operation for us
  auto m = std::make_shared<QueueMessage>(item);
  sendq.push_back(m);
  m->Wait(lock);
  // TODO(abhinavarora) Should panic on closed channel
  return send_return(!m->chan_closed);
}

template <typename T>
bool ChannelImpl<T>::Receive(T *item) {
  recv_ctr++;
  std::unique_lock<std::recursive_mutex> lock{mu_};

  // If channel is closed and buffer is empty or
  // channel is unbuffered
  if (closed_ && buf_.empty()) {
    lock.unlock();
    return recv_return(false);
  }

  // If there is a sender, directly receive the value we want
  // from the sender, bypassing the channel buffer if any
  if (!sendq.empty()) {
    std::shared_ptr<QueueMessage> m = sendq.front();
    sendq.pop_front();
    // Do the data transfer
    *item = std::move(*(m->data));
    // Wake up the blocked process and unlock
    m->Notify();
    lock.unlock();
    return recv_return(true);
  }

  // If this is a buffered channel and there are items in buffer
  if (buf_.size() > 0) {
    // Directly read from buffer
    *item = std::move(buf_.front());
    buf_.pop_front();
    // Release lock and return true
    lock.unlock();
    return recv_return(true);
  }

  // No sender available, block on this channel
  // Some receiver will complete the option for us
  auto m = std::make_shared<QueueMessage>(item);
  recvq.push_back(m);
  m->Wait(lock);

  return recv_return(!m->chan_closed);
}

template <typename T>
void ChannelImpl<T>::Lock() {
  mu_.lock();
}

template <typename T>
void ChannelImpl<T>::Unlock() {
  mu_.unlock();
}

template <typename T>
void ChannelImpl<T>::Close() {
  std::unique_lock<std::recursive_mutex> lock{mu_};

  if (closed_) {
    // TODO(abhinavarora): closing an already closed channel should panic
    lock.unlock();
    return;
  }

  closed_ = true;

  // Empty the readers
  while (!recvq.empty()) {
    std::shared_ptr<QueueMessage> m = recvq.front();
    recvq.pop_front();
    m->chan_closed = true;
    m->Notify();
  }

  // Empty the senders
  while (!sendq.empty()) {
    std::shared_ptr<QueueMessage> m = sendq.front();
    sendq.pop_front();
    m->chan_closed = true;
    m->Notify();
  }
}

template <typename T>
ChannelImpl<T>::~ChannelImpl() {
  Close();
  // The destructor must wait for all readers and writers to complete their task
  // The channel has been closed, so we will not accept new readers and writers
  std::unique_lock<std::recursive_mutex> lock{mu_};
  destructor_cond_.wait(lock,
                        [this]() { return send_ctr == 0 && recv_ctr == 0; });
}

}  // namespace framework
}  // namespace paddle
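
The destructor above relies on a small bookkeeping pattern: every Send/Receive bumps a counter on entry and signals destructor_cond_ on exit, so ~ChannelImpl can block until no operation is still inside the object. A standalone sketch of that pattern using only the standard library (the Tracker name is hypothetical, not Paddle code):

#include <atomic>
#include <condition_variable>
#include <mutex>

// Sketch: operations wrap their work in Enter()/Exit(); the owner calls
// WaitForIdle() before tearing the shared object down.
class Tracker {
 public:
  void Enter() { ++inflight_; }
  void Exit() {
    std::lock_guard<std::mutex> guard(mu_);
    --inflight_;
    cv_.notify_all();  // wake a waiting destructor, if any
  }
  void WaitForIdle() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this]() { return inflight_ == 0; });
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::atomic<unsigned> inflight_{0};
};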
paddle/fluid/framework/channel_test.cc

@@ -23,8 +23,19 @@ using paddle::framework::Channel;
 using paddle::framework::ChannelHolder;
 using paddle::framework::MakeChannel;
 using paddle::framework::CloseChannel;
-using paddle::framework::details::Buffered;
-using paddle::framework::details::UnBuffered;
+
+TEST(Channel, ChannelCapacityTest) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  EXPECT_EQ(ch->Cap(), buffer_size);
+  CloseChannel(ch);
+  delete ch;
+
+  ch = MakeChannel<size_t>(0);
+  EXPECT_EQ(ch->Cap(), 0U);
+  CloseChannel(ch);
+  delete ch;
+}

 void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
   unsigned sum_send = 0;

@@ -35,38 +46,17 @@ void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
     }
   });
   for (int i = 0; i < 5; i++) {
-    int recv;
+    int recv = 999;
     EXPECT_EQ(ch->Receive(&recv), true);
     EXPECT_EQ(recv, i);
   }
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));
   CloseChannel(ch);
   t.join();
   EXPECT_EQ(sum_send, 10U);
   delete ch;
 }

-TEST(Channel, MakeAndClose) {
-  using paddle::framework::details::Buffered;
-  using paddle::framework::details::UnBuffered;
-  {
-    // MakeChannel should return a buffered channel is buffer_size > 0.
-    auto ch = MakeChannel<int>(10);
-    EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
-    EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
-    CloseChannel(ch);
-    delete ch;
-  }
-  {
-    // MakeChannel should return an un-buffered channel is buffer_size = 0.
-    auto ch = MakeChannel<int>(0);
-    EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
-    EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
-    CloseChannel(ch);
-    delete ch;
-  }
-}
-
 TEST(Channel, SufficientBufferSizeDoesntBlock) {
   const size_t buffer_size = 10;
   auto ch = MakeChannel<size_t>(buffer_size);

@@ -166,7 +156,6 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
 TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
   const size_t buffer_size = 10;
   auto ch = MakeChannel<size_t>(buffer_size);
-  size_t sum = 0;
   std::thread t([&]() {
     // Try to write more than buffer size.
     for (size_t i = 0; i < 2 * buffer_size; ++i) {

@@ -174,12 +163,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
         EXPECT_EQ(ch->Send(&i), true);  // should block after 10 iterations
       else
         EXPECT_EQ(ch->Send(&i), false);
-      sum += i;
     }
   });
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
-  EXPECT_EQ(sum, 45U);
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
   CloseChannel(ch);
   t.join();
   delete ch;

@@ -211,7 +197,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
         &thread_ended[i]);
   }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec

   // Verify that all the threads are blocked
   for (size_t i = 0; i < num_threads; i++) {

@@ -222,7 +208,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
   // This should unblock all receivers
   CloseChannel(ch);
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec

   // Verify that all threads got unblocked
   for (size_t i = 0; i < num_threads; i++) {

@@ -232,10 +218,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
   for (size_t i = 0; i < num_threads; i++) t[i].join();
 }

-void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
-  using paddle::framework::details::Buffered;
-  using paddle::framework::details::UnBuffered;
-
+void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
   size_t num_threads = 5;
   std::thread t[num_threads];
   bool thread_ended[num_threads];

@@ -253,9 +236,9 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
         &thread_ended[i], &send_success[i]);
   }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait

-  if (dynamic_cast<Buffered<int> *>(ch)) {
+  if (isBuffered) {
     // If ch is Buffered, atleast 4 threads must be blocked.
     int ct = 0;
     for (size_t i = 0; i < num_threads; i++) {

@@ -272,14 +255,14 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
   // This should unblock all senders
   CloseChannel(ch);

-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait

   // Verify that all threads got unblocked
   for (size_t i = 0; i < num_threads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }

-  if (dynamic_cast<Buffered<int> *>(ch)) {
+  if (isBuffered) {
     // Verify that only 1 send was successful
     int ct = 0;
     for (size_t i = 0; i < num_threads; i++) {

@@ -304,7 +287,7 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
 // any senders waiting for channel to have write space
 TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
   auto ch = MakeChannel<int>(1);
-  ChannelCloseUnblocksSendersTest(ch);
+  ChannelCloseUnblocksSendersTest(ch, true);
   delete ch;
 }

@@ -320,7 +303,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
 // unblocks any senders waiting for senders
 TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
   auto ch = MakeChannel<int>(0);
-  ChannelCloseUnblocksReceiversTest(ch);
+  ChannelCloseUnblocksSendersTest(ch, false);
   delete ch;
 }

@@ -342,7 +325,7 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
     ch->Receive(&recv);
     EXPECT_EQ(recv, i);
   }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.5 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
   EXPECT_EQ(sum_send, 3U);

   CloseChannel(ch);

@@ -368,7 +351,7 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
     ch->Send(&i);
     sum_send += i;
   }
-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
   EXPECT_EQ(sum_send, 10U);
   EXPECT_EQ(sum_receive, 10U);
   // send three more elements

@@ -386,7 +369,7 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
 // This tests that destroying a channel unblocks
 // any senders waiting for channel to have write space
-void ChannelDestroyUnblockSenders(Channel<int> *ch) {
+void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
   size_t num_threads = 5;
   std::thread t[num_threads];
   bool thread_ended[num_threads];

@@ -405,11 +388,9 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch) {
         &thread_ended[i], &send_success[i]);
   }

-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
-  bool is_buffered_channel = false;
-  if (dynamic_cast<Buffered<int> *>(ch)) is_buffered_channel = true;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec

-  if (is_buffered_channel) {
+  if (isBuffered) {
     // If channel is buffered, verify that atleast 4 threads are blocked
     int ct = 0;
     for (size_t i = 0; i < num_threads; i++) {

@@ -432,13 +413,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch) {
     EXPECT_EQ(thread_ended[i], true);
   }

-  // Count number of successfuld sends
+  // Count number of successful sends
   int ct = 0;
   for (size_t i = 0; i < num_threads; i++) {
     if (send_success[i]) ct++;
   }

-  if (is_buffered_channel) {
+  if (isBuffered) {
     // Only 1 send must be successful
     EXPECT_EQ(ct, 1);
   } else {

@@ -495,7 +476,7 @@ TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
 TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) {
   size_t buffer_size = 1;
   auto ch = MakeChannel<int>(buffer_size);
-  ChannelDestroyUnblockSenders(ch);
+  ChannelDestroyUnblockSenders(ch, true);
 }

 // This tests that destroying an unbuffered channel also unblocks

@@ -507,7 +488,20 @@ TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) {
 TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) {
   auto ch = MakeChannel<int>(0);
-  ChannelDestroyUnblockSenders(ch);
+  ChannelDestroyUnblockSenders(ch, false);
+}
+
+TEST(ChannelHolder, ChannelHolderCapacityTest) {
+  const size_t buffer_size = 10;
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(buffer_size);
+  EXPECT_EQ(ch->Cap(), buffer_size);
+  delete ch;
+
+  ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  EXPECT_EQ(ch->Cap(), 0U);
+  delete ch;
 }

 void ChannelHolderSendReceive(ChannelHolder *ch) {

@@ -641,7 +635,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
         &thread_ended[i]);
   }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec

   // Verify that all the threads are blocked
   for (size_t i = 0; i < num_threads; i++) {

@@ -652,7 +646,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
   // This should unblock all receivers
   ch->close();
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec

   // Verify that all threads got unblocked
   for (size_t i = 0; i < num_threads; i++) {

@@ -663,9 +657,6 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
 }

 void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
-  using paddle::framework::details::Buffered;
-  using paddle::framework::details::UnBuffered;
-
   size_t num_threads = 5;
   std::thread t[num_threads];
   bool thread_ended[num_threads];

@@ -683,7 +674,7 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
         &thread_ended[i], &send_success[i]);
   }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait

   if (isBuffered) {
     // If ch is Buffered, atleast 4 threads must be blocked.

@@ -702,7 +693,7 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
   // This should unblock all senders
   ch->close();
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait

   // Verify that all threads got unblocked
   for (size_t i = 0; i < num_threads; i++) {

@@ -775,7 +766,7 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
         &thread_ended[i], &send_success[i]);
   }
-  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec

   if (isBuffered) {
     // If channel is buffered, verify that atleast 4 threads are blocked
     int ct = 0;

@@ -836,7 +827,7 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
         &thread_ended[i]);
   }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait

   // Verify that all threads are blocked
   for (size_t i = 0; i < num_threads; i++) {
paddle/fluid/framework/data_transform.cc

@@ -42,6 +42,7 @@ void DataTransform(const OpKernelType& expected_kernel_type,
     PassTensorData(&out, &in);
   }

+  // do data type transform
   if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) {
     TransDataType(kernel_type_for_var, expected_kernel_type, in, &out);
     transformed = true;
paddle/fluid/framework/data_type.h

@@ -16,13 +16,16 @@ limitations under the License. */
 #include <typeindex>
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"

 namespace paddle {
 namespace framework {

 inline proto::VarType::Type ToDataType(std::type_index type) {
   using namespace paddle::framework::proto;
-  if (typeid(float).hash_code() == type.hash_code()) {
+  if (typeid(platform::float16).hash_code() == type.hash_code()) {
+    return proto::VarType::FP16;
+  } else if (typeid(float).hash_code() == type.hash_code()) {
     return proto::VarType::FP32;
   } else if (typeid(double).hash_code() == type.hash_code()) {
     return proto::VarType::FP64;

@@ -40,6 +43,8 @@ inline proto::VarType::Type ToDataType(std::type_index type) {
 inline std::type_index ToTypeIndex(proto::VarType::Type type) {
   using namespace paddle::framework::proto;
   switch (type) {
+    case proto::VarType::FP16:
+      return typeid(platform::float16);
     case proto::VarType::FP32:
       return typeid(float);
     case proto::VarType::FP64:

@@ -59,6 +64,9 @@ template <typename Visitor>
 inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
   using namespace paddle::framework::proto;
   switch (type) {
+    case proto::VarType::FP16:
+      visitor.template operator()<platform::float16>();
+      break;
     case proto::VarType::FP32:
       visitor.template operator()<float>();
       break;
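
VisitDataType dispatches a runtime proto::VarType::Type to a compile-time template argument: the visitor is any functor whose call operator is templated on the element type, and this commit adds the float16 case to that switch. A hedged sketch of such a visitor (PrintSizeVisitor is a made-up name used only for illustration; CastDataType in data_type_transform.cc is the real in-tree user):

#include <iostream>

// Functor shape expected by VisitDataType: a templated operator() that the
// switch statement instantiates with float16, float, double, int, ...
struct PrintSizeVisitor {
  template <typename T>
  void operator()() {
    std::cout << "element size: " << sizeof(T) << " bytes\n";
  }
};

// Usage inside paddle::framework (prints 2 for FP16 now that the float16
// case exists in the switch):
//   VisitDataType(proto::VarType::FP16, PrintSizeVisitor());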
paddle/fluid/framework/data_type_transform.cc

@@ -47,9 +47,15 @@ struct CastDataType {
       auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
       trans(*context, in_begin, in_end, out_begin,
             CastDataTypeFunctor<InType, OutType>());
+#ifdef __NVCC__
+    } else if (platform::is_gpu_place(in_.place())) {
+      platform::Transform<platform::CUDADeviceContext> trans;
+      auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
+      trans(*context, in_begin, in_end, out_begin,
+            CastDataTypeFunctor<InType, OutType>());
+#endif
     } else {
       // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type?
-      PADDLE_THROW("Unsupport CPU <-> GPU!");
+      PADDLE_THROW("Unsupported place!");
     }
   }
 };

@@ -65,6 +71,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
   auto ctx = pool.Get(in.place());

   switch (src_type) {
+    case proto::VarType::FP16:
+      framework::VisitDataType(dst_type,
+                               CastDataType<platform::float16>(in, out, ctx));
+      break;
     case proto::VarType::FP32:
       framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
       break;
paddle/fluid/framework/data_type_transform.cu  (new symlink, mode 120000)

data_type_transform.cc
\ No newline at end of file
paddle/fluid/framework/data_type_transform_test.cc

@@ -22,32 +22,145 @@ TEST(DataTypeTransform, CPUTransform) {
   auto place = CPUPlace();

-  Tensor in;
-  Tensor out;
-
-  float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
-  int data_number = 2 * 3;
-
-  for (int i = 0; i < data_number; ++i) {
-    ptr[i] = i / 3;
-  }
-
+  auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
   auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place,
                                   DataLayout::kAnyLayout, LibraryType::kPlain);
   auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place,
                                   DataLayout::kAnyLayout, LibraryType::kPlain);
   auto kernel_int32 = OpKernelType(proto::VarType::INT32, place,
                                    DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_int64 = OpKernelType(proto::VarType::INT64, place,
+                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_bool = OpKernelType(proto::VarType::BOOL, place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);

-  TransDataType(kernel_fp32, kernel_fp64, in, &out);
-  double* out_data_double = out.data<double>();
-  for (int i = 0; i < data_number; ++i) {
-    ASSERT_EQ(out_data_double[i], static_cast<double>(i / 3));
+  // data type transform from float32
+  {
+    Tensor in;
+    Tensor out;
+
+    float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
+    int data_number = 2 * 3;
+
+    for (int i = 0; i < data_number; ++i) ptr[i] = i / 3;
+
+    TransDataType(kernel_fp32, kernel_fp64, in, &out);
+    double* out_data_double = out.data<double>();
+    for (int i = 0; i < data_number; ++i)
+      ASSERT_EQ(out_data_double[i], static_cast<double>(i / 3));
+
+    TransDataType(kernel_fp32, kernel_int32, in, &out);
+    int* out_data_int = out.data<int>();
+    for (int i = 0; i < data_number; ++i)
+      ASSERT_EQ(out_data_int[i], static_cast<int>(i / 3));
   }

-  TransDataType(kernel_fp32, kernel_int32, in, &out);
-  int* out_data_int = out.data<int>();
-  for (int i = 0; i < data_number; ++i) {
-    ASSERT_EQ(out_data_int[i], static_cast<int>(i / 3));
+  // data type transform from/to float16
+  {
+    Tensor in;
+    Tensor out;
+
+    float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), place);
+    int data_number = 2 * 3;
+
+    for (int i = 0; i < data_number; ++i) ptr[i] = i;
+
+    // transform from float16 to other data types
+    TransDataType(kernel_fp16, kernel_fp32, in, &out);
+    float* out_data_float = out.data<float>();
+    for (int i = 0; i < data_number; ++i)
+      ASSERT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
+
+    TransDataType(kernel_fp16, kernel_fp64, in, &out);
+    double* out_data_double = out.data<double>();
+    for (int i = 0; i < data_number; ++i)
+      ASSERT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
+
+    TransDataType(kernel_fp16, kernel_int32, in, &out);
+    int* out_data_int = out.data<int>();
+    for (int i = 0; i < data_number; ++i)
+      ASSERT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
+
+    TransDataType(kernel_fp16, kernel_int64, in, &out);
+    int64_t* out_data_int64 = out.data<int64_t>();
+    for (int i = 0; i < data_number; ++i)
+      ASSERT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
+
+    TransDataType(kernel_fp16, kernel_bool, in, &out);
+    bool* out_data_bool = out.data<bool>();
+    for (int i = 0; i < data_number; ++i)
+      ASSERT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
+
+    // transform float to float16
+    float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) in_data_float[i] = i;
+
+    TransDataType(kernel_fp32, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i)
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
+
+    // transform double to float16
+    double* in_data_double = in.mutable_data<double>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) in_data_double[i] = i;
+
+    TransDataType(kernel_fp64, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i)
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
+
+    // transform int to float16
+    int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) in_data_int[i] = i;
+
+    TransDataType(kernel_int32, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i)
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
+
+    // transform int64 to float16
+    int64_t* in_data_int64 = in.mutable_data<int64_t>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) in_data_int64[i] = i;
+
+    TransDataType(kernel_int64, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i)
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
+
+    // transform bool to float16
+    bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), place);
+    for (int i = 0; i < data_number; ++i) in_data_bool[i] = i;
+
+    TransDataType(kernel_bool, kernel_fp16, in, &out);
+    ptr = out.data<float16>();
+    for (int i = 0; i < data_number; ++i)
+      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
   }
 }
paddle/fluid/framework/data_type_transform_test.cu  (new file, mode 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   Licensed under the Apache License, Version 2.0 (the "License"). */

#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/tensor_util.h"

#include "gtest/gtest.h"

TEST(DataTypeTransform, GPUTransform) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  auto cpu_place = CPUPlace();
  auto gpu_place = CUDAPlace(0);
  CUDADeviceContext context(gpu_place);

  auto kernel_fp16 = OpKernelType(proto::VarType::FP16, gpu_place,
                                  DataLayout::kAnyLayout, LibraryType::kPlain);
  auto kernel_fp32 = OpKernelType(proto::VarType::FP32, gpu_place,
                                  DataLayout::kAnyLayout, LibraryType::kPlain);
  auto kernel_fp64 = OpKernelType(proto::VarType::FP64, gpu_place,
                                  DataLayout::kAnyLayout, LibraryType::kPlain);
  auto kernel_int32 = OpKernelType(proto::VarType::INT32, gpu_place,
                                   DataLayout::kAnyLayout, LibraryType::kPlain);
  auto kernel_int64 = OpKernelType(proto::VarType::INT64, gpu_place,
                                   DataLayout::kAnyLayout, LibraryType::kPlain);
  auto kernel_bool = OpKernelType(proto::VarType::BOOL, gpu_place,
                                  DataLayout::kAnyLayout, LibraryType::kPlain);

  // data type transform from float32
  {
    Tensor in;
    Tensor in_gpu;
    Tensor out_gpu;
    Tensor out;

    float* in_ptr = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
    float arr[6] = {0, 1, 2, 3, 4, 5};
    int data_number = sizeof(arr) / sizeof(arr[0]);
    memcpy(in_ptr, arr, sizeof(arr));
    TensorCopy(in, gpu_place, context, &in_gpu);

    TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();

    double* out_data_double = out.data<double>();
    for (int i = 0; i < data_number; ++i)
      ASSERT_EQ(out_data_double[i], static_cast<double>(arr[i]));

    TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();

    int* out_data_int = out.data<int>();
    for (int i = 0; i < data_number; ++i)
      ASSERT_EQ(out_data_int[i], static_cast<int>(arr[i]));
  }

  // data type transform from/to float16
  {
    Tensor in;
    Tensor in_gpu;
    Tensor out_gpu;
    Tensor out;

    float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), cpu_place);
    float16 arr[6] = {float16(0), float16(1), float16(2),
                      float16(3), float16(4), float16(5)};
    int data_number = sizeof(arr) / sizeof(arr[0]);
    memcpy(ptr, arr, sizeof(arr));
    TensorCopy(in, gpu_place, context, &in_gpu);

    // transform from float16 to other data types
    TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    float* out_data_float = out.data<float>();
    for (int i = 0; i < data_number; ++i)
      ASSERT_EQ(out_data_float[i], static_cast<float>(ptr[i]));

    TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    double* out_data_double = out.data<double>();
    for (int i = 0; i < data_number; ++i)
      ASSERT_EQ(out_data_double[i], static_cast<double>(ptr[i]));

    TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    int* out_data_int = out.data<int>();
    for (int i = 0; i < data_number; ++i)
      ASSERT_EQ(out_data_int[i], static_cast<int>(ptr[i]));

    TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    int64_t* out_data_int64 = out.data<int64_t>();
    for (int i = 0; i < data_number; ++i)
      ASSERT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));

    TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    bool* out_data_bool = out.data<bool>();
    for (int i = 0; i < data_number; ++i)
      ASSERT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));

    // transform float to float16
    float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) in_data_float[i] = i;

    TensorCopy(in, gpu_place, context, &in_gpu);
    TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i)
      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);

    // transform double to float16
    double* in_data_double =
        in.mutable_data<double>(make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) in_data_double[i] = i;

    TensorCopy(in, gpu_place, context, &in_gpu);
    TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i)
      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);

    // transform int to float16
    int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) in_data_int[i] = i;

    TensorCopy(in, gpu_place, context, &in_gpu);
    TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i)
      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);

    // transform int64 to float16
    int64_t* in_data_int64 =
        in.mutable_data<int64_t>(make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) in_data_int64[i] = i;

    TensorCopy(in, gpu_place, context, &in_gpu);
    TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i)
      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);

    // transform bool to float16
    bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), cpu_place);
    for (int i = 0; i < data_number; ++i) in_data_bool[i] = i;

    TensorCopy(in, gpu_place, context, &in_gpu);
    TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu);
    TensorCopy(out_gpu, cpu_place, context, &out);
    context.Wait();
    ptr = out.data<float16>();
    for (int i = 0; i < data_number; ++i)
      ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
  }
}
paddle/fluid/framework/details/buffered_channel.h  (deleted, mode 100644 → 0)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   Licensed under the Apache License, Version 2.0 (the "License"). */

#pragma once
#include <atomic>
#include <condition_variable>
#include <deque>
#include <mutex>

#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {
namespace details {

// Four of the properties of Buffered Channel:
// - A send to a full channel blocks temporarily until a receive from the
//   channel or the channel is closed.
// - A receive from an empty channel blocks temporarily until a send to the
//   channel or the channel is closed.
// - A send to a closed channel returns false immediately.
// - A receive from a closed channel returns false immediately.
template <typename T>
class Buffered : public paddle::framework::Channel<T> {
  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
  friend void paddle::framework::CloseChannel<T>(Channel<T>*);

 public:
  virtual bool Send(T*);
  virtual bool Receive(T*);
  virtual size_t Cap() { return cap_; }
  virtual void Close();
  virtual ~Buffered();

 private:
  size_t cap_;
  std::mutex mu_;
  std::condition_variable empty_cond_var_;
  std::condition_variable full_cond_var_;
  std::condition_variable destructor_cond_var_;
  std::deque<T> channel_;
  std::atomic<bool> closed_{false};
  std::atomic<unsigned> send_ctr{0};
  std::atomic<unsigned> recv_ctr{0};

  Buffered(size_t cap) : cap_(cap), closed_(false) {
    PADDLE_ENFORCE_GT(cap, 0);
  }

  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};

template <typename T>
bool Buffered<T>::Send(T* item) {
  bool ret = false;
  if (closed_) {
    return ret;
  }
  send_ctr++;
  std::unique_lock<std::mutex> lock(mu_);
  full_cond_var_.wait(lock,
                      [this]() { return channel_.size() < cap_ || closed_; });
  if (!closed_) {
    channel_.push_back(std::move(*item));
    lock.unlock();
    empty_cond_var_.notify_one();
    ret = true;
  }
  send_ctr--;
  destructor_cond_var_.notify_one();
  return ret;
}

template <typename T>
bool Buffered<T>::Receive(T* item) {
  bool ret = false;
  // Once the channel has been closed and all data has been consumed,
  // just return false. Don't even try acquiring the mutex.
  if (closed_ && channel_.empty()) {
    return false;
  }
  recv_ctr++;
  std::unique_lock<std::mutex> lock(mu_);
  empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
  if (!channel_.empty()) {
    *item = std::move(channel_.front());
    channel_.pop_front();
    full_cond_var_.notify_one();
    ret = true;
  }
  recv_ctr--;
  destructor_cond_var_.notify_one();
  return ret;
}

template <typename T>
void Buffered<T>::Close() {
  if (closed_) {
    return;
  }
  std::unique_lock<std::mutex> lock(mu_);
  closed_ = true;
  NotifyAllParticipants(&lock);
}

template <typename T>
Buffered<T>::~Buffered() {
  std::unique_lock<std::mutex> lock(mu_);
  closed_ = true;
  channel_.clear();
  NotifyAllParticipants(&lock);

  // The destructor must wait for all readers and writers to complete their task
  // The channel has been closed, so we will not accept new readers and writers
  lock.lock();
  destructor_cond_var_.wait(
      lock, [this]() { return send_ctr == 0 && recv_ctr == 0; });
}

template <typename T>
void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
  lock->unlock();
  full_cond_var_.notify_all();
  empty_cond_var_.notify_all();
}

}  // namespace details
}  // namespace framework
}  // namespace paddle
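
The four properties listed in the deleted header also describe the replacement ChannelImpl; in particular, closing a channel unblocks parties that are already waiting. A standalone sketch of that behaviour, mirroring ChannelCloseUnblocksReceiversTest from channel_test.cc (the CloseUnblocksDemo name is illustrative, not from the diff):

#include <thread>
#include "paddle/fluid/framework/channel.h"

void CloseUnblocksDemo() {
  auto* ch = paddle::framework::MakeChannel<int>(0);  // unbuffered
  bool receive_ok = true;
  std::thread receiver([&]() {
    int value = 0;
    receive_ok = ch->Receive(&value);  // blocks: no sender yet
  });
  paddle::framework::CloseChannel(ch);  // unblocks the receiver...
  receiver.join();
  // ...and the interrupted Receive reports failure (receive_ok == false).
  delete ch;
}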
paddle/fluid/framework/details/unbuffered_channel.h
已删除
100644 → 0
浏览文件 @
26922a3b
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <condition_variable>
#include <mutex>
#include "paddle/fluid/framework/channel.h"
namespace paddle {
namespace framework {
namespace details {

// Four of the properties of UnBuffered Channel:
// - A send to a channel blocks temporarily until a receive from the
//   channel or the channel is closed.
// - A receive from a channel blocks temporarily until a send to the
//   channel or the channel is closed.
// - A send to a closed channel returns false immediately.
// - A receive from a closed channel returns false immediately.
template <typename T>
class UnBuffered : public paddle::framework::Channel<T> {
  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
  friend void paddle::framework::CloseChannel<T>(Channel<T>*);

 public:
  virtual bool Send(T*);
  virtual bool Receive(T*);
  virtual size_t Cap() { return 0; }
  virtual void Close();
  virtual ~UnBuffered();

 private:
  std::mutex mu_ch_;
  // Mutex for readers and writers who are waiting for other reader
  // and writer to complete execution
  std::recursive_mutex mu_read_, mu_write_;
  // reader_found_ is set true when a reader is ready to accept data
  // writer_found_ is set true when a writer is ready to send data
  // A transaction occurs only when both are true
  std::atomic<bool> reader_found_{false}, writer_found_{false};
  std::condition_variable cv_channel_;
  std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_;
  T* item{nullptr};
  std::atomic<bool> closed_{false};
  std::atomic<unsigned> send_ctr{0};
  std::atomic<unsigned> recv_ctr{0};

  UnBuffered() : closed_(false) {}

  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};

// This function implements the concept of how data should
// be sent from a writer to a reader.
template <typename T>
bool UnBuffered<T>::Send(T* data) {
  bool ret = false;
  if (closed_) {
    return ret;
  }
  send_ctr++;
  // Prevent other writers from entering
  std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
  writer_found_ = true;
  std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
  // If writer comes first, it should wait till a reader arrives
  cv_writer_.wait(cv_lock, [this]() { return reader_found_ == true || closed_; });
  cv_reader_.notify_one();
  if (!closed_) {
    std::unique_lock<std::mutex> channel_lock(mu_ch_);
    item = data;
    channel_lock.unlock();
    cv_channel_.notify_one();
    channel_lock.lock();
    cv_channel_.wait(channel_lock, [this]() { return item == nullptr || closed_; });
    ret = true;
  }
  writer_found_ = false;
  send_ctr--;
  cv_destructor_.notify_one();
  return ret;
}

// This function implements the concept of how
// data that was sent by a writer is read from a reader.
template <typename T>
bool UnBuffered<T>::Receive(T* data) {
  bool ret = false;
  // If channel is closed, we don't even want any reader to enter.
  // Unlike a buffered channel, an unbuffered channel does not allow
  // readers to read after closing because there is no buffer to be consumed.
  if (closed_) return ret;
  recv_ctr++;
  // Prevent other readers from entering
  std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
  reader_found_ = true;
  std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
  // If reader comes first, it should wait till a writer arrives
  cv_reader_.wait(cv_lock, [this]() { return writer_found_ == true || closed_; });
  cv_writer_.notify_one();
  if (!closed_) {
    std::unique_lock<std::mutex> lock_ch{mu_ch_};
    // Reader should wait for the writer to first write its data
    cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
    if (!closed_) {
      *data = std::move(*item);
      item = nullptr;
      lock_ch.unlock();
      ret = true;
    }
    cv_channel_.notify_one();
  }
  reader_found_ = false;
  recv_ctr--;
  cv_destructor_.notify_one();
  return ret;
}

// This function implements the sequence of events
// that take place once the channel is closed.
template <typename T>
void UnBuffered<T>::Close() {
  if (closed_) {
    return;
  }
  std::unique_lock<std::mutex> lock(mu_ch_);
  item = nullptr;
  closed_ = true;
  NotifyAllParticipants(&lock);
}

// This function implements the sequence of events
// that are executed once the object of an UnBuffered
// channel is destroyed.
template <typename T>
UnBuffered<T>::~UnBuffered() {
  std::unique_lock<std::mutex> lock(mu_ch_);
  item = nullptr;
  closed_ = true;
  NotifyAllParticipants(&lock);
  lock.lock();
  cv_destructor_.wait(lock, [this]() { return send_ctr == 0 && recv_ctr == 0; });
}

// This function notifies all the readers, writers and
// the channel condition variables.
template <typename T>
void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
  lock->unlock();
  cv_writer_.notify_all();
  cv_channel_.notify_all();
  cv_reader_.notify_all();
}

}  // namespace details
}  // namespace framework
}  // namespace paddle
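As a reading aid, here is a minimal usage sketch of the unbuffered channel above. It relies only on the MakeChannel/CloseChannel friends declared in this header and on Send/Receive; the element type, the thread bodies, and the assumption that a capacity of 0 selects the UnBuffered specialization are illustrative, not taken from this commit.

// Sketch only: exercises the Send/Receive hand-off of the UnBuffered channel.
#include <thread>
#include "paddle/fluid/framework/channel.h"

void UnbufferedChannelDemo() {
  using paddle::framework::Channel;
  // Assumption: a capacity of 0 makes MakeChannel return an UnBuffered channel.
  Channel<int>* ch = paddle::framework::MakeChannel<int>(0);

  std::thread writer([ch] {
    int v = 42;
    ch->Send(&v);  // blocks until a receiver shows up or the channel closes
  });
  std::thread reader([ch] {
    int received = 0;
    if (ch->Receive(&received)) {
      // received == 42: the send and the receive completed as one transaction.
    }
  });

  writer.join();
  reader.join();
  paddle::framework::CloseChannel(ch);
  delete ch;
}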
paddle/fluid/framework/reader.cc
@@ -25,92 +25,5 @@ DDim ReaderBase::shape(size_t idx) const {
  return shapes_[idx];
}

void ShuffleReader::ReadNext(std::vector<LoDTensor>* out) {
  if (iteration_pos_ >= buffer_.size()) {
    // Reload buffer with new data
    buffer_.clear();
    buffer_.reserve(buffer_size_);
    for (int i = 0; i < buffer_size_; ++i) {
      if (reader_->HasNext()) {
        buffer_.push_back(std::vector<LoDTensor>());
        reader_->ReadNext(&buffer_.back());
      } else {
        break;
      }
    }
    // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
    // optimized.
    std::random_shuffle(buffer_.begin(), buffer_.end());
    iteration_pos_ = 0;
  }
  out->clear();
  if (!buffer_.empty()) {
    std::swap(*out, buffer_[iteration_pos_++]);
  }
  // if buffer_ is empty, the 'out' will return as an empty vector.
}

void BatchReader::ReadNext(std::vector<LoDTensor>* out) {
  buffer_.clear();
  buffer_.reserve(batch_size_);
  for (int i = 0; i < batch_size_; ++i) {
    if (reader_->HasNext()) {
      buffer_.push_back(std::vector<LoDTensor>());
      reader_->ReadNext(&buffer_.back());
    } else {
      break;
    }
  }
  // Concat instances
  out->clear();
  if (buffer_.empty()) {
    // if buffer_ is empty, the 'out' will return as an empty vector.
    return;
  }
  int out_num = buffer_[0].size();
  out->reserve(out_num);
  for (int j = 0; j < out_num; ++j) {
    // Merge shape and check data type
    std::type_index batch_type = buffer_[0][j].type();
    DDim batch_shape = buffer_[0][j].dims();
    for (size_t i = 1; i < buffer_.size(); ++i) {
      std::type_index ins_type = buffer_[i][j].type();
      DDim ins_shape = buffer_[i][j].dims();
      PADDLE_ENFORCE_EQ(batch_type, ins_type);
      PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
                        slice_ddim(ins_shape, 1, ins_shape.size()));
      PADDLE_ENFORCE_GT(ins_shape[0], 0);
      batch_shape[0] += ins_shape[0];
    }

    LoDTensor out_tensor;
    out_tensor.Resize(batch_shape);
    out_tensor.mutable_data(platform::CPUPlace(), batch_type);
    int64_t dst_offset = 0;

    // Merge lod and data
    LoD batch_lod;
    for (size_t i = 0; i < buffer_.size(); ++i) {
      DDim ins_shape = buffer_[i][j].dims();
      LoD ins_lod = buffer_[i][j].lod();
      if (i == 0) {
        batch_lod = ins_lod;
      } else {
        PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
        for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
          auto& lod_level = batch_lod[level_idx];
          for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
            lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
          }
        }
      }
      Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
      TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
      dst_offset += ins_shape[0];
    }
    out_tensor.set_lod(batch_lod);
    out->push_back(out_tensor);
  }
}

}  // namespace framework
}  // namespace paddle
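A short sketch of how the readers above are meant to compose, assuming the RandomDataGenerator, ShuffleReader, and BatchReader declared in reader.h below; the shape, buffer size, and batch size are arbitrary, and ownership/cleanup of the raw pointers is elided.

// Sketch only: chain a random source through shuffle and batch decorators.
#include <vector>
#include "paddle/fluid/framework/reader.h"

void ReadOneBatch() {
  using namespace paddle::framework;
  // Source yielding random [4, 3] float tensors in (0, 1).
  auto* source = new RandomDataGenerator<float>({make_ddim({4, 3})}, 0.0f, 1.0f);
  auto* shuffled = new ShuffleReader(source, /*buffer_size=*/64);
  auto* batched = new BatchReader(shuffled, /*batch_size=*/16);

  std::vector<LoDTensor> batch;
  batched->ReadNext(&batch);
  // batch[0] should be the 16 instances concatenated along dim 0, i.e. [64, 3],
  // following the shape merging done in BatchReader::ReadNext above.
}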
paddle/fluid/framework/reader.h
@@ -60,83 +60,8 @@ class DecoratedReader : public ReaderBase {
  ReaderBase* reader_;
};

// file readers

template <typename T>
class RandomDataGenerator : public FileReader {
 public:
  RandomDataGenerator(const std::vector<DDim>& shapes, float min, float max)
      : FileReader(shapes), min_(min), max_(max) {
    PADDLE_ENFORCE_LE(min, max,
                      "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
    unsigned int seed = std::random_device()();
    engine_.seed(seed);
    dist_ = std::uniform_real_distribution<float>(min_, max_);
  }

  void ReadNext(std::vector<LoDTensor>* out) override {
    out->clear();
    out->reserve(shapes_.size());
    for (const DDim& shape : shapes_) {
      PADDLE_ENFORCE_GE(
          shape.size(), 2,
          "The rank of reader's output data should be 2 at least.(Now it's %d)",
          shape.size());
      LoDTensor out_tensor;
      out_tensor.Resize(shape);
      T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
      int64_t numel = product(shape);
      for (int64_t i = 0; i < numel; ++i) {
        data[i] = dist_(engine_);
      }
      out->push_back(out_tensor);
    }
  }

  bool HasNext() const override { return true; }

  void ReInit() override { return; }

 private:
  float min_;
  float max_;
  std::minstd_rand engine_;
  std::uniform_real_distribution<float> dist_;
};

// decorated readers

class ShuffleReader : public DecoratedReader {
 public:
  ShuffleReader(ReaderBase* reader, int buffer_size)
      : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
    buffer_.reserve(buffer_size);
  }

  void ReadNext(std::vector<LoDTensor>* out) override;

 private:
  int buffer_size_;
  std::vector<std::vector<LoDTensor>> buffer_;
  size_t iteration_pos_;
};

class BatchReader : public DecoratedReader {
 public:
  BatchReader(ReaderBase* reader, int batch_size)
      : DecoratedReader(reader), batch_size_(batch_size) {
    buffer_.reserve(batch_size_);
  }

  void ReadNext(std::vector<LoDTensor>* out) override;

 private:
  int batch_size_;
  std::vector<std::vector<LoDTensor>> buffer_;
};

// The ReaderHolder is used as readers' unified wrapper,
// making it easier to access different type readers in Variables.
class ReaderHolder {
 public:
  void Reset(ReaderBase* reader) { reader_.reset(reader); }
paddle/fluid/framework/tensor_util_test.cc
@@ -235,27 +235,53 @@ TEST(TensorToVector, Tensor) {
TEST(TensorContainsNAN, CPU) {
  using namespace paddle::framework;
  using namespace paddle::platform;
  {
    Tensor src;
    float* buf = src.mutable_data<float>({3}, CPUPlace());
    buf[0] = 0.0;
    buf[1] = NAN;
    buf[2] = 0.0;
    ASSERT_TRUE(TensorContainsNAN(src));
    buf[1] = 0.0;
    ASSERT_FALSE(TensorContainsNAN(src));
  }
  {
    Tensor src;
    float16* buf = src.mutable_data<float16>({3}, CPUPlace());
    buf[0] = 0.0;
    buf[1].x = 0x7fff;
    buf[2] = 0.0;
    ASSERT_TRUE(TensorContainsNAN(src));
    buf[1] = 0.0;
    ASSERT_FALSE(TensorContainsNAN(src));
  }
}

TEST(TensorContainsInf, CPU) {
  using namespace paddle::framework;
  using namespace paddle::platform;
  {
    Tensor src;
    double* buf = src.mutable_data<double>({3}, CPUPlace());
    buf[0] = 1.0;
    buf[1] = INFINITY;
    buf[2] = 0.0;
    ASSERT_TRUE(TensorContainsInf(src));
    buf[1] = 1.0;
    ASSERT_FALSE(TensorContainsInf(src));
  }
  {
    Tensor src;
    float16* buf = src.mutable_data<float16>({3}, CPUPlace());
    buf[0] = 1.0;
    buf[1].x = 0x7c00;
    buf[2] = 0.0;
    ASSERT_TRUE(TensorContainsInf(src));
    buf[1] = 1.0;
    ASSERT_FALSE(TensorContainsInf(src));
  }
}

TEST(Tensor, FromAndToStream) {
paddle/fluid/framework/tensor_util_test.cu
@@ -25,32 +25,65 @@ static __global__ void FillNAN(float* buf) {
  buf[1] = 0.1;
  buf[2] = NAN;
}

static __global__ void FillInf(float* buf) {
  buf[0] = 0.0;
  buf[1] = INFINITY;
  buf[2] = 0.5;
}

static __global__ void FillNAN(platform::float16* buf) {
  buf[0] = 0.0;
  buf[1] = 0.1;
  buf[2].x = 0x7fff;
}

static __global__ void FillInf(platform::float16* buf) {
  buf[0] = 0.0;
  buf[1].x = 0x7c00;
  buf[2] = 0.5;
}

TEST(TensorContainsNAN, GPU) {
  using namespace paddle::platform;
  CUDAPlace gpu(0);
  auto& pool = DeviceContextPool::Instance();
  auto* cuda_ctx = pool.GetByPlace(gpu);
  {
    Tensor tensor;
    float* buf = tensor.mutable_data<float>({3}, gpu);
    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
    cuda_ctx->Wait();
    ASSERT_TRUE(TensorContainsNAN(tensor));
  }
  {
    Tensor tensor;
    float16* buf = tensor.mutable_data<float16>({3}, gpu);
    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
    cuda_ctx->Wait();
    ASSERT_TRUE(TensorContainsNAN(tensor));
  }
}

TEST(TensorContainsInf, GPU) {
  using namespace paddle::platform;
  CUDAPlace gpu(0);
  auto& pool = DeviceContextPool::Instance();
  auto* cuda_ctx = pool.GetByPlace(gpu);
  {
    Tensor tensor;
    float* buf = tensor.mutable_data<float>({3}, gpu);
    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
    cuda_ctx->Wait();
    ASSERT_TRUE(TensorContainsInf(tensor));
  }
  {
    Tensor tensor;
    float16* buf = tensor.mutable_data<float16>({3}, gpu);
    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
    cuda_ctx->Wait();
    ASSERT_TRUE(TensorContainsInf(tensor));
  }
}

}  // namespace framework
paddle/fluid/operators/CMakeLists.txt
file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
list(REMOVE_DUPLICATES GENERAL_OPS)
set(DEPS_OPS "")
set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
@@ -13,6 +15,8 @@ function(op_library TARGET)
  set(cu_cc_srcs)
  set(cudnn_cu_cc_srcs)
  set(CUDNN_FILE)
  set(mkldnn_cc_srcs)
  set(MKLDNN_FILE)
  set(op_common_deps operator op_registry math_function)
  set(options "")
  set(oneValueArgs "")
@@ -36,12 +40,20 @@ function(op_library TARGET)
    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
      list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
    endif()
    if(WITH_MKLDNN)
      string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
        list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
      endif()
    endif()
  else()
    foreach(src ${op_library_SRCS})
      if (${src} MATCHES ".*\\.cu$")
        list(APPEND cu_srcs ${src})
      elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
        list(APPEND cudnn_cu_cc_srcs ${src})
      elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
        list(APPEND mkldnn_cc_srcs ${src})
      elseif(${src} MATCHES ".*\\.cu.cc$")
        list(APPEND cu_cc_srcs ${src})
      elseif(${src} MATCHES ".*\\.cc$")
@@ -62,15 +74,15 @@ function(op_library TARGET)
    set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
  endif()

  if (WITH_GPU)
    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
      ${op_common_deps})
  else()
    cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
      ${op_common_deps})
  endif()

  # Define operators that don't need pybind here.
  foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
    if ("${TARGET}" STREQUAL "${manual_pybind_op}")
      set(pybind_flag 1)
    endif()
@@ -101,7 +113,8 @@ function(op_library TARGET)
# pybind USE_CPU_ONLY_OP
  list(LENGTH cu_srcs cu_srcs_len)
  list(LENGTH cu_cc_srcs cu_cc_srcs_len)
  list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
  if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND
      ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
    file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
    set(pybind_flag 1)
  endif()
@@ -112,6 +125,11 @@ function(op_library TARGET)
    file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
  endif()

  # pybind USE_OP_DEVICE_KERNEL for MKLDNN
  if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
    file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
  endif()

  # pybind USE_OP
  if (${pybind_flag} EQUAL 0)
    file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
@@ -128,8 +146,8 @@ else()
  set(DEPS_OPS ${DEPS_OPS} nccl_op)
endif()

if(WITH_DISTRIBUTE)
  add_subdirectory(detail)
  set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  op_library(send_op DEPS ${DISTRIBUTE_DEPS})
@@ -170,7 +188,6 @@ op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
op_library(cos_sim_op DEPS cos_sim_functor)
op_library(parallel_do_op DEPS executor)
if (WITH_GPU)
  op_library(conv_op DEPS vol2col depthwise_conv)
@@ -184,12 +201,18 @@ op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor)
op_library(concat_op DEPS concat_functor)

list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
    op_library(${src})
endforeach()

file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")

add_subdirectory(reader)
foreach(src ${READER_LIBRARY})
    set(OP_LIBRARY ${src} ${OP_LIBRARY})
endforeach()

set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
paddle/fluid/operators/concat_op.cc
@@ -100,7 +100,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
namespace ops = paddle::operators;
REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
               ops::ConcatOpGrad, false)
REGISTER_OP_CPU_KERNEL(
    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>)
REGISTER_OP_CPU_KERNEL(
    concat_grad,
    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>)
paddle/fluid/operators/concat_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/operators/strided_memcpy.h"
namespace paddle {
@@ -27,54 +28,30 @@ class ConcatKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto ins = ctx.MultiInput<framework::Tensor>("X");
    framework::Tensor* out = ctx.Output<framework::Tensor>("Out");
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
    auto place = ctx.GetPlace();
    out->mutable_data<T>(place);

    // Sometimes direct copies will be faster; this may need deeper analysis.
    if (axis == 0 && ins.size() < 10) {
      size_t output_offset = 0;
      for (auto* in : ins) {
        auto in_stride = framework::stride_numel(in->dims());
        auto out_stride = framework::stride_numel(out->dims());
        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
                                    out->data<T>() + output_offset, out_stride,
                                    in->data<T>(), in_stride, in_stride[axis]);
        output_offset += in_stride[axis];
      }
    } else {
      std::vector<framework::Tensor> inputs(ins.size());
      for (size_t j = 0; j < ins.size(); ++j) {
        inputs[j] = *ins[j];
      }
      auto& dev_ctx = ctx.template device_context<DeviceContext>();
      paddle::operators::math::ConcatFunctor<DeviceContext, T> concat_functor;
      concat_functor(dev_ctx, inputs, static_cast<int>(axis), out);
    }
  }
};
@@ -86,16 +63,31 @@ class ConcatGradKernel : public framework::OpKernel<T> {
    auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));

    // Sometimes direct copies will be faster; this may need deeper analysis.
    if (axis == 0 && outs.size() < 10) {
      size_t input_offset = 0;
      auto in_stride = framework::stride_numel(in->dims());
      for (auto& out : outs) {
        out->mutable_data<T>(ctx.GetPlace());
        auto out_stride = framework::stride_numel(out->dims());
        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
                                    out_stride, in->data<T>() + input_offset,
                                    in_stride, out_stride[axis]);
        input_offset += out_stride[axis];
      }
    } else {
      std::vector<framework::Tensor> outputs(outs.size());
      for (size_t j = 0; j < outs.size(); ++j) {
        outs[j]->mutable_data<T>(ctx.GetPlace());
        outputs[j] = *outs[j];
      }
      auto& dev_ctx = ctx.template device_context<DeviceContext>();
      paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
          concat_grad_functor;
      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), outputs);
    }
  }
};
paddle/fluid/operators/conv_mkldnn_op.cc
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {

using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc;

using mkldnn::memory;  // Note: paddle has also "memory" namespace
using mkldnn::primitive;
using mkldnn::convolution_forward;
using mkldnn::convolution_backward_weights;
using mkldnn::convolution_backward_data;
using mkldnn::convolution_direct;
using mkldnn::prop_kind;
using mkldnn::padding_kind;
using mkldnn::stream;

namespace {
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
                     const memory::desc& dst, const std::vector<int>& strides,
                     const std::vector<int>& paddings,
                     const mkldnn::engine& engine);

convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
    const memory::desc& src, const memory::desc& diff_weights,
    const memory::desc& diff_dst, const std::vector<int>& strides,
    const std::vector<int>& paddings,
    const convolution_forward::primitive_desc& conv_pd,
    const mkldnn::engine& engine);

convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
    const memory::desc& diff_src, const memory::desc& weights,
    const memory::desc& diff_dst, const std::vector<int>& strides,
    const std::vector<int>& paddings,
    const convolution_forward::primitive_desc& conv_pd,
    const mkldnn::engine& engine);
}  // anonymous namespace

template <typename T>
class ConvOpMkldnnKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");

    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();

    auto* input = ctx.Input<Tensor>("Input");
    auto* filter = ctx.Input<Tensor>("Filter");
    auto* output = ctx.Output<Tensor>("Output");

    // Get an unique name from "argument" name of "Output" variable
    // This name will be used as key when saving info into device context
    const std::string key = ctx.op().Output("Output");
    const std::string key_conv_pd = key + "@conv_pd";

    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
    int groups = ctx.Attr<int>("groups");

    // TODO(pzelazko-intel) add support for group convolution and dilation
    PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
    PADDLE_ENFORCE(
        dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
        "dilation in convolution is not implemented yet");

    const T* input_data = input->data<T>();
    const T* filter_data = filter->data<T>();
    // allocate memory for output
    T* output_data = output->mutable_data<T>(ctx.GetPlace());

    PADDLE_ENFORCE(input->dims().size() == 4,
                   "Input must be with 4 dimensions, i.e. NCHW");
    PADDLE_ENFORCE(filter->dims().size() == 4,
                   "Filter must be with 4 dimensions, i.e. OIHW");

    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
    std::vector<int> weights_tz =
        paddle::framework::vectorize2int(filter->dims());
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());

    // TODO(pzelazko-intel): support more formats
    // memory descriptors for convolution src/weight/dst
    auto conv_src_md =
        MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
    auto conv_weights_md =
        MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
    auto conv_dst_md =
        MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);

    // create memory primitives
    auto conv_src_memory =
        memory({conv_src_md, mkldnn_engine}, (void*)input_data);
    auto conv_weights_memory =
        memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);
    auto conv_dst_memory = memory({conv_dst_md, mkldnn_engine}, output_data);

    std::unique_ptr<convolution_forward::primitive_desc> conv_pd =
        ConvFwdPrimitiveDesc(conv_src_md, conv_weights_md, conv_dst_md, strides,
                             paddings, mkldnn_engine);

    // save p_conv_pd into dev_ctx to be referred in backward path
    auto p_conv_pd = conv_pd.get();
    std::shared_ptr<void> conv_pd_value = std::move(conv_pd);
    dev_ctx.SetBlob(key_conv_pd, conv_pd_value);

    // create convolution op primitive
    auto conv_prim = convolution_forward(*p_conv_pd, conv_src_memory,
                                         conv_weights_memory, conv_dst_memory);

    // push op to stream and wait MKLDNN until it's executed
    std::vector<primitive> pipeline{conv_prim};
    stream(stream::kind::eager).submit(pipeline).wait();
  }
};

template <typename T>
class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");

    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();

    const Tensor* input = ctx.Input<Tensor>("Input");
    const Tensor* filter = ctx.Input<Tensor>("Filter");
    const Tensor* output = ctx.Input<Tensor>("Output");
    const Tensor* output_grad =
        ctx.Input<Tensor>(framework::GradVarName("Output"));
    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));

    if (!input_grad && !filter_grad) return;

    // Get an unique name from "argument" name of "Output" variable
    // This name will be used as key when saving info into device context
    const std::string key = ctx.op().Input("Output");
    const std::string key_conv_pd = key + "@conv_pd";

    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");

    const T* input_data = input->data<T>();
    const T* filter_data = filter->data<T>();
    const T* output_grad_data = output_grad->data<T>();
    T* input_grad_data = nullptr;
    T* filter_grad_data = nullptr;

    // allocate memory for gradient of input/filter
    if (input_grad) {
      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
    }
    if (filter_grad) {
      filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
    }

    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
    std::vector<int> weights_tz =
        paddle::framework::vectorize2int(filter->dims());
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());

    // TODO(pzelazko-intel): support more formats
    auto conv_src_md =
        MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
    auto conv_diff_src_md =
        MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
    auto conv_weights_md =
        MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
    auto conv_diff_weights_md =
        MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
    auto conv_diff_dst_md =
        MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);

    // create memory
    auto conv_diff_dst_memory =
        memory({conv_diff_weights_md, mkldnn_engine}, (void*)output_grad_data);

    // Retrieve conv_pd from device context
    std::shared_ptr<void> conv_pd;
    convolution_forward::primitive_desc* p_conv_pd;

    conv_pd = dev_ctx.GetBlob(key_conv_pd);
    PADDLE_ENFORCE(conv_pd != nullptr,
                   "Fail to find conv_pd in device context");
    p_conv_pd =
        static_cast<convolution_forward::primitive_desc*>(conv_pd.get());

    // create backward conv primitive for weights
    if (filter_grad) {
      // create primitive descriptor
      convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
          ConvBwdWeightsPrimitiveDesc(conv_src_md, conv_diff_weights_md,
                                      conv_diff_dst_md, strides, paddings,
                                      *p_conv_pd, mkldnn_engine);

      // create memory
      auto conv_diff_weights_memory = memory(
          {conv_diff_weights_md, mkldnn_engine}, (void*)filter_grad_data);
      auto conv_src_memory =
          memory({conv_src_md, mkldnn_engine}, (void*)input_data);

      // create backward conv primitive for weights
      auto conv_bwd_weights_prim = convolution_backward_weights(
          conv_bwd_weights_pd, conv_src_memory, conv_diff_dst_memory,
          conv_diff_weights_memory);

      // push primitive and execute it
      std::vector<primitive> pipeline{conv_bwd_weights_prim};
      stream(stream::kind::eager).submit(pipeline).wait();
    }

    if (input_grad) {
      // create primitive descriptor
      convolution_backward_data::primitive_desc conv_bwd_data_pd =
          ConvBwdDataPrimitiveDesc(conv_diff_src_md, conv_weights_md,
                                   conv_diff_dst_md, strides, paddings,
                                   *p_conv_pd, mkldnn_engine);

      // create memory
      auto conv_diff_src_memory =
          memory({conv_diff_src_md, mkldnn_engine}, (void*)input_grad_data);
      auto conv_weights_memory =
          memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);

      // create backward conv primitive for data
      auto conv_bwd_data_prim =
          convolution_backward_data(conv_bwd_data_pd, conv_diff_dst_memory,
                                    conv_weights_memory, conv_diff_src_memory);

      // push primitive and execute it
      std::vector<primitive> pipeline{conv_bwd_data_prim};
      stream(stream::kind::eager).submit(pipeline).wait();
    }
  }  // Compute()
};

namespace {
std::unique_ptr<convolution_forward::primitive_desc> ConvFwdPrimitiveDesc(
    const memory::desc& src, const memory::desc& weights,
    const memory::desc& dst, const std::vector<int>& strides,
    const std::vector<int>& paddings, const mkldnn::engine& engine) {
  mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
  mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};

  auto conv_desc = mkldnn::convolution_forward::desc(
      mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights,
      dst, stride_dims, padding_dims, padding_dims,
      mkldnn::padding_kind::zero);

  auto p_conv_pd = new convolution_forward::primitive_desc(conv_desc, engine);

  return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
      p_conv_pd);
}

convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
    const memory::desc& src, const memory::desc& diff_weights,
    const memory::desc& diff_dst, const std::vector<int>& strides,
    const std::vector<int>& paddings,
    const convolution_forward::primitive_desc& conv_pd,
    const mkldnn::engine& engine) {
  auto conv_bwd_weights_desc = convolution_backward_weights::desc(
      convolution_direct, src, diff_weights, diff_dst, strides, paddings,
      paddings, padding_kind::zero);
  return convolution_backward_weights::primitive_desc(conv_bwd_weights_desc,
                                                      engine, conv_pd);
}

convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
    const memory::desc& diff_src, const memory::desc& weights,
    const memory::desc& diff_dst, const std::vector<int>& strides,
    const std::vector<int>& paddings,
    const convolution_forward::primitive_desc& conv_pd,
    const mkldnn::engine& engine) {
  auto conv_bwd_data_desc = convolution_backward_data::desc(
      convolution_direct, diff_src, weights, diff_dst, strides, paddings,
      paddings, padding_kind::zero);
  return convolution_backward_data::primitive_desc(conv_bwd_data_desc, engine,
                                                   conv_pd);
}
}  // anonymous namespace

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::ConvOpMkldnnKernel<float>);

REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::ConvGradOpMkldnnKernel<float>);
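The registrations above only make the MKLDNN kernels available; whether they are dispatched is decided by GetExpectedKernelType in conv_op.cc below. A condensed sketch of that selection order (cuDNN first, then MKLDNN, otherwise the plain kernel); the helper name ChooseConvLibrary is hypothetical, while CanCUDNNBeUsed/CanMKLDNNBeUsed come from the platform helpers included there.

// Sketch only: mirrors the library precedence used in ConvOp::GetExpectedKernelType.
framework::LibraryType ChooseConvLibrary(const framework::ExecutionContext& ctx) {
  framework::LibraryType library = framework::LibraryType::kPlain;
#ifdef PADDLE_WITH_CUDA
  if (platform::CanCUDNNBeUsed(ctx)) {  // presumably checks the place and the use_cudnn attribute
    library = framework::LibraryType::kCUDNN;
  }
#endif
#ifdef PADDLE_WITH_MKLDNN
  if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) {
    library = framework::LibraryType::kMKLDNN;  // only reached if cuDNN was not picked
  }
#endif
  return library;
}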
paddle/fluid/operators/conv_op.cc
@@ -13,6 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
@@ -64,22 +70,21 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
  if (platform::CanCUDNNBeUsed(ctx)) {
    library_ = framework::LibraryType::kCUDNN;
  }
#endif
#ifdef PADDLE_WITH_MKLDNN
  if (library_ == framework::LibraryType::kPlain &&
      platform::CanMKLDNNBeUsed(ctx)) {
    library_ = framework::LibraryType::kMKLDNN;
  }
#endif

  std::string data_format = ctx.Attr<std::string>("data_format");
  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
@@ -131,6 +136,9 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
                "use_cudnn",
                "(bool, default false) Only used in cudnn kernel, need install cudnn")
      .SetDefault(false);
  AddAttr<bool>("use_mkldnn",
                "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
  AddAttr<std::string>(
      "data_format",
      "(string, default NCHW) Only used in "
@@ -224,6 +232,9 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
                "use_cudnn",
                "(bool, default false) Only used in cudnn kernel, need install cudnn")
      .SetDefault(false);
  AddAttr<bool>("use_mkldnn",
                "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
  AddAttr<std::string>(
      "data_format",
      "(string, default NCHW) Only used in "
@@ -284,23 +295,21 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
  if (platform::CanCUDNNBeUsed(ctx)) {
    library_ = framework::LibraryType::kCUDNN;
  }
#endif
#ifdef PADDLE_WITH_MKLDNN
  if (library_ == framework::LibraryType::kPlain &&
      platform::CanMKLDNNBeUsed(ctx)) {
    library_ = framework::LibraryType::kMKLDNN;
  }
#endif

  std::string data_format = ctx.Attr<std::string>("data_format");
  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
paddle/fluid/operators/create_reader_op.cc
Deleted 100644 → 0
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
namespace paddle {
namespace operators {

static std::vector<framework::DDim> RestoreShapes(
    const std::vector<int>& shape_concat, const std::vector<int>& ranks) {
  std::vector<framework::DDim> res;
  int offset = 0;
  for (int len : ranks) {
    auto start_it = shape_concat.begin() + offset;
    auto end_it = start_it + len;
    res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
    offset += len;
  }
  return res;
}

// general infershape for file readers
class CreateFileReaderInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "The output file reader should not be null.");
    const auto shape_concat =
        ctx->Attrs().Get<std::vector<int>>("shape_concat");
    const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
    ctx->SetReaderDims("Out", shapes);

    if (ctx->IsRuntime()) {
      const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
      PADDLE_ENFORCE_EQ(
          lod_levels.size(), shapes.size(),
          "The number of 'lod_levels'(%d) doesn't match the number "
          "of 'shapes'(%d).",
          lod_levels.size(), shapes.size());
      framework::VarDesc* reader =
          boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
      reader->SetLoDLevels(lod_levels);
    }
  }
};

// general infershape for decorated readers
class CreateDecoratedReaderInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
                   "Input(UnderlyingReader) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "The output decorated reader should not be null.");
    ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));

    if (ctx->IsRuntime()) {
      framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
          ctx->GetInputVarPtrs("UnderlyingReader")[0]);
      framework::VarDesc* out_reader =
          boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
      out_reader->SetLoDLevels(in_reader->GetLoDLevels());
    }
  }
};

// general var type inference for file readers
class CreateFileReaderInferVarType : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
                  framework::BlockDesc* block) const override {
    std::string reader_name = op_desc.Output("Out")[0];
    framework::VarDesc* reader = block->FindVarRecursive(reader_name);
    reader->SetType(framework::proto::VarType::READER);
  }
};

// general var type inference for decorated readers
class CreateDecoratedReaderInferVarType : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
                  framework::BlockDesc* block) const override {
    std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
    framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name);
    std::string out_reader_name = op_desc.Output("Out")[0];
    framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name);
    out_reader->SetType(framework::proto::VarType::READER);
    out_reader->SetDataTypes(in_reader->GetDataTypes());
  }
};

template <typename T>
class CreateRandomDataGeneratorOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
    const auto& ranks = Attr<std::vector<int>>("ranks");
    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
                      int(shape_concat.size()),
                      "The accumulate of all ranks should be equal to the "
                      "shape concat's length.");
    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
    out->Reset(new framework::RandomDataGenerator<T>(shapes, Attr<float>("min"),
                                                     Attr<float>("max")));
  }
};

class CreateRandomDataGeneratorOpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
  CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(op_proto, op_checker) {
    AddOutput("Out", "(ReaderHolder) The created random reader.");
    AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
    AddAttr<std::vector<int>>("ranks",
                              "The ranks of each data."
                              "e.g."
                              "shape_concat = [2,3,4,5,6]"
                              "ranks = [3,2]"
                              "It means the reader will generate two data each time,"
                              "whose shapes are [2,3,4] and [5,6] respectively.");
    AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
    AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
    AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
    AddComment(R"DOC(
      CreateRandomDataGenerator Operator

      This Op creates a random reader.
      The reader generates random data instead of really reading from files.
      Generated data follow an uniform distribution between 'min' and 'max'.
    )DOC");
  }
};

class CreateShuffleReaderOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                        ->Get<framework::ReaderHolder>();
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
    out->Reset(new framework::ShuffleReader(underlying_reader.Get(),
                                            Attr<int>("buffer_size")));
  }
};

class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(op_proto, op_checker) {
    AddInput("UnderlyingReader",
             "(ReaderHolder) The underlying reader for creating a shuffle reader.");
    AddOutput("Out", "(ReaderHolder) The created shuffle reader.");
    AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
    AddComment(R"DOC(
      CreateShuffleReader Operator

      A shuffle reader takes another reader as its 'underlying reader'
      and yields the underlying reader's outputs in a shuffled order.
    )DOC");
  }
};

class CreateBatchReaderOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                        ->Get<framework::ReaderHolder>();
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
    out->Reset(new framework::BatchReader(underlying_reader.Get(),
                                          Attr<int>("batch_size")));
  }
};

class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(op_proto, op_checker) {
    AddInput("UnderlyingReader",
             "(ReaderHolder) The underlying reader for creating a batch reader.");
    AddOutput("Out", "(ReaderHolder) The created batch reader.");
    AddAttr<int>("batch_size",
                 "How many instances the batch reader yields each time.")
        .GreaterThan(0);
    AddComment(R"DOC(
      CreateBatchReader Operator

      A batch reader takes another reader as its 'underlying reader',
      gathers the underlying reader's outputs and then yields them in batches.
    )DOC");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(create_random_data_generator,
                  ops::CreateRandomDataGeneratorOp<float>,
                  ops::CreateFileReaderInferShape,
                  ops::CreateRandomDataGeneratorOpMaker,
                  paddle::framework::EmptyGradOpMaker,
                  ops::CreateFileReaderInferVarType);
REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp,
                  ops::CreateDecoratedReaderInferShape,
                  ops::CreateShuffleReaderOpMaker,
                  paddle::framework::EmptyGradOpMaker,
                  ops::CreateDecoratedReaderInferVarType);
REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp,
                  ops::CreateDecoratedReaderInferShape,
                  ops::CreateBatchReaderOpMaker,
                  paddle::framework::EmptyGradOpMaker,
                  ops::CreateDecoratedReaderInferVarType);
paddle/fluid/operators/detail/CMakeLists.txt
if(WITH_DISTRIBUTE)
  grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
endif()
paddle/fluid/operators/elementwise_mul_op.h
@@ -40,80 +40,14 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
};
template <typename T>
struct IdentityGrad_DX {
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; }
};

template <typename T>
struct IdentityGrad_DY {
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; }
};

template <typename DeviceContext, typename T>
class ElementwiseMulGradKernel : public framework::OpKernel<T> {
 public:
@@ -127,12 +61,11 @@ class ElementwiseMulGradKernel : public framework::OpKernel<T> {
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");
    ElemwiseGradCompute<DeviceContext, T, IdentityGrad_DX<T>,
                        IdentityGrad_DY<T>>(ctx, *x, *y, *out, *dout, axis, dx,
                                            dy, IdentityGrad_DX<T>(),
                                            IdentityGrad_DY<T>());
  }
};
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/elementwise_op_function.h
@@ -301,7 +301,7 @@ struct ElemwiseGradNoBroadcast {
      dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]);
    }
    if (dy_ != nullptr) {
      dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]);
    }
  }
paddle/fluid/operators/math/CMakeLists.txt
@@ -20,6 +20,7 @@ if(WITH_GPU)
    nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
    nv_library(cos_sim_functor SRCS cos_sim_functor.cc cos_sim_functor.cu DEPS device_context)
    nv_library(concat_functor SRCS concat.cc concat.cu DEPS device_context tensor)
else()
    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
    cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
@@ -37,6 +38,7 @@ else()
    cc_library(unpooling SRCS unpooling.cc DEPS device_context)
    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
    cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context)
    cc_library(concat_functor SRCS concat.cc DEPS device_context tensor)
endif()
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
@@ -44,3 +46,4 @@ cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selec
cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor)
cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
cc_test(concat_test SRCS concat_test.cc DEPS concat_functor tensor)
paddle/fluid/operators/math/concat.cc
0 → 100644
/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/concat.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
/*
* All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension.
*/
template
<
typename
T
>
class
ConcatFunctor
<
platform
::
CPUDeviceContext
,
T
>
{
public:
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
std
::
vector
<
framework
::
Tensor
>&
input
,
const
int
axis
,
framework
::
Tensor
*
output
)
{
// TODO(zcd): Add input data validity checking
int
num
=
input
.
size
();
int
rows
=
1
;
auto
dim_0
=
input
[
0
].
dims
();
for
(
int
i
=
0
;
i
<
axis
;
++
i
)
{
rows
*=
dim_0
[
i
];
}
int
out_rows
=
rows
,
out_cols
=
0
;
std
::
vector
<
int64_t
>
input_cols
(
input
.
size
());
for
(
int
i
=
0
;
i
<
num
;
++
i
)
{
int
t_cols
=
input
[
i
].
numel
()
/
rows
;
out_cols
+=
t_cols
;
input_cols
[
i
]
=
t_cols
;
}
auto
&
cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
context
.
GetPlace
());
// computation
for
(
int
k
=
0
;
k
<
out_rows
;
++
k
)
{
T
*
dst_ptr
=
output
->
data
<
T
>
()
+
k
*
out_cols
;
int
col_idx
=
0
;
for
(
int
j
=
0
;
j
<
num
;
++
j
)
{
int
col_len
=
input_cols
[
j
];
const
T
*
src_prt
=
input
[
j
].
data
<
T
>
()
+
k
*
col_len
;
memory
::
Copy
(
cpu_place
,
dst_ptr
+
col_idx
,
cpu_place
,
src_prt
,
sizeof
(
T
)
*
col_len
);
col_idx
+=
col_len
;
}
}
}
};
/*
* All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension.
*/
template
<
typename
T
>
class
ConcatGradFunctor
<
platform
::
CPUDeviceContext
,
T
>
{
public:
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
Tensor
&
input
,
const
int
axis
,
std
::
vector
<
framework
::
Tensor
>&
outputs
)
{
// TODO(zcd): Add input data validity checking
int
num
=
outputs
.
size
();
int
input_rows
=
1
;
auto
dim_0
=
outputs
[
0
].
dims
();
for
(
int
i
=
0
;
i
<
axis
;
++
i
)
{
input_rows
*=
dim_0
[
i
];
}
int
input_cols
=
0
;
std
::
vector
<
int64_t
>
output_cols
(
outputs
.
size
());
for
(
int
i
=
0
;
i
<
num
;
++
i
)
{
int
t_cols
=
outputs
[
i
].
numel
()
/
input_rows
;
input_cols
+=
t_cols
;
output_cols
[
i
]
=
t_cols
;
}
auto
&
cpu_place
=
boost
::
get
<
platform
::
CPUPlace
>
(
context
.
GetPlace
());
// computation
for
(
int
k
=
0
;
k
<
input_rows
;
++
k
)
{
const
T
*
src_ptr
=
input
.
data
<
T
>
()
+
k
*
input_cols
;
int
col_idx
=
0
;
for
(
int
j
=
0
;
j
<
num
;
++
j
)
{
int
col_len
=
output_cols
[
j
];
T
*
dst_ptr
=
outputs
[
j
].
data
<
T
>
()
+
k
*
col_len
;
memory
::
Copy
(
cpu_place
,
dst_ptr
,
cpu_place
,
src_ptr
+
col_idx
,
sizeof
(
T
)
*
col_len
);
col_idx
+=
col_len
;
}
}
}
};
template
class
ConcatFunctor
<
platform
::
CPUDeviceContext
,
int
>;
template
class
ConcatFunctor
<
platform
::
CPUDeviceContext
,
int64_t
>;
template
class
ConcatFunctor
<
platform
::
CPUDeviceContext
,
float
>;
template
class
ConcatFunctor
<
platform
::
CPUDeviceContext
,
double
>;
template
class
ConcatGradFunctor
<
platform
::
CPUDeviceContext
,
int
>;
template
class
ConcatGradFunctor
<
platform
::
CPUDeviceContext
,
int64_t
>;
template
class
ConcatGradFunctor
<
platform
::
CPUDeviceContext
,
float
>;
template
class
ConcatGradFunctor
<
platform
::
CPUDeviceContext
,
double
>;
}
// namespace math
}
// namespace operators
}
// namespace paddle
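The CPU functor above reduces concatenation to 2-D copies: rows is the product of the dimensions before axis, each input contributes numel()/rows columns, and every output row is assembled by copying one column block per input. A standalone sketch of that flattening arithmetic, with illustrative values that are not part of the commit:

#include <cstdio>

// Sketch: for a tensor of shape {2, 3, 4} concatenated along axis = 1,
// rows = 2 (product of dims before the axis) and cols = 24 / 2 = 12,
// i.e. each row holds one contiguous 3x4 slice.
int main() {
  const int dims[3] = {2, 3, 4};
  const int axis = 1;
  int rows = 1;
  for (int i = 0; i < axis; ++i) rows *= dims[i];
  const int numel = dims[0] * dims[1] * dims[2];
  const int cols = numel / rows;
  std::printf("rows = %d, cols = %d\n", rows, cols);  // rows = 2, cols = 12
  return 0;
}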
paddle/fluid/operators/math/concat.cu
0 → 100644
/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/platform/cuda_helper.h"

namespace paddle {
namespace operators {
namespace math {

template <typename T>
__device__ T upper_bound(const T* first, T count, T val) {
  const T* orig = first;
  const T* it = nullptr;
  T step = 0;
  while (count > 0) {
    it = first;
    step = count / 2;
    it += step;
    if (!(val < *it)) {
      first = ++it;
      count -= step + 1;
    } else {
      count = step;
    }
  }
  return first - orig;
}

template <typename T>
__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
                             const int output_rows, const int output_cols,
                             T* output) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
  int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1;

  int curr_offset = input_cols[segment];
  int curr_segment = segment;
  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
    T curr_col_offset;
    while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) {
      curr_offset = curr_col_offset;
      ++curr_segment;
    }

    int local_col = tid_x - curr_offset;
    int segment_width = curr_col_offset - curr_offset;
    T* input_ptr = inputs[curr_segment];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
      output[tid_y * output_cols + tid_x] =
          input_ptr[tid_y * segment_width + local_col];
  }
}

template <typename T>
__global__ void KernelConcat(T** inputs, const int input_col,
                             const int output_rows, const int output_cols,
                             T* output) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
  double inv_input_col = 1.0 / input_col;
  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
    int split = tid_x * inv_input_col;
    int in_offset = tid_x - split * input_col;
    T* input_ptr = inputs[split];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) {
      output[tid_y * output_cols + tid_x] =
          input_ptr[tid_y * input_col + in_offset];
    }
  }
}

template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row,
                                 const int input_col, const int* output_cols,
                                 int col_size, T** outputs) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
  int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1;
  int curr_offset = output_cols[segment];
  int curr_segment = segment;
  for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
    T curr_col_offset;
    while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) {
      curr_offset = curr_col_offset;
      ++curr_segment;
    }

    int local_col = tid_x - curr_offset;
    int segment_width = curr_col_offset - curr_offset;
    T* output_ptr = outputs[curr_segment];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
      output_ptr[tid_y * segment_width + local_col] =
          input[tid_y * input_col + tid_x];
  }
}

template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row,
                                 const int input_col, const int output_cols,
                                 T** outputs) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
  double inv_input_col = 1.0 / input_col;
  for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
    int split = tid_x * inv_input_col;
    int in_offset = tid_x - split * input_col;
    T* output_ptr = outputs[split];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
      output_ptr[tid_y * output_cols + in_offset] =
          input[tid_y * input_col + tid_x];
  }
}

/*
 * All tensors' dimension should be the same and the values of
 * each dimension are the same, except the axis dimension.
 */
template <typename T>
class ConcatFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const std::vector<framework::Tensor>& input, const int axis,
                  framework::Tensor* output) {
    // TODO(zcd): Add input data validity checking
    int num = input.size();
    int rows = 1;
    auto dim_0 = input[0].dims();
    for (int i = 0; i < axis; ++i) {
      rows *= dim_0[i];
    }
    int cols = input[0].numel() / rows;
    int out_rows = rows, out_cols = 0;

    framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2);
    framework::Vector<int> inputs_cols(num + 1);
    inputs_cols[0] = 0;
    T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());

    bool sameShape = true;
    for (int i = 0; i < num; ++i) {
      int t_cols = input[i].numel() / rows;
      if (sameShape) {
        if (t_cols != cols) sameShape = false;
      }
      out_cols += t_cols;
      inputs_cols[i + 1] = out_cols;
      inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
    }

    T** ins_gpu =
        reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
    const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());

    // computation
    // set the thread block and grid according to CurrentDeviceId
    const int kThreadsPerBlock = 1024;
    int block_cols = kThreadsPerBlock;
    if (out_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
      block_cols = ((out_cols + 31) >> 5) << 5;
    }
    int block_rows = kThreadsPerBlock / block_cols;
    dim3 block_size = dim3(block_cols, block_rows, 1);

    int max_threads = context.GetMaxPhysicalThreadCount();
    int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);

    int grid_cols =
        std::min((out_cols + block_cols - 1) / block_cols, max_blocks);
    int grid_rows =
        std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1));
    dim3 grid_size = dim3(grid_cols, grid_rows, 1);

    if (sameShape) {
      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
          ins_gpu, cols, out_rows, out_cols, output->data<T>());
    } else {
      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
          ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows,
          out_cols, output->data<T>());
    }
  }
};

/*
 * All tensors' dimension should be the same and the values of
 * each dimension are the same, except the axis dimension.
 */
template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input, const int axis,
                  std::vector<framework::Tensor>& outputs) {
    // TODO(zcd): Add input data validity checking
    int num = outputs.size();
    int input_row = 1;
    auto dim_0 = outputs[0].dims();
    for (int i = 0; i < axis; ++i) {
      input_row *= dim_0[i];
    }

    int output_col_0 = outputs[0].numel() / input_row;
    int input_col = 0;
    bool sameShape = true;

    framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2);
    framework::Vector<int> outputs_cols(num + 1);
    outputs_cols[0] = 0;
    T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());

    for (int i = 0; i < num; ++i) {
      int t_col = outputs[i].numel() / input_row;
      if (sameShape) {
        if (t_col != output_col_0) sameShape = false;
      }
      input_col += t_col;
      outputs_cols[i + 1] = input_col;
      outputs_ptr[i] = outputs[i].data<T>();
    }

    T** outs_gpu =
        reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
    const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());

    // computation
    const int kThreadsPerBlock = 1024;
    int block_cols = kThreadsPerBlock;
    if (input_col < kThreadsPerBlock) {  // block_cols is aligned by 32.
      block_cols = ((input_col + 31) >> 5) << 5;
    }
    int block_rows = kThreadsPerBlock / block_cols;
    dim3 block_size = dim3(block_cols, block_rows, 1);

    int max_threads = context.GetMaxPhysicalThreadCount();
    int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);

    int grid_cols =
        std::min((input_col + block_cols - 1) / block_cols, max_blocks);
    int grid_rows =
        std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1));
    dim3 grid_size = dim3(grid_cols, grid_rows, 1);

    if (sameShape) {
      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
          input.data<T>(), input_row, input_col, output_col_0, outs_gpu);
    } else {
      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
          input.data<T>(), input_row, input_col, outs_col_gpu,
          static_cast<int>(outputs_cols.size()), outs_gpu);
    }
  }
};

template class ConcatFunctor<platform::CUDADeviceContext, int>;
template class ConcatFunctor<platform::CUDADeviceContext, int64_t>;
template class ConcatFunctor<platform::CUDADeviceContext, float>;
template class ConcatFunctor<platform::CUDADeviceContext, double>;

template class ConcatGradFunctor<platform::CUDADeviceContext, int>;
template class ConcatGradFunctor<platform::CUDADeviceContext, int64_t>;
template class ConcatGradFunctor<platform::CUDADeviceContext, float>;
template class ConcatGradFunctor<platform::CUDADeviceContext, double>;

}  // namespace math
}  // namespace operators
}  // namespace paddle
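A note on the launch configuration above: each block holds at most kThreadsPerBlock = 1024 threads, and when the flattened output is narrower than that, the block width is rounded up to a multiple of 32. For illustration (values chosen here, not taken from the commit), with out_cols = 50 the code computes block_cols = ((50 + 31) >> 5) << 5 = 64 and block_rows = 1024 / 64 = 16, so each block covers a 64x16 tile of the flattened output; grid_cols and grid_rows are then clipped so the total launch stays within the device's physical thread count reported by GetMaxPhysicalThreadCount().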
paddle/fluid/operators/math/concat.h
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include "paddle/fluid/framework/tensor.h"

namespace paddle {
namespace operators {
namespace math {

/*
 * \brief Concatenate the input tensors along the dimension axis.
 *  TODO(zcd): maybe it needs to be more detailed.
 *  Examples:
 *     Input[0] = [[1,2],[3,4]]
 *     Input[1] = [[5,6]]
 *     axis = 0
 *
 *     Output = [[1,2],
 *               [3,4],
 *               [5,6]]
 */
template <typename DeviceContext, typename T>
class ConcatFunctor {
 public:
  void operator()(const DeviceContext& context,
                  const std::vector<framework::Tensor>& input, const int axis,
                  framework::Tensor* output);
};

/*
 * \brief Split the input tensors along the dimension axis into outputs.
 *  TODO(zcd): maybe it needs to be more detailed.
 *  Examples:
 *     Input = [[1,2],
 *              [3,4],
 *              [5,6]]
 *     axis = 0
 *
 *     Output[0] = [[1,2],[3,4]]
 *     Output[1] = [[5,6]]
 */
template <typename DeviceContext, typename T>
class ConcatGradFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const int axis, std::vector<framework::Tensor>& outputs);
};

}  // namespace math
}  // namespace operators
}  // namespace paddle
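A minimal usage sketch of the CPU specialization declared above, mirroring the pattern in concat_test.cc further down; the function name and shapes are illustrative and not part of the commit:

#include <vector>
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/platform/device_context.h"

// Sketch: concatenate two CPU tensors of shape {2, 4} and {3, 4} along axis 0.
void ConcatExample() {
  namespace f = paddle::framework;
  namespace p = paddle::platform;

  p::CPUPlace place;
  p::CPUDeviceContext ctx(place);

  f::Tensor a, b, out;
  a.mutable_data<float>(f::make_ddim({2, 4}), place);
  b.mutable_data<float>(f::make_ddim({3, 4}), place);
  out.mutable_data<float>(f::make_ddim({5, 4}), place);

  std::vector<f::Tensor> inputs = {a, b};
  paddle::operators::math::ConcatFunctor<p::CPUDeviceContext, float> concat;
  concat(ctx, inputs, 0 /*axis*/, &out);  // rows of `a` followed by rows of `b`
}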
paddle/fluid/operators/math/concat_test.cc
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/concat.h"
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"

using namespace paddle::framework;
using namespace paddle::platform;

template <typename DeviceContext, typename Place>
void testConcat() {
  Tensor input_a_cpu;
  Tensor input_b_cpu;
  Tensor out_cpu;
  Tensor input_a;
  Tensor input_b;
  Tensor out;

  DeviceContext* context = new DeviceContext(Place());
  // DeviceContext context(Place());

  /**
   * cast1:
   *    inputs:
   *        t_a.shape: [2, 3, 4]
   *        t_b.shape: [3, 3, 4]
   *    output:
   *        out.shape: [5, 3, 4]
   */
  auto dim_a = make_ddim({2, 3, 4});
  auto dim_b = make_ddim({3, 3, 4});
  auto dim_out = make_ddim({5, 3, 4});

  input_a.mutable_data<int>(dim_a, Place());
  input_b.mutable_data<int>(dim_b, Place());
  out.mutable_data<int>(dim_out, Place());

  if (paddle::platform::is_gpu_place(Place())) {
    input_a_cpu.mutable_data<int>(dim_a, CPUPlace());
    input_b_cpu.mutable_data<int>(dim_b, CPUPlace());
    out_cpu.mutable_data<int>(dim_out, CPUPlace());
  }

  int* a_ptr;
  int* b_ptr;
  if (paddle::platform::is_gpu_place(Place())) {
    a_ptr = input_a_cpu.data<int>();
    b_ptr = input_b_cpu.data<int>();
  } else {
    a_ptr = input_a.data<int>();
    b_ptr = input_b.data<int>();
  }

  for (int i = 0; i < 2 * 3 * 4; ++i) {
    a_ptr[i] = i;
  }
  for (int i = 0; i < 3 * 3 * 4; ++i) {
    b_ptr[i] = i;
  }

  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(input_a_cpu, Place(), *context, &input_a);
    TensorCopy(input_b_cpu, Place(), *context, &input_b);
  }

  std::vector<Tensor> input;
  input.push_back(input_a);
  input.push_back(input_b);

  paddle::operators::math::ConcatFunctor<DeviceContext, int> concat_functor;
  concat_functor(*context, input, 0, &out);

  // check the dim of input_a, input_b
  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);

  int* out_ptr;
  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(out, CPUPlace(), *context, &out_cpu);
    out_ptr = out_cpu.data<int>();
  } else {
    out_ptr = out.data<int>();
  }

  int cols = 2 * 3 * 4;
  int idx_a = 0, idx_b = 0;
  for (int j = 0; j < 5 * 3 * 4; ++j) {
    if (j >= cols) {
      PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b]);
      ++idx_b;
    } else {
      PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a]);
      ++idx_a;
    }
  }

  //
  /**
   * cast2:
   *    inputs:
   *        t_a.shape: [2, 3, 4]
   *        t_b.shape: [2, 4, 4]
   *    output:
   *        out.shape: [2, 7, 4]
   */
  dim_a = make_ddim({2, 3, 4});
  dim_b = make_ddim({2, 4, 4});
  dim_out = make_ddim({2, 7, 4});

  input_a.Resize(dim_a);
  input_b.Resize(dim_b);
  out.Resize(dim_out);
  if (paddle::platform::is_gpu_place(Place())) {
    input_a_cpu.Resize(dim_a);
    input_b_cpu.Resize(dim_b);
    out_cpu.Resize(dim_out);
  }

  if (paddle::platform::is_gpu_place(Place())) {
    a_ptr = input_a_cpu.data<int>();
    b_ptr = input_b_cpu.data<int>();
  } else {
    a_ptr = input_a.data<int>();
    b_ptr = input_b.data<int>();
  }

  for (int i = 0; i < 2 * 3 * 4; ++i) {
    a_ptr[i] = i;
  }
  for (int i = 0; i < 2 * 4 * 4; ++i) {
    b_ptr[i] = i;
  }

  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(input_a_cpu, Place(), *context, &input_a);
    TensorCopy(input_b_cpu, Place(), *context, &input_b);
  }

  input.clear();
  input.push_back(input_a);
  input.push_back(input_b);

  concat_functor(*context, input, 1, &out);

  // check the dim of input_a, input_b
  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);

  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(out, CPUPlace(), *context, &out_cpu);
    out_ptr = out_cpu.data<int>();
  } else {
    out_ptr = out.data<int>();
  }

  cols = 3 * 4;
  idx_a = 0, idx_b = 0;
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 28; ++j) {
      if (j >= cols) {
        PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], b_ptr[idx_b]);
        ++idx_b;
      } else {
        PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], a_ptr[idx_a]);
        ++idx_a;
      }
    }
  }

  /**
   * cast3:
   *    inputs:
   *        t_a.shape: [2, 3, 5]
   *        t_b.shape: [2, 3, 4]
   *    output:
   *        out.shape: [2, 3, 9]
   */
  dim_a = make_ddim({2, 3, 4});
  dim_b = make_ddim({2, 3, 5});
  dim_out = make_ddim({2, 3, 9});

  input_a.Resize(dim_a);
  input_b.Resize(dim_b);
  out.Resize(dim_out);
  if (paddle::platform::is_gpu_place(Place())) {
    input_a_cpu.Resize(dim_a);
    input_b_cpu.Resize(dim_b);
    out_cpu.Resize(dim_out);
  }

  if (paddle::platform::is_gpu_place(Place())) {
    a_ptr = input_a_cpu.data<int>();
    b_ptr = input_b_cpu.data<int>();
  } else {
    a_ptr = input_a.data<int>();
    b_ptr = input_b.data<int>();
  }

  for (int i = 0; i < 2 * 3 * 4; ++i) {
    a_ptr[i] = i;
  }
  for (int i = 0; i < 2 * 3 * 5; ++i) {
    b_ptr[i] = i;
  }

  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(input_a_cpu, Place(), *context, &input_a);
    TensorCopy(input_b_cpu, Place(), *context, &input_b);
  }

  input.clear();
  input.push_back(input_a);
  input.push_back(input_b);

  concat_functor(*context, input, 2, &out);

  // check the dim of input_a, input_b
  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);

  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(out, CPUPlace(), *context, &out_cpu);
    out_ptr = out_cpu.data<int>();
  } else {
    out_ptr = out.data<int>();
  }

  // check the data
  cols = 4;
  idx_a = 0, idx_b = 0;
  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 9; ++j) {
      if (j >= cols) {
        PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], b_ptr[idx_b]);
        ++idx_b;
      } else {
        PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], a_ptr[idx_a]);
        ++idx_a;
      }
    }
  }

  /**
   * cast4:
   *    inputs:
   *        axis = 1
   *        t_a.shape: [2, 3, 4]
   *        t_b.shape: [2, 3, 4]
   *    output:
   *        out.shape: [2, 6, 4]
   */
  dim_a = make_ddim({2, 3, 4});
  dim_b = make_ddim({2, 3, 4});
  dim_out = make_ddim({2, 6, 4});

  input_a.Resize(dim_a);
  input_b.Resize(dim_b);
  out.Resize(dim_out);
  if (paddle::platform::is_gpu_place(Place())) {
    input_a_cpu.Resize(dim_a);
    input_b_cpu.Resize(dim_b);
    out_cpu.Resize(dim_out);
  }

  if (paddle::platform::is_gpu_place(Place())) {
    a_ptr = input_a_cpu.data<int>();
    b_ptr = input_b_cpu.data<int>();
  } else {
    a_ptr = input_a.data<int>();
    b_ptr = input_b.data<int>();
  }

  for (int i = 0; i < 2 * 3 * 4; ++i) {
    a_ptr[i] = i;
  }
  for (int i = 0; i < 2 * 3 * 4; ++i) {
    b_ptr[i] = i;
  }

  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(input_a_cpu, Place(), *context, &input_a);
    TensorCopy(input_b_cpu, Place(), *context, &input_b);
  }

  input.clear();
  input.push_back(input_a);
  input.push_back(input_b);

  concat_functor(*context, input, 1, &out);

  // check the dim of input_a, input_b
  PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
  PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);

  if (paddle::platform::is_gpu_place(Place())) {
    TensorCopy(out, CPUPlace(), *context, &out_cpu);
    out_ptr = out_cpu.data<int>();
  } else {
    out_ptr = out.data<int>();
  }

  // check the data
  cols = 12;
  idx_a = 0, idx_b = 0;
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 24; ++j) {
      if (j >= cols) {
        PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], b_ptr[idx_b]);
        ++idx_b;
      } else {
        PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], a_ptr[idx_a]);
        ++idx_a;
      }
    }
  }
}

TEST(math, concat) {
  testConcat<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#ifdef PADDLE_WITH_CUDA
  testConcat<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>();
#endif
}
paddle/fluid/operators/math/math_function.cc
...
@@ -245,11 +245,13 @@ template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>;

-#define DEFINE_CPU_TRANS(RANK)                                          \
-  template struct Transpose<platform::CPUDeviceContext, float, RANK>;   \
-  template struct Transpose<platform::CPUDeviceContext, double, RANK>;  \
-  template struct Transpose<platform::CPUDeviceContext, int, RANK>;     \
-  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
+#define DEFINE_CPU_TRANS(RANK)                                              \
+  template struct Transpose<platform::CPUDeviceContext, platform::float16,  \
+                            RANK>;                                          \
+  template struct Transpose<platform::CPUDeviceContext, float, RANK>;       \
+  template struct Transpose<platform::CPUDeviceContext, double, RANK>;      \
+  template struct Transpose<platform::CPUDeviceContext, int, RANK>;         \
+  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>;     \
  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;

DEFINE_CPU_TRANS(1);
...
paddle/fluid/operators/reader/CMakeLists.txt
0 → 100644
cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader)
op_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc DEPS reader_op_registry)
op_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc DEPS reader_op_registry)
op_library(create_batch_reader_op SRCS create_batch_reader_op.cc DEPS reader_op_registry)
set(READER_LIBRARY create_random_data_generator_op create_shuffle_reader_op create_batch_reader_op PARENT_SCOPE)
paddle/fluid/operators/reader/create_batch_reader_op.cc
0 → 100644
//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/reader/reader_op_registry.h"

namespace paddle {
namespace operators {
namespace reader {

class BatchReader : public framework::DecoratedReader {
 public:
  BatchReader(ReaderBase* reader, int batch_size)
      : DecoratedReader(reader), batch_size_(batch_size) {
    buffer_.reserve(batch_size_);
  }

  void ReadNext(std::vector<framework::LoDTensor>* out) override;

 private:
  int batch_size_;
  std::vector<std::vector<framework::LoDTensor>> buffer_;
};

class CreateBatchReaderOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                        ->Get<framework::ReaderHolder>();
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
    out->Reset(
        new BatchReader(underlying_reader.Get(), Attr<int>("batch_size")));
  }
};

class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase {
 public:
  CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
      : DecoratedReaderMakerBase(op_proto, op_checker) {
    AddAttr<int>("batch_size",
                 "How many instances the batch reader yields each time.")
        .GreaterThan(0);
    AddComment(R"DOC(
      CreateBatchReader Operator

      A batch reader takes another reader as its 'underlying reader',
      gathers the underlying reader's outputs and then yields them in batches.
    )DOC");
  }
};

void BatchReader::ReadNext(std::vector<framework::LoDTensor>* out) {
  buffer_.clear();
  buffer_.reserve(batch_size_);
  for (int i = 0; i < batch_size_; ++i) {
    if (reader_->HasNext()) {
      buffer_.push_back(std::vector<framework::LoDTensor>());
      reader_->ReadNext(&buffer_.back());
    } else {
      break;
    }
  }
  // Concat instances
  out->clear();
  if (buffer_.empty()) {
    // if buffer_ is empty, the 'out' will return as an empty vector.
    return;
  }
  int out_num = buffer_[0].size();
  out->reserve(out_num);
  for (int j = 0; j < out_num; ++j) {
    // Merge shape and check date type
    std::type_index batch_type = buffer_[0][j].type();
    framework::DDim batch_shape = buffer_[0][j].dims();
    for (size_t i = 1; i < buffer_.size(); ++i) {
      std::type_index ins_type = buffer_[i][j].type();
      framework::DDim ins_shape = buffer_[i][j].dims();
      PADDLE_ENFORCE_EQ(batch_type, ins_type);
      PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
                        slice_ddim(ins_shape, 1, ins_shape.size()));
      PADDLE_ENFORCE_GT(ins_shape[0], 0);
      batch_shape[0] += ins_shape[0];
    }

    framework::LoDTensor out_tensor;
    out_tensor.Resize(batch_shape);
    out_tensor.mutable_data(platform::CPUPlace(), batch_type);
    int64_t dst_offset = 0;

    // Merge lod and data
    framework::LoD batch_lod;
    for (size_t i = 0; i < buffer_.size(); ++i) {
      framework::DDim ins_shape = buffer_[i][j].dims();
      framework::LoD ins_lod = buffer_[i][j].lod();
      if (i == 0) {
        batch_lod = ins_lod;
      } else {
        PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
        for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
          auto& lod_level = batch_lod[level_idx];
          for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
            lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
          }
        }
      }
      auto dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
      TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
      dst_offset += ins_shape[0];
    }
    out_tensor.set_lod(batch_lod);
    out->push_back(out_tensor);
  }
}

}  // namespace reader
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators::reader;
REGISTER_DECORATED_READER_OPERATOR(create_batch_reader,
                                   ops::CreateBatchReaderOp,
                                   ops::CreateBatchReaderOpMaker);
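All of the reader ops in this commit build on the same decoration idea: a DecoratedReader wraps an underlying ReaderBase and reshapes its stream. A toy decorator, assuming the framework::DecoratedReader interface from paddle/fluid/framework/reader.h at this revision (illustrative only, not part of the commit):

#include <vector>
#include "paddle/fluid/framework/reader.h"

// Toy decorated reader that simply forwards the underlying reader's output.
// HasNext() and ReInit() are inherited from DecoratedReader, which delegates
// them to the wrapped reader; only ReadNext() needs to be provided here.
class PassThroughReader : public paddle::framework::DecoratedReader {
 public:
  explicit PassThroughReader(ReaderBase* reader) : DecoratedReader(reader) {}

  void ReadNext(std::vector<paddle::framework::LoDTensor>* out) override {
    reader_->ReadNext(out);  // a real decorator would transform `out` here
  }
};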
paddle/fluid/operators/reader/create_random_data_generator_op.cc
0 → 100644
//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/reader/reader_op_registry.h"

namespace paddle {
namespace operators {
namespace reader {

template <typename T>
class RandomDataGenerator : public framework::FileReader {
 public:
  RandomDataGenerator(const std::vector<framework::DDim>& shapes, float min,
                      float max)
      : FileReader(shapes), min_(min), max_(max) {
    PADDLE_ENFORCE_LE(
        min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
    unsigned int seed = std::random_device()();
    engine_.seed(seed);
    dist_ = std::uniform_real_distribution<float>(min_, max_);
  }

  void ReadNext(std::vector<framework::LoDTensor>* out) override {
    out->clear();
    out->reserve(shapes_.size());
    for (const framework::DDim& shape : shapes_) {
      PADDLE_ENFORCE_GE(
          shape.size(), 2,
          "The rank of reader's output data should be 2 at least.(Now it's %d)",
          shape.size());
      framework::LoDTensor out_tensor;
      out_tensor.Resize(shape);
      T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
      int64_t numel = framework::product(shape);
      for (int64_t i = 0; i < numel; ++i) {
        data[i] = dist_(engine_);
      }
      out->push_back(out_tensor);
    }
  }

  bool HasNext() const override { return true; }

  void ReInit() override { return; }

 private:
  float min_;
  float max_;
  std::minstd_rand engine_;
  std::uniform_real_distribution<float> dist_;
};

template <typename T>
class CreateRandomDataGeneratorOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
    const auto& ranks = Attr<std::vector<int>>("ranks");
    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
                      int(shape_concat.size()),
                      "The accumulate of all ranks should be equal to the "
                      "shape concat's length.");
    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
    out->Reset(new RandomDataGenerator<T>(shapes, Attr<float>("min"),
                                          Attr<float>("max")));
  }
};

class CreateRandomDataGeneratorOpMaker : public FileReaderMakerBase {
 public:
  CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
      : FileReaderMakerBase(op_proto, op_checker) {
    AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
    AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
    AddComment(R"DOC(
      CreateRandomDataGenerator Operator

      This Op creates a random reader.
      The reader generates random data instead of really reading from files.
      Generated data follow an uniform distribution between 'min' and 'max'.
    )DOC");
  }
};

}  // namespace reader
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators::reader;
REGISTER_FILE_READER_OPERATOR(create_random_data_generator,
                              ops::CreateRandomDataGeneratorOp<float>,
                              ops::CreateRandomDataGeneratorOpMaker);
paddle/fluid/operators/reader/create_shuffle_reader_op.cc
0 → 100644
//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/reader/reader_op_registry.h"

namespace paddle {
namespace operators {
namespace reader {

class ShuffleReader : public framework::DecoratedReader {
 public:
  ShuffleReader(ReaderBase* reader, int buffer_size)
      : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
    buffer_.reserve(buffer_size);
  }

  void ReadNext(std::vector<framework::LoDTensor>* out) override;

 private:
  int buffer_size_;
  std::vector<std::vector<framework::LoDTensor>> buffer_;
  size_t iteration_pos_;
};

void ShuffleReader::ReadNext(std::vector<framework::LoDTensor>* out) {
  if (iteration_pos_ >= buffer_.size()) {
    // Reload buffer with new data
    buffer_.clear();
    buffer_.reserve(buffer_size_);
    for (int i = 0; i < buffer_size_; ++i) {
      if (reader_->HasNext()) {
        buffer_.push_back(std::vector<framework::LoDTensor>());
        reader_->ReadNext(&buffer_.back());
      } else {
        break;
      }
    }
    // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
    // optimize.
    std::random_shuffle(buffer_.begin(), buffer_.end());
    iteration_pos_ = 0;
  }
  out->clear();
  if (!buffer_.empty()) {
    std::swap(*out, buffer_[iteration_pos_++]);
  }
  // if buffer_ is empty, the 'out' will return as an empty vector.
}

class CreateShuffleReaderOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                        ->Get<framework::ReaderHolder>();
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
    out->Reset(
        new ShuffleReader(underlying_reader.Get(), Attr<int>("buffer_size")));
  }
};

class CreateShuffleReaderOpMaker : public DecoratedReaderMakerBase {
 public:
  CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
      : DecoratedReaderMakerBase(op_proto, op_checker) {
    AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
    AddComment(R"DOC(
      CreateShuffleReader Operator

      A shuffle reader takes another reader as its 'underlying reader'
      and yields the underlying reader's outputs in a shuffled order.
    )DOC");
  }
};

}  // namespace reader
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators::reader;
REGISTER_DECORATED_READER_OPERATOR(create_shuffle_reader,
                                   ops::CreateShuffleReaderOp,
                                   ops::CreateShuffleReaderOpMaker);
paddle/fluid/operators/reader/reader_op_registry.cc
0 → 100644
//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "reader_op_registry.h"

namespace paddle {
namespace operators {
namespace reader {

std::vector<framework::DDim> RestoreShapes(const std::vector<int>& shape_concat,
                                           const std::vector<int>& ranks) {
  std::vector<framework::DDim> res;
  int offset = 0;
  for (int len : ranks) {
    auto start_it = shape_concat.begin() + offset;
    auto end_it = start_it + len;
    res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
    offset += len;
  }
  return res;
}

FileReaderMakerBase::FileReaderMakerBase(
    framework::OpProtoAndCheckerMaker::OpProto* op_proto,
    framework::OpAttrChecker* op_checker)
    : OpProtoAndCheckerMaker(op_proto, op_checker) {
  AddOutput("Out", "(ReaderHolder) The created random reader.");
  AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
  AddAttr<std::vector<int>>(
      "ranks",
      "The ranks of each data."
      "e.g."
      "shape_concat = [2,3,4,5,6]"
      "ranks = [3,2]"
      "It means the reader will generate two data each time,"
      "whose shapes are [2,3,4] and [5,6] respectively.");
  AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
}

void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasOutput("Out"),
                 "The output file reader should not be null.");
  const auto shape_concat =
      ctx->Attrs().Get<std::vector<int>>("shape_concat");
  const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
  std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
  ctx->SetReaderDims("Out", shapes);

  if (ctx->IsRuntime()) {
    const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
    PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
                      "The number of 'lod_levels'(%d) doesn't match the number "
                      "of 'shapes'(%d).",
                      lod_levels.size(), shapes.size());
    framework::VarDesc* reader =
        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
    reader->SetLoDLevels(lod_levels);
  }
}

void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
                                        framework::BlockDesc* block) const {
  std::string reader_name = op_desc.Output("Out")[0];
  framework::VarDesc* reader = block->FindVarRecursive(reader_name);
  reader->SetType(framework::proto::VarType::READER);
}

void DecoratedReaderInferShape::operator()(
    framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
                 "Input(UnderlyingReader) should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Out"),
                 "The output decorated reader should not be null.");
  ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));

  if (ctx->IsRuntime()) {
    framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
        ctx->GetInputVarPtrs("UnderlyingReader")[0]);
    framework::VarDesc* out_reader =
        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
    out_reader->SetLoDLevels(in_reader->GetLoDLevels());
  }
}

void DecoratedReaderInferVarType::operator()(
    const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
  std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
  framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name);
  std::string out_reader_name = op_desc.Output("Out")[0];
  framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name);
  out_reader->SetType(framework::proto::VarType::READER);
  out_reader->SetDataTypes(in_reader->GetDataTypes());
}

DecoratedReaderMakerBase::DecoratedReaderMakerBase(
    framework::OpProtoAndCheckerMaker::OpProto* op_proto,
    framework::OpAttrChecker* op_checker)
    : OpProtoAndCheckerMaker(op_proto, op_checker) {
  AddInput("UnderlyingReader",
           "(ReaderHolder) The underlying reader for creating a batch reader.");
  AddOutput("Out", "(ReaderHolder) The created batch reader.");
}

}  // namespace reader
}  // namespace operators
}  // namespace paddle
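A worked example of the shape_concat/ranks encoding documented in FileReaderMakerBase above; the wrapper function is illustrative and not part of the commit:

#include <vector>
#include "paddle/fluid/operators/reader/reader_op_registry.h"

// shape_concat = [2,3,4,5,6] with ranks = [3,2] is split into two DDims:
// the first 3 entries form {2,3,4} and the remaining 2 form {5,6}.
void RestoreShapesExample() {
  std::vector<int> shape_concat = {2, 3, 4, 5, 6};
  std::vector<int> ranks = {3, 2};
  auto shapes = paddle::operators::reader::RestoreShapes(shape_concat, ranks);
  // shapes[0] == make_ddim({2, 3, 4}) and shapes[1] == make_ddim({5, 6})
}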
paddle/fluid/operators/reader/reader_op_registry.h
0 → 100644
//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"

namespace paddle {
namespace operators {
namespace reader {

extern std::vector<framework::DDim> RestoreShapes(
    const std::vector<int>& shape_concat, const std::vector<int>& ranks);

class FileReaderMakerBase : public framework::OpProtoAndCheckerMaker {
 public:
  FileReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
};

class FileReaderInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override;
};

class FileReaderInferVarType : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
                  framework::BlockDesc* block) const override;
};

// general infershape for decorated reader
class DecoratedReaderInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override;
};

// general var type inference for decorated reader
class DecoratedReaderInferVarType : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc& op_desc,
                  framework::BlockDesc* block) const override;
};

class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
 public:
  DecoratedReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
};

}  // namespace reader
}  // namespace operators
}  // namespace paddle

#define REGISTER_FILE_READER_OPERATOR(op_name, ...)                   \
  REGISTER_OPERATOR(op_name, __VA_ARGS__,                             \
                    paddle::operators::reader::FileReaderInferShape,  \
                    paddle::framework::EmptyGradOpMaker,              \
                    paddle::operators::reader::FileReaderInferVarType)

#define REGISTER_DECORATED_READER_OPERATOR(op_name, ...)                   \
  REGISTER_OPERATOR(op_name, __VA_ARGS__,                                  \
                    paddle::operators::reader::DecoratedReaderInferShape,  \
                    paddle::framework::EmptyGradOpMaker,                   \
                    paddle::operators::reader::DecoratedReaderInferVarType)
paddle/fluid/operators/reshape_op.cc
...
@@ -84,6 +84,9 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<std::vector<int>>("shape",
                              "(vector<int>) "
                              "Target shape of reshape operator.");
+   AddAttr<bool>("inplace",
+                 "Change the source tensor's shape without copy memory.")
+       .SetDefault(true);
    AddComment(R"DOC(
Reshape Operator.
...
paddle/fluid/operators/reshape_op.h
...
@@ -26,10 +26,16 @@ class ReshapeKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const {
    auto* out = ctx.Output<framework::Tensor>("Out");
    auto* in = ctx.Input<framework::Tensor>("X");
+   bool inplace = ctx.Attr<bool>("inplace");
    auto out_dims = out->dims();
-   out->mutable_data<T>(ctx.GetPlace());
-   framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
-   out->Resize(out_dims);
+   if (!inplace) {
+     out->mutable_data<T>(ctx.GetPlace());
+     framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
+     out->Resize(out_dims);
+   } else {
+     out->ShareDataWith(*in);
+     out->Resize(out_dims);
+   }
  }
};
...
@@ -40,10 +46,16 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
    d_x->mutable_data<T>(ctx.GetPlace());
+   bool inplace = ctx.Attr<bool>("inplace");
    auto in_dims = d_x->dims();
-   framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
-   d_x->Resize(in_dims);
+   if (!inplace) {
+     framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
+     d_x->Resize(in_dims);
+   } else {
+     d_x->ShareDataWith(*d_out);
+     d_x->Resize(in_dims);
+   }
  }
};
}  // namespace operators
...
paddle/fluid/platform/cudnn_helper.h
...
@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once

#include <vector>
+
+#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
...
@@ -282,5 +284,17 @@ class ScopedPoolingDescriptor {
  DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
};

+inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
+  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (use_cudnn) {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
+  return use_cudnn;
+}
+
}  // namespace platform
}  // namespace paddle
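A sketch of how an operator's kernel-selection code might use the new helper, assuming the op declares a boolean use_cudnn attribute and using the framework's existing LibraryType enum; this wrapper function is illustrative, not part of the commit:

#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/cudnn_helper.h"

// Pick a library tag for kernel dispatch: fall back to the plain kernel unless
// the attribute, the place, and the cuDNN handle all allow cuDNN.
paddle::framework::LibraryType ChooseLibrary(
    const paddle::framework::ExecutionContext& ctx) {
  auto library = paddle::framework::LibraryType::kPlain;
  if (paddle::platform::CanCUDNNBeUsed(ctx)) {
    library = paddle::framework::LibraryType::kCUDNN;
  }
  return library;
}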
paddle/fluid/platform/device_context.cc
...
@@ -33,9 +33,15 @@ DeviceContextPool::DeviceContextPool(
  PADDLE_ENFORCE_GT(places.size(), 0);
  for (size_t i = 0; i < places.size(); i++) {
    if (platform::is_cpu_place(places[i])) {
+#ifdef PADDLE_WITH_MKLDNN
+     device_contexts_.emplace(places[i],
+                              new platform::MKLDNNDeviceContext(
+                                  boost::get<platform::CPUPlace>(places[i])));
+#else
      device_contexts_.emplace(places[i],
                               new platform::CPUDeviceContext(
                                   boost::get<platform::CPUPlace>(places[i])));
+#endif
    } else if (platform::is_gpu_place(places[i])) {
#ifdef PADDLE_WITH_CUDA
      device_contexts_.emplace(places[i],
...
@@ -121,6 +127,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
  SetDeviceId(place_.device);
+ multi_process = GetCUDAMultiProcessors(place_.device);
+ max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
  eigen_stream_.reset(new EigenCudaStreamDevice());
  eigen_stream_->Reinitialize(&stream_, place);
...
@@ -154,6 +162,10 @@ void CUDADeviceContext::Wait() const {
  PADDLE_ENFORCE(cudaGetLastError());
}

+int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
+  return multi_process * max_threads_per_mp;
+}
+
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
  return eigen_device_.get();
}
...
@@ -170,64 +182,38 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; }
#ifdef PADDLE_WITH_MKLDNN
MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
-    : CPUDeviceContext(place), ready_(false) {
-  stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
-  engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
+    : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() {
+  p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>());
}

-template <typename T>
-void MKLDNNDeviceContext::AddElement(const std::string& op_key,
-                                     const T& value) {
-  if (GetElement<T>(op_key)) {
-    return;
-  }
-  GetElementPool<T>().emplace(op_key, std::move(value));
-}
-
-template <typename T>
-const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const {
-  auto it = GetElementPool<T>().find(op_key);
-  return it == GetElementPool<T>().end() ? nullptr : it->second;
-}
-
-template <>
-const std::unordered_map<const std::string, const MKLDNNMemoryPtr,
-                         std::hash<std::string>>&
-MKLDNNDeviceContext::GetElementPool<MKLDNNMemoryPtr>() const {
-  return memory_pool_;
-}
-
-template <>
-const std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
-                         std::hash<std::string>>&
-MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitivePtr>() const {
-  return primitive_pool_;
-}
-
-template <>
-const std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
-                         std::hash<std::string>>&
-MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitiveDescPtr>() const {
-  return primitive_desc_pool_;
-}
-
-void MKLDNNDeviceContext::Execute(bool block) {
-  if (pipeline_.empty()) {
-    return;
-  }
-  ResetStream();
-  stream_->submit(pipeline_).wait(block);
-  ready_ = false;
-  pipeline_.clear();
-}
-
-void MKLDNNDeviceContext::ResetStream() {
-  if (ready_) {
-    return;
-  }
-  // TODO(TJ): change me when mkldnn have specific method to reset this state
-  stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
-  ready_ = true;
+void MKLDNNDeviceContext::SetBlob(const std::string& name,
+                                  std::shared_ptr<void> data) const {
+  std::unordered_map<std::string, std::shared_ptr<void>>* p;
+  p = p_blobs_.get();
+
+  auto it = p->find(name);
+
+  if (it == p->end()) {
+    (*p)[name] = data;  // create new blob
+  } else {
+    it->second = data;  // set data to existing blob
+  }
+
+  return;
+}
+
+std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
+    const std::string& name) const {
+  std::unordered_map<std::string, std::shared_ptr<void>>* p;
+  p = p_blobs_.get();
+
+  auto it = p->find(name);
+
+  if (it != p->end()) {
+    return it->second;
+  }
+
+  return nullptr;
}

#endif
...
paddle/fluid/platform/device_context.h
...
@@ -22,7 +22,7 @@ limitations under the License. */
#endif

#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
+#include <mkldnn.hpp>
#endif

#include "paddle/fluid/platform/enforce.h"
...
@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
  /*! \brief  Return place in the device context. */
  Place GetPlace() const override;

+ /*! \brief  Return the max physical thread count in the device context */
+ int GetMaxPhysicalThreadCount() const;
+
  /*! \brief  Return eigen device in the device context. */
  Eigen::GpuDevice* eigen_device() const;
...
@@ -100,6 +103,9 @@ class CUDADeviceContext : public DeviceContext {
  cudaStream_t stream_;
  cudnnHandle_t cudnn_handle_;
  cublasHandle_t cublas_handle_;
+
+ int multi_process;
+ int max_threads_per_mp;
};

template <>
...
@@ -114,46 +120,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
 public:
  explicit MKLDNNDeviceContext(CPUPlace place);

- /* \brief  Add new element: memory, primitive or primitive desc */
- template <typename T>
- void AddElement(const std::string& op_key, const T& value);
-
- /* \brief  Get existed element: memory, primitive or primitive desc */
- template <typename T>
- const T& GetElement(const std::string& op_key) const;
-
- /* \brief  Get element pool: memory, primitive or primitive desc pool */
- template <typename T>
- const std::unordered_map<const std::string, const T, std::hash<std::string>>&
- GetElementPool() const;
-
- /* \brief  Get the active engine */
- const MKLDNNEngine& engine() const { return *engine_; }
-
- /* \brief  Submit primitive to pipeline */
- void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); }
-
- /*! \brief  Execute all submitted primitives in pipeline */
- void Execute(bool block = true);
+ /* \brief  Get the active engine */
+ const mkldnn::engine& GetEngine() const { return engine_; }
+
+ // Set data to blob (i.e. name/data pair). Create blob if not existing
+ void SetBlob(const std::string& name, std::shared_ptr<void> data) const;

- protected:
-  /*! \brief  Reset the stream to prepare next exectue */
-  void ResetStream();
+ // Find a saved blob. Return nullptr if not found
+ std::shared_ptr<void> GetBlob(const std::string& name) const;

 private:
- std::unordered_map<const std::string, const MKLDNNMemoryPtr,
-                    std::hash<std::string>>
-     memory_pool_;
- std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
-                    std::hash<std::string>>
-     primitive_pool_;
- std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
-                    std::hash<std::string>>
-     primitive_desc_pool_;
- std::vector<MKLDNNPrimitive> pipeline_;
- MKLDNNStreamPtr stream_;
- MKLDNNEnginePtr engine_;
- bool ready_;
+ mkldnn::engine engine_;
+ std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>>
+     p_blobs_;
};

#endif
...
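A sketch of the new blob cache introduced above, assuming a build with PADDLE_WITH_MKLDNN; the key and the int payload are stand-ins for a real cached MKLDNN primitive and are not part of the commit:

#include <memory>
#include <string>
#include "paddle/fluid/platform/device_context.h"

// The blob map caches arbitrary shared_ptr<void> payloads by name, which is
// how MKLDNN primitives can be reused across runs of the same operator.
void CacheExample(const paddle::platform::MKLDNNDeviceContext& dev_ctx) {
  const std::string key = "conv2d_fwd_primitive";    // hypothetical key
  std::shared_ptr<void> blob = dev_ctx.GetBlob(key);  // nullptr if not cached
  if (blob == nullptr) {
    blob = std::make_shared<int>(42);                 // stand-in payload
    dev_ctx.SetBlob(key, blob);                       // cache for later reuse
  }
}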
paddle/fluid/platform/float16.h
...
@@ -20,10 +20,6 @@ limitations under the License. */
#include <cuda.h>
#endif  // PADDLE_WITH_CUDA

-#include "unsupported/Eigen/CXX11/Tensor"
-
-#include "paddle/fluid/platform/hostdevice.h"
-
#ifdef __GNUC__
#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
#else
...
@@ -64,6 +60,18 @@ limitations under the License. */
namespace paddle {
namespace platform {

+// Forward declare float16 for eigen.h
+struct float16;
+
+}  // namespace platform
+}  // namespace paddle
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace platform {
+
// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
// and aligned at least on a 2-byte boundary, which leads to efficient
// memory access of float16 struct and also makes float16 compatible
...
@@ -729,6 +737,22 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
}
#endif

+HOSTDEVICE inline bool(isnan)(const float16& a) {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hisnan(half(a));
+#else
+  return (a.x & 0x7fff) > 0x7c00;
+#endif
+}
+
+HOSTDEVICE inline bool(isinf)(const float16& a) {
+  return (a.x & 0x7fff) == 0x7c00;
+}
+
+HOSTDEVICE inline bool(isfinite)(const float16& a) {
+  return !((isnan)(a)) && !((isinf)(a));
+}
+
}  // namespace platform
}  // namespace paddle
...
@@ -750,3 +774,27 @@ struct is_pod<paddle::platform::float16> {
};

}  // namespace std
+
+namespace Eigen {
+namespace numext {
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isnan)(a);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isinf)(a);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isfinite)(a);
+}
+
+}  // namespace numext
+}  // namespace Eigen
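The new checks are plain bit tests on the stored half-precision pattern: when the exponent bits 0x7c00 are all set, the value is NaN if any mantissa bit is set and infinity otherwise; everything else is finite. A small self-checking sketch (the specific bit patterns are illustrative, not part of the commit):

#include <cassert>
#include "paddle/fluid/platform/float16.h"

int main() {
  using paddle::platform::float16;

  float16 pos_inf;
  pos_inf.x = 0x7c00;  // exponent all ones, zero mantissa -> +inf
  float16 a_nan;
  a_nan.x = 0x7e01;    // exponent all ones, non-zero mantissa -> NaN

  assert((paddle::platform::isinf)(pos_inf) &&
         !(paddle::platform::isnan)(pos_inf));
  assert((paddle::platform::isnan)(a_nan) &&
         !(paddle::platform::isfinite)(a_nan));
  assert((paddle::platform::isfinite)(float16(1.0f)));
  return 0;
}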
paddle/fluid/platform/gpu_info.cc
...
@@ -33,6 +33,26 @@ int GetCUDADeviceCount() {
  return count;
}

+int GetCUDAMultiProcessors(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  int count;
+  PADDLE_ENFORCE(
+      cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id),
+      "cudaDeviceGetAttribute failed in "
+      "paddle::platform::GetCUDAMultiProcessors");
+  return count;
+}
+
+int GetCUDAMaxThreadsPerMultiProcessor(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
+  int count;
+  PADDLE_ENFORCE(
+      cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerMultiProcessor, id),
+      "cudaDeviceGetAttribute failed in "
+      "paddle::platform::GetCUDAMaxThreadsPerMultiProcessor");
+  return count;
+}
+
int GetCurrentDeviceId() {
  int device_id;
  PADDLE_ENFORCE(
...
paddle/fluid/platform/gpu_info.h
...
...
@@ -30,6 +30,12 @@ const std::string kEnvFractionGpuMemoryToUse =
//! Get the total number of GPU devices in system.
int GetCUDADeviceCount();

//! Get the MultiProcessors of the ith GPU.
int GetCUDAMultiProcessors(int i);

//! Get the MaxThreads of each MultiProcessor of the ith GPU.
int GetCUDAMaxThreadsPerMultiProcessor(int i);

//! Get the current GPU device id in system.
int GetCurrentDeviceId();
...
...
paddle/fluid/platform/mkldnn_helper.h
...
...
@@ -16,12 +16,15 @@ limitations under the License. */
#include <mkldnn.hpp>
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace platform {

using MKLDNNStream = mkldnn::stream;
using MKLDNNEngine = mkldnn::engine;
using MKLDNNMemory = mkldnn::memory;
using MKLDNNMemoryDescriptor = mkldnn::memory::desc;
using MKLDNNPrimitive = mkldnn::primitive;
using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>;
...
...
@@ -31,5 +34,17 @@ typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;

inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
                                          mkldnn::memory::data_type data_type,
                                          mkldnn::memory::format format) {
  mkldnn::memory::dims tz = dims;
  return mkldnn::memory::desc({tz}, data_type, format);
}

inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
  bool use_mkldnn = ctx.Attr<bool>("use_mkldnn");
  return use_mkldnn && platform::is_cpu_place(ctx.GetPlace());
}

}  // namespace platform
}  // namespace paddle
python/paddle/fluid/__init__.py
...
...
@@ -28,6 +28,7 @@ import nets
import optimizer
import backward
import regularizer
import average
from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace
...
...
python/paddle/fluid/average.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np

"""
    Class of all kinds of Average.

    All Averages are implemented purely in Python. They do not change
    Paddle's Program, nor do they modify the NN model's configuration;
    they are simply wrappers around Python functions.
"""


def _is_number_(var):
    return isinstance(var, int) or isinstance(var, float) or (isinstance(
        var, np.ndarray) and var.shape == (1, ))


def _is_number_or_matrix_(var):
    return _is_number_(var) or isinstance(var, np.ndarray)


class WeightedAverage(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.numerator = None
        self.denominator = None

    def add(self, value, weight):
        if not _is_number_or_matrix_(value):
            raise ValueError(
                "The 'value' must be a number(int, float) or a numpy ndarray.")
        if not _is_number_(weight):
            raise ValueError("The 'weight' must be a number(int, float).")

        if self.numerator is None or self.denominator is None:
            self.numerator = value * weight
            self.denominator = weight
        else:
            self.numerator += value * weight
            self.denominator += weight

    def eval(self):
        if self.numerator is None or self.denominator is None:
            raise ValueError(
                "There is no data to be averaged in WeightedAverage.")
        return self.numerator / self.denominator
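A minimal usage sketch of the new class (the numbers below are invented for illustration; the `fluid.average` import path is the one added to `__init__.py` above):

import paddle.fluid as fluid

# Accumulate per-batch accuracies weighted by batch size.
acc = fluid.average.WeightedAverage()
acc.add(value=0.50, weight=128)   # batch 1: accuracy 0.50 over 128 samples
acc.add(value=0.75, weight=64)    # batch 2: accuracy 0.75 over 64 samples
print(acc.eval())                 # (0.50 * 128 + 0.75 * 64) / 192 = 0.5833...

acc.reset()                       # calling eval() now would raise ValueError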
python/paddle/fluid/backward.py
...
...
@@ -486,7 +486,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
    params_and_grads = []
    for param in parameters:
        if param not in grad_info_map:
            raise ValueError("param %s is not in map" % param)
            continue
        grad_info = grad_info_map[param]
        grad_block = grad_info[1]
        if not grad_block.has_var(grad_info[0]):
...
...
python/paddle/fluid/evaluator.py
...
...
@@ -108,44 +108,6 @@ class Evaluator(object):
        return state


class Accuracy(Evaluator):
    """
    Average Accuracy for multiple mini-batches.
    """

    def __init__(self, input, label, k=1, **kwargs):
        super(Accuracy, self).__init__("accuracy", **kwargs)
        main_program = self.helper.main_program
        if main_program.current_block().idx != 0:
            raise ValueError("You can only invoke Evaluator in root block")

        self.total = self.create_state(dtype='int64', shape=[1], suffix='total')
        self.correct = self.create_state(
            dtype='int64', shape=[1], suffix='correct')
        total = self.helper.create_tmp_variable(dtype='int')
        correct = self.helper.create_tmp_variable(dtype='int')
        acc = layers.accuracy(
            input=input, label=label, k=k, total=total, correct=correct)
        total = layers.cast(x=total, dtype='int64')
        correct = layers.cast(x=correct, dtype='int64')
        layers.sums(input=[self.total, total], out=self.total)
        layers.sums(input=[self.correct, correct], out=self.correct)

        self.metrics.append(acc)

    def eval(self, executor, eval_program=None):
        if eval_program is None:
            eval_program = Program()
        block = eval_program.current_block()
        with program_guard(main_program=eval_program):
            total = _clone_var_(block, self.total)
            correct = _clone_var_(block, self.correct)
            total = layers.cast(total, dtype='float32')
            correct = layers.cast(correct, dtype='float32')
            out = layers.elementwise_div(x=correct, y=total)
        return np.array(executor.run(eval_program, fetch_list=[out])[0])


class ChunkEvaluator(Evaluator):
    """
    Accumulate counter numbers output by chunk_eval from mini-batches and
...
...
python/paddle/fluid/layers/__init__.py
...
...
@@ -28,6 +28,8 @@ import math_op_patch
from math_op_patch import *
import detection
from detection import *
import metric
from metric import *
from learning_rate_scheduler import *

__all__ = []
...
...
@@ -39,4 +41,5 @@ __all__ += control_flow.__all__
__all__ += ops.__all__
__all__ += device.__all__
__all__ += detection.__all__
__all__ += metric.__all__
__all__ += learning_rate_scheduler.__all__
python/paddle/fluid/layers/metric.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
All layers just related to metric.
"""
from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant
from ..framework import Variable
from ..param_attr import ParamAttr

__all__ = ['accuracy']


def accuracy(input, label, k=1, correct=None, total=None):
    """
    This function computes the accuracy using the input and label.
    The output is the top_k inputs and their indices.
    """
    helper = LayerHelper("accuracy", **locals())
    topk_out = helper.create_tmp_variable(dtype=input.dtype)
    topk_indices = helper.create_tmp_variable(dtype="int64")
    helper.append_op(
        type="top_k",
        inputs={"X": [input]},
        outputs={"Out": [topk_out],
                 "Indices": [topk_indices]},
        attrs={"k": k})
    acc_out = helper.create_tmp_variable(dtype="float32")
    if correct is None:
        correct = helper.create_tmp_variable(dtype="int64")
    if total is None:
        total = helper.create_tmp_variable(dtype="int64")
    helper.append_op(
        type="accuracy",
        inputs={
            "Out": [topk_out],
            "Indices": [topk_indices],
            "Label": [label]
        },
        outputs={
            "Accuracy": [acc_out],
            "Correct": [correct],
            "Total": [total],
        })
    return acc_out
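For reference, a small sketch of calling the relocated layer (the variable names and shapes here are illustrative, not taken from the patch):

import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
predict = fluid.layers.fc(input=image, size=10, act='softmax')

# Top-1 accuracy of the current mini-batch; Correct/Total are created
# internally when they are not passed in.
batch_acc = fluid.layers.accuracy(input=predict, label=label, k=1)

# Expose the sample counter as well, e.g. to weight a pass-level average.
batch_size = fluid.layers.create_tensor(dtype='int64')
weighted_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size)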
python/paddle/fluid/layers/nn.py
...
...
@@ -35,7 +35,6 @@ __all__ = [
    'cos_sim',
    'cross_entropy',
    'square_error_cost',
    'accuracy',
    'chunk_eval',
    'sequence_conv',
    'conv2d',
...
...
@@ -1022,40 +1021,6 @@ def square_error_cost(input, label):
    return square_out


def accuracy(input, label, k=1, correct=None, total=None):
    """
    This function computes the accuracy using the input and label.
    The output is the top_k inputs and their indices.
    """
    helper = LayerHelper("accuracy", **locals())
    topk_out = helper.create_tmp_variable(dtype=input.dtype)
    topk_indices = helper.create_tmp_variable(dtype="int64")
    helper.append_op(
        type="top_k",
        inputs={"X": [input]},
        outputs={"Out": [topk_out],
                 "Indices": [topk_indices]},
        attrs={"k": k})
    acc_out = helper.create_tmp_variable(dtype="float32")
    if correct is None:
        correct = helper.create_tmp_variable(dtype="int64")
    if total is None:
        total = helper.create_tmp_variable(dtype="int64")
    helper.append_op(
        type="accuracy",
        inputs={
            "Out": [topk_out],
            "Indices": [topk_indices],
            "Label": [label]
        },
        outputs={
            "Accuracy": [acc_out],
            "Correct": [correct],
            "Total": [total],
        })
    return acc_out


def chunk_eval(input,
               label,
               chunk_scheme,
...
...
@@ -1146,6 +1111,7 @@ def conv2d(input,
           param_attr=None,
           bias_attr=None,
           use_cudnn=True,
           use_mkldnn=False,
           act=None):
    """
    **Convolution2D Layer**
...
...
@@ -1287,7 +1253,8 @@ def conv2d(input,
            'strides': stride,
            'paddings': padding,
            'groups': groups,
            'use_cudnn': use_cudnn
            'use_cudnn': use_cudnn,
            'use_mkldnn': use_mkldnn
        })

    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
...
...
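A sketch of how the new flag is passed in from Python (tensor names and sizes are illustrative; as `CanMKLDNNBeUsed` above shows, the MKLDNN kernel is only selected when the op actually runs on a CPU place):

import paddle.fluid as fluid

data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
conv = fluid.layers.conv2d(
    input=data,
    num_filters=64,
    filter_size=3,
    act='relu',
    use_cudnn=False,
    use_mkldnn=True)   # forwarded as the 'use_mkldnn' attribute set above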
@@ -3182,7 +3149,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
            data = fluid.layers.data(name='data', shape=[128], dtype='float32')
            label = fluid.layers.data(name='label', shape=[100], dtype='int64')
            fc = fluid.layers.fc(input=data, size=100)
            out = fluid.layers.smooth_l1(logits=fc, label=label)
            out = fluid.layers.smooth_l1(x=fc, y=label)
    """
    helper = LayerHelper('smooth_l1_loss', **locals())
    diff = helper.create_tmp_variable(dtype=x.dtype)
...
...
python/paddle/fluid/nets.py
...
...
@@ -29,14 +29,16 @@ def simple_img_conv_pool(input,
                         act,
                         param_attr=None,
                         pool_type='max',
                         use_cudnn=True):
                         use_cudnn=True,
                         use_mkldnn=False):
    conv_out = layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        param_attr=param_attr,
        act=act,
        use_cudnn=use_cudnn)
        use_cudnn=use_cudnn,
        use_mkldnn=use_mkldnn)

    pool_out = layers.pool2d(
        input=conv_out,
...
...
@@ -58,7 +60,8 @@ def img_conv_group(input,
                   conv_batchnorm_drop_rate=0.0,
                   pool_stride=1,
                   pool_type=None,
                   use_cudnn=True):
                   use_cudnn=True,
                   use_mkldnn=False):
    """
    Image Convolution Group, Used for vgg net.
    """
...
...
@@ -90,7 +93,8 @@ def img_conv_group(input,
            padding=conv_padding[i],
            param_attr=param_attr[i],
            act=local_conv_act,
            use_cudnn=use_cudnn)
            use_cudnn=use_cudnn,
            use_mkldnn=use_mkldnn)

        if conv_with_batchnorm[i]:
            tmp = layers.batch_norm(input=tmp, act=conv_act)
...
...
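The nets helpers simply forward the flag to `layers.conv2d`, so enabling MKLDNN for a conv+pool block looks roughly like this (a sketch; the pooling arguments are assumptions for illustration, not part of this hunk):

import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
conv_pool = fluid.nets.simple_img_conv_pool(
    input=img,
    num_filters=20,
    filter_size=5,
    pool_size=2,
    pool_stride=2,
    act='relu',
    use_mkldnn=True)   # passed straight through to layers.conv2d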
python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
...
...
@@ -122,7 +122,8 @@ avg_cost = fluid.layers.mean(cost)
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
opts = optimizer.minimize(avg_cost)

accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size)

fluid.memory_optimize(fluid.default_main_program())
...
...
@@ -144,13 +145,17 @@ feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
exe.run(fluid.default_startup_program())

i = 0

accuracy = fluid.average.WeightedAverage()
for pass_id in range(PASS_NUM):
    accuracy.reset(exe)
    accuracy.reset()
    for data in train_reader():
        loss, acc = exe.run(fluid.default_main_program(),
                            feed=feeder.feed(data),
                            fetch_list=[avg_cost] + accuracy.metrics)
        pass_acc = accuracy.eval(exe)
        loss, acc, weight = exe.run(fluid.default_main_program(),
                                    feed=feeder.feed(data),
                                    fetch_list=[avg_cost, batch_acc, batch_size])
        accuracy.add(value=acc, weight=weight)
        pass_acc = accuracy.eval()
        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" +
              str(pass_acc))
        # this model is slow, so if we can train two mini batch, we think it works properly.
...
...
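Taken together, the replacement pattern for the removed `fluid.evaluator.Accuracy` is roughly the following (a consolidated sketch of the test above; `predict`, `label`, `avg_cost`, `exe`, `feeder`, `train_reader`, and `PASS_NUM` are assumed to be set up as before):

batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size)

accuracy = fluid.average.WeightedAverage()
for pass_id in range(PASS_NUM):
    accuracy.reset()
    for data in train_reader():
        loss, acc, weight = exe.run(fluid.default_main_program(),
                                    feed=feeder.feed(data),
                                    fetch_list=[avg_cost, batch_acc, batch_size])
        accuracy.add(value=acc, weight=weight)  # weight = samples in this batch
    pass_acc = accuracy.eval()                  # weighted accuracy over the pass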
python/paddle/fluid/tests/unittests/test_conv2d_op.py
...
...
@@ -64,6 +64,7 @@ def conv2d_forward_naive(input, filter, group, conv_param):
class TestConv2dOp(OpTest):
    def setUp(self):
        self.use_cudnn = False
        self.use_mkldnn = False
        self.init_op_type()
        self.init_group()
        self.init_dilation()
...
...
@@ -85,7 +86,8 @@ class TestConv2dOp(OpTest):
            'paddings': self.pad,
            'groups': self.groups,
            'dilations': self.dilations,
            'use_cudnn': self.use_cudnn
            'use_cudnn': self.use_cudnn,
            'use_mkldnn': self.use_mkldnn
        }
        self.outputs = {'Output': output}
...
...
@@ -290,5 +292,25 @@ class TestDepthwiseConv2(TestConv2dOp):
# def init_op_type(self):
# self.op_type = "conv_cudnn"
#----------------Conv2dMKLDNN----------------
class TestMKLDNN(TestConv2dOp):
    def init_op_type(self):
        self.use_mkldnn = True
        self.op_type = "conv2d"


class TestMKLDNNWithPad(TestWithPad):
    def init_op_type(self):
        self.use_mkldnn = True
        self.op_type = "conv2d"


class TestMKLDNNWithStride(TestWithStride):
    def init_op_type(self):
        self.use_mkldnn = True
        self.op_type = "conv2d"


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_profiler.py
This diff is collapsed.
python/paddle/fluid/tests/unittests/test_reshape_op.py
This diff is collapsed.