机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 4e07f259

Authored Jul 23, 2018 by Qiao Longfei
Committed by sneaxiy, Jul 23, 2018

Merge pull request #12295 from jacquesqiao/speedup-reduce-sum-grad-op

Speedup reduce sum grad op

Parents: eec412b2, 273f7375

Showing 5 changed files with 239 additions and 64 deletions (+239 -64):
paddle/fluid/operators/reduce_sum_op.cc                  +10  -9
paddle/fluid/operators/reduce_sum_op.h                   +59  -1
python/paddle/fluid/layers/io.py                         +102 -32
python/paddle/fluid/layers/nn.py                         +2   -2
python/paddle/fluid/tests/unittests/test_reduce_op.py    +66  -20
paddle/fluid/operators/reduce_sum_op.cc

@@ -23,12 +23,13 @@ REGISTER_OP_CPU_KERNEL(
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
     ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
                       ops::SumFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                           float, ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
-                          ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
-                          ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
-                          ops::SumGradFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    reduce_sum_grad, ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext,
+                                              float, ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, double,
+                             ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int,
+                             ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int64_t,
+                             ops::SumGradFunctor>);
paddle/fluid/operators/reduce_sum_op.h

@@ -14,11 +14,69 @@
 #pragma once

 #include <vector>
 #include "paddle/fluid/operators/reduce_op.h"

 namespace paddle {
 namespace operators {

+// use for loop to speed up Eigen broadcast. 4 times faster than broadcast
+template <typename DeviceContext, typename T, typename Functor>
+class ReduceSumGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto dims = context.Attr<std::vector<int>>("dim");
+    if (context.GetPlace().type() == typeid(platform::CPUPlace) &&
+        dims.size() == 1) {
+      auto* input0 = context.Input<Tensor>("X");
+      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+      output->mutable_data<T>(context.GetPlace());
+      const auto* input2_d = input2->data<T>();
+      auto* output_d = output->data<T>();
+
+      // handle reduce_all
+      if (input2->dims().size() == 1 && input2->dims()[0] == 1) {
+        for (int64_t i = 0; i < framework::product(input0->dims()); ++i) {
+          output_d[i] = input2_d[0];
+        }
+        return;
+      }
+
+      // handle reduce by one dimension
+      int reduce_dim_index = dims[0];
+      if (reduce_dim_index < 0) {
+        reduce_dim_index += input0->dims().size();
+      }
+
+      auto& input_dim = input0->dims();
+      int64_t before_dim = 1;
+      for (int i = 0; i < reduce_dim_index; ++i) {
+        before_dim *= input_dim[i];
+      }
+      int64_t reduce_dim = input_dim[reduce_dim_index];
+      int64_t after_dim = 1;
+      for (int i = reduce_dim_index + 1; i < input_dim.size(); ++i) {
+        after_dim *= input_dim[i];
+      }
+      for (int64_t i = 0; i < before_dim; ++i) {
+        for (int64_t j = 0; j < reduce_dim; ++j) {
+          for (int64_t k = 0; k < after_dim; ++k) {
+            output_d[i * reduce_dim * after_dim + j * after_dim + k] =
+                input2_d[i * after_dim + k];
+          }
+        }
+      }
+      return;
+    }
+    // default use Eigen broadcast
+    ReduceGradKernel<DeviceContext, T, Functor> kernel;
+    kernel.Compute(context);
+  }
+};
+
 struct SumFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
   void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
@@ -31,7 +89,7 @@ struct SumGradFunctor {
             typename DY, typename Dim>
   void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim);
+    dx->device(place) = dy->eval().broadcast(dim);
   }
 };
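The new ReduceSumGradKernel replaces the Eigen broadcast on CPU with plain loops: the input shape is decomposed into a before × reduce × after product, and the upstream gradient is copied back along the reduced axis by index arithmetic. The NumPy sketch below mirrors that arithmetic; the helper name reduce_sum_grad_1dim is hypothetical and is not part of the commit.

import numpy as np

def reduce_sum_grad_1dim(x_shape, d_out, reduce_dim_index):
    # Broadcast d_out back to x_shape along one reduced axis, using the same
    # before/reduce/after decomposition as the C++ kernel above.
    if reduce_dim_index < 0:
        reduce_dim_index += len(x_shape)
    before_dim = int(np.prod(x_shape[:reduce_dim_index]))
    reduce_dim = x_shape[reduce_dim_index]
    after_dim = int(np.prod(x_shape[reduce_dim_index + 1:]))

    d_out_flat = d_out.reshape(-1)
    d_x = np.empty(before_dim * reduce_dim * after_dim, dtype=d_out.dtype)
    for i in range(before_dim):
        for j in range(reduce_dim):
            for k in range(after_dim):
                d_x[i * reduce_dim * after_dim + j * after_dim + k] = \
                    d_out_flat[i * after_dim + k]
    return d_x.reshape(x_shape)

# The triple loop is equivalent to broadcasting the upstream gradient
# (reduced axis kept as size 1) back to the input shape:
x_shape = (2, 3, 4)
d_out = np.random.random((2, 1, 4))
assert np.allclose(reduce_sum_grad_1dim(x_shape, d_out, 1),
                   np.broadcast_to(d_out, x_shape))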
python/paddle/fluid/layers/io.py

@@ -456,52 +456,122 @@ def py_reader(capacity,
               name=None,
               use_double_buffer=True):
     """
-    Create a reader and blocking queue for data feeding in Python
+    Create a python reader for data feeding in Python

-    This layer returns a Reader Variable and a BlockingQueue.
-    The BlockingQueue provides `push()` method to push a `LoDTensorArray`
-    object into the queue in Python side. In C++ side, the Reader
-    Variable would invoke `pop()` method of the queue to retrieve the
-    feeding data. The process of feeding data in Python side and fetching
-    data in C++ side can run in parallel. The BlockingQueue should be closed
-    using `close()` method when unused.
+    This layer returns a Reader Variable.
+    The Reader provides :code:`decorate_paddle_reader` and
+    :code:`decorate_tensor_provider` to set a Python generator as the data
+    source in Python side. When :code:`Executor::Run()` is invoked in C++
+    side, the data from the generator would be read automatically. Unlike
+    :code:`DataFeeder.feed()`, the data reading process and
+    :code:`Executor::Run()` process can run in parallel using
+    :code:`py_reader`. The :code:`start()` method of the Reader should be
+    called when each pass begins, while the :code:`reset()` method should be
+    called when the pass ends and :code:`fluid.core.EOFException` raises.
+    Note that :code:`Program.clone()` method cannot clone :code:`py_reader`.

     Args:
-       use_double_buffer(bool): Whether use double buffer or not.
-       capacity(int): The maximum capacity of the BlockingQueue.
+       capacity(int): The buffer capacity maintained by :code:`py_reader`.
        shapes(list|tuple): List of tuples which declaring data shapes.
        dtypes(list|tuple): List of strs which declaring data type.
        lod_levels(list|tuple): List of ints which declaring data lod_level.
        name(basestring): The prefix Python queue name and Reader name. None will
            be generated automatically.
+       use_double_buffer(bool): Whether use double buffer or not.

     Returns:
-       tuple(Variable, BlockingQueue):
-       A Reader Variable from which we can get feeding data.
-       A BlockingQueue object for data feeding.
+       Variable: A Reader from which we can get feeding data.

     Examples:

-        .. code-block:: python
+        1. The basic usage of :code:`py_reader` is as follows:

-            reader, queue = fluid.layers.py_reader(
-                                            capacity=10,
-                                            shapes=[[-1,3,224,224], [-1,1]],
-                                            dtypes=['float32', 'int64'])
-            # Via the reader, we can use 'read_file' layer to get data:
-            image, label = fluid.layers.read_file(reader)
-
-            # Via the blocking queue, we can feed data using threads
-            def feed_data(queue, feed_images, feed_labels):
-                for feed_image, feed_label in zip(feed_images, feed_labels):
-                    data = core.LoDTensorArray()
-                    data.append(feed_image)
-                    data.append(feed_label)
-                    queue.push(data)
-
-            thread = threading.Thread(target=feed_data, args=(queue, feed_images, feed_labels))
-            thread.start()
+        >>> import paddle.v2
+        >>> import paddle.fluid as fluid
+        >>> import paddle.dataset.mnist as mnist
+        >>>
+        >>> reader = fluid.layers.py_reader(capacity=64,
+        >>>                                 shapes=[(-1,3,224,224), (-1,1)],
+        >>>                                 dtypes=['float32', 'int64'])
+        >>> reader.decorate_paddle_reader(
+        >>>     paddle.v2.reader.shuffle(paddle.batch(mnist.train())
+        >>>
+        >>> img, label = fluid.layers.read_file(reader)
+        >>> loss = network(img, label) # some network definition
+        >>>
+        >>> fluid.Executor(fluid.CUDAPlace(0)).run(fluid.default_startup_program())
+        >>>
+        >>> exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
+        >>> for epoch_id in range(10):
+        >>>     reader.start()
+        >>>     try:
+        >>>         while True:
+        >>>             exe.run(fetch_list=[loss.name])
+        >>>     except fluid.core.EOFException:
+        >>>         reader.reset()
+
+        2. When training and testing are both performed, two different
+           :code:`py_reader` should be created with different names, e.g.:
+
+        >>> import paddle.v2
+        >>> import paddle.fluid as fluid
+        >>> import paddle.dataset.mnist as mnist
+        >>>
+        >>> def network(reader):
+        >>>     img, label = fluid.layers.read_file(reader)
+        >>>     # Here, we omitted the network definition
+        >>>     return loss
+        >>>
+        >>> train_reader = fluid.layers.py_reader(capacity=64,
+        >>>                                       shapes=[(-1,3,224,224), (-1,1)],
+        >>>                                       dtypes=['float32', 'int64'],
+        >>>                                       name='train_reader')
+        >>> train_reader.decorate_paddle_reader(
+        >>>     paddle.v2.reader.shuffle(paddle.batch(mnist.train())
+        >>>
+        >>> test_reader = fluid.layers.py_reader(capacity=32,
+        >>>                                      shapes=[(-1,3,224,224), (-1,1)],
+        >>>                                      dtypes=['float32', 'int64'],
+        >>>                                      name='test_reader')
+        >>> test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512))
+        >>>
+        >>> # Create train_main_prog and train_startup_prog
+        >>> train_main_prog = fluid.Program()
+        >>> train_startup_prog = fluid.Program()
+        >>> with fluid.program_guard(train_main_prog, train_startup_prog):
+        >>>     # Use fluid.unique_name.guard() to share parameters with test program
+        >>>     with fluid.unique_name.guard():
+        >>>         train_loss = network(train_reader) # some network definition
+        >>>         adam = fluid.optimizer.Adam(learning_rate=0.01)
+        >>>         adam.minimize(loss)
+        >>>
+        >>> # Create test_main_prog and test_startup_prog
+        >>> test_main_prog = fluid.Program()
+        >>> test_startup_prog = fluid.Program()
+        >>> with fluid.program_guard(test_main_prog, test_startup_prog):
+        >>>     # Use fluid.unique_name.guard() to share parameters with train program
+        >>>     with fluid.unique_name.guard():
+        >>>         test_loss = network(test_reader)
+        >>>
+        >>> fluid.Executor(fluid.CUDAPlace(0)).run(train_startup_prog)
+        >>> fluid.Executor(fluid.CUDAPlace(0)).run(test_startup_prog)
+        >>>
+        >>> train_exe = fluid.ParallelExecutor(use_cuda=True,
+        >>>                 loss_name=train_loss.name, main_program=train_main_prog)
+        >>> test_exe = fluid.ParallelExecutor(use_cuda=True,
+        >>>                 loss_name=test_loss.name, main_program=test_main_prog)
+        >>> for epoch_id in range(10):
+        >>>     try:
+        >>>         while True:
+        >>>             train_exe.run(fetch_list=[train_loss.name])
+        >>>     except fluid.core.EOFException:
+        >>>         train_reader.reset()
+        >>>
+        >>>     try:
+        >>>         while True:
+        >>>             test_exe.run(fetch_list=[test_loss.name])
+        >>>     except fluid.core.EOFException:
+        >>>         test_reader.reset()
     """
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
     shape_concat = []
python/paddle/fluid/layers/nn.py

@@ -2961,7 +2961,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
             # x is a Tensor variable with following elements:
             #    [[0.2, 0.3, 0.5, 0.9]
             #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the correspending output tensor.
+            # Each example is followed by the corresponding output tensor.
             fluid.layers.reduce_sum(x)  # [3.5]
             fluid.layers.reduce_sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
             fluid.layers.reduce_sum(x, dim=-1)  # [1.9, 1.6]
@@ -2970,7 +2970,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
             # x is a Tensor variable with shape [2, 2, 2] and elements as below:
             #      [[[1, 2], [3, 4]],
             #      [[5, 6], [7, 8]]]
-            # Each example is followed by the correspending output tensor.
+            # Each example is followed by the corresponding output tensor.
             fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26]
             fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20]
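The outputs quoted in this docstring follow the usual sum-over-axes semantics and can be reproduced with NumPy, whose sum behaves the same way for these cases (an illustrative check, not part of this change):

import numpy as np

# Reproduce the documented reduce_sum examples with NumPy's sum.
x = np.array([[0.2, 0.3, 0.5, 0.9],
              [0.1, 0.2, 0.6, 0.7]])
assert np.isclose(x.sum(), 3.5)                          # reduce_sum(x)
assert np.allclose(x.sum(axis=0), [0.3, 0.5, 1.1, 1.6])  # dim=0
assert np.allclose(x.sum(axis=-1), [1.9, 1.6])           # dim=-1

y = np.array([[[1, 2], [3, 4]],
              [[5, 6], [7, 8]]])
assert np.array_equal(y.sum(axis=(1, 2)), [10, 26])      # dim=[1, 2]
assert np.array_equal(y.sum(axis=(0, 1)), [16, 20])      # dim=[0, 1]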
python/paddle/fluid/tests/unittests/test_reduce_op.py

@@ -89,15 +89,11 @@ class TestProdOp(OpTest):
         self.check_grad(['X'], 'Out')


-class TestKeepDimReduce(OpTest):
+class Test1DReduce(OpTest):
     def setUp(self):
         self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': [-2], 'keep_dim': True}
-        self.outputs = {
-            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
-                                        keepdims=True)
-        }
+        self.inputs = {'X': np.random.random(20).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=0)}

     def test_check_output(self):
         self.check_output()
@@ -106,32 +102,82 @@ class TestKeepDimReduce(OpTest):
         self.check_grad(['X'], 'Out')


-class Test1DReduce(OpTest):
+class Test2DReduce0(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random(20).astype("float64")}
+        self.attrs = {'dim': [0]}
+        self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
         self.outputs = {'Out': self.inputs['X'].sum(axis=0)}

-    def test_check_output(self):
-        self.check_output()

-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+class Test2DReduce1(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1]}
+        self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }


-class TestReduceAll(OpTest):
+class Test3DReduce0(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class Test3DReduce1(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [2]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class Test3DReduce2(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [-2]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class Test3DReduce3(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1, 2]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
+class TestKeepDimReduce(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [1], 'keep_dim': True}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
+                                        keepdims=self.attrs['keep_dim'])
+        }
+
+
+class TestReduceAll(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
         self.attrs = {'reduce_all': True}
         self.outputs = {'Out': self.inputs['X'].sum()}

-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-

 ## reduction in multi dims
 class TestReduceMeanOpMultiAxises(OpTest):