Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
f044b23f
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
f044b23f
编写于
1月 24, 2018
作者:
Y
Yang Yu
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of github.com:baidu/Paddle into feature/add_demo_for_parallel.do
上级
d815ec23
0c5cbc4e
变更
8
展开全部
隐藏空白更改
内联
并排
Showing
8 changed file
with
450 addition
and
126 deletion
+450
-126
doc/api/v2/fluid/nets.rst
doc/api/v2/fluid/nets.rst
+3
-3
paddle/gserver/tests/test_LayerGrad.cpp
paddle/gserver/tests/test_LayerGrad.cpp
+26
-18
paddle/operators/reshape_op.cc
paddle/operators/reshape_op.cc
+2
-6
python/paddle/v2/fluid/io.py
python/paddle/v2/fluid/io.py
+9
-2
python/paddle/v2/fluid/layers/nn.py
python/paddle/v2/fluid/layers/nn.py
+162
-73
python/paddle/v2/fluid/nets.py
python/paddle/v2/fluid/nets.py
+150
-24
python/paddle/v2/fluid/tests/test_iou_similarity_op.py
python/paddle/v2/fluid/tests/test_iou_similarity_op.py
+0
-0
python/paddle/v2/fluid/tests/test_multihead_attention.py
python/paddle/v2/fluid/tests/test_multihead_attention.py
+98
-0
未找到文件。
doc/api/v2/fluid/nets.rst
浏览文件 @
f044b23f
...
...
@@ -26,8 +26,8 @@ glu
:noindex:
dot_product_attention
---------------------
.. autofunction:: paddle.v2.fluid.nets.dot_product_attention
scaled_
dot_product_attention
---------------------
-------
.. autofunction:: paddle.v2.fluid.nets.
scaled_
dot_product_attention
:noindex:
paddle/gserver/tests/test_LayerGrad.cpp
浏览文件 @
f044b23f
...
...
@@ -991,8 +991,10 @@ TEST(Layer, SequenceLastInstanceLayer) {
"seqlastins"
,
"non-seq"
,
-
1
);
// hasSubseq seqlastins to non-seq
testDegradeLayer
(
true
,
"seqlastins"
,
"seq"
,
-
1
);
// hasSubseq seqlastins to seq
testDegradeLayer
(
true
,
"seqlastins"
,
"seq"
,
-
1
);
// hasSubseq seqlastins to seq
}
TEST
(
Layer
,
AverageLayer
)
{
...
...
@@ -1001,8 +1003,10 @@ TEST(Layer, AverageLayer) {
"average"
,
"non-seq"
,
5
);
// seq average to a shorten seq, stride window = 5
testDegradeLayer
(
true
,
"average"
,
"non-seq"
,
-
1
);
// hasSubseq average to non-seq
testDegradeLayer
(
true
,
"average"
,
"non-seq"
,
-
1
);
// hasSubseq average to non-seq
testDegradeLayer
(
true
,
"average"
,
"seq"
,
-
1
);
// hasSubseq average to seq
}
...
...
@@ -1287,8 +1291,9 @@ TEST(Layer, PoolLayer) {
testPoolLayer
(
"cudnn-avg-pool"
,
/* trans= */
false
,
/* useGpu= */
true
);
testPoolLayer2
(
"cudnn-max-pool"
,
/* trans= */
false
,
/* useGpu= */
true
);
testPoolLayer2
(
"cudnn-avg-pool"
,
/* trans= */
false
,
/* useGpu= */
true
);
testPoolLayer2
(
"cudnn-avg-incl-pad-pool"
,
/* trans= */
false
,
/* useGpu= */
true
);
testPoolLayer2
(
"cudnn-avg-incl-pad-pool"
,
/* trans= */
false
,
/* useGpu= */
true
);
testPoolLayer
(
"max-pool-with-mask"
,
/* trans= */
false
,
/* useGpu= */
true
);
#endif
}
...
...
@@ -2431,18 +2436,21 @@ TEST(Layer, test3DDeConvLayer) {
}
TEST
(
Layer
,
ScaleShiftLayer
)
{
const
size_t
batchSize
=
16
;
const
size_t
size
=
32
;
TestConfig
config
;
config
.
layerConfig
.
set_type
(
"scale_shift"
);
config
.
layerConfig
.
set_size
(
size
);
config
.
biasSize
=
1
;
config
.
inputDefs
.
push_back
(
{
INPUT_DATA
,
"input"
,
/* dim= */
size
,
/* paraSize= */
1
});
config
.
layerConfig
.
add_inputs
();
for
(
auto
useGpu
:
{
false
,
true
})
{
testLayerGrad
(
config
,
"scale_shift"
,
batchSize
,
false
,
useGpu
,
false
);
}
// FIXME: Disable ScaleShiftLayer because it is not stable.
// https://github.com/PaddlePaddle/Paddle/issues/7781
return
;
// const size_t batchSize = 16;
// const size_t size = 32;
// TestConfig config;
// config.layerConfig.set_type("scale_shift");
// config.layerConfig.set_size(size);
// config.biasSize = 1;
// config.inputDefs.push_back(
// {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
// config.layerConfig.add_inputs();
// for (auto useGpu : {false, true}) {
// testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
// }
}
TEST
(
Layer
,
ScaleSubRegionLayer
)
{
...
...
paddle/operators/reshape_op.cc
浏览文件 @
f044b23f
...
...
@@ -90,14 +90,10 @@ Reshape Operator.
Reshape Input(X) into the shape specified by Attr(shape).
An example:
Given a 2-D tensor X with 2 rows and 2 columns
[[1, 2], [3, 4]]
Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]]
and target shape = [1, 4], the reshape operator will transform
the tensor X into a 2-D tensor:
[[1, 2, 3, 4]]
the tensor X into a 2-D tensor: [[1, 2, 3, 4]]
One dimension in the target shape can be set -1, representing that its
size is unknown. In this case, the real dimension will be inferred from
...
...
python/paddle/v2/fluid/io.py
浏览文件 @
f044b23f
...
...
@@ -15,6 +15,7 @@
import
os
import
cPickle
as
pickle
from
paddle.v2.fluid.evaluator
import
Evaluator
from
paddle.v2.fluid.framework
import
Program
,
Parameter
,
default_main_program
,
Variable
from
.
import
core
...
...
@@ -187,8 +188,14 @@ def get_inference_program(target_vars, main_program=None):
main_program
=
default_main_program
()
if
not
isinstance
(
target_vars
,
list
):
target_vars
=
[
target_vars
]
pruned_program
=
main_program
.
prune
(
targets
=
target_vars
)
vars
=
[]
for
var
in
target_vars
:
if
isinstance
(
var
,
Evaluator
):
vars
.
append
(
var
.
states
)
vars
.
append
(
var
.
metrics
)
else
:
vars
.
append
(
var
)
pruned_program
=
main_program
.
prune
(
targets
=
vars
)
inference_program
=
pruned_program
.
inference_optimize
()
return
inference_program
...
...
python/paddle/v2/fluid/layers/nn.py
浏览文件 @
f044b23f
此差异已折叠。
点击以展开。
python/paddle/v2/fluid/nets.py
浏览文件 @
f044b23f
...
...
@@ -11,14 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
layers
__all__
=
[
"simple_img_conv_pool"
,
"sequence_conv_pool"
,
"glu"
,
"dot_product_attention"
,
"
scaled_
dot_product_attention"
,
]
...
...
@@ -160,7 +159,11 @@ def glu(input, dim=-1):
return
out
def
dot_product_attention
(
querys
,
keys
,
values
):
def
scaled_dot_product_attention
(
queries
,
keys
,
values
,
num_heads
=
1
,
dropout_rate
=
0.
):
"""
The dot-product attention.
...
...
@@ -174,39 +177,162 @@ def dot_product_attention(querys, keys, values):
.. math::
Attention(Q, K, V)=
softmax(QK^\mathrm{T})V
Attention(Q, K, V)= softmax(QK^\mathrm{T})V
Refer to `Attention Is All You Need
<https://arxiv.org/pdf/1706.03762.pdf>`_.
Note that batch data containing sequences with different lengths is not
supported by this because of the (batch) matrix multiplication.
Args:
query (Variable): The input variable which is a Tensor or LoDTensor.
key (Variable): The input variable which is a Tensor or LoDTensor.
value (Variable): The input variable which is a Tensor or LoDTensor.
queries (Variable): The input variable which should be a 3-D Tensor.
keys (Variable): The input variable which should be a 3-D Tensor.
values (Variable): The input variable which should be a 3-D Tensor.
num_heads (int): Head number to compute the scaled dot product
attention. Default value is 1.
dropout_rate (float): The dropout rate to drop the attention weight.
Default value is 0.
Returns:
tuple: The Tensor variables representing the output and attention scores.
Variable: A 3-D Tensor computed by multi-head scaled dot product
attention.
Raises:
ValueError: If input queries, keys, values are not 3-D Tensors.
NOTE:
1. When num_heads > 1, three linear projections are learned respectively
to map input queries, keys and values into queries', keys' and values'.
queries', keys' and values' have the same shapes with queries, keys
and values.
2. When num_heads == 1, scaled_dot_product_attention has no learnable
parameters.
Examples:
.. code-block:: python
# Suppose q, k, v are
tensor variable
s with the following shape:
# Suppose q, k, v are
Tensor
s with the following shape:
# q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
out, attn_scores = fluid.nets.dot_product_attention(q, k, v)
out.shape # [3, 5, 10]
attn_scores.shape # [3, 5, 6
]
contexts = fluid.nets.scaled_dot_product_attention(q, k, v)
contexts.shape # [3, 5, 10
]
"""
assert
keys
.
shape
[
-
2
]
==
values
.
shape
[
-
2
],
'The shapes of keys and values mismatch.'
assert
querys
.
shape
[
-
1
]
==
keys
.
shape
[
-
1
],
'The shapes of querys and keys mismatch.'
product
=
layers
.
matmul
(
x
=
querys
,
y
=
keys
,
transpose_y
=
True
)
attn_scores
=
layers
.
reshape
(
if
not
(
len
(
queries
.
shape
)
==
len
(
keys
.
shape
)
==
len
(
values
.
shape
)
==
3
):
raise
ValueError
(
"Inputs quries, keys and values should all be 3-D tensors."
)
if
queries
.
shape
[
-
1
]
!=
keys
.
shape
[
-
1
]:
raise
ValueError
(
"The hidden size of queries and keys should be the same."
)
if
keys
.
shape
[
-
2
]
!=
values
.
shape
[
-
2
]:
raise
ValueError
(
"The max sequence length in query batch and in key batch "
"should be the same."
)
if
keys
.
shape
[
-
1
]
%
num_heads
!=
0
:
raise
ValueError
(
"The hidden size of keys (%d) must be divisible "
"by the number of attention heads (%d)."
%
(
keys
.
shape
[
-
1
],
num_heads
))
if
values
.
shape
[
-
1
]
%
num_heads
!=
0
:
raise
ValueError
(
"The hidden size of values (%d) must be divisible "
"by the number of attention heads (%d)."
%
(
values
.
shape
[
-
1
],
num_heads
))
def __compute_qkv(queries, keys, values, num_heads):
    """
    Optionally pass queries, keys and values through linear projections.

    Args:
        queries(Tensor): a 3-D input Tensor.
        keys(Tensor): a 3-D input Tensor.
        values(Tensor): a 3-D input Tensor.
        num_heads(int): The number of heads. The inputs are returned
                        untouched when num_heads == 1; otherwise each of
                        them goes through its own fc layer.

    Returns:
        tuple: (queries', keys', values'). Each projection is an fc layer
               whose output size equals the last dimension of its input.
    """
    # Single-head attention adds no fc layers: hand the inputs straight back.
    if num_heads == 1:
        return queries, keys, values

    def _project(tensor):
        # The fc output size is the tensor's own last dimension, so the
        # hidden size is not changed by the projection.
        return layers.fc(
            input=tensor, size=tensor.shape[-1], num_flatten_dims=2)

    # Projections are created in the order queries, keys, values.
    return _project(queries), _project(keys), _project(values)
def __split_heads(x, num_heads):
    """
    Split the last dimension of the input tensor x into two dimensions
    (heads and per-head size) and move the head axis in front of the
    sequence axis.

    Args:
        x(Tensor): a 3-D input Tensor.
        num_heads(int): The number of heads.

    Returns:
        Tensor: x itself when num_heads == 1; otherwise the reshaped Tensor
                transposed with perm [0, 2, 1, 3], whose last dimension is
                m // num_heads, where m is the size of the last dimension
                of x.
    """
    # Nothing to split for a single head.
    if num_heads == 1:
        return x

    # Reshape the 3-D input [batch_size, max_sequence_length, hidden_dim]
    # into the 4-D layout
    # [batch_size, max_sequence_length, num_heads, hidden_size_per_head].
    leading_dims = list(x.shape[:-1])
    size_per_head = x.shape[-1] // num_heads
    split = layers.reshape(
        x=x, shape=leading_dims + [num_heads, size_per_head])

    # Permute the dimensions into
    # [batch_size, num_heads, max_sequence_length, hidden_size_per_head].
    return layers.transpose(x=split, perm=[0, 2, 1, 3])
def __combine_heads(x):
    """
    Merge the last two dimensions of the input tensor x into one dimension,
    undoing the per-head split.

    Args:
        x(Tensor): a 4-D input Tensor with shape
                   [bs, num_heads, max_sequence_length, hidden_dim].
                   A 3-D Tensor is accepted and returned unchanged.

    Returns:
        Tensor: a Tensor with shape
                [bs, max_sequence_length, num_heads * hidden_dim].

    Raises:
        ValueError: If x is neither a 3-D nor a 4-D Tensor.
    """
    rank = len(x.shape)
    # A 3-D input has no separate head axis, so there is nothing to merge.
    if rank == 3:
        return x
    if rank != 4:
        raise ValueError("Input(x) should be a 4-D Tensor.")

    # Swap the head and sequence axes so each position's heads are adjacent.
    trans_x = layers.transpose(x, perm=[0, 2, 1, 3])

    # Build a concrete list of ints for the target shape. `map(int, ...)`
    # only yields a list on Python 2; on Python 3 it is a lazy iterator.
    dims = trans_x.shape
    merged_shape = [
        int(dim) for dim in (dims[0], dims[1], dims[2] * dims[3])
    ]
    return layers.reshape(x=trans_x, shape=merged_shape)
q
,
k
,
v
=
__compute_qkv
(
queries
,
keys
,
values
,
num_heads
)
q
=
__split_heads
(
q
,
num_heads
)
k
=
__split_heads
(
k
,
num_heads
)
v
=
__split_heads
(
v
,
num_heads
)
key_dim_per_head
=
keys
.
shape
[
-
1
]
//
num_heads
scaled_q
=
layers
.
scale
(
x
=
q
,
scale
=
key_dim_per_head
**-
0.5
)
product
=
layers
.
matmul
(
x
=
k
,
y
=
scaled_q
,
transpose_y
=
True
)
weights
=
layers
.
reshape
(
x
=
layers
.
reshape
(
x
=
product
,
shape
=
[
-
1
,
product
.
shape
[
-
1
]],
act
=
'softmax'
),
x
=
product
,
shape
=
[
-
1
,
product
.
shape
[
-
1
]],
act
=
"softmax"
),
shape
=
product
.
shape
)
out
=
layers
.
matmul
(
attn_scores
,
values
)
return
out
,
attn_scores
if
dropout_rate
:
weights
=
layers
.
dropout
(
x
,
dropout_prob
=
dropout_rate
,
is_test
=
False
)
ctx_multiheads
=
layers
.
matmul
(
weights
,
v
)
return
__combine_heads
(
ctx_multiheads
)
python/paddle/v2/fluid/tests/test_iou_similarity_op.py
100755 → 100644
浏览文件 @
f044b23f
文件模式从 100755 更改为 100644
python/paddle/v2/fluid/tests/test_multihead_attention.py
0 → 100644
浏览文件 @
f044b23f
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
paddle.v2.fluid
as
fluid
import
paddle.v2.fluid.core
as
core
import
numpy
as
np
class TestMultiheadAttention(unittest.TestCase):
    """Smoke test for fluid.nets.scaled_dot_product_attention.

    Builds a program that runs 8-head attention over random inputs,
    appends the backward pass and executes it on every available place.
    No numeric assertions are made on the fetched result.
    """

    def gen_random_input(self):
        """Generate random input data.
        """
        # batch_size, max_sequence_length, hidden dimension
        shape = (3, 13, 16)
        self.input_shape = shape
        self.queries = np.random.random(size=shape).astype("float32")
        self.keys = np.random.random(size=shape).astype("float32")

    def set_program(self):
        """Build the test program.
        """
        # Both data layers share every argument except their name.
        data_kwargs = dict(
            shape=self.input_shape,
            dtype="float32",
            append_batch_size=False)

        queries = fluid.layers.data(name="queries", **data_kwargs)
        queries.stop_gradient = False
        keys = fluid.layers.data(name="keys", **data_kwargs)
        keys.stop_gradient = False

        # The keys are reused as the values.
        contexts = fluid.nets.scaled_dot_product_attention(
            queries=queries,
            keys=keys,
            values=keys,
            num_heads=8,
            dropout_rate=0.)

        # Reduce to a single value so a backward pass can be appended.
        loss = fluid.layers.reduce_sum(contexts, dim=None)
        fluid.backward.append_backward(loss=loss)

        self.fetch_list = [contexts]

    def run_program(self):
        """Run the test program.
        """
        # Always run on CPU; add the first GPU when compiled with GPU support.
        places = [core.CPUPlace()]
        if core.is_compile_gpu():
            places.append(core.CUDAPlace(0))

        for place in places:
            self.set_inputs(place)
            executor = fluid.Executor(place)

            executor.run(fluid.default_startup_program())
            # The result of the last place that ran is kept.
            self.op_output = executor.run(
                fluid.default_main_program(),
                feed=self.inputs,
                fetch_list=self.fetch_list,
                return_numpy=True)

    def set_inputs(self, place):
        """Set the randomly generated data to the test program.
        """
        self.inputs = {}

        query_tensor = fluid.Tensor()
        query_tensor.set(self.queries, place)

        key_tensor = fluid.Tensor()
        key_tensor.set(self.keys, place)

        self.inputs["keys"] = key_tensor
        self.inputs["queries"] = query_tensor

    def test_multihead_attention(self):
        """Generate inputs, build the program and run it end to end."""
        self.gen_random_input()

        self.set_program()
        self.run_program()

        # FIXME(caoying): add more meaningful unit tests.
# Run the test case through unittest when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录