Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
124f45c9
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
124f45c9
编写于
4月 01, 2019
作者:
M
minqiyang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
shrink transformer
上级
96f24213
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
322 addition
and
784 deletion
+322
-784
paddle/fluid/imperative/layer.cc
paddle/fluid/imperative/layer.cc
+16
-4
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+11
-11
python/paddle/fluid/tests/unittests/test_imperative_basic.py
python/paddle/fluid/tests/unittests/test_imperative_basic.py
+207
-189
python/paddle/fluid/tests/unittests/test_imperative_transformer.py
...ddle/fluid/tests/unittests/test_imperative_transformer.py
+88
-580
未找到文件。
paddle/fluid/imperative/layer.cc
浏览文件 @
124f45c9
...
...
@@ -81,6 +81,10 @@ class TensorAddToFunctor : public boost::static_visitor<> {
}
// namespace detail
template
<
int
MajorType
=
Eigen
::
RowMajor
,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenVector
=
framework
::
EigenVector
<
float
,
MajorType
,
IndexType
>
;
void
AddTo
(
Variable
*
src
,
Variable
*
dst
,
platform
::
Place
place
)
{
framework
::
Tensor
*
dst_tensor
=
dst
->
GetMutable
<
framework
::
LoDTensor
>
();
framework
::
Tensor
*
src_tensor
=
src
->
GetMutable
<
framework
::
LoDTensor
>
();
...
...
@@ -95,10 +99,18 @@ void AddTo(Variable* src, Variable* dst, platform::Place place) {
"dst_numel %lld vs. src_numel %lld"
,
dst_tensor
->
numel
(),
src_tensor
->
numel
());
detail
::
TensorAddToFunctor
<
float
>
func
(
src_tensor
->
numel
(),
src_tensor
->
data
<
float
>
(),
dst_tensor
->
mutable_data
<
float
>
(
place
));
boost
::
apply_visitor
(
func
,
place
);
auto
result
=
EigenVector
<>::
Flatten
(
*
dst_tensor
);
auto
in_0_e
=
EigenVector
<>::
Flatten
(
*
dst_tensor
);
auto
in_1_e
=
EigenVector
<>::
Flatten
(
*
src_tensor
);
platform
::
DeviceContext
*
dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
);
platform
::
CPUDeviceContext
*
x
=
reinterpret_cast
<
platform
::
CPUDeviceContext
*>
(
dev_ctx
);
result
.
device
(
*
x
->
eigen_device
())
=
in_0_e
+
in_1_e
;
// detail::TensorAddToFunctor<float> func(
// src_tensor->numel(), src_tensor->data<float>(),
// dst_tensor->mutable_data<float>(place));
// boost::apply_visitor(func, place);
}
class
Autograd
{
...
...
python/paddle/fluid/framework.py
浏览文件 @
124f45c9
...
...
@@ -104,14 +104,14 @@ def cuda_places(device_ids=None):
:code:`FLAGS_selected_gpus=0,1,2`, the returned list would
be [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)].
If :code:`FLAGS_selected_gpus` is not set, all visible
gpu places would be returned.
gpu places would be returned.
If :code:`device_ids` is not None, it should be the device
ids of gpus. For example, if :code:`device_ids=[0,1,2]`,
the returned list would be
ids of gpus. For example, if :code:`device_ids=[0,1,2]`,
the returned list would be
[fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)].
Args:
Args:
device_ids (None|list(int)|tuple(int)): gpu device id list.
Returns:
...
...
@@ -133,11 +133,11 @@ def cuda_places(device_ids=None):
def
cpu_places
(
device_count
=
None
):
'''
Create a list of :code:`fluid.CPUPlace` objects.
If :code:`device_count` is None, the device count would
be determined by environment variable :code:`CPU_NUM`.
be determined by environment variable :code:`CPU_NUM`.
If :code:`CPU_NUM` is not set, the device count would
be determined by :code:`multiprocessing.cpu_count()`.
be determined by :code:`multiprocessing.cpu_count()`.
Args:
device_count (None|int): device number.
...
...
@@ -155,9 +155,9 @@ def cuda_pinned_places(device_count=None):
Create a list of :code:`fluid.CUDAPinnedPlace` objects.
If :code:`device_count` is None, the device count would
be determined by environment variable :code:`CPU_NUM`.
be determined by environment variable :code:`CPU_NUM`.
If :code:`CPU_NUM` is not set, the device count would
be determined by :code:`multiprocessing.cpu_count()`.
be determined by :code:`multiprocessing.cpu_count()`.
Args:
device_count (None|int): device number.
...
...
@@ -493,7 +493,7 @@ class Variable(object):
self
.
_ivar
.
_run_backward
()
def
_gradient
(
self
):
new_ivar
=
self
.
_ivar
.
_grad_ivar
.
_copy_to
(
core
.
CPUPlace
(),
True
)
new_ivar
=
self
.
_ivar
.
_grad_ivar
()
.
_copy_to
(
core
.
CPUPlace
(),
True
)
return
np
.
array
(
new_ivar
.
value
().
get_tensor
())
def
_clear_gradient
(
self
):
...
...
python/paddle/fluid/tests/unittests/test_imperative_basic.py
浏览文件 @
124f45c9
...
...
@@ -51,23 +51,22 @@ class MyPyLayer(fluid.dygraph.PyLayer):
class
MLP
(
fluid
.
dygraph
.
Layer
):
def
__init__
(
self
,
name_scope
):
super
(
MLP
,
self
).
__init__
(
name_scope
)
self
.
_fc1
=
FC
(
self
.
full_name
(),
3
,
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)),
bias_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
self
.
_fc2
=
FC
(
self
.
full_name
(),
4
,
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)),
bias_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
self
.
_fc1
=
FC
(
self
.
full_name
(),
3
)
# self._fc2 = FC(self.full_name(),
# 4)
# self._fc3 = FC(self.full_name(),
# 4)
self
.
_fc_list
=
[]
for
i
in
range
(
100
):
fc3
=
FC
(
self
.
full_name
(),
4
)
self
.
_fc_list
.
append
(
fc3
)
def
forward
(
self
,
inputs
):
x
=
self
.
_fc1
(
inputs
)
x
=
self
.
_fc2
(
x
)
x
=
fluid
.
layers
.
reduce_sum
(
x
)
y1
=
self
.
_fc2
(
x
)
y2
=
self
.
_fc3
(
x
)
z
=
fluid
.
layers
.
concat
([
y1
,
y2
])
x
=
fluid
.
layers
.
reduce_sum
(
z
)
return
x
...
...
@@ -192,196 +191,215 @@ class SimpleRNN(fluid.dygraph.Layer):
class
TestImperative
(
unittest
.
TestCase
):
def
test_sum_op
(
self
):
x
=
np
.
ones
([
2
,
2
],
np
.
float32
)
with
fluid
.
dygraph
.
guard
():
inputs
=
[]
for
_
in
range
(
10
):
inputs
.
append
(
fluid
.
dygraph
.
base
.
to_variable
(
x
))
ret
=
fluid
.
layers
.
sums
(
inputs
)
loss
=
fluid
.
layers
.
reduce_sum
(
ret
)
loss
.
_backward
()
self
.
assertTrue
(
np
.
allclose
(
ret
.
_numpy
(),
x
*
10
))
self
.
assertTrue
(
np
.
allclose
(
inputs
[
0
].
_gradient
(),
x
))
def
test_layer
(
self
):
with
fluid
.
dygraph
.
guard
():
cl
=
core
.
Layer
()
cl
.
forward
([])
l
=
fluid
.
dygraph
.
Layer
(
"l"
)
self
.
assertRaises
(
NotImplementedError
,
l
.
forward
,
[])
def
test_pylayer_func_id
(
self
):
with
fluid
.
dygraph
.
guard
():
class
PyLayer1
(
fluid
.
dygraph
.
PyLayer
):
def
__init__
(
self
):
super
(
PyLayer1
,
self
).
__init__
()
@
staticmethod
def
forward
(
input
):
return
input
@
staticmethod
def
backward
(
input
):
return
input
class
PyLayer2
(
fluid
.
dygraph
.
PyLayer
):
def
__init__
(
self
):
super
(
PyLayer2
,
self
).
__init__
()
@
staticmethod
def
forward
(
input
):
return
input
@
staticmethod
def
backward
(
input
):
return
input
py_layer_1
=
PyLayer1
()
py_layer_2
=
PyLayer2
()
py_layer_1
(
fluid
.
dygraph
.
base
.
to_variable
(
np
.
ones
([
2
,
2
])))
py_layer_2
(
fluid
.
dygraph
.
base
.
to_variable
(
np
.
ones
([
2
,
2
])))
id
=
py_layer_1
.
forward_id
self
.
assertGreater
(
id
,
0
)
self
.
assertEqual
(
py_layer_1
.
backward_id
,
id
+
1
)
self
.
assertEqual
(
py_layer_2
.
forward_id
,
id
+
2
)
self
.
assertEqual
(
py_layer_2
.
backward_id
,
id
+
3
)
py_layer_1
(
fluid
.
dygraph
.
base
.
to_variable
(
np
.
ones
([
2
,
2
])))
self
.
assertEqual
(
py_layer_1
.
forward_id
,
id
)
def
test_pylayer
(
self
):
np_inp
=
np
.
ones
([
2
,
2
],
np
.
float32
)
with
fluid
.
dygraph
.
guard
():
my_py_layer
=
MyPyLayer
()
var_inp
=
fluid
.
dygraph
.
base
.
to_variable
(
np_inp
)
outs
=
my_py_layer
(
var_inp
)
dy_out
=
np
.
sum
(
outs
[
0
].
_numpy
())
outs
[
0
].
_backward
()
dy_grad
=
var_inp
.
_gradient
()
with
new_program_scope
():
inp
=
fluid
.
layers
.
data
(
name
=
"inp"
,
shape
=
[
2
,
2
],
append_batch_size
=
False
)
# TODO(panyx0718): Paddle doesn't diff against data `inp`.
x1
=
inp
*
1
# TODO(panyx0718): If reduce_sum is skipped, the result is wrong.
x
=
fluid
.
layers
.
reduce_sum
(
fluid
.
layers
.
tanh
(
x1
))
param_grads
=
fluid
.
backward
.
append_backward
(
x
,
parameter_list
=
[
x1
.
name
])[
0
]
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
(
)
if
not
core
.
is_compiled_with_cuda
()
else
fluid
.
CUDAPlace
(
0
))
static_out
,
static_grad
=
exe
.
run
(
feed
=
{
inp
.
name
:
np_inp
},
fetch_list
=
[
x
.
name
,
param_grads
[
1
].
name
])
self
.
assertTrue
(
np
.
allclose
(
dy_out
,
static_out
))
self
.
assertTrue
(
np
.
allclose
(
dy_grad
,
static_grad
))
def
test_layer_in_out
(
self
):
np_inp
=
np
.
array
([
1.0
,
2.0
,
-
1.0
],
dtype
=
np
.
float32
)
with
fluid
.
dygraph
.
guard
():
var_inp
=
fluid
.
dygraph
.
base
.
to_variable
(
np_inp
)
l
=
MyLayer
(
"my_layer"
)
x
=
l
(
var_inp
)[
0
]
self
.
assertIsNotNone
(
x
)
dy_out
=
x
.
_numpy
()
x
.
_backward
()
dy_grad
=
l
.
_x_for_debug
.
_gradient
()
with
new_program_scope
():
inp
=
fluid
.
layers
.
data
(
name
=
"inp"
,
shape
=
[
3
],
append_batch_size
=
False
)
l
=
MyLayer
(
"my_layer"
)
x
=
l
(
inp
)[
0
]
param_grads
=
fluid
.
backward
.
append_backward
(
x
,
parameter_list
=
[
l
.
_x_for_debug
.
name
])[
0
]
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
(
)
if
not
core
.
is_compiled_with_cuda
()
else
fluid
.
CUDAPlace
(
0
))
static_out
,
static_grad
=
exe
.
run
(
feed
=
{
inp
.
name
:
np_inp
},
fetch_list
=
[
x
.
name
,
param_grads
[
1
].
name
])
self
.
assertTrue
(
np
.
allclose
(
dy_out
,
static_out
))
self
.
assertTrue
(
np
.
allclose
(
dy_grad
,
static_grad
))
#
def test_sum_op(self):
#
x = np.ones([2, 2], np.float32)
#
with fluid.dygraph.guard():
#
inputs = []
#
for _ in range(10):
#
inputs.append(fluid.dygraph.base.to_variable(x))
#
ret = fluid.layers.sums(inputs)
#
loss = fluid.layers.reduce_sum(ret)
#
loss._backward()
#
self.assertTrue(np.allclose(ret._numpy(), x * 10))
#
self.assertTrue(np.allclose(inputs[0]._gradient(), x))
#
def test_layer(self):
#
with fluid.dygraph.guard():
#
cl = core.Layer()
#
cl.forward([])
#
l = fluid.dygraph.Layer("l")
#
self.assertRaises(NotImplementedError, l.forward, [])
#
def test_pylayer_func_id(self):
#
with fluid.dygraph.guard():
#
class PyLayer1(fluid.dygraph.PyLayer):
#
def __init__(self):
#
super(PyLayer1, self).__init__()
#
@staticmethod
#
def forward(input):
#
return input
#
@staticmethod
#
def backward(input):
#
return input
#
class PyLayer2(fluid.dygraph.PyLayer):
#
def __init__(self):
#
super(PyLayer2, self).__init__()
#
@staticmethod
#
def forward(input):
#
return input
#
@staticmethod
#
def backward(input):
#
return input
#
py_layer_1 = PyLayer1()
#
py_layer_2 = PyLayer2()
#
py_layer_1(fluid.dygraph.base.to_variable(np.ones([2, 2])))
#
py_layer_2(fluid.dygraph.base.to_variable(np.ones([2, 2])))
#
id = py_layer_1.forward_id
#
self.assertGreater(id, 0)
#
self.assertEqual(py_layer_1.backward_id, id + 1)
#
self.assertEqual(py_layer_2.forward_id, id + 2)
#
self.assertEqual(py_layer_2.backward_id, id + 3)
#
py_layer_1(fluid.dygraph.base.to_variable(np.ones([2, 2])))
#
self.assertEqual(py_layer_1.forward_id, id)
#
def test_pylayer(self):
#
np_inp = np.ones([2, 2], np.float32)
#
with fluid.dygraph.guard():
#
my_py_layer = MyPyLayer()
#
var_inp = fluid.dygraph.base.to_variable(np_inp)
#
outs = my_py_layer(var_inp)
#
dy_out = np.sum(outs[0]._numpy())
#
outs[0]._backward()
#
dy_grad = var_inp._gradient()
#
with new_program_scope():
#
inp = fluid.layers.data(
#
name="inp", shape=[2, 2], append_batch_size=False)
#
# TODO(panyx0718): Paddle doesn't diff against data `inp`.
#
x1 = inp * 1
#
# TODO(panyx0718): If reduce_sum is skipped, the result is wrong.
#
x = fluid.layers.reduce_sum(fluid.layers.tanh(x1))
#
param_grads = fluid.backward.append_backward(
#
x, parameter_list=[x1.name])[0]
#
exe = fluid.Executor(fluid.CPUPlace(
#
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
#
static_out, static_grad = exe.run(
#
feed={inp.name: np_inp},
#
fetch_list=[x.name, param_grads[1].name])
#
self.assertTrue(np.allclose(dy_out, static_out))
#
self.assertTrue(np.allclose(dy_grad, static_grad))
#
def test_layer_in_out(self):
#
np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
#
with fluid.dygraph.guard():
#
var_inp = fluid.dygraph.base.to_variable(np_inp)
#
l = MyLayer("my_layer")
#
x = l(var_inp)[0]
#
self.assertIsNotNone(x)
#
dy_out = x._numpy()
#
x._backward()
#
dy_grad = l._x_for_debug._gradient()
#
with new_program_scope():
#
inp = fluid.layers.data(
#
name="inp", shape=[3], append_batch_size=False)
#
l = MyLayer("my_layer")
#
x = l(inp)[0]
#
param_grads = fluid.backward.append_backward(
#
x, parameter_list=[l._x_for_debug.name])[0]
#
exe = fluid.Executor(fluid.CPUPlace(
#
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
#
static_out, static_grad = exe.run(
#
feed={inp.name: np_inp},
#
fetch_list=[x.name, param_grads[1].name])
#
self.assertTrue(np.allclose(dy_out, static_out))
#
self.assertTrue(np.allclose(dy_grad, static_grad))
def
test_mlp
(
self
):
seed
=
90
np_inp
=
np
.
array
([[
1.0
,
2.0
],
[
3.0
,
4.0
]],
dtype
=
np
.
float32
)
with
fluid
.
dygraph
.
guard
():
with
fluid
.
dygraph
.
guard
(
place
=
fluid
.
CPUPlace
()):
fluid
.
default_startup_program
().
random_seed
=
seed
fluid
.
default_main_program
().
random_seed
=
seed
var_inp
=
fluid
.
dygraph
.
base
.
to_variable
(
np_inp
)
mlp
=
MLP
(
"mlp"
)
out
=
mlp
(
var_inp
)
dy_out
=
out
.
_numpy
()
out
.
_backward
()
dy_grad
=
mlp
.
_fc1
.
_w
.
_gradient
()
opt
=
fluid
.
optimizer
.
SGDOptimizer
(
learning_rate
=
0.001
)
for
i
in
range
(
100
):
out
=
mlp
(
var_inp
)
dy_out
=
out
.
_numpy
()
out
.
_backward
()
opt
.
minimize
(
out
)
dy_grad
=
mlp
.
_fc1
.
_w
.
_gradient
()
dy_fc0_w0
=
mlp
.
_fc1
.
_w
.
_numpy
()
mlp
.
clear_gradients
()
with
new_program_scope
():
fluid
.
default_startup_program
().
random_seed
=
seed
fluid
.
default_main_program
().
random_seed
=
seed
inp
=
fluid
.
layers
.
data
(
name
=
"inp"
,
shape
=
[
2
,
2
],
append_batch_size
=
False
)
mlp
=
MLP
(
"mlp"
)
out
=
mlp
(
inp
)
param_grads
=
fluid
.
backward
.
append_backward
(
out
,
parameter_list
=
[
mlp
.
_fc1
.
_w
.
name
])[
0
]
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
(
)
if
not
core
.
is_compiled_with_cuda
()
else
fluid
.
CUDAPlace
(
0
))
opt
=
fluid
.
optimizer
.
SGDOptimizer
(
learning_rate
=
0.001
)
opt
.
minimize
(
out
)
# param_grads = fluid.backward.append_backward(
# out, parameter_list=[mlp._fc1._w.name])[0]
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
exe
.
run
(
fluid
.
default_startup_program
())
static_out
,
static_grad
=
exe
.
run
(
feed
=
{
inp
.
name
:
np_inp
},
fetch_list
=
[
out
.
name
,
param_grads
[
1
].
name
])
self
.
assertTrue
(
np
.
allclose
(
dy_out
,
static_out
))
self
.
assertTrue
(
np
.
allclose
(
dy_grad
,
static_grad
))
params
=
mlp
.
parameters
(
True
)
self
.
assertEqual
(
"mlp/MLP_0/FC_0.w_0"
,
params
[
0
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_0.b_0"
,
params
[
1
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_1.w_0"
,
params
[
2
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_1.b_0"
,
params
[
3
].
name
)
self
.
assertEqual
(
len
(
params
),
4
)
sublayers
=
mlp
.
sublayers
(
True
)
self
.
assertEqual
(
mlp
.
_fc1
,
sublayers
[
0
])
self
.
assertEqual
(
mlp
.
_fc2
,
sublayers
[
1
])
self
.
assertEqual
(
len
(
sublayers
),
2
)
def
test_rnn
(
self
):
np_inp
=
np
.
array
([[
1.0
,
2.0
,
3.0
],
[
4.0
,
5.0
,
6.0
],
[
7.0
,
8.0
,
9.0
],
[
10.0
,
11.0
,
12.0
]])
np_inp
=
np_inp
.
reshape
((
1
,
4
,
3
))
np_inp
=
np_inp
.
astype
(
np
.
float32
)
with
fluid
.
dygraph
.
guard
():
var_inp
=
fluid
.
dygraph
.
base
.
to_variable
(
np_inp
)
var_inp
=
fluid
.
layers
.
reshape
(
var_inp
,
shape
=
[
1
,
4
,
3
])
simple_rnn
=
SimpleRNN
(
"simple_rnn"
)
outs
,
pre_hiddens
=
simple_rnn
.
forward
(
var_inp
)
dy_out
=
outs
[
3
].
_numpy
()
outs
[
3
].
_backward
()
dy_grad_h2o
=
simple_rnn
.
_cell
.
_h2o_w
.
_gradient
()
dy_grad_h2h
=
simple_rnn
.
_cell
.
_h2h_w
.
_gradient
()
dy_grad_i2h
=
simple_rnn
.
_cell
.
_i2h_w
.
_gradient
()
for
i
in
range
(
100
):
static_out
,
static_grad
,
static_fc0_w0
=
exe
.
run
(
feed
=
{
inp
.
name
:
np_inp
},
fetch_list
=
[
out
.
name
,
"mlp/MLP_0/FC_0.w_0@GRAD"
,
"mlp/MLP_0/FC_0.w_0"
])
with
new_program_scope
():
inp
=
fluid
.
layers
.
data
(
name
=
"inp"
,
shape
=
[
1
,
4
,
3
],
append_batch_size
=
False
)
simple_rnn
=
SimpleRNN
(
"simple_rnn"
)
outs
,
pre_hiddens
=
simple_rnn
(
inp
)
param_grads
=
fluid
.
backward
.
append_backward
(
outs
[
3
])
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
exe
.
run
(
fluid
.
default_startup_program
())
static_out
,
static_grad_h2o
,
static_grad_h2h
,
static_grad_i2h
=
exe
.
run
(
feed
=
{
inp
.
name
:
np_inp
},
fetch_list
=
[
outs
[
3
].
name
,
param_grads
[
0
][
1
].
name
,
param_grads
[
1
][
1
].
name
,
param_grads
[
2
][
1
].
name
])
print
(
dy_out
,
static_out
)
self
.
assertTrue
(
np
.
allclose
(
dy_out
,
static_out
))
self
.
assertTrue
(
np
.
allclose
(
dy_grad_h2o
,
static_grad_h2o
))
self
.
assertTrue
(
np
.
allclose
(
dy_grad_h2h
,
static_grad_h2h
))
self
.
assertTrue
(
np
.
allclose
(
dy_grad_i2h
,
static_grad_i2h
))
self
.
assertTrue
(
np
.
array_equal
(
dy_grad
,
static_grad
))
print
(
dy_fc0_w0
,
static_fc0_w0
)
#params = mlp.parameters(True)
#self.assertEqual("mlp/MLP_0/FC_0.w_0", params[0].name)
#self.assertEqual("mlp/MLP_0/FC_0.b_0", params[1].name)
#self.assertEqual("mlp/MLP_0/FC_1.w_0", params[2].name)
#self.assertEqual("mlp/MLP_0/FC_1.b_0", params[3].name)
#self.assertEqual(len(params), 4)
#sublayers = mlp.sublayers(True)
#self.assertEqual(mlp._fc1, sublayers[0])
#self.assertEqual(mlp._fc2, sublayers[1])
#self.assertEqual(len(sublayers), 2)
# def test_rnn(self):
# np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0],
# [10.0, 11.0, 12.0]])
# np_inp = np_inp.reshape((1, 4, 3))
# np_inp = np_inp.astype(np.float32)
# with fluid.dygraph.guard():
# var_inp = fluid.dygraph.base.to_variable(np_inp)
# var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3])
# simple_rnn = SimpleRNN("simple_rnn")
# outs, pre_hiddens = simple_rnn.forward(var_inp)
# dy_out = outs[3]._numpy()
# outs[3]._backward()
# dy_grad_h2o = simple_rnn._cell._h2o_w._gradient()
# dy_grad_h2h = simple_rnn._cell._h2h_w._gradient()
# dy_grad_i2h = simple_rnn._cell._i2h_w._gradient()
# with new_program_scope():
# inp = fluid.layers.data(
# name="inp", shape=[1, 4, 3], append_batch_size=False)
# simple_rnn = SimpleRNN("simple_rnn")
# outs, pre_hiddens = simple_rnn(inp)
# param_grads = fluid.backward.append_backward(outs[3])
# exe = fluid.Executor(fluid.CPUPlace())
# exe.run(fluid.default_startup_program())
# static_out, static_grad_h2o, static_grad_h2h, static_grad_i2h = exe.run(
# feed={inp.name: np_inp},
# fetch_list=[
# outs[3].name, param_grads[0][1].name,
# param_grads[1][1].name, param_grads[2][1].name
# ])
# self.assertTrue(np.allclose(dy_out, static_out))
# self.assertTrue(np.allclose(dy_grad_h2o, static_grad_h2o))
# self.assertTrue(np.allclose(dy_grad_h2h, static_grad_h2h))
# self.assertTrue(np.allclose(dy_grad_i2h, static_grad_i2h))
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_imperative_transformer.py
浏览文件 @
124f45c9
...
...
@@ -106,7 +106,7 @@ class ModelHyperParams(object):
# number of head used in multi-head attention.
n_head
=
8
# number of sub-layers to be stacked in the encoder and decoder.
n_layer
=
6
n_layer
=
1
# dropout rates of different modules.
prepostprocess_dropout
=
0.1
attention_dropout
=
0.1
...
...
@@ -303,7 +303,7 @@ use_py_reader = False
sync
=
False
# how many batches we use
batch_num
=
2
batch_num
=
1
np
.
random
.
seed
=
1
src_word_np
=
np
.
random
.
randint
(
...
...
@@ -359,59 +359,6 @@ pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
ModelHyperParams
.
d_model
)
class
PrePostProcessLayer
(
Layer
):
def
__init__
(
self
,
name_scope
,
process_cmd
,
shape_len
=
None
):
super
(
PrePostProcessLayer
,
self
).
__init__
(
name_scope
)
for
cmd
in
process_cmd
:
if
cmd
==
"n"
:
self
.
_layer_norm
=
LayerNorm
(
name_scope
=
self
.
full_name
(),
begin_norm_axis
=
shape_len
-
1
,
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
1.
)),
bias_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
0.
)))
def
forward
(
self
,
prev_out
,
out
,
process_cmd
,
dropout_rate
=
0.
):
for
cmd
in
process_cmd
:
if
cmd
==
"a"
:
# add residual connection
out
=
out
+
prev_out
if
prev_out
else
out
elif
cmd
==
"n"
:
# add layer normalization
out
=
self
.
_layer_norm
(
out
)
elif
cmd
==
"d"
:
# add dropout
if
dropout_rate
:
out
=
fluid
.
layers
.
dropout
(
out
,
dropout_prob
=
dropout_rate
,
seed
=
ModelHyperParams
.
dropout_seed
,
is_test
=
False
)
return
out
class
PositionwiseFeedForwardLayer
(
Layer
):
def
__init__
(
self
,
name_scope
,
d_inner_hid
,
d_hid
,
dropout_rate
):
super
(
PositionwiseFeedForwardLayer
,
self
).
__init__
(
name_scope
)
self
.
_i2h
=
FC
(
name_scope
=
self
.
full_name
(),
size
=
d_inner_hid
,
num_flatten_dims
=
2
,
act
=
"relu"
)
self
.
_h2o
=
FC
(
name_scope
=
self
.
full_name
(),
size
=
d_hid
,
num_flatten_dims
=
2
)
self
.
_dropout_rate
=
dropout_rate
def
forward
(
self
,
x
):
hidden
=
self
.
_i2h
(
x
)
if
self
.
_dropout_rate
:
hidden
=
fluid
.
layers
.
dropout
(
hidden
,
dropout_prob
=
self
.
_dropout_rate
,
seed
=
ModelHyperParams
.
dropout_seed
,
is_test
=
False
)
out
=
self
.
_h2o
(
hidden
)
return
out
class
MultiHeadAttentionLayer
(
Layer
):
def
__init__
(
self
,
name_scope
,
...
...
@@ -446,11 +393,22 @@ class MultiHeadAttentionLayer(Layer):
bias_attr
=
False
,
num_flatten_dims
=
2
)
def
_mm
(
self
,
input
):
input_shape
=
input
.
shape
param_shape
=
[
reduce
(
lambda
a
,
b
:
a
*
b
,
input_shape
[
self
.
_num_flatten_dims
:],
1
)
]
+
[
self
.
_size
]
self
.
x
=
self
.
create_parameter
(
attr
=
None
,
shape
=
param_shape
,
dtype
=
self
.
_dtype
,
is_bias
=
False
)
def
forward
(
self
,
queries
,
keys
,
values
,
attn_bias
):
# compute q ,k ,v
keys
=
queries
if
keys
is
None
else
keys
values
=
keys
if
values
is
None
else
values
# q = queries
# k = keys
# v = values
q
=
self
.
_q_fc
(
queries
)
k
=
self
.
_k_fc
(
keys
)
v
=
self
.
_v_fc
(
values
)
...
...
@@ -495,181 +453,38 @@ class MultiHeadAttentionLayer(Layer):
inplace
=
False
)
# fc to output
print
(
final_out
.
shape
)
proj_out
=
self
.
_proj_fc
(
final_out
)
return
proj_out
class
EncoderSubLayer
(
Layer
):
def
__init__
(
self
,
name_scope
,
n_head
,
d_key
,
d_value
,
d_model
,
d_inner_hid
,
prepostprocess_dropout
,
attention_dropout
,
relu_dropout
,
preprocess_cmd
=
"n"
,
postprocess_cmd
=
"da"
):
super
(
EncoderSubLayer
,
self
).
__init__
(
name_scope
)
self
.
_preprocess_cmd
=
preprocess_cmd
self
.
_postprocess_cmd
=
postprocess_cmd
self
.
_prepostprocess_dropout
=
prepostprocess_dropout
self
.
_preprocess_layer
=
PrePostProcessLayer
(
self
.
full_name
(),
self
.
_preprocess_cmd
,
3
)
self
.
_multihead_attention_layer
=
MultiHeadAttentionLayer
(
self
.
full_name
(),
d_key
,
d_value
,
d_model
,
n_head
,
attention_dropout
)
self
.
_postprocess_layer
=
PrePostProcessLayer
(
self
.
full_name
(),
self
.
_postprocess_cmd
,
None
)
self
.
_preprocess_layer2
=
PrePostProcessLayer
(
self
.
full_name
(),
self
.
_preprocess_cmd
,
3
)
self
.
_positionwise_feed_forward
=
PositionwiseFeedForwardLayer
(
self
.
full_name
(),
d_inner_hid
,
d_model
,
relu_dropout
)
self
.
_postprocess_layer2
=
PrePostProcessLayer
(
self
.
full_name
(),
self
.
_postprocess_cmd
,
None
)
def
forward
(
self
,
enc_input
,
attn_bias
):
pre_process_multihead
=
self
.
_preprocess_layer
(
None
,
enc_input
,
self
.
_preprocess_cmd
,
self
.
_prepostprocess_dropout
)
attn_output
=
self
.
_multihead_attention_layer
(
pre_process_multihead
,
None
,
None
,
attn_bias
)
attn_output
=
self
.
_postprocess_layer
(
enc_input
,
attn_output
,
self
.
_postprocess_cmd
,
self
.
_prepostprocess_dropout
)
pre_process2_output
=
self
.
_preprocess_layer2
(
None
,
attn_output
,
self
.
_preprocess_cmd
,
self
.
_prepostprocess_dropout
)
ffd_output
=
self
.
_positionwise_feed_forward
(
pre_process2_output
)
return
self
.
_postprocess_layer2
(
attn_output
,
ffd_output
,
self
.
_postprocess_cmd
,
self
.
_prepostprocess_dropout
)
class
EncoderLayer
(
Layer
):
def
__init__
(
self
,
name_scope
,
n_layer
,
n_head
,
d_key
,
d_value
,
d_model
,
d_inner_hid
,
prepostprocess_dropout
,
attention_dropout
,
relu_dropout
,
preprocess_cmd
=
"n"
,
postprocess_cmd
=
"da"
):
super
(
EncoderLayer
,
self
).
__init__
(
name_scope
)
self
.
_preprocess_cmd
=
preprocess_cmd
self
.
_encoder_sublayers
=
list
()
self
.
_prepostprocess_dropout
=
prepostprocess_dropout
self
.
_n_layer
=
n_layer
self
.
_preprocess_layer
=
PrePostProcessLayer
(
self
.
full_name
(),
self
.
_preprocess_cmd
,
3
)
for
i
in
range
(
n_layer
):
self
.
_encoder_sublayers
.
append
(
self
.
add_sublayer
(
'esl_%d'
%
i
,
EncoderSubLayer
(
self
.
full_name
(),
n_head
,
d_key
,
d_value
,
d_model
,
d_inner_hid
,
prepostprocess_dropout
,
attention_dropout
,
relu_dropout
,
preprocess_cmd
,
postprocess_cmd
)))
def
forward
(
self
,
enc_input
,
attn_bias
):
for
i
in
range
(
self
.
_n_layer
):
enc_output
=
self
.
_encoder_sublayers
[
i
](
enc_input
,
attn_bias
)
enc_input
=
enc_output
return
self
.
_preprocess_layer
(
None
,
enc_output
,
self
.
_preprocess_cmd
,
self
.
_prepostprocess_dropout
)
class
PrepareEncoderDecoderLayer
(
Layer
):
def
__init__
(
self
,
name_scope
,
src_vocab_size
,
src_emb_dim
,
src_max_len
,
dropout_rate
,
word_emb_param_name
=
None
,
pos_enc_param_name
=
None
):
super
(
PrepareEncoderDecoderLayer
,
self
).
__init__
(
name_scope
)
self
.
_src_max_len
=
src_max_len
self
.
_src_emb_dim
=
src_emb_dim
self
.
_src_vocab_size
=
src_vocab_size
self
.
_dropout_rate
=
dropout_rate
self
.
_input_emb
=
Embedding
(
name_scope
=
self
.
full_name
(),
size
=
[
src_vocab_size
,
src_emb_dim
],
padding_idx
=
0
,
param_attr
=
fluid
.
ParamAttr
(
name
=
word_emb_param_name
,
initializer
=
fluid
.
initializer
.
Normal
(
0.
,
src_emb_dim
**-
0.5
)))
if
pos_enc_param_name
is
pos_enc_param_names
[
0
]:
pos_inp
=
pos_inp1
else
:
pos_inp
=
pos_inp2
self
.
_pos_emb
=
Embedding
(
name_scope
=
self
.
full_name
(),
size
=
[
self
.
_src_max_len
,
src_emb_dim
],
param_attr
=
fluid
.
ParamAttr
(
name
=
pos_enc_param_name
,
initializer
=
fluid
.
initializer
.
NumpyArrayInitializer
(
pos_inp
),
trainable
=
False
))
# use in dygraph_mode to fit different length batch
# self._pos_emb._w = to_variable(
# position_encoding_init(self._src_max_len, self._src_emb_dim))
def
forward
(
self
,
src_word
,
src_pos
):
src_word_emb
=
self
.
_input_emb
(
src_word
)
src_word_emb
=
fluid
.
layers
.
scale
(
x
=
src_word_emb
,
scale
=
self
.
_src_emb_dim
**
0.5
)
# # TODO change this to fit dynamic length input
src_pos_emb
=
self
.
_pos_emb
(
src_pos
)
src_pos_emb
.
stop_gradient
=
True
enc_input
=
src_word_emb
+
src_pos_emb
return
fluid
.
layers
.
dropout
(
enc_input
,
dropout_prob
=
self
.
_dropout_rate
,
seed
=
ModelHyperParams
.
dropout_seed
,
is_test
=
False
)
if
self
.
_dropout_rate
else
enc_input
class
WrapEncoderLayer
(
Layer
):
def
__init__
(
self
,
name_cope
,
src_vocab_size
,
max_length
,
n_layer
,
n_head
,
d_key
,
d_value
,
d_model
,
d_inner_hid
,
prepostprocess_dropout
,
attention_dropout
,
relu_dropout
,
preprocess_cmd
,
postprocess_cmd
,
weight_sharing
):
"""
The wrapper assembles together all needed layers for the encoder.
"""
super
(
WrapEncoderLayer
,
self
).
__init__
(
name_cope
)
self
.
_prepare_encoder_layer
=
PrepareEncoderDecoderLayer
(
self
.
full_name
(),
src_vocab_size
,
d_model
,
max_length
,
prepostprocess_dropout
,
word_emb_param_name
=
word_emb_param_names
[
0
],
pos_enc_param_name
=
pos_enc_param_names
[
0
])
self
.
_encoder
=
EncoderLayer
(
self
.
full_name
(),
n_layer
,
n_head
,
d_key
,
d_value
,
d_model
,
d_inner_hid
,
prepostprocess_dropout
,
attention_dropout
,
relu_dropout
,
preprocess_cmd
,
postprocess_cmd
)
class
PrePostProcessLayer
(
Layer
):
def
__init__
(
self
,
name_scope
,
process_cmd
,
shape_len
=
None
):
super
(
PrePostProcessLayer
,
self
).
__init__
(
name_scope
)
for
cmd
in
process_cmd
:
if
cmd
==
"n"
:
self
.
_layer_norm
=
LayerNorm
(
name_scope
=
self
.
full_name
(),
begin_norm_axis
=
shape_len
-
1
,
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
1.
)),
bias_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
0.
)))
def
forward
(
self
,
enc_inputs
):
src_word
,
src_pos
,
src_slf_attn_bias
=
enc_inputs
enc_input
=
self
.
_prepare_encoder_layer
(
src_word
,
src_pos
)
enc_output
=
self
.
_encoder
(
enc_input
,
src_slf_attn_bias
)
return
enc_output
def
forward
(
self
,
prev_out
,
out
,
process_cmd
,
dropout_rate
=
0.
):
for
cmd
in
process_cmd
:
if
cmd
==
"a"
:
# add residual connection
out
=
out
+
prev_out
if
prev_out
else
out
elif
cmd
==
"n"
:
# add layer normalization
out
=
self
.
_layer_norm
(
out
)
elif
cmd
==
"d"
:
# add dropout
if
dropout_rate
:
out
=
fluid
.
layers
.
dropout
(
out
,
dropout_prob
=
dropout_rate
,
seed
=
ModelHyperParams
.
dropout_seed
,
is_test
=
False
)
return
out
class
DecoderSubLayer
(
Layer
):
...
...
@@ -679,20 +494,13 @@ class DecoderSubLayer(Layer):
d_key
,
d_value
,
d_model
,
d_inner_hid
,
prepostprocess_dropout
,
attention_dropout
,
relu_dropout
,
preprocess_cmd
,
postprocess_cmd
,
cache
=
None
,
preprocess_cmd
=
"n"
,
gather_idx
=
None
):
super
(
DecoderSubLayer
,
self
).
__init__
(
name_scope
)
self
.
_postprocess_cmd
=
postprocess_cmd
self
.
_preprocess_cmd
=
preprocess_cmd
self
.
_prepostprcess_dropout
=
prepostprocess_dropout
self
.
_pre_process_layer
=
PrePostProcessLayer
(
self
.
full_name
(),
preprocess_cmd
,
3
)
self
.
_preprocess_layer
=
PrePostProcessLayer
(
self
.
full_name
(),
preprocess_cmd
,
3
)
self
.
_multihead_attention_layer
=
MultiHeadAttentionLayer
(
self
.
full_name
(),
d_key
,
...
...
@@ -702,300 +510,41 @@ class DecoderSubLayer(Layer):
attention_dropout
,
cache
=
cache
,
gather_idx
=
gather_idx
)
self
.
_post_process_layer
=
PrePostProcessLayer
(
self
.
full_name
(),
postprocess_cmd
,
None
)
self
.
_pre_process_layer2
=
PrePostProcessLayer
(
self
.
full_name
(),
preprocess_cmd
,
3
)
self
.
_multihead_attention_layer2
=
MultiHeadAttentionLayer
(
self
.
full_name
(),
d_key
,
d_value
,
d_model
,
n_head
,
attention_dropout
,
cache
=
cache
,
gather_idx
=
gather_idx
,
static_kv
=
True
)
self
.
_post_process_layer2
=
PrePostProcessLayer
(
self
.
full_name
(),
postprocess_cmd
,
None
)
self
.
_pre_process_layer3
=
PrePostProcessLayer
(
self
.
full_name
(),
preprocess_cmd
,
3
)
self
.
_positionwise_feed_forward_layer
=
PositionwiseFeedForwardLayer
(
self
.
full_name
(),
d_inner_hid
,
d_model
,
relu_dropout
)
self
.
_post_process_layer3
=
PrePostProcessLayer
(
self
.
full_name
(),
postprocess_cmd
,
None
)
def
forward
(
self
,
dec_input
,
enc_output
,
slf_attn_bias
,
dec_enc_attn_bias
):
pre_process_rlt
=
self
.
_pre_process_layer
(
None
,
dec_input
,
self
.
_preprocess_cmd
,
self
.
_prepostprcess_dropout
)
slf_attn_output
=
self
.
_multihead_attention_layer
(
pre_process_rlt
,
None
,
None
,
slf_attn_bias
)
slf_attn_output_pp
=
self
.
_post_process_layer
(
dec_input
,
slf_attn_output
,
self
.
_postprocess_cmd
,
self
.
_prepostprcess_dropout
)
pre_process_rlt2
=
self
.
_pre_process_layer2
(
None
,
slf_attn_output_pp
,
self
.
_preprocess_cmd
,
self
.
_prepostprcess_dropout
)
enc_attn_output_pp
=
self
.
_multihead_attention_layer2
(
pre_process_rlt2
,
enc_output
,
enc_output
,
dec_enc_attn_bias
)
enc_attn_output
=
self
.
_post_process_layer2
(
slf_attn_output
,
enc_attn_output_pp
,
self
.
_postprocess_cmd
,
self
.
_prepostprcess_dropout
)
pre_process_rlt3
=
self
.
_pre_process_layer3
(
None
,
enc_attn_output
,
self
.
_preprocess_cmd
,
self
.
_prepostprcess_dropout
)
ffd_output
=
self
.
_positionwise_feed_forward_layer
(
pre_process_rlt3
)
dec_output
=
self
.
_post_process_layer3
(
enc_attn_output
,
ffd_output
,
self
.
_postprocess_cmd
,
self
.
_prepostprcess_dropout
)
return
dec_output
class
DecoderLayer
(
Layer
):
def
__init__
(
self
,
name_scope
,
n_layer
,
n_head
,
d_key
,
d_value
,
d_model
,
d_inner_hid
,
prepostprocess_dropout
,
attention_dropout
,
relu_dropout
,
preprocess_cmd
,
postprocess_cmd
,
caches
=
None
,
gather_idx
=
None
):
super
(
DecoderLayer
,
self
).
__init__
(
name_scope
)
self
.
_pre_process_layer
=
PrePostProcessLayer
(
self
.
full_name
(),
preprocess_cmd
,
3
)
self
.
_decoder_sub_layers
=
list
()
self
.
_n_layer
=
n_layer
self
.
_preprocess_cmd
=
preprocess_cmd
self
.
_prepostprocess_dropout
=
prepostprocess_dropout
for
i
in
range
(
n_layer
):
self
.
_decoder_sub_layers
.
append
(
self
.
add_sublayer
(
'dsl_%d'
%
i
,
DecoderSubLayer
(
self
.
full_name
(),
n_head
,
d_key
,
d_value
,
d_model
,
d_inner_hid
,
prepostprocess_dropout
,
attention_dropout
,
relu_dropout
,
preprocess_cmd
,
postprocess_cmd
,
cache
=
None
if
caches
is
None
else
caches
[
i
],
gather_idx
=
gather_idx
)))
def
forward
(
self
,
dec_input
,
enc_output
,
dec_slf_attn_bias
,
dec_enc_attn_bias
):
for
i
in
range
(
self
.
_n_layer
):
tmp_dec_output
=
self
.
_decoder_sub_layers
[
i
](
dec_input
,
enc_output
,
dec_slf_attn_bias
,
dec_enc_attn_bias
)
dec_input
=
tmp_dec_output
dec_output
=
self
.
_pre_process_layer
(
None
,
tmp_dec_output
,
self
.
_preprocess_cmd
,
self
.
_prepostprocess_dropout
)
return
dec_output
class
WrapDecoderLayer
(
Layer
):
def
__init__
(
self
,
name_scope
,
trg_vocab_size
,
max_length
,
n_layer
,
n_head
,
d_key
,
d_value
,
d_model
,
d_inner_hid
,
prepostprocess_dropout
,
attention_dropout
,
relu_dropout
,
preprocess_cmd
,
postprocess_cmd
,
weight_sharing
,
caches
=
None
,
gather_idx
=
None
):
"""
The wrapper assembles together all needed layers for the encoder.
"""
super
(
WrapDecoderLayer
,
self
).
__init__
(
name_scope
)
self
.
_prepare_decoder_layer
=
PrepareEncoderDecoderLayer
(
self
.
full_name
(),
trg_vocab_size
,
d_model
,
max_length
,
prepostprocess_dropout
,
word_emb_param_name
=
word_emb_param_names
[
1
],
pos_enc_param_name
=
pos_enc_param_names
[
1
])
self
.
_decoder_layer
=
DecoderLayer
(
self
.
full_name
(),
n_layer
,
n_head
,
d_key
,
d_value
,
d_model
,
d_inner_hid
,
prepostprocess_dropout
,
attention_dropout
,
relu_dropout
,
preprocess_cmd
,
postprocess_cmd
,
caches
=
caches
,
gather_idx
=
gather_idx
)
self
.
_weight_sharing
=
weight_sharing
if
not
weight_sharing
:
self
.
_fc
=
FC
(
self
.
full_name
(),
size
=
trg_vocab_size
,
bias_attr
=
False
)
def
forward
(
self
,
dec_inputs
=
None
,
enc_output
=
None
):
trg_word
,
trg_pos
,
trg_slf_attn_bias
,
trg_src_attn_bias
=
dec_inputs
dec_input
=
self
.
_prepare_decoder_layer
(
trg_word
,
trg_pos
)
dec_output
=
self
.
_decoder_layer
(
dec_input
,
enc_output
,
trg_slf_attn_bias
,
trg_src_attn_bias
)
dec_output_reshape
=
fluid
.
layers
.
reshape
(
dec_output
,
shape
=
[
-
1
,
dec_output
.
shape
[
-
1
]],
inplace
=
False
)
if
self
.
_weight_sharing
:
predict
=
fluid
.
layers
.
matmul
(
x
=
dec_output_reshape
,
y
=
self
.
_prepare_decoder_layer
.
_input_emb
.
_w
,
transpose_y
=
True
)
else
:
predict
=
self
.
_fc
(
dec_output_reshape
)
if
dec_inputs
is
None
:
# Return probs for independent decoder program.
predict_out
=
fluid
.
layers
.
softmax
(
predict
)
return
predict_out
return
predict
class
TransFormer
(
Layer
):
def
__init__
(
self
,
name_scope
,
src_vocab_size
,
trg_vocab_size
,
max_length
,
n_layer
,
n_head
,
d_key
,
d_value
,
d_model
,
d_inner_hid
,
prepostprocess_dropout
,
attention_dropout
,
relu_dropout
,
preprocess_cmd
,
postprocess_cmd
,
weight_sharing
,
label_smooth_eps
,
use_py_reader
=
False
,
is_test
=
False
):
super
(
TransFormer
,
self
).
__init__
(
name_scope
)
self
.
_label_smooth_eps
=
label_smooth_eps
self
.
_trg_vocab_size
=
trg_vocab_size
if
weight_sharing
:
assert
src_vocab_size
==
trg_vocab_size
,
(
"Vocabularies in source and target should be same for weight sharing."
)
self
.
_wrap_encoder_layer
=
WrapEncoderLayer
(
self
.
full_name
(),
src_vocab_size
,
max_length
,
n_layer
,
n_head
,
d_key
,
d_value
,
d_model
,
d_inner_hid
,
prepostprocess_dropout
,
attention_dropout
,
relu_dropout
,
preprocess_cmd
,
postprocess_cmd
,
weight_sharing
)
self
.
_wrap_decoder_layer
=
WrapDecoderLayer
(
self
.
full_name
(),
trg_vocab_size
,
max_length
,
n_layer
,
n_head
,
d_key
,
d_value
,
d_model
,
d_inner_hid
,
prepostprocess_dropout
,
attention_dropout
,
relu_dropout
,
preprocess_cmd
,
postprocess_cmd
,
weight_sharing
)
if
weight_sharing
:
self
.
_wrap_decoder_layer
.
_prepare_decoder_layer
.
_input_emb
.
_w
=
self
.
_wrap_encoder_layer
.
_prepare_encoder_layer
.
_input_emb
.
_w
def
forward
(
self
,
enc_inputs
,
dec_inputs
,
label
,
weights
):
enc_output
=
self
.
_wrap_encoder_layer
(
enc_inputs
)
predict
=
self
.
_wrap_decoder_layer
(
dec_inputs
,
enc_output
)
if
self
.
_label_smooth_eps
:
label_out
=
fluid
.
layers
.
label_smooth
(
label
=
fluid
.
layers
.
one_hot
(
input
=
label
,
depth
=
self
.
_trg_vocab_size
),
epsilon
=
self
.
_label_smooth_eps
)
cost
=
fluid
.
layers
.
softmax_with_cross_entropy
(
logits
=
predict
,
label
=
label_out
,
soft_label
=
True
if
self
.
_label_smooth_eps
else
False
)
weighted_cost
=
cost
*
weights
sum_cost
=
fluid
.
layers
.
reduce_sum
(
weighted_cost
)
token_num
=
fluid
.
layers
.
reduce_sum
(
weights
)
token_num
.
stop_gradient
=
True
avg_cost
=
sum_cost
/
token_num
return
sum_cost
,
avg_cost
,
predict
,
token_num
def
forward
(
self
,
input
,
slf_attn_bias
):
print
(
input
.
shape
)
print
(
slf_attn_bias
.
shape
)
y
=
self
.
_preprocess_layer
(
None
,
input
,
"n"
,
0.1
)
slf_attn_output
=
self
.
_multihead_attention_layer
(
y
,
None
,
None
,
slf_attn_bias
)
return
slf_attn_output
class
TestDygraphTransformer
(
unittest
.
TestCase
):
def
test_transformer_float32
(
self
):
seed
=
90
with
guard
():
x1
=
np
.
ones
([
32
,
4
,
512
]).
astype
(
'float32'
)
x2
=
np
.
ones
([
32
,
8
,
4
,
4
]).
astype
(
'float32'
)
with
guard
(
place
=
fluid
.
CPUPlace
()):
fluid
.
default_startup_program
().
random_seed
=
seed
fluid
.
default_main_program
().
random_seed
=
seed
transformer
=
TransFormer
(
'transformer'
,
ModelHyperParams
.
src_vocab_size
,
ModelHyperParams
.
trg_vocab_size
,
ModelHyperParams
.
max_length
+
1
,
ModelHyperParams
.
n_layer
,
ModelHyperParams
.
n_head
,
ModelHyperParams
.
d_key
,
ModelHyperParams
.
d_value
,
ModelHyperParams
.
d_model
,
ModelHyperParams
.
d_inner_hid
,
ModelHyperParams
.
prepostprocess_dropout
,
ModelHyperParams
.
attention_dropout
,
ModelHyperParams
.
relu_dropout
,
ModelHyperParams
.
preprocess_cmd
,
ModelHyperParams
.
postprocess_cmd
,
ModelHyperParams
.
weight_sharing
,
TrainTaskConfig
.
label_smooth_eps
,
use_py_reader
=
use_py_reader
,
is_test
=
False
)
if
sync
:
lr_decay
=
fluid
.
layers
.
learning_rate_scheduler
.
noam_decay
(
ModelHyperParams
.
d_model
,
TrainTaskConfig
.
warmup_steps
)
with
fluid
.
default_main_program
().
_lr_schedule_guard
():
learning_rate
=
lr_decay
*
TrainTaskConfig
.
learning_rate
optimizer
=
fluid
.
optimizer
.
Adam
(
learning_rate
=
learning_rate
,
beta1
=
TrainTaskConfig
.
beta1
,
beta2
=
TrainTaskConfig
.
beta2
,
epsilon
=
TrainTaskConfig
.
eps
)
else
:
optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.003
)
transformer
=
DecoderSubLayer
(
'transformer'
,
ModelHyperParams
.
n_head
,
ModelHyperParams
.
d_key
,
ModelHyperParams
.
d_value
,
ModelHyperParams
.
d_model
,
ModelHyperParams
.
attention_dropout
)
optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.003
)
dy_param_init
=
dict
()
dy_param_updated
=
dict
()
for
i
in
range
(
batch_num
):
enc_inputs
,
dec_inputs
,
label
,
weights
=
create_data
(
)
dy_sum_cost
,
dy_avg_cost
,
dy_predict
,
dy_token_num
=
transformer
(
enc_inputs
,
dec_inputs
,
label
,
weights
)
loss
=
transformer
(
to_variable
(
x1
),
to_variable
(
x2
)
)
loss
=
fluid
.
layers
.
reduce_sum
(
loss
)
print
(
'dy los'
,
loss
.
shape
)
if
i
==
0
:
for
param
in
transformer
.
parameters
():
dy_param_init
[
param
.
name
]
=
param
.
_numpy
()
dy_avg_cost
.
_backward
()
optimizer
.
minimize
(
dy_avg_cost
)
loss
.
_backward
()
optimizer
.
minimize
(
loss
)
transformer
.
clear_gradients
()
if
i
==
batch_num
-
1
:
for
param
in
transformer
.
parameters
():
...
...
@@ -1004,92 +553,51 @@ class TestDygraphTransformer(unittest.TestCase):
with
new_program_scope
():
fluid
.
default_startup_program
().
random_seed
=
seed
fluid
.
default_main_program
().
random_seed
=
seed
transformer
=
TransFormer
(
'transformer'
,
ModelHyperParams
.
src_vocab_size
,
ModelHyperParams
.
trg_vocab_size
,
ModelHyperParams
.
max_length
+
1
,
ModelHyperParams
.
n_layer
,
ModelHyperParams
.
n_head
,
ModelHyperParams
.
d_key
,
ModelHyperParams
.
d_value
,
ModelHyperParams
.
d_model
,
ModelHyperParams
.
d_inner_hid
,
ModelHyperParams
.
prepostprocess_dropout
,
ModelHyperParams
.
attention_dropout
,
ModelHyperParams
.
relu_dropout
,
ModelHyperParams
.
preprocess_cmd
,
ModelHyperParams
.
postprocess_cmd
,
ModelHyperParams
.
weight_sharing
,
TrainTaskConfig
.
label_smooth_eps
,
use_py_reader
=
use_py_reader
,
is_test
=
False
)
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
(
)
if
not
core
.
is_compiled_with_cuda
()
else
fluid
.
CUDAPlace
(
0
))
transformer
=
DecoderSubLayer
(
'transformer'
,
ModelHyperParams
.
n_head
,
ModelHyperParams
.
d_key
,
ModelHyperParams
.
d_value
,
ModelHyperParams
.
d_model
,
ModelHyperParams
.
attention_dropout
)
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.003
)
data_input_names
=
encoder_data_input_fields
+
decoder_data_input_fields
[:
-
1
]
+
label_data_input_fields
all_inputs
=
make_all_inputs
(
data_input_names
)
enc_inputs_len
=
len
(
encoder_data_input_fields
)
dec_inputs_len
=
len
(
decoder_data_input_fields
[:
-
1
])
enc_inputs
=
all_inputs
[
0
:
enc_inputs_len
]
dec_inputs
=
all_inputs
[
enc_inputs_len
:
enc_inputs_len
+
dec_inputs_len
]
label
=
all_inputs
[
-
2
]
weights
=
all_inputs
[
-
1
]
static_param_updated
=
dict
()
static_param_init
=
dict
()
static_param_name_list
=
list
()
static_sum_cost
,
static_avg_cost
,
static_predict
,
static_token_num
=
transformer
(
enc_inputs
,
dec_inputs
,
label
,
weights
)
data1
=
fluid
.
layers
.
data
(
name
=
'X'
,
shape
=
[
4
,
512
],
dtype
=
'float32'
)
data2
=
fluid
.
layers
.
data
(
name
=
'Y'
,
shape
=
[
8
,
4
,
4
],
dtype
=
'float32'
)
loss
=
transformer
(
data1
,
data2
)
loss
=
fluid
.
layers
.
reduce_sum
(
loss
)
print
(
'loss hspae'
,
loss
.
shape
)
optimizer
.
minimize
(
loss
)
optimizer
.
minimize
(
static_avg_cost
)
static_param_init
=
{}
static_param_name_list
=
[]
static_param_updated
=
{}
for
param
in
transformer
.
parameters
():
static_param_name_list
.
append
(
param
.
name
)
out
=
exe
.
run
(
fluid
.
default_startup_program
(),
fetch_list
=
static_param_name_list
)
for
i
in
range
(
len
(
static_param_name_list
)):
static_param_init
[
static_param_name_list
[
i
]]
=
out
[
i
]
static_sum_cost_value
=
None
static_avg_cost_value
=
None
static_predict_value
=
None
static_token_num_value
=
None
for
i
in
range
(
batch_num
):
feed_dict
=
create_feed_dict_list
(
create_data
(
True
))
fetch_list
=
[
static_sum_cost
,
static_avg_cost
,
static_predict
,
static_token_num
]
feed_dict
=
{
"X"
:
x1
,
"Y"
:
x2
}
fetch_list
=
[]
fetch_list
.
extend
(
static_param_name_list
)
out
=
exe
.
run
(
fluid
.
default_main_program
(),
feed
=
feed_dict
,
fetch_list
=
fetch_list
)
static_sum_cost_value
=
out
[
0
]
static_avg_cost_value
=
out
[
1
]
static_predict_value
=
out
[
2
]
static_token_num_value
=
out
[
3
]
if
i
==
batch_num
-
1
:
for
k
in
range
(
4
,
len
(
out
)):
for
k
in
range
(
0
,
len
(
out
)):
static_param_updated
[
static_param_name_list
[
k
-
4
]]
=
out
[
k
]
0
]]
=
out
[
k
]
self
.
assertTrue
(
np
.
allclose
(
static_avg_cost_value
,
dy_avg_cost
.
_numpy
()))
self
.
assertTrue
(
np
.
allclose
(
static_sum_cost_value
,
dy_sum_cost
.
_numpy
()))
self
.
assertTrue
(
np
.
allclose
(
static_predict_value
,
dy_predict
.
_numpy
(),
atol
=
1e-5
))
self
.
assertTrue
(
np
.
allclose
(
static_token_num_value
,
dy_token_num
.
_numpy
()))
for
key
,
value
in
six
.
iteritems
(
static_param_init
):
self
.
assertTrue
(
np
.
a
llclose
(
value
,
dy_param_init
[
key
]))
self
.
assertTrue
(
np
.
a
rray_equal
(
value
,
dy_param_init
[
key
]))
for
key
,
value
in
six
.
iteritems
(
static_param_updated
):
self
.
assertTrue
(
np
.
allclose
(
value
,
dy_param_updated
[
key
],
atol
=
1e-4
))
if
not
(
value
==
dy_param_updated
[
key
]).
all
():
print
(
key
)
if
__name__
==
'__main__'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录