Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
6cc4bd53
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
6cc4bd53
编写于
12月 26, 2017
作者:
T
typhoonzero
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
wip
上级
1398854f
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
232 addition
and
13 deletion
+232
-13
paddle/operators/adam_op.h
paddle/operators/adam_op.h
+107
-13
python/paddle/v2/fluid/tests/test_adam_op.py
python/paddle/v2/fluid/tests/test_adam_op.py
+125
-0
未找到文件。
paddle/operators/adam_op.h
浏览文件 @
6cc4bd53
...
@@ -79,6 +79,71 @@ struct AdamFunctor {
...
@@ -79,6 +79,71 @@ struct AdamFunctor {
}
}
};
};
/// Adam update applied to the rows selected by a sparse gradient.
///
/// Calling the functor with index `i` reads the i-th gradient row from
/// `grad_` and updates all `row_numel_` columns of row `rows_[i]` in the
/// moment and parameter buffers. Rows that are not listed in `rows_` are
/// never written.
template <typename T>
struct SparseAdamFunctor {
  T beta1_;    // decay factor applied to the first moment
  T beta2_;    // decay factor applied to the second moment
  T epsilon_;  // added to sqrt(second moment) in the denominator

  const T* beta1_pow_;  // pointer to a single value, dereferenced per call
  const T* beta2_pow_;  // pointer to a single value, dereferenced per call
  const T* moment1_;
  T* moment1_out_;
  const T* moment2_;
  T* moment2_out_;
  const T* lr_;  // pointer to a single value, dereferenced per call
  const T* grad_;
  const T* param_;
  T* param_out_;

  const int64_t* rows_;  // rows_[i] is the destination row of gradient row i
  int64_t row_numel_;    // number of columns processed per row
  int64_t height_;       // stored only; operator() does not read it

  /// Stores every argument in the member of the matching name; no
  /// validation or copying of the pointed-to buffers is performed.
  SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
                    const T* beta2_pow, const T* mom1, T* mom1_out,
                    const T* mom2, T* mom2_out, const T* lr, const T* grad,
                    const T* param, T* param_out, const int64_t* rows,
                    int64_t row_numel, int64_t height)
      : beta1_(beta1),
        beta2_(beta2),
        epsilon_(epsilon),
        beta1_pow_(beta1_pow),
        beta2_pow_(beta2_pow),
        moment1_(mom1),
        moment1_out_(mom1_out),
        moment2_(mom2),
        moment2_out_(mom2_out),
        lr_(lr),
        grad_(grad),
        param_(param),
        param_out_(param_out),
        rows_(rows),
        row_numel_(row_numel),
        height_(height) {}

  /// Updates one selected row.
  ///
  /// @param i index into `rows_` and row index into `grad_`.
  inline HOSTDEVICE void operator()(size_t i) const {
    // The bias-corrected step size depends only on the scalar inputs, so it
    // is computed once per row instead of once per column.
    T lr = *lr_;
    T beta1_pow = *beta1_pow_;
    T beta2_pow = *beta2_pow_;
    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);

    // Start offsets of the gradient row and of the destination row.
    const int64_t grad_base = static_cast<int64_t>(i) * row_numel_;
    const int64_t dest_base = rows_[i] * row_numel_;

    for (int64_t j = 0; j < row_numel_; ++j) {
      const int64_t dest = dest_base + j;

      T g = grad_[grad_base + j];
      T mom1 = moment1_[dest];
      T mom2 = moment2_[dest];
      T p = param_[dest];

      mom1 = beta1_ * mom1 + (1 - beta1_) * g;
      mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
      p -= lr * (mom1 / (sqrt(mom2) + epsilon_));

      // FIXME(typhoonzero): row id may be duplicate
      moment1_out_[dest] = mom1;
      moment2_out_[dest] = mom2;
      param_out_[dest] = p;
    }  // for col id
  }
};
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
AdamOpKernel
:
public
framework
::
OpKernel
<
T
>
{
class
AdamOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
...
@@ -90,7 +155,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
...
@@ -90,7 +155,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
T
beta2
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"beta2"
));
T
beta2
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"beta2"
));
T
epsilon
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"epsilon"
));
T
epsilon
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"epsilon"
));
auto
&
param
=
Ref
(
ctx
.
Input
<
LoDTensor
>
(
"Param"
),
"Must set Param"
);
auto
&
param
=
Ref
(
ctx
.
Input
<
LoDTensor
>
(
"Param"
),
"Must set Param"
);
auto
&
grad
=
Ref
(
ctx
.
Input
<
LoDTensor
>
(
"Grad"
),
"Must set Grad"
);
// auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
auto
*
grad_var
=
ctx
.
InputVar
(
"Grad"
);
auto
&
mom1
=
Ref
(
ctx
.
Input
<
LoDTensor
>
(
"Moment1"
),
"Must set Moment1"
);
auto
&
mom1
=
Ref
(
ctx
.
Input
<
LoDTensor
>
(
"Moment1"
),
"Must set Moment1"
);
auto
&
mom2
=
Ref
(
ctx
.
Input
<
LoDTensor
>
(
"Moment2"
),
"Must set Moment2"
);
auto
&
mom2
=
Ref
(
ctx
.
Input
<
LoDTensor
>
(
"Moment2"
),
"Must set Moment2"
);
auto
&
lr
=
auto
&
lr
=
...
@@ -108,18 +174,46 @@ class AdamOpKernel : public framework::OpKernel<T> {
...
@@ -108,18 +174,46 @@ class AdamOpKernel : public framework::OpKernel<T> {
auto
&
mom2_out
=
auto
&
mom2_out
=
Ref
(
ctx
.
Output
<
LoDTensor
>
(
"Moment2Out"
),
"Must set Moment1Out"
);
Ref
(
ctx
.
Output
<
LoDTensor
>
(
"Moment2Out"
),
"Must set Moment1Out"
);
AdamFunctor
<
T
>
functor
(
beta1
,
beta2
,
epsilon
,
beta1_pow
.
template
data
<
T
>(),
if
(
grad_var
->
IsType
<
framework
::
LoDTensor
>
())
{
beta2_pow
.
template
data
<
T
>(),
auto
&
grad
=
Ref
(
ctx
.
Input
<
LoDTensor
>
(
"Grad"
),
"Must set Grad"
);
mom1
.
template
data
<
T
>(),
AdamFunctor
<
T
>
functor
(
mom1_out
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
beta1
,
beta2
,
epsilon
,
beta1_pow
.
template
data
<
T
>(),
mom2
.
template
data
<
T
>(),
beta2_pow
.
template
data
<
T
>(),
mom1
.
template
data
<
T
>(),
mom2_out
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
mom1_out
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
lr
.
template
data
<
T
>(),
grad
.
template
data
<
T
>(),
mom2
.
template
data
<
T
>(),
param
.
template
data
<
T
>(),
mom2_out
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
param_out
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()));
lr
.
template
data
<
T
>(),
grad
.
template
data
<
T
>(),
platform
::
ForRange
<
DeviceContext
>
for_range
(
param
.
template
data
<
T
>(),
static_cast
<
const
DeviceContext
&>
(
ctx
.
device_context
()),
param
.
numel
());
param_out
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()));
for_range
(
functor
);
platform
::
ForRange
<
DeviceContext
>
for_range
(
static_cast
<
const
DeviceContext
&>
(
ctx
.
device_context
()),
param
.
numel
());
for_range
(
functor
);
}
else
if
(
grad_var
->
IsType
<
framework
::
SelectedRows
>
())
{
auto
&
grad
=
Ref
(
ctx
.
Input
<
framework
::
SelectedRows
>
(
"Grad"
),
"Must set Grad"
);
auto
&
grad_tensor
=
grad
.
value
();
const
T
*
grad_data
=
grad_tensor
.
template
data
<
T
>();
auto
*
rows
=
grad
.
rows
().
data
();
auto
height
=
grad
.
height
();
auto
row_numel
=
grad_tensor
.
numel
()
/
height
;
SparseAdamFunctor
<
T
>
functor
(
beta1
,
beta2
,
epsilon
,
beta1_pow
.
template
data
<
T
>(),
beta2_pow
.
template
data
<
T
>(),
mom1
.
template
data
<
T
>(),
mom1_out
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
mom2
.
template
data
<
T
>(),
mom2_out
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
lr
.
template
data
<
T
>(),
grad_data
,
param
.
template
data
<
T
>(),
param_out
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
rows
,
row_numel
,
height
);
platform
::
ForRange
<
DeviceContext
>
for_range
(
static_cast
<
const
DeviceContext
&>
(
ctx
.
device_context
()),
grad
.
rows
().
size
());
for_range
(
functor
);
}
else
{
PADDLE_THROW
(
"Variable type not supported by adam_op"
);
}
}
}
};
};
...
...
python/paddle/v2/fluid/tests/test_adam_op.py
浏览文件 @
6cc4bd53
...
@@ -176,5 +176,130 @@ def adam_step(inputs, attributes):
...
@@ -176,5 +176,130 @@ def adam_step(inputs, attributes):
return
param_out
,
moment1_out
,
moment2_out
return
param_out
,
moment1_out
,
moment2_out
def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
    '''
    Simulate one step of the adam optimizer for a row-sparse gradient

    :param inputs: dict of inputs; the keys read are 'Param', 'Moment1',
        'Moment2', 'LearningRate', 'Beta1Pow' and 'Beta2Pow'
    :param attributes: dict of attributes; the keys read are 'beta1',
        'beta2' and 'epsilon'
    :param height: number of rows of each returned array
    :param rows: row ids; rows[idx] is the row updated from np_grad[idx]
    :param row_numel: number of elements in each row
    :param np_grad: gradient values, one row per entry of rows
    :return tuple: tuple of output param, moment1, moment2, each of shape
        (height, row_numel); rows that are not listed in rows stay zero
    '''
    param = inputs['Param']
    # grad = inputs['Grad']
    moment1 = inputs['Moment1']
    moment2 = inputs['Moment2']
    lr = inputs['LearningRate']
    beta1_pow = inputs['Beta1Pow']
    beta2_pow = inputs['Beta2Pow']

    beta1 = attributes['beta1']
    beta2 = attributes['beta2']
    epsilon = attributes['epsilon']

    # Allocate full (height, row_numel) buffers. np.array([height, row_numel])
    # would only build a two-element vector holding the two sizes.
    moment1_out = np.zeros((height, row_numel), dtype=moment1.dtype)
    moment2_out = np.zeros((height, row_numel), dtype=moment2.dtype)
    param_out = np.zeros((height, row_numel), dtype=param.dtype)

    # The bias-corrected step size does not depend on the row.
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)

    for idx, row_id in enumerate(rows):
        moment1_out[row_id] = beta1 * moment1[row_id] + (
            1 - beta1) * np_grad[idx]
        moment2_out[row_id] = beta2 * moment2[row_id] + (
            1 - beta2) * np.square(np_grad[idx])
        # Use only the moments of the current row for its parameter update.
        param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
            np.sqrt(moment2_out[row_id]) + epsilon))
    return param_out, moment1_out, moment2_out
class TestSparseAdamOp(unittest.TestCase):
    '''
    Runs the "adam" operator with a SelectedRows gradient and compares the
    updated rows against the numpy reference adam_step_sparse.
    '''

    def setup(self, scope, place):
        '''
        Build the dense inputs, attributes, sparse gradient and expected
        outputs for one run.

        :param scope: scope in which the 'Grad' variable is created
        :param place: place on which the gradient tensor is set
        '''
        beta1 = 0.78
        beta2 = 0.836
        epsilon = 1e-4

        height = 10
        rows = [0, 4, 7]
        row_numel = 12
        # Kept so that check_with_place can restrict the comparison to the
        # rows that the gradient actually selects.
        self.rows = rows
        self.row_numel = row_numel

        self.dense_inputs = {
            "Param": np.full((height, row_numel), 5.0).astype("float32"),
            "Moment1": np.full((height, row_numel), 5.0).astype("float32"),
            "Moment2": np.full((height, row_numel), 5.0).astype("float32"),
            'Beta1Pow': np.array([0.9**10]).astype("float32"),
            'Beta2Pow': np.array([0.999**10]).astype("float32"),
            "LearningRate": np.full((1), 2.0).astype("float32")
        }
        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}

        grad_selected_rows = scope.var('Grad').get_selected_rows()
        grad_selected_rows.set_height(height)
        grad_selected_rows.set_rows(rows)
        np_array = np.ones((len(rows), row_numel)).astype("float32")
        np_array[0, 0] = 2.0
        np_array[2, 8] = 4.0

        grad_tensor = grad_selected_rows.get_tensor()
        grad_tensor.set(np_array, place)

        self.sparse_inputs = ["Grad"]

        param_out, mom1, mom2 = adam_step_sparse(
            self.dense_inputs, self.attrs, height, rows, row_numel, np_array)
        # NOTE(review): the key "Param" makes check_with_place read the input
        # variable back; confirm whether the operator's parameter output
        # should be checked under a separate output name instead.
        self.outputs = {
            "Param": param_out,
            "Moment1Out": mom1,
            "Moment2Out": mom2
        }

    def check_with_place(self, place):
        '''
        Run the operator on place and compare every selected row of each
        expected output with the value found in the scope.

        :param place: place on which inputs are set and the operator runs
        '''
        scope = core.Scope()
        self.setup(scope, place)

        op_args = dict()
        for key, np_array in self.dense_inputs.items():
            var = scope.var(key).get_tensor()
            var.set(np_array, place)
            op_args[key] = key
        for s in self.sparse_inputs:
            op_args[s] = s
        for k in self.attrs:
            op_args[k] = self.attrs[k]

        # NOTE(review): no output variable names are passed to the operator
        # here; confirm whether Operator requires them to be listed.
        # create and run adam operator
        adam_op = Operator("adam", **op_args)
        adam_op.run(scope, place)

        for key, np_array in self.outputs.items():
            out_var = scope.var(key).get_tensor()
            actual = np.array(out_var)
            # reshape returns a new array and size is an attribute, not a
            # method, so the flattened views have to be assigned back.
            actual = actual.reshape([actual.size])
            np_array = np_array.reshape([np_array.size])

            # Only the rows listed in the sparse gradient are updated, so
            # only those rows are compared.
            for row_id in self.rows:
                for j in range(self.row_numel):
                    pos = row_id * self.row_numel + j
                    # float32 carries about 7 significant digits, so the
                    # default 7 decimal places is too strict for values of
                    # this magnitude.
                    self.assertAlmostEqual(
                        actual[pos], np_array[pos], places=5)

    def test_sparse_sgd(self):
        # The method name is historical; this exercises the adam operator.
        places = [core.CPUPlace()]
        if core.is_compile_gpu():
            places.append(core.CUDAPlace(0))
        for place in places:
            self.check_with_place(place)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录