Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
a8a2b7f4
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
a8a2b7f4
编写于
3月 06, 2023
作者:
N
niuliling123
提交者:
GitHub
3月 06, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add multiprecision for adadelta op (#50131)
上级
a1006b2b
变更
13
隐藏空白更改
内联
并排
Showing
13 changed file
with
607 addition
and
45 deletion
+607
-45
paddle/fluid/operators/optimizers/adadelta_op.cc
paddle/fluid/operators/optimizers/adadelta_op.cc
+9
-0
paddle/fluid/pybind/eager_generator.h
paddle/fluid/pybind/eager_generator.h
+12
-1
paddle/phi/api/yaml/legacy_ops.yaml
paddle/phi/api/yaml/legacy_ops.yaml
+5
-3
paddle/phi/infermeta/multiary.cc
paddle/phi/infermeta/multiary.cc
+4
-1
paddle/phi/infermeta/multiary.h
paddle/phi/infermeta/multiary.h
+4
-1
paddle/phi/kernels/adadelta_kernel.h
paddle/phi/kernels/adadelta_kernel.h
+4
-1
paddle/phi/kernels/gpu/adadelta_kernel.cu
paddle/phi/kernels/gpu/adadelta_kernel.cu
+7
-2
paddle/phi/kernels/impl/adadelta_kernel_impl.h
paddle/phi/kernels/impl/adadelta_kernel_impl.h
+31
-12
paddle/phi/kernels/xpu/adadelta_kernel.cc
paddle/phi/kernels/xpu/adadelta_kernel.cc
+4
-1
paddle/phi/ops/compat/adadelta_sig.cc
paddle/phi/ops/compat/adadelta_sig.cc
+36
-0
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+106
-11
python/paddle/fluid/tests/unittests/test_adadelta_op.py
python/paddle/fluid/tests/unittests/test_adadelta_op.py
+276
-0
python/paddle/optimizer/adadelta.py
python/paddle/optimizer/adadelta.py
+109
-12
未找到文件。
paddle/fluid/operators/optimizers/adadelta_op.cc
浏览文件 @
a8a2b7f4
...
...
@@ -39,12 +39,17 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput
(
"AvgSquaredGrad"
,
"(Tensor) Input average of squared gradient"
);
AddInput
(
"AvgSquaredUpdate"
,
"(Tensor) Input average of squared parameter updates"
);
AddInput
(
"MasterParam"
,
"FP32 master weight for AMP."
).
AsDispensable
();
AddOutput
(
"ParamOut"
,
"(Tensor) Output parameter"
);
AddOutput
(
"AvgSquaredGradOut"
,
"(Tensor) Output average of squared gradient"
);
AddOutput
(
"AvgSquaredUpdateOut"
,
"(Tensor) Output average of squared parameter updates"
);
AddOutput
(
"MasterParamOut"
,
"The updated FP32 master weight for AMP. "
"It shared memory with Input(MasterParam)."
)
.
AsDispensable
();
AddAttr
<
float
>
(
"rho"
,
"(float, default 0.95) Exponential decay rate "
...
...
@@ -54,6 +59,10 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
"(float, default 1.0e-6) Constant for "
"numerical stability"
)
.
SetDefault
(
1.0e-6
f
);
AddAttr
<
bool
>
(
"multi_precision"
,
"(bool, default false) "
"Whether to use multi-precision during weight updating."
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
Adadelta Optimizer.
...
...
paddle/fluid/pybind/eager_generator.h
浏览文件 @
a8a2b7f4
...
...
@@ -206,6 +206,8 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{
"Q"
,
"K"
,
"V"
,
"Offset"
,
"Columns"
,
"KeyPaddingMask"
,
"AttnMask"
}},
{
"sgd"
,
{
"Param"
,
"LearningRate"
,
"Grad"
,
"MasterParam"
}},
{
"adagrad"
,
{
"Param"
,
"Grad"
,
"Moment"
,
"LearningRate"
,
"MasterParam"
}},
{
"adadelta"
,
{
"Param"
,
"Grad"
,
"AvgSquaredGrad"
,
"AvgSquaredUpdate"
,
"MasterParam"
}},
{
"graph_khop_sampler"
,
{
"Row"
,
"Eids"
,
"Col_Ptr"
,
"X"
}},
{
"nce"
,
{
"Input"
,
...
...
@@ -311,6 +313,11 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
"SavedMean"
,
"SavedVariance"
,
"ReserveSpace"
}},
{
"adadelta"
,
{
"ParamOut"
,
"AvgSquaredGradOut"
,
"AvgSquaredUpdateOut"
,
"MasterParamOut"
}},
{
"unique"
,
{
"Out"
,
"Index"
,
"Indices"
,
"Counts"
}},
{
"unique_consecutive"
,
{
"Out"
,
"Index"
,
"Counts"
}},
{
"generate_proposals"
,
{
"RpnRois"
,
"RpnRoiProbs"
,
"RpnRoisNum"
}},
...
...
@@ -400,7 +407,11 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
"MeanGradOut"
,
"MasterParamOut"
}},
{
"ftrl"
,
{
"ParamOut"
,
"SquaredAccumOut"
,
"LinearAccumOut"
}},
{
"adadelta"
,
{
"ParamOut"
,
"AvgSquaredGradOut"
,
"AvgSquaredUpdateOut"
}},
{
"adadelta"
,
{
"ParamOut"
,
"AvgSquaredGradOut"
,
"AvgSquaredUpdateOut"
,
"MasterParamOut"
}},
{
"adagrad"
,
{
"ParamOut"
,
"MomentOut"
,
"MasterParamOut"
}},
{
"adamax"
,
{
"ParamOut"
,
"MomentOut"
,
"InfNormOut"
}},
{
"dpsgd"
,
{
"ParamOut"
}},
...
...
paddle/phi/api/yaml/legacy_ops.yaml
浏览文件 @
a8a2b7f4
...
...
@@ -20,13 +20,15 @@
data_type
:
x
-
op
:
adadelta_
args
:
(Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update,
float rho, float epsil
on)
output
:
Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out)
args
:
(Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update,
Tensor master_param, float rho, float epsilon, bool multi_precisi
on)
output
:
Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out)
, Tensor(master_param_out)
infer_meta
:
func
:
AdadeltaInferMeta
kernel
:
func
:
adadelta
inplace
:
(param -> param_out), (avg_squared_grad -> moment_out), (avg_squared_update -> inf_norm_out)
data_type
:
param
optional
:
master_param
inplace
:
(param -> param_out), (avg_squared_grad -> moment_out), (avg_squared_update -> inf_norm_out), (master_param -> master_param_out)
-
op
:
adagrad_
args
:
(Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, Tensor master_param, float epsilon, bool multi_precision)
...
...
paddle/phi/infermeta/multiary.cc
浏览文件 @
a8a2b7f4
...
...
@@ -38,11 +38,14 @@ void AdadeltaInferMeta(const MetaTensor& param,
const
MetaTensor
&
grad
,
const
MetaTensor
&
avg_squared_grad
,
const
MetaTensor
&
avg_squared_update
,
const
MetaTensor
&
master_param
,
float
rho
,
float
epsilon
,
bool
multi_precision
,
MetaTensor
*
param_out
,
MetaTensor
*
avg_squared_grad_out
,
MetaTensor
*
avg_squared_update_out
)
{
MetaTensor
*
avg_squared_update_out
,
MetaTensor
*
master_param_out
)
{
auto
param_dims
=
param
.
dims
();
PADDLE_ENFORCE_EQ
(
param_dims
,
...
...
paddle/phi/infermeta/multiary.h
浏览文件 @
a8a2b7f4
...
...
@@ -43,11 +43,14 @@ void AdadeltaInferMeta(const MetaTensor& param,
const
MetaTensor
&
grad
,
const
MetaTensor
&
avg_squared_grad
,
const
MetaTensor
&
avg_squared_update
,
const
MetaTensor
&
master_param
,
float
rho
,
float
epsilon
,
bool
multi_precision
,
MetaTensor
*
param_out
,
MetaTensor
*
avg_squared_grad_out
,
MetaTensor
*
avg_squared_update_out
);
MetaTensor
*
avg_squared_update_out
,
MetaTensor
*
master_param_outs
);
void
AdagradInferMeta
(
const
MetaTensor
&
param
,
const
MetaTensor
&
grad
,
...
...
paddle/phi/kernels/adadelta_kernel.h
浏览文件 @
a8a2b7f4
...
...
@@ -24,10 +24,13 @@ void AdadeltaKernel(const Context& dev_ctx,
const
DenseTensor
&
grad
,
const
DenseTensor
&
avg_squared_grad
,
const
DenseTensor
&
avg_squared_update
,
const
paddle
::
optional
<
DenseTensor
>&
master_param
,
float
rho
,
float
epsilon
,
bool
multi_precision
,
DenseTensor
*
param_out
,
DenseTensor
*
avg_squared_grad_out
,
DenseTensor
*
avg_squared_update_out
);
DenseTensor
*
avg_squared_update_out
,
DenseTensor
*
master_param_outs
);
}
// namespace phi
paddle/phi/kernels/gpu/adadelta_kernel.cu
浏览文件 @
a8a2b7f4
...
...
@@ -18,5 +18,10 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/adadelta_kernel_impl.h"
PD_REGISTER_KERNEL
(
adadelta
,
GPU
,
ALL_LAYOUT
,
phi
::
AdadeltaKernel
,
float
,
double
)
{}
PD_REGISTER_KERNEL
(
adadelta
,
GPU
,
ALL_LAYOUT
,
phi
::
AdadeltaKernel
,
float
,
double
,
phi
::
dtype
::
float16
)
{}
paddle/phi/kernels/impl/adadelta_kernel_impl.h
浏览文件 @
a8a2b7f4
...
...
@@ -14,6 +14,7 @@
#pragma once
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/kernels/adadelta_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
...
...
@@ -26,40 +27,58 @@ void AdadeltaKernel(const Context& dev_ctx,
const
DenseTensor
&
grad
,
const
DenseTensor
&
avg_squared_grad
,
const
DenseTensor
&
avg_squared_update
,
const
paddle
::
optional
<
DenseTensor
>&
master_param
,
float
rho
,
float
epsilon
,
bool
multi_precision
,
DenseTensor
*
param_out
,
DenseTensor
*
avg_squared_grad_out
,
DenseTensor
*
avg_squared_update_out
)
{
DenseTensor
*
avg_squared_update_out
,
DenseTensor
*
master_param_outs
)
{
using
MPDType
=
typename
phi
::
dtype
::
template
MPTypeTrait
<
T
>
::
Type
;
dev_ctx
.
template
Alloc
<
T
>(
param_out
);
dev_ctx
.
template
Alloc
<
T
>(
avg_squared_grad_out
);
dev_ctx
.
template
Alloc
<
T
>(
avg_squared_update_out
);
dev_ctx
.
template
Alloc
<
MPDType
>(
avg_squared_grad_out
);
dev_ctx
.
template
Alloc
<
MPDType
>(
avg_squared_update_out
);
T
rho_
=
static_cast
<
T
>
(
rho
);
T
epsilon_
=
static_cast
<
T
>
(
epsilon
);
MPDType
rho_
=
static_cast
<
MPDType
>
(
rho
);
MPDType
epsilon_
=
static_cast
<
MPDType
>
(
epsilon
);
auto
eigen_param
=
EigenVector
<
T
>::
Flatten
(
param
);
auto
eigen_grad
=
EigenVector
<
T
>::
Flatten
(
grad
);
// Squared gradient accumulator
auto
eigen_avg_squared_grad
=
EigenVector
<
T
>::
Flatten
(
avg_squared_grad
);
auto
eigen_avg_squared_grad
=
EigenVector
<
MPDType
>::
Flatten
(
avg_squared_grad
);
// Squared updates accumulator
auto
eigen_avg_squared_update
=
EigenVector
<
T
>::
Flatten
(
avg_squared_update
);
auto
eigen_avg_squared_update
=
EigenVector
<
MPDType
>::
Flatten
(
avg_squared_update
);
auto
eigen_param_out
=
EigenVector
<
T
>::
Flatten
(
*
param_out
);
auto
eigen_avg_squared_grad_out
=
EigenVector
<
T
>::
Flatten
(
*
avg_squared_grad_out
);
EigenVector
<
MPDType
>::
Flatten
(
*
avg_squared_grad_out
);
auto
eigen_avg_squared_update_out
=
EigenVector
<
T
>::
Flatten
(
*
avg_squared_update_out
);
EigenVector
<
MPDType
>::
Flatten
(
*
avg_squared_update_out
);
auto
&
place
=
*
dev_ctx
.
eigen_device
();
auto
eigen_grad_cast
=
eigen_grad
.
template
cast
<
MPDType
>();
eigen_avg_squared_grad_out
.
device
(
place
)
=
rho_
*
eigen_avg_squared_grad
+
(
1
-
rho_
)
*
eigen_grad
.
square
();
rho_
*
eigen_avg_squared_grad
+
(
1
-
rho_
)
*
eigen_grad
_cast
.
square
();
auto
update
=
-
((
eigen_avg_squared_update
+
epsilon_
)
/
(
eigen_avg_squared_grad_out
+
epsilon_
))
.
sqrt
()
*
eigen_grad
;
eigen_grad
_cast
;
eigen_avg_squared_update_out
.
device
(
place
)
=
rho_
*
eigen_avg_squared_update
+
(
1
-
rho_
)
*
update
.
square
();
eigen_param_out
.
device
(
place
)
=
eigen_param
+
update
;
if
(
multi_precision
)
{
auto
eigen_master_param_out
=
EigenVector
<
MPDType
>::
Flatten
(
*
master_param_outs
);
auto
eigen_master_param
=
EigenVector
<
MPDType
>::
Flatten
(
*
master_param
);
eigen_master_param_out
.
device
(
place
)
=
eigen_master_param
+
update
;
eigen_param_out
.
device
(
place
)
=
(
eigen_param
.
template
cast
<
MPDType
>()
+
update
).
template
cast
<
T
>();
}
else
{
eigen_param_out
.
device
(
place
)
=
eigen_param
+
update
.
template
cast
<
T
>();
}
}
}
// namespace phi
paddle/phi/kernels/xpu/adadelta_kernel.cc
浏览文件 @
a8a2b7f4
...
...
@@ -25,11 +25,14 @@ void AdadeltaKernel(const Context& dev_ctx,
const
DenseTensor
&
grad
,
const
DenseTensor
&
avg_squared_grad
,
const
DenseTensor
&
avg_squared_update
,
const
paddle
::
optional
<
DenseTensor
>&
master_param
,
float
rho
,
float
epsilon
,
bool
multi_precision
,
DenseTensor
*
param_out
,
DenseTensor
*
avg_squared_grad_out
,
DenseTensor
*
avg_squared_update_out
)
{
DenseTensor
*
avg_squared_update_out
,
DenseTensor
*
master_param_outs
)
{
dev_ctx
.
template
Alloc
<
T
>(
param_out
);
dev_ctx
.
template
Alloc
<
T
>(
avg_squared_grad_out
);
dev_ctx
.
template
Alloc
<
T
>(
avg_squared_update_out
);
...
...
paddle/phi/ops/compat/adadelta_sig.cc
0 → 100644
浏览文件 @
a8a2b7f4
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {

/**
 * Builds the kernel signature used to dispatch the legacy `adadelta` operator.
 *
 * When the "Grad" input is a dense tensor, the returned signature names the
 * "adadelta" kernel together with the operator's inputs, attributes and
 * outputs, in the order listed below. For any other kind of "Grad" input the
 * "unregistered" signature with empty argument lists is returned.
 *
 * @param ctx Argument-mapping context describing the operator's arguments.
 * @return The kernel signature selected for this operator instance.
 */
KernelSignature AdadeltaOpArgumentMapping(const ArgumentMappingContext& ctx) {
  // Only the dense-gradient form is mapped to a kernel.
  if (!ctx.IsDenseTensorInput("Grad")) {
    return KernelSignature("unregistered", {}, {}, {});
  }
  return KernelSignature(
      "adadelta",
      {"Param", "Grad", "AvgSquaredGrad", "AvgSquaredUpdate", "MasterParam"},
      {"rho", "epsilon", "multi_precision"},
      {"ParamOut",
       "AvgSquaredGradOut",
       "AvgSquaredUpdateOut",
       "MasterParamOut"});
}

}  // namespace phi

// Register the mapping function under the operator name `adadelta`.
PD_REGISTER_ARG_MAPPING_FN(adadelta, phi::AdadeltaOpArgumentMapping);
python/paddle/fluid/optimizer.py
浏览文件 @
a8a2b7f4
...
...
@@ -3181,14 +3181,87 @@ class AdadeltaOptimizer(Optimizer):
name
=
name
,
)
self
.
type
=
"adadelta"
self
.
_multi_precision
=
False
self
.
_master_weights
=
{}
self
.
_epsilon
=
epsilon
self
.
_rho
=
rho
def
_create_master_weight
(
self
,
param
):
if
param
.
name
in
self
.
_master_weights
:
var
=
self
.
_master_weights
[
param
.
name
]
else
:
assert
isinstance
(
self
.
helper
,
LayerHelper
)
var_name
=
param
.
name
+
'_fp32_master'
var_name
=
unique_name
.
generate
(
var_name
)
var
=
paddle
.
static
.
create_global_var
(
name
=
var_name
,
shape
=
param
.
shape
,
value
=
0
,
dtype
=
'float32'
,
persistable
=
True
,
)
block
=
self
.
helper
.
startup_program
.
global_block
()
block
.
append_op
(
type
=
"cast"
,
inputs
=
{
"X"
:
[
param
]},
outputs
=
{
"Out"
:
[
var
]},
attrs
=
{
"in_dtype"
:
param
.
dtype
,
"out_dtype"
:
core
.
VarDesc
.
VarType
.
FP32
,
},
)
self
.
_master_weights
[
param
.
name
]
=
var
return
var
def
_get_accumulator
(
self
,
name
,
param
):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if
self
.
_name
is
not
None
:
name
=
self
.
_name
+
"_"
+
name
find_master
=
(
self
.
_multi_precision
and
param
.
dtype
==
core
.
VarDesc
.
VarType
.
FP16
)
target_param
=
(
self
.
_master_weights
[
param
.
name
]
if
find_master
else
param
)
target_name
=
target_param
.
name
if
(
name
not
in
self
.
_accumulators
or
target_name
not
in
self
.
_accumulators
[
name
]
):
raise
Exception
(
"Accumulator {} does not exist for parameter {}"
.
format
(
name
,
target_name
)
)
return
self
.
_accumulators
[
name
][
target_name
]
def _create_accumulators(self, block, parameters):
    """Register the two Adadelta accumulators for every parameter.

    For an FP16 parameter with multi-precision enabled, a master weight is
    created and both accumulators are attached to it. Otherwise the
    accumulators are attached to the parameter itself, and a warning is
    emitted when the parameter is FP16.

    Args:
        block: must be a ``framework.Block``.
        parameters: iterable of parameters to create accumulators for.

    Raises:
        TypeError: if ``block`` is not a ``framework.Block``.
    """
    if not isinstance(block, framework.Block):
        raise TypeError("block is not instance of framework.Block.")

    for p in parameters:
        is_fp16 = p.dtype == core.VarDesc.VarType.FP16
        if self._multi_precision and is_fp16:
            master_p = self._create_master_weight(p)
            self._add_accumulator(self._avg_squared_grad_acc_str, master_p)
            self._add_accumulator(self._avg_squared_update_acc_str, master_p)
            continue
        # Reaching this point with an FP16 parameter means multi-precision
        # is disabled, so the accumulators will be kept in FP16.
        if is_fp16:
            warnings.warn(
                "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence. "
                "Consider using multi_precision=True option of the Adadelta optimizer."
            )
        self._add_accumulator(self._avg_squared_grad_acc_str, p)
        self._add_accumulator(self._avg_squared_update_acc_str, p)
...
...
@@ -3202,6 +3275,15 @@ class AdadeltaOptimizer(Optimizer):
avg_squared_update_acc
=
self
.
_get_accumulator
(
self
.
_avg_squared_update_acc_str
,
param_and_grad
[
0
]
)
find_master
=
(
self
.
_multi_precision
and
param_and_grad
[
0
].
dtype
==
core
.
VarDesc
.
VarType
.
FP16
)
master_weight
=
(
self
.
_master_weights
[
param_and_grad
[
0
].
name
]
if
find_master
else
None
)
if
in_dygraph_mode
():
_C_ops
.
adadelta_
(
...
...
@@ -3209,25 +3291,38 @@ class AdadeltaOptimizer(Optimizer):
param_and_grad
[
1
],
avg_squared_grad_acc
,
avg_squared_update_acc
,
master_weight
,
self
.
_rho
,
self
.
_epsilon
,
find_master
,
)
else
:
# Create the adadelta optimizer op
inputs
=
{
"Param"
:
param_and_grad
[
0
],
"Grad"
:
param_and_grad
[
1
],
"AvgSquaredGrad"
:
avg_squared_grad_acc
,
"AvgSquaredUpdate"
:
avg_squared_update_acc
,
}
outputs
=
{
"ParamOut"
:
param_and_grad
[
0
],
"AvgSquaredGradOut"
:
avg_squared_grad_acc
,
"AvgSquaredUpdateOut"
:
avg_squared_update_acc
,
}
if
find_master
:
inputs
[
"MasterParam"
]
=
master_weight
outputs
[
"MasterParamOut"
]
=
master_weight
adadelta_op
=
block
.
append_op
(
type
=
self
.
type
,
inputs
=
{
"Param"
:
param_and_grad
[
0
],
"Grad"
:
param_and_grad
[
1
],
"AvgSquaredGrad"
:
avg_squared_grad_acc
,
"AvgSquaredUpdate"
:
avg_squared_update_acc
,
},
outputs
=
{
"ParamOut"
:
param_and_grad
[
0
],
"AvgSquaredGradOut"
:
avg_squared_grad_acc
,
"AvgSquaredUpdateOut"
:
avg_squared_update_acc
,
inputs
=
inputs
,
outputs
=
outputs
,
attrs
=
{
"epsilon"
:
self
.
_epsilon
,
"rho"
:
self
.
_rho
,
"multi_precision"
:
find_master
,
},
attrs
=
{
"epsilon"
:
self
.
_epsilon
,
"rho"
:
self
.
_rho
},
stop_gradient
=
True
,
)
...
...
python/paddle/fluid/tests/unittests/test_adadelta_op.py
浏览文件 @
a8a2b7f4
...
...
@@ -203,5 +203,281 @@ class TestAdadeltaV2Group(TestAdadeltaV2):
adam
.
clear_gradients
()
class TestAdadeltaOpMultiPrecison(unittest.TestCase):
    """Smoke test: run a couple of dygraph Adadelta steps on each available
    place, with and without ``_multi_precision`` set on the optimizer."""

    def _test_adadelta_op_dygraph_place_amp(self, place, use_amp=False):
        """Train a 5x5 ``Linear`` layer for two steps on ``place``.

        AMP (level O2 with a ``GradScaler``) is only used when ``place`` is
        ``'gpu'`` and ``use_amp`` is true; in every other case a plain
        forward/backward/step is executed. The optimizer's
        ``_multi_precision`` flag always follows ``use_amp``.
        """
        import paddle

        paddle.disable_static()
        paddle.seed(10)
        paddle.set_device(place)

        input = paddle.randn((5, 5))
        model = paddle.nn.Linear(5, 5)
        optimizer = paddle.optimizer.Adadelta(
            learning_rate=0.01,
            parameters=model.parameters(),
            weight_decay=0.1,
        )
        optimizer._multi_precision = use_amp

        # Decorate the model and build the scaler once, before training,
        # instead of repeating both on every iteration.
        run_amp = place == 'gpu' and use_amp
        if run_amp:
            model = paddle.amp.decorate(models=model, level='O2')
            scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

        for _ in range(2):
            if run_amp:
                with paddle.amp.auto_cast(level='O2'):
                    output = model(input)
                    loss = paddle.mean(output)
                scaled = scaler.scale(loss)
                scaled.backward()
                scaler.step(optimizer)
                optimizer.clear_grad()
            else:
                output = model(input)
                loss = paddle.mean(output)
                loss.backward()
                optimizer.step()
                optimizer.clear_grad()
        paddle.enable_static()

    def _get_places(self):
        """Return ``['cpu']``, plus ``'gpu'`` when Paddle is built with CUDA."""
        import paddle

        places = ['cpu']
        if paddle.is_compiled_with_cuda():
            places.append('gpu')
        return places

    def test_main(self):
        for place in self._get_places():
            for use_amp in [True, False]:
                self._test_adadelta_op_dygraph_place_amp(place, use_amp)
class TestAdadeltaMultiPrecision2_0(unittest.TestCase):
    """Compares ``paddle.optimizer.Adadelta`` trained with AMP O2 plus
    multi-precision against a plain FP32 run, in dygraph and static mode."""

    def dygraph_adadelta_mp(self, mp, use_amp):
        """Run five dygraph steps on GPU; return the last output and the
        model parameters."""
        paddle.disable_static()
        paddle.seed(100)
        paddle.set_device('gpu')
        features = paddle.randn((2, 2))
        model = paddle.nn.Linear(2, 2)
        optimizer = paddle.optimizer.Adadelta(
            0.5, parameters=model.parameters()
        )
        optimizer._multi_precision = mp
        if use_amp:
            model = paddle.amp.decorate(models=model, level='O2')
            scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

        for _ in range(5):
            if not use_amp:
                output = model(features)
                loss = paddle.mean(output)
                loss.backward()
                optimizer.step()
                optimizer.clear_grad()
                continue

            with paddle.amp.auto_cast(level='O2'):
                output = model(features)
                loss = paddle.mean(output)
            scaled = scaler.scale(loss)
            scaled.backward()
            scaler.minimize(optimizer, scaled)
            optimizer.clear_grad()

        return output, model.parameters()

    def static_adadelta_mp(self, mp, use_amp):
        """Run five static-graph steps on GPU; return the fetched losses."""
        paddle.enable_static()
        paddle.seed(100)
        np.random.seed(100)
        executor = paddle.static.Executor('gpu')
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()
        optimizer = paddle.optimizer.Adadelta(0.1)
        optimizer._multi_precision = mp

        if use_amp:
            optimizer = paddle.static.amp.decorate(
                optimizer,
                init_loss_scaling=128.0,
                use_dynamic_loss_scaling=True,
                use_pure_fp16=True,
                use_fp16_guard=False,
            )

        # The graph input and the fed array share one dtype per run.
        feed_dtype = 'float16' if use_amp else 'float32'
        with paddle.static.program_guard(main_prog, startup_prog):
            data = paddle.static.data(
                shape=[2, 2], name='X', dtype=feed_dtype
            )
            hidden = paddle.static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer.minimize(loss)
        executor.run(startup_prog)

        if use_amp:
            optimizer.amp_init(
                place='gpu', scope=paddle.static.global_scope()
            )
        x = np.random.random(size=(2, 2)).astype(feed_dtype)

        losses = []
        for _ in range(5):
            (loss_data,) = executor.run(
                main_prog, feed={"X": x}, fetch_list=[loss.name]
            )
            losses.append(loss_data)
        return losses

    def test_main(self):
        if not paddle.is_compiled_with_cuda():
            return

        def check_close(actual, expected):
            np.testing.assert_allclose(
                actual, expected, rtol=1e-05, atol=0.1
            )

        # Test dygraph mode
        output1_dy, params1_dy = self.dygraph_adadelta_mp(
            use_amp=True, mp=True
        )
        output2_dy, params2_dy = self.dygraph_adadelta_mp(
            use_amp=False, mp=False
        )
        check_close(
            output1_dy.astype('float32').numpy(),
            output2_dy.astype('float32').numpy(),
        )
        for idx, amp_param in enumerate(params1_dy):
            check_close(
                amp_param.astype('float32').numpy(),
                params2_dy[idx].astype('float32').numpy(),
            )

        # Test static mode
        output1_st = self.static_adadelta_mp(use_amp=True, mp=True)
        output2_st = self.static_adadelta_mp(use_amp=False, mp=False)
        for idx, amp_loss in enumerate(output1_st):
            check_close(
                amp_loss.astype('float32'),
                output2_st[idx].astype('float32'),
            )
class TestAdadeltaMultiPrecision1_0(unittest.TestCase):
    """Compares ``paddle.fluid.optimizer.Adadelta`` trained with AMP O2 plus
    multi-precision against a plain FP32 run, in dygraph and static mode."""

    def dygraph_adadelta_mp(self, use_amp, mp):
        """Run five dygraph steps on GPU; return the last output and the
        model parameters."""
        paddle.disable_static()
        paddle.seed(10)
        paddle.set_device('gpu')
        input = paddle.randn((2, 2))
        model = paddle.nn.Linear(2, 2)
        optimizer = paddle.fluid.optimizer.Adadelta(
            learning_rate=0.001,
            parameter_list=model.parameters(),
        )
        optimizer._multi_precision = mp
        if use_amp:
            model = paddle.amp.decorate(models=model, level='O2')
            scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

        for _ in range(5):
            if use_amp:
                with paddle.amp.auto_cast(level='O2'):
                    output = model(input)
                    loss = paddle.mean(output)
                scaled = scaler.scale(loss)
                scaled.backward()
                scaler.minimize(optimizer, scaled)
                optimizer.clear_gradients()
            else:
                output = model(input)
                loss = paddle.mean(output)
                # Run backward explicitly so the FP32 baseline has gradients
                # to apply, mirroring the AMP branch above.
                loss.backward()
                optimizer.minimize(loss)
                optimizer.clear_gradients()

        return output, model.parameters()

    def static_adadelta_mp(self, use_amp, mp):
        """Run five static-graph steps on GPU; return the fetched losses."""
        paddle.enable_static()
        paddle.seed(100)
        np.random.seed(100)
        exe = paddle.static.Executor('gpu')
        train_program = paddle.static.Program()
        startup_program = paddle.static.Program()
        optimizer = paddle.fluid.optimizer.Adadelta(learning_rate=0.001)
        optimizer._multi_precision = mp

        if use_amp:
            optimizer = paddle.static.amp.decorate(
                optimizer,
                init_loss_scaling=128.0,
                use_dynamic_loss_scaling=True,
                use_pure_fp16=True,
                use_fp16_guard=False,
            )
        with paddle.static.program_guard(train_program, startup_program):
            if use_amp:
                data = paddle.static.data(
                    shape=[2, 2], name='X', dtype='float16'
                )
            else:
                data = paddle.static.data(
                    shape=[2, 2], name='X', dtype='float32'
                )
            hidden = paddle.static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer.minimize(loss)
        exe.run(startup_program)

        if use_amp:
            optimizer.amp_init(
                place='gpu', scope=paddle.static.global_scope()
            )
            x = np.random.random(size=(2, 2)).astype('float16')
        else:
            x = np.random.random(size=(2, 2)).astype('float32')
        out = []
        for _ in range(5):
            (loss_data,) = exe.run(
                train_program, feed={"X": x}, fetch_list=[loss.name]
            )
            out.append(loss_data)
        return out

    def test_main(self):
        if not paddle.is_compiled_with_cuda():
            return
        # Test dygraph mode
        output1_dy, params1_dy = self.dygraph_adadelta_mp(
            use_amp=True, mp=True
        )
        output2_dy, params2_dy = self.dygraph_adadelta_mp(
            use_amp=False, mp=False
        )
        np.testing.assert_allclose(
            output1_dy.astype('float32').numpy(),
            output2_dy.astype('float32').numpy(),
            rtol=1e-05,
            atol=0.1,
        )
        for idx in range(len(params1_dy)):
            np.testing.assert_allclose(
                params1_dy[idx].astype('float32').numpy(),
                params2_dy[idx].astype('float32').numpy(),
                rtol=1e-05,
                atol=0.1,
            )
        # Test static mode
        output1_st = self.static_adadelta_mp(use_amp=True, mp=True)
        output2_st = self.static_adadelta_mp(use_amp=False, mp=False)
        for idx in range(len(output1_st)):
            np.testing.assert_allclose(
                output1_st[idx].astype('float32'),
                output2_st[idx].astype('float32'),
                rtol=1e-05,
                atol=0.1,
            )
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/optimizer/adadelta.py
浏览文件 @
a8a2b7f4
...
...
@@ -12,10 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
warnings
import
paddle
from
paddle
import
_C_ops
from
..fluid
import
framework
from
..fluid
import
core
,
framework
,
unique_name
from
..fluid.dygraph
import
no_grad
from
..fluid.layer_helper
import
LayerHelper
from
..framework
import
in_dygraph_mode
from
.optimizer
import
Optimizer
...
...
@@ -130,6 +134,8 @@ class Adadelta(Optimizer):
grad_clip
=
grad_clip
,
name
=
name
,
)
self
.
_multi_precision
=
False
self
.
_master_weights
=
{}
self
.
type
=
"adadelta"
self
.
_epsilon
=
epsilon
self
.
_rho
=
rho
...
...
@@ -138,6 +144,62 @@ class Adadelta(Optimizer):
'rho'
:
rho
,
}
def
_create_master_weight
(
self
,
param
):
if
param
.
name
in
self
.
_master_weights
:
var
=
self
.
_master_weights
[
param
.
name
]
else
:
assert
isinstance
(
self
.
helper
,
LayerHelper
)
var_name
=
param
.
name
+
"_fp32_master"
var_name
=
unique_name
.
generate
(
var_name
)
var
=
paddle
.
static
.
create_global_var
(
name
=
var_name
,
shape
=
param
.
shape
,
value
=
0
,
dtype
=
'float32'
,
persistable
=
True
,
)
block
=
self
.
helper
.
startup_program
.
global_block
()
block
.
append_op
(
type
=
"cast"
,
inputs
=
{
"X"
:
[
param
]},
outputs
=
{
"Out"
:
[
var
]},
attrs
=
{
"in_dtype"
:
param
.
dtype
,
"out_dtype"
:
core
.
VarDesc
.
VarType
.
FP32
,
},
)
self
.
_master_weights
[
param
.
name
]
=
var
return
var
def
_get_accumulator
(
self
,
name
,
param
):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if
self
.
_name
is
not
None
:
name
=
self
.
_name
+
"_"
+
name
find_master
=
(
self
.
_multi_precision
and
param
.
dtype
==
core
.
VarDesc
.
VarType
.
FP16
)
target_param
=
(
self
.
_master_weights
[
param
.
name
]
if
find_master
else
param
)
target_name
=
target_param
.
name
if
(
name
not
in
self
.
_accumulators
or
target_name
not
in
self
.
_accumulators
[
name
]
):
raise
Exception
(
"Accumulator {} does not exist for parameter {}"
.
format
(
name
,
target_name
)
)
return
self
.
_accumulators
[
name
][
target_name
]
def
_create_accumulators
(
self
,
block
,
parameters
):
if
not
isinstance
(
block
,
framework
.
Block
):
raise
TypeError
(
"block is not instance of framework.Block."
)
...
...
@@ -145,6 +207,21 @@ class Adadelta(Optimizer):
parameters
=
parameters
.
get
(
'params'
)
for
p
in
parameters
:
if
self
.
_multi_precision
and
p
.
dtype
==
core
.
VarDesc
.
VarType
.
FP16
:
master_p
=
self
.
_create_master_weight
(
p
)
self
.
_add_accumulator
(
self
.
_avg_squared_grad_acc_str
,
master_p
)
self
.
_add_accumulator
(
self
.
_avg_squared_update_acc_str
,
master_p
)
continue
if
(
p
.
dtype
==
core
.
VarDesc
.
VarType
.
FP16
and
not
self
.
_multi_precision
):
warnings
.
warn
(
"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Lars optimizer."
)
self
.
_add_accumulator
(
self
.
_avg_squared_grad_acc_str
,
p
)
self
.
_add_accumulator
(
self
.
_avg_squared_update_acc_str
,
p
)
...
...
@@ -158,6 +235,15 @@ class Adadelta(Optimizer):
avg_squared_update_acc
=
self
.
_get_accumulator
(
self
.
_avg_squared_update_acc_str
,
param_and_grad
[
0
]
)
find_master
=
(
self
.
_multi_precision
and
param_and_grad
[
0
].
dtype
==
core
.
VarDesc
.
VarType
.
FP16
)
master_weight
=
(
self
.
_master_weights
[
param_and_grad
[
0
].
name
]
if
find_master
else
None
)
if
in_dygraph_mode
():
with
no_grad
():
...
...
@@ -166,8 +252,10 @@ class Adadelta(Optimizer):
param_and_grad
[
1
],
avg_squared_grad_acc
,
avg_squared_update_acc
,
master_weight
,
self
.
_rho
,
self
.
_epsilon
,
find_master
,
)
return
None
else
:
...
...
@@ -175,20 +263,29 @@ class Adadelta(Optimizer):
raise
TypeError
(
"block is not instance of framework.Block."
)
# Create the adadelta optimizer op
inputs
=
{
"Param"
:
param_and_grad
[
0
],
"Grad"
:
param_and_grad
[
1
],
"AvgSquaredGrad"
:
avg_squared_grad_acc
,
"AvgSquaredUpdate"
:
avg_squared_update_acc
,
}
outputs
=
{
"ParamOut"
:
param_and_grad
[
0
],
"AvgSquaredGradOut"
:
avg_squared_grad_acc
,
"AvgSquaredUpdateOut"
:
avg_squared_update_acc
,
}
if
find_master
:
inputs
[
"MasterParam"
]
=
master_weight
outputs
[
"MasterParamOut"
]
=
master_weight
adadelta_op
=
block
.
append_op
(
type
=
self
.
type
,
inputs
=
{
"Param"
:
param_and_grad
[
0
],
"Grad"
:
param_and_grad
[
1
],
"AvgSquaredGrad"
:
avg_squared_grad_acc
,
"AvgSquaredUpdate"
:
avg_squared_update_acc
,
},
outputs
=
{
"ParamOut"
:
param_and_grad
[
0
],
"AvgSquaredGradOut"
:
avg_squared_grad_acc
,
"AvgSquaredUpdateOut"
:
avg_squared_update_acc
,
inputs
=
inputs
,
outputs
=
outputs
,
attrs
=
{
"epsilon"
:
self
.
_epsilon
,
"rho"
:
self
.
_rho
,
"multi_precision"
:
find_master
,
},
attrs
=
{
"epsilon"
:
self
.
_epsilon
,
"rho"
:
self
.
_rho
},
stop_gradient
=
True
,
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录