Unverified commit adaeee4d, authored by zhangbo9674, committed by GitHub

[AMP] Support pure fp16 training mode for dygraph (#35521)

* add pure fp16 major function in auto_cast & tracer

* support master weight in dygraph for pure fp16

* check mix dtype of fp16&fp32 for check_finite_and_unscale op

* change pure fp16 function name

* fix some bugs in auto_cast

* refine auto_cast interface logic

* add param _casted_by_pure_fp16 for class Layer

* support state_dict hook for saving the model in a user-appointed dtype in pure_fp16_decorator

* refine pure_fp16_decorator as decorator

* add unittest

* add comment

* add comment

* support recompute

* add comment for auto_cast and decorator

* support to_static_state_dict for paddle.jit.save

* remove the limit on the number of models and optimizers

* add lookup_table in black_list

* fix momentum and layer state_dict

* fix bug in layer state_dict

* fix bug in layer state_dict_helper

* refine unittest

* refine test_momentum_op

* refine interface and some code

* refine amp_decorator interface

* refine pure fp16 interface

* refine master weight interface
Parent 68ae6345
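For orientation, the user-facing training flow this change enables looks roughly like the sketch below. It is assembled from the docstrings added in this diff; the model, optimizer, and data are illustrative placeholders rather than part of the change, and it assumes a CUDA device with fp16 support.

import paddle

model = paddle.nn.Linear(10, 10)
optimizer = paddle.optimizer.Momentum(learning_rate=0.01, parameters=model.parameters())

# O2 casts the model parameters to fp16 (except BatchNorm/LayerNorm) and, by
# default, keeps fp32 master weights inside the optimizer for the update step.
model, optimizer = paddle.amp.decorate(models=model, optimizers=optimizer, level='O2')
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

data = paddle.rand([4, 10])
with paddle.amp.auto_cast(level='O2'):
    loss = model(data).mean()

scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
optimizer.clear_grad()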
@@ -117,7 +117,7 @@ static inline std::shared_ptr<imperative::VarBase> CastToType(
imperative::NameVarBaseMap outs = {{"Out", {out}}};
{
- AutoCastGuard guard(tracer, false);
AutoCastGuard guard(tracer, 0);
tracer->TraceOp("cast", ins, outs, std::move(attrs));
}
@@ -225,5 +225,30 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type,
return new_ins;
}
NameVarBaseMap CastPureFp16Inputs(const std::string& op_type,
const NameVarBaseMap& ins) {
NameVarBaseMap new_ins(ins);
auto dst_type = framework::proto::VarType::FP16;
if (AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count(op_type) ||
AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) {
dst_type = framework::proto::VarType::FP32;
}
for (auto& pair : new_ins) {
if ((op_type == "batch_norm" || op_type == "layer_norm" ||
op_type == "sync_batch_norm") &&
pair.first != "X") {
continue;
}
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to "
<< framework::DataTypeToString(dst_type);
for (auto& var : pair.second) {
var = (dst_type == framework::proto::VarType::FP32 ? CastToFP32(var)
: CastToFP16(var));
}
}
return new_ins;
}
} // namespace imperative
} // namespace paddle
@@ -63,15 +63,16 @@ std::ostream& operator<<(std::ostream& os, AmpOperators& ops);
// NOTE(zhiqiu): AutoCastGuard is used for RAII.
class AutoCastGuard {
public:
- AutoCastGuard(std::shared_ptr<Tracer> tracer, bool guard_mode)
AutoCastGuard(std::shared_ptr<Tracer> tracer, int guard_level)
: tracer_(tracer) {
- pre_mode_ = tracer_->IsAutoCastEnabled();
- if (pre_mode_ != guard_mode) {
- tracer_->SetEnableAutoCast(guard_mode);
pre_amp_level_ = tracer_->AMPLevel();
if (pre_amp_level_ != guard_level) {
tracer_->SetAMPLevel(guard_level);
}
}
- ~AutoCastGuard() { tracer_->SetEnableAutoCast(pre_mode_); }
~AutoCastGuard() { tracer_->SetAMPLevel(pre_amp_level_); }
// forbid copy and operator=
AutoCastGuard(const AutoCastGuard& guard) = delete;
@@ -79,11 +80,14 @@ class AutoCastGuard {
private:
std::shared_ptr<Tracer> tracer_;
- bool pre_mode_;
int pre_amp_level_;
};
NameVarBaseMap AutoCastInputs(const std::string& op_type,
const NameVarBaseMap& ins);
NameVarBaseMap CastPureFp16Inputs(const std::string& op_type,
const NameVarBaseMap& ins);
} // namespace imperative
} // namespace paddle
@@ -176,9 +176,12 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
: attr_checker->GetDefaultAttrMap();
NameVarBaseMap new_ins = ins;
- if (enable_autocast_) {
if (amp_level_ == 1) {
VLOG(5) << "Auto mixed precision run operator: " << type;
new_ins = AutoCastInputs(type, ins);
} else if (amp_level_ == 2) {
VLOG(5) << "Pure fp16 run operator: " << type;
new_ins = CastPureFp16Inputs(type, ins);
}
try {
......
@@ -105,9 +105,9 @@ class Tracer {
void SetHasGrad(bool has_grad) { has_grad_ = has_grad; }
- void SetEnableAutoCast(bool enabled) { enable_autocast_ = enabled; }
void SetAMPLevel(int level) { amp_level_ = level; }
- bool IsAutoCastEnabled() const { return enable_autocast_; }
int AMPLevel() const { return amp_level_; }
paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists(
const platform::Place& place);
@@ -118,9 +118,9 @@ class Tracer {
bool enable_program_desc_tracing_{false};
std::unique_ptr<UniqueNameGenerator> generator_;
platform::Place expected_place_;
- bool enable_autocast_{false};
GarbageCollectorMap gcs_;
static thread_local bool has_grad_;
int amp_level_{0};
};
// To access static variable current_tracer
......
@@ -1947,8 +1947,8 @@ void BindImperative(py::module *m_ptr) {
.def_property("_enable_program_desc_tracing",
&imperative::Tracer::IsProgramDescTracingEnabled,
&imperative::Tracer::SetEnableProgramDescTracing)
- .def_property("_enable_autocast", &imperative::Tracer::IsAutoCastEnabled,
- &imperative::Tracer::SetEnableAutoCast)
.def_property("_amp_level", &imperative::Tracer::AMPLevel,
&imperative::Tracer::SetAMPLevel)
.def_property("_has_grad", &imperative::Tracer::HasGrad,
&imperative::Tracer::SetHasGrad)
.def_property(
......
@@ -63,11 +63,15 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"moving_average_abs_max_scale", {"X", "InAccum", "InState"}},
{"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}},
{"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}},
- {"momentum", {"Param", "Grad", "Velocity", "LearningRate"}},
{"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}},
{"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}},
{"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}},
{"run_program", {"X", "Params"}},
- {"matrix_rank", {"X", "TolTensor"}}};
{"matrix_rank", {"X", "TolTensor"}},
{"adam",
{"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow",
"Beta2Pow", "MasterParam"}},
};
// NOTE(zhiqiu): Like op_ins_map.
// Commonly, the outputs in auto-generated OP function are determined by the
@@ -97,12 +101,15 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
{"Out", "OutScale", "OutAccum", "OutState"}},
{"multiclass_nms3", {"Out", "NmsRoisNum"}},
{"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
- {"momentum", {"ParamOut", "VelocityOut"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut"}},
{"rnn", {"DropoutState", "Reserve", "Out", "State"}},
{"lamb",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"run_program", {"DOut"}},
{"adam",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut",
"MasterParamOut"}},
};
// NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
@@ -119,13 +126,14 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{"sgd", {"ParamOut"}},
{"adam",
- {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut",
"MasterParamOut"}},
{"adamw",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"average_accumulates",
{"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates",
"out_old_num_accumulates", "out_num_updates"}},
- {"momentum", {"ParamOut", "VelocityOut"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut"}},
{"batch_norm", {"MeanOut", "VarianceOut"}},
{"sync_batch_norm", {"MeanOut", "VarianceOut"}},
......
@@ -14,5 +14,6 @@
from .auto_cast import auto_cast # noqa: F401
from .grad_scaler import GradScaler # noqa: F401
from .auto_cast import decorate # noqa: F401
- __all__ = ['auto_cast', 'GradScaler']
__all__ = ['auto_cast', 'GradScaler', 'decorate']
@@ -13,18 +13,22 @@
# limitations under the License.
from paddle.fluid.dygraph.amp import amp_guard
from paddle.fluid.dygraph.amp import amp_decorate
__all__ = []
- def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
def auto_cast(enable=True,
custom_white_list=None,
custom_black_list=None,
level='O1'):
"""
Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode.
If enabled, the input data type (float32 or float16) of each operator is decided
by autocast algorithm for better performance.
Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in
imperative mode. It is used together with `decorate` to achieve Pure fp16 in imperative mode.
Args:
enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
@@ -34,6 +38,8 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16
calculation and are considered numerically-dangerous and whose effects may also be
observed in downstream ops. These ops will not be converted to fp16.
level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represents mixed precision, where the input data type of each operator is cast according to the white_list and black_list;
O2 represents pure fp16, where all operator parameters and input data are cast to fp16, except for operators in the black_list, operators without an fp16 kernel, and batch norm. Default is O1 (amp).
Examples:
@@ -62,5 +68,66 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
c = a + b
print(c.dtype) # FP16
with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}, level='O2'):
d = a + b
print(d.dtype) # FP16
"""
return amp_guard(enable, custom_white_list, custom_black_list, level)
def decorate(models,
optimizers=None,
level='O1',
master_weight=None,
save_dtype=None):
"""
Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorator will do nothing.
When level is O2(pure fp16), the decorator will cast all parameters of models to FP16, except BatchNorm and LayerNorm.
Commonly, it is used together with `auto_cast` to achieve Pure fp16 in imperative mode.
Args:
models(Layer|list of Layer, optional): The defined models by user, models must be either a single model or a list of models. Default is None.
optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None.
level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represents mixed precision, where the decorator does nothing;
O2 represents pure fp16, where the decorator casts all parameters of models to FP16, except BatchNorm and LayerNorm. Default is O1 (amp).
master_weight(bool, optional): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, the optimizer uses multi-precision in O2 level. Default is None.
save_dtype(float, optional): The dtype used for saving model parameters with `paddle.save` or `paddle.jit.save`; it should be float16, float32, float64 or None.
The save_dtype does not change the model parameters' dtype, it only changes the state_dict dtype. When save_dtype is None, the save dtype is the same as the model dtype. Default is None.
Examples:
.. code-block:: python
# required: gpu
# Demo1: single model and optimizer:
import paddle
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
optimizer = paddle.optimizer.SGD(parameters=model.parameters())
model, optimizer = paddle.amp.decorate(models=model, optimizers=optimizer, level='O2')
data = paddle.rand([10, 3, 32, 32])
with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
output = model(data)
print(output.dtype) # FP16
# required: gpu
# Demo2: multi models and optimizers:
model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
optimizer2 = paddle.optimizer.Adam(parameters=model2.parameters())
models, optimizers = paddle.amp.decorate(models=[model, model2], optimizers=[optimizer, optimizer2], level='O2')
data = paddle.rand([10, 3, 32, 32])
with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
output = models[0](data)
output2 = models[1](data)
print(output.dtype) # FP16
print(output2.dtype) # FP16
""" """
return amp_guard(enable, custom_white_list, custom_black_list) return amp_decorate(models, optimizers, level, master_weight, save_dtype)
@@ -198,7 +198,11 @@ class _HPRecomputeFunction(PyLayer):
# TODO support AMP
tracer = framework._dygraph_tracer()
- ctx.is_fw_autocast = tracer._enable_autocast
if tracer._amp_level == 0:
ctx.is_fw_autocast = False
else:
ctx.is_fw_autocast = True
ctx.amp_mode = 'O1'
ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list()
with paddle.no_grad():
@@ -258,7 +262,8 @@ class _HPRecomputeFunction(PyLayer):
with paddle.amp.auto_cast(
enable=ctx.is_fw_autocast,
custom_white_list=ctx.amp_white_list,
- custom_black_list=ctx.amp_black_list):
custom_black_list=ctx.amp_black_list,
level=ctx.amp_mode):
detached_inputs = detach_variable(tuple(inputs))
outputs = ctx.run_function(*detached_inputs)
......
@@ -98,7 +98,11 @@ class RecomputeFunction(PyLayer):
# TODO support AMP
tracer = framework._dygraph_tracer()
- ctx.is_fw_autocast = tracer._enable_autocast
if tracer._amp_level == 0:
ctx.is_fw_autocast = False
else:
ctx.is_fw_autocast = True
ctx.amp_mode = 'O1'
ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list()
with paddle.no_grad():
@@ -128,14 +132,16 @@ class RecomputeFunction(PyLayer):
with paddle.amp.auto_cast(
enable=ctx.is_fw_autocast,
custom_white_list=ctx.amp_white_list,
- custom_black_list=ctx.amp_black_list):
custom_black_list=ctx.amp_black_list,
level=ctx.amp_mode):
detached_inputs = detach_variable(tuple(inputs))
outputs = ctx.run_function(*detached_inputs)
else:
with paddle.amp.auto_cast(
enable=ctx.is_fw_autocast,
custom_white_list=ctx.amp_white_list,
- custom_black_list=ctx.amp_black_list):
custom_black_list=ctx.amp_black_list,
level=ctx.amp_mode):
detached_inputs = detach_variable(tuple(inputs))
outputs = ctx.run_function(*detached_inputs)
......
@@ -203,19 +203,21 @@ class Momentum(Optimizer):
param_and_grad[0])
lr = self._create_param_lr(param_and_grad)
- if framework.in_dygraph_mode():
- _, _ = _C_ops.momentum(
- param_and_grad[0], param_and_grad[1], velocity_acc, lr,
- param_and_grad[0], velocity_acc, 'mu', self._momentum,
- 'use_nesterov', self._use_nesterov, 'regularization_method',
- self._regularization_method, 'regularization_coeff',
- self._regularization_coeff)
- return None
find_master = self._multi_precision and param_and_grad[
0].dtype == core.VarDesc.VarType.FP16
master_weight = (self._master_weights[param_and_grad[0].name]
if find_master else None)
if framework.in_dygraph_mode():
_, _, _ = _C_ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
master_weight, param_and_grad[0], velocity_acc, master_weight,
'mu', self._momentum, 'use_nesterov', self._use_nesterov,
'regularization_method', self._regularization_method,
'regularization_coeff', self._regularization_coeff,
'multi_precision', find_master)
return None
attrs = {
"mu": self._momentum,
"use_nesterov": self._use_nesterov,
......
@@ -19,8 +19,13 @@ import contextlib
from paddle.fluid.framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, dygraph_only, set_flags, get_flags
import warnings
import copy
import functools
import paddle
import operator
import types
import paddle.fluid as fluid
- __all__ = ['amp_guard']
__all__ = ['amp_guard', 'amp_decorate']
# The set of ops that support fp16 calculation and are considered numerically-
# safe and performance-critical. These ops are always converted to fp16.
@@ -64,15 +69,22 @@ AMP_RELATED_FLAGS_SETTING = {
'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
}
PURE_FP16_BLACK_LIST = {' '}
PURE_FP16_WHITE_LIST = {'lookup_table', 'lookup_table_v2'}
#NOTE(zhiqiu): similar as paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists._update_list
# The reason why not use AutoMixedPrecisionLists is that custom_black_varnames is not suitable for imperative mode.
- def _update_list(custom_white_list, custom_black_list):
def _update_list(custom_white_list, custom_black_list, level='O1'):
"""
Update black and white list according to users' custom list.
"""
if level == 'O1':
_white_list = copy.copy(WHITE_LIST)
_black_list = copy.copy(BLACK_LIST)
else:
_white_list = copy.copy(PURE_FP16_WHITE_LIST)
_black_list = copy.copy(PURE_FP16_BLACK_LIST)
if custom_white_list and custom_black_list:
for op_name in custom_white_list:
if op_name in custom_black_list:
@@ -97,28 +109,111 @@ def _in_amp_guard():
"""
tracer = _dygraph_tracer()
if tracer:
- return tracer._enable_autocast
if tracer._amp_level == 1:
return True
else:
return False
else:
return False
@dygraph_only
def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
if not enable_pure_fp16:
return models, optimizers
for idx in range(len(models)):
for layer in models[idx].sublayers(include_self=True):
layer._casted_by_pure_fp16 = True
if len(layer._sub_layers) is 0:
if (layer._dtype is 'float16') or isinstance(layer, (
paddle.nn.BatchNorm, paddle.nn.LayerNorm)):
continue
layer.to(dtype='float16')
for idx_opt in range(len(optimizers)):
# update _param_groups
if getattr(optimizers[idx_opt], '_param_groups', None) and isinstance(
optimizers[idx_opt]._param_groups[0], dict):
for param_group in optimizers[idx_opt]._param_groups:
for i, param in enumerate(param_group['params']):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(
include_self=True):
if id(param) in layer._parameters_transform_map:
param_group['params'][
i] = layer._parameters_transform_map[id(
param)][0]
for param_group in optimizers[idx_opt]._parameter_list:
params = param_group['params']
for i, param in enumerate(params):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(
include_self=True):
if id(param) in layer._parameters_transform_map:
params[i] = layer._parameters_transform_map[id(
param)][0]
# update _parameter_list
else:
for i, param in enumerate(optimizers[idx_opt]._parameter_list):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(include_self=True):
if id(param) in layer._parameters_transform_map:
optimizers[idx_opt]._parameter_list[
i] = layer._parameters_transform_map[id(param)][
0]
if hasattr(optimizers[idx_opt], '_param_groups'):
optimizers[idx_opt]._param_groups[
i] = layer._parameters_transform_map[id(
param)][0]
return models, optimizers
def check_models(models):
for model in models:
if not isinstance(model, paddle.nn.Layer):
raise RuntimeError(
"Current train mode is pure fp16, models should be paddle.nn.Layer, but receive {}.".
format(type(model)))
def check_optimizers(optimizers):
for optimizer in optimizers:
if not isinstance(optimizer, (paddle.optimizer.Optimizer,
paddle.fluid.optimizer.Optimizer)):
raise RuntimeError(
"Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer, but receive {}.".
format(type(optimizer)))
@signature_safe_contextmanager
@dygraph_only
- def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
def amp_guard(enable=True,
custom_white_list=None,
custom_black_list=None,
level='O1'):
"""
:api_attr: imperative
- Create a context which enables auto-mixed-precision(AMP) of operators executed in imperative mode.
Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode.
If enabled, the input data type (float32 or float16) of each operator is decided
by autocast algorithm for better performance.
- Commonly, it is used together with `AmpScaler` to achieve Auto-Mixed-Precision in
Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in
imperative mode. It is used together with `amp_decorate` to achieve Pure fp16 in imperative mode.
Args:
enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
- custom_white_list(set|list, optional): The custom white_list.
- custom_black_list(set|list, optional): The custom black_list.
custom_white_list(set|list|tuple, optional): The custom white_list. It's the set of ops that support
fp16 calculation and are considered numerically-safe and performance-critical. These ops
will be converted to fp16.
custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16
calculation and are considered numerically-dangerous and whose effects may also be
observed in downstream ops. These ops will not be converted to fp16.
level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represents mixed precision, where the input data type of each operator is cast according to the white_list and black_list;
O2 represents pure fp16, where all operator parameters and input data are cast to fp16, except for operators in the black_list, operators without an fp16 kernel, and batch norm. Default is O1 (amp).
Examples:
@@ -139,6 +234,11 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
print(conv.dtype) # FP32
"""
if not (level in ['O1', 'O2']):
raise ValueError(
"level should be O1 or O2, O1 represent AMP train mode, O2 represent Pure fp16 train mode."
)
tracer = _dygraph_tracer()
if not tracer:
raise ValueError(
@@ -151,17 +251,27 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
% tracer._expected_place)
enable = False
- # use default white_list and black_list if no custom lists provided
if level == 'O1':
amp_level = 1
_white_list = WHITE_LIST
_black_list = BLACK_LIST
else:
amp_level = 2
_white_list = PURE_FP16_WHITE_LIST
_black_list = PURE_FP16_BLACK_LIST
if custom_white_list or custom_black_list:
_white_list, _black_list = _update_list(custom_white_list,
- custom_black_list)
custom_black_list, level)
if not enable:
amp_level = 0
if tracer:
# enable auto_cast
- original_enable = tracer._enable_autocast
- tracer._enable_autocast = enable
original_amp_level = tracer._amp_level
tracer._amp_level = amp_level
# set amp op list
original_white_list, original_black_list = tracer._get_amp_op_list()
tracer._set_amp_op_list(_white_list, _black_list)
@@ -179,6 +289,141 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
yield
finally:
if tracer:
- tracer._enable_autocast = original_enable
tracer._amp_level = original_amp_level
tracer._set_amp_op_list(original_white_list, original_black_list)
# set_flags(original_flags)
class StateDictHook(object):
def __init__(self, save_dtype):
self._save_dtype = save_dtype
def __call__(self, state_dict):
for key in state_dict:
param = state_dict[key]
with fluid.dygraph.guard():
param_applied = paddle.cast(param, self._save_dtype)
param_applied.name = param.name
state_dict[key] = param_applied
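The StateDictHook above is the mechanism behind the `save_dtype` argument of `amp_decorate`: one hook is registered per sublayer, so the dict returned by `state_dict()` is cast to the requested dtype while the live parameters keep their training dtype. A hypothetical standalone use of the hook (it lives in this module and is not exported, so direct access here is purely for illustration) could look like this:

import paddle

layer = paddle.nn.Linear(4, 4)
# register_state_dict_hook is added to Layer in this change.
layer.register_state_dict_hook(StateDictHook('float16'))

state = layer.state_dict()        # values are cast to float16 by the hook
print(state['weight'].dtype)      # float16
paddle.save(state, 'linear_fp16.pdparams')
# The in-memory parameters are untouched:
print(layer.weight.dtype)         # float32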
@dygraph_only
def amp_decorate(models,
optimizers=None,
level='O1',
master_weight=None,
save_dtype=None):
"""
Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorator will do nothing.
When level is O2(pure fp16), the decorator will cast all parameters of models to FP16, except BatchNorm and LayerNorm.
Commonly, it is used together with `amp_guard` to achieve Pure fp16 in imperative mode.
Args:
models(Layer|list of Layer, optional): The defined models by user, models must be either a single model or a list of models. Default is None.
optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None.
level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represents mixed precision, where the decorator does nothing;
O2 represents pure fp16, where the decorator casts all parameters of models to FP16, except BatchNorm and LayerNorm. Default is O1 (amp).
master_weight(bool, optional): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, the optimizer uses multi-precision in O2 level. Default is None.
save_dtype(float, optional): The dtype used for saving model parameters with `paddle.save` or `paddle.jit.save`; it should be float16, float32, float64 or None.
The save_dtype does not change the model parameters' dtype, it only changes the state_dict dtype. When save_dtype is None, the save dtype is the same as the model dtype. Default is None.
Examples:
.. code-block:: python
# required: gpu
# Demo1: single model and optimizer:
import paddle
import paddle.fluid as fluid
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
optimizer = paddle.optimizer.SGD(parameters=model.parameters())
model, optimizer = fluid.dygraph.amp_decorate(models=model, optimizers=optimizer, level='O2')
data = paddle.rand([10, 3, 32, 32])
with fluid.dygraph.amp_guard(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
output = model(data)
print(output.dtype) # FP16
# required: gpu
# Demo2: multi models and optimizers:
model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
optimizer2 = paddle.optimizer.Adam(parameters=model2.parameters())
models, optimizers = fluid.dygraph.amp_decorate(models=[model, model2], optimizers=[optimizer, optimizer2], level='O2')
data = paddle.rand([10, 3, 32, 32])
with fluid.dygraph.amp_guard(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
output = models[0](data)
output2 = models[1](data)
print(output.dtype) # FP16
print(output2.dtype) # FP16
"""
if not (level in ['O1', 'O2']):
raise ValueError(
"level should be O1 or O2, O1 represent AMP train mode, O2 represent Pure fp16 train mode."
)
if level == 'O1':
return models, optimizers
models_is_list = False
if isinstance(models, paddle.nn.Layer):
models_is_list = False
models = [models]
check_models(models)
elif isinstance(models, list):
check_models(models)
models_is_list = True
else:
raise TypeError(
"models must be either a single model or a list of models.")
optimizers_is_list = False
if isinstance(optimizers, (paddle.optimizer.Optimizer,
paddle.fluid.optimizer.Optimizer)):
optimizers_is_list = False
optimizers = [optimizers]
check_optimizers(optimizers)
elif isinstance(optimizers, list):
check_optimizers(optimizers)
optimizers_is_list = True
else:
raise TypeError(
"optimizers must be either a single optimizer or a list of optimizers."
)
models, optimizers = pure_fp16_initialize(
enable_pure_fp16=True, models=models, optimizers=optimizers)
# support master_weight
for idx_opt in range(len(optimizers)):
if hasattr(optimizers[idx_opt], '_multi_precision'):
if master_weight is False:
optimizers[idx_opt]._multi_precision = False
else:
optimizers[idx_opt]._multi_precision = True
if save_dtype is not None:
if not (save_dtype in ['float16', 'float32', 'float64']):
raise ValueError(
"save_dtype can only be float16 float32 or float64, but your input save_dtype is %s."
% save_dtype)
for idx in range(len(models)):
for layer in models[idx].sublayers(include_self=True):
layer.register_state_dict_hook(StateDictHook(save_dtype))
if models_is_list:
if optimizers_is_list:
return models, optimizers
else:
return models, optimizers[0]
else:
if optimizers_is_list:
return models[0], optimizers
else:
return models[0], optimizers[0]
@@ -216,17 +216,45 @@ class AmpScaler(object):
if getattr(optimizer, '_param_groups', None) and isinstance(
optimizer._param_groups[0], dict):
param_grads = []
param_grads_fp16 = []
param_grads_fp32 = []
for group in optimizer._param_groups:
for param in group['params']:
if param._grad_ivar() is not None:
param_grads.append(param._grad_ivar())
if param._grad_ivar(
).dtype == core.VarDesc.VarType.FP16:
param_grads_fp16.append(param._grad_ivar())
else:
param_grads_fp32.append(param._grad_ivar())
else:
param_grads = [
param._grad_ivar() for param in optimizer._parameter_list
if param._grad_ivar() is not None
]
- _C_ops.check_finite_and_unscale(param_grads, self._scale, param_grads,
- self._found_inf)
param_grads_fp16 = [
param._grad_ivar() for param in optimizer._parameter_list
if (param._grad_ivar() is not None
) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16
)
]
param_grads_fp32 = [
param._grad_ivar() for param in optimizer._parameter_list
if (param._grad_ivar() is not None
) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32
)
]
temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool))
temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool))
if len(param_grads_fp16):
_C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
param_grads_fp16,
temp_found_inf_fp16)
if len(param_grads_fp32):
_C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
param_grads_fp32,
temp_found_inf_fp32)
self._found_inf = temp_found_inf_fp16 or temp_found_inf_fp32
def _update(self):
"""
......
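The fp16/fp32 split in `_unscale` above exists because `check_finite_and_unscale` expects all of its gradient inputs to share one dtype, and with pure fp16 plus master weights a parameter list can now hold both fp16 and fp32 gradients. The scaler therefore unscales each group separately and then merges the two found-inf flags. A reduced standalone sketch of the same pattern (made-up gradient tensors; assumes a CUDA device with fp16 support):

import paddle
from paddle import _C_ops

# Pretend these are gradients gathered from an optimizer's parameter list.
grads = [paddle.ones([4], dtype='float16'), paddle.ones([4], dtype='float32')]

grads_fp16 = [g for g in grads if g.dtype == paddle.float16]
grads_fp32 = [g for g in grads if g.dtype == paddle.float32]

scale = paddle.to_tensor([1024.0])
found_inf_fp16 = paddle.zeros([1], dtype='bool')
found_inf_fp32 = paddle.zeros([1], dtype='bool')

# Each call only ever sees a single dtype, mirroring the change in AmpScaler._unscale.
if grads_fp16:
    _C_ops.check_finite_and_unscale(grads_fp16, scale, grads_fp16, found_inf_fp16)
if grads_fp32:
    _C_ops.check_finite_and_unscale(grads_fp32, scale, grads_fp32, found_inf_fp32)

# Same merge as the patched code: either group overflowing marks the step as skipped.
found_inf = found_inf_fp16 or found_inf_fp32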
@@ -779,10 +779,11 @@ def save(layer, path, input_spec=None, **configs):
dygraph_state_dict = None
if isinstance(inner_layer, Layer):
- dygraph_state_dict = inner_layer.state_dict()
dygraph_state_dict = inner_layer.to_static_state_dict()
elif isinstance(attr_func, StaticFunction):
if attr_func._class_instance:
- dygraph_state_dict = attr_func._class_instance.state_dict()
dygraph_state_dict = attr_func._class_instance.to_static_state_dict(
)
if dygraph_state_dict:
# NOTE(chenweihang): we maintain the mapping of variable name to
@@ -790,15 +791,19 @@ def save(layer, path, input_spec=None, **configs):
# saved to inference program may not need by dygraph Layer,
# we only record the state_dict variable's structured name
state_names_dict = dict()
state_var_dict = dict()
for structured_name, var in six.iteritems(dygraph_state_dict):
state_names_dict[var.name] = structured_name
state_var_dict[var.name] = var
# 3. share parameters from Layer to scope & record var info
for param_or_buffer in concrete_program.parameters:
# share to scope
param_or_buffer_tensor = scope.var(
param_or_buffer.name).get_tensor()
- src_tensor = param_or_buffer.value().get_tensor()
#src_tensor = param_or_buffer.value().get_tensor()
src_tensor = state_var_dict[param_or_buffer.name].value(
).get_tensor()
param_or_buffer_tensor._share_data_with(src_tensor)
# record var info
if param_or_buffer.name not in extra_var_info:
......
@@ -121,6 +121,13 @@ class Layer(core.Layer):
self._forward_pre_hooks = collections.OrderedDict()
self._forward_post_hooks = collections.OrderedDict()
self._parameters_transform_map = {}
self._buffers_transform_map = {}
self._casted_by_pure_fp16 = False
self._state_dict_hooks = collections.OrderedDict()
def train(self):
"""
Sets this Layer and all its sublayers to training mode.
@@ -1259,31 +1266,24 @@ class Layer(core.Layer):
final_str += ')'
return final_str
- def state_dict(self,
def register_state_dict_hook(self, hook):
hook_remove_helper = HookRemoveHelper(self._state_dict_hooks)
self._state_dict_hooks[hook_remove_helper._hook_id] = hook
return hook_remove_helper
def _state_dict_impl(self,
destination=None,
include_sublayers=True,
- structured_name_prefix=""):
- '''
structured_name_prefix="",
include_non_persistable_buffer=False):
"""
Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict
Parameters:
destination(dict, optional) : If provide, all the parameters and persistable buffers will be set to this dict . Default: None
include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. Default: True
include_non_persistable_buffer(bool, optional): If true, include non persistable buffers of current layer and its sub-layers, it is used in pure fp16 and jit.save. Default: False
"""
- Retruns:
- dict: a dict contains all the parameters and persistable buffers.
- Examples:
- .. code-block:: python
- import paddle
- emb = paddle.nn.Embedding(10, 10)
- state_dict = emb.state_dict()
- paddle.save( state_dict, "paddle_dy.pdparams")
- '''
if destination is None:
destination = collections.OrderedDict()
@@ -1291,20 +1291,93 @@ class Layer(core.Layer):
if data is not None:
destination[structured_name_prefix + name] = data
for name, buffer in self._buffers.items():
if not include_non_persistable_buffer:
if buffer is not None and name not in self._non_persistable_buffer_names_set:
destination[structured_name_prefix + name] = buffer
else:
if buffer is not None:
destination[structured_name_prefix + name] = buffer
if include_sublayers:
for layer_name, layer_item in self._sub_layers.items():
if layer_item is not None:
destination_temp = destination.copy()
destination_temp.update(
- layer_item.state_dict(
layer_item._state_dict_impl(
destination_temp, include_sublayers,
- structured_name_prefix + layer_name + "."))
structured_name_prefix + layer_name + ".",
include_non_persistable_buffer))
destination = destination_temp
for state_dict_hook in self._state_dict_hooks.values():
hook_result = state_dict_hook(destination)
if hook_result is not None:
destination = hook_result
return destination
def to_static_state_dict(self,
destination=None,
include_sublayers=True,
structured_name_prefix=""):
'''
Get all parameters and buffers of current layer and its sub-layers. And set them into a dict
Parameters:
destination(dict, optional) : If provide, all the parameters and persistable buffers will be set to this dict . Default: None
include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. Default: True
Returns:
dict: a dict contains all the parameters and persistable buffers.
Examples:
.. code-block:: python
import paddle
emb = paddle.nn.Embedding(10, 10)
state_dict = emb.to_static_state_dict()
paddle.save( state_dict, "paddle_dy.pdparams")
'''
return self._state_dict_impl(
destination=destination,
include_sublayers=include_sublayers,
structured_name_prefix=structured_name_prefix,
include_non_persistable_buffer=True)
def state_dict(self,
destination=None,
include_sublayers=True,
structured_name_prefix=""):
'''
Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict
Parameters:
destination(dict, optional) : If provide, all the parameters and persistable buffers will be set to this dict . Default: None
include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. Default: True
Returns:
dict: a dict contains all the parameters and persistable buffers.
Examples:
.. code-block:: python
import paddle
emb = paddle.nn.Embedding(10, 10)
state_dict = emb.state_dict()
paddle.save( state_dict, "paddle_dy.pdparams")
'''
return self._state_dict_impl(
destination=destination,
include_sublayers=include_sublayers,
structured_name_prefix=structured_name_prefix,
include_non_persistable_buffer=False)
@framework.deprecate_stat_dict
def set_state_dict(self, state_dict, use_structured_name=True):
'''
@@ -1404,8 +1477,11 @@ class Layer(core.Layer):
).stop_gradient
self._parameters[key]._set_grad_ivar(grad_applied)
self._parameters_transform_map[id(param)] = [param_applied, key]
for key, buf in self._buffers.items():
self._buffers[key] = func(buf, device, dtype, blocking)
self._buffers_transform_map[id(buf)] = [self._buffers[key], key]
def to(self, device=None, dtype=None, blocking=None):
'''
@@ -1501,6 +1577,7 @@ class Layer(core.Layer):
return new_t
self._apply(transform, device, dtype, blocking)
self._dtype = dtype
# [aliases] Compatible with old method names
set_dict = set_state_dict
......
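To summarize the Layer changes above: `state_dict()` and the new `to_static_state_dict()` both route through `_state_dict_impl` and differ only in whether non-persistable buffers are included, and any hook registered via `register_state_dict_hook` (such as the StateDictHook used for `save_dtype`) runs on the assembled dict. A minimal sketch of the visible difference (the layer and buffer names are illustrative):

import paddle

class Net(paddle.nn.Layer):
    def __init__(self):
        super(Net, self).__init__()
        self.fc = paddle.nn.Linear(2, 2)
        # Non-persistable buffers are skipped by state_dict() but kept by
        # to_static_state_dict(), which paddle.jit.save now relies on.
        self.register_buffer('scratch', paddle.zeros([1]), persistable=False)

net = Net()
print('scratch' in net.state_dict())            # False
print('scratch' in net.to_static_state_dict())  # True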
@@ -1433,12 +1433,12 @@ class MomentumOptimizer(Optimizer):
velocity_acc = self._get_accumulator(self._velocity_acc_str,
param_and_grad[0])
lr = self._create_param_lr(param_and_grad)
master_weight = None
if framework.in_dygraph_mode():
- _, _ = _C_ops.momentum(param_and_grad[0], param_and_grad[1],
- velocity_acc, lr, param_and_grad[0],
- velocity_acc, 'mu', self._momentum,
- 'use_nesterov', self._use_nesterov)
_, _, _ = _C_ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
master_weight, param_and_grad[0], velocity_acc, master_weight,
'mu', self._momentum, 'use_nesterov', self._use_nesterov)
return None
attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov}
@@ -1982,6 +1982,9 @@ class LarsMomentumOptimizer(Optimizer):
self._master_weights = {}
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + '_fp32_master'
@@ -2462,12 +2465,14 @@ class AdamOptimizer(Optimizer):
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
- _, _, _, _, _ = _C_ops.adam(
- param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
- beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
- moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
- 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
- 1000, 'beta1', _beta1, 'beta2', _beta2, 'use_global_beta_pow',
master_weight = None
_, _, _, _, _, _ = _C_ops.adam(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0],
moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight,
'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode,
'min_row_size_to_use_multithread', 1000, 'beta1', _beta1,
'beta2', _beta2, 'use_global_beta_pow',
self._use_global_beta_pow)
return None
......
@@ -1099,7 +1099,6 @@ class TestJitSaveLoadSaveWithoutRunning(unittest.TestCase):
paddle.static.InputSpec(
shape=[None, IMAGE_SIZE], dtype='float32')
])
result_00 = layer_save(inps0)
result_01 = layer_save(inps1)
#load and save without running
......
@@ -22,6 +22,8 @@ from ..fluid.layer_helper import LayerHelper
import warnings
from ..fluid.dygraph import base as imperative_base
from collections import defaultdict
import numpy as np
import time
import paddle
from paddle import _C_ops
@@ -208,6 +210,9 @@ class Adam(Optimizer):
}
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
@@ -317,12 +322,13 @@ class Adam(Optimizer):
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
- _, _, _, _, _ = _C_ops.adam(
- param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
- beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
- moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
- 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
- 1000, 'beta1', _beta1, 'beta2', _beta2)
_, _, _, _, _, _ = _C_ops.adam(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0],
moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight,
'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode,
'min_row_size_to_use_multithread', 1000, 'beta1', _beta1,
'beta2', _beta2, 'multi_precision', find_master)
return None
......
@@ -297,13 +297,15 @@ class AdamW(Adam):
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
- _, _, _, _, _ = _C_ops.adamw(
- param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
- beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
- moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
- 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
- 1000, 'beta1', _beta1, 'beta2', _beta2, 'coeff', self._coeff,
- "lr_ratio", lr_ratio_)
_, _, _, _, _, _ = _C_ops.adam(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0],
moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight,
'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode,
'min_row_size_to_use_multithread', 1000, 'beta1', _beta1,
'beta2', _beta2, 'coeff', self._coeff, 'multi_precision',
find_master)
return None
......
@@ -170,7 +170,7 @@ class Momentum(Optimizer):
'regularization_method': self._regularization_method,
'regularization_coeff': self._regularization_coeff,
}
'''
if framework.in_dygraph_mode():
self.helper = LayerHelper(self.__class__.__name__)
if isinstance(self._parameter_list[0], dict):
@@ -180,6 +180,7 @@ class Momentum(Optimizer):
else:
for p in parameters:
self._add_accumulator(self._velocity_acc_str, p)
'''
def _update_regularization(self, weight_decay):
reg_method = ""
@@ -194,6 +195,9 @@ class Momentum(Optimizer):
return reg_method, reg_coeff
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
@@ -239,10 +243,15 @@ class Momentum(Optimizer):
return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters):
'''
if framework.in_dygraph_mode():
return
'''
assert isinstance(block, framework.Block)
if isinstance(parameters, dict):
parameters = self._update_param_group(parameters)
for p in parameters:
if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
master_p = self._create_master_weight(p)
@@ -291,21 +300,23 @@ class Momentum(Optimizer):
regularization_method = ""
regularization_coeff = 0
find_master = self._multi_precision and param_and_grad[
0].dtype == core.VarDesc.VarType.FP16
master_weight = (self._master_weights[param_and_grad[0].name]
if find_master else None)
if framework.in_dygraph_mode():
if isinstance(param_and_grad, dict):
self._update_regularization(param_and_grad['weight_decay'])
- _, _ = _C_ops.momentum(
_, _, _ = _C_ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
- param_and_grad[0], velocity_acc, 'mu', self._momentum,
- 'use_nesterov', self._use_nesterov, 'regularization_method',
- regularization_method, 'regularization_coeff',
- regularization_coeff)
master_weight, param_and_grad[0], velocity_acc, master_weight,
'mu', self._momentum, 'use_nesterov', self._use_nesterov,
'regularization_method', regularization_method,
'regularization_coeff', regularization_coeff, 'multi_precision',
find_master)
return None
- find_master = self._multi_precision and param_and_grad[
- 0].dtype == core.VarDesc.VarType.FP16
- master_weight = (self._master_weights[param_and_grad[0].name]
- if find_master else None)
attrs = {
"mu": self._momentum,
......