Unverified · Commit adaeee4d authored by zhangbo9674, committed by GitHub

[AMP] Support pure fp16 training mode for dygraph (#35521)

* add pure fp16 major function in auto_cast & tracer

* support master weight in dygraph for pure fp16

* check mixed dtypes of fp16 & fp32 for check_finite_and_unscale op

* change pure fp16 function name

* fix some bugs in auto_cast

* refine auto_cast interface logic

* add param _casted_by_pure_fp16 for class Layer

* support state_dict hook to save the model in a user-appointed dtype in pure_fp16_decorator

* refine pure_fp16_decorator as decorator

* add unittest

* add comment

* add comment

* support recompute

* add comment for auto_cast and decorator

* support to_static_state_dict for paddle.jit.save

* remove the limit on the number of models and optimizers

* add lookup_table in black_list

* fix momentum and layer state_dict

* fix bug in layer state_dict

* fix bug in layer state_dict_helper

* refine unittest

* refine test_momentum_op

* refine interface and some code

* refine amp_decorator interface

* refine pure fp16 interface

* refine master weight interface
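For reference, a minimal end-to-end sketch of the pure fp16 dygraph training flow this commit enables, combining `paddle.amp.decorate`, `paddle.amp.auto_cast(level='O2')` and `paddle.amp.GradScaler`. The network, data shapes and loss-scaling value below are illustrative assumptions rather than part of the commit, and a CUDA device is required.

```python
# Hypothetical training loop for the pure fp16 (O2) mode added in this commit.
import paddle

model = paddle.nn.Linear(64, 10)
optimizer = paddle.optimizer.Momentum(learning_rate=0.01,
                                      parameters=model.parameters())

# Cast the model parameters to fp16; fp32 master weights are enabled in the
# optimizer by default when master_weight is None.
model, optimizer = paddle.amp.decorate(
    models=model, optimizers=optimizer, level='O2', save_dtype='float32')

scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

for _ in range(10):
    data = paddle.rand([32, 64])
    label = paddle.randint(0, 10, [32, 1])
    with paddle.amp.auto_cast(level='O2'):
        # Inputs are cast to fp16 for every op except black-list/unsupported ones.
        logits = model(data)
        loss = paddle.nn.functional.cross_entropy(logits, label)
    scaled = scaler.scale(loss)          # scale the loss against fp16 gradient underflow
    scaled.backward()
    scaler.minimize(optimizer, scaled)   # unscale, check inf/nan, then update
    optimizer.clear_grad()
```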
Parent 68ae6345
......@@ -117,7 +117,7 @@ static inline std::shared_ptr<imperative::VarBase> CastToType(
imperative::NameVarBaseMap outs = {{"Out", {out}}};
{
AutoCastGuard guard(tracer, false);
AutoCastGuard guard(tracer, 0);
tracer->TraceOp("cast", ins, outs, std::move(attrs));
}
......@@ -225,5 +225,30 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type,
return new_ins;
}
NameVarBaseMap CastPureFp16Inputs(const std::string& op_type,
const NameVarBaseMap& ins) {
NameVarBaseMap new_ins(ins);
auto dst_type = framework::proto::VarType::FP16;
if (AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count(op_type) ||
AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) {
dst_type = framework::proto::VarType::FP32;
}
for (auto& pair : new_ins) {
if ((op_type == "batch_norm" || op_type == "layer_norm" ||
op_type == "sync_batch_norm") &&
pair.first != "X") {
continue;
}
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to "
<< framework::DataTypeToString(dst_type);
for (auto& var : pair.second) {
var = (dst_type == framework::proto::VarType::FP32 ? CastToFP32(var)
: CastToFP16(var));
}
}
return new_ins;
}
} // namespace imperative
} // namespace paddle
......@@ -63,15 +63,16 @@ std::ostream& operator<<(std::ostream& os, AmpOperators& ops);
// NOTE(zhiqiu): AutoCastGuard is used for RAII.
class AutoCastGuard {
public:
AutoCastGuard(std::shared_ptr<Tracer> tracer, bool guard_mode)
AutoCastGuard(std::shared_ptr<Tracer> tracer, int guard_level)
: tracer_(tracer) {
pre_mode_ = tracer_->IsAutoCastEnabled();
if (pre_mode_ != guard_mode) {
tracer_->SetEnableAutoCast(guard_mode);
pre_amp_level_ = tracer_->AMPLevel();
if (pre_amp_level_ != guard_level) {
tracer_->SetAMPLevel(guard_level);
}
}
~AutoCastGuard() { tracer_->SetEnableAutoCast(pre_mode_); }
~AutoCastGuard() { tracer_->SetAMPLevel(pre_amp_level_); }
// forbid copy and operator=
AutoCastGuard(const AutoCastGuard& guard) = delete;
......@@ -79,11 +80,14 @@ class AutoCastGuard {
private:
std::shared_ptr<Tracer> tracer_;
bool pre_mode_;
int pre_amp_level_;
};
NameVarBaseMap AutoCastInputs(const std::string& op_type,
const NameVarBaseMap& ins);
NameVarBaseMap CastPureFp16Inputs(const std::string& op_type,
const NameVarBaseMap& ins);
} // namespace imperative
} // namespace paddle
......@@ -176,9 +176,12 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
: attr_checker->GetDefaultAttrMap();
NameVarBaseMap new_ins = ins;
if (enable_autocast_) {
if (amp_level_ == 1) {
VLOG(5) << "Auto mixed precision run operator: " << type;
new_ins = AutoCastInputs(type, ins);
} else if (amp_level_ == 2) {
VLOG(5) << "Pure fp16 run operator: " << type;
new_ins = CastPureFp16Inputs(type, ins);
}
try {
......
......@@ -105,9 +105,9 @@ class Tracer {
void SetHasGrad(bool has_grad) { has_grad_ = has_grad; }
void SetEnableAutoCast(bool enabled) { enable_autocast_ = enabled; }
void SetAMPLevel(int level) { amp_level_ = level; }
bool IsAutoCastEnabled() const { return enable_autocast_; }
int AMPLevel() const { return amp_level_; }
paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists(
const platform::Place& place);
......@@ -118,9 +118,9 @@ class Tracer {
bool enable_program_desc_tracing_{false};
std::unique_ptr<UniqueNameGenerator> generator_;
platform::Place expected_place_;
bool enable_autocast_{false};
GarbageCollectorMap gcs_;
static thread_local bool has_grad_;
int amp_level_{0};
};
// To access static variable current_tracer
......
......@@ -1947,8 +1947,8 @@ void BindImperative(py::module *m_ptr) {
.def_property("_enable_program_desc_tracing",
&imperative::Tracer::IsProgramDescTracingEnabled,
&imperative::Tracer::SetEnableProgramDescTracing)
.def_property("_enable_autocast", &imperative::Tracer::IsAutoCastEnabled,
&imperative::Tracer::SetEnableAutoCast)
.def_property("_amp_level", &imperative::Tracer::AMPLevel,
&imperative::Tracer::SetAMPLevel)
.def_property("_has_grad", &imperative::Tracer::HasGrad,
&imperative::Tracer::SetHasGrad)
.def_property(
......
......@@ -63,11 +63,15 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"moving_average_abs_max_scale", {"X", "InAccum", "InState"}},
{"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}},
{"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}},
{"momentum", {"Param", "Grad", "Velocity", "LearningRate"}},
{"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}},
{"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}},
{"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}},
{"run_program", {"X", "Params"}},
{"matrix_rank", {"X", "TolTensor"}}};
{"matrix_rank", {"X", "TolTensor"}},
{"adam",
{"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow",
"Beta2Pow", "MasterParam"}},
};
// NOTE(zhiqiu): Like op_ins_map.
// Commonly, the outputs in auto-generated OP function are determined by the
......@@ -97,12 +101,15 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
{"Out", "OutScale", "OutAccum", "OutState"}},
{"multiclass_nms3", {"Out", "NmsRoisNum"}},
{"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
{"momentum", {"ParamOut", "VelocityOut"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut"}},
{"rnn", {"DropoutState", "Reserve", "Out", "State"}},
{"lamb",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"run_program", {"DOut"}},
{"adam",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut",
"MasterParamOut"}},
};
// NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
......@@ -119,13 +126,14 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{"sgd", {"ParamOut"}},
{"adam",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut",
"MasterParamOut"}},
{"adamw",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"average_accumulates",
{"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates",
"out_old_num_accumulates", "out_num_updates"}},
{"momentum", {"ParamOut", "VelocityOut"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut"}},
{"batch_norm", {"MeanOut", "VarianceOut"}},
{"sync_batch_norm", {"MeanOut", "VarianceOut"}},
......
......@@ -14,5 +14,6 @@
from .auto_cast import auto_cast # noqa: F401
from .grad_scaler import GradScaler # noqa: F401
from .auto_cast import decorate # noqa: F401
__all__ = ['auto_cast', 'GradScaler']
__all__ = ['auto_cast', 'GradScaler', 'decorate']
......@@ -13,18 +13,22 @@
# limitations under the License.
from paddle.fluid.dygraph.amp import amp_guard
from paddle.fluid.dygraph.amp import amp_decorate
__all__ = []
def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
def auto_cast(enable=True,
custom_white_list=None,
custom_black_list=None,
level='O1'):
"""
Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode.
If enabled, the input data type (float32 or float16) of each operator is decided
by autocast algorithm for better performance.
Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in
imperative mode.
imperative mode. It can also be used together with `decorate` to achieve pure fp16 in imperative mode.
Args:
enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
......@@ -34,6 +38,8 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16
calculation and are considered numerically-dangerous and whose effects may also be
observed in downstream ops. These ops will not be converted to fp16.
level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represents mixed precision, in which the input data type of each operator is casted according to the white_list and black_list;
O2 represents pure fp16, in which all operator parameters and input data are casted to fp16, except for operators in the black_list, operators without fp16 kernels, and batch norm. Default is 'O1' (amp).
Examples:
......@@ -62,5 +68,66 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
c = a + b
print(c.dtype) # FP16
with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}, level='O2'):
d = a + b
print(d.dtype) # FP16
"""
return amp_guard(enable, custom_white_list, custom_black_list, level)
def decorate(models,
optimizers=None,
level='O1',
master_weight=None,
save_dtype=None):
"""
Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), decorate will do nothing.
When level is O2(pure fp16), decorate will cast all parameters of models to FP16, except BatchNorm and LayerNorm.
Commonly, it is used together with `auto_cast` to achieve pure fp16 in imperative mode.
Args:
models(Layer|list of Layer, optional): The models defined by the user; models must be either a single model or a list of models. Default is None.
optimizers(Optimizer|list of Optimizer, optional): The optimizers defined by the user; optimizers must be either a single optimizer or a list of optimizers. Default is None.
level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represents mixed precision, in which the decorator does nothing;
O2 represents pure fp16, in which the decorator casts all parameters of models to FP16, except BatchNorm and LayerNorm. Default is 'O1' (amp).
master_weight(bool, optional): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, the optimizer uses multi-precision in O2 level. Default is None.
save_dtype(float, optional): The dtype used to save model parameters with `paddle.save` or `paddle.jit.save`; it should be float16, float32, float64 or None.
save_dtype does not change the dtype of the model parameters, it only changes the dtype of the saved state_dict. When save_dtype is None, the save dtype is the same as the model dtype. Default is None.
Examples:
.. code-block:: python
# required: gpu
# Demo1: single model and optimizer:
import paddle
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
optimizer = paddle.optimizer.SGD(parameters=model.parameters())
model, optimizer = paddle.amp.decorate(models=model, optimizers=optimizer, level='O2')
data = paddle.rand([10, 3, 32, 32])
with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
output = model(data)
print(output.dtype) # FP16
# required: gpu
# Demo2: multi models and optimizers:
model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
optimizer2 = paddle.optimizer.Adam(parameters=model2.parameters())
models, optimizers = paddle.amp.decorate(models=[model, model2], optimizers=[optimizer, optimizer2], level='O2')
data = paddle.rand([10, 3, 32, 32])
with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
output = models[0](data)
output2 = models[1](data)
print(output.dtype) # FP16
print(output2.dtype) # FP16
"""
return amp_guard(enable, custom_white_list, custom_black_list)
return amp_decorate(models, optimizers, level, master_weight, save_dtype)
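As context for the recompute changes below: `auto_cast` now sets the tracer's `_amp_level` flag (0 = disabled, 1 = O1, 2 = O2) instead of the old boolean `_enable_autocast`. A small sketch that inspects this internal flag; `_dygraph_tracer()` is an internal helper, and the printed values assume a CUDA device (on CPU, AMP is disabled and the level stays 0).

```python
# Illustrative only: observe the internal AMP level toggled by auto_cast.
import paddle
from paddle.fluid.framework import _dygraph_tracer

print(_dygraph_tracer()._amp_level)      # 0: autocast disabled

with paddle.amp.auto_cast(level='O1'):
    print(_dygraph_tracer()._amp_level)  # 1: mixed precision (white/black list casting)

with paddle.amp.auto_cast(level='O2'):
    print(_dygraph_tracer()._amp_level)  # 2: pure fp16 casting

print(_dygraph_tracer()._amp_level)      # restored to 0 on exit
```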
......@@ -198,7 +198,11 @@ class _HPRecomputeFunction(PyLayer):
# TODO support AMP
tracer = framework._dygraph_tracer()
ctx.is_fw_autocast = tracer._enable_autocast
if tracer._amp_level == 0:
ctx.is_fw_autocast = False
else:
ctx.is_fw_autocast = True
ctx.amp_mode = 'O1'
ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list()
with paddle.no_grad():
......@@ -258,7 +262,8 @@ class _HPRecomputeFunction(PyLayer):
with paddle.amp.auto_cast(
enable=ctx.is_fw_autocast,
custom_white_list=ctx.amp_white_list,
custom_black_list=ctx.amp_black_list):
custom_black_list=ctx.amp_black_list,
level=ctx.amp_mode):
detached_inputs = detach_variable(tuple(inputs))
outputs = ctx.run_function(*detached_inputs)
......
......@@ -98,7 +98,11 @@ class RecomputeFunction(PyLayer):
# TODO support AMP
tracer = framework._dygraph_tracer()
ctx.is_fw_autocast = tracer._enable_autocast
if tracer._amp_level == 0:
ctx.is_fw_autocast = False
else:
ctx.is_fw_autocast = True
ctx.amp_mode = 'O1'
ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list()
with paddle.no_grad():
......@@ -128,14 +132,16 @@ class RecomputeFunction(PyLayer):
with paddle.amp.auto_cast(
enable=ctx.is_fw_autocast,
custom_white_list=ctx.amp_white_list,
custom_black_list=ctx.amp_black_list):
custom_black_list=ctx.amp_black_list,
level=ctx.amp_mode):
detached_inputs = detach_variable(tuple(inputs))
outputs = ctx.run_function(*detached_inputs)
else:
with paddle.amp.auto_cast(
enable=ctx.is_fw_autocast,
custom_white_list=ctx.amp_white_list,
custom_black_list=ctx.amp_black_list):
custom_black_list=ctx.amp_black_list,
level=ctx.amp_mode):
detached_inputs = detach_variable(tuple(inputs))
outputs = ctx.run_function(*detached_inputs)
......
......@@ -203,19 +203,21 @@ class Momentum(Optimizer):
param_and_grad[0])
lr = self._create_param_lr(param_and_grad)
if framework.in_dygraph_mode():
_, _ = _C_ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
param_and_grad[0], velocity_acc, 'mu', self._momentum,
'use_nesterov', self._use_nesterov, 'regularization_method',
self._regularization_method, 'regularization_coeff',
self._regularization_coeff)
return None
find_master = self._multi_precision and param_and_grad[
0].dtype == core.VarDesc.VarType.FP16
master_weight = (self._master_weights[param_and_grad[0].name]
if find_master else None)
if framework.in_dygraph_mode():
_, _, _ = _C_ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
master_weight, param_and_grad[0], velocity_acc, master_weight,
'mu', self._momentum, 'use_nesterov', self._use_nesterov,
'regularization_method', self._regularization_method,
'regularization_coeff', self._regularization_coeff,
'multi_precision', find_master)
return None
attrs = {
"mu": self._momentum,
"use_nesterov": self._use_nesterov,
......
......@@ -19,8 +19,13 @@ import contextlib
from paddle.fluid.framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, dygraph_only, set_flags, get_flags
import warnings
import copy
import functools
import paddle
import operator
import types
import paddle.fluid as fluid
__all__ = ['amp_guard']
__all__ = ['amp_guard', 'amp_decorate']
# The set of ops that support fp16 calculation and are considered numerically-
# safe and performance-critical. These ops are always converted to fp16.
......@@ -64,15 +69,22 @@ AMP_RELATED_FLAGS_SETTING = {
'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
}
PURE_FP16_BLACK_LIST = {' '}
PURE_FP16_WHITE_LIST = {'lookup_table', 'lookup_table_v2'}
#NOTE(zhiqiu): similar to paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists._update_list
# The reason for not using AutoMixedPrecisionLists is that custom_black_varnames is not suitable for imperative mode.
def _update_list(custom_white_list, custom_black_list):
def _update_list(custom_white_list, custom_black_list, level='O1'):
"""
Update black and white list according to users' custom list.
"""
if level == 'O1':
_white_list = copy.copy(WHITE_LIST)
_black_list = copy.copy(BLACK_LIST)
else:
_white_list = copy.copy(PURE_FP16_WHITE_LIST)
_black_list = copy.copy(PURE_FP16_BLACK_LIST)
if custom_white_list and custom_black_list:
for op_name in custom_white_list:
if op_name in custom_black_list:
......@@ -97,28 +109,111 @@ def _in_amp_guard():
"""
tracer = _dygraph_tracer()
if tracer:
return tracer._enable_autocast
if tracer._amp_level == 1:
return True
else:
return False
else:
return False
@dygraph_only
def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
if not enable_pure_fp16:
return models, optimizers
for idx in range(len(models)):
for layer in models[idx].sublayers(include_self=True):
layer._casted_by_pure_fp16 = True
if len(layer._sub_layers) == 0:
if (layer._dtype == 'float16') or isinstance(layer, (
paddle.nn.BatchNorm, paddle.nn.LayerNorm)):
continue
layer.to(dtype='float16')
for idx_opt in range(len(optimizers)):
# update _param_groups
if getattr(optimizers[idx_opt], '_param_groups', None) and isinstance(
optimizers[idx_opt]._param_groups[0], dict):
for param_group in optimizers[idx_opt]._param_groups:
for i, param in enumerate(param_group['params']):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(
include_self=True):
if id(param) in layer._parameters_transform_map:
param_group['params'][
i] = layer._parameters_transform_map[id(
param)][0]
for param_group in optimizers[idx_opt]._parameter_list:
params = param_group['params']
for i, param in enumerate(params):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(
include_self=True):
if id(param) in layer._parameters_transform_map:
params[i] = layer._parameters_transform_map[id(
param)][0]
# update _parameter_list
else:
for i, param in enumerate(optimizers[idx_opt]._parameter_list):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(include_self=True):
if id(param) in layer._parameters_transform_map:
optimizers[idx_opt]._parameter_list[
i] = layer._parameters_transform_map[id(param)][
0]
if hasattr(optimizers[idx_opt], '_param_groups'):
optimizers[idx_opt]._param_groups[
i] = layer._parameters_transform_map[id(
param)][0]
return models, optimizers
def check_models(models):
for model in models:
if not isinstance(model, paddle.nn.Layer):
raise RuntimeError(
"Current train mode is pure fp16, models should be paddle.nn.Layer, but receive {}.".
format(type(model)))
def check_optimizers(optimizers):
for optimizer in optimizers:
if not isinstance(optimizer, (paddle.optimizer.Optimizer,
paddle.fluid.optimizer.Optimizer)):
raise RuntimeError(
"Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer, but receive {}.".
format(type(optimizer)))
@signature_safe_contextmanager
@dygraph_only
def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
def amp_guard(enable=True,
custom_white_list=None,
custom_black_list=None,
level='O1'):
"""
:api_attr: imperative
Create a context which enables auto-mixed-precision(AMP) of operators executed in imperative mode.
Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode.
If enabled, the input data type (float32 or float16) of each operator is decided
by autocast algorithm for better performance.
Commonly, it is used together with `AmpScaler` to achieve Auto-Mixed-Precision in
imperative mode.
Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in
imperative mode. It can also be used together with `decorate` to achieve pure fp16 in imperative mode.
Args:
enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
custom_white_list(set|list, optional): The custom white_list.
custom_black_list(set|list, optional): The custom black_list.
custom_white_list(set|list|tuple, optional): The custom white_list. It's the set of ops that support
fp16 calculation and are considered numerically-safe and performance-critical. These ops
will be converted to fp16.
custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16
calculation and are considered numerically-dangerous and whose effects may also be
observed in downstream ops. These ops will not be converted to fp16.
level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represents mixed precision, in which the input data type of each operator is casted according to the white_list and black_list;
O2 represents pure fp16, in which all operator parameters and input data are casted to fp16, except for operators in the black_list, operators without fp16 kernels, and batch norm. Default is 'O1' (amp).
Examples:
......@@ -139,6 +234,11 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
print(conv.dtype) # FP32
"""
if not (level in ['O1', 'O2']):
raise ValueError(
"level should be O1 or O2, O1 represent AMP train mode, O2 represent Pure fp16 train mode."
)
tracer = _dygraph_tracer()
if not tracer:
raise ValueError(
......@@ -151,17 +251,27 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
% tracer._expected_place)
enable = False
# use default white_list and black_list if no custom lists provided
if level == 'O1':
amp_level = 1
_white_list = WHITE_LIST
_black_list = BLACK_LIST
else:
amp_level = 2
_white_list = PURE_FP16_WHITE_LIST
_black_list = PURE_FP16_BLACK_LIST
if custom_white_list or custom_black_list:
_white_list, _black_list = _update_list(custom_white_list,
custom_black_list)
custom_black_list, level)
if not enable:
amp_level = 0
if tracer:
# enable auto_cast
original_enable = tracer._enable_autocast
tracer._enable_autocast = enable
original_amp_level = tracer._amp_level
tracer._amp_level = amp_level
# set amp op list
original_white_list, original_black_list = tracer._get_amp_op_list()
tracer._set_amp_op_list(_white_list, _black_list)
......@@ -179,6 +289,141 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
yield
finally:
if tracer:
tracer._enable_autocast = original_enable
tracer._amp_level = original_amp_level
tracer._set_amp_op_list(original_white_list, original_black_list)
# set_flags(original_flags)
class StateDictHook(object):
def __init__(self, save_dtype):
self._save_dtype = save_dtype
def __call__(self, state_dict):
for key in state_dict:
param = state_dict[key]
with fluid.dygraph.guard():
param_applied = paddle.cast(param, self._save_dtype)
param_applied.name = param.name
state_dict[key] = param_applied
@dygraph_only
def amp_decorate(models,
optimizers=None,
level='O1',
master_weight=None,
save_dtype=None):
"""
Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), decorate will do nothing.
When level is O2(pure fp16), decorate will cast all parameters of models to FP16, except BatchNorm and LayerNorm.
Commonly, it is used together with `amp_guard` to achieve pure fp16 in imperative mode.
Args:
models(Layer|list of Layer, optional): The models defined by the user; models must be either a single model or a list of models. Default is None.
optimizers(Optimizer|list of Optimizer, optional): The optimizers defined by the user; optimizers must be either a single optimizer or a list of optimizers. Default is None.
level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represents mixed precision, in which the decorator does nothing;
O2 represents pure fp16, in which the decorator casts all parameters of models to FP16, except BatchNorm and LayerNorm. Default is 'O1' (amp).
master_weight(bool, optional): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, the optimizer uses multi-precision in O2 level. Default is None.
save_dtype(float, optional): The dtype used to save model parameters with `paddle.save` or `paddle.jit.save`; it should be float16, float32, float64 or None.
save_dtype does not change the dtype of the model parameters, it only changes the dtype of the saved state_dict. When save_dtype is None, the save dtype is the same as the model dtype. Default is None.
Examples:
.. code-block:: python
# required: gpu
# Demo1: single model and optimizer:
import paddle
import paddle.fluid as fluid
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
optimizer = paddle.optimizer.SGD(parameters=model.parameters())
model, optimizer = fluid.dygraph.amp_decorate(models=model, optimizers=optimizer, level='O2')
data = paddle.rand([10, 3, 32, 32])
with fluid.dygraph.amp_guard(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
output = model(data)
print(output.dtype) # FP16
# required: gpu
# Demo2: multi models and optimizers:
model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
optimizer2 = paddle.optimizer.Adam(parameters=model2.parameters())
models, optimizers = fluid.dygraph.amp_decorate(models=[model, model2], optimizers=[optimizer, optimizer2], level='O2')
data = paddle.rand([10, 3, 32, 32])
with fluid.dygraph.amp_guard(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
output = models[0](data)
output2 = models[1](data)
print(output.dtype) # FP16
print(output2.dtype) # FP16
"""
if not (level in ['O1', 'O2']):
raise ValueError(
"level should be O1 or O2, O1 represent AMP train mode, O2 represent Pure fp16 train mode."
)
if level == 'O1':
return models, optimizers
models_is_list = False
if isinstance(models, paddle.nn.Layer):
models_is_list = False
models = [models]
check_models(models)
elif isinstance(models, list):
check_models(models)
models_is_list = True
else:
raise TypeError(
"models must be either a single model or a list of models.")
optimizers_is_list = False
if isinstance(optimizers, (paddle.optimizer.Optimizer,
paddle.fluid.optimizer.Optimizer)):
optimizers_is_list = False
optimizers = [optimizers]
check_optimizers(optimizers)
elif isinstance(optimizers, list):
check_optimizers(optimizers)
optimizers_is_list = True
else:
raise TypeError(
"optimizers must be either a single optimizer or a list of optimizers."
)
models, optimizers = pure_fp16_initialize(
enable_pure_fp16=True, models=models, optimizers=optimizers)
# support master_weight
for idx_opt in range(len(optimizers)):
if hasattr(optimizers[idx_opt], '_multi_precision'):
if master_weight is False:
optimizers[idx_opt]._multi_precision = False
else:
optimizers[idx_opt]._multi_precision = True
if save_dtype is not None:
if not (save_dtype in ['float16', 'float32', 'float64']):
raise ValueError(
"save_dtype can only be float16 float32 or float64, but your input save_dtype is %s."
% save_dtype)
for idx in range(len(models)):
for layer in models[idx].sublayers(include_self=True):
layer.register_state_dict_hook(StateDictHook(save_dtype))
if models_is_list:
if optimizers_is_list:
return models, optimizers
else:
return models, optimizers[0]
else:
if optimizers_is_list:
return models[0], optimizers
else:
return models[0], optimizers[0]
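A brief sketch of what `save_dtype` does in practice: after decoration the parameters held by the model stay fp16, while the entries returned by `state_dict()` are cast back by the registered hook before saving. The layer, optimizer and file name below are illustrative assumptions; a CUDA device is required.

```python
# Hypothetical check of the save_dtype state_dict hook.
import paddle

model = paddle.nn.Linear(8, 8)
opt = paddle.optimizer.SGD(parameters=model.parameters())
model, opt = paddle.amp.decorate(
    models=model, optimizers=opt, level='O2', save_dtype='float32')

print(model.weight.dtype)            # paddle.float16 after decoration
state = model.state_dict()           # the hook casts each entry to float32
print(state['weight'].dtype)         # paddle.float32
paddle.save(state, 'linear_fp32.pdparams')
```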
......@@ -216,17 +216,45 @@ class AmpScaler(object):
if getattr(optimizer, '_param_groups', None) and isinstance(
optimizer._param_groups[0], dict):
param_grads = []
param_grads_fp16 = []
param_grads_fp32 = []
for group in optimizer._param_groups:
for param in group['params']:
if param._grad_ivar() is not None:
param_grads.append(param._grad_ivar())
if param._grad_ivar(
).dtype == core.VarDesc.VarType.FP16:
param_grads_fp16.append(param._grad_ivar())
else:
param_grads_fp32.append(param._grad_ivar())
else:
param_grads = [
param._grad_ivar() for param in optimizer._parameter_list
if param._grad_ivar() is not None
]
_C_ops.check_finite_and_unscale(param_grads, self._scale, param_grads,
self._found_inf)
param_grads_fp16 = [
param._grad_ivar() for param in optimizer._parameter_list
if (param._grad_ivar() is not None
) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16
)
]
param_grads_fp32 = [
param._grad_ivar() for param in optimizer._parameter_list
if (param._grad_ivar() is not None
) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32
)
]
temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool))
temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool))
if len(param_grads_fp16):
_C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
param_grads_fp16,
temp_found_inf_fp16)
if len(param_grads_fp32):
_C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
param_grads_fp32,
temp_found_inf_fp32)
self._found_inf = temp_found_inf_fp16 or temp_found_inf_fp32
def _update(self):
"""
......
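The split above exists because `check_finite_and_unscale` does not accept mixed fp16/fp32 gradients in a single call; each dtype group is unscaled on its own and the two inf/nan flags are then combined. A NumPy-only sketch of that idea (not the Paddle op itself):

```python
# NumPy illustration of per-dtype unscaling with a combined found_inf flag.
import numpy as np

def unscale_and_check(grads, scale):
    """Divide each gradient by scale in place; return True if any value is inf/nan."""
    found_inf = False
    for g in grads:
        g /= scale
        found_inf = found_inf or not np.isfinite(g).all()
    return found_inf

scale = np.float32(1024.0)
grads = [(np.random.rand(4) * 1024).astype('float16'),
         (np.random.rand(4) * 1024).astype('float32')]

fp16_grads = [g for g in grads if g.dtype == np.float16]
fp32_grads = [g for g in grads if g.dtype == np.float32]

found_inf = (unscale_and_check(fp16_grads, scale) or
             unscale_and_check(fp32_grads, scale))
print(found_inf)  # False unless a gradient overflowed to inf/nan
```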
......@@ -779,10 +779,11 @@ def save(layer, path, input_spec=None, **configs):
dygraph_state_dict = None
if isinstance(inner_layer, Layer):
dygraph_state_dict = inner_layer.state_dict()
dygraph_state_dict = inner_layer.to_static_state_dict()
elif isinstance(attr_func, StaticFunction):
if attr_func._class_instance:
dygraph_state_dict = attr_func._class_instance.state_dict()
dygraph_state_dict = attr_func._class_instance.to_static_state_dict(
)
if dygraph_state_dict:
# NOTE(chenweihang): we maintain the mapping of variable name to
......@@ -790,15 +791,19 @@ def save(layer, path, input_spec=None, **configs):
# saved to inference program may not need by dygraph Layer,
# we only record the state_dict variable's structured name
state_names_dict = dict()
state_var_dict = dict()
for structured_name, var in six.iteritems(dygraph_state_dict):
state_names_dict[var.name] = structured_name
state_var_dict[var.name] = var
# 3. share parameters from Layer to scope & record var info
for param_or_buffer in concrete_program.parameters:
# share to scope
param_or_buffer_tensor = scope.var(
param_or_buffer.name).get_tensor()
src_tensor = param_or_buffer.value().get_tensor()
#src_tensor = param_or_buffer.value().get_tensor()
src_tensor = state_var_dict[param_or_buffer.name].value(
).get_tensor()
param_or_buffer_tensor._share_data_with(src_tensor)
# record var info
if param_or_buffer.name not in extra_var_info:
......
......@@ -121,6 +121,13 @@ class Layer(core.Layer):
self._forward_pre_hooks = collections.OrderedDict()
self._forward_post_hooks = collections.OrderedDict()
self._parameters_transform_map = {}
self._buffers_transform_map = {}
self._casted_by_pure_fp16 = False
self._state_dict_hooks = collections.OrderedDict()
def train(self):
"""
Sets this Layer and all its sublayers to training mode.
......@@ -1259,31 +1266,24 @@ class Layer(core.Layer):
final_str += ')'
return final_str
def state_dict(self,
def register_state_dict_hook(self, hook):
hook_remove_helper = HookRemoveHelper(self._state_dict_hooks)
self._state_dict_hooks[hook_remove_helper._hook_id] = hook
return hook_remove_helper
def _state_dict_impl(self,
destination=None,
include_sublayers=True,
structured_name_prefix=""):
'''
structured_name_prefix="",
include_non_persistable_buffer=False):
"""
Get all parameters and persistable buffers of the current layer and its sub-layers, and set them into a dict.
Parameters:
destination(dict, optional): If provided, all the parameters and persistable buffers will be set to this dict. Default: None
include_sublayers(bool, optional): If true, also include the parameters and persistable buffers from sublayers. Default: True
Returns:
dict: a dict containing all the parameters and persistable buffers.
Examples:
.. code-block:: python
import paddle
emb = paddle.nn.Embedding(10, 10)
state_dict = emb.state_dict()
paddle.save( state_dict, "paddle_dy.pdparams")
'''
include_non_persistable_buffer(bool, optional): If true, include the non-persistable buffers of the current layer and its sub-layers; it is used in pure fp16 and jit.save. Default: False
"""
if destination is None:
destination = collections.OrderedDict()
......@@ -1291,20 +1291,93 @@ class Layer(core.Layer):
if data is not None:
destination[structured_name_prefix + name] = data
for name, buffer in self._buffers.items():
if not include_non_persistable_buffer:
if buffer is not None and name not in self._non_persistable_buffer_names_set:
destination[structured_name_prefix + name] = buffer
else:
if buffer is not None:
destination[structured_name_prefix + name] = buffer
if include_sublayers:
for layer_name, layer_item in self._sub_layers.items():
if layer_item is not None:
destination_temp = destination.copy()
destination_temp.update(
layer_item.state_dict(
layer_item._state_dict_impl(
destination_temp, include_sublayers,
structured_name_prefix + layer_name + "."))
structured_name_prefix + layer_name + ".",
include_non_persistable_buffer))
destination = destination_temp
for state_dict_hook in self._state_dict_hooks.values():
hook_result = state_dict_hook(destination)
if hook_result is not None:
destination = hook_result
return destination
def to_static_state_dict(self,
destination=None,
include_sublayers=True,
structured_name_prefix=""):
'''
Get all parameters and buffers of the current layer and its sub-layers, and set them into a dict.
Parameters:
destination(dict, optional): If provided, all the parameters and persistable buffers will be set to this dict. Default: None
include_sublayers(bool, optional): If true, also include the parameters and persistable buffers from sublayers. Default: True
Returns:
dict: a dict containing all the parameters and persistable buffers.
Examples:
.. code-block:: python
import paddle
emb = paddle.nn.Embedding(10, 10)
state_dict = emb.to_static_state_dict()
paddle.save( state_dict, "paddle_dy.pdparams")
'''
return self._state_dict_impl(
destination=destination,
include_sublayers=include_sublayers,
structured_name_prefix=structured_name_prefix,
include_non_persistable_buffer=True)
def state_dict(self,
destination=None,
include_sublayers=True,
structured_name_prefix=""):
'''
Get all parameters and persistable buffers of the current layer and its sub-layers, and set them into a dict.
Parameters:
destination(dict, optional): If provided, all the parameters and persistable buffers will be set to this dict. Default: None
include_sublayers(bool, optional): If true, also include the parameters and persistable buffers from sublayers. Default: True
Returns:
dict: a dict containing all the parameters and persistable buffers.
Examples:
.. code-block:: python
import paddle
emb = paddle.nn.Embedding(10, 10)
state_dict = emb.state_dict()
paddle.save( state_dict, "paddle_dy.pdparams")
'''
return self._state_dict_impl(
destination=destination,
include_sublayers=include_sublayers,
structured_name_prefix=structured_name_prefix,
include_non_persistable_buffer=False)
@framework.deprecate_stat_dict
def set_state_dict(self, state_dict, use_structured_name=True):
'''
......@@ -1404,8 +1477,11 @@ class Layer(core.Layer):
).stop_gradient
self._parameters[key]._set_grad_ivar(grad_applied)
self._parameters_transform_map[id(param)] = [param_applied, key]
for key, buf in self._buffers.items():
self._buffers[key] = func(buf, device, dtype, blocking)
self._buffers_transform_map[id(buf)] = [self._buffers[key], key]
def to(self, device=None, dtype=None, blocking=None):
'''
......@@ -1501,6 +1577,7 @@ class Layer(core.Layer):
return new_t
self._apply(transform, device, dtype, blocking)
self._dtype = dtype
# [aliases] Compatible with old method names
set_dict = set_state_dict
......
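The new `register_state_dict_hook` is the mechanism the AMP decorator relies on for `save_dtype`, but it can also be used directly. A small illustrative sketch; the cast target and the layer are assumptions, and it runs on CPU.

```python
# Illustrative use of the state_dict hook added to paddle.nn.Layer in this commit.
import paddle

def cast_to_fp64(state_dict):
    # The hook receives the assembled state_dict and may modify or replace it.
    for key in state_dict:
        state_dict[key] = paddle.cast(state_dict[key], 'float64')
    return state_dict

layer = paddle.nn.Linear(4, 4)
hook_handle = layer.register_state_dict_hook(cast_to_fp64)

print(layer.state_dict()['weight'].dtype)  # paddle.float64 via the hook
hook_handle.remove()                       # detach the hook again
print(layer.state_dict()['weight'].dtype)  # back to paddle.float32
```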
......@@ -1433,12 +1433,12 @@ class MomentumOptimizer(Optimizer):
velocity_acc = self._get_accumulator(self._velocity_acc_str,
param_and_grad[0])
lr = self._create_param_lr(param_and_grad)
master_weight = None
if framework.in_dygraph_mode():
_, _ = _C_ops.momentum(param_and_grad[0], param_and_grad[1],
velocity_acc, lr, param_and_grad[0],
velocity_acc, 'mu', self._momentum,
'use_nesterov', self._use_nesterov)
_, _, _ = _C_ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
master_weight, param_and_grad[0], velocity_acc, master_weight,
'mu', self._momentum, 'use_nesterov', self._use_nesterov)
return None
attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov}
......@@ -1982,6 +1982,9 @@ class LarsMomentumOptimizer(Optimizer):
self._master_weights = {}
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + '_fp32_master'
......@@ -2462,12 +2465,14 @@ class AdamOptimizer(Optimizer):
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
_, _, _, _, _ = _C_ops.adam(
master_weight = None
_, _, _, _, _, _ = _C_ops.adam(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
1000, 'beta1', _beta1, 'beta2', _beta2, 'use_global_beta_pow',
beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0],
moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight,
'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode,
'min_row_size_to_use_multithread', 1000, 'beta1', _beta1,
'beta2', _beta2, 'use_global_beta_pow',
self._use_global_beta_pow)
return None
......
......@@ -1099,7 +1099,6 @@ class TestJitSaveLoadSaveWithoutRunning(unittest.TestCase):
paddle.static.InputSpec(
shape=[None, IMAGE_SIZE], dtype='float32')
])
result_00 = layer_save(inps0)
result_01 = layer_save(inps1)
#load and save without running
......
......@@ -22,6 +22,8 @@ from ..fluid.layer_helper import LayerHelper
import warnings
from ..fluid.dygraph import base as imperative_base
from collections import defaultdict
import numpy as np
import time
import paddle
from paddle import _C_ops
......@@ -208,6 +210,9 @@ class Adam(Optimizer):
}
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
......@@ -317,12 +322,13 @@ class Adam(Optimizer):
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
_, _, _, _, _ = _C_ops.adam(
_, _, _, _, _, _ = _C_ops.adam(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
1000, 'beta1', _beta1, 'beta2', _beta2)
beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0],
moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight,
'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode,
'min_row_size_to_use_multithread', 1000, 'beta1', _beta1,
'beta2', _beta2, 'multi_precision', find_master)
return None
......
......@@ -297,13 +297,15 @@ class AdamW(Adam):
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
_, _, _, _, _ = _C_ops.adamw(
_, _, _, _, _, _ = _C_ops.adam(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
1000, 'beta1', _beta1, 'beta2', _beta2, 'coeff', self._coeff,
"lr_ratio", lr_ratio_)
beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0],
moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight,
'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode,
'min_row_size_to_use_multithread', 1000, 'beta1', _beta1,
'beta2', _beta2, 'coeff', self._coeff, 'multi_precision',
find_master)
return None
......
......@@ -170,7 +170,7 @@ class Momentum(Optimizer):
'regularization_method': self._regularization_method,
'regularization_coeff': self._regularization_coeff,
}
'''
if framework.in_dygraph_mode():
self.helper = LayerHelper(self.__class__.__name__)
if isinstance(self._parameter_list[0], dict):
......@@ -180,6 +180,7 @@ class Momentum(Optimizer):
else:
for p in parameters:
self._add_accumulator(self._velocity_acc_str, p)
'''
def _update_regularization(self, weight_decay):
reg_method = ""
......@@ -194,6 +195,9 @@ class Momentum(Optimizer):
return reg_method, reg_coeff
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
......@@ -239,10 +243,15 @@ class Momentum(Optimizer):
return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters):
'''
if framework.in_dygraph_mode():
return
'''
assert isinstance(block, framework.Block)
if isinstance(parameters, dict):
parameters = self._update_param_group(parameters)
for p in parameters:
if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
master_p = self._create_master_weight(p)
......@@ -291,21 +300,23 @@ class Momentum(Optimizer):
regularization_method = ""
regularization_coeff = 0
find_master = self._multi_precision and param_and_grad[
0].dtype == core.VarDesc.VarType.FP16
master_weight = (self._master_weights[param_and_grad[0].name]
if find_master else None)
if framework.in_dygraph_mode():
if isinstance(param_and_grad, dict):
self._update_regularization(param_and_grad['weight_decay'])
_, _ = _C_ops.momentum(
_, _, _ = _C_ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
param_and_grad[0], velocity_acc, 'mu', self._momentum,
'use_nesterov', self._use_nesterov, 'regularization_method',
regularization_method, 'regularization_coeff',
regularization_coeff)
return None
master_weight, param_and_grad[0], velocity_acc, master_weight,
'mu', self._momentum, 'use_nesterov', self._use_nesterov,
'regularization_method', regularization_method,
'regularization_coeff', regularization_coeff, 'multi_precision',
find_master)
find_master = self._multi_precision and param_and_grad[
0].dtype == core.VarDesc.VarType.FP16
master_weight = (self._master_weights[param_and_grad[0].name]
if find_master else None)
return None
attrs = {
"mu": self._momentum,
......
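For context on the MasterParam/MasterParamOut plumbing added throughout this commit: with multi_precision enabled, the optimizer keeps an fp32 "master" copy of each fp16 parameter, accumulates and applies the update in fp32, and writes the rounded result back to the fp16 weight. A NumPy sketch of that idea for plain momentum; the hyper-parameters and shapes are illustrative.

```python
# NumPy illustration of the fp32 master-weight update behind 'multi_precision'.
import numpy as np

mu, lr = 0.9, 0.01
param_fp16 = np.random.rand(8).astype('float16')    # weights used by fp16 kernels
master_fp32 = param_fp16.astype('float32')          # fp32 master copy kept by the optimizer
velocity = np.zeros(8, dtype='float32')

for _ in range(5):
    grad_fp16 = np.random.rand(8).astype('float16') * 1e-4   # small fp16 gradients
    velocity = mu * velocity + grad_fp16.astype('float32')   # accumulate in fp32
    master_fp32 -= lr * velocity                             # update the master weights
    param_fp16 = master_fp32.astype('float16')               # cast back for the next fp16 step

print(param_fp16.dtype, master_fp32.dtype)  # float16 float32
```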