Unverified commit 2d95280e, authored by Leo Chen, committed by GitHub

Feature/Enable Auto-Mixed-Precision in dynamic graph (#24903)

* add auto_cast, test=develop

* add loss scaler, test=develop

* add comments, test=develop

* refine code, test=develop

* refine code, test=develop

* do not set flags automatically, test=develop

* fix custom op bug, test=develop

* add more test, test=develop

* refine enable logic, test=develop

* enable amp test with GPU, test=develop

* add unittest

* add test for found_inf

* follow comments

* follow comments

* remove global variable, use singleton

* add some notes

* update comments

* update comments

* update comments

* add use_dynamic_loss_scaling argument

* refine found_inf

* refine found_inf
Parent 838e36e9
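For orientation, here is a minimal usage sketch of the dygraph AMP API added in this commit, based on the `amp_guard` and `AmpScaler` docstring examples further down in this diff (the Conv2D model and SGD optimizer are only placeholders):

import numpy as np
import paddle.fluid as fluid

data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')

with fluid.dygraph.guard():
    model = fluid.dygraph.Conv2D(3, 2, 3)  # placeholder model
    optimizer = fluid.optimizer.SGDOptimizer(
        learning_rate=0.01, parameter_list=model.parameters())
    scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)

    inp = fluid.dygraph.to_variable(data)
    with fluid.dygraph.amp_guard():
        # white-listed ops (e.g. conv2d) run in fp16, black-listed ops in fp32
        out = model(inp)
        loss = fluid.layers.reduce_mean(out)
    scaled = scaler.scale(loss)         # scale the loss to avoid fp16 gradient underflow
    scaled.backward()
    scaler.minimize(optimizer, scaled)  # unscale grads; skip the step if inf/nan is found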
......@@ -2,10 +2,10 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags)
cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function)
add_subdirectory(jit)
cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp)
cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
cc_library(imperative_profiler SRCS profiler.cc)
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/imperative/variable_wrapper.h"
namespace paddle {
namespace imperative {
AmpOperators::AmpOperators()
: allow_ops_(new std::unordered_set<std::string>()),
block_ops_(new std::unordered_set<std::string>()) {}
AmpOperators::~AmpOperators() {}
AmpOperators& AmpOperators::Instance() {
static AmpOperators instance;
return instance;
}
std::shared_ptr<std::unordered_set<std::string>> AmpOperators::GetAllowOps() {
return allow_ops_;
}
std::shared_ptr<std::unordered_set<std::string>> AmpOperators::GetBlockOps() {
return block_ops_;
}
inline std::string GetDtypeStr(
const std::shared_ptr<imperative::VarBase>& var) {
return framework::DataTypeToString(var->DataType());
}
inline bool NeedCast(const std::shared_ptr<VarBase>& var) {
if (!platform::is_gpu_place(var->Place())) {
return false;
}
if (var->DataType() == framework::proto::VarType::FP32 ||
var->DataType() == framework::proto::VarType::FP16) {
return true;
} else {
return false;
}
}
// NOTE: Trace a cast op, so if a var is cast from fp32 to fp16, its grad
// var will be cast back from fp16 to fp32 during the backward phase.
static inline std::shared_ptr<imperative::VarBase> CastToType(
const std::shared_ptr<VarBase>& var,
const framework::proto::VarType::Type dst_type) {
const auto& tracer = imperative::GetCurrentTracer();
imperative::NameVarBaseMap ins = {{"X", {var}}};
framework::AttributeMap attrs = {{"in_dtype", var->DataType()},
{"out_dtype", dst_type}};
auto out = std::shared_ptr<imperative::VarBase>(
new imperative::VarBase(tracer->GenerateUniqueName()));
imperative::NameVarBaseMap outs = {{"Out", {out}}};
{
AutoCastGuard guard(tracer, false);
tracer->TraceOp("cast", ins, outs, std::move(attrs));
}
return out;
}
static inline std::shared_ptr<imperative::VarBase> CastToFP16(
const std::shared_ptr<VarBase>& var) {
auto dst_type = framework::proto::VarType::FP16;
if (NeedCast(var) && (var->DataType() != dst_type)) {
return CastToType(var, dst_type);
}
return var;
}
static inline std::shared_ptr<imperative::VarBase> CastToFP32(
const std::shared_ptr<VarBase>& var) {
auto dst_type = framework::proto::VarType::FP32;
if (NeedCast(var) && (var->DataType() != dst_type)) {
return CastToType(var, dst_type);
}
return var;
}
static inline framework::proto::VarType::Type GetPromoteType(
const NameVarBaseMap& ins) {
auto dst_type = framework::proto::VarType::FP16;
for (const auto& pair : ins) {
for (const auto& var : pair.second) {
if (var->DataType() == framework::proto::VarType::FP32) {
dst_type = var->DataType();
break;
}
}
}
return dst_type;
}
NameVarBaseMap AutoCastInputs(const std::string& op_type,
const NameVarBaseMap& ins) {
NameVarBaseMap new_ins = {};
if (AmpOperators::Instance().GetAllowOps()->count(op_type)) {
for (const auto& pair : ins) {
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to float16";
for (const auto& var : pair.second) {
auto new_var = CastToFP16(var);
new_ins[pair.first].emplace_back(new_var);
}
}
return new_ins;
} else if (AmpOperators::Instance().GetBlockOps()->count(op_type)) {
for (const auto& pair : ins) {
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to float";
for (const auto& var : pair.second) {
auto new_var = CastToFP32(var);
new_ins[pair.first].emplace_back(new_var);
}
}
return new_ins;
} else {
auto dst_type = GetPromoteType(ins);
for (const auto& pair : ins) {
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to "
<< framework::DataTypeToString(dst_type);
for (const auto& var : pair.second) {
// NOTE(zhiqiu): Conv + BN always occur together; we don't need to cast
// the X input of batch_norm to FP32, since it is produced by conv in FP16.
if (op_type == "batch_norm" && pair.first == "X" &&
dst_type == framework::proto::VarType::FP32) {
new_ins[pair.first].emplace_back(var);
continue;
}
auto new_var = dst_type == framework::proto::VarType::FP32
? CastToFP32(var)
: CastToFP16(var);
new_ins[pair.first].emplace_back(new_var);
}
}
return new_ins;
}
return ins;
}
} // namespace imperative
} // namespace paddle
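For reference, the dispatch implemented by `AutoCastInputs` above reduces to the following Python-style sketch (illustrative only; `cast_to_fp16`/`cast_to_fp32` are hypothetical helpers standing in for the traced cast op, and the batch_norm `X` special case is omitted):

def auto_cast_inputs(op_type, ins, allow_ops, block_ops):
    # white-listed ops: numerically safe and performance critical -> always fp16
    if op_type in allow_ops:
        return {name: [cast_to_fp16(v) for v in vs] for name, vs in ins.items()}
    # black-listed ops: numerically dangerous -> always fp32
    if op_type in block_ops:
        return {name: [cast_to_fp32(v) for v in vs] for name, vs in ins.items()}
    # other ops: promote to fp32 if any input is fp32, otherwise keep fp16
    promote_fp32 = any(v.dtype == 'float32' for vs in ins.values() for v in vs)
    cast = cast_to_fp32 if promote_fp32 else cast_to_fp16
    return {name: [cast(v) for v in vs] for name, vs in ins.items()}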
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <unordered_set>
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace paddle {
namespace imperative {
// Singleton implementation with C++11
class AmpOperators {
public:
~AmpOperators();
AmpOperators(const AmpOperators& o) = delete;
const AmpOperators& operator=(const AmpOperators& o) = delete;
static AmpOperators& Instance();
std::shared_ptr<std::unordered_set<std::string>> GetAllowOps();
std::shared_ptr<std::unordered_set<std::string>> GetBlockOps();
private:
AmpOperators(); // forbid calling default constructor
// The set of ops that support fp16 calculation and are considered numerically
// safe and performance critical. These ops are always converted to fp16.
std::shared_ptr<std::unordered_set<std::string>> allow_ops_;
// The set of ops that support fp16 calculation and are considered numerically
// dangerous and whose effects may also be observed in downstream ops.
std::shared_ptr<std::unordered_set<std::string>> block_ops_;
};
// NOTE(zhiqiu): AutoCastGuard is used for RAII.
class AutoCastGuard {
public:
AutoCastGuard(std::shared_ptr<Tracer> tracer, bool guard_mode)
: tracer_(tracer) {
pre_mode_ = tracer_->IsAutoCastEnabled();
if (pre_mode_ != guard_mode) {
tracer_->SetEnableAutoCast(guard_mode);
}
}
~AutoCastGuard() { tracer_->SetEnableAutoCast(pre_mode_); }
// forbid copy and operator=
AutoCastGuard(const AutoCastGuard& guard) = delete;
AutoCastGuard& operator=(const AutoCastGuard& guard) = delete;
private:
std::shared_ptr<Tracer> tracer_;
bool pre_mode_;
};
NameVarBaseMap AutoCastInputs(const std::string& op_type,
const NameVarBaseMap& ins);
} // namespace imperative
} // namespace paddle
......@@ -186,6 +186,8 @@ class VarBase {
framework::proto::VarType::Type DataType() const { return var_->DataType(); }
const platform::Place Place() const { return var_->Place(); }
void ClearGradient();
std::shared_ptr<VarBase> NewVarBase(const platform::Place& dst_place,
......
......@@ -16,6 +16,7 @@
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/op_base.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/string_helper.h"
......@@ -53,8 +54,14 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
attr_checker->Check(&attrs, true);
}
NameVarBaseMap new_ins = ins;
if (enable_autocast_) {
VLOG(5) << "Auto mixed precision run operator: " << type;
new_ins = AutoCastInputs(type, ins);
}
try {
OpBase::Run(*op, ins, outs, attrs, place);
OpBase::Run(*op, new_ins, outs, attrs, place);
} catch (platform::EnforceNotMet& exception) {
framework::AppendErrorOpHint(type, &exception);
throw std::move(exception);
......@@ -73,11 +80,11 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
if (enable_program_desc_tracing_) {
VLOG(5) << "Trace op " << type << " into ProgramDesc";
program_desc_tracer_->InsertOp(type, ins, outs, attrs);
program_desc_tracer_->InsertOp(type, new_ins, outs, attrs);
}
if (ComputeRequiredGrad(ins, outs, trace_backward)) {
CreateGradOpNode(*op, ins, outs, attrs, place);
if (ComputeRequiredGrad(new_ins, outs, trace_backward)) {
CreateGradOpNode(*op, new_ins, outs, attrs, place);
} else {
VLOG(3) << "No Grad to track for Op: " << type;
}
......
......@@ -97,6 +97,10 @@ class Tracer {
void SetHasGrad(bool has_grad) { has_grad_ = has_grad; }
void SetEnableAutoCast(bool enabled) { enable_autocast_ = enabled; }
bool IsAutoCastEnabled() const { return enable_autocast_; }
private:
std::unique_ptr<BasicEngine> basic_engine_;
std::unique_ptr<jit::ProgramDescTracer> program_desc_tracer_;
......@@ -104,6 +108,7 @@ class Tracer {
std::unique_ptr<UniqueNameGenerator> generator_;
platform::Place expected_place_;
bool has_grad_{true};
bool enable_autocast_{false};
};
// To access static variable current_tracer
......
......@@ -111,6 +111,28 @@ class VariableWrapper {
}
}
const platform::Place Place() const {
const framework::Tensor* tensor = nullptr;
auto place =
platform::CPUPlace(); // Default place for var not initialized.
if (var_.IsInitialized()) {
if (type_ == framework::proto::VarType::LOD_TENSOR) {
tensor = &(var_.Get<framework::LoDTensor>());
} else if (type_ == framework::proto::VarType::SELECTED_ROWS) {
tensor = &(var_.Get<framework::SelectedRows>().value());
} else {
VLOG(6) << "Variable " << name_ << " is not initialized";
return place;
}
}
if (tensor && tensor->IsInitialized()) {
return tensor->place();
} else {
VLOG(6) << "The tensor of variable " << name_ << " is not initialized";
return place;
}
}
private:
void SetGradVar(const std::shared_ptr<VariableWrapper>& var) {
auto shared_var = grad_var_.lock();
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h"
#include <string>
#include <vector>
......@@ -67,7 +68,7 @@ class AmpCheckFiniteAndScaleOpMaker : public framework::OpProtoAndCheckerMaker {
"amp_check_finite_and_unscale operator.")
.AsDuplicable();
AddOutput("FoundInfinite",
"(Tensor) 1-dim tensor, contains a int scalar, which indicates "
"(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
"if there there is infinite or nan item in input X.");
AddComment(R"DOC(
amp_check_finite_and_scale operator.
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <cuda.h>
#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h"
#include "paddle/fluid/platform/float16.h"
......@@ -21,7 +22,7 @@ namespace operators {
template <typename T>
__global__ void AmpCheckFiniteAndScale(const T* in, const T* scale, int num,
int* found_inf, T* out) {
bool* found_inf, T* out) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num) {
......@@ -44,7 +45,7 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
const T* scale_data = scale->data<T>();
int* found_inf_data = found_inf->mutable_data<int>(dev_ctx.GetPlace());
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool));
for (size_t i = 0; i < xs.size(); ++i) {
......
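Semantically, the updated kernel scales every input tensor and raises a single boolean flag if any element is inf or nan. A NumPy sketch of that contract (an approximation of the CUDA kernel above, not the actual implementation; edge-case behavior for non-finite values may differ in detail):

import numpy as np

def amp_check_finite_and_scale(xs, scale):
    # Scale each tensor by `scale`; FoundInfinite becomes True if any element
    # of any input tensor is inf or nan.
    found_inf = any(not np.isfinite(x).all() for x in xs)
    outs = [x * scale for x in xs]
    return outs, np.array([found_inf], dtype=np.bool_)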
......@@ -19,13 +19,17 @@ limitations under the License. */
#include <pybind11/complex.h>
#include <pybind11/functional.h>
#include <pybind11/stl.h>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/imperative/all_reduce.h"
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/basic_engine.h"
#include "paddle/fluid/imperative/data_loader.h"
......@@ -537,8 +541,7 @@ void BindImperative(py::module *m_ptr) {
});
py::class_<imperative::VarBase, std::shared_ptr<imperative::VarBase>>(
m, "VarBase",
R"DOC()DOC")
m, "VarBase", R"DOC()DOC")
.def_static("_alive_vars", &imperative::VarBase::AliveVarNames)
.def("__init__",
[](imperative::VarBase &self, framework::proto::VarType::Type dtype,
......@@ -838,13 +841,14 @@ void BindImperative(py::module *m_ptr) {
.def("reset", &imperative::jit::ProgramDescTracer::Reset);
py::class_<imperative::Tracer, std::shared_ptr<imperative::Tracer>>(
m, "Tracer",
R"DOC()DOC")
m, "Tracer", R"DOC()DOC")
.def("__init__",
[](imperative::Tracer &self) { new (&self) imperative::Tracer(); })
.def_property("_enable_program_desc_tracing",
&imperative::Tracer::IsProgramDescTracingEnabled,
&imperative::Tracer::SetEnableProgramDescTracing)
.def_property("_enable_autocast", &imperative::Tracer::IsAutoCastEnabled,
&imperative::Tracer::SetEnableAutoCast)
.def_property("_train_mode", &imperative::Tracer::HasGrad,
&imperative::Tracer::SetHasGrad)
.def_property(
......@@ -874,6 +878,26 @@ void BindImperative(py::module *m_ptr) {
py::return_value_policy::reference)
.def("_generate_unique_name", &imperative::Tracer::GenerateUniqueName,
py::arg("key") = "eager_tmp")
.def(
"_set_amp_op_list",
[](imperative::Tracer &self,
std::unordered_set<std::string> &allow_ops,
std::unordered_set<std::string> &block_ops) {
// NOTE(zhiqiu): The automatic conversion in pybind11 between c++
// STL and python set/list/dict involves a copy operation that
// prevents pass-by-reference semantics, so it is ok to swap.
// The reason for not directly passing
// std::shared_ptr<std::unordered_set<std::string>>
// is that pybind11 forbids shared_ptr<T> where T is not a custom type.
imperative::AmpOperators::Instance().GetAllowOps()->swap(allow_ops);
imperative::AmpOperators::Instance().GetBlockOps()->swap(block_ops);
})
.def("_get_amp_op_list",
[](imperative::Tracer &self) {
return std::make_tuple(
*(imperative::AmpOperators::Instance().GetAllowOps()),
*(imperative::AmpOperators::Instance().GetBlockOps()));
})
.def("trace",
[](imperative::Tracer &self, const std::string &type,
const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
......
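From Python, the new tracer bindings above are exercised roughly as follows (mirroring how `amp_guard` and the unit tests later in this diff drive them; the op names are examples taken from the default lists):

import paddle.fluid as fluid

with fluid.dygraph.guard():
    tracer = fluid.framework._dygraph_tracer()
    tracer._enable_autocast = True                        # toggle auto-cast on the C++ tracer
    tracer._set_amp_op_list({'conv2d', 'matmul', 'mul'},  # allow (white) ops -> fp16
                            {'mean', 'softmax'})          # block (black) ops -> fp32
    white_list, black_list = tracer._get_amp_op_list()    # read back the current lists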
......@@ -56,6 +56,9 @@ from .dygraph_to_static import ProgramTranslator
from . import rnn
from .rnn import *
from . import amp
from .amp import *
__all__ = []
__all__ += layers.__all__
__all__ += base.__all__
......@@ -69,3 +72,4 @@ __all__ += jit.__all__
__all__ += io.__all__
__all__ += rnn.__all__
__all__ += ['ProgramTranslator']
__all__ += amp.__all__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import auto_cast
from .auto_cast import *
from . import loss_scaler
from .loss_scaler import *
__all__ = []
__all__ += auto_cast.__all__
__all__ += loss_scaler.__all__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddle.fluid.wrapped_decorator import signature_safe_contextmanager, wrap_decorator
from paddle.fluid import core
import contextlib
from paddle.fluid.framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, dygraph_only, set_flags, get_flags
import warnings
import copy
__all__ = ['amp_guard']
# The set of ops that support fp16 calculation and are considered numerically-
# safe and performance-critical. These ops are always converted to fp16.
WHITE_LIST = {
'conv2d',
'matmul',
'mul',
}
# The set of ops that support fp16 calculation and are considered numerically-
# dangerous and whose effects may also be observed in downstream ops.
BLACK_LIST = {
'exp',
'square',
'log',
'mean',
'sum',
'cos_sim',
'softmax',
'softmax_with_cross_entropy',
'sigmoid_cross_entropy_with_logits',
'cross_entropy',
'cross_entropy2',
}
AMP_RELATED_FLAGS = [
'FLAGS_cudnn_exhaustive_search',
'FLAGS_conv_workspace_size_limit',
'FLAGS_cudnn_batchnorm_spatial_persistent',
]
AMP_RELATED_FLAGS_SETTING = {
'FLAGS_cudnn_exhaustive_search': 1,
'FLAGS_conv_workspace_size_limit': 1000,
'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
}
# NOTE(zhiqiu): similar to paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists._update_list
# The reason for not using AutoMixedPrecisionLists is that custom_black_varnames is not suitable for imperative mode.
def _update_list(custom_white_list, custom_black_list):
"""
Update the white and black lists according to the user's custom lists.
"""
_white_list = copy.copy(WHITE_LIST)
_black_list = copy.copy(BLACK_LIST)
if custom_white_list and custom_black_list:
for op_name in custom_white_list:
if op_name in custom_black_list:
raise ValueError("Custom white list overlaps "
"custom black list")
if custom_white_list:
for op_name in custom_white_list:
if op_name in _black_list:
_black_list.remove(op_name)
_white_list.add(op_name)
if custom_black_list:
for op_name in custom_black_list:
if op_name in _white_list:
_white_list.remove(op_name)
_black_list.add(op_name)
return _white_list, _black_list
@signature_safe_contextmanager
@dygraph_only
def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
"""
:api_attr: imperative
Create a context that enables auto-mixed-precision (AMP) for operators executed in imperative mode.
If enabled, the input data type (float32 or float16) of each operator is decided
by the autocast algorithm for better performance.
Commonly, it is used together with `AmpScaler` to achieve Auto-Mixed-Precision in
imperative mode.
Args:
enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
custom_white_list(set|list, optional): The custom white_list.
custom_black_list(set|list, optional): The custom black_list.
Examples:
.. code-block:: python
import numpy as np
import paddle.fluid as fluid
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
conv2d = fluid.dygraph.Conv2D(3, 2, 3)
data = fluid.dygraph.to_variable(data)
with fluid.dygraph.amp_guard():
conv = conv2d(data)
print(conv.dtype) # FP16
with fluid.dygraph.amp_guard(enable=False):
conv = conv2d(data)
print(conv.dtype) # FP32
"""
tracer = _dygraph_tracer()
if not tracer:
raise ValueError(
"current_tracer is None, maybe it is not in imperative mode.")
if enable and not tracer._expected_place.is_gpu_place():
warnings.warn(
'amp_guard can only be enabled on CUDAPlace, current place is %s, so it has no effect.'
% tracer._expected_place)
enable = False
# use default white_list and black_list if no custom lists provided
_white_list = WHITE_LIST
_black_list = BLACK_LIST
if custom_white_list or custom_black_list:
_white_list, _black_list = _update_list(custom_white_list,
custom_black_list)
if tracer:
# enable auto_cast
original_enable = tracer._enable_autocast
tracer._enable_autocast = enable
# set amp op list
original_white_list, original_black_list = tracer._get_amp_op_list()
tracer._set_amp_op_list(_white_list, _black_list)
# TODO(zhiqiu): set amp related flags automatically in this guard
# Currently, if FLAGS_cudnn_batchnorm_spatial_persistent is set True in amp_guard,
# batch_norm can run in fast mode, but batch_norm_grad cannot if the backward pass
# is not executed inside amp_guard. So, users need to set related flags manually.
# original_flags = get_flags(AMP_RELATED_FLAGS)
# set_flags(AMP_RELATED_FLAGS_SETTING)
# restore status
try:
yield
finally:
if tracer:
tracer._enable_autocast = original_enable
tracer._set_amp_op_list(original_white_list, original_black_list)
# set_flags(original_flags)
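A short example of the custom-list behavior implemented by `_update_list` and `amp_guard` above (mirroring the `test_custom_op_list` unit test later in this diff): `log` is promoted to the white list and `conv2d` is demoted to the black list, so conv2d stays in fp32 even inside the guard.

import numpy as np
import paddle.fluid as fluid

data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
    conv2d = fluid.dygraph.Conv2D(3, 2, 3)
    inp = fluid.dygraph.to_variable(data)
    with fluid.dygraph.amp_guard(custom_white_list=["log"],
                                 custom_black_list=["conv2d"]):
        out = conv2d(inp)
    print(out.dtype)  # expected FP32, since conv2d was black-listed here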
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddle.fluid import core
from paddle.fluid.dygraph import to_variable
from paddle.fluid.framework import _varbase_creator, _dygraph_tracer, dygraph_only
from paddle.fluid.data_feeder import check_type
from ...wrapped_decorator import signature_safe_contextmanager, wrap_decorator
import warnings
import numpy as np
__all__ = ['AmpScaler']
class AmpScaler(object):
"""
:api_attr: imperative
AmpScaler is used for Auto-Mixed-Precision training/inference in imperative
mode. It controls the scaling of the loss and helps avoid numerical overflow.
An object of this class has two methods: `scale()` and `minimize()`.
`scale()` is used to multiply the loss by a scale ratio.
`minimize()` is similar to `Optimizer.minimize()` and performs the parameter update.
Commonly, it is used together with `amp_guard` to achieve Auto-Mixed-Precision in
imperative mode.
Args:
enable(bool, optional): Enable loss scaling or not. Default is True.
init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15.
incr_ratio(float, optional): The multiplier to use when increasing the loss
scaling. Default is 2.0.
decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing
the loss scaling. Default is 0.5.
incr_every_n_steps(int, optional): Increases loss scaling every n consecutive
steps with finite gradients. Default is 1000.
decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
accumulated steps with nan or inf gradients. Default is 1.
use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
Returns:
An AmpScaler object.
Examples:
.. code-block:: python
import numpy as np
import paddle.fluid as fluid
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
model = fluid.dygraph.Conv2D(3, 2, 3)
optimizer = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters())
scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
data = fluid.dygraph.to_variable(data)
with fluid.dygraph.amp_guard():
conv = model(data)
loss = fluid.layers.reduce_mean(conv)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
"""
@dygraph_only
def __init__(self,
enable=True,
init_loss_scaling=2.**15,
incr_ratio=2.0,
decr_ratio=0.5,
incr_every_n_steps=1000,
decr_every_n_nan_or_inf=1,
use_dynamic_loss_scaling=True):
tracer = _dygraph_tracer()
if not tracer:
raise ValueError(
"current_tracer is None, maybe it is not in imperative mode.")
if enable and not tracer._expected_place.is_gpu_place():
warnings.warn(
'AmpScaler can only be enabled on CUDAPlace, current place is %s, so it has no effect.'
% tracer._expected_place)
enable = False
self._enable = enable
if self._enable:
assert incr_ratio > 1.0, "The incr_ratio must be > 1.0."
assert decr_ratio < 1.0, "The decr_ratio must be < 1.0."
self._init_loss_scaling = init_loss_scaling
self._incr_ratio = incr_ratio
self._decr_ratio = decr_ratio
self._incr_every_n_steps = incr_every_n_steps
self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
self._incr_count = 0
self._decr_count = 0
self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
self._found_inf = to_variable(np.array([0]).astype(np.bool))
self._scale = to_variable(
np.array([self._init_loss_scaling]).astype(np.float32))
self._cache_founf_inf = None
def scale(self, var):
"""
Multiplies a variable (Tensor) by the scale factor and returns the scaled output.
If this instance of :class:`AmpScaler` is not enabled, the output is returned unmodified.
Args:
var (Variable): The variable to scale.
Returns:
The scaled variable or original variable.
Examples:
.. code-block:: python
import numpy as np
import paddle.fluid as fluid
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
model = fluid.dygraph.Conv2D(3, 2, 3)
optimizer = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters())
scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
data = fluid.dygraph.to_variable(data)
with fluid.dygraph.amp_guard():
conv = model(data)
loss = fluid.layers.reduce_mean(conv)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
"""
check_type(var, "var", core.VarBase, 'AmpScaler.scale()')
if not self._enable:
return var
return var * self._scale
def minimize(self, optimizer, *args, **kwargs):
"""
This function is similar to `Optimizer.minimize()` and performs the parameter update.
If the scaled gradients of the parameters contain NaN or Inf, the parameter update is skipped.
Otherwise, it first unscales the scaled gradients of the parameters, then updates the parameters.
Finally, the loss scaling ratio is updated.
Args:
optimizer(Optimizer): The optimizer used to update parameters.
args: Arguments, which will be forwarded to `optimizer.minimize()`.
kwargs: Keyword arguments, which will be forwarded to `Optimizer.minimize()`.
Examples:
.. code-block:: python
import numpy as np
import paddle.fluid as fluid
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
model = fluid.dygraph.Conv2D(3, 2, 3)
optimizer = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters())
scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
data = fluid.dygraph.to_variable(data)
with fluid.dygraph.amp_guard():
conv = model(data)
loss = fluid.layers.reduce_mean(conv)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
"""
if not self._enable:
return optimizer.minimize(*args, **kwargs)
# unscale the grad
self._unscale(optimizer)
optimize_ops, params_grads = (None, None)
if self._found_inf:
self._cache_founf_inf = True
else:
optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
self._cache_founf_inf = False
if self._use_dynamic_loss_scaling:
# update the scale
self._update()
return optimize_ops, params_grads
def _unscale(self, optimizer):
if not self._enable:
return
inv_scale = 1.0 / self._scale
param_grads = [
param._grad_ivar() for param in optimizer._parameter_list
if param._grad_ivar() is not None
]
core.ops.amp_check_finite_and_scale(param_grads, inv_scale, param_grads,
self._found_inf)
def _update(self):
"""
Updates the loss_scaling.
"""
if not self._enable:
return
if self._cache_founf_inf:
self._incr_count = 0
self._decr_count = self._decr_count + 1
if self._decr_count == self._decr_every_n_nan_or_inf:
print(
'Found inf or nan, current scale is: {}, decrease to: {}*{}'.
format(
float(self._scale),
float(self._scale), float(self._decr_ratio)))
self._scale = self._scale * self._decr_ratio
self._decr_count = 0
else:
self._decr_count = 0
self._incr_count = self._incr_count + 1
if self._incr_count == self._incr_every_n_steps:
self._scale = self._scale * self._incr_ratio
self._incr_count = 0
return
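The dynamic loss-scaling policy in `_update()` above boils down to the following standalone sketch (illustrative only; it carries the two counters explicitly instead of as object state):

def update_loss_scale(scale, found_inf, incr_count, decr_count,
                      incr_ratio=2.0, decr_ratio=0.5,
                      incr_every_n_steps=1000, decr_every_n_nan_or_inf=1):
    # Shrink the scale after N consecutive steps that saw inf/nan gradients,
    # grow it after N consecutive steps with finite gradients.
    if found_inf:
        incr_count = 0
        decr_count += 1
        if decr_count == decr_every_n_nan_or_inf:
            scale *= decr_ratio
            decr_count = 0
    else:
        decr_count = 0
        incr_count += 1
        if incr_count == incr_every_n_steps:
            scale *= incr_ratio
            incr_count = 0
    return scale, incr_count, decr_count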
......@@ -106,6 +106,7 @@ if (NOT ${WITH_GPU})
list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_se_resnext)
LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding)
LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer)
LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision)
elseif(${CUDNN_VERSION} VERSION_LESS 7100)
LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
endif()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import paddle.fluid as fluid
import numpy as np
import six
from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_parameters, optimizer_setting
class SimpleConv(fluid.dygraph.Layer):
def __init__(self,
num_channels,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
super(SimpleConv, self).__init__()
self._conv = fluid.dygraph.Conv2D(
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=None,
use_cudnn=True)
def forward(self, inputs):
return self._conv(inputs)
class TestAutoCast(unittest.TestCase):
def test_amp_guard_white_op(self):
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
data = fluid.dygraph.to_variable(data)
with fluid.dygraph.amp_guard(True):
out_fp16 = conv2d(data)
with fluid.dygraph.amp_guard(False):
out_fp32 = conv2d(data)
self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32)
self.assertTrue(out_fp16.dtype == fluid.core.VarDesc.VarType.FP16)
self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
def test_amp_guard_black_op(self):
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
data = fluid.dygraph.to_variable(data)
with fluid.dygraph.amp_guard(True):
out_fp32 = fluid.layers.mean(data)
self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32)
self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
def test_custom_op_list(self):
with fluid.dygraph.guard():
tracer = fluid.framework._dygraph_tracer()
base_white_list = fluid.dygraph.amp.auto_cast.WHITE_LIST
base_black_list = fluid.dygraph.amp.auto_cast.BLACK_LIST
with fluid.dygraph.amp_guard(
custom_white_list=["log"], custom_black_list=["conv2d"]):
white_list, black_list = tracer._get_amp_op_list()
self.assertTrue(
set(white_list) ==
(set(base_white_list) | {"log"}) - {"conv2d"})
self.assertTrue(
set(black_list) ==
(set(base_black_list) - {"log"}) | {"conv2d"})
def test_custom_op_list_exception(self):
inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
def func():
with fluid.dygraph.guard():
model = SimpleConv(
num_channels=3,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
with fluid.dygraph.amp_guard(
custom_white_list=["conv2d"],
custom_black_list=["conv2d"]):
inp = fluid.dygraph.to_variable(inp_np)
out = model(inp)
self.assertRaises(ValueError, func)
class TestAmpScaler(unittest.TestCase):
def test_scale(self):
with fluid.dygraph.guard():
data = paddle.rand([10, 1024])
scaler = paddle.fluid.dygraph.AmpScaler(init_loss_scaling=1024)
scaled_data = scaler.scale(data)
self.assertEqual(
np.array_equal(scaled_data.numpy(), data.numpy() * 1024), True)
def test_minimize(self):
inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
def run_simple_conv(inp_np, use_scaler=True):
paddle.manual_seed(10)
with fluid.dygraph.guard():
model = SimpleConv(
num_channels=3,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
optimizer = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters())
scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
data = fluid.dygraph.to_variable(inp_np)
out = model(data)
loss = fluid.layers.mean(out)
if use_scaler:
print('use scaler')
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
optimize_ops, params_grads = scaler.minimize(optimizer,
scaled_loss)
else:
print('use no scaler')
loss.backward()
optimize_ops, params_grads = optimizer.minimize(loss)
return optimize_ops, params_grads
outs_with_scaler = run_simple_conv(inp_np, use_scaler=True)
outs_no_scaler = run_simple_conv(inp_np, use_scaler=False)
self.assertEqual(outs_with_scaler[0],
[]) # optimize_ops is [] in dygraph mode
self.assertEqual(outs_no_scaler[0],
[]) # optimize_ops is [] in dygraph mode
for i in range(len(outs_with_scaler[1])):
# check each grad
self.assertEqual(
np.allclose(outs_with_scaler[1][i][1].numpy(),
outs_no_scaler[1][i][1].numpy()), True)
# check each parameter
self.assertEqual(
np.allclose(outs_with_scaler[1][i][0].numpy(),
outs_no_scaler[1][i][0].numpy()), True)
def test_nan_inf(self):
inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
inp_np[0][1][2][3] = np.nan
with fluid.dygraph.guard():
model = SimpleConv(
num_channels=3,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
params_init = {}
for param in model.parameters():
params_init[param.name] = param.numpy()
optimizer = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters())
scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
data = fluid.dygraph.to_variable(inp_np)
out = model(data)
loss = fluid.layers.mean(out)
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss)
self.assertEqual(scaler._found_inf.numpy() == 1, True)
for param in model.parameters():
# param not update when tensor contains nan or inf
self.assertTrue(
np.array_equal(param.numpy(), params_init[param.name]))
class TestResnet(unittest.TestCase):
def train_resnet(self, enable_amp=True):
seed = 90
batch_size = train_parameters["batch_size"]
batch_num = 1
with fluid.dygraph.guard():
paddle.manual_seed(seed)
resnet = ResNet(use_cudnn=True)
optimizer = optimizer_setting(
train_parameters, parameter_list=resnet.parameters())
np.random.seed(seed)
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False),
batch_size=batch_size)
dy_param_init_value = {}
for param in resnet.parameters():
dy_param_init_value[param.name] = param.numpy()
program = None
scaler = paddle.fluid.dygraph.AmpScaler(
enable=enable_amp, init_loss_scaling=2.**10)
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
dy_x_data = np.array(
[x[0].reshape(3, 224, 224) for x in data]).astype('float32')
if len(np.array([x[1]
for x in data]).astype('int64')) != batch_size:
continue
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-1, 1)
img = fluid.dygraph.to_variable(dy_x_data)
label = fluid.dygraph.to_variable(y_data)
label.stop_gradient = True
with paddle.fluid.dygraph.amp_guard(enable=enable_amp):
out = resnet(img)
loss = fluid.layers.cross_entropy(input=out, label=label)
avg_loss = fluid.layers.mean(x=loss)
dy_out = avg_loss.numpy()
scaled_loss = scaler.scale(avg_loss)
scaled_loss.backward()
scaler.minimize(optimizer, scaled_loss)
dy_grad_value = {}
for param in resnet.parameters():
if param.trainable:
np_array = np.array(param._grad_ivar().value()
.get_tensor())
dy_grad_value[param.name + fluid.core.grad_var_suffix(
)] = np_array
resnet.clear_gradients()
dy_param_value = {}
for param in resnet.parameters():
dy_param_value[param.name] = param.numpy()
return dy_out, dy_param_value, dy_grad_value
def test_resnet(self):
out_fp32 = self.train_resnet(enable_amp=False)
out_amp = self.train_resnet(enable_amp=True)
print(out_fp32[0], out_amp[0])
self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2))
if __name__ == '__main__':
unittest.main()
......@@ -83,7 +83,8 @@ class ConvBNLayer(fluid.Layer):
filter_size,
stride=1,
groups=1,
act=None):
act=None,
use_cudnn=False):
super(ConvBNLayer, self).__init__()
self._conv = Conv2D(
......@@ -94,8 +95,8 @@ class ConvBNLayer(fluid.Layer):
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=None,
use_cudnn=False)
bias_attr=False,
use_cudnn=use_cudnn)
self._batch_norm = BatchNorm(num_filters, act=act)
......@@ -107,32 +108,41 @@ class ConvBNLayer(fluid.Layer):
class BottleneckBlock(fluid.Layer):
def __init__(self, num_channels, num_filters, stride, shortcut=True):
def __init__(self,
num_channels,
num_filters,
stride,
shortcut=True,
use_cudnn=False):
super(BottleneckBlock, self).__init__()
self.conv0 = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters,
filter_size=1,
act='relu')
act='relu',
use_cudnn=use_cudnn)
self.conv1 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu')
act='relu',
use_cudnn=use_cudnn)
self.conv2 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters * 4,
filter_size=1,
act=None)
act=None,
use_cudnn=use_cudnn)
if not shortcut:
self.short = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters * 4,
filter_size=1,
stride=stride)
stride=stride,
use_cudnn=use_cudnn)
self.shortcut = shortcut
......@@ -153,7 +163,7 @@ class BottleneckBlock(fluid.Layer):
class ResNet(fluid.Layer):
def __init__(self, layers=50, class_dim=102):
def __init__(self, layers=50, class_dim=102, use_cudnn=False):
super(ResNet, self).__init__()
self.layers = layers
......@@ -171,7 +181,12 @@ class ResNet(fluid.Layer):
num_filters = [64, 128, 256, 512]
self.conv = ConvBNLayer(
num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
num_channels=3,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
use_cudnn=use_cudnn)
self.pool2d_max = Pool2D(
pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
......@@ -186,7 +201,8 @@ class ResNet(fluid.Layer):
if i == 0 else num_filters[block] * 4,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
shortcut=shortcut))
shortcut=shortcut,
use_cudnn=use_cudnn))
self.bottleneck_block_list.append(bottleneck_block)
shortcut = True
......
......@@ -158,6 +158,7 @@ packages=['paddle',
'paddle.fluid',
'paddle.fluid.dygraph',
'paddle.fluid.dygraph.dygraph_to_static',
'paddle.fluid.dygraph.amp',
'paddle.fluid.proto',
'paddle.fluid.proto.profiler',
'paddle.fluid.distributed',
......