diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 4d602d5c0211e221a99e0e87a3344c5a9c2a0142..3d01e4fe46f10f1c9494026ca1cb21496ed6fe6b 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -2,10 +2,10 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags) cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform) cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function) add_subdirectory(jit) - -cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer) +cc_library(amp SRCS amp_auto_cast.cc DEPS layer ) +cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp) cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator) cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator) cc_library(imperative_profiler SRCS profiler.cc) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc new file mode 100644 index 0000000000000000000000000000000000000000..c980b014b823e21f117bc6e44037349b06a1fdfd --- /dev/null +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
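For orientation, the per-operator casting policy that AutoCastInputs implements in this file reduces to a small decision rule: white-listed ops run in fp16, black-listed ops run in fp32, and everything else is promoted to fp32 only if at least one input is already fp32. The sketch below is illustrative only; WHITE_LIST, BLACK_LIST and choose_dst_dtype are placeholder names, not identifiers added by this patch.

# Illustrative sketch of the casting policy; names here are placeholders,
# not part of the patch. The real lists live in AmpOperators / auto_cast.py.
WHITE_LIST = {'conv2d', 'matmul', 'mul'}            # always cast inputs to fp16
BLACK_LIST = {'mean', 'softmax', 'cross_entropy'}   # always cast inputs to fp32

def choose_dst_dtype(op_type, input_dtypes):
    """Return the dtype the inputs of `op_type` should be cast to."""
    if op_type in WHITE_LIST:
        return 'float16'
    if op_type in BLACK_LIST:
        return 'float32'
    # Promote mode: if any input is already fp32, run the op in fp32,
    # otherwise fp16 is acceptable.
    return 'float32' if 'float32' in input_dtypes else 'float16'

print(choose_dst_dtype('conv2d', ['float32']))                       # float16
print(choose_dst_dtype('mean', ['float16']))                         # float32
print(choose_dst_dtype('elementwise_add', ['float16', 'float32']))   # float32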
+ +#include "paddle/fluid/imperative/amp_auto_cast.h" + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/variable_wrapper.h" + +namespace paddle { +namespace imperative { + +AmpOperators::AmpOperators() + : allow_ops_(new std::unordered_set()), + block_ops_(new std::unordered_set()) {} +AmpOperators::~AmpOperators() {} + +AmpOperators& AmpOperators::Instance() { + static AmpOperators instance; + return instance; +} + +std::shared_ptr> AmpOperators::GetAllowOps() { + return allow_ops_; +} + +std::shared_ptr> AmpOperators::GetBlockOps() { + return block_ops_; +} + +inline std::string GetDtypeStr( + const std::shared_ptr& var) { + return framework::DataTypeToString(var->DataType()); +} + +inline bool NeedCast(const std::shared_ptr& var) { + if (!platform::is_gpu_place(var->Place())) { + return false; + } + if (var->DataType() == framework::proto::VarType::FP32 || + var->DataType() == framework::proto::VarType::FP16) { + return true; + } else { + return false; + } +} + +// NOTE: Trace a cast op, so if a var is casted from fp32 to fp16, then the grad +// var will be cast back from fp16 to fp32 during backward phase. +static inline std::shared_ptr CastToType( + const std::shared_ptr& var, + const framework::proto::VarType::Type dst_type) { + const auto& tracer = imperative::GetCurrentTracer(); + imperative::NameVarBaseMap ins = {{"X", {var}}}; + framework::AttributeMap attrs = {{"in_dtype", var->DataType()}, + {"out_dtype", dst_type}}; + auto out = std::shared_ptr( + new imperative::VarBase(tracer->GenerateUniqueName())); + imperative::NameVarBaseMap outs = {{"Out", {out}}}; + + { + AutoCastGuard guard(tracer, false); + tracer->TraceOp("cast", ins, outs, std::move(attrs)); + } + + return out; +} + +static inline std::shared_ptr CastToFP16( + const std::shared_ptr& var) { + auto dst_type = framework::proto::VarType::FP16; + if (NeedCast(var) && (var->DataType() != dst_type)) { + return CastToType(var, dst_type); + } + return var; +} + +static inline std::shared_ptr CastToFP32( + const std::shared_ptr& var) { + auto dst_type = framework::proto::VarType::FP32; + if (NeedCast(var) && (var->DataType() != dst_type)) { + return CastToType(var, dst_type); + } + return var; +} + +static inline framework::proto::VarType::Type GetPromoteType( + const NameVarBaseMap& ins) { + auto dst_type = framework::proto::VarType::FP16; + for (const auto& pair : ins) { + for (const auto& var : pair.second) { + if (var->DataType() == framework::proto::VarType::FP32) { + dst_type = var->DataType(); + break; + } + } + } + return dst_type; +} + +NameVarBaseMap AutoCastInputs(const std::string& op_type, + const NameVarBaseMap& ins) { + NameVarBaseMap new_ins = {}; + if (AmpOperators::Instance().GetAllowOps()->count(op_type)) { + for (const auto& pair : ins) { + VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " + << GetDtypeStr(*pair.second.cbegin()) << " to float16"; + for (const auto& var : pair.second) { + auto new_var = CastToFP16(var); + new_ins[pair.first].emplace_back(new_var); + } + } + return new_ins; + } else if (AmpOperators::Instance().GetBlockOps()->count(op_type)) { + for (const auto& pair : ins) { + VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " + << GetDtypeStr(*pair.second.cbegin()) << " to float"; + for (const auto& var : pair.second) { + auto new_var = CastToFP32(var); + new_ins[pair.first].emplace_back(new_var); + } + } + return 
new_ins; + } else { + auto dst_type = GetPromoteType(ins); + + for (const auto& pair : ins) { + VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " + << GetDtypeStr(*pair.second.cbegin()) << " to " + << framework::DataTypeToString(dst_type); + for (const auto& var : pair.second) { + // NOTE(zhiqiu): Conv + BN always occur together, we needn't + // cast X of batch_norm to FP32, which is produced by conv as FP16 type. + if (op_type == "batch_norm" && pair.first == "X" && + dst_type == framework::proto::VarType::FP32) { + new_ins[pair.first].emplace_back(var); + continue; + } + auto new_var = dst_type == framework::proto::VarType::FP32 + ? CastToFP32(var) + : CastToFP16(var); + new_ins[pair.first].emplace_back(new_var); + } + } + return new_ins; + } + return ins; +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h new file mode 100644 index 0000000000000000000000000000000000000000..d1da97e5a39057aed3ed0b4a450bd4a4f5c06984 --- /dev/null +++ b/paddle/fluid/imperative/amp_auto_cast.h @@ -0,0 +1,79 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include + +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/type_defs.h" + +namespace paddle { +namespace imperative { + +// Singleton implementation with C++ 11 +class AmpOperators { + public: + ~AmpOperators(); + AmpOperators(const AmpOperators& o) = delete; + const AmpOperators& operator=(const AmpOperators& o) = delete; + + static AmpOperators& Instance(); + + std::shared_ptr> GetAllowOps(); + + std::shared_ptr> GetBlockOps(); + + private: + AmpOperators(); // forbid calling default constructor + + // The set of ops that support fp16 calculation and are considered numerically + // safe and performance critical. These ops are always converted to fp16. + std::shared_ptr> allow_ops_; + + // The set of ops that support fp16 calculation and are considered numerically + // dangerous and whose effects may also be observed in downstream ops. + std::shared_ptr> block_ops_; +}; + +// NOTE(zhiqiu): AutoCastGuard is used for RAII. 
+class AutoCastGuard { + public: + AutoCastGuard(std::shared_ptr tracer, bool guard_mode) + : tracer_(tracer) { + pre_mode_ = tracer_->IsAutoCastEnabled(); + if (pre_mode_ != guard_mode) { + tracer_->SetEnableAutoCast(guard_mode); + } + } + + ~AutoCastGuard() { tracer_->SetEnableAutoCast(pre_mode_); } + + // forbid copy and operator= + AutoCastGuard(const AutoCastGuard& guard) = delete; + AutoCastGuard& operator=(const AutoCastGuard& guard) = delete; + + private: + std::shared_ptr tracer_; + bool pre_mode_; +}; + +NameVarBaseMap AutoCastInputs(const std::string& op_type, + const NameVarBaseMap& ins); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 17a0b5e0431d44176b896fa1b5df4f88cadafe9f..03e83301d44a35dc98e9a1aee0e1b22ef2380d50 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -186,6 +186,8 @@ class VarBase { framework::proto::VarType::Type DataType() const { return var_->DataType(); } + const platform::Place Place() const { return var_->Place(); } + void ClearGradient(); std::shared_ptr NewVarBase(const platform::Place& dst_place, diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index ee4c5617397b39d6847fecd1c884af8b0e14440f..d09cb03360363088bb021285af4574ffbbb81ef0 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -16,6 +16,7 @@ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" @@ -53,8 +54,14 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, attr_checker->Check(&attrs, true); } + NameVarBaseMap new_ins = ins; + if (enable_autocast_) { + VLOG(5) << "Auto mixed precision run operator: " << type; + new_ins = AutoCastInputs(type, ins); + } + try { - OpBase::Run(*op, ins, outs, attrs, place); + OpBase::Run(*op, new_ins, outs, attrs, place); } catch (platform::EnforceNotMet& exception) { framework::AppendErrorOpHint(type, &exception); throw std::move(exception); @@ -73,11 +80,11 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, if (enable_program_desc_tracing_) { VLOG(5) << "Trace op " << type << " into ProgramDesc"; - program_desc_tracer_->InsertOp(type, ins, outs, attrs); + program_desc_tracer_->InsertOp(type, new_ins, outs, attrs); } - if (ComputeRequiredGrad(ins, outs, trace_backward)) { - CreateGradOpNode(*op, ins, outs, attrs, place); + if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { + CreateGradOpNode(*op, new_ins, outs, attrs, place); } else { VLOG(3) << "No Grad to track for Op: " << type; } diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 7652b3aa291ac0063fcc411b5f86f6084f01e8ef..71996b3e1ac998be2c4cd3765591b640765089a0 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -97,6 +97,10 @@ class Tracer { void SetHasGrad(bool has_grad) { has_grad_ = has_grad; } + void SetEnableAutoCast(bool enabled) { enable_autocast_ = enabled; } + + bool IsAutoCastEnabled() const { return enable_autocast_; } + private: std::unique_ptr basic_engine_; std::unique_ptr program_desc_tracer_; @@ -104,6 +108,7 @@ class Tracer { std::unique_ptr generator_; platform::Place expected_place_; bool has_grad_{true}; + bool enable_autocast_{false}; }; // To access static variable 
current_tracer diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 9c2ff39e8675fbe1ca3777731a5d9408bfc765b3..d730ddc12d1053910a36b8491c2ce983f60b3648 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -111,6 +111,28 @@ class VariableWrapper { } } + const platform::Place Place() const { + const framework::Tensor* tensor = nullptr; + auto place = + platform::CPUPlace(); // Default place for var not initialized. + if (var_.IsInitialized()) { + if (type_ == framework::proto::VarType::LOD_TENSOR) { + tensor = &(var_.Get()); + } else if (type_ == framework::proto::VarType::SELECTED_ROWS) { + tensor = &(var_.Get().value()); + } else { + VLOG(6) << "Variable " << name_ << " is not initialized"; + return place; + } + } + if (tensor && tensor->IsInitialized()) { + return tensor->place(); + } else { + VLOG(6) << "The tensor of variable " << name_ << " is not initialized"; + return place; + } + } + private: void SetGradVar(const std::shared_ptr& var) { auto shared_var = grad_var_.lock(); diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc index 01b6ccedcdd8156269d445d7822a4184c062b225..7f0ca1493f712f7f4809a56bf6a23f8757f94c2d 100644 --- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc +++ b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h" + #include #include @@ -67,7 +68,7 @@ class AmpCheckFiniteAndScaleOpMaker : public framework::OpProtoAndCheckerMaker { "amp_check_finite_and_unscale operator.") .AsDuplicable(); AddOutput("FoundInfinite", - "(Tensor) 1-dim tensor, contains a int scalar, which indicates " + "(Tensor) 1-dim tensor, contains a bool scalar, which indicates " "if there there is infinite or nan item in input X."); AddComment(R"DOC( amp_check_finite_and_scale operator. diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu index b92c6901d71bd80c45b0681f62a1a2ddedfcf64a..ee00d7c5f4499867c2c706ddcf314c1bfae0a866 100644 --- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu +++ b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h" #include "paddle/fluid/platform/float16.h" @@ -21,7 +22,7 @@ namespace operators { template __global__ void AmpCheckFiniteAndScale(const T* in, const T* scale, int num, - int* found_inf, T* out) { + bool* found_inf, T* out) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < num) { @@ -44,7 +45,7 @@ class AmpCheckFiniteAndScaleKernel auto* found_inf = ctx.Output("FoundInfinite"); const T* scale_data = scale->data(); - int* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); + bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool)); for (size_t i = 0; i < xs.size(); ++i) { diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 82941c58280560b1c09b149da01ef3d6e8a3f8e0..ac1d2bc1f31d62a2ca9ccb9378bc17ac37d09ec9 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -19,13 +19,17 @@ limitations under the License. */ #include #include #include + #include #include #include #include +#include #include #include + #include "paddle/fluid/imperative/all_reduce.h" +#include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/backward_strategy.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/data_loader.h" @@ -537,8 +541,7 @@ void BindImperative(py::module *m_ptr) { }); py::class_>( - m, "VarBase", - R"DOC()DOC") + m, "VarBase", R"DOC()DOC") .def_static("_alive_vars", &imperative::VarBase::AliveVarNames) .def("__init__", [](imperative::VarBase &self, framework::proto::VarType::Type dtype, @@ -838,13 +841,14 @@ void BindImperative(py::module *m_ptr) { .def("reset", &imperative::jit::ProgramDescTracer::Reset); py::class_>( - m, "Tracer", - R"DOC()DOC") + m, "Tracer", R"DOC()DOC") .def("__init__", [](imperative::Tracer &self) { new (&self) imperative::Tracer(); }) .def_property("_enable_program_desc_tracing", &imperative::Tracer::IsProgramDescTracingEnabled, &imperative::Tracer::SetEnableProgramDescTracing) + .def_property("_enable_autocast", &imperative::Tracer::IsAutoCastEnabled, + &imperative::Tracer::SetEnableAutoCast) .def_property("_train_mode", &imperative::Tracer::HasGrad, &imperative::Tracer::SetHasGrad) .def_property( @@ -874,6 +878,26 @@ void BindImperative(py::module *m_ptr) { py::return_value_policy::reference) .def("_generate_unique_name", &imperative::Tracer::GenerateUniqueName, py::arg("key") = "eager_tmp") + .def( + "_set_amp_op_list", + [](imperative::Tracer &self, + std::unordered_set &allow_ops, + std::unordered_set &block_ops) { + // NOTE(zhiqiu): The automatic conversion in pybind11 between c++ + // STL and python set/list/dict involve a copy operation that + // prevents pass-by-reference semantics, so it is ok to swap. + // The reaseon why not directly pass + // std::shared_ptr> + // is that pybind11 forbid shared_ptr where T is not custom type. 
+ imperative::AmpOperators::Instance().GetAllowOps()->swap(allow_ops); + imperative::AmpOperators::Instance().GetBlockOps()->swap(block_ops); + }) + .def("_get_amp_op_list", + [](imperative::Tracer &self) { + return std::make_tuple( + *(imperative::AmpOperators::Instance().GetAllowOps()), + *(imperative::AmpOperators::Instance().GetBlockOps())); + }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py index f990d02342be78fe998cebfa40ed8b348cf54b2a..20f48db0808b04d09ddd4537cbec2af939ad7692 100644 --- a/python/paddle/fluid/dygraph/__init__.py +++ b/python/paddle/fluid/dygraph/__init__.py @@ -56,6 +56,9 @@ from .dygraph_to_static import ProgramTranslator from . import rnn from .rnn import * +from . import amp +from .amp import * + __all__ = [] __all__ += layers.__all__ __all__ += base.__all__ @@ -69,3 +72,4 @@ __all__ += jit.__all__ __all__ += io.__all__ __all__ += rnn.__all__ __all__ += ['ProgramTranslator'] +__all__ += amp.__all__ diff --git a/python/paddle/fluid/dygraph/amp/__init__.py b/python/paddle/fluid/dygraph/amp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e86c5a20c5a411fda2a0011f63f4b5254e9bd07a --- /dev/null +++ b/python/paddle/fluid/dygraph/amp/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import auto_cast +from .auto_cast import * + +from . import loss_scaler +from .loss_scaler import * + +__all__ = [] +__all__ += auto_cast.__all__ +__all__ += loss_scaler.__all__ diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py new file mode 100644 index 0000000000000000000000000000000000000000..ffb4d9f16f29f384b83f175ddcb60f65e8077930 --- /dev/null +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -0,0 +1,166 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
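The amp_guard context manager defined in this module accepts custom_white_list and custom_black_list arguments for moving individual operators between the two lists (an op may appear in only one of them, otherwise a ValueError is raised). A minimal usage sketch, assuming a CUDA device; on CPU the guard only emits a warning and disables itself:

import numpy as np
import paddle.fluid as fluid

data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
    conv2d = fluid.dygraph.Conv2D(3, 2, 3)
    x = fluid.dygraph.to_variable(data)
    # Add 'elementwise_add' to the white list and force 'conv2d' to fp32
    # for this guarded region only.
    with fluid.dygraph.amp_guard(custom_white_list=['elementwise_add'],
                                 custom_black_list=['conv2d']):
        out = conv2d(x)
        print(out.dtype)  # expected VarType.FP32, since conv2d is black-listed here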
+ +from __future__ import print_function +from paddle.fluid.wrapped_decorator import signature_safe_contextmanager, wrap_decorator +from paddle.fluid import core +import contextlib +from paddle.fluid.framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, dygraph_only, set_flags, get_flags +import warnings +import copy + +__all__ = ['amp_guard'] + +# The set of ops that support fp16 calculation and are considered numerically- +# safe and performance-critical. These ops are always converted to fp16. +WHITE_LIST = { + 'conv2d', + 'matmul', + 'mul', +} + +# The set of ops that support fp16 calculation and are considered numerically- +# dangerous and whose effects may also be observed in downstream ops. +BLACK_LIST = { + 'exp', + 'square', + 'log', + 'mean', + 'sum', + 'cos_sim', + 'softmax', + 'softmax_with_cross_entropy', + 'sigmoid_cross_entropy_with_logits', + 'cross_entropy', + 'cross_entropy2', +} + +AMP_RELATED_FLAGS = [ + 'FLAGS_cudnn_exhaustive_search', + 'FLAGS_conv_workspace_size_limit', + 'FLAGS_cudnn_batchnorm_spatial_persistent', +] + +AMP_RELATED_FLAGS_SETTING = { + 'FLAGS_cudnn_exhaustive_search': 1, + 'FLAGS_conv_workspace_size_limit': 1000, + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, +} + + +#NOTE(zhiqiu): similar as paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists._update_list +# The reason why not use AutoMixedPrecisionLists is that custom_black_varnames is not suitable for imperative mode. +def _update_list(custom_white_list, custom_black_list): + """ + Update black and white list according to users' custom list. + """ + _white_list = copy.copy(WHITE_LIST) + _black_list = copy.copy(BLACK_LIST) + if custom_white_list and custom_black_list: + for op_name in custom_white_list: + if op_name in custom_black_list: + raise ValueError("Custom white list overlap " + "custom black list") + if custom_white_list: + for op_name in custom_white_list: + if op_name in _black_list: + _black_list.remove(op_name) + _white_list.add(op_name) + if custom_black_list: + for op_name in custom_black_list: + if op_name in _white_list: + _white_list.remove(op_name) + _black_list.add(op_name) + return _white_list, _black_list + + +@signature_safe_contextmanager +@dygraph_only +def amp_guard(enable=True, custom_white_list=None, custom_black_list=None): + """ + :api_attr: imperative + + Create a context which enables auto-mixed-precision(AMP) of operators executed in imperative mode. + If enabled, the input data type (float32 or float16) of each operator is decided + by autocast algorithm for better performance. + + Commonly, it is used together with `AmpScaler` to achieve Auto-Mixed-Precision in + imperative mode. + + Args: + enable(bool, optional): Enable auto-mixed-precision or not. Default is True. + custom_white_list(set|list, optional): The custom white_list. + custom_black_list(set|list, optional): The custom black_list. + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle.fluid as fluid + + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + conv2d = fluid.dygraph.Conv2D(3, 2, 3) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(): + conv = conv2d(data) + print(conv.dtype) # FP16 + with fluid.dygraph.amp_guard(enable=False): + conv = conv2d(data) + print(conv.dtype) # FP32 + + """ + tracer = _dygraph_tracer() + if not tracer: + raise ValueError( + "current_tracer is None, maybe it is not in imperative mode.") + + if enable and not tracer._expected_place.is_gpu_place(): + warnings.warn( + 'amp_guard can only be enabled on CUDAPlace, current place is %s, so it makes no effect.' + % tracer._expected_place) + enable = False + + # use default white_list and black_list if no custom lists provided + _white_list = WHITE_LIST + _black_list = BLACK_LIST + if custom_white_list or custom_black_list: + _white_list, _black_list = _update_list(custom_white_list, + custom_black_list) + + if tracer: + # enable auto_cast + original_enable = tracer._enable_autocast + tracer._enable_autocast = enable + # set amp op list + original_white_list, original_black_list = tracer._get_amp_op_list() + tracer._set_amp_op_list(_white_list, _black_list) + + # TODO(zhiqiu) set amp related flags automatically in this guard + # Currently, if FLAGS_cudnn_batchnorm_spatial_persistent is set True in amp_guard, + # batch_norm can run in fast mode, but batch_norm_grad can not if backward if not executed insise amp_guard. + # So, users need to set related flags manually. + + # original_flags = get_flags(AMP_RELATED_FLAGS) + # set_flags(AMP_RELATED_FLAGS_SETTING) + + # restore status + try: + yield + finally: + if tracer: + tracer._enable_autocast = original_enable + tracer._set_amp_op_list(original_white_list, original_black_list) + # set_flags(original_flags) diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py new file mode 100644 index 0000000000000000000000000000000000000000..8f3ca9ec007ef5c1ab8769dde741a5d2b3697600 --- /dev/null +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -0,0 +1,246 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from paddle.fluid import core +from paddle.fluid.dygraph import to_variable +from paddle.fluid.framework import _varbase_creator, _dygraph_tracer, dygraph_only +from paddle.fluid.data_feeder import check_type +from ...wrapped_decorator import signature_safe_contextmanager, wrap_decorator +import warnings +import numpy as np + +__all__ = ['AmpScaler'] + + +class AmpScaler(object): + """ + :api_attr: imperative + + AmpScaler is used for Auto-Mixed-Precision training/inferring in imperative + mode. It controls the scaling of loss, helps avoiding numerical overflow. + The object of this class has two methods `scale()`, `minimize()`. 
+ + `scale()` is used to multiply the loss by a scale ratio. + `minimize()` is similar as `Optimizer.minimize()`, performs parameters updating. + + Commonly, it is used together with `amp_guard` to achieve Auto-Mixed-Precision in + imperative mode. + + Args: + enable(bool, optional): Enable loss scaling or not. Default is True. + init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15. + incr_ratio(float, optional): The multiplier to use when increasing the loss + scaling. Default is 2.0. + decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing + the loss scaling. Default is 0.5. + incr_every_n_steps(int, optional): Increases loss scaling every n consecutive + steps with finite gradients. Default is 1000. + decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n + accumulated steps with nan or inf gradients. Default is 2. + use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True. + Returns: + An AmpScaler object. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle.fluid as fluid + + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3) + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(): + conv = model(data) + loss = fluid.layers.reduce_mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + """ + + @dygraph_only + def __init__(self, + enable=True, + init_loss_scaling=2.**15, + incr_ratio=2.0, + decr_ratio=0.5, + incr_every_n_steps=1000, + decr_every_n_nan_or_inf=1, + use_dynamic_loss_scaling=True): + + tracer = _dygraph_tracer() + if not tracer: + raise ValueError( + "current_tracer is None, maybe it is not in imperative mode.") + + if enable and not tracer._expected_place.is_gpu_place(): + warnings.warn( + 'AmpScaler can only be enabled on CUDAPlace, current place is %s, so it makes no effect.' + % tracer._expected_place) + enable = False + + self._enable = enable + + if self._enable: + assert incr_ratio > 1.0, "The incr_ratio must be > 1.0." + assert decr_ratio < 1.0, "The decr_ratio must be < 1.0." + + self._init_loss_scaling = init_loss_scaling + self._incr_ratio = incr_ratio + self._decr_ratio = decr_ratio + self._incr_every_n_steps = incr_every_n_steps + self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf + self._incr_count = 0 + self._decr_count = 0 + self._use_dynamic_loss_scaling = use_dynamic_loss_scaling + + self._found_inf = to_variable(np.array([0]).astype(np.bool)) + self._scale = to_variable( + np.array([self._init_loss_scaling]).astype(np.float32)) + self._cache_founf_inf = None + + def scale(self, var): + """ + Multiplies a variable(Tensor) by the scale factor and returns scaled outputs. + If this instance of :class:`AmpScaler` is not enabled, output are returned unmodified. + + Args: + var (Variable): The variable to scale. + Returns: + The scaled variable or original variable. + + Examples: + .. 
code-block:: python + + import numpy as np + import paddle.fluid as fluid + + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3) + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(): + conv = model(data) + loss = fluid.layers.reduce_mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + """ + check_type(var, "var", core.VarBase, 'AmpScaler.scale()') + + if not self._enable: + return var + + return var * self._scale + + def minimize(self, optimizer, *args, **kwargs): + """ + This function is similar as `Optimizer.minimize()`, which performs parameters updating. + + If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. + Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters. + + Finally, the loss scaling ratio is updated. + + Args: + optimizer(Optimizer): The optimizer used to update parameters. + args: Arguments, which will be forward to `optimizer.minimize()`. + kwargs: Keyword arguments, which will be forward to `Optimizer.minimize()`. + + Examples: + .. code-block:: python + + import numpy as np + import paddle.fluid as fluid + + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3) + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(): + conv = model(data) + loss = fluid.layers.reduce_mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + """ + if not self._enable: + return optimizer.minimize(*args, **kwargs) + + # unscale the grad + self._unscale(optimizer) + + optimize_ops, params_grads = (None, None) + + if self._found_inf: + self._cache_founf_inf = True + else: + optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) + self._cache_founf_inf = False + + if self._use_dynamic_loss_scaling: + # uopdate the scale + self._update() + + return optimize_ops, params_grads + + def _unscale(self, optimizer): + if not self._enable: + return + inv_scale = 1.0 / self._scale + param_grads = [ + param._grad_ivar() for param in optimizer._parameter_list + if param._grad_ivar() is not None + ] + core.ops.amp_check_finite_and_scale(param_grads, inv_scale, param_grads, + self._found_inf) + + def _update(self): + """ + Updates the loss_scaling. + """ + if not self._enable: + return + + if self._cache_founf_inf: + self._incr_count = 0 + self._decr_count = self._decr_count + 1 + if self._decr_count == self._decr_every_n_nan_or_inf: + print( + 'Found inf or nan, current scale is: {}, decrease to: {}*{}'. 
+ format( + float(self._scale), + float(self._scale), float(self._decr_ratio))) + self._scale = self._scale * self._decr_ratio + self._decr_count = 0 + else: + self._decr_count = 0 + self._incr_count = self._incr_count + 1 + if self._incr_count == self._incr_every_n_steps: + self._scale = self._scale * self._incr_ratio + self._incr_count = 0 + + return diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 5ce11144871ff5b46e50f93bf120a8d1b0120b9c..f6ac452c82c66132d8e0b41647a05daaa3de2a74 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -106,6 +106,7 @@ if (NOT ${WITH_GPU}) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_se_resnext) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer) + LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) elseif(${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) endif() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py new file mode 100644 index 0000000000000000000000000000000000000000..a4f3858d6fb242b8689bd1d300861faf8ed73e54 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -0,0 +1,273 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
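The tests in this file exercise the combined amp_guard + AmpScaler workflow. Condensed from the docstring examples above, a typical training step looks roughly like this (again assuming a CUDA device, otherwise both features fall back to plain fp32 execution with a warning):

import numpy as np
import paddle.fluid as fluid

data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
    model = fluid.dygraph.Conv2D(3, 2, 3)
    optimizer = fluid.optimizer.SGDOptimizer(
        learning_rate=0.01, parameter_list=model.parameters())
    scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
    x = fluid.dygraph.to_variable(data)
    with fluid.dygraph.amp_guard():
        loss = fluid.layers.reduce_mean(model(x))
    scaled = scaler.scale(loss)          # multiply the loss by the current scale
    scaled.backward()                    # gradients are computed in scaled form
    scaler.minimize(optimizer, scaled)   # unscale, skip the step on inf/nan, update scale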
+ +import unittest +import paddle +import paddle.fluid as fluid +import numpy as np +import six +from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_parameters, optimizer_setting + + +class SimpleConv(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(SimpleConv, self).__init__() + self._conv = fluid.dygraph.Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=None, + use_cudnn=True) + + def forward(self, inputs): + return self._conv(inputs) + + +class TestAutoCast(unittest.TestCase): + def test_amp_guard_white_op(self): + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(True): + out_fp16 = conv2d(data) + + with fluid.dygraph.amp_guard(False): + out_fp32 = conv2d(data) + + self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32) + self.assertTrue(out_fp16.dtype == fluid.core.VarDesc.VarType.FP16) + self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32) + + def test_amp_guard_black_op(self): + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(True): + out_fp32 = fluid.layers.mean(data) + + self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32) + self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32) + + def test_custom_op_list(self): + with fluid.dygraph.guard(): + tracer = fluid.framework._dygraph_tracer() + base_white_list = fluid.dygraph.amp.auto_cast.WHITE_LIST + base_black_list = fluid.dygraph.amp.auto_cast.BLACK_LIST + with fluid.dygraph.amp_guard( + custom_white_list=["log"], custom_black_list=["conv2d"]): + white_list, black_list = tracer._get_amp_op_list() + self.assertTrue( + set(white_list) == + (set(base_white_list) | {"log"}) - {"conv2d"}) + + self.assertTrue( + set(black_list) == + (set(base_black_list) - {"log"}) | {"conv2d"}) + + def test_custom_op_list_exception(self): + inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) + + def func(): + with fluid.dygraph.guard(): + model = SimpleConv( + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + + with fluid.dygraph.amp_guard( + custom_white_list=["conv2d"], + custom_black_list=["conv2d"]): + inp = fluid.dygraph.to_variable(inp_np) + out = model(inp) + + self.assertRaises(ValueError, func) + + +class TestAmpScaler(unittest.TestCase): + def test_scale(self): + with fluid.dygraph.guard(): + data = paddle.rand([10, 1024]) + scaler = paddle.fluid.dygraph.AmpScaler(init_loss_scaling=1024) + scaled_data = scaler.scale(data) + self.assertEqual( + np.array_equal(scaled_data.numpy(), data.numpy() * 1024), True) + + def test_minimize(self): + inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) + + def run_simple_conv(inp_np, use_scaler=True): + paddle.manual_seed(10) + with fluid.dygraph.guard(): + model = SimpleConv( + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(inp_np) + + out = 
model(data) + loss = fluid.layers.mean(out) + if use_scaler: + print('use scaler') + scaled_loss = scaler.scale(loss) + scaled_loss.backward() + optimize_ops, params_grads = scaler.minimize(optimizer, + scaled_loss) + else: + print('use no scaler') + loss.backward() + optimize_ops, params_grads = optimizer.minimize(loss) + return optimize_ops, params_grads + + outs_with_scaler = run_simple_conv(inp_np, use_scaler=True) + outs_no_scaler = run_simple_conv(inp_np, use_scaler=False) + + self.assertEqual(outs_with_scaler[0], + []) # optimize_ops is [] in dygraph mode + self.assertEqual(outs_no_scaler[0], + []) # optimize_ops is [] in dygraph mode + for i in range(len(outs_with_scaler[1])): + # check each grad + self.assertEqual( + np.allclose(outs_with_scaler[1][i][1].numpy(), + outs_no_scaler[1][i][1].numpy()), True) + # check each parameter + self.assertEqual( + np.allclose(outs_with_scaler[1][i][0].numpy(), + outs_no_scaler[1][i][0].numpy()), True) + + def test_nan_inf(self): + inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) + inp_np[0][1][2][3] = np.nan + with fluid.dygraph.guard(): + model = SimpleConv( + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + params_init = {} + for param in model.parameters(): + params_init[param.name] = param.numpy() + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(inp_np) + + out = model(data) + loss = fluid.layers.mean(out) + scaled_loss = scaler.scale(loss) + scaled_loss.backward() + optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss) + self.assertEqual(scaler._found_inf.numpy() == 1, True) + + for param in model.parameters(): + # param not update when tensor contains nan or inf + self.assertTrue( + np.array_equal(param.numpy(), params_init[param.name])) + + +class TestResnet(unittest.TestCase): + def train_resnet(self, enable_amp=True): + seed = 90 + + batch_size = train_parameters["batch_size"] + batch_num = 1 + + with fluid.dygraph.guard(): + paddle.manual_seed(seed) + + resnet = ResNet(use_cudnn=True) + optimizer = optimizer_setting( + train_parameters, parameter_list=resnet.parameters()) + np.random.seed(seed) + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) + + dy_param_init_value = {} + for param in resnet.parameters(): + dy_param_init_value[param.name] = param.numpy() + + program = None + scaler = paddle.fluid.dygraph.AmpScaler( + enable=enable_amp, init_loss_scaling=2.**10) + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + dy_x_data = np.array( + [x[0].reshape(3, 224, 224) for x in data]).astype('float32') + if len(np.array([x[1] + for x in data]).astype('int64')) != batch_size: + continue + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + -1, 1) + img = fluid.dygraph.to_variable(dy_x_data) + label = fluid.dygraph.to_variable(y_data) + label.stop_gradient = True + with paddle.fluid.dygraph.amp_guard(enable=enable_amp): + out = resnet(img) + + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + + dy_out = avg_loss.numpy() + + scaled_loss = scaler.scale(avg_loss) + scaled_loss.backward() + + scaler.minimize(optimizer, scaled_loss) + + dy_grad_value = {} + for param in resnet.parameters(): + if param.trainable: + np_array = np.array(param._grad_ivar().value() + .get_tensor()) + 
dy_grad_value[param.name + fluid.core.grad_var_suffix( + )] = np_array + + resnet.clear_gradients() + + dy_param_value = {} + for param in resnet.parameters(): + dy_param_value[param.name] = param.numpy() + + return dy_out, dy_param_value, dy_grad_value + + def test_resnet(self): + out_fp32 = self.train_resnet(enable_amp=False) + out_amp = self.train_resnet(enable_amp=True) + print(out_fp32[0], out_amp[0]) + self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 106f58ccc99ffe42b77466e6dbf7b773ecee4ee2..815437072fde291b8d8348dba0b4b0ae872ec1b9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -83,7 +83,8 @@ class ConvBNLayer(fluid.Layer): filter_size, stride=1, groups=1, - act=None): + act=None, + use_cudnn=False): super(ConvBNLayer, self).__init__() self._conv = Conv2D( @@ -94,8 +95,8 @@ class ConvBNLayer(fluid.Layer): padding=(filter_size - 1) // 2, groups=groups, act=None, - bias_attr=None, - use_cudnn=False) + bias_attr=False, + use_cudnn=use_cudnn) self._batch_norm = BatchNorm(num_filters, act=act) @@ -107,32 +108,41 @@ class ConvBNLayer(fluid.Layer): class BottleneckBlock(fluid.Layer): - def __init__(self, num_channels, num_filters, stride, shortcut=True): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + use_cudnn=False): super(BottleneckBlock, self).__init__() self.conv0 = ConvBNLayer( num_channels=num_channels, num_filters=num_filters, filter_size=1, - act='relu') + act='relu', + use_cudnn=use_cudnn) self.conv1 = ConvBNLayer( num_channels=num_filters, num_filters=num_filters, filter_size=3, stride=stride, - act='relu') + act='relu', + use_cudnn=use_cudnn) self.conv2 = ConvBNLayer( num_channels=num_filters, num_filters=num_filters * 4, filter_size=1, - act=None) + act=None, + use_cudnn=use_cudnn) if not shortcut: self.short = ConvBNLayer( num_channels=num_channels, num_filters=num_filters * 4, filter_size=1, - stride=stride) + stride=stride, + use_cudnn=use_cudnn) self.shortcut = shortcut @@ -153,7 +163,7 @@ class BottleneckBlock(fluid.Layer): class ResNet(fluid.Layer): - def __init__(self, layers=50, class_dim=102): + def __init__(self, layers=50, class_dim=102, use_cudnn=False): super(ResNet, self).__init__() self.layers = layers @@ -171,7 +181,12 @@ class ResNet(fluid.Layer): num_filters = [64, 128, 256, 512] self.conv = ConvBNLayer( - num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu') + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu', + use_cudnn=use_cudnn) self.pool2d_max = Pool2D( pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') @@ -186,7 +201,8 @@ class ResNet(fluid.Layer): if i == 0 else num_filters[block] * 4, num_filters=num_filters[block], stride=2 if i == 0 and block != 0 else 1, - shortcut=shortcut)) + shortcut=shortcut, + use_cudnn=use_cudnn)) self.bottleneck_block_list.append(bottleneck_block) shortcut = True diff --git a/python/setup.py.in b/python/setup.py.in index a2628cac51af62c59c05484ecbf2d2b52c9bf859..43a57ae5463698947dfa9cef2fd2186e79733ba0 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -158,6 +158,7 @@ packages=['paddle', 'paddle.fluid', 'paddle.fluid.dygraph', 'paddle.fluid.dygraph.dygraph_to_static', + 'paddle.fluid.dygraph.amp', 'paddle.fluid.proto', 
'paddle.fluid.proto.profiler', 'paddle.fluid.distributed',