Unverified · Commit e6af9bd2 · Authored by yuehuayingxueluo, committed by GitHub

Rename MultiTensorAdam To FusedAdam (#50449)

* rename multi_tensor_adam to fused_adam

* fix some bugs

* fix CI coverage

* rename test_fused_adam.py

* fix some bug

* add test_fused_adam_op.py

* fix some bugs

* fix fused_adam_op.cc

* fix CI bugs

* fix CI bug

* fix CI bug
Parent: f803b239
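For orientation, `fused_adam` applies one Adam step to an entire list of parameters in a single fused op. A minimal NumPy sketch of the per-tensor update it performs (plain Adam only; the AdamW weight-decay path and multi-precision handling are left out), mirroring the `fused_adam_step` reference added in `test_fused_adam_op.py` below:

```python
import numpy as np

def adam_step(param, grad, m1, m2, lr, beta1, beta2, beta1_pow, beta2_pow, eps):
    """One plain-Adam update for a single parameter tensor."""
    m1 = beta1 * m1 + (1 - beta1) * grad
    m2 = beta2 * m2 + (1 - beta2) * np.square(grad)
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)  # bias correction
    return param - lr_t * m1 / (np.sqrt(m2) + eps), m1, m2

# fused_adam applies this update to every tensor in a parameter list at once,
# instead of launching one op per parameter.
```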
@@ -22,19 +22,34 @@ namespace operators {
 
 using Tensor = phi::DenseTensor;
 
-class MultiTensorAdamOp : public framework::OperatorWithKernel {
+class FusedAdamOp : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;
 
  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
+      const framework::ExecutionContext &ctx) const override {
    auto param_dtype =
        framework::OperatorWithKernel::IndicateVarDataType(ctx, "Params");
    return phi::KernelKey(param_dtype, ctx.GetPlace());
  }
+
+  phi::KernelKey GetKernelTypeForVar(
+      const std::string &var_name,
+      const phi::DenseTensor &tensor,
+      const phi::KernelKey &expected_kernel_type) const override {
+    if (var_name == "Beta1Pows" || var_name == "Beta2Pows" ||
+        var_name == "SkipUpdate") {
+      return phi::KernelKey(phi::Backend::ALL_BACKEND,
+                            expected_kernel_type.layout(),
+                            expected_kernel_type.dtype());
+    } else {
+      return phi::KernelKey(
+          tensor.place(), tensor.layout(), expected_kernel_type.dtype());
+    }
+  }
 };
 
-class MultiTensorAdamOpMaker : public framework::OpProtoAndCheckerMaker {
+class FusedAdamOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
  void Make() override {
    AddInput("Params", "(Tensor) Input parameters").AsDuplicable();
@@ -144,13 +159,13 @@ $$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DECLARE_INFER_SHAPE_FUNCTOR(multi_tensor_adam,
-                            MultiTensorAdamInferShapeFunctor,
-                            PD_INFER_META(phi::MultiTensorAdamInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(fused_adam,
+                            FusedAdamInferShapeFunctor,
+                            PD_INFER_META(phi::FusedAdamInferMeta));
 
 REGISTER_OPERATOR(
-    multi_tensor_adam,
-    ops::MultiTensorAdamOp,
-    ops::MultiTensorAdamOpMaker,
+    fused_adam,
+    ops::FusedAdamOp,
+    ops::FusedAdamOpMaker,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
-    MultiTensorAdamInferShapeFunctor);
+    FusedAdamInferShapeFunctor);
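The newly added `GetKernelTypeForVar` override exempts `Beta1Pows`, `Beta2Pows`, and `SkipUpdate` from automatic device transform: the beta-power accumulators may legitimately live on the CPU even when the kernel runs on the GPU (note the `CPUBetaPows` template specializations in the GPU kernel further down). A tiny illustrative sketch of what those accumulators hold, assuming each step multiplies them in place:

```python
# Illustrative only: what the Beta1Pows/Beta2Pows accumulators contain.
# After t steps each holds beta**t, which the update uses for bias correction.
beta1, beta2 = 0.9, 0.99
beta1_pow, beta2_pow = 1.0, 1.0
for step in range(5):
    beta1_pow *= beta1  # beta1 ** (step + 1)
    beta2_pow *= beta2  # beta2 ** (step + 1)
    # the reference step below then uses:
    # lr_t = lr * sqrt(1 - beta2_pow) / (1 - beta1_pow)
```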
@@ -166,7 +166,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
      "Beta1Pow",
      "Beta2Pow",
      "MasterParam"}},
-    {"multi_tensor_adam",
+    {"fused_adam",
     {"Params",
      "Grads",
      "LearningRate",
@@ -332,7 +332,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
      "Beta1PowOut",
      "Beta2PowOut",
      "MasterParamOut"}},
-    {"multi_tensor_adam",
+    {"fused_adam",
     {"ParamsOut",
      "Moments1Out",
      "Moments2Out",
@@ -400,7 +400,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
      "Beta1PowOut",
      "Beta2PowOut",
      "MasterParamOut"}},
-    {"multi_tensor_adam",
+    {"fused_adam",
     {"ParamsOut",
      "Moments1Out",
      "Moments2Out",
...
@@ -781,6 +781,17 @@
  data_transform :
    skip_transform : x
 
+- op : fused_adam_
+  args : (Tensor[] params, Tensor[] grads, Tensor learning_rate, Tensor[] moments1, Tensor[] moments2, Tensor[] beta1_pows, Tensor[] beta2_pows, Tensor[] master_params, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, int chunk_size, float weight_decay, bool use_adamw, bool multi_precision, bool use_global_beta_pow)
+  output : Tensor[](params_out){params.size()}, Tensor[](moments1_out){params.size()}, Tensor[](moments2_out){params.size()}, Tensor[](beta1_pows_out){params.size()}, Tensor[](beta2_pows_out){params.size()}, Tensor[](master_params_out){params.size()}
+  infer_meta :
+    func : FusedAdamInferMeta
+  kernel :
+    func : fused_adam
+    data_type : params
+  optional : skip_update, master_params
+  inplace : (params -> params_out), (moments1 -> moments1_out), (moments2 -> moments2_out), (beta1_pows -> beta1_pows_out), (beta2_pows -> beta2_pows_out), (master_params -> master_params_out)
+
 - op : gather
  args : (Tensor x, Tensor index, Scalar(int) axis=0)
  output : Tensor(out)
@@ -1237,17 +1248,6 @@
  optional : master_param
  inplace : (param -> param_out), (velocity -> velocity_out), (master_param -> master_param_out)
 
-- op : multi_tensor_adam_
-  args : (Tensor[] params, Tensor[] grads, Tensor learning_rate, Tensor[] moments1, Tensor[] moments2, Tensor[] beta1_pows, Tensor[] beta2_pows, Tensor[] master_params, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, int chunk_size, float weight_decay, bool use_adamw, bool multi_precision, bool use_global_beta_pow)
-  output : Tensor[](params_out){params.size()}, Tensor[](moments1_out){params.size()}, Tensor[](moments2_out){params.size()}, Tensor[](beta1_pows_out){params.size()}, Tensor[](beta2_pows_out){params.size()}, Tensor[](master_params_out){params.size()}
-  infer_meta :
-    func : MultiTensorAdamInferMeta
-  kernel :
-    func : multi_tensor_adam
-    data_type : params
-  optional : skip_update, master_params
-  inplace : (params -> params_out), (moments1 -> moments1_out), (moments2 -> moments2_out), (beta1_pows -> beta1_pows_out), (beta2_pows -> beta2_pows_out), (master_params -> master_params_out)
-
 - op : multiclass_nms3
  args : (Tensor bboxes, Tensor scores, Tensor rois_num, float score_threshold, int nms_top_k, int keep_top_k, float nms_threshold=0.3, bool normalized=true, float nms_eta=1.0, int background_label=0)
  output : Tensor(out), Tensor(index), Tensor(nms_rois_num)
...
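With this YAML entry in place, Paddle's op code generation should expose a dygraph binding for the new op. A hedged sketch of a direct call follows; the `paddle._C_ops.fused_adam_` name and the exact calling convention are assumptions based on the usual codegen conventions, not something shown in this diff:

```python
import paddle

n = 10
params = [paddle.randn([102, 105]) for _ in range(n)]
grads = [paddle.randn([102, 105]) for _ in range(n)]
moments1 = [paddle.zeros([102, 105]) for _ in range(n)]
moments2 = [paddle.zeros([102, 105]) for _ in range(n)]
beta1_pows = [paddle.to_tensor([0.9]) for _ in range(n)]
beta2_pows = [paddle.to_tensor([0.99]) for _ in range(n)]
lr = paddle.to_tensor([0.004])

# Argument order follows the `args` line above; master_params and
# skip_update are declared optional, so None is passed here.
paddle._C_ops.fused_adam_(
    params, grads, lr, moments1, moments2, beta1_pows, beta2_pows,
    None,             # master_params (optional)
    None,             # skip_update (optional)
    0.9, 0.99, 1e-8,  # beta1, beta2, epsilon
    32 * 2048,        # chunk_size
    0.0,              # weight_decay
    False,            # use_adamw
    False,            # multi_precision
    False,            # use_global_beta_pow
)
```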
@@ -2982,7 +2982,7 @@ void YoloLossInferMeta(const MetaTensor& x,
  gt_match_mask->set_dtype(x.dtype());
 }
 
-void MultiTensorAdamInferMeta(
+void FusedAdamInferMeta(
    const std::vector<const MetaTensor*>& params,
    const std::vector<const MetaTensor*>& grads,
    const MetaTensor& learning_rate,
...
@@ -533,7 +533,7 @@ void YoloLossInferMeta(const MetaTensor& x,
    MetaTensor* objectness_mask,
    MetaTensor* gt_match_mask);
 
-void MultiTensorAdamInferMeta(
+void FusedAdamInferMeta(
    const std::vector<const MetaTensor*>& params,
    const std::vector<const MetaTensor*>& grads,
    const MetaTensor& learning_rate,
...
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/multi_tensor_adam_kernel.h"
+#include "paddle/phi/kernels/fused_adam_kernel.h"
 
 #include <vector>
 #include "paddle/phi/core/kernel_registry.h"
@@ -29,7 +29,7 @@ static paddle::optional<DenseTensor> TensorPtrToOptionalTensor(
 }
 
 template <typename T, typename Context>
-void MultiTensorAdamKernel(
+void FusedAdamKernel(
    const Context& dev_ctx,
    const std::vector<const DenseTensor*>& params,
    const std::vector<const DenseTensor*>& grads,
@@ -157,9 +157,5 @@ void MultiTensorAdamKernel(
 }  // namespace phi
 
-PD_REGISTER_KERNEL(multi_tensor_adam,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::MultiTensorAdamKernel,
-                   float,
-                   double) {}
+PD_REGISTER_KERNEL(
+    fused_adam, CPU, ALL_LAYOUT, phi::FusedAdamKernel, float, double) {}
@@ -20,7 +20,7 @@
 namespace phi {
 
 template <typename T, typename Context>
-void MultiTensorAdamKernel(
+void FusedAdamKernel(
    const Context &dev_ctx,
    const std::vector<const DenseTensor *> &params,
    const std::vector<const DenseTensor *> &grads,
...
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/multi_tensor_adam_kernel.h"
+#include "paddle/phi/kernels/fused_adam_kernel.h"
 
 #include <vector>
 #include "glog/logging.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
@@ -31,9 +31,9 @@ namespace phi {
 // https://github.com/NVIDIA/apex
 
 template <typename T, bool CPUBetaPows /*=true*/>
-struct MultiTensorAdamBetaPowInfo {
+struct FusedAdamBetaPowInfo {
  using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
-  MultiTensorAdamBetaPowInfo(const MPDType* beta1pow, const MPDType* beta2pow) {
+  FusedAdamBetaPowInfo(const MPDType* beta1pow, const MPDType* beta2pow) {
    beta1pow_ = *beta1pow;
    beta2pow_ = *beta2pow;
  }
@@ -48,9 +48,9 @@ struct MultiTensorAdamBetaPowInfo {
 };
 
 template <typename T>
-struct MultiTensorAdamBetaPowInfo<T, /*CPUBetaPows=*/false> {
+struct FusedAdamBetaPowInfo<T, /*CPUBetaPows=*/false> {
  using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
-  MultiTensorAdamBetaPowInfo(const MPDType* beta1pow, const MPDType* beta2pow) {
+  FusedAdamBetaPowInfo(const MPDType* beta1pow, const MPDType* beta2pow) {
    beta1pow_ = beta1pow;
    beta2pow_ = beta2pow;
  }
@@ -73,13 +73,13 @@ template <typename T,
          int N,
          int MaxTensorSize,
          int MaxBlockSize>
-struct MultiTensorAdamFunctor {
+struct FusedAdamFunctor {
  __device__ __forceinline__ void operator()(
      int chunk_size,
      const funcs::TensorAndBlockInfo<N, MaxTensorSize, MaxBlockSize>& t_info,
      MT beta1,
      MT beta2,
-      MultiTensorAdamBetaPowInfo<T, IsCPUBetaPow> beta_pow,
+      FusedAdamBetaPowInfo<T, IsCPUBetaPow> beta_pow,
      MT epsilon,
      const MT* learning_rate,
      MT decay) const {
@@ -261,7 +261,7 @@ static int GetVecSizeFromTensors(const std::vector<TensorT*>& tensors,
 }
 
 template <typename T, typename Context>
-void MultiTensorAdamKernel(
+void FusedAdamKernel(
    const Context& dev_ctx,
    const std::vector<const DenseTensor*>& params,
    const std::vector<const DenseTensor*>& grads,
@@ -365,17 +365,17 @@ void MultiTensorAdamKernel(
    constexpr int kMaxTensorSize = __multi_precision ? 48 : 60;              \
    constexpr int kMaxBlockSize = __multi_precision ? 320 : 320;             \
    constexpr int kBlockSize = 512;                                          \
-    MultiTensorAdamBetaPowInfo<T, __is_cpu_betapow> beta_pow_info(           \
+    FusedAdamBetaPowInfo<T, __is_cpu_betapow> beta_pow_info(                 \
        beta1_pow_first->data<MPDType>(), beta2_pow_first->data<MPDType>()); \
-    MultiTensorAdamFunctor<T,                                                \
-                           MPDType,                                          \
-                           __vec_size,                                       \
-                           __multi_precision,                                \
-                           __is_cpu_betapow,                                 \
-                           __use_adamw,                                      \
-                           kInputNum,                                        \
-                           kMaxTensorSize,                                   \
-                           kMaxBlockSize>                                    \
+    FusedAdamFunctor<T,                                                      \
+                     MPDType,                                                \
+                     __vec_size,                                             \
+                     __multi_precision,                                      \
+                     __is_cpu_betapow,                                       \
+                     __use_adamw,                                            \
+                     kInputNum,                                              \
+                     kMaxTensorSize,                                         \
+                     kMaxBlockSize>                                          \
        functor;                                                             \
    funcs::LaunchMultiTensorApplyKernel<kInputNum,                           \
                                        kMaxTensorSize,                      \
@@ -487,10 +487,10 @@ void MultiTensorAdamKernel(
 }  // namespace phi
 
-PD_REGISTER_KERNEL(multi_tensor_adam,
+PD_REGISTER_KERNEL(fused_adam,
                    GPU,
                    ALL_LAYOUT,
-                   phi::MultiTensorAdamKernel,
+                   phi::FusedAdamKernel,
                    phi::dtype::float16,
                    float,
                    double) {
...
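The GPU implementation follows the multi-tensor-apply pattern borrowed from NVIDIA Apex (the URL cited in the hunk above): each tensor is cut into `chunk_size`-element chunks, and chunks are packed into kernel launches that hold at most a fixed number of distinct tensors (cf. `kMaxTensorSize`, 48 or 60 above). A rough Python sketch of that bookkeeping, with illustrative names that are not the kernel's actual internals:

```python
def plan_launches(tensor_sizes, chunk_size, max_tensors_per_launch=48):
    """Group (tensor_index, chunk_index) pairs into kernel launches."""
    launches, current, seen = [], [], set()
    for t, n in enumerate(tensor_sizes):
        for c in range((n + chunk_size - 1) // chunk_size):
            # Flush the current launch if adding a chunk of a new tensor
            # would exceed the per-launch tensor limit.
            if len(seen | {t}) > max_tensors_per_launch:
                launches.append(current)
                current, seen = [], set()
            current.append((t, c))
            seen.add(t)
    if current:
        launches.append(current)
    return launches

# e.g. plan_launches([70000, 130000], 65536) packs 2 + 2 chunks into one launch.
```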
@@ -18,22 +18,21 @@
 namespace phi {
 
-KernelSignature MultiTensorAdamOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
+KernelSignature FusedAdamOpArgumentMapping(const ArgumentMappingContext& ctx) {
  paddle::small_vector<const char*> in_names = {"Params",
                                                "Grads",
                                                "LearningRate",
                                                "Moments1",
                                                "Moments2",
-                                                "Beta1Pow",
-                                                "Beta2Pow",
+                                                "Beta1Pows",
+                                                "Beta2Pows",
                                                "MasterParams",
                                                "SkipUpdate"};
  paddle::small_vector<const char*> out_names = {"ParamsOut",
                                                 "Moments1Out",
                                                 "Moments2Out",
-                                                 "Beta1PowOut",
-                                                 "Beta2PowOut",
+                                                 "Beta1PowsOut",
+                                                 "Beta2PowsOut",
                                                 "MasterParamsOut"};
  paddle::small_vector<const char*> attr_names = {"beta1",
                                                  "beta2",
@@ -44,7 +43,7 @@ KernelSignature MultiTensorAdamOpArgumentMapping(
                                                  "multi_precision",
                                                  "use_global_beta_pow"};
 
-  return KernelSignature("multi_tensor_adam",
+  return KernelSignature("fused_adam",
                         std::move(in_names),
                         std::move(attr_names),
                         std::move(out_names));
@@ -52,5 +51,4 @@ KernelSignature MultiTensorAdamOpArgumentMapping(
 }  // namespace phi
 
-PD_REGISTER_ARG_MAPPING_FN(multi_tensor_adam,
-                           phi::MultiTensorAdamOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(fused_adam, phi::FusedAdamOpArgumentMapping);
@@ -77,8 +77,8 @@ if(WITH_GPU)
    SRCS test_auto_tune.cu
    DEPS gtest)
  cc_test(
-    test_multi_tensor_adam_kernel
-    SRCS test_multi_tensor_adam_kernel.cc
+    test_fused_adam_kernel
+    SRCS test_fused_adam_kernel.cc
    DEPS gtest phi phi_api_utils)
 elseif(WITH_ROCM)
  hip_test(
...
@@ -32,8 +32,8 @@
 #include "paddle/phi/kernels/cast_kernel.h"
 #include "paddle/phi/kernels/elementwise_subtract_kernel.h"
 #include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/fused_adam_kernel.h"
 #include "paddle/phi/kernels/gaussian_kernel.h"
-#include "paddle/phi/kernels/multi_tensor_adam_kernel.h"
 #include "paddle/phi/kernels/reduce_max_kernel.h"
 
 namespace phi {
@@ -179,9 +179,9 @@ struct AdamInfo {
        GenerateConstantTensorVectors<MT, Context>(*ctx, one_shapes, beta2);
  }
 
-  void Update(bool use_multi_tensor, const std::vector<DenseTensor> &grads) {
-    if (use_multi_tensor) {
-      UpdateWithMultiTensorAdam(grads);
+  void Update(bool use_fused, const std::vector<DenseTensor> &grads) {
+    if (use_fused) {
+      UpdateWithFusedAdam(grads);
    } else {
      for (size_t j = 0; j < params.size(); ++j) {
        if (use_adamw) {
@@ -226,7 +226,7 @@ struct AdamInfo {
  }
 
 private:
-  void UpdateWithMultiTensorAdam(const std::vector<DenseTensor> &grads) {
+  void UpdateWithFusedAdam(const std::vector<DenseTensor> &grads) {
    auto param_metas = ToMetaTensorVector(params);
    auto grad_metas = ToMetaTensorVector(grads);
    auto master_param_metas = ToMetaTensorVector(master_params);
@@ -235,34 +235,34 @@ struct AdamInfo {
    auto beta1_pow_metas = ToMetaTensorVector(beta1_pows);
    auto beta2_pow_metas = ToMetaTensorVector(beta2_pows);
 
-    MultiTensorAdamInferMeta(
-        ToConstMetaTensorPtrVector(param_metas),
-        ToConstMetaTensorPtrVector(grad_metas),
-        learning_rate,
-        ToConstMetaTensorPtrVector(moment1_metas),
-        ToConstMetaTensorPtrVector(moment2_metas),
-        ToConstMetaTensorPtrVector(beta1_pow_metas),
-        ToConstMetaTensorPtrVector(beta2_pow_metas),
-        multi_precision ? paddle::make_optional(
-                              ToConstMetaTensorPtrVector(master_param_metas))
-                        : paddle::none,
-        MetaTensor(),
-        beta1,
-        beta2,
-        epsilon,
-        chunk_size,
-        weight_decay,
-        use_adamw,
-        multi_precision,
-        false,
-        ToMutableMetaTensorPtrVector(param_metas),
-        ToMutableMetaTensorPtrVector(moment1_metas),
-        ToMutableMetaTensorPtrVector(moment2_metas),
-        ToMutableMetaTensorPtrVector(beta1_pow_metas),
-        ToMutableMetaTensorPtrVector(beta2_pow_metas),
-        ToMutableMetaTensorPtrVector(master_param_metas));
+    FusedAdamInferMeta(ToConstMetaTensorPtrVector(param_metas),
+                       ToConstMetaTensorPtrVector(grad_metas),
+                       learning_rate,
+                       ToConstMetaTensorPtrVector(moment1_metas),
+                       ToConstMetaTensorPtrVector(moment2_metas),
+                       ToConstMetaTensorPtrVector(beta1_pow_metas),
+                       ToConstMetaTensorPtrVector(beta2_pow_metas),
+                       multi_precision
+                           ? paddle::make_optional(
+                                 ToConstMetaTensorPtrVector(master_param_metas))
+                           : paddle::none,
+                       MetaTensor(),
+                       beta1,
+                       beta2,
+                       epsilon,
+                       chunk_size,
+                       weight_decay,
+                       use_adamw,
+                       multi_precision,
+                       false,
+                       ToMutableMetaTensorPtrVector(param_metas),
+                       ToMutableMetaTensorPtrVector(moment1_metas),
+                       ToMutableMetaTensorPtrVector(moment2_metas),
+                       ToMutableMetaTensorPtrVector(beta1_pow_metas),
+                       ToMutableMetaTensorPtrVector(beta2_pow_metas),
+                       ToMutableMetaTensorPtrVector(master_param_metas));
 
-    MultiTensorAdamKernel<T, Context>(
+    FusedAdamKernel<T, Context>(
        *ctx,
        ToConstTensorPtrVector(params),
        ToConstTensorPtrVector(grads),
@@ -395,15 +395,15 @@ auto MaxDiff(const Context &ctx,
 }
 
 template <typename T, typename PlaceType>
-void TestMultiTensorAdamBase(const std::vector<std::vector<int64_t>> &shapes,
-                             float atol,
-                             bool use_adamw,
-                             bool multi_precision = false,
-                             float beta1 = 0.9,
-                             float beta2 = 0.99,
-                             float weight_decay = 0.1,
-                             size_t steps = 5,
-                             uint64_t seed = 10) {
+void TestFusedAdamBase(const std::vector<std::vector<int64_t>> &shapes,
+                       float atol,
+                       bool use_adamw,
+                       bool multi_precision = false,
+                       float beta1 = 0.9,
+                       float beta2 = 0.99,
+                       float weight_decay = 0.1,
+                       size_t steps = 5,
+                       uint64_t seed = 10) {
  const auto &ctx =
      *paddle::platform::DeviceContextPool::Instance().GetByPlace(PlaceType());
  using Context = typename std::remove_const<
@@ -448,29 +448,28 @@ static auto GenerateRandomShapes(size_t n, uint64_t low, uint64_t high) {
  return shapes;
 }
 
-TEST(multi_tensor_adam, test_fp32_cpu) {
+TEST(fused_adam, test_fp32_cpu) {
  auto shapes = GenerateRandomShapes(30, 10, 20);
  float atol = 0.0f;
  for (auto use_adamw : {false, true}) {
-    TestMultiTensorAdamBase<float, CPUPlace>(shapes, atol, use_adamw);
+    TestFusedAdamBase<float, CPUPlace>(shapes, atol, use_adamw);
  }
 }
 
 #ifdef PADDLE_WITH_CUDA
-TEST(multi_tensor_adam, test_fp32_gpu) {
+TEST(fused_adam, test_fp32_gpu) {
  auto shapes = GenerateRandomShapes(40, 0, 2 << 18);
  float atol = 0.0f;
  for (auto use_adamw : {false, true}) {
-    TestMultiTensorAdamBase<float, GPUPlace>(shapes, atol, use_adamw);
+    TestFusedAdamBase<float, GPUPlace>(shapes, atol, use_adamw);
  }
 }
 
-TEST(multi_tensor_adam, test_fp16_gpu) {
+TEST(fused_adam, test_fp16_gpu) {
  auto shapes = GenerateRandomShapes(40, 0, 2 << 18);
  float atol = 5e-3f;
  for (auto use_adamw : {false, true}) {
-    TestMultiTensorAdamBase<dtype::float16, GPUPlace>(
-        shapes, atol, use_adamw, true);
+    TestFusedAdamBase<dtype::float16, GPUPlace>(shapes, atol, use_adamw, true);
  }
 }
 #endif
...

New file: test_fused_adam_op.py
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import numpy as np
from op_test import OpTest

import paddle


def fused_adam_step(inputs, attributes, num):
    '''
    Simulate one step of the fused_adam optimizer
    :param inputs: dict of inputs
    :param attributes: dict of attributes
    :return tuple: tuple of output params, moments1, moments2, beta1_pows, beta2_pows
    '''
    params = inputs['Params']
    grads = inputs['Grads']
    moments1 = inputs['Moments1']
    moments2 = inputs['Moments2']
    lr = inputs['LearningRate']
    beta1_pows = inputs['Beta1Pows']
    beta2_pows = inputs['Beta2Pows']
    params_out = []
    moments1_out = []
    moments2_out = []
    beta1_pows_out = []
    beta2_pows_out = []

    epsilon = attributes['epsilon']

    if 'beta1' in attributes:
        beta1 = attributes['beta1']
    else:
        beta1 = inputs['Beta1Tensor'][0][0]
    if 'beta2' in attributes:
        beta2 = attributes['beta2']
    else:
        beta2 = inputs['Beta2Tensor'][0][0]

    for i in range(num):
        moments1_out.append(beta1 * moments1[i][1] + (1 - beta1) * grads[i][1])
        moments2_out.append(
            beta2 * moments2[i][1] + (1 - beta2) * np.square(grads[i][1])
        )
        lr_t = lr * np.sqrt(1 - beta2_pows[i][1]) / (1 - beta1_pows[i][1])
        params_out.append(
            params[i][1]
            - lr_t * (moments1_out[i] / (np.sqrt(moments2_out[i]) + epsilon))
        )

    for i in range(num):
        beta1_pows_out.append(
            np.array([beta1_pows[i][1]]).astype("float32") * beta1
        )
        beta2_pows_out.append(
            np.array([beta2_pows[i][1]]).astype("float32") * beta2
        )

    return (
        params_out,
        moments1_out,
        moments2_out,
        beta1_pows_out,
        beta2_pows_out,
    )


class TestFusedAdamOp(OpTest):
    def setUp(self):
        '''Test FusedAdam Op with supplied attributes'''
        paddle.enable_static()
        self.__class__.op_type = "fused_adam"

        num = 10
        # Build six independent inner lists. Note that `[[0] * num] * 6`
        # would alias one list six times, so every assignment below would
        # overwrite the previous input category.
        inputs_list = [[0] * num for _ in range(6)]

        learning_rate = 0.004
        beta1 = 0.78
        beta2 = 0.836
        epsilon = 1e-4
        beta1_pow = beta1**10
        beta2_pow = beta2**10

        self.attrs = {
            'epsilon': epsilon,
            'beta1': beta1,
            'beta2': beta2,
            "chunk_size": 32 * 2048,
        }

        for i in range(num):
            inputs_list[0][i] = np.random.uniform(-1, 1, (102, 105)).astype(
                "float32"
            )
            inputs_list[1][i] = np.random.uniform(-1, 1, (102, 105)).astype(
                "float32"
            )
            inputs_list[2][i] = np.random.uniform(-1, 1, (102, 105)).astype(
                "float32"
            )
            inputs_list[3][i] = np.random.random((102, 105)).astype("float32")
            inputs_list[4][i] = np.array([beta1_pow]).astype("float32")
            inputs_list[5][i] = np.array([beta2_pow]).astype("float32")

        self.inputs = {
            'Params': [
                ("params" + str(i), inputs_list[0][i]) for i in range(num)
            ],
            'Grads': [
                ("grads" + str(i), inputs_list[1][i]) for i in range(num)
            ],
            'Moments1': [
                ("moments1" + str(i), inputs_list[2][i]) for i in range(num)
            ],
            'Moments2': [
                ("moments2" + str(i), inputs_list[3][i]) for i in range(num)
            ],
            'LearningRate': np.array([learning_rate]).astype("float32"),
            'Beta1Pows': [
                ("beta1_pows" + str(i), inputs_list[4][i]) for i in range(num)
            ],
            'Beta2Pows': [
                ("beta2_pows" + str(i), inputs_list[5][i]) for i in range(num)
            ],
        }

        (
            params_out,
            moments1_out,
            moments2_out,
            beta1_pows_out,
            beta2_pows_out,
        ) = fused_adam_step(self.inputs, self.attrs, num)

        self.outputs = {
            'Moments1Out': [
                ("moments1_out" + str(i), moments1_out[i]) for i in range(num)
            ],
            'Moments2Out': [
                ("moments2_out" + str(i), moments2_out[i]) for i in range(num)
            ],
            'ParamsOut': [
                ("params_out" + str(i), params_out[i]) for i in range(num)
            ],
            'Beta1PowsOut': [
                ("beta1_pows_out" + str(i), beta1_pows_out[i])
                for i in range(num)
            ],
            'Beta2PowsOut': [
                ("beta2_pows_out" + str(i), beta2_pows_out[i])
                for i in range(num)
            ],
        }

    def test_check_output(self):
        paddle.enable_static()
        if paddle.is_compiled_with_cuda():
            self.check_output()


if __name__ == "__main__":
    paddle.enable_static()
    unittest.main()
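As a quick, illustrative sanity check of the reference step above (not part of the test suite), one can feed `fused_adam_step` a single one-element "tensor list" in the same `(name, ndarray)` format that `setUp` builds:

```python
import numpy as np

inputs = {
    'Params': [('p0', np.array([1.0], dtype=np.float32))],
    'Grads': [('g0', np.array([0.5], dtype=np.float32))],
    'Moments1': [('m1_0', np.zeros(1, dtype=np.float32))],
    'Moments2': [('m2_0', np.zeros(1, dtype=np.float32))],
    'LearningRate': np.array([0.004], dtype=np.float32),
    'Beta1Pows': [('b1_0', np.array([0.78], dtype=np.float32))],
    'Beta2Pows': [('b2_0', np.array([0.836], dtype=np.float32))],
}
attrs = {'epsilon': 1e-4, 'beta1': 0.78, 'beta2': 0.836}
params_out, m1_out, m2_out, b1_out, b2_out = fused_adam_step(inputs, attrs, 1)
print(params_out[0])  # the parameter after one fused Adam step (~0.996)
```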