Unverified commit 10654c77 authored by yuehuayingxueluo, committed by GitHub

Add MultiTensorAdam OP (#49220)

* add multi_tensor_adam

* update multi_tensor_base.py, test_multi_tensor_adam.py, adamw.py

* fix adam.py optimizer.py

* fix adamw.py

* fix test_multi_tensor_adam.py

* fix CI bug

* fix CI coverage

* fix ci bug

* fix betapow

* fix some bugs

* fix test_adamw_op.py

* fix CI coverage

* fix multi_tensor_adam_kernel.cc

* fix CI bug

* fix multi_tensor_adam_op.cc and test_multi_tensor_adam.py

* fix code style

* update C++ parts

* remove python parts modification temporarily

* add C++ ut

* update betapow copy code logic

* fix ci ut

* fix windows ci

* fix coverage ci

* improve coverage rate

---------
Co-authored-by: sneaxiy <sneaxiy@126.com>
Parent d676d552
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
using Tensor = phi::DenseTensor;
class MultiTensorAdamOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto param_dtype =
framework::OperatorWithKernel::IndicateVarDataType(ctx, "Params");
return phi::KernelKey(param_dtype, ctx.GetPlace());
}
};
class MultiTensorAdamOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Params", "(Tensor) Input parameters").AsDuplicable();
AddInput("Grads", "(Tensor) Input gradients").AsDuplicable();
AddInput("LearningRate", "(Tensor, default Tensor<float>) Learning rate");
AddInput("Moments1", "(Tensor) Input first moments").AsDuplicable();
AddInput("Moments2", "(Tensor) Input second moments").AsDuplicable();
AddInput("Beta1Pows",
"(Tensor, default Tensor<float>) Input beta1 power accumulator")
.AsDuplicable();
AddInput("Beta2Pows",
"(Tensor, default Tensor<float>) Input beta2 power accumulator")
.AsDuplicable();
AddInput("MasterParams", "FP32 master weight for AMP.")
.AsDispensable()
.AsDuplicable();
AddInput("SkipUpdate", "(Tensor<bool>, optional), Skip the update or not.")
.AsDispensable();
AddOutput("ParamsOut", "(Tensor) Output parameters").AsDuplicable();
AddOutput("Moments1Out", "(Tensor) Output first moments").AsDuplicable();
AddOutput("Moments2Out", "(Tensor) Output second moments").AsDuplicable();
AddOutput("Beta1PowsOut", "(Tensor) Output beta1 power accumulator")
.AsDuplicable();
AddOutput("Beta2PowsOut", "(Tensor) Output beta2 power accumulator")
.AsDuplicable();
AddOutput("MasterParamsOut",
"The updated FP32 master weight for AMP. "
"It shared memory with Input(MasterParams).")
.AsDispensable()
.AsDuplicable();
AddAttr<float>("beta1",
"(float, default 0.9) "
"Exponential decay rate for the "
"first moment estimates.")
.SetDefault(0.9f)
.SupportTensor();
AddAttr<float>("beta2",
"(float, default 0.999) "
"exponential decay rate for the "
"second moment estimates.")
.SetDefault(0.999f)
.SupportTensor();
AddAttr<float>("epsilon",
"(float, default 1.0e-8) "
"Constant for numerical stability")
.SetDefault(1.0e-8f)
.SupportTensor();
AddAttr<int>("chunk_size", "ChunkSize for blocks computing");
AddAttr<float>("weight_decay",
"(float, default 0) "
"weight decay (L2 penalty)")
.SetDefault(0);
AddAttr<bool>("use_adamw",
"(bool, default False) "
"Whether to use AdamW"
"True for decoupled weight decay")
.SetDefault(false);
AddAttr<bool>("multi_precision",
"(bool, default false) "
"Whether to use multi-precision during weight updating.")
.SetDefault(false);
// TODO(zhiqiu): We could set Beta1PowsOut and Beta2PowsOut
// as dispensable since they are not used when use_global_beta_pow is true.
AddAttr<bool>("use_global_beta_pow",
"(bool, default false) "
"Whether to use global beta_pow for whole model instead of "
"creating beta_pow for each parameter.")
.SetDefault(false);
AddComment(R"DOC(
Adam Optimizer.
This implements the Adam optimizer from Section 2 of the Adam
paper : https://arxiv.org/abs/1412.6980.
Adam is a first-order gradient-based optimization method based on
adaptive estimates of lower-order moments.
Adam updates:
$$
moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
moment\_2\_out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
learning\_rate = learning\_rate *
\frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
$$
AdamW updates:
$$
moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
moment\_2\_out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
learning\_rate = learning\_rate *
\frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
param\_out = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param)
$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(multi_tensor_adam,
MultiTensorAdamInferShapeFunctor,
PD_INFER_META(phi::MultiTensorAdamInferMeta));
REGISTER_OPERATOR(
multi_tensor_adam,
ops::MultiTensorAdamOp,
ops::MultiTensorAdamOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
MultiTensorAdamInferShapeFunctor);
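For orientation, here is a minimal scalar sketch of the per-element update that the DOC string above describes. This is a hypothetical illustration, not part of the patch: the real kernels below operate on chunked tensor lists and additionally handle FP16 parameters with FP32 master weights.
#include <cmath>
// Hypothetical scalar Adam/AdamW step following the DOC formulas above.
// beta1_pow and beta2_pow are the accumulated powers of beta1/beta2.
inline double AdamScalarStep(double param, double grad, double lr,
                             double* mom1, double* mom2, double beta1,
                             double beta2, double beta1_pow, double beta2_pow,
                             double epsilon, double weight_decay,
                             bool use_adamw) {
  *mom1 = beta1 * (*mom1) + (1.0 - beta1) * grad;
  *mom2 = beta2 * (*mom2) + (1.0 - beta2) * grad * grad;
  // Bias-corrected learning rate.
  double lr_t = lr * std::sqrt(1.0 - beta2_pow) / (1.0 - beta1_pow);
  double step = *mom1 / (std::sqrt(*mom2) + epsilon);
  if (use_adamw) {
    step += weight_decay * param;  // decoupled weight decay term
  }
  return param - lr_t * step;
}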
@@ -165,6 +165,16 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
"Beta1Pow",
"Beta2Pow",
"MasterParam"}},
{"multi_tensor_adam",
{"Params",
"Grads",
"LearningRate",
"Moments1",
"Moments2",
"Beta1Pows",
"Beta2Pows",
"MasterParams",
"SkipUpdate"}},
{"adamw",
{"Param",
"Grad",
@@ -321,6 +331,13 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
"Beta1PowOut",
"Beta2PowOut",
"MasterParamOut"}},
{"multi_tensor_adam",
{"ParamsOut",
"Moments1Out",
"Moments2Out",
"Beta1PowsOut",
"Beta2PowsOut",
"MasterParamsOut"}},
{"adamw",
{"ParamOut",
"Moment1Out",
@@ -382,6 +399,13 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
"Beta1PowOut",
"Beta2PowOut",
"MasterParamOut"}},
{"multi_tensor_adam",
{"ParamsOut",
"Moments1Out",
"Moments2Out",
"Beta1PowsOut",
"Beta2PowsOut",
"MasterParamsOut"}},
{"adamw",
{"ParamOut",
"Moment1Out",
......
@@ -1225,6 +1225,17 @@
optional : master_param
inplace : (param -> param_out), (velocity -> velocity_out), (master_param -> master_param_out)
- op : multi_tensor_adam_
args : (Tensor[] params, Tensor[] grads, Tensor learning_rate, Tensor[] moments1, Tensor[] moments2, Tensor[] beta1_pows, Tensor[] beta2_pows, Tensor[] master_params, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, int chunk_size, float weight_decay, bool use_adamw, bool multi_precision, bool use_global_beta_pow)
output : Tensor[](params_out){params.size()}, Tensor[](moments1_out){params.size()}, Tensor[](moments2_out){params.size()}, Tensor[](beta1_pows_out){params.size()}, Tensor[](beta2_pows_out){params.size()}, Tensor[](master_params_out){params.size()}
infer_meta :
func : MultiTensorAdamInferMeta
kernel :
func : multi_tensor_adam
data_type : params
optional : skip_update, master_params
inplace : (params -> params_out), (moments1 -> moments1_out), (moments2 -> moments2_out), (beta1_pows -> beta1_pows_out), (beta2_pows -> beta2_pows_out), (master_params -> master_params_out)
- op : multiclass_nms3
args : (Tensor bboxes, Tensor scores, Tensor rois_num, float score_threshold, int nms_top_k, int keep_top_k, float nms_threshold=0.3, bool normalized=true, float nms_eta=1.0, int background_label=0)
output : Tensor(out), Tensor(index), Tensor(nms_rois_num)
......
@@ -2981,6 +2981,49 @@ void YoloLossInferMeta(const MetaTensor& x,
gt_match_mask->set_dtype(x.dtype());
}
void MultiTensorAdamInferMeta(
const std::vector<const MetaTensor*>& params,
const std::vector<const MetaTensor*>& grads,
const MetaTensor& learning_rate,
const std::vector<const MetaTensor*>& moments1,
const std::vector<const MetaTensor*>& moments2,
const std::vector<const MetaTensor*>& beta1_pows,
const std::vector<const MetaTensor*>& beta2_pows,
const paddle::optional<std::vector<const MetaTensor*>>& master_params,
const MetaTensor& skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
int chunk_size,
float weight_decay,
bool use_adamw,
bool multi_precision,
bool use_global_beta_pow,
std::vector<MetaTensor*> params_out,
std::vector<MetaTensor*> moments1_out,
std::vector<MetaTensor*> moments2_out,
std::vector<MetaTensor*> beta1_pows_out,
std::vector<MetaTensor*> beta2_pows_out,
std::vector<MetaTensor*> master_params_out) {
size_t in_size = params.size();
for (size_t i = 0; i < in_size; i++) {
params_out[i]->set_dims(params[i]->dims());
params_out[i]->set_dtype(params[i]->dtype());
moments1_out[i]->set_dims(moments1[i]->dims());
moments1_out[i]->set_dtype(moments1[i]->dtype());
moments2_out[i]->set_dims(moments2[i]->dims());
moments2_out[i]->set_dtype(moments2[i]->dtype());
beta1_pows_out[i]->set_dims(beta1_pows[i]->dims());
beta1_pows_out[i]->set_dtype(beta1_pows[i]->dtype());
beta2_pows_out[i]->set_dims(beta2_pows[i]->dims());
beta2_pows_out[i]->set_dtype(beta2_pows[i]->dtype());
if (master_params && !master_params_out.empty()) {
master_params_out[i]->set_dims(master_params.get()[i]->dims());
master_params_out[i]->set_dtype(master_params.get()[i]->dtype());
}
}
}
void MoeInferMeta(const MetaTensor& x,
const MetaTensor& gate,
const MetaTensor& bmm0,
......
@@ -532,6 +532,31 @@ void YoloLossInferMeta(const MetaTensor& x,
MetaTensor* objectness_mask,
MetaTensor* gt_match_mask);
void MultiTensorAdamInferMeta(
const std::vector<const MetaTensor*>& params,
const std::vector<const MetaTensor*>& grads,
const MetaTensor& learning_rate,
const std::vector<const MetaTensor*>& moments1,
const std::vector<const MetaTensor*>& moments2,
const std::vector<const MetaTensor*>& beta1_pows,
const std::vector<const MetaTensor*>& beta2_pows,
const paddle::optional<std::vector<const MetaTensor*>>& master_params,
const MetaTensor& skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
int chunk_size,
float weight_decay,
bool use_adamw,
bool multi_precision,
bool use_global_beta_pow,
std::vector<MetaTensor*> params_out,
std::vector<MetaTensor*> moments1_out,
std::vector<MetaTensor*> moments2_out,
std::vector<MetaTensor*> beta1_pows_out,
std::vector<MetaTensor*> beta2_pows_out,
std::vector<MetaTensor*> master_params_out);
void MoeInferMeta(const MetaTensor& x,
const MetaTensor& gate,
const MetaTensor& bmm0,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/multi_tensor_adam_kernel.h"
#include <vector>
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/adam_kernel.h"
#include "paddle/phi/kernels/adamw_kernel.h"
namespace phi {
static paddle::optional<DenseTensor> TensorPtrToOptionalTensor(
const paddle::optional<std::vector<const DenseTensor*>>& t, size_t idx) {
return t ? paddle::optional<DenseTensor>(*(t.get()[idx])) : paddle::none;
}
template <typename T, typename Context>
void MultiTensorAdamKernel(
const Context& dev_ctx,
const std::vector<const DenseTensor*>& params,
const std::vector<const DenseTensor*>& grads,
const DenseTensor& learning_rate,
const std::vector<const DenseTensor*>& moments1,
const std::vector<const DenseTensor*>& moments2,
const std::vector<const DenseTensor*>& beta1_pows,
const std::vector<const DenseTensor*>& beta2_pows,
const paddle::optional<std::vector<const DenseTensor*>>& master_params,
const paddle::optional<DenseTensor>& skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
int chunk_size,
float weight_decay,
bool use_adamw,
bool multi_precision,
bool use_global_beta_pow,
std::vector<DenseTensor*> params_out,
std::vector<DenseTensor*> moments1_out,
std::vector<DenseTensor*> moments2_out,
std::vector<DenseTensor*> beta1_pows_out,
std::vector<DenseTensor*> beta2_pows_out,
std::vector<DenseTensor*> master_params_out) {
size_t params_num = params.size();
PADDLE_ENFORCE_EQ(
params_num,
grads.size(),
errors::InvalidArgument("The size of Input(grads) must be equal to "
"Input(params), but got the size of Input(grads) "
"is %d, the size of Input(params) is %d.",
grads.size(),
params_num));
PADDLE_ENFORCE_EQ(params_num,
moments1.size(),
errors::InvalidArgument(
"The size of Input(moments1) must be equal to "
"Input(params), but got the size of Input(moments1) "
"is %d, the size of Input(params) is %d.",
moments1.size(),
params_num));
PADDLE_ENFORCE_EQ(params_num,
moments2.size(),
errors::InvalidArgument(
"The size of Input(moments2) must be equal to "
"Input(params), but got the size of Input(moments2) "
"is %d, the size of Input(params) is %d.",
moments2.size(),
params_num));
PADDLE_ENFORCE_EQ(params_num,
beta1_pows.size(),
errors::InvalidArgument(
"The size of Input(beta1_pows) must be equal to "
"Input(params), but got the size of Input(beta1_pows) "
"is %d, the size of Input(params) is %d.",
beta1_pows.size(),
params_num));
PADDLE_ENFORCE_EQ(params_num,
beta2_pows.size(),
errors::InvalidArgument(
"The size of Input(beta2_pows) must be equal to "
"Input(params), but got the size of Input(beta2_pows) "
"is %d, the size of Input(params) is %d.",
beta2_pows.size(),
params_num));
for (size_t idx = 0; idx < params_num; idx++) {
auto master_params_tmp = TensorPtrToOptionalTensor(master_params, idx);
if (!use_adamw) {
AdamDenseKernel<T, Context>(
dev_ctx,
*params[idx],
*grads[idx],
learning_rate,
*moments1[idx],
*moments2[idx],
*beta1_pows[idx],
*beta2_pows[idx],
master_params_tmp,
skip_update,
beta1,
beta2,
epsilon,
false,
1000,
multi_precision,
use_global_beta_pow,
params_out[idx],
moments1_out[idx],
moments2_out[idx],
beta1_pows_out[idx],
beta2_pows_out[idx],
master_params_out.empty() ? nullptr : master_params_out[idx]);
} else {
AdamwDenseKernel<T, Context>(
dev_ctx,
*params[idx],
*grads[idx],
learning_rate,
*moments1[idx],
*moments2[idx],
*beta1_pows[idx],
*beta2_pows[idx],
master_params_tmp,
skip_update,
beta1,
beta2,
epsilon,
1.0,
weight_decay,
use_adamw,
false,
1000,
multi_precision,
use_global_beta_pow,
params_out[idx],
moments1_out[idx],
moments2_out[idx],
beta1_pows_out[idx],
beta2_pows_out[idx],
master_params_out.empty() ? nullptr : master_params_out[idx]);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(multi_tensor_adam,
CPU,
ALL_LAYOUT,
phi::MultiTensorAdamKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/tensor_utils.h"
namespace phi {
namespace funcs {
// This code is adapted from apex's multi_tensor_apply.cuh.
// https://github.com/NVIDIA/apex
template <int N, int MaxTensorSize, int MaxBlockSize>
struct TensorAndBlockInfo {
void *tensor_addrs[N - 1][MaxTensorSize];
const void *grads[MaxTensorSize];
int sizes[MaxTensorSize];
uint8_t tensor_ids[MaxBlockSize];
// uint16_t is wide enough for the per-launch chunk id.
uint16_t chunk_ids[MaxBlockSize];
int start_chunk_id;
DEVICE void GetChunkIdAndTensorId(int *chunk_id, int *tensor_id) const {
int block_id = blockIdx.x;
int tmp_tensor_id = tensor_ids[block_id];
*chunk_id = static_cast<int>(chunk_ids[block_id]) +
(tmp_tensor_id == 0) * start_chunk_id;
*tensor_id = tmp_tensor_id;
}
};
template <int N,
int MaxTensorSize,
int MaxBlockSize,
typename Functor,
typename... ArgTypes>
__global__ void MultiTensorApplyCudaKernel(
int chunk_size,
TensorAndBlockInfo<N, MaxTensorSize, MaxBlockSize> t_info,
Functor functor,
ArgTypes... args) {
functor(chunk_size, t_info, args...);
}
template <int InputNum,
int MaxTensorSize,
int MaxBlockSize,
typename Functor,
typename Context,
typename... ArgTypes>
void LaunchMultiTensorApplyKernel(
const Context &dev_ctx,
int block_size,
int chunk_size,
const std::vector<std::vector<DenseTensor *>> &input_vector,
const std::vector<const DenseTensor *> &grads,
Functor functor,
ArgTypes... args) {
PADDLE_ENFORCE_EQ(
input_vector.size(),
InputNum - 1,
errors::InvalidArgument(
"input_vector.size() must be InputNum - 1; please check grads, params, "
"moments1, moments2 and master_params."));
size_t length = input_vector[0].size();
PADDLE_ENFORCE_GT(
length,
0,
errors::InvalidArgument(
"input_vector[0].size() must be greater than 0, please check params."));
auto place = input_vector[0][0]->place();
PADDLE_ENFORCE_EQ(
place,
GPUPlace(),
errors::InvalidArgument(
"Expected the input tensors to be on GPU, but they are on CPU."));
for (size_t i = 0; i < input_vector.size(); i++) {
PADDLE_ENFORCE_EQ(
input_vector[i].size(),
length,
errors::InvalidArgument(
"The sizes of the input vectors do not match each other."));
for (size_t j = 0; j < input_vector[i].size(); j++) {
PADDLE_ENFORCE_EQ(
input_vector[i][j]->place(),
place,
errors::InvalidArgument(
"All input tensors must be on the same device as the first tensor."));
PADDLE_ENFORCE_EQ(input_vector[i][j]->numel(),
input_vector[0][j]->numel(),
errors::InvalidArgument(
"The number of elements of Inputs must be equal."));
}
}
size_t tensors_size = input_vector[0].size();
TensorAndBlockInfo<InputNum, MaxTensorSize, MaxBlockSize> t_info;
t_info.start_chunk_id = 0;
auto stream = dev_ctx.stream();
int block_id = 0;
int tensor_id = 0;
for (int t = 0; t < tensors_size; t++) {
t_info.sizes[tensor_id] = input_vector[0][t]->numel();
t_info.grads[tensor_id] = grads[t]->data();
for (int d = 0; d < InputNum - 1; d++) {
t_info.tensor_addrs[d][tensor_id] = input_vector[d][t]->data();
}
tensor_id++;
int chunks_this_tensor =
(input_vector[0][t]->numel() + chunk_size - 1) / chunk_size;
constexpr auto kMaxChunkId = std::numeric_limits<uint16_t>::max();
for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
t_info.tensor_ids[block_id] = tensor_id - 1;
auto saved_chunk_id =
(tensor_id == 1 ? chunk - t_info.start_chunk_id : chunk);
PADDLE_ENFORCE_GE(saved_chunk_id,
0,
errors::InvalidArgument(
"The chunk id is less than 0 in "
"MultiTensorApplyKernel. This may be a bug."));
PADDLE_ENFORCE_LE(
saved_chunk_id,
kMaxChunkId,
errors::InvalidArgument(
"The chunk id exceeds maximum value %d. This may be a bug.",
kMaxChunkId));
t_info.chunk_ids[block_id] = saved_chunk_id;
block_id++;
bool reach_tensors_limit =
(tensor_id == MaxTensorSize && chunk == chunks_this_tensor - 1);
bool reach_blocks_limit = (block_id == MaxBlockSize);
bool finish_compute =
(t == tensors_size - 1 && chunk == chunks_this_tensor - 1);
if (reach_tensors_limit || reach_blocks_limit || finish_compute) {
MultiTensorApplyCudaKernel<InputNum,
MaxTensorSize,
MaxBlockSize,
Functor,
ArgTypes...>
<<<block_id, block_size, 0, stream>>>(
chunk_size, t_info, functor, args...);
block_id = 0;
if (chunk == chunks_this_tensor - 1) {
tensor_id = 0;
t_info.start_chunk_id = 0;
} else {
t_info.sizes[0] = t_info.sizes[tensor_id - 1];
t_info.grads[0] = t_info.grads[tensor_id - 1];
for (int d = 0; d < InputNum - 1; d++) {
t_info.tensor_addrs[d][0] = t_info.tensor_addrs[d][tensor_id - 1];
}
tensor_id = 1;
t_info.start_chunk_id = chunk + 1;
}
}
}
}
}
} // namespace funcs
} // namespace phi
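To make the packing loop above easier to follow, the following host-only sketch (a hypothetical helper, not part of the patch) mirrors how LaunchMultiTensorApplyKernel assigns one CUDA block per chunk_size-element chunk and flushes a launch whenever MaxTensorSize tensors or MaxBlockSize blocks are filled, carrying a partially processed tensor over into the next launch.
#include <cstdio>
#include <vector>
// Host-side simulation of the chunk packing performed above (illustration
// only): counts how many blocks each launch would receive.
void SimulateMultiTensorPacking(const std::vector<int>& numels, int chunk_size,
                                int max_tensor_size, int max_block_size) {
  int tensor_id = 0;
  int block_id = 0;
  for (size_t t = 0; t < numels.size(); ++t) {
    ++tensor_id;
    int chunks_this_tensor = (numels[t] + chunk_size - 1) / chunk_size;
    for (int chunk = 0; chunk < chunks_this_tensor; ++chunk) {
      ++block_id;  // one block per chunk
      bool reach_tensors_limit =
          (tensor_id == max_tensor_size && chunk == chunks_this_tensor - 1);
      bool reach_blocks_limit = (block_id == max_block_size);
      bool finish_compute =
          (t == numels.size() - 1 && chunk == chunks_this_tensor - 1);
      if (reach_tensors_limit || reach_blocks_limit || finish_compute) {
        std::printf("launch %d blocks covering %d tensor(s)\n", block_id,
                    tensor_id);
        block_id = 0;
        // A tensor with remaining chunks is carried into the next launch as
        // its first tensor; otherwise the next launch starts from scratch.
        tensor_id = (chunk == chunks_this_tensor - 1) ? 0 : 1;
      }
    }
  }
}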
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/multi_tensor_adam_kernel.h"
#include <vector>
#include "glog/logging.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/multi_tensor_apply.h"
namespace phi {
// This code is adapted from apex's multi_tensor_adam.cu.
// https://github.com/NVIDIA/apex
template <typename T, bool CPUBetaPows /*=true*/>
struct MultiTensorAdamBetaPowInfo {
using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
MultiTensorAdamBetaPowInfo(const MPDType* beta1pow, const MPDType* beta2pow) {
beta1pow_ = *beta1pow;
beta2pow_ = *beta2pow;
}
DEVICE MPDType GetBeta1PowValue() const { return beta1pow_; }
DEVICE MPDType GetBeta2PowValue() const { return beta2pow_; }
private:
MPDType beta1pow_;
MPDType beta2pow_;
};
template <typename T>
struct MultiTensorAdamBetaPowInfo<T, /*CPUBetaPows=*/false> {
using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
MultiTensorAdamBetaPowInfo(const MPDType* beta1pow, const MPDType* beta2pow) {
beta1pow_ = beta1pow;
beta2pow_ = beta2pow;
}
DEVICE MPDType GetBeta1PowValue() const { return *beta1pow_; }
DEVICE MPDType GetBeta2PowValue() const { return *beta2pow_; }
private:
const MPDType* __restrict__ beta1pow_;
const MPDType* __restrict__ beta2pow_;
};
template <typename T,
typename MT,
int VecSize,
bool IsMultiPrecision,
bool IsCPUBetaPow,
bool UseAdamW,
int N,
int MaxTensorSize,
int MaxBlockSize>
struct MultiTensorAdamFunctor {
__device__ __forceinline__ void operator()(
int chunk_size,
const funcs::TensorAndBlockInfo<N, MaxTensorSize, MaxBlockSize>& t_info,
MT beta1,
MT beta2,
MultiTensorAdamBetaPowInfo<T, IsCPUBetaPow> beta_pow,
MT epsilon,
const MT* learning_rate,
MT decay) const {
MT lr = *learning_rate;
MT beta1_pow = beta_pow.GetBeta1PowValue();
MT beta2_pow = beta_pow.GetBeta2PowValue();
T* __restrict__ p_ptr;
const T* __restrict__ g_ptr;
MT* __restrict__ mom1_ptr, * __restrict__ mom2_ptr;
MT* __restrict__ mp_ptr;
int n;
{
int chunk_id, tensor_id;
t_info.GetChunkIdAndTensorId(&chunk_id, &tensor_id);
n = t_info.sizes[tensor_id];
int offset = chunk_id * chunk_size;
g_ptr = static_cast<const T*>(t_info.grads[tensor_id]) + offset;
p_ptr = static_cast<T*>(t_info.tensor_addrs[0][tensor_id]) + offset;
mom1_ptr = static_cast<MT*>(t_info.tensor_addrs[1][tensor_id]) + offset;
mom2_ptr = static_cast<MT*>(t_info.tensor_addrs[2][tensor_id]) + offset;
mp_ptr =
IsMultiPrecision
? static_cast<MT*>(t_info.tensor_addrs[3][tensor_id]) + offset
: nullptr;
n -= offset;
if (n > chunk_size) {
n = chunk_size;
}
}
int stride = blockDim.x * VecSize;
int idx = threadIdx.x * VecSize;
for (; idx < n; idx += stride) {
phi::AlignedVector<T, VecSize> g_vec;
phi::AlignedVector<T, VecSize> p_vec;
phi::AlignedVector<MT, VecSize> mp_vec;
phi::AlignedVector<MT, VecSize> mom1_vec;
phi::AlignedVector<MT, VecSize> mom2_vec;
if (idx <= n - VecSize) {
if (IsMultiPrecision) {
phi::Load<MT, VecSize>(mp_ptr + idx, &mp_vec);
} else {
phi::Load<T, VecSize>(p_ptr + idx, &p_vec);
}
phi::Load<T, VecSize>(g_ptr + idx, &g_vec);
phi::Load<MT, VecSize>(mom1_ptr + idx, &mom1_vec);
phi::Load<MT, VecSize>(mom2_ptr + idx, &mom2_vec);
} else {
int size = n - idx;
for (int j = 0; j < size; j++) {
if (IsMultiPrecision) {
mp_vec[j] = mp_ptr[idx + j];
} else {
p_vec[j] = p_ptr[idx + j];
}
g_vec[j] = g_ptr[idx + j];
mom1_vec[j] = static_cast<MT>(mom1_ptr[idx + j]);
mom2_vec[j] = static_cast<MT>(mom2_ptr[idx + j]);
}
#pragma unroll
for (int j = size; j < VecSize; j++) {
g_vec[j] = T(0);
p_vec[j] = T(0);
mp_vec[j] = MT(0);
mom1_vec[j] = MT(0);
mom2_vec[j] = MT(0);
}
}
#pragma unroll
for (int j = 0; j < VecSize; j++) {
MT p = IsMultiPrecision ? mp_vec[j] : static_cast<MT>(p_vec[j]);
UpdateMoments(&mom1_vec[j],
&mom2_vec[j],
static_cast<MT>(g_vec[j]),
beta1,
beta2);
mp_vec[j] = UpdateParameter(p,
mom1_vec[j],
mom2_vec[j],
beta1_pow,
beta2_pow,
lr,
epsilon,
decay);
}
if (idx <= n - VecSize) {
phi::Store<MT, VecSize>(mom1_vec, mom1_ptr + idx);
phi::Store<MT, VecSize>(mom2_vec, mom2_ptr + idx);
if (IsMultiPrecision) {
phi::Store<MT, VecSize>(mp_vec, mp_ptr + idx);
}
for (int j = 0; j < VecSize; j++) {
p_ptr[idx + j] = static_cast<T>(mp_vec[j]);
}
} else {
int size = n - idx;
for (int j = 0; j < size; j++) {
if (IsMultiPrecision) {
mp_ptr[idx + j] = mp_vec[j];
}
p_ptr[idx + j] = static_cast<T>(mp_vec[j]);
mom1_ptr[idx + j] = mom1_vec[j];
mom2_ptr[idx + j] = mom2_vec[j];
}
}
}
}
private:
static __device__ __forceinline__ void UpdateMoments(
MT* __restrict__ mom1_ptr,
MT* __restrict__ mom2_ptr,
MT g,
MT beta1,
MT beta2) {
MT mom1 = static_cast<MT>(mom1_ptr[0]);
MT mom2 = static_cast<MT>(mom2_ptr[0]);
mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
mom1_ptr[0] = mom1;
mom2_ptr[0] = mom2;
}
static __device__ __forceinline__ MT UpdateParameter(MT p,
MT mom1,
MT mom2,
MT beta1_pow,
MT beta2_pow,
MT lr,
MT epsilon,
MT decay) {
if (UseAdamW) {
p *= (static_cast<MT>(1.0) - lr * decay);
}
MT denom = (sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
return p;
}
};
template <typename T, int N>
__global__ void UpdateBetaPowGroup(
Array<T*, N> beta1_pow, Array<T*, N> beta2_pow, T beta1, T beta2, int n) {
auto idx = threadIdx.x;
if (idx < n) {
beta1_pow[idx][0] *= beta1;
beta2_pow[idx][0] *= beta2;
}
}
template <typename Context>
static void CopyTensorIfDifferent(const Context& dev_ctx,
const std::vector<const DenseTensor*>& src,
const std::vector<DenseTensor*>& dst,
bool use_src_place = false) {
for (size_t i = 0; i < src.size(); ++i) {
if (src[i] != dst[i]) {
VLOG(10) << "Copy Tensor " << i;
phi::Place place = (use_src_place ? src[i]->place() : dev_ctx.GetPlace());
phi::Copy<Context>(dev_ctx, *(src[i]), place, false, dst[i]);
}
}
}
template <typename T, typename TensorT>
static int GetVecSizeFromTensors(const std::vector<TensorT*>& tensors,
int vec_size = 4) {
for (const auto* t : tensors) {
vec_size = min(vec_size, GetVectorizedSize(t->template data<T>()));
}
return vec_size;
}
template <typename T, typename Context>
void MultiTensorAdamKernel(
const Context& dev_ctx,
const std::vector<const DenseTensor*>& params,
const std::vector<const DenseTensor*>& grads,
const DenseTensor& learning_rate,
const std::vector<const DenseTensor*>& moments1,
const std::vector<const DenseTensor*>& moments2,
const std::vector<const DenseTensor*>& beta1_pows,
const std::vector<const DenseTensor*>& beta2_pows,
const paddle::optional<std::vector<const DenseTensor*>>& master_params,
const paddle::optional<DenseTensor>& skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
int chunk_size,
float weight_decay,
bool use_adamw,
bool multi_precision,
bool use_global_beta_pow,
std::vector<DenseTensor*> params_out,
std::vector<DenseTensor*> moments1_out,
std::vector<DenseTensor*> moments2_out,
std::vector<DenseTensor*> beta1_pows_out,
std::vector<DenseTensor*> beta2_pows_out,
std::vector<DenseTensor*> master_params_out) {
using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
auto n = params.size();
auto beta1_pow_first = beta1_pows[0];
auto beta2_pow_first = beta2_pows[0];
for (int i = 1; i < beta1_pows.size(); i++) {
PADDLE_ENFORCE_EQ(beta1_pow_first->place(),
beta1_pows[i]->place(),
phi::errors::InvalidArgument(
"All Beta1Pow must be in the same place."));
PADDLE_ENFORCE_EQ(beta2_pow_first->place(),
beta2_pows[i]->place(),
phi::errors::InvalidArgument(
"All Beta2Pow must be in the same place."));
}
PADDLE_ENFORCE_EQ(
beta1_pow_first->place(),
beta2_pow_first->place(),
phi::errors::InvalidArgument(
"Input(Beta1Pows) and Input(Beta2Pows) must be in the same place."));
bool is_cpu_betapow = (beta1_pow_first->place() == CPUPlace());
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
CopyTensorIfDifferent(dev_ctx, params, params_out);
CopyTensorIfDifferent(dev_ctx, moments1, moments1_out);
CopyTensorIfDifferent(dev_ctx, moments2, moments2_out);
CopyTensorIfDifferent(dev_ctx, beta1_pows, beta1_pows_out, true);
CopyTensorIfDifferent(dev_ctx, beta2_pows, beta2_pows_out, true);
if (master_params) {
CopyTensorIfDifferent(dev_ctx, master_params.get(), master_params_out);
}
bool skip_update_value = false;
if (skip_update.is_initialized()) {
PADDLE_ENFORCE_EQ(
skip_update->numel(),
1,
errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d",
skip_update->numel()));
DenseTensor skip_update_tensor;
phi::Copy(
dev_ctx, skip_update.get(), CPUPlace(), false, &skip_update_tensor);
skip_update_value = skip_update_tensor.data<bool>()[0];
VLOG(4) << "skip_update_value:" << skip_update_value;
}
// skip_update=true
if (skip_update_value) {
VLOG(4) << "Adam skip update";
return;
}
MPDType beta1_tmp = beta1.to<MPDType>();
MPDType beta2_tmp = beta2.to<MPDType>();
std::vector<std::vector<DenseTensor*>> input_vector;
input_vector.reserve(4);
input_vector.push_back(params_out);
input_vector.push_back(moments1_out);
input_vector.push_back(moments2_out);
if (multi_precision) {
input_vector.push_back(master_params_out);
}
VLOG(4) << "use_adamw: " << use_adamw;
VLOG(4) << "multi_precision: " << multi_precision;
#define PD_LAUNCH_MULTI_TENSOR_APPLY_ADAM_KERNEL_BASE( \
__multi_precision, __is_cpu_betapow, __use_adamw, __vec_size) \
do { \
constexpr int kInputNum = __multi_precision ? 5 : 4; \
constexpr int kMaxTensorSize = __multi_precision ? 48 : 60; \
constexpr int kMaxBlockSize = __multi_precision ? 320 : 320; \
constexpr int kBlockSize = 512; \
MultiTensorAdamBetaPowInfo<T, __is_cpu_betapow> beta_pow_info( \
beta1_pow_first->data<MPDType>(), beta2_pow_first->data<MPDType>()); \
MultiTensorAdamFunctor<T, \
MPDType, \
__vec_size, \
__multi_precision, \
__is_cpu_betapow, \
__use_adamw, \
kInputNum, \
kMaxTensorSize, \
kMaxBlockSize> \
functor; \
funcs::LaunchMultiTensorApplyKernel<kInputNum, \
kMaxTensorSize, \
kMaxBlockSize>( \
dev_ctx, \
kBlockSize, \
((chunk_size + __vec_size - 1) / __vec_size) * __vec_size, \
input_vector, \
grads, \
functor, \
beta1_tmp, \
beta2_tmp, \
beta_pow_info, \
epsilon.to<MPDType>(), \
learning_rate.data<MPDType>(), \
static_cast<MPDType>(weight_decay)); \
} while (0)
#define PD_LAUNCH_MULTI_TENSOR_APPLY_ADAM_KERNEL(__vec_size) \
case __vec_size: { \
if (multi_precision) { \
if (is_cpu_betapow) { \
if (use_adamw) { \
PD_LAUNCH_MULTI_TENSOR_APPLY_ADAM_KERNEL_BASE( \
true, true, true, __vec_size); \
} else { \
PD_LAUNCH_MULTI_TENSOR_APPLY_ADAM_KERNEL_BASE( \
true, true, false, __vec_size); \
} \
} else { \
if (use_adamw) { \
PD_LAUNCH_MULTI_TENSOR_APPLY_ADAM_KERNEL_BASE( \
true, false, true, __vec_size); \
} else { \
PD_LAUNCH_MULTI_TENSOR_APPLY_ADAM_KERNEL_BASE( \
true, false, false, __vec_size); \
} \
} \
} else { \
if (is_cpu_betapow) { \
if (use_adamw) { \
PD_LAUNCH_MULTI_TENSOR_APPLY_ADAM_KERNEL_BASE( \
false, true, true, __vec_size); \
} else { \
PD_LAUNCH_MULTI_TENSOR_APPLY_ADAM_KERNEL_BASE( \
false, true, false, __vec_size); \
} \
} else { \
if (use_adamw) { \
PD_LAUNCH_MULTI_TENSOR_APPLY_ADAM_KERNEL_BASE( \
false, false, true, __vec_size); \
} else { \
PD_LAUNCH_MULTI_TENSOR_APPLY_ADAM_KERNEL_BASE( \
false, false, false, __vec_size); \
} \
} \
} \
} break
int vec_size = GetVecSizeFromTensors<T>(params_out);
vec_size = GetVecSizeFromTensors<MPDType>(moments1_out, vec_size);
vec_size = GetVecSizeFromTensors<MPDType>(moments2_out, vec_size);
if (master_params) {
vec_size = GetVecSizeFromTensors<MPDType>(master_params_out, vec_size);
}
switch (vec_size) {
PD_LAUNCH_MULTI_TENSOR_APPLY_ADAM_KERNEL(4);
PD_LAUNCH_MULTI_TENSOR_APPLY_ADAM_KERNEL(2);
PD_LAUNCH_MULTI_TENSOR_APPLY_ADAM_KERNEL(1);
default:
PADDLE_THROW(
errors::InvalidArgument("Unsupported vectorized size %d", vec_size));
break;
}
if (!use_global_beta_pow) {
if (is_cpu_betapow) {
for (size_t i = 0; i < n; i++) {
VLOG(10) << "CPU Update BetaPow here...";
auto* beta1_ptr =
dev_ctx.template HostAlloc<MPDType>(beta1_pows_out[i]);
(*beta1_ptr) *= beta1_tmp;
auto* beta2_ptr =
dev_ctx.template HostAlloc<MPDType>(beta2_pows_out[i]);
(*beta2_ptr) *= beta2_tmp;
}
} else {
constexpr size_t kGroupSize = 32;
auto group_num = (n + kGroupSize - 1) / kGroupSize;
VLOG(10) << "GPU Update BetaPow here...";
for (size_t i = 0; i < group_num; ++i) {
size_t start = i * kGroupSize;
size_t end = std::min((i + 1) * kGroupSize, n);
Array<MPDType*, kGroupSize> beta1_ptrs, beta2_ptrs;
for (size_t j = start; j < end; ++j) {
size_t idx = j - start;
beta1_ptrs[idx] = dev_ctx.template Alloc<MPDType>(beta1_pows_out[j]);
beta2_ptrs[idx] = dev_ctx.template Alloc<MPDType>(beta2_pows_out[j]);
}
UpdateBetaPowGroup<MPDType, kGroupSize>
<<<1, kGroupSize, 0, dev_ctx.stream()>>>(
beta1_ptrs, beta2_ptrs, beta1_tmp, beta2_tmp, end - start);
}
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(multi_tensor_adam,
GPU,
ALL_LAYOUT,
phi::MultiTensorAdamKernel,
phi::dtype::float16,
float,
double) {
// Skip beta1_pow, beta2_pow, skip_update data transform
kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(8).SetBackend(phi::Backend::ALL_BACKEND);
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void MultiTensorAdamKernel(
const Context &dev_ctx,
const std::vector<const DenseTensor *> &params,
const std::vector<const DenseTensor *> &grads,
const DenseTensor &learning_rate,
const std::vector<const DenseTensor *> &moments1,
const std::vector<const DenseTensor *> &moments2,
const std::vector<const DenseTensor *> &beta1_pows,
const std::vector<const DenseTensor *> &beta2_pows,
const paddle::optional<std::vector<const DenseTensor *>> &master_params,
const paddle::optional<DenseTensor> &skip_update,
const Scalar &beta1,
const Scalar &beta2,
const Scalar &epsilon,
int chunk_size,
float weight_decay,
bool use_adamw,
bool multi_precision,
bool use_global_beta_pow,
std::vector<DenseTensor *> params_out,
std::vector<DenseTensor *> moments1_out,
std::vector<DenseTensor *> moments2_out,
std::vector<DenseTensor *> beta1_pows_out,
std::vector<DenseTensor *> beta2_pows_out,
std::vector<DenseTensor *> master_params_out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/phi/core/compat/op_utils.h"
#include "paddle/utils/small_vector.h"
namespace phi {
KernelSignature MultiTensorAdamOpArgumentMapping(
const ArgumentMappingContext& ctx) {
paddle::small_vector<const char*> in_names = {"Params",
"Grads",
"LearningRate",
"Moments1",
"Moments2",
"Beta1Pow",
"Beta2Pow",
"MasterParams",
"SkipUpdate"};
paddle::small_vector<const char*> out_names = {"ParamsOut",
"Moments1Out",
"Moments2Out",
"Beta1PowOut",
"Beta2PowOut",
"MasterParamsOut"};
paddle::small_vector<const char*> attr_names = {"beta1",
"beta2",
"epsilon",
"chunk_size",
"weight_decay",
"use_adamw",
"multi_precision",
"use_global_beta_pow"};
return KernelSignature("multi_tensor_adam",
std::move(in_names),
std::move(attr_names),
std::move(out_names));
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(multi_tensor_adam,
phi::MultiTensorAdamOpArgumentMapping);
@@ -76,6 +76,10 @@ if(WITH_GPU)
test_auto_tune
SRCS test_auto_tune.cu
DEPS gtest)
cc_test(
test_multi_tensor_adam_kernel
SRCS test_multi_tensor_adam_kernel.cc
DEPS gtest phi phi_api_utils)
elseif(WITH_ROCM)
hip_test(
test_gpu_timer
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/generator.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/gpu/gpu_context.h"
#endif
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/infermeta/multiary.h"
#include "paddle/phi/kernels/abs_kernel.h"
#include "paddle/phi/kernels/adam_kernel.h"
#include "paddle/phi/kernels/adamw_kernel.h"
#include "paddle/phi/kernels/cast_kernel.h"
#include "paddle/phi/kernels/elementwise_subtract_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/gaussian_kernel.h"
#include "paddle/phi/kernels/multi_tensor_adam_kernel.h"
#include "paddle/phi/kernels/reduce_max_kernel.h"
namespace phi {
template <typename T, typename Context>
auto GenerateRandomTensorVectors(
const Context &ctx, const std::vector<std::vector<int64_t>> &shapes) {
size_t n = shapes.size();
std::vector<DenseTensor> tensors(n);
for (size_t i = 0; i < n; ++i) {
GaussianKernel<T, Context>(
ctx,
shapes[i],
0.0f,
1.0f,
0,
paddle::experimental::CppTypeToDataType<T>::Type(),
&tensors[i]);
}
return tensors;
}
template <typename T, typename Context>
auto GenerateConstantTensorVectors(
const Context &ctx,
const std::vector<std::vector<int64_t>> &shapes,
T value) {
size_t n = shapes.size();
std::vector<DenseTensor> tensors(n);
for (size_t i = 0; i < n; ++i) {
FullKernel<T, Context>(ctx,
shapes[i],
value,
paddle::experimental::CppTypeToDataType<T>::Type(),
&tensors[i]);
}
return tensors;
}
static auto ToConstTensorPtrVector(const std::vector<DenseTensor> &tensors) {
std::vector<const DenseTensor *> results;
for (const auto &t : tensors) {
results.push_back(&t);
}
return results;
}
static auto ToMutableTensorPtrVector(
std::vector<DenseTensor> &tensors) { // NOLINT
std::vector<DenseTensor *> results;
for (auto &t : tensors) {
results.push_back(&t);
}
return results;
}
static auto ToMetaTensorVector(const std::vector<DenseTensor> &tensors) {
std::vector<MetaTensor> results;
for (auto &t : tensors) {
results.push_back(t);
}
return results;
}
static auto ToConstMetaTensorPtrVector(
const std::vector<MetaTensor> &meta_tensors) {
std::vector<const MetaTensor *> results;
for (auto &t : meta_tensors) {
results.push_back(&t);
}
return results;
}
static auto ToMutableMetaTensorPtrVector(
std::vector<MetaTensor> &meta_tensors) { // NOLINT
std::vector<MetaTensor *> results;
for (auto &t : meta_tensors) {
results.push_back(&t);
}
return results;
}
template <typename T, typename Context>
struct AdamInfo {
const Context *ctx;
std::vector<std::vector<int64_t>> shapes;
std::vector<DenseTensor> params;
std::vector<DenseTensor> master_params;
std::vector<DenseTensor> moment1s;
std::vector<DenseTensor> moment2s;
std::vector<DenseTensor> beta1_pows;
std::vector<DenseTensor> beta2_pows;
DenseTensor learning_rate;
float beta1;
float beta2;
float weight_decay;
float epsilon = 1e-6;
bool multi_precision;
bool use_adamw;
int chunk_size = 4096;
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
AdamInfo(const Context &ctx_ref,
const std::vector<std::vector<int64_t>> &shapes,
float beta1,
float beta2,
float weight_decay,
bool multi_precision,
bool use_adamw)
: ctx(&ctx_ref),
shapes(shapes),
beta1(beta1),
beta2(beta2),
weight_decay(weight_decay),
multi_precision(multi_precision),
use_adamw(use_adamw) {
std::vector<std::vector<int64_t>> one_shapes(shapes.size(),
std::vector<int64_t>(1, 1));
std::vector<std::vector<int64_t>> learning_rate_shapes(
one_shapes.begin(), one_shapes.begin() + 1);
params = GenerateRandomTensorVectors<T, Context>(*ctx, shapes);
learning_rate = GenerateConstantTensorVectors<MT, Context>(
*ctx, learning_rate_shapes, 1e-3)[0];
moment1s = GenerateConstantTensorVectors<MT, Context>(*ctx, shapes, 0);
moment2s = GenerateConstantTensorVectors<MT, Context>(*ctx, shapes, 0);
if (multi_precision) {
master_params.resize(shapes.size());
for (size_t i = 0; i < shapes.size(); ++i) {
master_params[i] = Cast<T, Context>(
*ctx,
params[i],
paddle::experimental::CppTypeToDataType<MT>::Type());
}
}
beta1_pows =
GenerateConstantTensorVectors<MT, Context>(*ctx, one_shapes, beta1);
beta2_pows =
GenerateConstantTensorVectors<MT, Context>(*ctx, one_shapes, beta2);
}
void Update(bool use_multi_tensor, const std::vector<DenseTensor> &grads) {
if (use_multi_tensor) {
UpdateWithMultiTensorAdam(grads);
} else {
for (size_t j = 0; j < params.size(); ++j) {
if (use_adamw) {
UpdateWithAdamWBaseline(grads, j);
} else {
UpdateWithAdamBaseline(grads, j);
}
}
}
}
static AdamInfo<T, Context> DeepCopy(const AdamInfo &other) {
AdamInfo copied(*other.ctx,
other.shapes,
other.beta1,
other.beta2,
other.weight_decay,
other.multi_precision,
other.use_adamw);
auto copy_tensor = [&other](const DenseTensor &x, DenseTensor *y) {
Copy<Context>(*other.ctx, x, x.place(), false, y);
};
auto copy_tensors = [&other](const std::vector<DenseTensor> &xs,
std::vector<DenseTensor> *ys) {
for (size_t i = 0; i < xs.size(); ++i) {
Copy<Context>(*other.ctx, xs[i], xs[i].place(), false, &((*ys)[i]));
}
};
copy_tensors(other.params, &copied.params);
copy_tensors(other.master_params, &copied.master_params);
copy_tensors(other.moment1s, &copied.moment1s);
copy_tensors(other.moment2s, &copied.moment2s);
copy_tensors(other.beta1_pows, &copied.beta1_pows);
copy_tensors(other.beta2_pows, &copied.beta2_pows);
copy_tensor(other.learning_rate, &copied.learning_rate);
copied.epsilon = other.epsilon;
copied.chunk_size = other.chunk_size;
other.ctx->Wait();
return copied;
}
private:
void UpdateWithMultiTensorAdam(const std::vector<DenseTensor> &grads) {
auto param_metas = ToMetaTensorVector(params);
auto grad_metas = ToMetaTensorVector(grads);
auto master_param_metas = ToMetaTensorVector(master_params);
auto moment1_metas = ToMetaTensorVector(moment1s);
auto moment2_metas = ToMetaTensorVector(moment2s);
auto beta1_pow_metas = ToMetaTensorVector(beta1_pows);
auto beta2_pow_metas = ToMetaTensorVector(beta2_pows);
MultiTensorAdamInferMeta(
ToConstMetaTensorPtrVector(param_metas),
ToConstMetaTensorPtrVector(grad_metas),
learning_rate,
ToConstMetaTensorPtrVector(moment1_metas),
ToConstMetaTensorPtrVector(moment2_metas),
ToConstMetaTensorPtrVector(beta1_pow_metas),
ToConstMetaTensorPtrVector(beta2_pow_metas),
multi_precision ? paddle::make_optional(
ToConstMetaTensorPtrVector(master_param_metas))
: paddle::none,
MetaTensor(),
beta1,
beta2,
epsilon,
chunk_size,
weight_decay,
use_adamw,
multi_precision,
false,
ToMutableMetaTensorPtrVector(param_metas),
ToMutableMetaTensorPtrVector(moment1_metas),
ToMutableMetaTensorPtrVector(moment2_metas),
ToMutableMetaTensorPtrVector(beta1_pow_metas),
ToMutableMetaTensorPtrVector(beta2_pow_metas),
ToMutableMetaTensorPtrVector(master_param_metas));
MultiTensorAdamKernel<T, Context>(
*ctx,
ToConstTensorPtrVector(params),
ToConstTensorPtrVector(grads),
learning_rate,
ToConstTensorPtrVector(moment1s),
ToConstTensorPtrVector(moment2s),
ToConstTensorPtrVector(beta1_pows),
ToConstTensorPtrVector(beta2_pows),
multi_precision
? paddle::make_optional(ToConstTensorPtrVector(master_params))
: paddle::none,
paddle::none,
beta1,
beta2,
epsilon,
chunk_size,
weight_decay,
use_adamw,
multi_precision,
false,
ToMutableTensorPtrVector(params),
ToMutableTensorPtrVector(moment1s),
ToMutableTensorPtrVector(moment2s),
ToMutableTensorPtrVector(beta1_pows),
ToMutableTensorPtrVector(beta2_pows),
ToMutableTensorPtrVector(master_params));
}
void UpdateWithAdamWBaseline(const std::vector<DenseTensor> &grads,
size_t idx) {
AdamwDenseKernel<T, Context>(
*ctx,
params[idx],
grads[idx],
learning_rate,
moment1s[idx],
moment2s[idx],
beta1_pows[idx],
beta2_pows[idx],
multi_precision ? paddle::make_optional(master_params[idx])
: paddle::none,
paddle::none,
beta1,
beta2,
epsilon,
1.0,
weight_decay,
true,
false,
1000,
multi_precision,
false,
&params[idx],
&moment1s[idx],
&moment2s[idx],
&beta1_pows[idx],
&beta2_pows[idx],
multi_precision ? &master_params[idx] : nullptr);
}
void UpdateWithAdamBaseline(const std::vector<DenseTensor> &grads,
size_t idx) {
AdamDenseKernel<T, Context>(
*ctx,
params[idx],
grads[idx],
learning_rate,
moment1s[idx],
moment2s[idx],
beta1_pows[idx],
beta2_pows[idx],
multi_precision ? paddle::make_optional(master_params[idx])
: paddle::none,
paddle::none,
beta1,
beta2,
epsilon,
false,
1000,
multi_precision,
false,
&params[idx],
&moment1s[idx],
&moment2s[idx],
&beta1_pows[idx],
&beta2_pows[idx],
multi_precision ? &master_params[idx] : nullptr);
}
};
template <typename T, typename Context>
auto MaxDiff(const Context &ctx,
const DenseTensor &x_t,
const DenseTensor &y_t) {
using MT = typename AdamInfo<T, Context>::MT;
auto mp_dtype = paddle::experimental::CppTypeToDataType<MT>::Type();
auto x = Cast<T, Context>(ctx, x_t, mp_dtype);
auto y = Cast<T, Context>(ctx, y_t, mp_dtype);
EXPECT_EQ(x.dims(), y.dims());
DenseTensor diff, diff_reduced, diff_reduced_cpu;
diff.Resize(x.dims());
ctx.template Alloc<MT>(&diff);
SubtractKernel<MT, Context>(ctx, x, y, &diff);
AbsKernel<MT, Context>(ctx, diff, &diff);
diff_reduced.Resize({1});
ctx.template Alloc<MT>(&diff_reduced);
MaxRawKernel<MT, Context>(
ctx, diff, vectorize<int64_t>(x.dims()), false, true, &diff_reduced);
diff_reduced_cpu.Resize(diff_reduced.dims());
ctx.template HostAlloc<MT>(&diff_reduced_cpu);
Copy<Context>(ctx, diff_reduced, CPUPlace(), true, &diff_reduced_cpu);
EXPECT_EQ(diff_reduced_cpu.place(), CPUPlace());
return diff_reduced_cpu.data<MT>()[0];
}
template <typename T, typename Context>
auto MaxDiff(const Context &ctx,
const std::vector<DenseTensor> &xs,
const std::vector<DenseTensor> &ys) {
using MT = typename AdamInfo<T, Context>::MT;
MT diff = 0;
for (size_t i = 0; i < xs.size(); ++i) {
diff = std::max<MT>(diff, MaxDiff<T, Context>(ctx, xs[i], ys[i]));
}
return diff;
}
template <typename T, typename PlaceType>
void TestMultiTensorAdamBase(const std::vector<std::vector<int64_t>> &shapes,
float atol,
bool use_adamw,
bool multi_precision = false,
float beta1 = 0.9,
float beta2 = 0.99,
float weight_decay = 0.1,
size_t steps = 5,
uint64_t seed = 10) {
const auto &ctx =
*paddle::platform::DeviceContextPool::Instance().GetByPlace(PlaceType());
using Context = typename std::remove_const<
typename std::remove_pointer<decltype(&ctx)>::type>::type;
ctx.GetGenerator()->SetCurrentSeed(seed);
AdamInfo<T, Context> info1(
ctx, shapes, beta1, beta2, weight_decay, multi_precision, use_adamw);
auto info2 = AdamInfo<T, Context>::DeepCopy(info1);
for (size_t i = 0; i < steps; ++i) {
auto grads = GenerateRandomTensorVectors<T>(ctx, shapes);
info1.Update(false, grads);
info2.Update(true, grads);
}
using MT = typename decltype(info1)::MT;
#define PD_ADAM_TEST_COMP(__field, __dtype) \
do { \
MT __diff = MaxDiff<__dtype>(ctx, info1.__field, info2.__field); \
EXPECT_LE(__diff, static_cast<MT>(atol)) \
<< #__field << " has diff when use_adamw = " << use_adamw \
<< " , multi_precision = " << multi_precision; \
} while (0)
PD_ADAM_TEST_COMP(beta1_pows, MT);
PD_ADAM_TEST_COMP(beta2_pows, MT);
PD_ADAM_TEST_COMP(params, T);
PD_ADAM_TEST_COMP(master_params, MT);
PD_ADAM_TEST_COMP(moment1s, MT);
PD_ADAM_TEST_COMP(moment2s, MT);
}
static auto GenerateRandomShapes(size_t n, uint64_t low, uint64_t high) {
std::random_device device;
std::default_random_engine engine(device());
std::uniform_int_distribution<uint64_t> dist(low, high);
std::vector<std::vector<int64_t>> shapes(n);
for (size_t i = 0; i < n; ++i) {
shapes[i].push_back(dist(engine));
}
return shapes;
}
TEST(multi_tensor_adam, test_fp32_cpu) {
auto shapes = GenerateRandomShapes(30, 10, 20);
float atol = 0.0f;
for (auto use_adamw : {false, true}) {
TestMultiTensorAdamBase<float, CPUPlace>(shapes, atol, use_adamw);
}
}
#ifdef PADDLE_WITH_CUDA
TEST(multi_tensor_adam, test_fp32_gpu) {
auto shapes = GenerateRandomShapes(40, 0, 2 << 18);
float atol = 0.0f;
for (auto use_adamw : {false, true}) {
TestMultiTensorAdamBase<float, GPUPlace>(shapes, atol, use_adamw);
}
}
TEST(multi_tensor_adam, test_fp16_gpu) {
auto shapes = GenerateRandomShapes(40, 0, 2 << 18);
float atol = 5e-3f;
for (auto use_adamw : {false, true}) {
TestMultiTensorAdamBase<dtype::float16, GPUPlace>(
shapes, atol, use_adamw, true);
}
}
#endif
} // namespace phi